• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

PyThaiNLP / pythainlp / 11626163864

01 Nov 2024 07:49AM UTC coverage: 14.17% (+14.2%) from 0.0%
11626163864

Pull #952

github

web-flow
Merge 8f2551bc9 into 89ea62ebc
Pull Request #952: Specify a limited test suite

44 of 80 new or added lines in 48 files covered. (55.0%)

1048 of 7396 relevant lines covered (14.17%)

0.14 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/pythainlp/tokenize/thaisumcut.py
1
# -*- coding: utf-8 -*-
2
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3
# SPDX-FileCopyrightText: Copyright 2020 Nakhun Chumpolsathien
4
# SPDX-License-Identifier: Apache-2.0
5
"""
6
The implementation of sentence segmentator from Nakhun Chumpolsathien, 2020
7
original codes are from: https://github.com/nakhunchumpolsathien/ThaiSum
8

9
Cite:
10

11
@mastersthesis{chumpolsathien_2020,
12
    title={Using Knowledge Distillation from Keyword Extraction to Improve the Informativeness of Neural Cross-lingual Summarization},
13
    author={Chumpolsathien, Nakhun},
14
    year={2020},
15
    school={Beijing Institute of Technology}
16
"""
17

18
import math
×
NEW
19
import operator
×
NEW
20
import re
×
21
from typing import List
×
22

23
from pythainlp.tokenize import word_tokenize
×
24

25

26
def list_to_string(list: List[str]) -> str:
    """Join token strings and collapse all whitespace runs to single spaces.

    :param list: tokens to concatenate (name kept for caller compatibility)
    :return: the concatenation with leading/trailing whitespace stripped
        and every internal whitespace run reduced to one space
    """
    return " ".join("".join(list).split())
30

31

32
def middle_cut(sentences: List[str]) -> List[str]:
    """Break overly long sentences at whitespace near even fractions.

    Sentences longer than 20 words (as counted by ``word_tokenize``) get
    extra "<stop>" markers: for each cut point, the whitespace token
    closest to the ideal (evenly spaced) position is replaced by
    "<stop>", and the joined text is finally re-split on "<stop>".

    :param sentences: sentence strings to post-process
    :return: list of sentence fragments, empty/"nan" entries removed
    """
    new_text = ""
    for sentence in sentences:
        # Word count excluding whitespace tokens.
        sentence_size = len(word_tokenize(sentence, keep_whitespace=False))

        # Remove spaces adjacent to digits (e.g. "ab 1 cd" -> "ab1cd").
        # NOTE(review): `sentence` shrinks inside this loop while the range
        # was fixed from the original length; the guards below re-check
        # len(sentence) on every iteration, so indexing stays in bounds.
        for k in range(0, len(sentence)):
            if k == 0 or k + 1 >= len(sentence):
                continue
            # Drop a space immediately before a digit.
            if sentence[k].isdigit() and sentence[k - 1] == " ":
                sentence = sentence[: k - 1] + sentence[k:]
            if k + 2 <= len(sentence):
                # Drop a space immediately after a digit.
                if sentence[k].isdigit() and sentence[k + 1] == " ":
                    sentence = sentence[: k + 1] + sentence[k + 2 :]

        fixed_text_lenth = 20  # word-count threshold (sic: "lenth")

        if sentence_size > fixed_text_lenth:
            # Number of additional cut points to insert.
            partition = math.floor(sentence_size / fixed_text_lenth)
            tokens = word_tokenize(sentence, keep_whitespace=True)
            for i in range(0, partition):
                # Ideal position (in token units) of the (i+1)-th cut.
                middle_space = sentence_size / (partition + 1) * (i + 1)
                white_space_index = []
                white_space_diff = {}

                # Indices of all whitespace tokens.
                for j in range(len(tokens)):
                    if tokens[j] == " ":
                        white_space_index.append(j)

                # Distance of each whitespace token from the ideal cut.
                for white_space in white_space_index:
                    white_space_diff.update(
                        {white_space: abs(white_space - middle_space)}
                    )

                if len(white_space_diff) > 0:
                    # Whitespace token closest to the ideal cut position:
                    # replace it with a "<stop>" marker in place.
                    min_diff = min(
                        white_space_diff.items(), key=operator.itemgetter(1)
                    )
                    tokens.pop(min_diff[0])
                    tokens.insert(min_diff[0], "<stop>")
            new_text = new_text + list_to_string(tokens) + "<stop>"
        else:
            new_text = new_text + sentence + "<stop>"

    sentences = new_text.split("<stop>")
    sentences = [s.strip() for s in sentences]
    # list.remove() drops only the first occurrence; the filter() below
    # clears any remaining empty strings anyway.
    if "" in sentences:
        sentences.remove("")
    if "nan" in sentences:
        sentences.remove("nan")

    sentences = list(filter(None, sentences))
    return sentences
84

85

86
class ThaiSentenceSegmentor:
    """Rule-based Thai sentence segmentor from the ThaiSum project."""

    def split_into_sentences(
        self, text: str, isMiddleCut: bool = False
    ) -> List[str]:
        """Split Thai running text into sentences using handcrafted rules.

        Pipeline: (1) protect ambiguous substrings that merely contain a
        cue word by swapping them for placeholder tags, (2) scan tokens
        around the conjunctions "และ"/"หรือ"/"จึง" and insert "<stop>"
        markers, (3) apply regex cue rules, (4) restore the placeholders,
        and (5) split the text on "<stop>".

        :param text: input text to segment
        :param isMiddleCut: when True, additionally break very long
            sentences near their middle via ``middle_cut``
        :return: list of cleaned sentence strings (empty and "nan"
            entries removed)
        """
        # Declare Variables
        # Regex fragments used by the re.sub cue rules further below.
        th_alphabets = "([ก-๙])"  # any single Thai character
        # Conjunctions that typically begin a new sentence (break before).
        th_conjunction = "(ทำให้|โดย|เพราะ|นอกจากนี้|แต่|กรณีที่|หลังจากนี้|ต่อมา|ภายหลัง|นับตั้งแต่|หลังจาก|ซึ่งเหตุการณ์|ผู้สื่อข่าวรายงานอีก|ส่วนที่|ส่วนสาเหตุ|ฉะนั้น|เพราะฉะนั้น|เพื่อ|เนื่องจาก|จากการสอบสวนทราบว่า|จากกรณี|จากนี้|อย่างไรก็ดี)"
        # Citation/reporting verbs ("said that", ...) — break after.
        th_cite = "(กล่าวว่า|เปิดเผยว่า|รายงานว่า|ให้การว่า|เผยว่า|บนทวิตเตอร์ว่า|แจ้งว่า|พลเมืองดีว่า|อ้างว่า)"
        # Polite particles that usually close an utterance — break after.
        th_ka_krub = "(ครับ|ค่ะ)"
        # Words after which a break is inserted (when followed by a space).
        th_stop_after = "(หรือไม่|โดยเร็ว|แล้ว|อีกด้วย)"
        # Words before which a break is inserted (when preceded by a space).
        th_stop_before = "(ล่าสุด|เบื้องต้น|ซึ่ง|ทั้งนี้|แม้ว่า|เมื่อ|แถมยัง|ตอนนั้น|จนเป็นเหตุให้|จากนั้น|อย่างไรก็ตาม|และก็|อย่างใดก็ตาม|เวลานี้|เช่น|กระทั่ง)"
        degit = "([0-9])"  # one digit (name is a misspelling of "digit")
        # Thai person titles (Mr./Mrs./Miss/Master/abbreviations).
        th_title = "(นาย|นาง|นางสาว|เด็กชาย|เด็กหญิง|น.ส.|ด.ช.|ด.ญ.)"

        # Pad with spaces so space-anchored rules can match at both ends.
        text = f" {text} "
        text = text.replace("\n", " ")
        # NOTE(review): replacing "" with "" is a no-op; a special character
        # may have been lost from the original source — verify upstream.
        text = text.replace("", "")
        # --- Step 1: protect longer words/phrases that merely CONTAIN a
        # cue word, so the cue rules below do not split inside them.
        # Every placeholder is mapped back after the rules have run.
        text = text.replace("โดยเร็ว", "<rth_Doeirew>")
        text = text.replace("เพื่อน", "<rth_friend>")
        text = text.replace("แต่ง", "<rth_but>")
        text = text.replace("โดยสาร", "<rth_passenger>")
        text = text.replace("แล้วแต่", "<rth_leawtea>")
        text = text.replace("หรือเปล่า", "<rth_repraw>")
        text = text.replace("หรือไม่", "<rth_remai>")
        text = text.replace("จึงรุ่งเรืองกิจ", "<rth_tanatorn_lastname>")
        text = text.replace("ตั้งแต่", "<rth_tangtea>")
        text = text.replace("แต่ละ", "<rth_teala>")
        text = text.replace("วิตแล้ว", "<rth_chiwitleaw>")
        text = text.replace("โดยประ", "<rth_doipra>")
        text = text.replace("แต่หลังจากนั้น", "<rth_tealangjaknan>")
        text = text.replace("พรรคเพื่อ", "<for_party>")
        text = text.replace("แต่เนื่อง", "<rth_teaneung>")
        text = text.replace("เพื่อทำให้", "เพื่อ<rth_tamhai>")
        text = text.replace("ทำเพื่อ", "ทำ<rth_for>")
        text = text.replace("จึงทำให้", "จึง<tamhai>")
        text = text.replace("มาโดยตลอด", "<madoitalod>")
        text = text.replace("แต่อย่างใด", "<teayangdaikptam>")
        text = text.replace("แต่หลังจาก", "แต่<langjak>")
        text = text.replace("คงทำให้", "<rth_kongtamhai>")
        text = text.replace("แต่ทั้งนี้", "แต่<tangni>")
        text = text.replace("มีแต่", "มี<tea>")
        text = text.replace("เหตุที่ทำให้", "<hedteetamhai>")
        text = text.replace("โดยหลังจาก", "โดย<langjak>")
        text = text.replace("ซึ่งหลังจาก", "ซึ่ง<langjak>")
        text = text.replace("ตั้งโดย", "<rth_tangdoi>")
        text = text.replace("โดยตรง", "<rth_doitong>")
        text = text.replace("นั้นหรือ", "<rth_nanhlor>")
        text = text.replace("ซึ่งต้องทำให้", "ซึ่งต้อง<tamhai>")
        text = text.replace("ชื่อต่อมา", "ชื่อ<tomar>")
        text = text.replace("โดยเร่งด่วน", "<doi>เร่งด่วน")
        text = text.replace("ไม่ได้ทำให้", "ไม่ได้<tamhai>")
        text = text.replace("จะทำให้", "จะ<tamhai>")
        text = text.replace("จนทำให้", "จน<tamhai>")
        text = text.replace("เว้นแต่", "เว้น<rth_tea>")
        text = text.replace("ก็ทำให้", "ก็<tamhai>")
        text = text.replace(" ณ ตอนนั้น", " ณ <tonnan>")
        text = text.replace("บางส่วน", "บาง<rth_suan>")
        text = text.replace("หรือแม้แต่", "หรือ<rth_meatea>")
        text = text.replace("โดยทำให้", "โดย<tamhai>")
        text = text.replace("หรือเพราะ", "หรือ<rth_orbecause>")
        text = text.replace("มาแต่", "มา<rth_tea>")
        text = text.replace("แต่ไม่ทำให้", "แต่<maitamhai>")
        text = text.replace("ฉะนั้นเมื่อ", "ฉะนั้น<rth_moe>")
        text = text.replace("เพราะฉะนั้น", "เพราะ<rth_chanan>")
        text = text.replace("เพราะหลังจาก", "เพราะ<rth_langjak>")
        text = text.replace("สามารถทำให้", "สามารถ<rth_tamhai>")
        text = text.replace("อาจทำ", "อาจ<rth_tam>")
        text = text.replace("จะทำ", "จะ<rth_tam>")
        text = text.replace("และนอกจากนี้", "นอกจากนี้")
        text = text.replace("อีกทั้งเพื่อ", "อีกทั้ง<rth_for>")
        text = text.replace("ทั้งนี้เพื่อ", "ทั้งนี้<rth_for>")
        text = text.replace("เวลาต่อมา", "เวลา<rth_toma>")
        # NOTE(review): the next replace maps the string to itself — a
        # no-op; possibly a leftover or a lost edit. Verify against the
        # upstream ThaiSum source.
        text = text.replace("อย่างไรก็ตาม", "อย่างไรก็ตาม")
        text = text.replace(
            "อย่างไรก็ตามหลังจาก", "<stop>อย่างไรก็ตาม<rth_langjak>"
        )
        text = text.replace("ซึ่งทำให้", "ซึ่ง<rth_tamhai>")
        text = text.replace("โดยประมาท", "<doi>ประมาท")
        text = text.replace("โดยธรรม", "<doi>ธรรม")
        text = text.replace("โดยสัจจริง", "<doi>สัจจริง")

        # --- Step 2a: token-level handling of "และ" (and).
        # For each occurrence: find the nearest non-adjacent space after
        # it; if that space is close (< 5 tokens away) turn the space
        # itself into "<stop>", otherwise insert "<stop>" before the
        # conjunction. An occurrence 3 tokens from the end splits at the
        # end instead.
        if "และ" in text:
            tokens = word_tokenize(text.strip(), keep_whitespace=True)
            and_position = -1
            nearest_space_position = -1
            last_position = len(tokens)
            pop_split_position = []  # space indices to replace by <stop>
            split_position = []  # indices to insert <stop> before
            for i in range(len(tokens)):
                if tokens[i] == "และ":
                    and_position = i

                if (
                    and_position != -1
                    and i > and_position
                    and tokens[i] == " "
                    and nearest_space_position == -1
                ):
                    if i - and_position != 1:
                        nearest_space_position = i

                if and_position != -1 and last_position - and_position == 3:
                    split_position.append(last_position)
                    and_position = -1
                    nearest_space_position = -1

                if nearest_space_position != -1:
                    if nearest_space_position - and_position < 5:
                        pop_split_position.append(nearest_space_position)
                    else:
                        split_position.append(and_position)
                    and_position = -1
                    nearest_space_position = -1
            for pop in pop_split_position:
                tokens.pop(pop)
                tokens.insert(pop, "<stop>")
            # NOTE(review): insert() shifts subsequent indices while the
            # recorded positions refer to the pre-insert token list —
            # confirm this matches the upstream behavior.
            for split in split_position:
                tokens.insert(split, "<stop>")
            text = list_to_string(tokens)

        # --- Step 2b: same scheme for "หรือ" (or), with a tighter
        # distance threshold (< 4 tokens).
        if "หรือ" in text:
            tokens = word_tokenize(text.strip(), keep_whitespace=True)
            or_position = -1
            nearest_space_position = -1
            last_position = len(tokens)
            pop_split_position = []
            split_position = []
            for i in range(len(tokens)):
                if tokens[i] == "หรือ":
                    or_position = i
                if (
                    or_position != -1
                    and i > or_position
                    and tokens[i] == " "
                    and nearest_space_position == -1
                ):
                    if i - or_position != 1:
                        nearest_space_position = i

                if or_position != -1 and last_position - or_position == 3:
                    split_position.append(last_position)
                    or_position = -1
                    nearest_space_position = -1

                if nearest_space_position != -1:
                    if nearest_space_position - or_position < 4:
                        pop_split_position.append(nearest_space_position)
                    else:
                        split_position.append(or_position)
                    or_position = -1
                    nearest_space_position = -1
            for pop in pop_split_position:
                tokens.pop(pop)
                tokens.insert(pop, "<stop>")
            for split in split_position:
                tokens.insert(split, "<stop>")
            text = list_to_string(tokens)

        # --- Step 2c: same scheme for "จึง" (therefore), with
        # end-distance 2 and distance threshold < 3.
        if "จึง" in text:
            tokens = word_tokenize(text.strip(), keep_whitespace=True)
            cung_position = -1
            nearest_space_position = -1
            pop_split_position = []
            last_position = len(tokens)
            split_position = []
            for i in range(len(tokens)):
                if tokens[i] == "จึง":
                    cung_position = i

                if (
                    cung_position != -1
                    and tokens[i] == " "
                    and i > cung_position
                    and nearest_space_position == -1
                ):
                    if i - cung_position != 1:
                        nearest_space_position = i

                if cung_position != -1 and last_position - cung_position == 2:
                    split_position.append(last_position)
                    cung_position = -1
                    nearest_space_position = -1

                if nearest_space_position != -1:
                    if nearest_space_position - cung_position < 3:
                        pop_split_position.append(nearest_space_position)
                    else:
                        split_position.append(cung_position)
                    cung_position = -1
                    nearest_space_position = -1

            for pop in pop_split_position:
                tokens.pop(pop)
                tokens.insert(pop, "<stop>")
            for split in split_position:
                tokens.insert(split, "<stop>")

            text = list_to_string(tokens)

        # --- Step 3: regex cue rules. \1, \2, ... are backreferences to
        # the captured groups in the pattern fragments declared above.
        text = re.sub(" " + th_stop_before, "<stop>\\1", text)
        text = re.sub(th_ka_krub, "\\1<stop>", text)
        text = re.sub(th_conjunction, "<stop>\\1", text)
        text = re.sub(th_cite, "\\1<stop>", text)
        # List-item patterns: " 1.นาย..." / " 12.นาย..." start a sentence.
        text = re.sub(" " + degit + "[.]" + th_title, "<stop>\\1.\\2", text)
        text = re.sub(
            " " + degit + degit + "[.]" + th_title, "<stop>\\1\\2.\\3", text
        )
        text = re.sub(th_alphabets + th_stop_after + " ", "\\1\\2<stop>", text)
        # Move sentence punctuation outside closing quotes.
        if "”" in text:
            text = text.replace(".”", "”.")
        if '"' in text:
            text = text.replace('."', '".')
        if "!" in text:
            text = text.replace('!"', '"!')
        if "?" in text:
            text = text.replace('?"', '"?')
        # --- Step 4: restore every placeholder from step 1.
        text = text.replace("<rth_Doeirew>", "โดยเร็ว")
        text = text.replace("<rth_friend>", "เพื่อน")
        text = text.replace("<rth_but>", "แต่ง")
        text = text.replace("<rth_passenger>", "โดยสาร")
        text = text.replace("<rth_leawtea>", "แล้วแต่")
        text = text.replace("<rth_repraw>", "หรือเปล่า")
        text = text.replace("<rth_remai>", "หรือไม่")
        text = text.replace("<rth_tanatorn_lastname>", "จึงรุ่งเรืองกิจ")
        text = text.replace("<rth_tangtea>", "ตั้งแต่")
        text = text.replace("<rth_teala>", "แต่ละ")
        text = text.replace("<rth_chiwitleaw>", "วิตแล้ว")
        text = text.replace("<rth_doipra>", "โดยประ")
        text = text.replace("<rth_tealangjaknan>", "แต่หลังจากนั้น")
        text = text.replace("<for_party>", "พรรคเพื่อ")
        text = text.replace("<rth_teaneung>", "แต่เนื่อง")
        text = text.replace("เพื่อ<rth_tamhai>", "เพื่อทำให้")
        text = text.replace("ทำ<rth_for>", "ทำเพื่อ")
        text = text.replace("จึง<tamhai>", "จึงทำให้")
        text = text.replace("<madoitalod>", "มาโดยตลอด")
        text = text.replace("แต่<langjak>", "แต่หลังจาก")
        text = text.replace("แต่<tangni>", "แต่ทั้งนี้")
        text = text.replace("มี<tea>", "มีแต่")
        text = text.replace("<teayangdaikptam>", "แต่อย่างใด")
        text = text.replace("<rth_kongtamhai>", "คงทำให้")
        text = text.replace("<hedteetamhai>", "เหตุที่ทำให้")
        text = text.replace("โดย<langjak>", "โดยหลังจาก")
        text = text.replace("ซึ่ง<langjak>", "ซึ่งหลังจาก")
        text = text.replace("<rth_tangdoi>", "ตั้งโดย")
        text = text.replace("<rth_doitong>", "โดยตรง")
        text = text.replace("<rth_nanhlor>", "นั้นหรือ")
        text = text.replace("ซึ่งต้อง<tamhai>", "ซึ่งต้องทำให้")
        text = text.replace("ชื่อ<tomar>", "ชื่อต่อมา")
        text = text.replace("<doi>เร่งด่วน", "โดยเร่งด่วน")
        text = text.replace("ไม่ได้<tamhai>", "ไม่ได้ทำให้")
        text = text.replace("จะ<tamhai>", "จะทำให้")
        text = text.replace("จน<tamhai>", "จนทำให้")
        text = text.replace("เว้น<rth_tea>", "เว้นแต่")
        text = text.replace("ก็<tamhai>", "ก็ทำให้")
        text = text.replace(" ณ <tonnan>", " ณ ตอนนั้น")
        text = text.replace("บาง<rth_suan>", "บางส่วน")
        text = text.replace("หรือ<rth_meatea>", "หรือแม้แต่")
        text = text.replace("โดย<tamhai>", "โดยทำให้")
        text = text.replace("หรือ<rth_orbecause>", "หรือเพราะ")
        text = text.replace("มา<rth_tea>", "มาแต่")
        text = text.replace("แต่<maitamhai>", "แต่ไม่ทำให้")
        text = text.replace("ฉะนั้น<rth_moe>", "ฉะนั้นเมื่อ")
        text = text.replace("เพราะ<rth_chanan>", "เพราะฉะนั้น")
        text = text.replace("เพราะ<rth_langjak>", "เพราะหลังจาก")
        text = text.replace("สามารถ<rth_tamhai>", "สามารถทำให้")
        text = text.replace("อาจ<rth_tam>", "อาจทำ")
        text = text.replace("จะ<rth_tam>", "จะทำ")
        text = text.replace("อีกทั้ง<rth_for>", "อีกทั้งเพื่อ")
        text = text.replace("ทั้งนี้<rth_for>", "ทั้งนี้เพื่อ")
        text = text.replace("เวลา<rth_toma>", "เวลาต่อมา")
        text = text.replace(
            "อย่างไรก็ตาม<rth_langjak>",
            "อย่างไรก็ตามหลังจาก",
        )
        text = text.replace("ซึ่ง<rth_tamhai>", "ซึ่งทำให้")
        text = text.replace("<doi>ประมาท", "โดยประมาท")
        text = text.replace("<doi>ธรรม", "โดยธรรม")
        text = text.replace("<doi>สัจจริง", "โดยสัจจริง")
        # --- Step 5: break on terminal punctuation, then split.
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        # NOTE(review): nothing in this method produces "<prd>"; this
        # restore looks like a leftover from an English-style period
        # protector in the original ThaiSum code — verify upstream.
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = [s.strip() for s in sentences]
        # list.remove() drops only the first occurrence; the filter()
        # below clears any remaining empty strings anyway.
        if "" in sentences:
            sentences.remove("")
        if "nan" in sentences:
            sentences.remove("nan")

        sentences = list(filter(None, sentences))

        if isMiddleCut:
            return middle_cut(sentences)
        else:
            return sentences
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc