# /pythainlp/phayathaibert/core.py
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0

import random
import re
import warnings
from typing import Callable, List, Tuple, Union

from transformers import (
    CamembertTokenizer,
)

from pythainlp.tokenize import word_tokenize

_PAT_URL = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"

_model_name = "clicknext/phayathaibert"
_tokenizer = CamembertTokenizer.from_pretrained(_model_name)
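
# A quick check of the module-level pieces above (a sketch, not part of the
# original file): `_PAT_URL` is the URL regex used by
# `ThaiTextProcessor.replace_url`, and `_tokenizer` is the subword tokenizer
# wrapped by `segment()` at the bottom of this module.
#
#     >>> bool(re.match(_PAT_URL, "https://pythainlp.org/docs"))
#     True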


class ThaiTextProcessor:
    def __init__(self):
        # Special tokens: unknown, character repetition, word repetition,
        # URL placeholder, and end-of-sequence marker.
        (
            self._TK_UNK,
            self._TK_REP,
            self._TK_WREP,
            self._TK_URL,
            self._TK_END,
        ) = "<unk> <rep> <wrep> <url> </s>".split()
        self.SPACE_SPECIAL_TOKEN = "<_>"

    def replace_url(self, text: str) -> str:
        """
        Replace URLs in `text` with TK_URL (https://stackoverflow.com/a/6041965)
        :param str text: text in which to replace URLs
        :return: text where URLs are replaced
        :rtype: str
        :Example:
            >>> replace_url("go to https://github.com")
            go to <url>
        """
        return re.sub(_PAT_URL, self._TK_URL, text)

    def rm_brackets(self, text: str) -> str:
        """
        Remove all empty brackets and artifacts within brackets from `text`.
        :param str text: text from which to remove useless brackets
        :return: text where all useless brackets are removed
        :rtype: str
        :Example:
            >>> rm_brackets("hey() whats[;] up{*&} man(hey)")
            hey whats up man(hey)
        """
        # remove empty brackets
        new_line = re.sub(r"\(\)", "", text)
        new_line = re.sub(r"\{\}", "", new_line)
        new_line = re.sub(r"\[\]", "", new_line)
        # brackets containing only punctuation
        new_line = re.sub(r"\([^a-zA-Z0-9ก-๙]+\)", "", new_line)
        new_line = re.sub(r"\{[^a-zA-Z0-9ก-๙]+\}", "", new_line)
        new_line = re.sub(r"\[[^a-zA-Z0-9ก-๙]+\]", "", new_line)
        # artifacts after (
        new_line = re.sub(
            r"(?<=\()[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line
        )
        new_line = re.sub(
            r"(?<=\{)[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line
        )
        new_line = re.sub(
            r"(?<=\[)[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line
        )
        # artifacts before )
        new_line = re.sub(
            r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\))", "", new_line
        )
        new_line = re.sub(
            r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\})", "", new_line
        )
        new_line = re.sub(
            r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\])", "", new_line
        )
        return new_line

    def replace_newlines(self, text: str) -> str:
        """
        Replace newlines in `text` with spaces.
        :param str text: text in which to replace all newlines with spaces
        :return: text where all newlines are replaced with spaces
        :rtype: str
        :Example:
            >>> replace_newlines("hey whats\n\nup")
            hey whats  up
        """
        return re.sub(r"[\n]", " ", text.strip())

    def rm_useless_spaces(self, text: str) -> str:
        """
        Reduce runs of multiple spaces in `text` to single spaces. (code from `fastai`)
        :param str text: text from which to remove useless spaces
        :return: text where every run of spaces is reduced to one
        :rtype: str
        :Example:
            >>> rm_useless_spaces("oh         no")
            oh no
        """
        return re.sub(" {2,}", " ", text)

    def replace_spaces(self, text: str, space_token: str = "<_>") -> str:
        """
        Replace each space in `text` with `space_token` (default: "<_>").
        :param str text: text in which to replace spaces
        :param str space_token: token to substitute for each space
        :return: text where all spaces are replaced with `space_token`
        :rtype: str
        :Example:
            >>> replace_spaces("oh no")
            oh<_>no
        """
        return re.sub(" ", space_token, text)

    def replace_rep_after(self, text: str) -> str:
        """
        Collapse character-level repetitions in `text`;
        a character repeated four or more times is reduced to one.
        :param str text: input text in which to collapse character repetition
        :return: text with repeated characters collapsed
        :rtype: str
        :Example:
            >>> text = "กาาาาาาา"
            >>> replace_rep_after(text)
            'กา'
        """

        def _replace_rep(m):
            # keep the first character, drop the repetition
            c, _ = m.groups()
            return f"{c}"

        re_rep = re.compile(r"(\S)(\1{3,})")
        return re_rep.sub(_replace_rep, text)

    def replace_wrep_post(self, toks: List[str]) -> List[str]:
        """
        Remove consecutive repeated words after tokenization;
        fastai's `replace_wrep` does not work well with Thai.
        :param List[str] toks: list of tokens
        :return: list of tokens with consecutive repeated words collapsed
        :rtype: List[str]
        :Example:
            >>> toks = ["กา", "น้ำ", "น้ำ", "น้ำ", "น้ำ"]
            >>> replace_wrep_post(toks)
            ['กา', 'น้ำ']
        """
        previous_word = ""
        rep_count = 0
        res = []
        # The end token acts as a sentinel so the final run is flushed.
        for current_word in toks + [self._TK_END]:
            if current_word == previous_word:
                rep_count += 1
            elif (current_word != previous_word) and (rep_count > 0):
                res += [previous_word]
                rep_count = 0
            else:
                res.append(previous_word)
            previous_word = current_word

        # Drop the leading "" placeholder from previous_word's initial value.
        return res[1:]

    def remove_space(self, toks: List[str]) -> List[str]:
        """
        Filter out space tokens, e.g. for bag-of-words models.
        :param List[str] toks: list of tokens
        :return: list of tokens with space tokens (" ") filtered out
        :rtype: List[str]
        :Example:
            >>> toks = ["ฉัน", "เดิน", " ", "กลับ", "บ้าน"]
            >>> remove_space(toks)
            ['ฉัน', 'เดิน', 'กลับ', 'บ้าน']
        """
        res = []
        for t in toks:
            t = t.strip()
            if t:
                res.append(t)

        return res

    # combine them together
    def preprocess(
        self,
        text: str,
        pre_rules: List[Callable] = [
            rm_brackets,
            replace_newlines,
            rm_useless_spaces,
            replace_spaces,
            replace_rep_after,
        ],
        tok_func: Callable = word_tokenize,
    ) -> str:
        text = text.lower()
        for rule in pre_rules:
            # The default pre_rules are the unbound methods defined above,
            # so each rule must be called with `self` passed explicitly.
            text = rule(self, text)
        toks = tok_func(text)

        return "".join(toks)
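
# A usage sketch for the full pipeline (not part of the original module; the
# output assumes the default `newmm` word tokenizer, which preserves every
# character, so the final join reproduces the preprocessed string):
#
#     >>> proc = ThaiTextProcessor()
#     >>> proc.preprocess("Hello()   world\nครับบบบ")
#     'hello<_>world<_>ครับ'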


class ThaiTextAugmenter:
    def __init__(self) -> None:
        from transformers import (
            AutoModelForMaskedLM,
            AutoTokenizer,
            pipeline,
        )

        self.tokenizer = AutoTokenizer.from_pretrained(_model_name)
        self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(
            _model_name
        )
        self.model = pipeline(
            "fill-mask",
            tokenizer=self.tokenizer,
            model=self.model_for_masked_lm,
        )
        self.processor = ThaiTextProcessor()

    def generate(
        self,
        sample_text: str,
        word_rank: int,
        max_length: int = 3,
        sample: bool = False,
    ) -> str:
        # The fill-mask pipeline needs a <mask> token in its input,
        # so one is appended before the first pass.
        sample_txt = sample_text + "<mask>"
        final_text = ""
        for _ in range(max_length):
            input_text = self.processor.preprocess(sample_txt)
            if sample:
                # pick one of the top-5 predictions at random
                random_word_idx = random.randint(0, 4)
                output = self.model(input_text)[random_word_idx]["sequence"]
            else:
                output = self.model(input_text)[word_rank]["sequence"]
            sample_txt = output + "<mask>"
            final_text = sample_txt

        gen_txt = re.sub("<mask>", "", final_text)

        return gen_txt
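
# A usage sketch for generate() (not part of the original module; running it
# downloads the clicknext/phayathaibert weights): each of the `max_length`
# steps appends the fill-mask prediction at `word_rank`, or a random top-5
# prediction when sample=True.
#
#     >>> aug = ThaiTextAugmenter()
#     >>> aug.generate("ช้างมีทั้งหมด 50 ตัว บน", word_rank=0)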

    def augment(
        self,
        text: str,
        num_augs: int = 3,
        sample: bool = False,
    ) -> List[str]:
        """
        Text augmentation from PhayaThaiBERT

        :param str text: Thai text
        :param int num_augs: number of augmented texts to generate (maximum 5)
        :param bool sample: whether to sample the text as an output or not,\
              true if more word diversity is needed

        :return: list of augmented texts
        :rtype: List[str]

        :Example:
        ::

            from pythainlp.augment.lm import ThaiTextAugmenter

            aug = ThaiTextAugmenter()
            aug.augment("ช้างมีทั้งหมด 50 ตัว บน", num_augs=5)

            # output = ['ช้างมีทั้งหมด 50 ตัว บนโลกใบนี้ครับ.',
            #     'ช้างมีทั้งหมด 50 ตัว บนพื้นดินครับ...',
            #     'ช้างมีทั้งหมด 50 ตัว บนท้องฟ้าครับ...',
            #     'ช้างมีทั้งหมด 50 ตัว บนดวงจันทร์.‼',
            #     'ช้างมีทั้งหมด 50 ตัว บนเขาค่ะ😁']
        """
        MAX_NUM_AUGS = 5
        augment_list = []

        if num_augs <= MAX_NUM_AUGS:
            for rank in range(num_augs):
                gen_text = self.generate(
                    text,
                    rank,
                    sample=sample,
                )
                processed_text = re.sub(
                    "<_>", " ", self.processor.preprocess(gen_text)
                )
                augment_list.append(processed_text)
        else:
            raise ValueError(
                f"num_augs ({num_augs}) exceeds "
                f"the maximum of {MAX_NUM_AUGS} augmentations"
            )

        return augment_list


class PartOfSpeechTagger:
    def __init__(self, model: str = "lunarlist/pos_thai_phayathai") -> None:
        # Load model directly
        from transformers import (
            AutoModelForTokenClassification,
            AutoTokenizer,
        )

        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModelForTokenClassification.from_pretrained(model)

    def get_tag(
        self, sentence: str, strategy: str = "simple"
    ) -> List[List[Tuple[str, str]]]:
        """
        Marks sentences with part-of-speech (POS) tags.

        :param str sentence: a sentence in Thai to be tagged
        :return: a list of lists of tuples (word, POS tag)
        :rtype: list[list[tuple[str, str]]]

        :Example:

        Labels POS for given sentence::

            from pythainlp.phayathaibert.core import PartOfSpeechTagger

            tagger = PartOfSpeechTagger()
            tagger.get_tag("แมวทำอะไรตอนห้าโมงเช้า")
            # output:
            # [[('แมว', 'NOUN'), ('ทําอะไร', 'VERB'), ('ตอนห้าโมงเช้า', 'NOUN')]]
        """
        from transformers import TokenClassificationPipeline

        pipeline = TokenClassificationPipeline(
            model=self.model,
            tokenizer=self.tokenizer,
            aggregation_strategy=strategy,
        )
        outputs = pipeline(sentence)
        word_tags = [[(tag["word"], tag["entity_group"]) for tag in outputs]]

        return word_tags


class NamedEntityTagger:
    def __init__(self, model: str = "Pavarissy/phayathaibert-thainer") -> None:
        from transformers import (
            AutoModelForTokenClassification,
            AutoTokenizer,
        )

        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModelForTokenClassification.from_pretrained(model)

    def get_ner(
        self,
        text: str,
        tag: bool = False,
        pos: bool = False,
        strategy: str = "simple",
    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
        """
        This function tags named entities in text in IOB format.

        :param str text: text in Thai to be tagged
        :param bool pos: output with part-of-speech tags \
            (not supported by this model; use PartOfSpeechTagger instead)
        :return: a list of tuples associated with tokenized words, NER tags,
                 and POS tags (if the parameter `pos` is specified as `True`),
                 or an HTML-like tagged string (if the parameter `tag` is
                 specified as `True`).
                 Otherwise, return a list of tuples associated with tokenized
                 words and NER tags
        :rtype: Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]
        :Example:

            >>> from pythainlp.phayathaibert.core import NamedEntityTagger
            >>>
            >>> tagger = NamedEntityTagger()
            >>> tagger.get_ner("ทดสอบนายปวริศ เรืองจุติโพธิ์พานจากประเทศไทย")
            [('นายปวริศ เรืองจุติโพธิ์พาน', 'PERSON'),
            ('จาก', 'LOCATION'),
            ('ประเทศไทย', 'LOCATION')]
            >>> tagger.get_ner("ทดสอบนายปวริศ เรืองจุติโพธิ์พานจากประเทศไทย", tag=True)
            'ทดสอบ<PERSON>นายปวริศ เรืองจุติโพธิ์พาน</PERSON>\
                <LOCATION>จาก</LOCATION><LOCATION>ประเทศไทย</LOCATION>'
        """
        from transformers import TokenClassificationPipeline

        if pos:
            warnings.warn(
                "This model does not support POS tagging; "
                "no POS tags will be output."
            )

        sample_output = []
        tag_text_list = []
        current_pos = 0
        pipeline = TokenClassificationPipeline(
            model=self.model,
            tokenizer=self.tokenizer,
            aggregation_strategy=strategy,
        )
        outputs = pipeline(text)

        for token in outputs:
            ner_tag = token["entity_group"]
            begin_pos, end_pos = token["start"], token["end"]
            # Keep the untagged text since the previous entity, then wrap
            # the entity span in HTML-like tags.
            text_tag = (
                text[current_pos:begin_pos]
                + f"<{ner_tag}>"
                + text[begin_pos:end_pos]
                + f"</{ner_tag}>"
            )
            tag_text_list.append(text_tag)
            sample_output.append((token["word"], token["entity_group"]))
            current_pos = end_pos

        if tag:
            return "".join(tag_text_list)

        return sample_output


def segment(sentence: str) -> List[str]:
    """
    Subword tokenization of PhayaThaiBERT, \
    using the sentencepiece tokenizer from the WangchanBERTa model
    with vocabulary expansion.

    :param str sentence: text to be tokenized
    :return: list of subwords
    :rtype: list[str]
    """
    if not sentence or not isinstance(sentence, str):
        return []

    return _tokenizer.tokenize(sentence)
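
# A usage sketch (not part of the original module): the exact subword pieces
# depend on the pretrained sentencepiece vocabulary, so the output below is
# illustrative only ("▁" marks a word boundary).
#
#     >>> segment("ทดสอบการตัดคำ")
#     ['▁ทดสอบ', 'การ', 'ตัด', 'คำ']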