# /pythainlp/phayathaibert/core.py
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0

import random
import re
import warnings
from typing import Callable, List, Tuple, Union

from transformers import (
    CamembertTokenizer,
)

from pythainlp.tokenize import word_tokenize

_PAT_URL = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"

_model_name = "clicknext/phayathaibert"
_tokenizer = CamembertTokenizer.from_pretrained(_model_name)
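
# A quick check of the module-level pieces above (a sketch, not part of the
# original file): `_PAT_URL` is the URL regex used by
# `ThaiTextProcessor.replace_url`, and `_tokenizer` is the subword tokenizer
# wrapped by `segment()` at the bottom of this module.
#
#     >>> bool(re.match(_PAT_URL, "https://pythainlp.org/docs"))
#     True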


class ThaiTextProcessor:
    def __init__(self):
        # Special tokens: unknown, character repetition, word repetition,
        # URL placeholder, and end-of-sequence marker.
        (
            self._TK_UNK,
            self._TK_REP,
            self._TK_WREP,
            self._TK_URL,
            self._TK_END,
        ) = "<unk> <rep> <wrep> <url> </s>".split()
        self.SPACE_SPECIAL_TOKEN = "<_>"

    def replace_url(self, text: str) -> str:
        """
        Replace URLs in `text` with TK_URL (https://stackoverflow.com/a/6041965)
        :param str text: text in which to replace URLs
        :return: text where URLs are replaced
        :rtype: str
        :Example:
            >>> replace_url("go to https://github.com")
            go to <url>
        """
        return re.sub(_PAT_URL, self._TK_URL, text)

    def rm_brackets(self, text: str) -> str:
        """
        Remove all empty brackets and artifacts within brackets from `text`.
        :param str text: text from which to remove useless brackets
        :return: text where all useless brackets are removed
        :rtype: str
        :Example:
            >>> rm_brackets("hey() whats[;] up{*&} man(hey)")
            hey whats up man(hey)
        """
        # remove empty brackets
        new_line = re.sub(r"\(\)", "", text)
        new_line = re.sub(r"\{\}", "", new_line)
        new_line = re.sub(r"\[\]", "", new_line)
        # brackets containing only punctuation
        new_line = re.sub(r"\([^a-zA-Z0-9ก-๙]+\)", "", new_line)
        new_line = re.sub(r"\{[^a-zA-Z0-9ก-๙]+\}", "", new_line)
        new_line = re.sub(r"\[[^a-zA-Z0-9ก-๙]+\]", "", new_line)
        # artifacts after (
        new_line = re.sub(
            r"(?<=\()[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line
        )
        new_line = re.sub(
            r"(?<=\{)[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line
        )
        new_line = re.sub(
            r"(?<=\[)[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line
        )
        # artifacts before )
        new_line = re.sub(
            r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\))", "", new_line
        )
        new_line = re.sub(
            r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\})", "", new_line
        )
        new_line = re.sub(
            r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\])", "", new_line
        )
        return new_line

    def replace_newlines(self, text: str) -> str:
        """
        Replace newlines in `text` with spaces.
        :param str text: text in which to replace all newlines with spaces
        :return: text where all newlines are replaced with spaces
        :rtype: str
        :Example:
            >>> replace_newlines("hey whats\n\nup")
            hey whats  up
        """
        return re.sub(r"[\n]", " ", text.strip())

    def rm_useless_spaces(self, text: str) -> str:
        """
        Reduce runs of multiple spaces in `text` to single spaces. (code from `fastai`)
        :param str text: text from which to remove useless spaces
        :return: text where every run of spaces is reduced to one
        :rtype: str
        :Example:
            >>> rm_useless_spaces("oh         no")
            oh no
        """
        return re.sub(" {2,}", " ", text)

    def replace_spaces(self, text: str, space_token: str = "<_>") -> str:
        """
        Replace each space in `text` with `space_token` (default: "<_>").
        :param str text: text in which to replace spaces
        :param str space_token: token to substitute for each space
        :return: text where all spaces are replaced with `space_token`
        :rtype: str
        :Example:
            >>> replace_spaces("oh no")
            oh<_>no
        """
        return re.sub(" ", space_token, text)

    def replace_rep_after(self, text: str) -> str:
        """
        Collapse character-level repetitions in `text`;
        a character repeated four or more times is reduced to one.
        :param str text: input text in which to collapse character repetition
        :return: text with repeated characters collapsed
        :rtype: str
        :Example:
            >>> text = "กาาาาาาา"
            >>> replace_rep_after(text)
            'กา'
        """

        def _replace_rep(m):
            # keep the first character, drop the repetition
            c, _ = m.groups()
            return f"{c}"

        re_rep = re.compile(r"(\S)(\1{3,})")
        return re_rep.sub(_replace_rep, text)

    def replace_wrep_post(self, toks: List[str]) -> List[str]:
        """
        Remove consecutive repeated words after tokenization;
        fastai's `replace_wrep` does not work well with Thai.
        :param List[str] toks: list of tokens
        :return: list of tokens with consecutive repeated words collapsed
        :rtype: List[str]
        :Example:
            >>> toks = ["กา", "น้ำ", "น้ำ", "น้ำ", "น้ำ"]
            >>> replace_wrep_post(toks)
            ['กา', 'น้ำ']
        """
        previous_word = ""
        rep_count = 0
        res = []
        # The end token acts as a sentinel so the final run is flushed.
        for current_word in toks + [self._TK_END]:
            if current_word == previous_word:
                rep_count += 1
            elif (current_word != previous_word) and (rep_count > 0):
                res += [previous_word]
                rep_count = 0
            else:
                res.append(previous_word)
            previous_word = current_word

        # Drop the leading "" placeholder from previous_word's initial value.
        return res[1:]

    def remove_space(self, toks: List[str]) -> List[str]:
        """
        Filter out space tokens, e.g. for bag-of-words models.
        :param List[str] toks: list of tokens
        :return: list of tokens with space tokens (" ") filtered out
        :rtype: List[str]
        :Example:
            >>> toks = ["ฉัน", "เดิน", " ", "กลับ", "บ้าน"]
            >>> remove_space(toks)
            ['ฉัน', 'เดิน', 'กลับ', 'บ้าน']
        """
        res = []
        for t in toks:
            t = t.strip()
            if t:
                res.append(t)

        return res

    # combine them together
    def preprocess(
        self,
        text: str,
        pre_rules: List[Callable] = [
            rm_brackets,
            replace_newlines,
            rm_useless_spaces,
            replace_spaces,
            replace_rep_after,
        ],
        tok_func: Callable = word_tokenize,
    ) -> str:
        text = text.lower()
        for rule in pre_rules:
            # The default pre_rules are the unbound methods defined above,
            # so each rule must be called with `self` passed explicitly.
            text = rule(self, text)
        toks = tok_func(text)

        return "".join(toks)
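
# A usage sketch for the full pipeline (not part of the original module; the
# output assumes the default `newmm` word tokenizer, which preserves every
# character, so the final join reproduces the preprocessed string):
#
#     >>> proc = ThaiTextProcessor()
#     >>> proc.preprocess("Hello()   world\nครับบบบ")
#     'hello<_>world<_>ครับ'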


class ThaiTextAugmenter:
    def __init__(self) -> None:
        from transformers import (
            AutoModelForMaskedLM,
            AutoTokenizer,
            pipeline,
        )

        self.tokenizer = AutoTokenizer.from_pretrained(_model_name)
        self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(
            _model_name
        )
        self.model = pipeline(
            "fill-mask",
            tokenizer=self.tokenizer,
            model=self.model_for_masked_lm,
        )
        self.processor = ThaiTextProcessor()

    def generate(
        self,
        sample_text: str,
        word_rank: int,
        max_length: int = 3,
        sample: bool = False,
    ) -> str:
        # The fill-mask pipeline needs a <mask> token in its input,
        # so one is appended before the first pass.
        sample_txt = sample_text + "<mask>"
        final_text = ""
        for _ in range(max_length):
            input_text = self.processor.preprocess(sample_txt)
            if sample:
                # pick one of the top-5 predictions at random
                random_word_idx = random.randint(0, 4)
                output = self.model(input_text)[random_word_idx]["sequence"]
            else:
                output = self.model(input_text)[word_rank]["sequence"]
            sample_txt = output + "<mask>"
            final_text = sample_txt

        gen_txt = re.sub("<mask>", "", final_text)

        return gen_txt
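
# A usage sketch for generate() (not part of the original module; running it
# downloads the clicknext/phayathaibert weights): each of the `max_length`
# steps appends the fill-mask prediction at `word_rank`, or a random top-5
# prediction when sample=True.
#
#     >>> aug = ThaiTextAugmenter()
#     >>> aug.generate("ช้างมีทั้งหมด 50 ตัว บน", word_rank=0)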

    def augment(
        self,
        text: str,
        num_augs: int = 3,
        sample: bool = False,
    ) -> List[str]:
        """
        Text augmentation from PhayaThaiBERT

        :param str text: Thai text
        :param int num_augs: number of augmented texts to generate (maximum 5)
        :param bool sample: whether to sample the text as an output or not,\
              true if more word diversity is needed

        :return: list of augmented texts
        :rtype: List[str]

        :Example:
        ::

            from pythainlp.augment.lm import ThaiTextAugmenter

            aug = ThaiTextAugmenter()
            aug.augment("ช้างมีทั้งหมด 50 ตัว บน", num_augs=5)

            # output = ['ช้างมีทั้งหมด 50 ตัว บนโลกใบนี้ครับ.',
            #     'ช้างมีทั้งหมด 50 ตัว บนพื้นดินครับ...',
            #     'ช้างมีทั้งหมด 50 ตัว บนท้องฟ้าครับ...',
            #     'ช้างมีทั้งหมด 50 ตัว บนดวงจันทร์.‼',
            #     'ช้างมีทั้งหมด 50 ตัว บนเขาค่ะ😁']
        """
        MAX_NUM_AUGS = 5
        augment_list = []

        if num_augs <= MAX_NUM_AUGS:
            for rank in range(num_augs):
                gen_text = self.generate(
                    text,
                    rank,
                    sample=sample,
                )
                processed_text = re.sub(
                    "<_>", " ", self.processor.preprocess(gen_text)
                )
                augment_list.append(processed_text)
        else:
            raise ValueError(
                f"num_augs ({num_augs}) exceeds "
                f"the maximum of {MAX_NUM_AUGS} augmentations"
            )

        return augment_list


class PartOfSpeechTagger:
    def __init__(self, model: str = "lunarlist/pos_thai_phayathai") -> None:
        # Load model directly
        from transformers import (
            AutoModelForTokenClassification,
            AutoTokenizer,
        )

        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModelForTokenClassification.from_pretrained(model)

    def get_tag(
        self, sentence: str, strategy: str = "simple"
    ) -> List[List[Tuple[str, str]]]:
        """
        Marks sentences with part-of-speech (POS) tags.

        :param str sentence: a sentence in Thai to be tagged
        :return: a list of lists of tuples (word, POS tag)
        :rtype: list[list[tuple[str, str]]]

        :Example:

        Labels POS for given sentence::

            from pythainlp.phayathaibert.core import PartOfSpeechTagger

            tagger = PartOfSpeechTagger()
            tagger.get_tag("แมวทำอะไรตอนห้าโมงเช้า")
            # output:
            # [[('แมว', 'NOUN'), ('ทําอะไร', 'VERB'), ('ตอนห้าโมงเช้า', 'NOUN')]]
        """
        from transformers import TokenClassificationPipeline

        pipeline = TokenClassificationPipeline(
            model=self.model,
            tokenizer=self.tokenizer,
            aggregation_strategy=strategy,
        )
        outputs = pipeline(sentence)
        word_tags = [[(tag["word"], tag["entity_group"]) for tag in outputs]]

        return word_tags


class NamedEntityTagger:
    def __init__(self, model: str = "Pavarissy/phayathaibert-thainer") -> None:
        from transformers import (
            AutoModelForTokenClassification,
            AutoTokenizer,
        )

        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModelForTokenClassification.from_pretrained(model)

    def get_ner(
        self,
        text: str,
        tag: bool = False,
        pos: bool = False,
        strategy: str = "simple",
    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
        """
        This function tags named entities in text in IOB format.

        :param str text: text in Thai to be tagged
        :param bool pos: output with part-of-speech tags \
            (not supported by this model; use PartOfSpeechTagger instead)
        :return: a list of tuples associated with tokenized words, NER tags,
                 and POS tags (if the parameter `pos` is specified as `True`),
                 or an HTML-like tagged string (if the parameter `tag` is
                 specified as `True`).
                 Otherwise, return a list of tuples associated with tokenized
                 words and NER tags
        :rtype: Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]
        :Example:

            >>> from pythainlp.phayathaibert.core import NamedEntityTagger
            >>>
            >>> tagger = NamedEntityTagger()
            >>> tagger.get_ner("ทดสอบนายปวริศ เรืองจุติโพธิ์พานจากประเทศไทย")
            [('นายปวริศ เรืองจุติโพธิ์พาน', 'PERSON'),
            ('จาก', 'LOCATION'),
            ('ประเทศไทย', 'LOCATION')]
            >>> tagger.get_ner("ทดสอบนายปวริศ เรืองจุติโพธิ์พานจากประเทศไทย", tag=True)
            'ทดสอบ<PERSON>นายปวริศ เรืองจุติโพธิ์พาน</PERSON>\
                <LOCATION>จาก</LOCATION><LOCATION>ประเทศไทย</LOCATION>'
        """
        from transformers import TokenClassificationPipeline

        if pos:
            warnings.warn(
                "This model does not support POS tagging; "
                "no POS tags will be output."
            )

        sample_output = []
        tag_text_list = []
        current_pos = 0
        pipeline = TokenClassificationPipeline(
            model=self.model,
            tokenizer=self.tokenizer,
            aggregation_strategy=strategy,
        )
        outputs = pipeline(text)

        for token in outputs:
            ner_tag = token["entity_group"]
            begin_pos, end_pos = token["start"], token["end"]
            # Keep the untagged text since the previous entity, then wrap
            # the entity span in HTML-like tags.
            text_tag = (
                text[current_pos:begin_pos]
                + f"<{ner_tag}>"
                + text[begin_pos:end_pos]
                + f"</{ner_tag}>"
            )
            tag_text_list.append(text_tag)
            sample_output.append((token["word"], token["entity_group"]))
            current_pos = end_pos

        if tag:
            return "".join(tag_text_list)

        return sample_output


def segment(sentence: str) -> List[str]:
    """
    Subword tokenization of PhayaThaiBERT, \
    using the sentencepiece tokenizer from the WangchanBERTa model
    with vocabulary expansion.

    :param str sentence: text to be tokenized
    :return: list of subwords
    :rtype: list[str]
    """
    if not sentence or not isinstance(sentence, str):
        return []

    return _tokenizer.tokenize(sentence)
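
# A usage sketch (not part of the original module): the exact subword pieces
# depend on the pretrained sentencepiece vocabulary, so the output below is
# illustrative only ("▁" marks a word boundary).
#
#     >>> segment("ทดสอบการตัดคำ")
#     ['▁ทดสอบ', 'การ', 'ตัด', 'คำ']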