• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

PyThaiNLP / pythainlp / 11626163864

01 Nov 2024 07:49AM UTC coverage: 14.17% (+14.2%) from 0.0%
11626163864

Pull #952

github

web-flow
Merge 8f2551bc9 into 89ea62ebc
Pull Request #952: Specify a limited test suite

44 of 80 new or added lines in 48 files covered. (55.0%)

1048 of 7396 relevant lines covered (14.17%)

0.14 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/pythainlp/augment/lm/phayathaibert.py
1
# -*- coding: utf-8 -*-
2
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3
# SPDX-License-Identifier: Apache-2.0
4

5
import random
×
6
import re
×
NEW
7
from typing import List
×
8

9
from pythainlp.phayathaibert.core import ThaiTextProcessor
×
10

11
# Model identifier passed to the transformers ``from_pretrained`` loaders
# in ``ThaiTextAugmenter.__init__``.
_MODEL_NAME = "clicknext/phayathaibert"
×
12

13

14
class ThaiTextAugmenter:
    """
    Text augmentation by repeatedly filling a trailing ``<mask>`` token
    with a fill-mask pipeline built from the PhayaThaiBERT checkpoint.
    """

    def __init__(self) -> None:
        """Load the tokenizer, the masked-LM model and the text processor."""
        # Imported here rather than at module level, so transformers is
        # only needed once an augmenter is actually created.
        from transformers import (
            AutoModelForMaskedLM,
            AutoTokenizer,
            pipeline,
        )

        self.tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
        self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(
            _MODEL_NAME
        )
        self.model = pipeline(
            "fill-mask",
            tokenizer=self.tokenizer,
            model=self.model_for_masked_lm,
        )
        self.processor = ThaiTextProcessor()

    def generate(
        self,
        sample_text: str,
        word_rank: int,
        max_length: int = 3,
        sample: bool = False,
    ) -> str:
        """
        Extend a text by filling its mask ``max_length`` times in a row.

        :param str sample_text: text containing a ``<mask>`` token
        :param int word_rank: index of the fill-mask candidate to take at
                              every step (ignored when ``sample`` is true)
        :param int max_length: number of fill-mask steps to perform
        :param bool sample: pick a random candidate index between 0 and 4
                            at every step instead of using ``word_rank``

        :return: the extended text with every ``<mask>`` removed; an empty
                 string when ``max_length`` is not positive
        :rtype: str
        """
        current_text = sample_text
        final_text = ""

        for _ in range(max_length):
            model_input = self.processor.preprocess(current_text)
            # NOTE(review): 0..4 presumably matches the number of candidates
            # the fill-mask pipeline returns by default -- confirm.
            rank = random.randint(0, 4) if sample else word_rank
            filled = self.model(model_input)[rank]["sequence"]
            # Append a fresh mask so the next step extends the text further.
            current_text = filled + "<mask>"
            final_text = current_text

        return final_text.replace("<mask>", "")

    def augment(
        self, text: str, num_augs: int = 3, sample: bool = False
    ) -> List[str]:
        """
        Text augmentation from PhayaThaiBERT

        :param str text: Thai text
        :param int num_augs: an amount of augmentation text needed as an output
        :param bool sample: whether to sample the text as an output or not,
                            true if more word diversity is needed

        :return: list of text augment
        :rtype: List[str]
        :raises ValueError: if ``num_augs`` is greater than 5

        :Example:
        ::

            from pythainlp.augment.lm import ThaiTextAugmenter

            aug = ThaiTextAugmenter()
            aug.augment("ช้างมีทั้งหมด 50 ตัว บน", num_augs=5)

            # output = ['ช้างมีทั้งหมด 50 ตัว บนโลกใบนี้ครับ.',
            #     'ช้างมีทั้งหมด 50 ตัว บนพื้นดินครับ...',
            #     'ช้างมีทั้งหมด 50 ตัว บนท้องฟ้าครับ...',
            #     'ช้างมีทั้งหมด 50 ตัว บนดวงจันทร์.‼',
            #     'ช้างมีทั้งหมด 50 ตัว บนเขาค่ะ😁']
        """
        MAX_NUM_AUGS = 5

        # Reject oversized requests before doing any model work.
        if num_augs > MAX_NUM_AUGS:
            raise ValueError(
                f"number of augmentations requested ({num_augs}) exceeds "
                f"the default limit: {MAX_NUM_AUGS}"
            )

        # Without a mask there is nothing to fill, so extend at the end.
        if "<mask>" not in text:
            text = text + "<mask>"

        augment_list = []
        for rank in range(num_augs):
            gen_text = self.generate(text, rank, sample=sample)
            # The processor output uses "<_>" where a plain space is wanted.
            processed_text = self.processor.preprocess(gen_text).replace(
                "<_>", " "
            )
            augment_list.append(processed_text)

        return augment_list
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc