• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

PyThaiNLP / pythainlp / 18284629720

06 Oct 2025 02:40PM UTC coverage: 52.869% (-0.01%) from 52.882%
18284629720

push

github

web-flow
Merge pull request #1152 from PyThaiNLP/dependabot/pip/transformers-4.57.0

Bump transformers from 4.55.2 to 4.57.0

4146 of 7842 relevant lines covered (52.87%)

0.53 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.07
/pythainlp/generate/core.py
1
# -*- coding: utf-8 -*-
2
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
3
# SPDX-FileType: SOURCE
4
# SPDX-License-Identifier: Apache-2.0
5
"""
1✔
6
Text generator using n-gram language model
7

8
The code is adapted from
9
https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058
10
"""
11

12
import random
1✔
13
from typing import List, Union
1✔
14

15
from pythainlp.corpus.oscar import (
1✔
16
    unigram_word_freqs as oscar_word_freqs_unigram,
17
)
18
from pythainlp.corpus.tnc import bigram_word_freqs as tnc_word_freqs_bigram
1✔
19
from pythainlp.corpus.tnc import trigram_word_freqs as tnc_word_freqs_trigram
1✔
20
from pythainlp.corpus.tnc import unigram_word_freqs as tnc_word_freqs_unigram
1✔
21
from pythainlp.corpus.ttc import unigram_word_freqs as ttc_word_freqs_unigram
1✔
22

23

24
class Unigram:
    """
    Text generator using Unigram

    Words are drawn uniformly at random from the corpus words whose
    relative frequency reaches the requested threshold.

    :param str name: corpus name
        * *tnc* - Thai National Corpus (default)
        * *ttc* - Thai Textbook Corpus (TTC)
        * *oscar* - OSCAR Corpus

    :raises ValueError: if *name* is not a supported corpus name
    """

    def __init__(self, name: str = "tnc"):
        # Reject unknown names up front; otherwise ``self.counts`` would be
        # left unset and the failure would surface as an AttributeError.
        if name == "tnc":
            self.counts = tnc_word_freqs_unigram()
        elif name == "ttc":
            self.counts = ttc_word_freqs_unigram()
        elif name == "oscar":
            self.counts = oscar_word_freqs_unigram()
        else:
            raise ValueError(
                f"Unsupported corpus name: {name!r}. "
                "Use 'tnc', 'ttc', or 'oscar'."
            )
        self.word = list(self.counts.keys())
        # Total number of word occurrences in the corpus.
        self.n = sum(self.counts[w] for w in self.word)
        # Relative frequency of every word.
        self.prob = {w: self.counts[w] / self.n for w in self.word}
        # Words eligible for the current gen_sentence() call.
        self._word_prob: dict = {}

    def gen_sentence(
        self,
        start_seq: str = "",
        N: int = 3,
        prob: float = 0.001,
        output_str: bool = True,
        duplicate: bool = False,
    ) -> Union[List[str], str]:
        """
        :param str start_seq: word to begin sentence with
            (a random corpus word is used if empty)
        :param int N: number of words to add after the start word
            (capped at the number of eligible words)
        :param float prob: minimum relative frequency a word must have
            to be eligible
        :param bool output_str: output as string
        :param bool duplicate: allow duplicate words in sentence

        :return: list of words or a word string
        :rtype: List[str], str

        :Example:
        ::

            from pythainlp.generate import Unigram

            gen = Unigram()

            gen.gen_sentence("แมว")
            # output: 'แมวเวลานะนั้น'
        """
        if not start_seq:
            start_seq = random.choice(self.word)
        rand_text = start_seq.lower()

        # Keep only the words frequent enough for this call.
        eligible = {}
        for w in self.word:
            freq = self.counts[w] / self.n
            if freq >= prob:
                eligible[w] = freq
        self._word_prob = eligible

        return self._next_word(
            rand_text, N, output_str, prob=prob, duplicate=duplicate
        )

    def _next_word(
        self,
        text: str,
        N: int,
        output_str: bool,
        prob: float,
        duplicate: bool = False,
    ):
        """
        Append up to *N* randomly chosen eligible words to *text*.

        :param str text: first word of the sentence
        :param int N: number of words to add
        :param bool output_str: output as string
        :param float prob: unused here; filtering by probability has
            already been done when ``self._word_prob`` was built
        :param bool duplicate: allow duplicate words in sentence

        :return: list of words or a word string
        """
        words = [text]
        candidates = list(self._word_prob.keys())
        # Never ask for more words than there are candidates.
        n_words = max(0, min(N, len(candidates)))

        if duplicate:
            words.extend(random.choice(candidates) for _ in range(n_words))
        else:
            # Sample without replacement from the words not used yet.
            # Retrying random picks until an unused word shows up never
            # terminates once every candidate is already in the sentence.
            unused = [w for w in candidates if w != text]
            words.extend(random.sample(unused, min(n_words, len(unused))))

        if output_str:
            return "".join(words)
        return words
1✔
110

111

112
class Bigram:
    """
    Text generator using Bigram

    :param str name: corpus name
        * *tnc* - Thai National Corpus (default)

    :raises ValueError: if *name* is not a supported corpus name
    """

    def __init__(self, name: str = "tnc"):
        # Reject unknown names up front; otherwise ``self.uni``/``self.bi``
        # would be left unset and fail later with an AttributeError.
        if name != "tnc":
            raise ValueError(
                f"Unsupported corpus name: {name!r}. Use 'tnc'."
            )
        self.uni = tnc_word_freqs_unigram()
        self.bi = tnc_word_freqs_bigram()
        self.uni_keys = list(self.uni.keys())
        self.bi_keys = list(self.bi.keys())
        # Second word of every bigram; used to pick a random start word.
        self.words = [pair[-1] for pair in self.bi_keys]

    def prob(self, t1: str, t2: str) -> float:
        """
        probability of word *t2* following word *t1*

        :param str t1: text 1
        :param str t2: text 2

        :return: probability value; 0.0 if the pair or the first word
            is unknown, or if the first word has a zero count
        :rtype: float
        """
        try:
            return self.bi[(t1, t2)] / self.uni[t1]
        except (KeyError, ZeroDivisionError):
            # Unseen word/pair or zero count: treat as impossible.
            return 0.0

    def gen_sentence(
        self,
        start_seq: str = "",
        N: int = 4,
        prob: float = 0.001,
        output_str: bool = True,
        duplicate: bool = False,
    ) -> Union[List[str], str]:
        """
        :param str start_seq: word to begin sentence with
            (a random word is used if empty)
        :param int N: maximum number of words to add after the start word
        :param float prob: minimum bigram probability a next word must
            have to be eligible
        :param bool output_str: output as string
        :param bool duplicate: allow duplicate words in sentence

        :return: list of words or a word string
        :rtype: List[str], str

        :Example:
        ::

            from pythainlp.generate import Bigram

            gen = Bigram()

            gen.gen_sentence("แมว")
            # output: 'แมวไม่ได้รับเชื้อมัน'
        """
        if not start_seq:
            start_seq = random.choice(self.words)
        last_word = start_seq
        sentence = [start_seq]

        for _ in range(N):
            candidates = [
                pair
                for pair in self.bi_keys
                if pair[0] == last_word
                and (duplicate or pair[1] not in sentence)
                and self.prob(last_word, pair[-1]) >= prob
            ]
            if not candidates:
                # No eligible continuation: stop early.
                break
            # Choose among the candidates themselves. Picking a probability
            # value and looking it up by index would always map back to the
            # first candidate that has that value, so tied candidates after
            # it could never be selected.
            last_word = random.choice(candidates)[-1]
            sentence.append(last_word)

        if output_str:
            return "".join(sentence)

        return sentence
1✔
198

199

200
class Trigram:
    """
    Text generator using Trigram

    :param str name: corpus name
        * *tnc* - Thai National Corpus (default)

    :raises ValueError: if *name* is not a supported corpus name
    """

    def __init__(self, name: str = "tnc"):
        # Reject unknown names up front; otherwise the frequency tables
        # would be left unset and fail later with an AttributeError.
        if name != "tnc":
            raise ValueError(
                f"Unsupported corpus name: {name!r}. Use 'tnc'."
            )
        self.uni = tnc_word_freqs_unigram()
        self.bi = tnc_word_freqs_bigram()
        self.ti = tnc_word_freqs_trigram()
        self.uni_keys = list(self.uni.keys())
        self.bi_keys = list(self.bi.keys())
        self.ti_keys = list(self.ti.keys())
        # Second word of every bigram.
        self.words = [pair[-1] for pair in self.bi_keys]

    def prob(self, t1: str, t2: str, t3: str) -> float:
        """
        probability of word *t3* following the words *t1*, *t2*

        :param str t1: text 1
        :param str t2: text 2
        :param str t3: text 3

        :return: probability value; 0.0 if the trigram or its leading
            bigram is unknown, or if the bigram has a zero count
        :rtype: float
        """
        try:
            return self.ti[(t1, t2, t3)] / self.bi[(t1, t2)]
        except (KeyError, ZeroDivisionError):
            # Unseen n-gram or zero count: treat as impossible.
            return 0.0

    def gen_sentence(
        self,
        start_seq: Union[str, tuple] = "",
        N: int = 4,
        prob: float = 0.001,
        output_str: bool = True,
        duplicate: bool = False,
    ) -> Union[List[str], str]:
        """
        :param start_seq: what to begin the sentence with: either a
            two-word sequence (tuple or list) or a single word, which is
            extended with a randomly chosen bigram that starts with it.
            A random bigram is used if empty.
        :param int N: maximum number of words to add after the
            starting two words
        :param float prob: minimum trigram probability a next word must
            have to be eligible
        :param bool output_str: output as string
        :param bool duplicate: allow duplicate words in sentence

        :return: list of words or a word string
        :rtype: List[str], str

        :Example:
        ::

            from pythainlp.generate import Trigram

            gen = Trigram()

            gen.gen_sentence()
            # output: 'ยังทำตัวเป็นเซิร์ฟเวอร์คือ'
        """
        if not start_seq:
            context = random.choice(self.bi_keys)
        elif isinstance(start_seq, str):
            # A plain string is ONE word. It can never equal a two-word
            # trigram prefix, and iterating over it would split it into
            # characters, so expand it to a bigram that starts with it.
            seeds = [pair for pair in self.bi_keys if pair[0] == start_seq]
            if not seeds:
                return start_seq if output_str else [start_seq]
            context = random.choice(seeds)
        else:
            # Trigram keys are tuples; a list would never compare equal.
            context = tuple(start_seq)

        # Chain of overlapping bigrams: each shares its first word with
        # the last word of the previous one.
        contexts = [context]

        for _ in range(N):
            candidates = [
                tri
                for tri in self.ti_keys
                if tri[:2] == context
                and (duplicate or tri[1:] not in contexts)
                and self.prob(tri[0], tri[1], tri[2]) >= prob
            ]
            if not candidates:
                # No eligible continuation: stop early.
                break
            # Choose among the candidates themselves so that candidates
            # with equal probability are all reachable.
            context = random.choice(candidates)[1:]
            contexts.append(context)

        # Flatten the chain: all words of the first bigram, then only the
        # newly added (last) word of each following bigram.
        words = list(contexts[0])
        words.extend(ctx[-1] for ctx in contexts[1:])
        if not duplicate:
            # Drop repeated words, keeping first occurrences in order.
            words = list(dict.fromkeys(words))

        if output_str:
            return "".join(words)

        return words
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc