• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

PyThaiNLP / pythainlp / 11625814262

01 Nov 2024 07:14AM UTC coverage: 20.782% (+20.8%) from 0.0%
11625814262

Pull #952

github

web-flow
Merge c8385dcae into 515fe7ced
Pull Request #952: Specify a limited test suite

45 of 80 new or added lines in 48 files covered. (56.25%)

1537 of 7396 relevant lines covered (20.78%)

0.21 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/pythainlp/generate/thai2fit.py
1
# -*- coding: utf-8 -*-
2
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3
# SPDX-License-Identifier: Apache-2.0
4
"""
5
Thai2fit: Thai Wikipedia Language Model for Text Generation
6

7
Codes are from
8
https://github.com/PyThaiNLP/tutorials/blob/master/source/notebooks/text_generation.ipynb
9
"""
10
__all__ = ["gen_sentence"]
×
11

12
import pickle
×
NEW
13
import random
×
14
from typing import List, Union
×
15

16
# fastai
17
import fastai
×
NEW
18
import pandas as pd
×
19
from fastai.text import *
×
20

21
# pythainlp
NEW
22
from pythainlp.ulmfit import (
×
23
    THWIKI_LSTM,
24
    ThaiTokenizer,
25
    post_rules_th,
26
    pre_rules_th,
27
)
28

29
# get dummy data
# The IMDB sample serves only as placeholder data: it is used below to build
# the databunch that is passed to language_model_learner().
imdb = untar_data(URLs.IMDB_SAMPLE)
dummy_df = pd.read_csv(imdb / "texts.csv")

# get vocab
thwiki = THWIKI_LSTM

# Open the vocabulary file in a context manager so the handle is closed as
# soon as the pickle has been read (the bare open() call leaked it).
# NOTE(review): pickle.load can execute arbitrary code from the file it
# reads; this assumes thwiki["itos_fname"] points at a trusted file — confirm.
with open(thwiki["itos_fname"], "rb") as _itos_file:
    thwiki_itos = pickle.load(_itos_file)
thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos)

# dummy databunch
# Tokenize with the Thai tokenizer and rules, and numericalize against the
# thwiki vocabulary loaded above.
tt = Tokenizer(
    tok_func=ThaiTokenizer,
    lang="th",
    pre_rules=pre_rules_th,
    post_rules=post_rules_th,
)
processor = [
    TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),
    NumericalizeProcessor(vocab=thwiki_vocab, max_vocab=60000, min_freq=3),
]
data_lm = (
    TextList.from_df(dummy_df, imdb, cols=["text"], processor=processor)
    .split_by_rand_pct(0.2)
    .label_for_lm()
    .databunch(bs=64)
)


data_lm.sanity_check()

# Model hyper-parameters handed to language_model_learner().
# NOTE(review): presumably these must match the architecture of the
# pretrained THWIKI_LSTM weights loaded below — confirm before changing.
config = {
    "emb_sz": 400,
    "n_hid": 1550,
    "n_layers": 4,
    "pad_token": 1,
    "qrnn": False,
    "tie_weights": True,
    "out_bias": True,
    "output_p": 0.25,
    "hidden_p": 0.1,
    "input_p": 0.2,
    "embed_p": 0.02,
    "weight_p": 0.15,
}
trn_args = {"drop_mult": 0.9, "clip": 0.12, "alpha": 2, "beta": 1}

# pretrained=False here: the weights are loaded explicitly just below.
learn = language_model_learner(
    data_lm, AWD_LSTM, config=config, pretrained=False, **trn_args
)

# load pretrained models
learn.load_pretrained(**thwiki)
×
82

83

84
def gen_sentence(
    start_seq: Union[str, None] = None,
    n: int = 4,
    prob: float = 0.001,
    output_str: bool = True,
) -> Union[List[str], str]:
    """
    Text generator using Thai2fit

    :param str start_seq: word to begin sentence with; when ``None``, a
        random entry of the thwiki vocabulary is used instead
    :param int n: number of words
    :param float prob: value passed to the learner's ``predict`` as
        ``min_p`` (presumably the minimum probability a token needs in
        order to be sampled; see the fastai documentation)
    :param bool output_str: output as string

    :return: list words or str words
    :rtype: List[str], str

    :Example:
    ::

      from pythainlp.generate.thai2fit import gen_sentence

      gen_sentence()
      # output: 'แคทรียา อิงลิช  (นักแสดง'

      gen_sentence("แมว")
      # output: 'แมว คุณหลวง '
    """
    if start_seq is None:
        start_seq = random.choice(list(thwiki_itos))

    # A dedicated separator is handed to predict() so that the generated
    # text can be split back into the individual pieces afterwards.
    sep = "-*-"
    generated = learn.predict(
        start_seq, n, temperature=0.8, min_p=prob, sep=sep
    )
    list_word = generated.split(sep)

    if output_str:
        return "".join(list_word)
    return list_word
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc