11625814262

Committed 01 Nov 2024 07:14AM UTC coverage: 20.782% (+20.8%) from 0.0%

Build # 11625814262

Build Type

Pull #952

github

Committed by

web-flow

Commit Message

Merge c8385dcae into 515fe7ced

Pull Request #952: Specify a limited test suite

Run Details

45 of 80 new or added lines in 48 files covered. (56.25%)

1537 of 7396 relevant lines covered (20.78%)

0.21 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/pythainlp/generate/thai2fit.py

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Thai2fit: Thai Wikipedia Language Model for Text Generation

Codes are from
https://github.com/PyThaiNLP/tutorials/blob/master/source/notebooks/text_generation.ipynb
"""
__all__ = ["gen_sentence"]

import pickle
import random
from typing import List, Union

# fastai
import fastai
import pandas as pd
from fastai.text import *

# pythainlp
from pythainlp.ulmfit import (
    THWIKI_LSTM,
    ThaiTokenizer,
    post_rules_th,
    pre_rules_th,
)

# get dummy data
# The IMDB sample is used only as placeholder ("dummy") input so that a
# databunch can be constructed for the learner below.
imdb = untar_data(URLs.IMDB_SAMPLE)
dummy_df = pd.read_csv(imdb / "texts.csv")

# get vocab
thwiki = THWIKI_LSTM

# Read the pickled itos data inside a context manager so the file handle is
# closed as soon as loading finishes instead of being left open.
# NOTE(review): pickle.load can execute arbitrary code from the file; this
# assumes the file named by THWIKI_LSTM["itos_fname"] is trusted - confirm.
with open(thwiki["itos_fname"], "rb") as _itos_file:
    thwiki_itos = pickle.load(_itos_file)
thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos)

# dummy databunch
# Tokenizer configured with the Thai tokenizer and the Thai pre/post rules
# imported from pythainlp.ulmfit.
tt = Tokenizer(
    tok_func=ThaiTokenizer,
    lang="th",
    pre_rules=pre_rules_th,
    post_rules=post_rules_th,
)
# Processors applied to the text: tokenize first, then numericalize with the
# vocabulary built from thwiki_itos.
processor = [
    TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),
    NumericalizeProcessor(vocab=thwiki_vocab, max_vocab=60000, min_freq=3),
]
# Build a language-model databunch from the "text" column of the dummy data,
# holding out a random 20% split.
data_lm = (
    TextList.from_df(dummy_df, imdb, cols=["text"], processor=processor)
    .split_by_rand_pct(0.2)
    .label_for_lm()
    .databunch(bs=64)
)


data_lm.sanity_check()

# Model configuration handed to language_model_learner via ``config=``.
config = {
    "emb_sz": 400,
    "n_hid": 1550,
    "n_layers": 4,
    "pad_token": 1,
    "qrnn": False,
    "tie_weights": True,
    "out_bias": True,
    "output_p": 0.25,
    "hidden_p": 0.1,
    "input_p": 0.2,
    "embed_p": 0.02,
    "weight_p": 0.15,
}
# Extra keyword arguments forwarded to language_model_learner.
trn_args = {"drop_mult": 0.9, "clip": 0.12, "alpha": 2, "beta": 1}

# pretrained=False: the weights are loaded explicitly from THWIKI_LSTM below.
learn = language_model_learner(
    data_lm, AWD_LSTM, config=config, pretrained=False, **trn_args
)

# load pretrained models
# The entries of THWIKI_LSTM are expanded as keyword arguments.
learn.load_pretrained(**thwiki)


def gen_sentence(
    start_seq: Union[str, None] = None,
    n: int = 4,
    prob: float = 0.001,
    output_str: bool = True,
) -> Union[List[str], str]:
    """
    Text generator using Thai2fit

    :param str start_seq: word to begin sentence with; if None, a random
        entry of the Thai2fit vocabulary (``thwiki_itos``) is used
    :param int n: number of words, passed as the second argument of
        ``learn.predict``
    :param float prob: value passed to ``learn.predict`` as ``min_p``
    :param bool output_str: output as string

    :return: list words or str words
    :rtype: List[str], str

    :Example:
    ::

      from pythainlp.generate.thai2fit import gen_sentence

      gen_sentence()
      # output: 'แคทรียา อิงลิช  (นักแสดง'

      gen_sentence("แมว")
      # output: 'แมว คุณหลวง '
    """
    if start_seq is None:
        # No seed given: start from a randomly chosen vocabulary entry.
        start_seq = random.choice(list(thwiki_itos))
    # "-*-" is used as the separator in the prediction so the returned text
    # can be split back into a list of pieces.
    list_word = learn.predict(
        start_seq, n, temperature=0.8, min_p=prob, sep="-*-"
    ).split("-*-")
    if output_str:
        # Pieces are concatenated without any separator.
        return "".join(list_word)
    return list_word

1	# -- coding: utf-8 --
2	# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3	# SPDX-License-Identifier: Apache-2.0
4	"""
5	Thai2fit: Thai Wikipedia Language Model for Text Generation
6
7	Codes are from
8	https://github.com/PyThaiNLP/tutorials/blob/master/source/notebooks/text_generation.ipynb
9	"""
10	__all__ = ["gen_sentence"]	×
11
12	import pickle	×
NEW 13	import random	×
14	from typing import List, Union	×
15
16	# fastai
17	import fastai	×
NEW 18	import pandas as pd	×
19	from fastai.text import *	×
20
21	# pythainlp
NEW 22	from pythainlp.ulmfit import (	×
23	THWIKI_LSTM,
24	ThaiTokenizer,
25	post_rules_th,
26	pre_rules_th,
27	)
28
29	# get dummy data
30	imdb = untar_data(URLs.IMDB_SAMPLE)	×
31	dummy_df = pd.read_csv(imdb / "texts.csv")	×
32
33	# get vocab
34	thwiki = THWIKI_LSTM	×
35
36	thwiki_itos = pickle.load(open(thwiki["itos_fname"], "rb"))	×
37	thwiki_vocab = fastai.text.transform.Vocab(thwiki_itos)	×
38
39	# dummy databunch
40	tt = Tokenizer(	×
41	tok_func=ThaiTokenizer,
42	lang="th",
43	pre_rules=pre_rules_th,
44	post_rules=post_rules_th,
45	)
46	processor = [	×
47	TokenizeProcessor(tokenizer=tt, chunksize=10000, mark_fields=False),
48	NumericalizeProcessor(vocab=thwiki_vocab, max_vocab=60000, min_freq=3),
49	]
50	data_lm = (	×
51	TextList.from_df(dummy_df, imdb, cols=["text"], processor=processor)
52	.split_by_rand_pct(0.2)
53	.label_for_lm()
54	.databunch(bs=64)
55	)
56
57
58	data_lm.sanity_check()	×
59
60	config = {	×
61	"emb_sz": 400,
62	"n_hid": 1550,
63	"n_layers": 4,
64	"pad_token": 1,
65	"qrnn": False,
66	"tie_weights": True,
67	"out_bias": True,
68	"output_p": 0.25,
69	"hidden_p": 0.1,
70	"input_p": 0.2,
71	"embed_p": 0.02,
72	"weight_p": 0.15,
73	}
74	trn_args = {"drop_mult": 0.9, "clip": 0.12, "alpha": 2, "beta": 1}	×
75
76	learn = language_model_learner(	×
77	data_lm, AWD_LSTM, config=config, pretrained=False, **trn_args
78	)
79
80	# load pretrained models
81	learn.load_pretrained(**thwiki)	×
82
83
84	def gen_sentence(	×
85	start_seq: str = None,
86	n: int = 4,
87	prob: float = 0.001,
88	output_str: bool = True,
89	) -> Union[List[str], str]:
90	"""
91	Text generator using Thai2fit
92
93	:param str start_seq: word to begin sentence with
94	:param int N: number of words
95	:param bool output_str: output as string
96	:param bool duplicate: allow duplicate words in sentence
97
98	:return: list words or str words
99	:rtype: List[str], str
100
101	:Example:
102	::
103
104	from pythainlp.generate.thai2fit import gen_sentence
105
106	gen_sentence()
107	# output: 'แคทรียา อิงลิช (นักแสดง'
108
109	gen_sentence("แมว")
110	# output: 'แมว คุณหลวง '
111	"""
112	if start_seq is None:	×
113	start_seq = random.choice(list(thwiki_itos))	×
114	list_word = learn.predict(	×
115	start_seq, n, temperature=0.8, min_p=prob, sep="-*-"
116	).split("-*-")
117	if output_str:	×
118	return "".join(list_word)	×
119	return list_word	×

PyThaiNLP / pythainlp / 11625814262

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous