• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

PyThaiNLP / pythainlp / 4699361508

pending completion
4699361508

push

github

GitHub
Merge pull request #789 from PyThaiNLP/4.0

22 of 22 new or added lines in 6 files covered. (100.0%)

5749 of 6246 relevant lines covered (92.04%)

0.92 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

31.25
/pythainlp/augment/lm/fasttext.py
1
# -*- coding: utf-8 -*-
2
# Copyright (C) 2016-2023 PyThaiNLP Project
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15
from typing import List, Tuple
1✔
16
from gensim.models.fasttext import FastText as FastText_gensim
1✔
17
from pythainlp.tokenize import word_tokenize
1✔
18
from gensim.models.keyedvectors import KeyedVectors
1✔
19
import itertools
1✔
20

21

22
class FastTextAug:
    """
    Text Augment from FastText

    Builds alternative versions of a sentence by replacing each token
    with the words that the loaded vector model reports as similar.

    :param str model_path: path of model file
    """

    def __init__(self, model_path: str):
        """
        Load the vectors; the loader is chosen from the file extension.

        ``.bin`` is read with ``load_facebook_vectors``, ``.vec`` with
        ``KeyedVectors.load_word2vec_format`` and anything else with
        ``FastText.load``.

        :param str model_path: path of model file
        """
        if model_path.endswith(".bin"):
            self.model = FastText_gensim.load_facebook_vectors(model_path)
        elif model_path.endswith(".vec"):
            self.model = KeyedVectors.load_word2vec_format(model_path)
        else:
            loaded = FastText_gensim.load(model_path)
            # The other two branches yield keyed vectors, and the rest of
            # the class relies on ``key_to_index`` / ``most_similar``.
            # A full model keeps those on its ``.wv`` attribute, so unwrap
            # it when present and fall back to the object itself otherwise.
            self.model = getattr(loaded, "wv", loaded)
        # Kept as a list for callers that read this attribute directly.
        self.dict_wv = list(self.model.key_to_index.keys())

    def tokenize(self, text: str) -> List[str]:
        """
        Thai text tokenize for fasttext

        :param str text: Thai text

        :return: list of words
        :rtype: List[str]
        """
        return word_tokenize(text, engine="icu")

    def modify_sent(self, sent: List[str], p: float = 0.7) -> List[List[str]]:
        """
        Collect the replacement candidates for every token.

        A token that is missing from the model vocabulary, or that has no
        neighbour scoring at least ``p``, is kept as its own only candidate.

        :param List[str] sent: tokens of the sentence (iterated item by item)
        :param float p: minimum score a similar word must reach to be used

        :return: one list of candidate words per input token
        :rtype: List[List[str]]
        """
        # Dict membership is O(1); scanning the ``dict_wv`` list would cost
        # a pass over the whole vocabulary for every token.
        vocab = self.model.key_to_index
        candidates_per_token = []
        for token in sent:
            similar = []
            if token in vocab:
                similar = [
                    word
                    for word, score in self.model.most_similar(token)
                    if score >= p
                ]
            # Fall back to the token itself when nothing qualifies.
            candidates_per_token.append(similar if similar else [token])
        return candidates_per_token

    def augment(
        self, sentence: str, n_sent: int = 1, p: float = 0.7
    ) -> List[Tuple[str, ...]]:
        """
        Text Augment from FastText

        You may want to download the Thai model
        from https://fasttext.cc/docs/en/crawl-vectors.html.

        The tokens and their candidate lists are also stored on the
        instance as ``self.sentence`` and ``self.list_synonym``.

        :param str sentence: Thai sentence
        :param int n_sent: maximum number of sentences to return;
                           a value of zero or less returns an empty list
        :param float p: minimum score a similar word must reach to be used

        :return: list of augmented sentences, each a tuple of words
        :rtype: List[Tuple[str, ...]]
        """
        self.sentence = self.tokenize(sentence)
        self.list_synonym = self.modify_sent(self.sentence, p=p)
        if n_sent <= 0:
            return []
        # Take only the first ``n_sent`` combinations lazily: the full
        # product grows exponentially with the number of tokens and must
        # not be materialised just to be sliced.
        combinations = itertools.product(*self.list_synonym)
        return list(itertools.islice(combinations, n_sent))
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc