• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

PyThaiNLP / pythainlp / 11625814262

01 Nov 2024 07:14AM UTC coverage: 20.782% (+20.8%) from 0.0%
11625814262

Pull #952

github

web-flow
Merge c8385dcae into 515fe7ced
Pull Request #952: Specify a limited test suite

45 of 80 new or added lines in 48 files covered. (56.25%)

1537 of 7396 relevant lines covered (20.78%)

0.21 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

9.43
/pythainlp/augment/wordnet.py
1
# -*- coding: utf-8 -*-
2
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3
# SPDX-License-Identifier: Apache-2.0
4
"""
5
Thank https://dev.to/ton_ami/text-data-augmentation-synonym-replacement-4h8l
6
"""
7
__all__ = [
1✔
8
    "WordNetAug",
9
    "postype2wordnet",
10
]
11

12
import itertools
1✔
13
from collections import OrderedDict
1✔
14
from typing import List
1✔
15

16
from nltk.corpus import wordnet as wn
1✔
17

18
from pythainlp.corpus import wordnet
×
19
from pythainlp.tag import pos_tag
×
NEW
20
from pythainlp.tokenize import word_tokenize
×
21

22
# Mapping from Orchid Corpus POS tags to wordnet POS constants.
# Tags with no wordnet equivalent map to "". Groups are listed in the
# original table order so dict insertion order is unchanged.
_ORCHID_POS_GROUPS = (
    (("",), ""),
    # NOUN
    (("NOUN", "NCMN", "NTTL", "CNIT", "CLTV", "CMTR", "CFQC", "CVBL"), wn.NOUN),
    # VERB
    (("VACT", "VSTA"), wn.VERB),
    # PROPN
    (("PROPN", "NPRP"), ""),
    # ADJ
    (("ADJ", "NONM", "VATT", "DONM"), wn.ADJ),
    # ADV
    (("ADV", "ADVN", "ADVI", "ADVP", "ADVS"), wn.ADV),
    # INT
    (("INT",), ""),
    # PRON
    (("PRON", "PPRS", "PDMN", "PNTR"), ""),
    # DET
    (("DET", "DDAN", "DDAC", "DDBQ", "DDAQ", "DIAC", "DIBQ", "DIAQ"), ""),
    # NUM
    (("NUM", "NCNM", "NLBL", "DCNM"), ""),
    # AUX
    (("AUX", "XVBM", "XVAM", "XVMM", "XVBB", "XVAE"), ""),
    # ADP
    (("ADP", "RPRE"), ""),
    # CCONJ
    (("CCONJ", "JCRG"), ""),
    # SCONJ
    (("SCONJ", "PREL", "JSBR", "JCMP"), ""),
    # PART
    (("PART", "FIXN", "FIXV", "EAFF", "EITT", "AITT", "NEG"), ""),
    # PUNCT
    (("PUNCT", "PUNC"), ""),
)

orchid = {tag: wn_pos for tags, wn_pos in _ORCHID_POS_GROUPS for tag in tags}
101

102

103
def postype2wordnet(pos: str, corpus: str):
    """
    Convert a part-of-speech tag to the corresponding wordnet POS constant.

    :param str pos: POS tag (e.g. "NOUN", "VACT")
    :param str corpus: name of the POS tagset the tag comes from

    **Options for corpus**
        * *orchid* - Orchid Corpus

    :return: the wordnet POS constant for the tag; "" when the tag has no
             wordnet equivalent or is not in the table; None when the
             corpus is not supported
    """
    if corpus != "orchid":
        return None
    # .get() with a "" default so an unexpected tag degrades to "no
    # wordnet POS" (which callers already treat as "look up without a POS
    # filter") instead of raising KeyError.
    return orchid.get(pos, "")
116

117

118
class WordNetAug:
    """
    Text augmentation using wordnet synonym replacement.
    """

    def __init__(self):
        pass

    def find_synonyms(
        self, word: str, pos: str = None, postag_corpus: str = "orchid"
    ) -> List[str]:
        """
        Find synonyms of a word using wordnet.

        :param str word: word to look up
        :param str pos: part-of-speech tag of the word; when None the
            lookup is not filtered by POS
        :param str postag_corpus: name of the POS tagset ``pos`` comes from
        :return: list of unique synonyms, in order of first appearance
        :rtype: List[str]
        """
        if pos is None:
            synsets = wordnet.synsets(word)
        else:
            wn_pos = postype2wordnet(pos, postag_corpus)
            # "" (or None for an unsupported corpus) means the tag has no
            # wordnet equivalent, so fall back to an unfiltered lookup.
            if wn_pos:
                synsets = wordnet.synsets(word, pos=wn_pos)
            else:
                synsets = wordnet.synsets(word)

        # Bug fix: the previous implementation re-queried
        # wordnet.synsets(word) here, silently discarding the POS-filtered
        # synsets computed above.
        synonyms = []
        for synset in synsets:
            synonyms.extend(synset.lemma_names(lang="tha"))

        # Deduplicate while preserving first-seen order.
        return list(OrderedDict.fromkeys(synonyms))

    def augment(
        self,
        sentence: str,
        tokenize: object = word_tokenize,
        max_syn_sent: int = 6,
        postag: bool = True,
        postag_corpus: str = "orchid",
    ) -> List[List[str]]:
        """
        Text augmentation using wordnet synonym substitution.

        :param str sentence: Thai sentence
        :param object tokenize: function for tokenizing words
        :param int max_syn_sent: maximum number of synonymous sentences
        :param bool postag: use part-of-speech tags to filter synonyms
        :param str postag_corpus: name of POS tag corpus

        :return: list of synonymous sentences, each a tuple of tokens
        :rtype: List[Tuple[str]]

        :Example:
        ::

            from pythainlp.augment import WordNetAug

            aug = WordNetAug()
            aug.augment("เราชอบไปโรงเรียน")
            # output: [('เรา', 'ชอบ', 'ไป', 'ร.ร.'),
             ('เรา', 'ชอบ', 'ไป', 'รร.'),
             ('เรา', 'ชอบ', 'ไป', 'โรงเรียน'),
             ('เรา', 'ชอบ', 'ไป', 'อาคารเรียน'),
             ('เรา', 'ชอบ', 'ไปยัง', 'ร.ร.'),
             ('เรา', 'ชอบ', 'ไปยัง', 'รร.')]
        """
        words = tokenize(sentence)
        if postag:
            tagged = pos_tag(words, corpus=postag_corpus)
        else:
            # No tagging requested: treat every word as POS-less so
            # find_synonyms() does an unfiltered lookup.
            tagged = [(word, None) for word in words]

        candidates = []  # one synonym list per token position
        n_combinations = 1
        for word, pos in tagged:
            synonyms = self.find_synonyms(word, pos, postag_corpus)
            if synonyms:
                candidates.append(synonyms)
                n_combinations *= len(synonyms)
            else:
                # No synonyms found: keep the original word.
                candidates.append([word])

        # Cap the number of generated sentences; islice avoids
        # materializing the full (potentially huge) cartesian product.
        limit = min(max_syn_sent, n_combinations)
        return list(itertools.islice(itertools.product(*candidates), limit))
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc