• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

PyThaiNLP / pythainlp / 11625814262

01 Nov 2024 07:14AM UTC coverage: 20.782% (+20.8%) from 0.0%
11625814262

Pull #952

github

web-flow
Merge c8385dcae into 515fe7ced
Pull Request #952: Specify a limited test suite

45 of 80 new or added lines in 48 files covered. (56.25%)

1537 of 7396 relevant lines covered (20.78%)

0.21 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

9.43
/pythainlp/augment/wordnet.py
1
# -*- coding: utf-8 -*-
2
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3
# SPDX-License-Identifier: Apache-2.0
4
"""
5
Thank https://dev.to/ton_ami/text-data-augmentation-synonym-replacement-4h8l
6
"""
7
__all__ = [
1✔
8
    "WordNetAug",
9
    "postype2wordnet",
10
]
11

12
import itertools
1✔
13
from collections import OrderedDict
1✔
14
from typing import List
1✔
15

16
from nltk.corpus import wordnet as wn
1✔
17

18
from pythainlp.corpus import wordnet
×
19
from pythainlp.tag import pos_tag
×
NEW
20
from pythainlp.tokenize import word_tokenize
×
21

22
# Mapping from Orchid Corpus POS tags to wordnet POS constants.
# Tags with no wordnet equivalent map to "". Groups are listed in the
# original table order so dict insertion order is unchanged.
_ORCHID_POS_GROUPS = (
    (("",), ""),
    # NOUN
    (("NOUN", "NCMN", "NTTL", "CNIT", "CLTV", "CMTR", "CFQC", "CVBL"), wn.NOUN),
    # VERB
    (("VACT", "VSTA"), wn.VERB),
    # PROPN
    (("PROPN", "NPRP"), ""),
    # ADJ
    (("ADJ", "NONM", "VATT", "DONM"), wn.ADJ),
    # ADV
    (("ADV", "ADVN", "ADVI", "ADVP", "ADVS"), wn.ADV),
    # INT
    (("INT",), ""),
    # PRON
    (("PRON", "PPRS", "PDMN", "PNTR"), ""),
    # DET
    (("DET", "DDAN", "DDAC", "DDBQ", "DDAQ", "DIAC", "DIBQ", "DIAQ"), ""),
    # NUM
    (("NUM", "NCNM", "NLBL", "DCNM"), ""),
    # AUX
    (("AUX", "XVBM", "XVAM", "XVMM", "XVBB", "XVAE"), ""),
    # ADP
    (("ADP", "RPRE"), ""),
    # CCONJ
    (("CCONJ", "JCRG"), ""),
    # SCONJ
    (("SCONJ", "PREL", "JSBR", "JCMP"), ""),
    # PART
    (("PART", "FIXN", "FIXV", "EAFF", "EITT", "AITT", "NEG"), ""),
    # PUNCT
    (("PUNCT", "PUNC"), ""),
)

orchid = {tag: wn_pos for tags, wn_pos in _ORCHID_POS_GROUPS for tag in tags}
101

102

103
def postype2wordnet(pos: str, corpus: str):
    """
    Convert a part-of-speech tag to the corresponding wordnet POS constant.

    :param str pos: POS tag (e.g. "NOUN", "VACT")
    :param str corpus: name of the POS tagset the tag comes from

    **Options for corpus**
        * *orchid* - Orchid Corpus

    :return: the wordnet POS constant for the tag; "" when the tag has no
             wordnet equivalent or is not in the table; None when the
             corpus is not supported
    """
    if corpus != "orchid":
        return None
    # .get() with a "" default so an unexpected tag degrades to "no
    # wordnet POS" (which callers already treat as "look up without a POS
    # filter") instead of raising KeyError.
    return orchid.get(pos, "")
116

117

118
class WordNetAug:
    """
    Text augmentation using wordnet synonym replacement.
    """

    def __init__(self):
        pass

    def find_synonyms(
        self, word: str, pos: str = None, postag_corpus: str = "orchid"
    ) -> List[str]:
        """
        Find synonyms of a word using wordnet.

        :param str word: word to look up
        :param str pos: part-of-speech tag of the word; when None the
            lookup is not filtered by POS
        :param str postag_corpus: name of the POS tagset ``pos`` comes from
        :return: list of unique synonyms, in order of first appearance
        :rtype: List[str]
        """
        if pos is None:
            synsets = wordnet.synsets(word)
        else:
            wn_pos = postype2wordnet(pos, postag_corpus)
            # "" (or None for an unsupported corpus) means the tag has no
            # wordnet equivalent, so fall back to an unfiltered lookup.
            if wn_pos:
                synsets = wordnet.synsets(word, pos=wn_pos)
            else:
                synsets = wordnet.synsets(word)

        # Bug fix: the previous implementation re-queried
        # wordnet.synsets(word) here, silently discarding the POS-filtered
        # synsets computed above.
        synonyms = []
        for synset in synsets:
            synonyms.extend(synset.lemma_names(lang="tha"))

        # Deduplicate while preserving first-seen order.
        return list(OrderedDict.fromkeys(synonyms))

    def augment(
        self,
        sentence: str,
        tokenize: object = word_tokenize,
        max_syn_sent: int = 6,
        postag: bool = True,
        postag_corpus: str = "orchid",
    ) -> List[List[str]]:
        """
        Text augmentation using wordnet synonym substitution.

        :param str sentence: Thai sentence
        :param object tokenize: function for tokenizing words
        :param int max_syn_sent: maximum number of synonymous sentences
        :param bool postag: use part-of-speech tags to filter synonyms
        :param str postag_corpus: name of POS tag corpus

        :return: list of synonymous sentences, each a tuple of tokens
        :rtype: List[Tuple[str]]

        :Example:
        ::

            from pythainlp.augment import WordNetAug

            aug = WordNetAug()
            aug.augment("เราชอบไปโรงเรียน")
            # output: [('เรา', 'ชอบ', 'ไป', 'ร.ร.'),
             ('เรา', 'ชอบ', 'ไป', 'รร.'),
             ('เรา', 'ชอบ', 'ไป', 'โรงเรียน'),
             ('เรา', 'ชอบ', 'ไป', 'อาคารเรียน'),
             ('เรา', 'ชอบ', 'ไปยัง', 'ร.ร.'),
             ('เรา', 'ชอบ', 'ไปยัง', 'รร.')]
        """
        words = tokenize(sentence)
        if postag:
            tagged = pos_tag(words, corpus=postag_corpus)
        else:
            # No tagging requested: treat every word as POS-less so
            # find_synonyms() does an unfiltered lookup.
            tagged = [(word, None) for word in words]

        candidates = []  # one synonym list per token position
        n_combinations = 1
        for word, pos in tagged:
            synonyms = self.find_synonyms(word, pos, postag_corpus)
            if synonyms:
                candidates.append(synonyms)
                n_combinations *= len(synonyms)
            else:
                # No synonyms found: keep the original word.
                candidates.append([word])

        # Cap the number of generated sentences; islice avoids
        # materializing the full (potentially huge) cartesian product.
        limit = min(max_syn_sent, n_combinations)
        return list(itertools.islice(itertools.product(*candidates), limit))
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc