• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

PyThaiNLP / pythainlp / 11626163864

01 Nov 2024 07:49AM UTC coverage: 14.17% (+14.2%) from 0.0%
11626163864

Pull #952

github

web-flow
Merge 8f2551bc9 into 89ea62ebc
Pull Request #952: Specify a limited test suite

44 of 80 new or added lines in 48 files covered. (55.0%)

1048 of 7396 relevant lines covered (14.17%)

0.14 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/pythainlp/augment/word2vec/core.py
1
# -*- coding: utf-8 -*-
2
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3
# SPDX-License-Identifier: Apache-2.0
4
import itertools
×
NEW
5
from typing import List, Tuple
×
6

7

8
class Word2VecAug:
    """
    Sentence augmentation by substituting tokens with word2vec neighbours.

    A sentence is tokenized, each in-vocabulary token is replaced by the
    list of its similar words whose score reaches a threshold, and new
    sentences are produced from the Cartesian product of those lists.
    """

    def __init__(
        self, model: str, tokenize: object, type: str = "file"
    ) -> None:
        """
        :param str model: path of model; when ``type`` is neither "file"
            nor "binary" the value is stored as-is and used as the model
        :param object tokenize: tokenize function
        :param str type: model type (file, binary)
        """
        # Imported lazily so that importing this module does not require
        # gensim to be installed.
        import gensim.models.keyedvectors as word2vec

        self.tokenizer = tokenize
        if type == "file":
            self.model = word2vec.KeyedVectors.load_word2vec_format(model)
        elif type == "binary":
            self.model = word2vec.KeyedVectors.load_word2vec_format(
                model, binary=True, unicode_errors="ignore"
            )
        else:
            # Any other type: the caller supplied the model object itself.
            self.model = model
        # Kept as a public list of the vocabulary for existing callers.
        self.dict_wv = list(self.model.key_to_index.keys())

    def modify_sent(self, sent: List[str], p: float = 0.7) -> List[List[str]]:
        """
        Build, for every token, the list of candidate replacements.

        :param sent: tokens of the sentence (iterated item by item)
        :param float p: minimum score a similar word must reach to be kept
        :return: one list per token; the similar words scoring at least
            ``p``, or the token itself when it is out of vocabulary or no
            similar word qualifies
        :rtype: List[List[str]]
        """
        # Membership is tested on the mapping (O(1)) rather than on the
        # ``dict_wv`` list (O(vocabulary size) per token).
        vocab = self.model.key_to_index
        list_sent_new = []
        for token in sent:
            candidates = []
            if token in vocab:
                candidates = [
                    word
                    for word, score in self.model.most_similar(token)
                    if score >= p
                ]
            # Fall back to the original token when nothing qualifies.
            list_sent_new.append(candidates or [token])
        return list_sent_new

    def augment(
        self, sentence: str, n_sent: int = 1, p: float = 0.7
    ) -> List[Tuple[str, ...]]:
        """
        Generate up to ``n_sent`` augmented versions of a sentence.

        The tokenized sentence and the per-token candidate lists are stored
        on the instance as ``self.sentence`` and ``self.list_synonym``.

        :param str sentence: text of sentence
        :param int n_sent: maximum number of synonymous sentences;
            zero or a negative value yields an empty list
        :param float p: probability

        :return: list of synonyms
        :rtype: List[Tuple[str, ...]]
        """
        self.sentence = self.tokenizer(sentence)
        self.list_synonym = self.modify_sent(self.sentence, p=p)
        # The product grows exponentially with the number of tokens, so it
        # is consumed lazily and only the first ``n_sent`` items are built.
        combinations = itertools.product(*self.list_synonym)
        return list(itertools.islice(combinations, max(n_sent, 0)))
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc