• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

PyThaiNLP / pythainlp / 4699361508

pending completion
4699361508

push

github

GitHub
Merge pull request #789 from PyThaiNLP/4.0

22 of 22 new or added lines in 6 files covered. (100.0%)

5749 of 6246 relevant lines covered (92.04%)

0.92 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

88.89
/pythainlp/tag/tltk.py
1
# -*- coding: utf-8 -*-
2
# Copyright (C) 2016-2023 PyThaiNLP Project
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15
from typing import List, Tuple, Union
1✔
16
try:
1✔
17
    from tltk import nlp
1✔
18
except ImportError:
×
19
    raise ImportError("Not found tltk! Please install tltk by pip install tltk")
×
20
from pythainlp.tokenize import word_tokenize
1✔
21

22
# Module-level call: runs once when this module is imported.
# NOTE(review): presumably loads the TLTK resources needed by
# nlp.pos_tag_wordlist -- confirm against the tltk documentation.
nlp.pos_load()
1✔
23
# Module-level call: runs once when this module is imported.
# NOTE(review): presumably loads the TLTK resources needed by nlp.ner --
# confirm against the tltk documentation.
nlp.ner_load()
1✔
24

25

26
def pos_tag(words: List[str], corpus: str = "tnc") -> List[Tuple[str, str]]:
    """
    Part-of-speech tagger from **TLTK**

    :param list[str] words: tokenized words to be tagged
    :param str corpus: name of the corpus/tag set; only ``"tnc"`` is
        accepted. The default value is ``"tnc"``
    :return: the result of ``nlp.pos_tag_wordlist`` for ``words``
        (annotated as a list of (word, POS tag) tuples)
    :rtype: list[tuple[str, str]]
    :raises ValueError: if ``corpus`` is anything other than ``"tnc"``
    """
    if corpus != "tnc":
        # Interpolate the rejected corpus name; the message previously
        # formatted the literal 0 and so always read "0 corpus".
        raise ValueError("tltk not support {0} corpus.".format(corpus))
    return nlp.pos_tag_wordlist(words)
1✔
30

31

32
def _post_process(text: str) -> str:
    """Return ``text`` with every ``<s/>`` placeholder turned into a space."""
    placeholder = "<s/>"
    segments = text.split(placeholder)
    return " ".join(segments)
1✔
34

35

36
def get_ner(
    text: str, pos: bool = True, tag: bool = False
) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
    """
    Named-entity recognizer from **TLTK**

    This function tags named entities in text using the IOB format.

    :param str text: text in Thai to be tagged
    :param bool pos: To include POS tags in the results (`True`) or
        exclude (`False`). The default value is `True`
    :param bool tag: To return the text as one string with each entity
        wrapped in HTML-like tags (`True`). The default value is `False`.
        When `True`, the parameter `pos` has no effect
    :return: a list of tuples of tokenized word, POS tag and NER tag
        (if the parameter `pos` is specified as `True`);
        a list of tuples of tokenized word and NER tag
        (if the parameter `pos` is specified as `False`);
        or a string with HTML-like tags (if the parameter `tag` is
        specified as `True`).
        An empty `text` always returns an empty list
    :rtype: Union[list[tuple[str, str, str]], list[tuple[str, str]], str]

    :Example:

        >>> from pythainlp.tag.tltk import get_ner
        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง")
        [('เขา', 'PRON', 'O'),
        ('เรียน', 'VERB', 'O'),
        ('ที่', 'SCONJ', 'O'),
        ('โรงเรียน', 'NOUN', 'B-L'),
        ('นางรอง', 'VERB', 'I-L')]
        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง", pos=False)
        [('เขา', 'O'),
        ('เรียน', 'O'),
        ('ที่', 'O'),
        ('โรงเรียน', 'B-L'),
        ('นางรอง', 'I-L')]
        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง", tag=True)
        'เขาเรียนที่<L>โรงเรียนนางรอง</L>'
    """
    if not text:
        return []

    # Space tokens are handed to TLTK as the "<s/>" placeholder;
    # _post_process turns the placeholder back into a space afterwards.
    list_word = [
        "<s/>" if token == " " else token
        for token in word_tokenize(text, engine="tltk")
    ]
    _pos = nlp.pos_tag_wordlist(list_word)
    sent_ner = [
        (_post_process(word), word_pos, ner)
        for word, word_pos, ner in nlp.ner(_pos)
    ]

    if tag:
        open_type = ""  # entity type of the currently open tag, "" if none
        pieces = []
        for word, _, ner in sent_ner:
            if ner.startswith("B-"):
                # A new entity begins: close the previous one first, if any.
                if open_type != "":
                    pieces.append("</" + open_type + ">")
                open_type = ner[2:]
                pieces.append("<" + open_type + ">")
            elif ner == "O" and open_type != "":
                pieces.append("</" + open_type + ">")
                open_type = ""
            pieces.append(word)
        # Close an entity that runs up to the last token.
        if open_type != "":
            pieces.append("</" + open_type + ">")
        return "".join(pieces)

    # Identity check kept on purpose: only the literal False drops POS tags.
    if pos is False:
        return [(word, ner) for word, _, ner in sent_ner]
    return sent_ner
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc