PyThaiNLP / pythainlp, build 5337431273 (push, via GitHub): pending completion

Commit by wannaphong: "Add กาลพฤกษ์ to list words"

3573 of 6329 relevant lines covered (56.45%); 0.56 hits per line

Source file: /pythainlp/cli/tokenize.py (79.31% of lines in this file covered)
# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
thainlp tokenize command line.
"""

import argparse

from pythainlp import cli
from pythainlp.tokenize import (
    DEFAULT_SENT_TOKENIZE_ENGINE,
    DEFAULT_SUBWORD_TOKENIZE_ENGINE,
    DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
    DEFAULT_WORD_TOKENIZE_ENGINE,
    sent_tokenize,
    subword_tokenize,
    word_tokenize,
)

# Separators used to join tokens when printing results
DEFAULT_SENT_TOKEN_SEPARATOR = "@@"
DEFAULT_SUBWORD_TOKEN_SEPARATOR = "/"
DEFAULT_SYLLABLE_TOKEN_SEPARATOR = "~"
DEFAULT_WORD_TOKEN_SEPARATOR = "|"


class SubAppBase:
    # Subclasses set self.separator, self.algorithm, and self.run
    # before delegating to this constructor.
    def __init__(self, name, argv):
        parser = argparse.ArgumentParser(**cli.make_usage("tokenize " + name))
        parser.add_argument(
            "text",
            type=str,
            nargs="?",
            help="input text",
        )
        parser.add_argument(
            "-s",
            "--sep",
            dest="separator",
            type=str,
            help=f"default: {self.separator}",
            default=self.separator,
        )
        parser.add_argument(
            "-a",
            "--algo",
            dest="algorithm",
            type=str,
            help=f"default: {self.algorithm}",
            default=self.algorithm,
        )
        # -w and -nw both write to keep_whitespace; the default is True
        parser.add_argument(
            "-w",
            "--keep-whitespace",
            dest="keep_whitespace",
            action="store_true",
        )
        parser.add_argument(
            "-nw",
            "--no-whitespace",
            dest="keep_whitespace",
            action="store_false",
        )
        parser.set_defaults(keep_whitespace=True)

        args = parser.parse_args(argv)
        self.args = args

        cli.exit_if_empty(args.text, parser)
        result = self.run(
            args.text,
            engine=args.algorithm,
            keep_whitespace=args.keep_whitespace,
        )
        # Tokens are joined with the separator, which is also appended
        # after the last token.
        print(args.separator.join(result) + args.separator)


class WordTokenizationApp(SubAppBase):
    def __init__(self, *args, **kwargs):
        self.keep_whitespace = True
        self.algorithm = DEFAULT_WORD_TOKENIZE_ENGINE
        self.separator = DEFAULT_WORD_TOKEN_SEPARATOR
        self.run = word_tokenize
        super().__init__(*args, **kwargs)


class SentenceTokenizationApp(SubAppBase):
    def __init__(self, *args, **kwargs):
        self.keep_whitespace = True
        self.algorithm = DEFAULT_SENT_TOKENIZE_ENGINE
        self.separator = DEFAULT_SENT_TOKEN_SEPARATOR
        self.run = sent_tokenize
        super().__init__(*args, **kwargs)


class SubwordTokenizationApp(SubAppBase):
    def __init__(self, *args, **kwargs):
        self.keep_whitespace = True
        self.algorithm = DEFAULT_SUBWORD_TOKENIZE_ENGINE
        self.separator = DEFAULT_SUBWORD_TOKEN_SEPARATOR
        self.run = subword_tokenize
        super().__init__(*args, **kwargs)


class App:
    def __init__(self, argv):
        parser = argparse.ArgumentParser(
            prog="tokenize",
            description="Break a text into small units (tokens).",
            usage=(
                'thainlp tokenize <token_type> [options] "<text>"\n\n'
                "token_type:\n\n"
                "subword            subword (may not be a linguistic unit)\n"
                "syllable           syllable\n"
                "word               word\n"
                "sent               sentence\n\n"
                "options:\n\n"
                "--sep or -s <separator>    specify custom separator\n"
                "                           (default is a space)\n"
                "--algo or -a <algorithm>   tokenization algorithm\n"
                "                           (see API doc for more info)\n"
                "--keep-whitespace or -w    keep whitespaces in output\n"
                "                           (default)\n\n"
                "<separator> and <text> should be inside double quotes.\n\n"
                "Example:\n\n"
                'thainlp tokenize word -s "|" "ใต้แสงนีออนเปลี่ยวเหงา"\n\n'
                "--"
            ),
        )
        parser.add_argument(
            "token_type",
            type=str,
            help="[subword|word|sent]",
        )

        # argv[2] is the token type; everything after it belongs to the
        # chosen subcommand's own parser.
        args = parser.parse_args(argv[2:3])
        cli.exit_if_empty(args.token_type, parser)
        token_type = str.lower(args.token_type)

        argv = argv[3:]
        # Dispatch on the leading characters of the token type
        if token_type.startswith("w"):
            WordTokenizationApp("word", argv)
        elif token_type.startswith("su"):
            SubwordTokenizationApp("subword", argv)
        elif token_type.startswith("se"):
            SentenceTokenizationApp("sent", argv)
        else:
            print(f"Token type not available: {token_type}")
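
For reference, a minimal sketch of driving this module directly, mirroring the example in the help text above. It assumes pythainlp is installed and that the `thainlp` entry point passes the full argument vector to `App` (so argv[0] and argv[1] are the program and subcommand names); everything else is taken from the file itself.

from pythainlp.cli.tokenize import App
from pythainlp.tokenize import word_tokenize

# Mimic: thainlp tokenize word -s "|" "ใต้แสงนีออนเปลี่ยวเหงา"
# App reads argv[2] as the token type and hands the rest to
# WordTokenizationApp, which parses -s/-a/-w/-nw and prints the result.
App(["thainlp", "tokenize", "word", "-s", "|", "ใต้แสงนีออนเปลี่ยวเหงา"])

# Roughly equivalent direct API call: tokens joined by "|",
# with a trailing "|" as in SubAppBase.
tokens = word_tokenize("ใต้แสงนีออนเปลี่ยวเหงา", keep_whitespace=True)
print("|".join(tokens) + "|")

Note that the help text advertises a `syllable` token type, but the dispatch in `App` only matches the `w`, `su`, and `se` prefixes, so `syllable` currently falls through to the "Token type not available" branch.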