• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

PyThaiNLP / pythainlp / 4095479191

pending completion
4095479191

push

github

Wannaphong Phatthiyaphaibun
Update phoneme.py

4 of 4 new or added lines in 1 file covered. (100.0%)

41 of 5843 relevant lines covered (0.7%)

0.01 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/pythainlp/cli/tokenize.py
1
"""
2
thainlp tokenize command line.
3
"""
4

5
import argparse
×
6

7
from pythainlp import cli
×
8
from pythainlp.tokenize import (
×
9
    DEFAULT_SENT_TOKENIZE_ENGINE,
10
    DEFAULT_SUBWORD_TOKENIZE_ENGINE,
11
    DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
12
    DEFAULT_WORD_TOKENIZE_ENGINE,
13
    sent_tokenize,
14
    subword_tokenize,
15
    word_tokenize,
16
)
17

18
# Default separator strings placed between output tokens, one per token
# type; each sub-app below passes its own default to the argument parser,
# and the user can override it with --sep/-s.
DEFAULT_SENT_TOKEN_SEPARATOR = "@@"
DEFAULT_SUBWORD_TOKEN_SEPARATOR = "/"
# NOTE(review): the syllable separator is defined (and the syllable engine
# imported) but no syllable sub-app exists in this file — confirm whether
# syllable tokenization was removed or is yet to be added.
DEFAULT_SYLLABLE_TOKEN_SEPARATOR = "~"
DEFAULT_WORD_TOKEN_SEPARATOR = "|"
×
22

23

24
class SubAppBase:
    """Shared command-line front-end for the tokenization sub-apps.

    Parses the sub-command's options, then tokenizes the given text and
    prints the tokens joined by a separator.  Subclasses must set
    ``self.separator``, ``self.algorithm`` and ``self.run`` (the tokenizer
    callable) *before* delegating to this initializer, because those
    attributes are read here to build the option defaults.
    """

    def __init__(self, name, argv):
        arg_parser = argparse.ArgumentParser(**cli.make_usage(f"tokenize {name}"))
        arg_parser.add_argument(
            "text",
            type=str,
            nargs="?",
            help="input text",
        )
        arg_parser.add_argument(
            "-s",
            "--sep",
            dest="separator",
            type=str,
            help=f"default: {self.separator}",
            default=self.separator,
        )
        arg_parser.add_argument(
            "-a",
            "--algo",
            dest="algorithm",
            type=str,
            help=f"default: {self.algorithm}",
            default=self.algorithm,
        )
        # -w and -nw share one destination; whichever flag appears last on
        # the command line wins, and the default (below) is to keep spaces.
        arg_parser.add_argument(
            "-w",
            "--keep-whitespace",
            dest="keep_whitespace",
            action="store_true",
        )
        arg_parser.add_argument(
            "-nw",
            "--no-whitespace",
            dest="keep_whitespace",
            action="store_false",
        )
        arg_parser.set_defaults(keep_whitespace=True)

        parsed = arg_parser.parse_args(argv)
        self.args = parsed

        cli.exit_if_empty(parsed.text, arg_parser)
        tokens = self.run(
            parsed.text,
            engine=parsed.algorithm,
            keep_whitespace=parsed.keep_whitespace,
        )
        # A trailing separator is printed on purpose — the output format is
        # "tok<sep>tok<sep>...<sep>", matching this CLI's historical output.
        print(parsed.separator.join(tokens) + parsed.separator)
×
73

74

75
class WordTokenizationApp(SubAppBase):
    """Sub-app that splits input text into words."""

    def __init__(self, *args, **kwargs):
        # Configure the tokenizer before the base initializer reads it.
        self.run = word_tokenize
        self.separator = DEFAULT_WORD_TOKEN_SEPARATOR
        self.algorithm = DEFAULT_WORD_TOKENIZE_ENGINE
        self.keep_whitespace = True
        super().__init__(*args, **kwargs)
×
82

83

84
class SentenceTokenizationApp(SubAppBase):
    """Sub-app that splits input text into sentences."""

    def __init__(self, *args, **kwargs):
        # Configure the tokenizer before the base initializer reads it.
        self.run = sent_tokenize
        self.separator = DEFAULT_SENT_TOKEN_SEPARATOR
        self.algorithm = DEFAULT_SENT_TOKENIZE_ENGINE
        self.keep_whitespace = True
        super().__init__(*args, **kwargs)
×
91

92

93
class SubwordTokenizationApp(SubAppBase):
    """Sub-app that splits input text into subword units."""

    def __init__(self, *args, **kwargs):
        # Configure the tokenizer before the base initializer reads it.
        self.run = subword_tokenize
        self.separator = DEFAULT_SUBWORD_TOKEN_SEPARATOR
        self.algorithm = DEFAULT_SUBWORD_TOKENIZE_ENGINE
        self.keep_whitespace = True
        super().__init__(*args, **kwargs)
×
100

101

102
class App:
    """Dispatcher for the ``thainlp tokenize`` command.

    Reads the token type from ``argv`` and hands the remaining arguments
    to the matching tokenization sub-app, which does its own option
    parsing, tokenizes, and prints the result.
    """

    def __init__(self, argv):
        parser = argparse.ArgumentParser(
            prog="tokenize",
            description="Break a text into small units (tokens).",
            # Fix: the usage text previously advertised a "syllable" token
            # type, but no syllable branch exists in the dispatcher below
            # and no syllable tokenizer is imported, so that input fell
            # through to "Token type not available".  The usage now matches
            # the token_type help string "[subword|word|sent]".
            usage=(
                'thainlp tokenize <token_type> [options] "<text>"\n\n'
                "token_type:\n\n"
                "subword            subword (may not be a linguistic unit)\n"
                "word               word\n"
                "sent               sentence\n\n"
                "options:\n\n"
                "--sep or -s <separator>    specify custom separator\n"
                "                           (default is a space)\n"
                "--algo or -a <algorithm>   tokenization algorithm\n"
                "                           (see API doc for more info)\n"
                "--keep-whitespace or -w    keep whitespaces in output\n"
                "                           (default)\n\n"
                "<separator> and <text> should be inside double quotes.\n\n"
                "Example:\n\n"
                'thainlp tokenize word -s "|" "ใต้แสงนีออนเปลี่ยวเหงา"\n\n'
                "--"
            ),
        )
        parser.add_argument(
            "token_type",
            type=str,
            help="[subword|word|sent]",
        )

        # argv looks like: [program, "tokenize", <token_type>, options...];
        # only the token type is parsed here, the rest goes to the sub-app.
        args = parser.parse_args(argv[2:3])
        cli.exit_if_empty(args.token_type, parser)
        token_type = args.token_type.lower()

        argv = argv[3:]
        # Prefix matching lets users abbreviate, e.g. "w" for "word";
        # "su"/"se" disambiguate "subword" from "sent".
        if token_type.startswith("w"):
            WordTokenizationApp("word", argv)
        elif token_type.startswith("su"):
            SubwordTokenizationApp("subword", argv)
        elif token_type.startswith("se"):
            SentenceTokenizationApp("sent", argv)
        else:
            print(f"Token type not available: {token_type}")
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc