
PyThaiNLP / pythainlp / build 7218641444

15 Dec 2023 06:09AM UTC. Coverage: 84.897% (-0.6%) from 85.496%.
Push build via GitHub, committed by wannaphong ("Fix code block").

6234 of 7343 relevant lines covered (84.9%), at 0.85 hits per line.
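As a quick sanity check, the headline percentage is simply covered lines divided by relevant lines (a minimal sketch using only the figures quoted above, not part of the file under review):

    covered, relevant = 6234, 7343          # from the report header
    print(f"{covered / relevant:.3%}")      # 84.897%, matching the header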

Source file: /pythainlp/cli/tokenize.py (79.31% of its lines covered)
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Command line for PyThaiNLP's tokenizers.
"""

import argparse

from pythainlp import cli
from pythainlp.tokenize import (
    DEFAULT_SENT_TOKENIZE_ENGINE,
    DEFAULT_SUBWORD_TOKENIZE_ENGINE,
    DEFAULT_WORD_TOKENIZE_ENGINE,
    sent_tokenize,
    subword_tokenize,
    word_tokenize,
)

DEFAULT_SENT_TOKEN_SEPARATOR = "@@"
DEFAULT_SUBWORD_TOKEN_SEPARATOR = "/"
DEFAULT_SYLLABLE_TOKEN_SEPARATOR = "~"
DEFAULT_WORD_TOKEN_SEPARATOR = "|"

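# Note (not in the original listing): DEFAULT_SYLLABLE_TOKEN_SEPARATOR matches
# the "syllable" token type advertised in App's usage text further down, but
# App dispatches only on word, subword, and sent, so the constant is unused
# in this file.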
class SubAppBase:
    def __init__(self, name, argv):
        # Subclasses set self.separator, self.algorithm, and self.run before
        # delegating here; the parser defaults below read those attributes.
        parser = argparse.ArgumentParser(**cli.make_usage("tokenize " + name))
        parser.add_argument(
            "text",
            type=str,
            nargs="?",
            help="input text",
        )
        parser.add_argument(
            "-s",
            "--sep",
            dest="separator",
            type=str,
            help=f"default: {self.separator}",
            default=self.separator,
        )
        parser.add_argument(
            "-a",
            "--algo",
            dest="algorithm",
            type=str,
            help=f"default: {self.algorithm}",
            default=self.algorithm,
        )
        parser.add_argument(
            "-w",
            "--keep-whitespace",
            dest="keep_whitespace",
            action="store_true",
        )
        parser.add_argument(
            "-nw",
            "--no-whitespace",
            dest="keep_whitespace",
            action="store_false",
        )
        parser.set_defaults(keep_whitespace=True)

        args = parser.parse_args(argv)
        self.args = args

        cli.exit_if_empty(args.text, parser)
        result = self.run(
            args.text,
            engine=args.algorithm,
            keep_whitespace=args.keep_whitespace,
        )
        # Tokens are printed joined by the separator, with a trailing separator.
        print(args.separator.join(result) + args.separator)

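# A hedged illustration of the flow above (hypothetical call, using the
# WordTokenizationApp subclass defined just below; actual token boundaries
# depend on the default engine, "newmm" for word tokenization):
#
#     WordTokenizationApp("word", ["-s", "|", "ผมกินข้าว"])
#     # parses text="ผมกินข้าว" and separator="|", then prints the tokens
#     # joined by the separator, e.g. ผม|กิน|ข้าว|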
class WordTokenizationApp(SubAppBase):
    def __init__(self, *args, **kwargs):  # body not covered in this build
        self.keep_whitespace = True
        self.algorithm = DEFAULT_WORD_TOKENIZE_ENGINE
        self.separator = DEFAULT_WORD_TOKEN_SEPARATOR
        self.run = word_tokenize
        super().__init__(*args, **kwargs)


class SentenceTokenizationApp(SubAppBase):
    def __init__(self, *args, **kwargs):  # body not covered in this build
        self.keep_whitespace = True
        self.algorithm = DEFAULT_SENT_TOKENIZE_ENGINE
        self.separator = DEFAULT_SENT_TOKEN_SEPARATOR
        self.run = sent_tokenize
        super().__init__(*args, **kwargs)


class SubwordTokenizationApp(SubAppBase):
    def __init__(self, *args, **kwargs):
        self.keep_whitespace = True
        self.algorithm = DEFAULT_SUBWORD_TOKENIZE_ENGINE
        self.separator = DEFAULT_SUBWORD_TOKEN_SEPARATOR
        self.run = subword_tokenize
        super().__init__(*args, **kwargs)

class App:
    def __init__(self, argv):
        parser = argparse.ArgumentParser(
            prog="tokenize",
            description="Break a text into small units (tokens).",
            usage=(
                'thainlp tokenize <token_type> [options] "<text>"\n\n'
                "token_type:\n\n"
                "subword            subword (may not be a linguistic unit)\n"
                "syllable           syllable\n"
                "word               word\n"
                "sent               sentence\n\n"
                "options:\n\n"
                "--sep or -s <separator>    specify custom separator\n"
                "                           (default depends on token type)\n"
                "--algo or -a <algorithm>   tokenization algorithm\n"
                "                           (see API doc for more info)\n"
                "--keep-whitespace or -w    keep whitespaces in output\n"
                "                           (default)\n\n"
                "<separator> and <text> should be inside double quotes.\n\n"
                "Example:\n\n"
                'thainlp tokenize word -s "|" "ใต้แสงนีออนเปลี่ยวเหงา"\n\n'
                "--"
            ),
        )
        parser.add_argument(
            "token_type",
            type=str,
            help="[subword|word|sent]",
        )

        args = parser.parse_args(argv[2:3])
        cli.exit_if_empty(args.token_type, parser)
        token_type = args.token_type.lower()

        argv = argv[3:]
        if token_type.startswith("w"):
            WordTokenizationApp("word", argv)  # not covered in this build
        elif token_type.startswith("su"):
            SubwordTokenizationApp("subword", argv)
        elif token_type.startswith("se"):
            SentenceTokenizationApp("sent", argv)  # not covered in this build
        else:
            print(f"Token type not available: {token_type}")
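
Putting it together, the dispatcher supports invocations such as the following. The first command is taken verbatim from the usage text above; the second is a hypothetical variant, and exact token boundaries depend on the engine installed with PyThaiNLP:

    thainlp tokenize word -s "|" "ใต้แสงนีออนเปลี่ยวเหงา"
    thainlp tokenize subword "ประเทศไทย"

Because App reads the token type from argv[2:3] and forwards argv[3:] to the sub-app, the same path can also be exercised directly in Python, e.g. App(["thainlp", "tokenize", "word", "ผมกินข้าว"]).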