PyThaiNLP / pythainlp, build 5337431273 (push, via GitHub): pending completion

Commit by wannaphong: "Add กาลพฤกษ์ to list words"

3573 of 6329 relevant lines covered (56.45%); 0.56 hits per line

Source file: /pythainlp/cli/tokenize.py (79.31% of lines in this file covered)
# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
thainlp tokenize command line.
"""

import argparse

from pythainlp import cli
from pythainlp.tokenize import (
    DEFAULT_SENT_TOKENIZE_ENGINE,
    DEFAULT_SUBWORD_TOKENIZE_ENGINE,
    DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
    DEFAULT_WORD_TOKENIZE_ENGINE,
    sent_tokenize,
    subword_tokenize,
    word_tokenize,
)

# Separators used to join tokens when printing results
DEFAULT_SENT_TOKEN_SEPARATOR = "@@"
DEFAULT_SUBWORD_TOKEN_SEPARATOR = "/"
DEFAULT_SYLLABLE_TOKEN_SEPARATOR = "~"
DEFAULT_WORD_TOKEN_SEPARATOR = "|"


class SubAppBase:
    # Subclasses set self.separator, self.algorithm, and self.run
    # before delegating to this constructor.
    def __init__(self, name, argv):
        parser = argparse.ArgumentParser(**cli.make_usage("tokenize " + name))
        parser.add_argument(
            "text",
            type=str,
            nargs="?",
            help="input text",
        )
        parser.add_argument(
            "-s",
            "--sep",
            dest="separator",
            type=str,
            help=f"default: {self.separator}",
            default=self.separator,
        )
        parser.add_argument(
            "-a",
            "--algo",
            dest="algorithm",
            type=str,
            help=f"default: {self.algorithm}",
            default=self.algorithm,
        )
        # -w and -nw both write to keep_whitespace; the default is True
        parser.add_argument(
            "-w",
            "--keep-whitespace",
            dest="keep_whitespace",
            action="store_true",
        )
        parser.add_argument(
            "-nw",
            "--no-whitespace",
            dest="keep_whitespace",
            action="store_false",
        )
        parser.set_defaults(keep_whitespace=True)

        args = parser.parse_args(argv)
        self.args = args

        cli.exit_if_empty(args.text, parser)
        result = self.run(
            args.text,
            engine=args.algorithm,
            keep_whitespace=args.keep_whitespace,
        )
        # Tokens are joined with the separator, which is also appended
        # after the last token.
        print(args.separator.join(result) + args.separator)


class WordTokenizationApp(SubAppBase):
    def __init__(self, *args, **kwargs):
        self.keep_whitespace = True
        self.algorithm = DEFAULT_WORD_TOKENIZE_ENGINE
        self.separator = DEFAULT_WORD_TOKEN_SEPARATOR
        self.run = word_tokenize
        super().__init__(*args, **kwargs)


class SentenceTokenizationApp(SubAppBase):
    def __init__(self, *args, **kwargs):
        self.keep_whitespace = True
        self.algorithm = DEFAULT_SENT_TOKENIZE_ENGINE
        self.separator = DEFAULT_SENT_TOKEN_SEPARATOR
        self.run = sent_tokenize
        super().__init__(*args, **kwargs)


class SubwordTokenizationApp(SubAppBase):
    def __init__(self, *args, **kwargs):
        self.keep_whitespace = True
        self.algorithm = DEFAULT_SUBWORD_TOKENIZE_ENGINE
        self.separator = DEFAULT_SUBWORD_TOKEN_SEPARATOR
        self.run = subword_tokenize
        super().__init__(*args, **kwargs)


class App:
    def __init__(self, argv):
        parser = argparse.ArgumentParser(
            prog="tokenize",
            description="Break a text into small units (tokens).",
            usage=(
                'thainlp tokenize <token_type> [options] "<text>"\n\n'
                "token_type:\n\n"
                "subword            subword (may not be a linguistic unit)\n"
                "syllable           syllable\n"
                "word               word\n"
                "sent               sentence\n\n"
                "options:\n\n"
                "--sep or -s <separator>    specify custom separator\n"
                "                           (default is a space)\n"
                "--algo or -a <algorithm>   tokenization algorithm\n"
                "                           (see API doc for more info)\n"
                "--keep-whitespace or -w    keep whitespaces in output\n"
                "                           (default)\n\n"
                "<separator> and <text> should be inside double quotes.\n\n"
                "Example:\n\n"
                'thainlp tokenize word -s "|" "ใต้แสงนีออนเปลี่ยวเหงา"\n\n'
                "--"
            ),
        )
        parser.add_argument(
            "token_type",
            type=str,
            help="[subword|word|sent]",
        )

        # argv[2] is the token type; everything after it belongs to the
        # chosen subcommand's own parser.
        args = parser.parse_args(argv[2:3])
        cli.exit_if_empty(args.token_type, parser)
        token_type = str.lower(args.token_type)

        argv = argv[3:]
        # Dispatch on the leading characters of the token type
        if token_type.startswith("w"):
            WordTokenizationApp("word", argv)
        elif token_type.startswith("su"):
            SubwordTokenizationApp("subword", argv)
        elif token_type.startswith("se"):
            SentenceTokenizationApp("sent", argv)
        else:
            print(f"Token type not available: {token_type}")
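
For reference, a minimal sketch of driving this module directly, mirroring the example in the help text above. It assumes pythainlp is installed and that the `thainlp` entry point passes the full argument vector to `App` (so argv[0] and argv[1] are the program and subcommand names); everything else is taken from the file itself.

from pythainlp.cli.tokenize import App
from pythainlp.tokenize import word_tokenize

# Mimic: thainlp tokenize word -s "|" "ใต้แสงนีออนเปลี่ยวเหงา"
# App reads argv[2] as the token type and hands the rest to
# WordTokenizationApp, which parses -s/-a/-w/-nw and prints the result.
App(["thainlp", "tokenize", "word", "-s", "|", "ใต้แสงนีออนเปลี่ยวเหงา"])

# Roughly equivalent direct API call: tokens joined by "|",
# with a trailing "|" as in SubAppBase.
tokens = word_tokenize("ใต้แสงนีออนเปลี่ยวเหงา", keep_whitespace=True)
print("|".join(tokens) + "|")

Note that the help text advertises a `syllable` token type, but the dispatch in `App` only matches the `w`, `su`, and `se` prefixes, so `syllable` currently falls through to the "Token type not available" branch.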