
PyThaiNLP / pythainlp / build 7218641444

15 Dec 2023 06:09AM UTC. Coverage: 84.897% (-0.6%) from 85.496%.
Push build via GitHub, committed by wannaphong ("Fix code block").

6234 of 7343 relevant lines covered (84.9%), at 0.85 hits per line.
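As a quick sanity check, the headline percentage is simply covered lines divided by relevant lines (a minimal sketch using only the figures quoted above, not part of the file under review):

    covered, relevant = 6234, 7343          # from the report header
    print(f"{covered / relevant:.3%}")      # 84.897%, matching the header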

Source file: /pythainlp/cli/tokenize.py (79.31% of its lines covered)
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Command line for PyThaiNLP's tokenizers.
"""

import argparse

from pythainlp import cli
from pythainlp.tokenize import (
    DEFAULT_SENT_TOKENIZE_ENGINE,
    DEFAULT_SUBWORD_TOKENIZE_ENGINE,
    DEFAULT_WORD_TOKENIZE_ENGINE,
    sent_tokenize,
    subword_tokenize,
    word_tokenize,
)

DEFAULT_SENT_TOKEN_SEPARATOR = "@@"
DEFAULT_SUBWORD_TOKEN_SEPARATOR = "/"
DEFAULT_SYLLABLE_TOKEN_SEPARATOR = "~"
DEFAULT_WORD_TOKEN_SEPARATOR = "|"

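# Note (not in the original listing): DEFAULT_SYLLABLE_TOKEN_SEPARATOR matches
# the "syllable" token type advertised in App's usage text further down, but
# App dispatches only on word, subword, and sent, so the constant is unused
# in this file.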
class SubAppBase:
    def __init__(self, name, argv):
        # Subclasses set self.separator, self.algorithm, and self.run before
        # delegating here; the parser defaults below read those attributes.
        parser = argparse.ArgumentParser(**cli.make_usage("tokenize " + name))
        parser.add_argument(
            "text",
            type=str,
            nargs="?",
            help="input text",
        )
        parser.add_argument(
            "-s",
            "--sep",
            dest="separator",
            type=str,
            help=f"default: {self.separator}",
            default=self.separator,
        )
        parser.add_argument(
            "-a",
            "--algo",
            dest="algorithm",
            type=str,
            help=f"default: {self.algorithm}",
            default=self.algorithm,
        )
        parser.add_argument(
            "-w",
            "--keep-whitespace",
            dest="keep_whitespace",
            action="store_true",
        )
        parser.add_argument(
            "-nw",
            "--no-whitespace",
            dest="keep_whitespace",
            action="store_false",
        )
        parser.set_defaults(keep_whitespace=True)

        args = parser.parse_args(argv)
        self.args = args

        cli.exit_if_empty(args.text, parser)
        result = self.run(
            args.text,
            engine=args.algorithm,
            keep_whitespace=args.keep_whitespace,
        )
        # Tokens are printed joined by the separator, with a trailing separator.
        print(args.separator.join(result) + args.separator)

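# A hedged illustration of the flow above (hypothetical call, using the
# WordTokenizationApp subclass defined just below; actual token boundaries
# depend on the default engine, "newmm" for word tokenization):
#
#     WordTokenizationApp("word", ["-s", "|", "ผมกินข้าว"])
#     # parses text="ผมกินข้าว" and separator="|", then prints the tokens
#     # joined by the separator, e.g. ผม|กิน|ข้าว|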
class WordTokenizationApp(SubAppBase):
    def __init__(self, *args, **kwargs):  # body not covered in this build
        self.keep_whitespace = True
        self.algorithm = DEFAULT_WORD_TOKENIZE_ENGINE
        self.separator = DEFAULT_WORD_TOKEN_SEPARATOR
        self.run = word_tokenize
        super().__init__(*args, **kwargs)


class SentenceTokenizationApp(SubAppBase):
    def __init__(self, *args, **kwargs):  # body not covered in this build
        self.keep_whitespace = True
        self.algorithm = DEFAULT_SENT_TOKENIZE_ENGINE
        self.separator = DEFAULT_SENT_TOKEN_SEPARATOR
        self.run = sent_tokenize
        super().__init__(*args, **kwargs)


class SubwordTokenizationApp(SubAppBase):
    def __init__(self, *args, **kwargs):
        self.keep_whitespace = True
        self.algorithm = DEFAULT_SUBWORD_TOKENIZE_ENGINE
        self.separator = DEFAULT_SUBWORD_TOKEN_SEPARATOR
        self.run = subword_tokenize
        super().__init__(*args, **kwargs)

class App:
    def __init__(self, argv):
        parser = argparse.ArgumentParser(
            prog="tokenize",
            description="Break a text into small units (tokens).",
            usage=(
                'thainlp tokenize <token_type> [options] "<text>"\n\n'
                "token_type:\n\n"
                "subword            subword (may not be a linguistic unit)\n"
                "syllable           syllable\n"
                "word               word\n"
                "sent               sentence\n\n"
                "options:\n\n"
                "--sep or -s <separator>    specify custom separator\n"
                "                           (default depends on token type)\n"
                "--algo or -a <algorithm>   tokenization algorithm\n"
                "                           (see API doc for more info)\n"
                "--keep-whitespace or -w    keep whitespaces in output\n"
                "                           (default)\n\n"
                "<separator> and <text> should be inside double quotes.\n\n"
                "Example:\n\n"
                'thainlp tokenize word -s "|" "ใต้แสงนีออนเปลี่ยวเหงา"\n\n'
                "--"
            ),
        )
        parser.add_argument(
            "token_type",
            type=str,
            help="[subword|word|sent]",
        )

        args = parser.parse_args(argv[2:3])
        cli.exit_if_empty(args.token_type, parser)
        token_type = args.token_type.lower()

        argv = argv[3:]
        if token_type.startswith("w"):
            WordTokenizationApp("word", argv)  # not covered in this build
        elif token_type.startswith("su"):
            SubwordTokenizationApp("subword", argv)
        elif token_type.startswith("se"):
            SentenceTokenizationApp("sent", argv)  # not covered in this build
        else:
            print(f"Token type not available: {token_type}")
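
Putting it together, the dispatcher supports invocations such as the following. The first command is taken verbatim from the usage text above; the second is a hypothetical variant, and exact token boundaries depend on the engine installed with PyThaiNLP:

    thainlp tokenize word -s "|" "ใต้แสงนีออนเปลี่ยวเหงา"
    thainlp tokenize subword "ประเทศไทย"

Because App reads the token type from argv[2:3] and forwards argv[3:] to the sub-app, the same path can also be exercised directly in Python, e.g. App(["thainlp", "tokenize", "word", "ผมกินข้าว"]).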