• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

PyThaiNLP / pythainlp / 4095479191

pending completion
4095479191

push

github

Wannaphong Phatthiyaphaibun
Update phoneme.py

4 of 4 new or added lines in 1 file covered. (100.0%)

41 of 5843 relevant lines covered (0.7%)

0.01 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/pythainlp/cli/tokenize.py
1
"""
2
thainlp tokenize command line.
3
"""
4

5
import argparse
×
6

7
from pythainlp import cli
×
8
from pythainlp.tokenize import (
×
9
    DEFAULT_SENT_TOKENIZE_ENGINE,
10
    DEFAULT_SUBWORD_TOKENIZE_ENGINE,
11
    DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
12
    DEFAULT_WORD_TOKENIZE_ENGINE,
13
    sent_tokenize,
14
    subword_tokenize,
15
    word_tokenize,
16
)
17

18
# Default separator strings placed between output tokens, one per token
# type; each sub-app below passes its own default to the argument parser,
# and the user can override it with --sep/-s.
DEFAULT_SENT_TOKEN_SEPARATOR = "@@"
DEFAULT_SUBWORD_TOKEN_SEPARATOR = "/"
# NOTE(review): the syllable separator is defined (and the syllable engine
# imported) but no syllable sub-app exists in this file — confirm whether
# syllable tokenization was removed or is yet to be added.
DEFAULT_SYLLABLE_TOKEN_SEPARATOR = "~"
DEFAULT_WORD_TOKEN_SEPARATOR = "|"
×
22

23

24
class SubAppBase:
    """Shared command-line front-end for the tokenization sub-apps.

    Parses the sub-command's options, then tokenizes the given text and
    prints the tokens joined by a separator.  Subclasses must set
    ``self.separator``, ``self.algorithm`` and ``self.run`` (the tokenizer
    callable) *before* delegating to this initializer, because those
    attributes are read here to build the option defaults.
    """

    def __init__(self, name, argv):
        arg_parser = argparse.ArgumentParser(**cli.make_usage(f"tokenize {name}"))
        arg_parser.add_argument(
            "text",
            type=str,
            nargs="?",
            help="input text",
        )
        arg_parser.add_argument(
            "-s",
            "--sep",
            dest="separator",
            type=str,
            help=f"default: {self.separator}",
            default=self.separator,
        )
        arg_parser.add_argument(
            "-a",
            "--algo",
            dest="algorithm",
            type=str,
            help=f"default: {self.algorithm}",
            default=self.algorithm,
        )
        # -w and -nw share one destination; whichever flag appears last on
        # the command line wins, and the default (below) is to keep spaces.
        arg_parser.add_argument(
            "-w",
            "--keep-whitespace",
            dest="keep_whitespace",
            action="store_true",
        )
        arg_parser.add_argument(
            "-nw",
            "--no-whitespace",
            dest="keep_whitespace",
            action="store_false",
        )
        arg_parser.set_defaults(keep_whitespace=True)

        parsed = arg_parser.parse_args(argv)
        self.args = parsed

        cli.exit_if_empty(parsed.text, arg_parser)
        tokens = self.run(
            parsed.text,
            engine=parsed.algorithm,
            keep_whitespace=parsed.keep_whitespace,
        )
        # A trailing separator is printed on purpose — the output format is
        # "tok<sep>tok<sep>...<sep>", matching this CLI's historical output.
        print(parsed.separator.join(tokens) + parsed.separator)
×
73

74

75
class WordTokenizationApp(SubAppBase):
    """Sub-app that splits input text into words."""

    def __init__(self, *args, **kwargs):
        # Configure the tokenizer before the base initializer reads it.
        self.run = word_tokenize
        self.separator = DEFAULT_WORD_TOKEN_SEPARATOR
        self.algorithm = DEFAULT_WORD_TOKENIZE_ENGINE
        self.keep_whitespace = True
        super().__init__(*args, **kwargs)
×
82

83

84
class SentenceTokenizationApp(SubAppBase):
    """Sub-app that splits input text into sentences."""

    def __init__(self, *args, **kwargs):
        # Configure the tokenizer before the base initializer reads it.
        self.run = sent_tokenize
        self.separator = DEFAULT_SENT_TOKEN_SEPARATOR
        self.algorithm = DEFAULT_SENT_TOKENIZE_ENGINE
        self.keep_whitespace = True
        super().__init__(*args, **kwargs)
×
91

92

93
class SubwordTokenizationApp(SubAppBase):
    """Sub-app that splits input text into subword units."""

    def __init__(self, *args, **kwargs):
        # Configure the tokenizer before the base initializer reads it.
        self.run = subword_tokenize
        self.separator = DEFAULT_SUBWORD_TOKEN_SEPARATOR
        self.algorithm = DEFAULT_SUBWORD_TOKENIZE_ENGINE
        self.keep_whitespace = True
        super().__init__(*args, **kwargs)
×
100

101

102
class App:
    """Dispatcher for the ``thainlp tokenize`` command.

    Reads the token type from ``argv`` and hands the remaining arguments
    to the matching tokenization sub-app, which does its own option
    parsing, tokenizes, and prints the result.
    """

    def __init__(self, argv):
        parser = argparse.ArgumentParser(
            prog="tokenize",
            description="Break a text into small units (tokens).",
            # Fix: the usage text previously advertised a "syllable" token
            # type, but no syllable branch exists in the dispatcher below
            # and no syllable tokenizer is imported, so that input fell
            # through to "Token type not available".  The usage now matches
            # the token_type help string "[subword|word|sent]".
            usage=(
                'thainlp tokenize <token_type> [options] "<text>"\n\n'
                "token_type:\n\n"
                "subword            subword (may not be a linguistic unit)\n"
                "word               word\n"
                "sent               sentence\n\n"
                "options:\n\n"
                "--sep or -s <separator>    specify custom separator\n"
                "                           (default is a space)\n"
                "--algo or -a <algorithm>   tokenization algorithm\n"
                "                           (see API doc for more info)\n"
                "--keep-whitespace or -w    keep whitespaces in output\n"
                "                           (default)\n\n"
                "<separator> and <text> should be inside double quotes.\n\n"
                "Example:\n\n"
                'thainlp tokenize word -s "|" "ใต้แสงนีออนเปลี่ยวเหงา"\n\n'
                "--"
            ),
        )
        parser.add_argument(
            "token_type",
            type=str,
            help="[subword|word|sent]",
        )

        # argv looks like: [program, "tokenize", <token_type>, options...];
        # only the token type is parsed here, the rest goes to the sub-app.
        args = parser.parse_args(argv[2:3])
        cli.exit_if_empty(args.token_type, parser)
        token_type = args.token_type.lower()

        argv = argv[3:]
        # Prefix matching lets users abbreviate, e.g. "w" for "word";
        # "su"/"se" disambiguate "subword" from "sent".
        if token_type.startswith("w"):
            WordTokenizationApp("word", argv)
        elif token_type.startswith("su"):
            SubwordTokenizationApp("subword", argv)
        elif token_type.startswith("se"):
            SentenceTokenizationApp("sent", argv)
        else:
            print(f"Token type not available: {token_type}")
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc