• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

PyThaiNLP / pythainlp / 21764458295

06 Feb 2026 08:14PM UTC coverage: 65.333% (-0.02%) from 65.355%
21764458295

push

github

web-flow
Merge pull request #1291 from PyThaiNLP/copilot/improve-corpus-test-speed

Optimize corpus tests: mock downloads, separate data validation, suppress CLI output

6059 of 9274 relevant lines covered (65.33%)

0.65 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.74
/pythainlp/cli/data.py
1
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
2
# SPDX-FileType: SOURCE
3
# SPDX-License-Identifier: Apache-2.0
4
"""Command line for PyThaiNLP's dataset/corpus management."""
5

6
from __future__ import annotations
1✔
7

8
import argparse
1✔
9
from typing import TYPE_CHECKING
1✔
10

11
from pythainlp import corpus
1✔
12
from pythainlp.tools import get_pythainlp_data_path
1✔
13

14
if TYPE_CHECKING:
15
    from collections.abc import Sequence
16

17

18
class App:
    """Command-line handler for the ``thainlp data`` subcommands.

    Instantiating the class parses the subcommand found at ``argv[2]`` and
    immediately dispatches to the method of the same name (``catalog``,
    ``info``, ``get``, ``rm`` or ``path``), handing it the full ``argv``.
    """

    def __init__(self, argv: Sequence[str]) -> None:
        """Parse the subcommand from ``argv`` and run it.

        :param argv: full command line; ``argv[2]`` is read as the
            subcommand, ``argv[3:]`` is left for the subcommand handler
        """
        parser = argparse.ArgumentParser(
            prog="data",
            description="Manage dataset/corpus.",
            usage=(
                "thainlp data <subcommand>\n\n"
                "subcommands:\n\n"
                "catalog                show list of available datasets\n"
                "info <dataset_name>    show information about the dataset\n"
                "get <dataset_name>     download the dataset\n"
                "rm <dataset_name>      remove the dataset\n"
                "path                   show full path to data directory\n\n"
                "Example:\n\n"
                "thainlp data get thai2fit_wv\n\n"
                "Current data path:\n\n"
                f"{get_pythainlp_data_path()}\n\n"
                "To change PyThaiNLP data path, set the operating system's\n"
                "PYTHAINLP_DATA_DIR environment variable.\n\n"
                "For more information about corpora that PyThaiNLP use, see:\n"
                "https://github.com/PyThaiNLP/pythainlp-corpus/\n\n"
                "--"
            ),
        )
        parser.add_argument(
            "subcommand",
            type=str,
            choices=["catalog", "info", "get", "rm", "path"],
            help="action on dataset/corpus",
        )
        # Only the subcommand token is parsed here; each handler parses its
        # own arguments from argv[3:].
        args = parser.parse_args(argv[2:3])
        # `choices` above restricts the name to the public handlers below.
        getattr(self, args.subcommand)(argv)

    @staticmethod
    def _parse_dataset_name(
        argv: Sequence[str], description: str, usage: str
    ) -> str:
        """Return the single ``dataset_name`` argument taken from ``argv[3:]``.

        Shared by ``get``, ``rm`` and ``info``, which all accept exactly one
        positional dataset name. argparse reports the error and exits when
        the argument is missing or extra arguments are present.

        :param argv: full command line
        :param description: parser description shown in the help text
        :param usage: usage line shown in the help text
        :return: the dataset name given on the command line
        """
        parser = argparse.ArgumentParser(description=description, usage=usage)
        parser.add_argument(
            "dataset_name",
            type=str,
            help="dataset/corpus's name",
        )
        return parser.parse_args(argv[3:]).dataset_name

    def get(self, argv: Sequence[str]) -> None:
        """Download the dataset named in ``argv`` and print the outcome.

        Prints "Downloaded successfully." when ``corpus.download`` returns a
        truthy value, "Not found." otherwise.
        """
        dataset_name = self._parse_dataset_name(
            argv,
            description="Download a dataset",
            usage="thainlp data get <dataset_name>",
        )
        if corpus.download(dataset_name):
            print("Downloaded successfully.")
        else:
            print("Not found.")

    def rm(self, argv: Sequence[str]) -> None:
        """Remove the dataset named in ``argv`` and print the outcome.

        Prints "Removed successfully." when ``corpus.remove`` returns a
        truthy value, "Not found." otherwise.
        """
        dataset_name = self._parse_dataset_name(
            argv,
            description="Remove a dataset",
            usage="thainlp data rm <dataset_name>",
        )
        if corpus.remove(dataset_name):
            print("Removed successfully.")
        else:
            print("Not found.")

    def info(self, argv: Sequence[str]) -> None:
        """Print the local record of the dataset named in ``argv``.

        Prints whatever ``corpus.get_corpus_db_detail`` returns when it is
        truthy, "Not found." otherwise.
        """
        dataset_name = self._parse_dataset_name(
            argv,
            description="Print information about a dataset",
            usage="thainlp data info <dataset_name>",
        )
        detail = corpus.get_corpus_db_detail(dataset_name)
        if detail:
            print(detail)
        else:
            print("Not found.")

    def catalog(self, argv: Sequence[str]) -> None:
        """Print dataset/corpus available for download.

        ``argv`` is accepted for a uniform handler signature and is unused.
        """
        corpus_db_response = corpus.get_corpus_db(corpus.corpus_db_url())
        # NOTE(review): the original suppressed a `union-attr` type error
        # here, so the response is presumably None when the catalog cannot
        # be fetched; guard instead of crashing with AttributeError.
        if corpus_db_response is None:
            print("Cannot retrieve the dataset/corpus catalog.")
            return

        corpus_db_dict: dict[str, dict[str, str]] = corpus_db_response.json()
        print("Dataset/corpus available for download:")
        for name in sorted(corpus_db_dict):
            line = f"- {name} {corpus_db_dict[name]['latest_version']}"
            corpus_info = corpus.get_corpus_db_detail(name)
            if corpus_info:
                # Show the locally recorded version next to the latest one.
                line += f"  (Local: {corpus_info['version']})"
            print(line)

        print(
            "\nUse subcommand 'get' to download a dataset.\n\n"
            "Example: thainlp data get crfcut\n"
        )

    def path(self, argv: Sequence[str]) -> None:
        """Print path of local dataset.

        ``argv`` is accepted for a uniform handler signature and is unused.
        """
        print(get_pythainlp_data_path())
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc