PyThaiNLP / pythainlp / build 11625814262

01 Nov 2024 07:14AM UTC coverage: 20.782% (+20.8%) from 0.0%

Pull Request #952: Specify a limited test suite
Merge c8385dcae into 515fe7ced (github / web-flow)

45 of 80 new or added lines in 48 files covered (56.25%)
1537 of 7396 relevant lines covered (20.78%)
0.21 hits per line

Source file: /pythainlp/tokenize/core.py (54.22% covered)
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Generic functions of tokenizers
"""

import copy
import re
from typing import Iterable, List, Union

from pythainlp.tokenize import (
    DEFAULT_SENT_TOKENIZE_ENGINE,
    DEFAULT_SUBWORD_TOKENIZE_ENGINE,
    DEFAULT_SYLLABLE_DICT_TRIE,
    DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
    DEFAULT_WORD_DICT_TRIE,
    DEFAULT_WORD_TOKENIZE_ENGINE,
)
from pythainlp.tokenize._utils import (
    apply_postprocessors,
    rejoin_formatted_num,
    strip_whitespace,
)
from pythainlp.util.trie import Trie, dict_trie


def clause_tokenize(doc: List[str]) -> List[List[str]]:
    """
    Clause tokenizer (or clause segmentation).
    Tokenizes a running word list into a list of clauses (lists of strings).
    Splits by a CRF model trained on the Blackboard Treebank.

    :param list[str] doc: word list to be clause tokenized
    :return: list of clauses
    :rtype: list[list[str]]
    :Example:
    ::

        from pythainlp.tokenize import clause_tokenize

        clause_tokenize(
            [
                "ฉัน",
                "นอน",
                "และ",
                "คุณ",
                "เล่น",
                "มือถือ",
                "ส่วน",
                "น้อง",
                "เขียน",
                "โปรแกรม",
            ]
        )
        # [['ฉัน', 'นอน'],
        # ['และ', 'คุณ', 'เล่น', 'มือถือ'],
        # ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
    """
    from pythainlp.tokenize.crfcls import segment

    return segment(doc)


def word_detokenize(
    segments: Union[List[List[str]], List[str]], output: str = "str"
) -> Union[List[str], str]:
    """
    Word detokenizer.

    This function joins the list of words in each sentence back into text.

    :param segments: list of sentences, each being a list of words,
                     or a single list of words
    :param str output: the output type (str or list)
    :return: the Thai text
    :rtype: Union[str, List[str]]
    :Example:
    ::

        from pythainlp.tokenize import word_detokenize

        print(word_detokenize(["เรา", "เล่น"]))
        # output: เราเล่น
    """
    list_all = []

    if isinstance(segments[0], str):
        segments = [segments]

    from pythainlp import thai_characters

    for i, s in enumerate(segments):
        list_sents = []
        add_index = []
        space_index = []
        mark_index = []
        for j, w in enumerate(s):
            if j > 0:
                # previous word
                p_w = s[j - 1]
                # if w is a number or non-Thai and is not a space
                if (
                    w[0] not in thai_characters
                    and not w.isspace()
                    and not p_w.isspace()
                ):
                    list_sents.append(" ")
                    add_index.append(j)
                # if the previous word is a number or non-Thai and is not a space
                elif p_w[0] not in thai_characters and not p_w.isspace():
                    list_sents.append(" ")
                    add_index.append(j)
                # if the word is the Thai iteration mark
                elif w == "ๆ":
                    if not p_w.isspace():
                        list_sents.append(" ")
                    mark_index.append(j)
                elif w.isspace() and j - 1 not in space_index:
                    space_index.append(j)
                elif j - 1 in mark_index:
                    list_sents.append(" ")
            list_sents.append(w)
        list_all.append(list_sents)

    if output == "list":
        return list_all

    text = []
    for i in list_all:
        text.append("".join(i))
    return " ".join(text)


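# Illustrative usage sketch for word_detokenize (not taken from the library
# docs): a separating space is re-inserted around non-Thai tokens, and
# output="list" returns per-sentence token lists instead of a joined string.
# Expected outputs are indicative only.
#
#     from pythainlp.tokenize import word_detokenize
#
#     word_detokenize(["ผม", "เล่น", "game"])
#     # => "ผมเล่น game"  (space added before the non-Thai token)
#
#     word_detokenize([["เรา", "เล่น"]], output="list")
#     # => [["เรา", "เล่น"]]

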
def word_tokenize(
    text: str,
    custom_dict: Trie = Trie([]),
    engine: str = DEFAULT_WORD_TOKENIZE_ENGINE,
    keep_whitespace: bool = True,
    join_broken_num: bool = True,
) -> List[str]:
    """
    Word tokenizer.

    Tokenizes running text into words (list of strings).

    :param str text: text to be tokenized
    :param str engine: name of the tokenizer to be used
    :param pythainlp.util.Trie custom_dict: dictionary trie (some engines may not support it)
    :param bool keep_whitespace: True to keep whitespace, a common mark
                                 for end of phrase in Thai.
                                 Otherwise, whitespace is omitted.
    :param bool join_broken_num: True to rejoin formatted numerics that could be wrongly separated.
                                 Otherwise, formatted numerics may remain wrongly separated.

    :return: list of words
    :rtype: List[str]
    **Options for engine**
        * *attacut* - wrapper for
          `AttaCut <https://github.com/PyThaiNLP/attacut>`_,
          learning-based approach
        * *deepcut* - wrapper for
          `DeepCut <https://github.com/rkcosmos/deepcut>`_,
          learning-based approach
        * *icu* - wrapper for a word tokenizer in
          `PyICU <https://gitlab.pyicu.org/main/pyicu>`_,
          from ICU (International Components for Unicode),
          dictionary-based
        * *longest* - dictionary-based, longest matching
        * *mm* - "multi-cut", dictionary-based, maximum matching
        * *nercut* - dictionary-based, maximal matching,
          constrained by Thai Character Cluster (TCC) boundaries,
          combining tokens that are parts of the same named-entity
        * *newmm* (default) - "new multi-cut",
          dictionary-based, maximum matching,
          constrained by Thai Character Cluster (TCC) boundaries
          with improved TCC rules
        * *newmm-safe* - newmm, with a mechanism to avoid long
          processing time for text with continuously ambiguous breaking points
        * *nlpo3* - wrapper for a word tokenizer in
          `nlpO3 <https://github.com/PyThaiNLP/nlpo3>`_,
          an adaptation of newmm in Rust (2.5x faster)
        * *oskut* - wrapper for
          `OSKut <https://github.com/mrpeerat/OSKut>`_,
          Out-of-domain StacKed cut for Word Segmentation
        * *sefr_cut* - wrapper for
          `SEFR CUT <https://github.com/mrpeerat/SEFR_CUT>`_,
          Stacked Ensemble Filter and Refine for Word Segmentation
        * *tltk* - wrapper for
          `TLTK <https://pypi.org/project/tltk/>`_,
          maximum collocation approach
    :Note:
        - The **custom_dict** parameter only works for the \
          *deepcut*, *longest*, *mm* (*multi_cut*), *newmm*, and *newmm-safe* engines.
    :Example:

    Tokenize text with different tokenizers::

        from pythainlp.tokenize import word_tokenize

        text = "โอเคบ่พวกเรารักภาษาบ้านเกิด"

        word_tokenize(text, engine="newmm")
        # output: ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']

        word_tokenize(text, engine='attacut')
        # output: ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']

    Tokenize text with whitespace omitted::

        text = "วรรณกรรม ภาพวาด และการแสดงงิ้ว "

        word_tokenize(text, engine="newmm")
        # output:
        # ['วรรณกรรม', ' ', 'ภาพวาด', ' ', 'และ', 'การแสดง', 'งิ้ว', ' ']

        word_tokenize(text, engine="newmm", keep_whitespace=False)
        # output: ['วรรณกรรม', 'ภาพวาด', 'และ', 'การแสดง', 'งิ้ว']

    Join broken formatted numerics (e.g. time, decimals, IP addresses)::

        text = "เงิน1,234บาท19:32น 127.0.0.1"

        word_tokenize(text, engine="attacut", join_broken_num=False)
        # output:
        # ['เงิน', '1', ',', '234', 'บาท', '19', ':', '32น', ' ',
        #  '127', '.', '0', '.', '0', '.', '1']

        word_tokenize(text, engine="attacut", join_broken_num=True)
        # output:
        # ['เงิน', '1,234', 'บาท', '19:32น', ' ', '127.0.0.1']

    Tokenize with default and custom dictionaries::

        from pythainlp.corpus.common import thai_words
        from pythainlp.tokenize import dict_trie

        text = 'ชินโซ อาเบะ เกิด 21 กันยายน'

        word_tokenize(text, engine="newmm")
        # output:
        # ['ชิน', 'โซ', ' ', 'อา', 'เบะ', ' ',
        #  'เกิด', ' ', '21', ' ', 'กันยายน']

        custom_dict_japanese_name = set(thai_words())
        custom_dict_japanese_name.add('ชินโซ')
        custom_dict_japanese_name.add('อาเบะ')

        trie = dict_trie(dict_source=custom_dict_japanese_name)

        word_tokenize(text, engine="newmm", custom_dict=trie)
        # output:
        # ['ชินโซ', ' ', 'อาเบะ', ' ',
        #  'เกิด', ' ', '21', ' ', 'กันยายน']
    """
    if not text or not isinstance(text, str):
        return []

    segments = []

    if engine in ("newmm", "onecut"):
        from pythainlp.tokenize.newmm import segment

        segments = segment(text, custom_dict)
    elif engine == "newmm-safe":
        from pythainlp.tokenize.newmm import segment

        segments = segment(text, custom_dict, safe_mode=True)
    elif engine == "attacut":
        from pythainlp.tokenize.attacut import segment

        segments = segment(text)
    elif engine == "longest":
        from pythainlp.tokenize.longest import segment

        segments = segment(text, custom_dict)
    elif engine in ("mm", "multi_cut"):
        from pythainlp.tokenize.multi_cut import segment

        segments = segment(text, custom_dict)
    elif engine == "deepcut":  # deepcut can optionally use dictionary
        from pythainlp.tokenize.deepcut import segment

        if custom_dict:
            custom_dict = list(custom_dict)
            segments = segment(text, custom_dict)
        else:
            segments = segment(text)
    elif engine == "icu":
        from pythainlp.tokenize.pyicu import segment

        segments = segment(text)
    elif engine == "nercut":
        from pythainlp.tokenize.nercut import segment

        segments = segment(text)
    elif engine == "sefr_cut":
        from pythainlp.tokenize.sefr_cut import segment

        segments = segment(text)
    elif engine == "tltk":
        from pythainlp.tokenize.tltk import segment

        segments = segment(text)
    elif engine == "oskut":
        from pythainlp.tokenize.oskut import segment

        segments = segment(text)
    elif engine == "nlpo3":
        from pythainlp.tokenize.nlpo3 import segment

        # Currently cannot handle custom_dict from inside word_tokenize(),
        # due to difference in type.
        # if isinstance(custom_dict, str):
        #    segments = segment(text, custom_dict=custom_dict)
        # elif not isinstance(custom_dict, str) and not custom_dict:
        #    raise ValueError(
        #        f"""Tokenizer \"{engine}\":
        #        custom_dict must be a str.
        #        It is a dictionary name as assigned with load_dict().
        #        See pythainlp.tokenize.nlpo3.load_dict()"""
        #    )
        # else:
        #    segments = segment(text)
        segments = segment(text)
    else:
        raise ValueError(
            f"""Tokenizer \"{engine}\" not found.
            It might be a typo; if not, please consult our documentation."""
        )

    postprocessors = []
    if join_broken_num:
        postprocessors.append(rejoin_formatted_num)

    if not keep_whitespace:
        postprocessors.append(strip_whitespace)

    segments = apply_postprocessors(segments, postprocessors)

    return segments


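# Illustrative sketch (not from the library docs): per the engine notes above,
# "newmm-safe" behaves like "newmm" but bounds processing time on text with
# continuously ambiguous breaking points. The stress-test string below is
# hypothetical and the exact segmentation is indicative only.
#
#     from pythainlp.tokenize import word_tokenize
#
#     long_ambiguous = "ๆ" * 100 + "มาก" * 50  # hypothetical pathological input
#     tokens = word_tokenize(long_ambiguous, engine="newmm-safe")
#     # returns a list of tokens; segmentation may differ from engine="newmm"

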
def indices_words(words):
    """Return inclusive (start, end) character index pairs for each word,
    as positions in the concatenation of ``words``."""
    indices = []
    start_index = 0
    for word in words:
        end_index = start_index + len(word) - 1
        indices.append((start_index, end_index))
        start_index += len(word)

    return indices


def map_indices_to_words(index_list, sentences):
    """Slice each sentence string by the character index pairs in
    ``index_list``, consuming the pairs that fall within each sentence."""
    result = []
    c = copy.copy(index_list)
    n_sum = 0
    for sentence in sentences:
        words = sentence
        sentence_result = []
        n = 0
        for start, end in c:
            if start > n_sum + len(words) - 1:
                break
            else:
                word = sentence[start - n_sum : end + 1 - n_sum]
                sentence_result.append(word)
                n += 1

        result.append(sentence_result)
        n_sum += len(words)
        for _ in range(n):
            del c[0]
    return result


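# Illustrative sketch (these helpers are internal; behavior is inferred from
# the code above): indices_words() gives inclusive character spans of each
# word in the concatenated text, and map_indices_to_words() slices those
# spans back out of a list of sentence strings. Outputs are indicative only.
#
#     words = ["ฉัน", "ไป", "บ้าน"]
#     indices_words(words)
#     # => [(0, 2), (3, 4), (5, 8)]
#
#     map_indices_to_words(indices_words(words), ["ฉันไปบ้าน"])
#     # => [["ฉัน", "ไป", "บ้าน"]]

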
def sent_tokenize(
    text: Union[str, List[str]],
    engine: str = DEFAULT_SENT_TOKENIZE_ENGINE,
    keep_whitespace: bool = True,
) -> List[str]:
    """
    Sentence tokenizer.

    Tokenizes running text into "sentences". Supports both a string and
    a list of words (list of strings) as input.

    :param text: the text (string) or list of words (list of strings) to be tokenized
    :param str engine: choose among *'crfcut'*, *'thaisum'*, *'tltk'*, *'wtp'*, \
    *'whitespace'*, *'whitespace+newline'*
    :param bool keep_whitespace: True to keep whitespace; otherwise,
                                 whitespace is stripped from the output
    :return: list of split sentences
    :rtype: list[str]
    **Options for engine**
        * *crfcut* - (default) split by CRF trained on TED dataset
        * *thaisum* - the implementation of the sentence segmenter from \
            Nakhun Chumpolsathien, 2020
        * *tltk* - split by `TLTK <https://pypi.org/project/tltk/>`_
        * *wtp* - split by `wtpsplit <https://github.com/bminixhofer/wtpsplit>`_. \
            It supports several model sizes: use ``wtp`` or ``wtp-mini`` for the \
            ``wtp-bert-mini`` model (default), ``wtp-tiny`` for ``wtp-bert-tiny``, \
            ``wtp-base`` for ``wtp-canine-s-1l``, \
            and ``wtp-large`` for ``wtp-canine-s-12l``.
        * *whitespace+newline* - split by whitespace and newline
        * *whitespace* - split by whitespace, specifically with \
                         the regex pattern ``r" +"``
    :Example:

    Split the text based on *whitespace*::

        from pythainlp.tokenize import sent_tokenize

        sentence_1 = "ฉันไปประชุมเมื่อวันที่ 11 มีนาคม"
        sentence_2 = "ข้าราชการได้รับการหมุนเวียนเป็นระยะ \\
        และได้รับมอบหมายให้ประจำในระดับภูมิภาค"

        sent_tokenize(sentence_1, engine="whitespace")
        # output: ['ฉันไปประชุมเมื่อวันที่', '11', 'มีนาคม']

        sent_tokenize(sentence_2, engine="whitespace")
        # output: ['ข้าราชการได้รับการหมุนเวียนเป็นระยะ',
        #   '\\nและได้รับมอบหมายให้ประจำในระดับภูมิภาค']

    Split the text based on *whitespace* and *newline*::

        sentence_1 = "ฉันไปประชุมเมื่อวันที่ 11 มีนาคม"
        sentence_2 = "ข้าราชการได้รับการหมุนเวียนเป็นระยะ \\
        และได้รับมอบหมายให้ประจำในระดับภูมิภาค"

        sent_tokenize(sentence_1, engine="whitespace+newline")
        # output: ['ฉันไปประชุมเมื่อวันที่', '11', 'มีนาคม']
        sent_tokenize(sentence_2, engine="whitespace+newline")
        # output: ['ข้าราชการได้รับการหมุนเวียนเป็นระยะ',
        #   '\\nและได้รับมอบหมายให้ประจำในระดับภูมิภาค']

    Split the text using CRF trained on TED dataset::

        sentence_1 = "ฉันไปประชุมเมื่อวันที่ 11 มีนาคม"
        sentence_2 = "ข้าราชการได้รับการหมุนเวียนเป็นระยะ \\
        และเขาได้รับมอบหมายให้ประจำในระดับภูมิภาค"

        sent_tokenize(sentence_1, engine="crfcut")
        # output: ['ฉันไปประชุมเมื่อวันที่ 11 มีนาคม']

        sent_tokenize(sentence_2, engine="crfcut")
        # output: ['ข้าราชการได้รับการหมุนเวียนเป็นระยะ ',
        #   'และเขาได้รับมอบหมายให้ประจำในระดับภูมิภาค']
    """

    if not text or not isinstance(text, (str, list)):
        return []

    is_list_input = isinstance(text, list)

    if is_list_input:
        try:
            original_text = "".join(text)
        except ValueError:
            return []

    else:
        original_text = text

    segments = []

    if engine == "crfcut":
        from pythainlp.tokenize.crfcut import segment

        segments = segment(original_text)

        if is_list_input:
            word_indices = indices_words(text)
            result = map_indices_to_words(word_indices, [original_text])
            return result
    elif engine == "whitespace":
        segments = re.split(r" +", original_text, flags=re.U)
        if is_list_input:
            result = []
            _temp: list[str] = []
            for i, w in enumerate(text):
                if re.findall(r" ", w) != [] and re.findall(r"\w", w) == []:
                    if not _temp:
                        continue
                    result.append(_temp)
                    _temp = []
                else:
                    _temp.append(w)
                if i + 1 == len(text):
                    result.append(_temp)
            return result
    elif engine == "whitespace+newline":
        segments = original_text.split()
        if is_list_input:
            result = []
            _temp = []
            for i, w in enumerate(text):
                if (
                    re.findall(r"\s", w) != [] or re.findall(r"\n", w) != []
                ) and re.findall(r"\w", w) == []:
                    if not _temp:
                        continue
                    result.append(_temp)
                    _temp = []
                else:
                    _temp.append(w)
                if i + 1 == len(text):
                    result.append(_temp)
            return result
    elif engine == "tltk":
        from pythainlp.tokenize.tltk import sent_tokenize as segment

        segments = segment(original_text)
    elif engine == "thaisum":
        from pythainlp.tokenize.thaisumcut import (
            ThaiSentenceSegmentor as segmentor,
        )

        segment = segmentor()
        segments = segment.split_into_sentences(original_text)
    elif engine.startswith("wtp"):
        if "-" not in engine:
            _size = "mini"
        else:
            _size = engine.split("-")[-1]
        from pythainlp.tokenize.wtsplit import tokenize as segment

        segments = segment(original_text, size=_size, tokenize="sentence")
    else:
        raise ValueError(
            f"""Tokenizer \"{engine}\" not found.
            It might be a typo; if not, please consult our documentation."""
        )

    if not keep_whitespace:
        segments = strip_whitespace(segments)

    if is_list_input and engine not in ["crfcut"]:
        word_indices = indices_words(text)
        result = map_indices_to_words(word_indices, segments)
        return result
    else:
        return [segments]


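# Illustrative sketch (not from the library docs): sent_tokenize() also accepts
# a pre-tokenized list of words; in that case each returned sentence is itself
# a list of word strings, re-sliced via indices_words()/map_indices_to_words().
# Output below is indicative only.
#
#     from pythainlp.tokenize import sent_tokenize, word_tokenize
#
#     words = word_tokenize("ฉันไปโรงเรียน วันนี้ฝนตก")
#     sent_tokenize(words, engine="whitespace")
#     # => e.g. [['ฉัน', 'ไป', 'โรงเรียน'], ['วันนี้', 'ฝน', 'ตก']]

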
def paragraph_tokenize(
    text: str,
    engine: str = "wtp-mini",
    paragraph_threshold: float = 0.5,
    style: str = "newline",
) -> List[List[str]]:
    """
    Paragraph tokenizer.

    Tokenizes text into paragraphs.

    :param str text: text to be tokenized
    :param str engine: the name of the paragraph tokenizer
    :param float paragraph_threshold: threshold passed to the wtp backend
                                      when deciding paragraph breaks
    :param str style: splitting style passed through to the wtp backend
    :return: list of paragraphs
    :rtype: List[List[str]]
    **Options for engine**
        * *wtp* - split by `wtpsplit <https://github.com/bminixhofer/wtpsplit>`_. \
            It supports several model sizes: use ``wtp`` or ``wtp-mini`` for the \
            ``wtp-bert-mini`` model (default), ``wtp-tiny`` for ``wtp-bert-tiny``, \
            ``wtp-base`` for ``wtp-canine-s-1l``, \
            and ``wtp-large`` for ``wtp-canine-s-12l``.

    :Example:

    Split the text based on *wtp*::

        from pythainlp.tokenize import paragraph_tokenize

        sent = (
            "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต"
            + "  มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด"
            + " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้"
        )

        paragraph_tokenize(sent)
        # output: [
        # ['(1) '],
        # [
        #   'บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต  ',
        #   'มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด ',
        #   'จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ',
        #   'ณ ที่นี้'
        # ]]
    """
    if engine.startswith("wtp"):
        if "-" not in engine:
            size = "mini"
        else:
            size = engine.split("-")[-1]

        from pythainlp.tokenize.wtsplit import tokenize as segment

        segments = segment(
            text,
            size=size,
            tokenize="paragraph",
            paragraph_threshold=paragraph_threshold,
            style=style,
        )
    else:
        raise ValueError(
            f"""Tokenizer \"{engine}\" not found.
            It might be a typo; if not, please consult our documentation."""
        )

    return segments


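# Illustrative note (inferred from the code above, not from the docs): the
# model size is taken from the suffix of the engine name, so "wtp-large"
# selects size "large", while a bare "wtp" falls back to "mini".
#
#     paragraph_tokenize(sent, engine="wtp-large", paragraph_threshold=0.5)
#     # loads the wtp-canine-s-12l model; output shape as in the docstring above

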
def subword_tokenize(
    text: str,
    engine: str = DEFAULT_SUBWORD_TOKENIZE_ENGINE,
    keep_whitespace: bool = True,
) -> List[str]:
    """
    Subword tokenizer for tokenizing text into units smaller than syllables.

    Tokenizes text into inseparable units of
    Thai contiguous characters, namely
    `Thai Character Clusters (TCCs) \
    <https://www.researchgate.net/publication/2853284_Character_Cluster_Based_Thai_Information_Retrieval>`_.
    TCCs are units based on Thai spelling features that cannot be
    split into smaller character units, such as 'ก็', 'จะ', 'ไม่', and 'ฝา'.
    If such units were split apart, they could no longer be spelled out.
    This function applies TCC rules to tokenize the text into
    the smallest units.

    For example, the word 'ขนมชั้น' would be tokenized
    into 'ข', 'น', 'ม', and 'ชั้น'.

    :param str text: text to be tokenized
    :param str engine: the name of the subword tokenizer
    :param bool keep_whitespace: keep whitespace
    :return: list of subwords
    :rtype: List[str]
    **Options for engine**
        * *dict* - newmm word tokenizer with a syllable dictionary
        * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
        * *han_solo* - CRF syllable segmenter for Thai that can work in the \
            Thai social media domain. See `PyThaiNLP/Han-solo \
        <https://github.com/PyThaiNLP/Han-solo>`_.
        * *phayathai* - subword segmenter from ``pythainlp.phayathaibert``
        * *ssg* - CRF syllable segmenter for Thai. See `ponrawee/ssg \
        <https://github.com/ponrawee/ssg>`_.
        * *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000)
        * *tcc_p* - Thai Character Cluster + improved rules that are used in newmm
        * *tltk* - syllable tokenizer from tltk. See `tltk \
        <https://pypi.org/project/tltk/>`_.
        * *wangchanberta* - SentencePiece tokenizer from the wangchanberta model
    :Example:

    Tokenize text into subwords based on *tcc*::

        from pythainlp.tokenize import subword_tokenize

        text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง"
        text_2 = "ความแปลกแยกและพัฒนาการ"

        subword_tokenize(text_1, engine='tcc')
        # output: ['ยุ', 'ค', 'เริ่ม', 'แร', 'ก',
        #   'ข', 'อ', 'ง', ' ', 'รา', 'ช', 'ว', 'ง',
        #   'ศ', '์', 'ห', 'มิ', 'ง']

        subword_tokenize(text_2, engine='tcc')
        # output: ['ค', 'วา', 'ม', 'แป', 'ล', 'ก', 'แย', 'ก',
        #   'และ', 'พัฒ', 'นา', 'กา', 'ร']

    Tokenize text into subwords based on *etcc*::

        text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง"
        text_2 = "ความแปลกแยกและพัฒนาการ"

        subword_tokenize(text_1, engine='etcc')
        # output: ['ยุคเริ่มแรกของ ราชวงศ์หมิง']

        subword_tokenize(text_2, engine='etcc')
        # output: ['ความแปลกแยกและ', 'พัฒ', 'นาการ']

    Tokenize text into subwords based on *wangchanberta*::

        text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง"
        text_2 = "ความแปลกแยกและพัฒนาการ"

        subword_tokenize(text_1, engine='wangchanberta')
        # output: ['▁', 'ยุค', 'เริ่มแรก', 'ของ', '▁', 'ราชวงศ์', 'หมิง']

        subword_tokenize(text_2, engine='wangchanberta')
        # output: ['▁ความ', 'แปลก', 'แยก', 'และ', 'พัฒนาการ']
    """
    if not text or not isinstance(text, str):
        return []

    segments = []

    if engine == "tcc":
        from pythainlp.tokenize.tcc import segment
    elif engine == "tcc_p":
        from pythainlp.tokenize.tcc_p import segment
    elif engine == "etcc":
        from pythainlp.tokenize.etcc import segment
    elif engine == "wangchanberta":
        from pythainlp.wangchanberta import segment
    elif engine == "dict":  # use syllable dictionary
        words = word_tokenize(text)
        for word in words:
            segments.extend(
                word_tokenize(
                    text=word, custom_dict=DEFAULT_SYLLABLE_DICT_TRIE
                )
            )
    elif engine == "ssg":
        from pythainlp.tokenize.ssg import segment
    elif engine == "tltk":
        from pythainlp.tokenize.tltk import syllable_tokenize as segment
    elif engine == "han_solo":
        from pythainlp.tokenize.han_solo import segment
    elif engine == "phayathai":
        from pythainlp.phayathaibert import segment
    else:
        raise ValueError(
            f"""Tokenizer \"{engine}\" not found.
            It might be a typo; if not, please consult our documentation."""
        )

    if not segments:
        segments = segment(text)

    if not keep_whitespace:
        segments = strip_whitespace(segments)

    return segments


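# Illustrative sketch (not from the library docs): the "dict" engine first runs
# word_tokenize() and then re-tokenizes each word against the bundled syllable
# dictionary (DEFAULT_SYLLABLE_DICT_TRIE), yielding a syllable-level split.
# Output is indicative only.
#
#     from pythainlp.tokenize import subword_tokenize
#
#     subword_tokenize("ประเทศไทย", engine="dict")
#     # => e.g. ['ประ', 'เทศ', 'ไทย']

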
def syllable_tokenize(
    text: str,
    engine: str = DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
    keep_whitespace: bool = True,
) -> List[str]:
    """
    Syllable tokenizer.

    Tokenizes text into Thai syllables.

    :param str text: text to be tokenized
    :param str engine: the name of the syllable tokenizer
    :param bool keep_whitespace: keep whitespace
    :return: list of syllables
    :rtype: List[str]
    **Options for engine**
        * *dict* - newmm word tokenizer with a syllable dictionary
        * *han_solo* - CRF syllable segmenter for Thai that can work in the \
            Thai social media domain. See `PyThaiNLP/Han-solo \
        <https://github.com/PyThaiNLP/Han-solo>`_.
        * *ssg* - CRF syllable segmenter for Thai. See `ponrawee/ssg \
        <https://github.com/ponrawee/ssg>`_.
        * *tltk* - syllable tokenizer from tltk. See `tltk \
        <https://pypi.org/project/tltk/>`_.
    """
    if engine not in ["dict", "han_solo", "ssg", "tltk"]:
        raise ValueError(
            f"""Tokenizer \"{engine}\" not found.
            It might be a typo; if not, please consult our documentation."""
        )
    return subword_tokenize(
        text=text, engine=engine, keep_whitespace=keep_whitespace
    )


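# Illustrative usage sketch (not from the library docs): syllable_tokenize()
# simply delegates to subword_tokenize() with a syllable-level engine, so it
# accepts the same keyword arguments. Output is indicative only.
#
#     from pythainlp.tokenize import syllable_tokenize
#
#     syllable_tokenize("รถไฟฟ้า", engine="dict")
#     # => e.g. ['รถ', 'ไฟ', 'ฟ้า']

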
class Tokenizer:
    """
    Tokenizer class for a custom tokenizer.

    This class allows users to pre-define a custom dictionary along with
    a tokenizer and encapsulate them into one single object.
    It is a wrapper around two functions:
    :func:`pythainlp.tokenize.word_tokenize`
    and :func:`pythainlp.util.dict_trie`.

    :Example:

    Tokenizer object instantiated with :class:`pythainlp.util.Trie`::

        from pythainlp.tokenize import Tokenizer
        from pythainlp.corpus.common import thai_words
        from pythainlp.util import dict_trie

        custom_words_list = set(thai_words())
        custom_words_list.add('อะเฟเซีย')
        custom_words_list.add('Aphasia')
        trie = dict_trie(dict_source=custom_words_list)

        text = "อะเฟเซีย (Aphasia) เป็นอาการผิดปกติของการพูด"
        _tokenizer = Tokenizer(custom_dict=trie, engine='newmm')
        _tokenizer.word_tokenize(text)
        # output: ['อะเฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็น', 'อาการ',
        #   'ผิดปกติ', 'ของ', 'การ', 'พูด']

    Tokenizer object instantiated with a list of words::

        text = "อะเฟเซีย (Aphasia) เป็นอาการผิดปกติของการพูด"
        _tokenizer = Tokenizer(custom_dict=list(thai_words()), engine='newmm')
        _tokenizer.word_tokenize(text)
        # output:
        # ['อะ', 'เฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็น', 'อาการ',
        #   'ผิดปกติ', 'ของ', 'การ', 'พูด']

    Tokenizer object instantiated with a file path containing a list of
    words separated with *newline* and explicitly setting a new tokenizer
    after initialization::

        PATH_TO_CUSTOM_DICTIONARY = './custom_dictionary.txt'

        # write a file
        with open(PATH_TO_CUSTOM_DICTIONARY, 'w', encoding='utf-8') as f:
            f.write('อะเฟเซีย\\nAphasia\\nผิด\\nปกติ')

        text = "อะเฟเซีย (Aphasia) เป็นอาการผิดปกติของการพูด"

        # initiate an object from file with `attacut` as tokenizer
        _tokenizer = Tokenizer(custom_dict=PATH_TO_CUSTOM_DICTIONARY, \\
            engine='attacut')

        _tokenizer.word_tokenize(text)
        # output:
        # ['อะเฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็น', 'อาการ', 'ผิด',
        #   'ปกติ', 'ของ', 'การ', 'พูด']

        # change tokenizer to `newmm`
        _tokenizer.set_tokenize_engine(engine='newmm')
        _tokenizer.word_tokenize(text)
        # output:
        # ['อะเฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็นอาการ', 'ผิด',
        #   'ปกติ', 'ของการพูด']
    """

    def __init__(
        self,
        custom_dict: Union[Trie, Iterable[str], str] = [],
        engine: str = "newmm",
        keep_whitespace: bool = True,
        join_broken_num: bool = True,
    ):
        """
        Initialize the tokenizer object.

        :param str custom_dict: a file path, a list of vocabulary words to be
                    used to create a trie, or an instantiated
                    :class:`pythainlp.util.Trie` object.
        :param str engine: choose between different options of tokenizer engines
                           (i.e. *newmm*, *mm*, *longest*, *deepcut*)
        :param bool keep_whitespace: True to keep whitespace, a common mark
                                    for end of phrase in Thai
        :param bool join_broken_num: True to rejoin formatted numerics that
                                    could be wrongly separated
        """
        self.__trie_dict = Trie([])
        if custom_dict:
            self.__trie_dict = dict_trie(custom_dict)
        else:
            self.__trie_dict = DEFAULT_WORD_DICT_TRIE
        self.__engine = engine
        if self.__engine not in ["newmm", "mm", "longest", "deepcut"]:
            raise NotImplementedError(
                """
                The Tokenizer class does not support %s for a custom tokenizer
                """
                % self.__engine
            )
        self.__keep_whitespace = keep_whitespace
        self.__join_broken_num = join_broken_num

    def word_tokenize(self, text: str) -> List[str]:
        """
        Main tokenization function.

        :param str text: text to be tokenized
        :return: list of words, tokenized from the text
        :rtype: list[str]
        """
        return word_tokenize(
            text,
            custom_dict=self.__trie_dict,
            engine=self.__engine,
            keep_whitespace=self.__keep_whitespace,
            join_broken_num=self.__join_broken_num,
        )

    def set_tokenize_engine(self, engine: str) -> None:
        """
        Set the tokenizer's engine.

        :param str engine: choose between different options of tokenizer engines
                           (i.e. *newmm*, *mm*, *longest*, *deepcut*)
        """
        self.__engine = engine