• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

PyThaiNLP / pythainlp / 11626163864

01 Nov 2024 07:49AM UTC coverage: 14.17% (+14.2%) from 0.0%
11626163864

Pull #952

github

web-flow
Merge 8f2551bc9 into 89ea62ebc
Pull Request #952: Specify a limited test suite

44 of 80 new or added lines in 48 files covered. (55.0%)

1048 of 7396 relevant lines covered (14.17%)

0.14 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/pythainlp/tokenize/crfcls.py
1
# -*- coding: utf-8 -*-
2
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3
# SPDX-License-Identifier: Apache-2.0
4
"""
5
Clause segmenter
6
"""
7
from typing import List
×
8

9
import pycrfsuite
×
10

11
from pythainlp.corpus import path_pythainlp_corpus
×
NEW
12
from pythainlp.tag import pos_tag
×
13

14

15
def _doc2features(doc, i):
×
16
    # features from current word
17
    curr_word = doc[i][0]
×
18
    curr_pos = doc[i][1]
×
19
    features = {
×
20
        "word.curr_word": curr_word,
21
        "word.curr_isspace": curr_word.isspace(),
22
        "word.curr_isdigit": curr_word.isdigit(),
23
        "word.curr_postag": curr_pos,
24
    }
25

26
    # features from previous word
27
    if i > 0:
×
28
        prev_word = doc[i - 1][0]
×
29
        prev_pos = doc[i - 1][1]
×
30
        features["word.prev_word"] = prev_word
×
31
        features["word.prev_isspace"] = prev_word.isspace()
×
32
        features["word.prev_isdigit"] = prev_word.isdigit()
×
33
        features["word.prev_postag"] = prev_pos
×
34
    else:
35
        features["BOS"] = True  # Beginning of Sequence
×
36

37
    # features from next word
38
    if i < len(doc) - 1:
×
39
        next_word = doc[i + 1][0]
×
40
        next_pos = doc[i + 1][1]
×
41
        features["word.next_word"] = next_word
×
42
        features["word.next_isspace"] = next_word.isspace()
×
43
        features["word.next_isdigit"] = next_word.isdigit()
×
44
        features["word.next_postag"] = next_pos
×
45
    else:
46
        features["EOS"] = True  # End of Sequence
×
47

48
    return features
×
49

50

51
def _extract_features(doc):
    """
    Build one feature dictionary per item of ``doc``.

    :param doc: sequence of (word, POS tag) items
    :return: list with the result of ``_doc2features`` for every
        position of ``doc``, in order
    :rtype: list
    """
    all_features = []
    for position in range(len(doc)):
        all_features.append(_doc2features(doc, position))
    return all_features
×
53

54

55
# File name of the CRFsuite model used by the clause segmenter; it is
# resolved to a path through path_pythainlp_corpus().
_CORPUS_NAME = "blackboard-cls_v1.0.crfsuite"
# Module-level tagger: created and opened once, when this module is
# imported, and then used by segment().
tagger = pycrfsuite.Tagger()
tagger.open(path_pythainlp_corpus(_CORPUS_NAME))
×
58

59

60
def segment(doc: List[str]) -> List[List[str]]:
    """
    Split a list of words into clauses.

    The words are POS-tagged with the "blackboard" corpus, turned into
    features, and labelled by the module-level CRF tagger. A clause is
    closed after every word labelled ``"E_CLS"``; the word at the last
    index of ``doc`` always closes the current clause, whatever its
    label.

    :param List[str] doc: list of words
    :return: list of clauses, each clause being a list of words
    :rtype: List[List[str]]
    """
    tagged_words = pos_tag(doc, corpus="blackboard")
    labels = tagger.tag(_extract_features(tagged_words))

    last_index = len(doc) - 1
    clauses = []
    current = []
    for idx, (token, label) in enumerate(zip(doc, labels)):
        # Every word belongs to the clause being built; only the
        # decision to close that clause depends on the label/position.
        current.append(token)
        if label == "E_CLS" or idx == last_index:
            clauses.append(current)
            current = []

    return clauses
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc