11626163864

Committed 01 Nov 2024 07:49AM UTC coverage: 14.17% (+14.2%) from 0.0%

Build # 11626163864

Build Type

Pull #952

github

Committed by

web-flow

Commit Message

Merge 8f2551bc9 into 89ea62ebc

Pull Request #952: Specify a limited test suite

Run Details

44 of 80 new or added lines in 48 files covered. (55.0%)

1048 of 7396 relevant lines covered (14.17%)

0.14 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/pythainlp/tokenize/crfcls.py

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Clause segmenter
"""
from typing import List

import pycrfsuite

from pythainlp.corpus import path_pythainlp_corpus
from pythainlp.tag import pos_tag


def _doc2features(doc, i):
    """
    Build the feature dict for the item at position ``i`` of ``doc``.

    Each item of ``doc`` is indexed as ``item[0]`` (the word) and
    ``item[1]`` (its part-of-speech tag). The returned dict describes the
    current word, then the previous word (or a ``"BOS"`` flag when there is
    none), then the next word (or an ``"EOS"`` flag when there is none).

    :param doc: sequence of (word, POS tag) items
    :param i: index of the item to describe
    :return: mapping of feature name to feature value
    """
    token = doc[i]
    word = token[0]
    pos = token[1]
    features = {
        "word.curr_word": word,
        "word.curr_isspace": word.isspace(),
        "word.curr_isdigit": word.isdigit(),
        "word.curr_postag": pos,
    }

    # Left context: flag the beginning of the sequence, otherwise describe
    # the preceding item.
    if i <= 0:
        features["BOS"] = True
    else:
        left = doc[i - 1]
        left_word = left[0]
        left_pos = left[1]
        features.update(
            {
                "word.prev_word": left_word,
                "word.prev_isspace": left_word.isspace(),
                "word.prev_isdigit": left_word.isdigit(),
                "word.prev_postag": left_pos,
            }
        )

    # Right context: flag the end of the sequence, otherwise describe the
    # following item.
    if i + 1 >= len(doc):
        features["EOS"] = True
    else:
        right = doc[i + 1]
        right_word = right[0]
        right_pos = right[1]
        features.update(
            {
                "word.next_word": right_word,
                "word.next_isspace": right_word.isspace(),
                "word.next_isdigit": right_word.isdigit(),
                "word.next_postag": right_pos,
            }
        )

    return features


def _extract_features(doc):
    """
    Build one feature dict per position of ``doc``, in order.

    :param doc: sequence of (word, POS tag) items
    :return: list with the result of ``_doc2features`` for every index
    """
    rows = []
    for position in range(len(doc)):
        rows.append(_doc2features(doc, position))
    return rows


# File name of the CRF clause-segmentation model; it is passed to
# path_pythainlp_corpus(), which presumably resolves it to a path inside the
# bundled PyThaiNLP corpus directory (TODO confirm against pythainlp.corpus).
_CORPUS_NAME = "blackboard-cls_v1.0.crfsuite"
# Module-level tagger used by segment(). It is created and the model is
# opened when this module is imported, not on the first call.
tagger = pycrfsuite.Tagger()
tagger.open(path_pythainlp_corpus(_CORPUS_NAME))


def segment(doc: List[str]) -> List[List[str]]:
    """
    Split a list of words into clauses.

    The words are POS-tagged with ``pos_tag(doc, corpus="blackboard")``,
    turned into feature dicts, and labelled by the module-level CRF
    ``tagger``. A word labelled ``"E_CLS"`` closes the current clause; any
    words left over after the last such label form the final clause.

    :param doc: list of words (an already tokenized text)
    :return: list of clauses, each clause being a list of words in their
        original order; an empty list for empty input
    """
    # Nothing to segment: skip the POS tagger and the CRF model entirely.
    if not doc:
        return []

    word_tags = pos_tag(doc, corpus="blackboard")
    markers = tagger.tag(_extract_features(word_tags))

    clauses = []
    current = []
    for word, marker in zip(doc, markers):
        current.append(word)
        if marker == "E_CLS":
            clauses.append(current)
            current = []

    # Flush the trailing words that were not closed by an "E_CLS" label.
    # Doing this after the loop (instead of comparing the index with
    # len(doc) - 1) keeps them even if fewer labels than words came back.
    if current:
        clauses.append(current)

    return clauses

1	# -- coding: utf-8 --
2	# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3	# SPDX-License-Identifier: Apache-2.0
4	"""
5	Clause segmenter
6	"""
7	from typing import List	×
8
9	import pycrfsuite	×
10
11	from pythainlp.corpus import path_pythainlp_corpus	×
NEW 12	from pythainlp.tag import pos_tag	×
13
14
15	def _doc2features(doc, i):	×
16	# features from current word
17	curr_word = doc[i][0]	×
18	curr_pos = doc[i][1]	×
19	features = {	×
20	"word.curr_word": curr_word,
21	"word.curr_isspace": curr_word.isspace(),
22	"word.curr_isdigit": curr_word.isdigit(),
23	"word.curr_postag": curr_pos,
24	}
25
26	# features from previous word
27	if i > 0:	×
28	prev_word = doc[i - 1][0]	×
29	prev_pos = doc[i - 1][1]	×
30	features["word.prev_word"] = prev_word	×
31	features["word.prev_isspace"] = prev_word.isspace()	×
32	features["word.prev_isdigit"] = prev_word.isdigit()	×
33	features["word.prev_postag"] = prev_pos	×
34	else:
35	features["BOS"] = True # Beginning of Sequence	×
36
37	# features from next word
38	if i < len(doc) - 1:	×
39	next_word = doc[i + 1][0]	×
40	next_pos = doc[i + 1][1]	×
41	features["word.next_word"] = next_word	×
42	features["word.next_isspace"] = next_word.isspace()	×
43	features["word.next_isdigit"] = next_word.isdigit()	×
44	features["word.next_postag"] = next_pos	×
45	else:
46	features["EOS"] = True # End of Sequence	×
47
48	return features	×
49
50
51	def _extract_features(doc):	×
52	return [_doc2features(doc, i) for i in range(len(doc))]	×
53
54
55	_CORPUS_NAME = "blackboard-cls_v1.0.crfsuite"	×
56	tagger = pycrfsuite.Tagger()	×
57	tagger.open(path_pythainlp_corpus(_CORPUS_NAME))	×
58
59
60	def segment(doc: List[str]) -> List[List[str]]:	×
61	word_tags = pos_tag(doc, corpus="blackboard")	×
62	features = _extract_features(word_tags)	×
63	word_markers = list(zip(doc, tagger.tag(features)))	×
64
65	clauses = []	×
66	temp = []	×
67	len_doc = len(doc) - 1	×
68	for i, word_marker in enumerate(word_markers):	×
69	word, marker = word_marker	×
70	if marker == "E_CLS" or i == len_doc:	×
71	temp.append(word)	×
72	clauses.append(temp)	×
73	temp = []	×
74	else:
75	temp.append(word)	×
76
77	return clauses	×

PyThaiNLP / pythainlp / 11626163864

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous