• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

PyThaiNLP / pythainlp / 23550040738

25 Mar 2026 03:45PM UTC coverage: 66.544%. First build
23550040738

Pull #1369

github

web-flow
Merge 1d31799f8 into 9836c966e
Pull Request #1369: fix: replace os.path.join with safe_path_join to prevent path manipulation (CWE-22)

23 of 44 new or added lines in 8 files covered. (52.27%)

6494 of 9759 relevant lines covered (66.54%)

0.67 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/pythainlp/parse/transformers_ud.py
1
"""TransformersUD
2

3
Author: Prof. Koichi Yasuoka
4

5
This tagger is provided under the terms of the apache-2.0 License.
6

7
The source: https://huggingface.co/KoichiYasuoka/deberta-base-thai-ud-head
8

9
GitHub: https://github.com/KoichiYasuoka
10
"""
11

12
from __future__ import annotations
×
13

14
import os
×
15
from typing import TYPE_CHECKING, Optional, Union
×
16

17
if TYPE_CHECKING:
18
    from transformers import (  # noqa: F401
19
        AutoModelForQuestionAnswering,
20
        AutoTokenizer,
21
        TokenClassificationPipeline,
22
    )
23

NEW
24
from pythainlp.tools.path import safe_path_join
×
25

26

27
class Parse:
    """Dependency parser built on a question-answering head model.

    Three models are loaded from the same model name or local directory:
    a question-answering model used to score candidate heads, and two
    token-classification models ("deprel" and "tagger") wrapped in
    ``TokenClassificationPipeline`` objects.
    """

    def __init__(
        self, model: Optional[str] = "KoichiYasuoka/deberta-base-thai-ud-head"
    ) -> None:
        """Load the tokenizer, the head model and the two tagging pipelines.

        :param model: Hugging Face model name or path to a local model
            directory. ``None`` falls back to the default model name.
        """
        from transformers import (
            AutoConfig,
            AutoModelForQuestionAnswering,
            AutoModelForTokenClassification,
            AutoTokenizer,
            TokenClassificationPipeline,
        )
        from transformers.utils import cached_file

        if model is None:
            model = "KoichiYasuoka/deberta-base-thai-ud-head"
        self.tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(model)
        self.model: AutoModelForQuestionAnswering = (
            AutoModelForQuestionAnswering.from_pretrained(model)
        )
        load_classifier = AutoModelForTokenClassification.from_pretrained
        if os.path.isdir(model):
            # Local checkout: the sub-models live in sub-directories.
            deprel_model = load_classifier(safe_path_join(model, "deprel"))
            tagger_model = load_classifier(safe_path_join(model, "tagger"))
        else:
            # Remote model: resolve each sub-model's config and weights
            # through the transformers file cache.
            deprel_config = AutoConfig.from_pretrained(
                cached_file(model, "deprel/config.json")
            )
            deprel_model = load_classifier(
                cached_file(model, "deprel/pytorch_model.bin"),
                config=deprel_config,
            )
            tagger_config = AutoConfig.from_pretrained(
                cached_file(model, "tagger/config.json")
            )
            tagger_model = load_classifier(
                cached_file(model, "tagger/pytorch_model.bin"),
                config=tagger_config,
            )
        self.deprel: TokenClassificationPipeline = TokenClassificationPipeline(
            model=deprel_model,
            tokenizer=self.tokenizer,
            aggregation_strategy="simple",
        )
        self.tagger: TokenClassificationPipeline = TokenClassificationPipeline(
            model=tagger_model, tokenizer=self.tokenizer
        )

    def __call__(
        self, text: str, tag: str = "str"
    ) -> Union[list[list[str]], str]:
        """Parse *text* and return one ten-column row per token.

        Each row holds, in order: 1-based token index, surface form, ``_``,
        the first tagger label part with its first two characters removed,
        ``_``, the remaining tagger label parts joined by ``|``, the head
        index, the relation label, ``_`` and either ``_`` or
        ``SpaceAfter=No``.

        :param text: text to parse.
        :param tag: ``"list"`` returns the rows as a list of lists; any
            other value returns them as tab-separated lines followed by
            one blank line.
        :return: list of rows, or the rows rendered as a single string.
            When the deprel pipeline yields no tokens the result is ``[]``
            or ``"\\n"`` respectively.
        """
        spans = [
            (t["start"], t["end"], t["entity_group"])
            for t in self.deprel(text)
        ]
        n = len(spans)
        if n == 0:
            # Nothing to parse: return the empty result instead of feeding
            # an empty batch to the model.
            return [] if tag == "list" else "\n"

        from itertools import accumulate, chain

        import numpy
        import torch
        import ufal.chu_liu_edmonds

        # Tagger labels keyed by the token's start offset in `text`.
        labels = {
            t["start"]: t["entity"].split("|") for t in self.tagger(text)
        }
        words = [text[start:end] for start, end, _ in spans]
        # scores[i + 1, k] scores k as the head of token i; column 0 is root.
        scores = numpy.full((n + 1, n + 1), numpy.nan)

        word_ids = self.tokenizer(words, add_special_tokens=False)["input_ids"]
        cls_id = self.tokenizer.cls_token_id
        sep_id = self.tokenizer.sep_token_id
        mask_id = self.tokenizer.mask_token_id

        # One input per token: [CLS] token [SEP], followed by the whole
        # sentence with that token replaced by [MASK], followed by [SEP].
        batches = []
        for i, ids in enumerate(word_ids):
            query = [cls_id] + ids + [sep_id]
            batches.append(
                [query]
                + word_ids[0:i]
                + [[mask_id]]
                + word_ids[i + 1 :]
                + [[sep_id]]
            )
        # Cumulative segment lengths: bounds[i][j] is the end offset of
        # segment j (and therefore the start offset of segment j + 1).
        bounds = [list(accumulate(map(len, segs))) for segs in batches]

        with torch.no_grad():
            output = self.model(
                input_ids=torch.tensor(
                    [list(chain.from_iterable(segs)) for segs in batches]
                ),
                token_type_ids=torch.tensor(
                    [
                        [0] * ends[0] + [1] * (ends[-1] - ends[0])
                        for ends in bounds
                    ]
                ),
            )
        start_logits = output.start_logits.tolist()
        end_logits = output.end_logits.tolist()
        for i in range(n):
            for j in range(n):
                # A token selecting its own (masked) slot means "root".
                scores[i + 1, 0 if i == j else j + 1] = (
                    start_logits[i][bounds[i][j]]
                    + end_logits[i][bounds[i][j + 1] - 1]
                )

        heads = ufal.chu_liu_edmonds.chu_liu_edmonds(scores)[0]
        if sum(1 for head in heads if head == 0) != 1:
            # Not exactly one root: keep a single root candidate (the first
            # token labelled "root", else the best-scoring one) and re-run.
            first_root = ([rel for _, _, rel in spans] + ["root"]).index(
                "root"
            )
            keep = (
                first_root + 1
                if first_root < n
                else int(numpy.nanargmax(scores[:, 0]))
            )
            scores[0:keep, 0] = scores[keep + 1 :, 0] = numpy.nan
            heads = ufal.chu_liu_edmonds.chu_liu_edmonds(scores)[0]

        rows = []
        for i, (start, end, rel) in enumerate(spans, 1):
            if heads[i] == 0:
                rel = "root"
            elif rel == "root":
                # Labelled root but attached to another token.
                rel = "dep"
            # "_" only when a gap separates this token from the next one.
            misc = "_" if i < n and end < spans[i][0] else "SpaceAfter=No"
            rows.append(
                [
                    str(i),
                    words[i - 1],
                    "_",
                    labels[start][0][2:],
                    "_",
                    "|".join(labels[start][1:]),
                    str(heads[i]),
                    rel,
                    "_",
                    misc,
                ]
            )

        if tag == "list":
            return rows
        return "".join("\t".join(row) + "\n" for row in rows) + "\n"
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc