23550040738

Committed 25 Mar 2026 03:45PM UTC coverage: 66.544%. First build

Build # 23550040738

Build Type

Pull #1369

github

Committed by

web-flow

Commit Message

Merge 1d31799f8 into 9836c966e

Pull Request #1369: fix: replace os.path.join with safe_path_join to prevent path manipulation (CWE-22)

Coverage Stats

23 of 44 new or added lines in 8 files covered. (52.27%)

6494 of 9759 relevant lines covered (66.54%)

0.67 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/pythainlp/parse/transformers_ud.py

"""TransformersUD

Author: Prof. Koichi Yasuoka

This tagger is provided under the terms of the apache-2.0 License.

The source: https://huggingface.co/KoichiYasuoka/deberta-base-thai-ud-head

GitHub: https://github.com/KoichiYasuoka
"""

from __future__ import annotations

import os
from typing import TYPE_CHECKING, Optional, Union

if TYPE_CHECKING:
    from transformers import (  # noqa: F401
        AutoModelForQuestionAnswering,
        AutoTokenizer,
        TokenClassificationPipeline,
    )

from pythainlp.tools.path import safe_path_join


class Parse:
    """Dependency parser backed by Hugging Face Transformers models.

    Three models share one tokenizer:

    * ``self.model`` -- a question-answering model whose start/end logits
      are summed to score every (dependent, head candidate) pair;
    * ``self.deprel`` -- a token-classification pipeline (aggregated with
      the ``"simple"`` strategy) that yields the word spans and their
      relation labels;
    * ``self.tagger`` -- a token-classification pipeline whose
      ``"|"``-separated labels fill the 4th and 6th output columns
      (presumably UPOS and FEATS -- confirm against the model card).

    The head of every word is decoded from the score matrix with the
    Chu-Liu-Edmonds algorithm.
    """

    def __init__(
        self, model: Optional[str] = "KoichiYasuoka/deberta-base-thai-ud-head"
    ) -> None:
        """Load the tokenizer, the head-scoring model and both taggers.

        :param model: a local model directory or a model identifier
            resolved through ``transformers``. ``None`` selects the
            default ``"KoichiYasuoka/deberta-base-thai-ud-head"``.
        """
        from transformers import (
            AutoConfig,
            AutoModelForQuestionAnswering,
            AutoModelForTokenClassification,
            AutoTokenizer,
            TokenClassificationPipeline,
        )
        from transformers.utils import cached_file

        if model is None:
            model = "KoichiYasuoka/deberta-base-thai-ud-head"
        self.tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(model)
        self.model: AutoModelForQuestionAnswering = (
            AutoModelForQuestionAnswering.from_pretrained(model)
        )
        load_classifier = AutoModelForTokenClassification.from_pretrained
        if os.path.isdir(model):
            # Local checkout: the sub-models live in sub-directories.
            deprel_model = load_classifier(safe_path_join(model, "deprel"))
            tagger_model = load_classifier(safe_path_join(model, "tagger"))
        else:
            # Otherwise resolve each sub-model's config and weights
            # individually through ``cached_file``.
            deprel_config = AutoConfig.from_pretrained(
                cached_file(model, "deprel/config.json")
            )
            deprel_model = load_classifier(
                cached_file(model, "deprel/pytorch_model.bin"),
                config=deprel_config,
            )
            tagger_config = AutoConfig.from_pretrained(
                cached_file(model, "tagger/config.json")
            )
            tagger_model = load_classifier(
                cached_file(model, "tagger/pytorch_model.bin"),
                config=tagger_config,
            )
        self.deprel: TokenClassificationPipeline = TokenClassificationPipeline(
            model=deprel_model,
            tokenizer=self.tokenizer,
            aggregation_strategy="simple",
        )
        self.tagger: TokenClassificationPipeline = TokenClassificationPipeline(
            model=tagger_model, tokenizer=self.tokenizer
        )

    @staticmethod
    def _conllu_rows(
        spans: list[tuple[int, int, str]],
        words: list[str],
        feats: dict[int, list[str]],
        heads: list[int],
    ) -> list[list[str]]:
        """Build one 10-column row per word from the decoded parse."""
        total = len(spans)
        rows = []
        for idx, (start, end, relation) in enumerate(spans, 1):
            # A word attached to node 0 is the root; a word labelled
            # "root" that was attached elsewhere is demoted to "dep".
            if heads[idx] == 0:
                relation = "root"
            elif relation == "root":
                relation = "dep"
            tags = feats[start]
            # ``spans[idx]`` is the *next* word (idx is 1-based).
            gap_after = idx < total and end < spans[idx][0]
            rows.append(
                [
                    str(idx),
                    words[idx - 1],
                    "_",
                    tags[0][2:],  # drop the label's two-character prefix
                    "_",
                    "|".join(tags[1:]),
                    str(heads[idx]),
                    relation,
                    "_",
                    "_" if gap_after else "SpaceAfter=No",
                ]
            )
        return rows

    def __call__(
        self, text: str, tag: str = "str"
    ) -> Union[list[list[str]], str]:
        """Parse ``text`` and return its dependency analysis.

        :param str text: the text to parse.
        :param str tag: ``"list"`` returns the rows as a list of lists of
            strings; any other value returns the rows as tab-separated
            lines followed by one blank line.
        :return: one 10-column row per word (index, form, ``_``, tag,
            ``_``, features, head index, relation, ``_``, spacing note).
            When no word is found the result is ``[]`` for ``"list"`` and
            ``"\\n"`` otherwise.
        """
        spans = [
            (t["start"], t["end"], t["entity_group"])
            for t in self.deprel(text)
        ]
        if not spans:
            # Nothing to parse: return an empty analysis instead of
            # feeding empty batches to the tokenizer and the model.
            return [] if tag == "list" else "\n"

        # Heavy dependencies are imported only once there is work to do.
        from itertools import accumulate, chain

        import numpy
        import torch
        import ufal.chu_liu_edmonds

        total = len(spans)
        feats = {
            t["start"]: t["entity"].split("|") for t in self.tagger(text)
        }
        words = [text[start:end] for start, end, _ in spans]
        # scores[dependent, head]; row/column 0 stands for the root node.
        scores = numpy.full((total + 1, total + 1), numpy.nan)
        word_ids = self.tokenizer(words, add_special_tokens=False)[
            "input_ids"
        ]
        cls_id = self.tokenizer.cls_token_id
        sep_id = self.tokenizer.sep_token_id
        mask_id = self.tokenizer.mask_token_id

        # One sequence per word: [CLS] word [SEP] followed by the whole
        # sentence with that word replaced by [MASK], then [SEP].
        batches = []
        for i, ids in enumerate(word_ids):
            query = [cls_id] + ids + [sep_id]
            batches.append(
                [query]
                + word_ids[:i]
                + [[mask_id]]
                + word_ids[i + 1 :]
                + [[sep_id]]
            )
        # ends[i][j]: number of token ids in the first j + 1 segments.
        ends = [
            list(accumulate(len(segment) for segment in segments))
            for segments in batches
        ]
        with torch.no_grad():
            output = self.model(
                input_ids=torch.tensor(
                    [list(chain.from_iterable(s)) for s in batches]
                ),
                token_type_ids=torch.tensor(
                    [[0] * e[0] + [1] * (e[-1] - e[0]) for e in ends]
                ),
            )
        start_logits = output.start_logits.tolist()
        end_logits = output.end_logits.tolist()
        for i in range(total):
            for j in range(total):
                # The masked position (i == j) is scored as the root.
                column = 0 if i == j else j + 1
                scores[i + 1, column] = (
                    start_logits[i][ends[i][j]]
                    + end_logits[i][ends[i][j + 1] - 1]
                )
        heads = ufal.chu_liu_edmonds.chu_liu_edmonds(scores)[0]
        if sum(1 for head in heads if head == 0) != 1:
            # Not exactly one root: keep a single root candidate (the
            # first word labelled "root", else the best root score) and
            # decode again.
            first_root = ([p for _, _, p in spans] + ["root"]).index("root")
            if first_root < total:
                keep = first_root + 1
            else:
                keep = int(numpy.nanargmax(scores[:, 0]))
            scores[0:keep, 0] = scores[keep + 1 :, 0] = numpy.nan
            heads = ufal.chu_liu_edmonds.chu_liu_edmonds(scores)[0]

        rows = self._conllu_rows(spans, words, feats, heads)
        if tag == "list":
            return rows
        return "".join("\t".join(row) + "\n" for row in rows) + "\n"

1	"""TransformersUD
2
3	Author: Prof. Koichi Yasuoka
4
5	This tagger is provided under the terms of the apache-2.0 License.
6
7	The source: https://huggingface.co/KoichiYasuoka/deberta-base-thai-ud-head
8
9	GitHub: https://github.com/KoichiYasuoka
10	"""
11
12	from __future__ import annotations	×
13
14	import os	×
15	from typing import TYPE_CHECKING, Optional, Union	×
16
17	if TYPE_CHECKING:
18	from transformers import ( # noqa: F401
19	AutoModelForQuestionAnswering,
20	AutoTokenizer,
21	TokenClassificationPipeline,
22	)
23
NEW 24	from pythainlp.tools.path import safe_path_join	×
25
26
27	class Parse:	×
28	def __init__(	×
29	self, model: Optional[str] = "KoichiYasuoka/deberta-base-thai-ud-head"
30	) -> None:
31	from transformers import (	×
32	AutoConfig,
33	AutoModelForQuestionAnswering,
34	AutoModelForTokenClassification,
35	AutoTokenizer,
36	TokenClassificationPipeline,
37	)
38	from transformers.utils import cached_file	×
39
40	if model is None:	×
41	model = "KoichiYasuoka/deberta-base-thai-ud-head"	×
42	self.tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(model)	×
43	self.model: AutoModelForQuestionAnswering = (	×
44	AutoModelForQuestionAnswering.from_pretrained(model)
45	)
46	x = AutoModelForTokenClassification.from_pretrained	×
47	if os.path.isdir(model):	×
48	d, t = (	×
49	x(safe_path_join(model, "deprel")),
50	x(safe_path_join(model, "tagger")),
51	)
52	else:
53	c = AutoConfig.from_pretrained(	×
54	cached_file(model, "deprel/config.json")
55	)
56	d = x(cached_file(model, "deprel/pytorch_model.bin"), config=c)	×
57	s = AutoConfig.from_pretrained(	×
58	cached_file(model, "tagger/config.json")
59	)
60	t = x(cached_file(model, "tagger/pytorch_model.bin"), config=s)	×
61	self.deprel: TokenClassificationPipeline = TokenClassificationPipeline(	×
62	model=d, tokenizer=self.tokenizer, aggregation_strategy="simple"
63	)
64	self.tagger: TokenClassificationPipeline = TokenClassificationPipeline(	×
65	model=t, tokenizer=self.tokenizer
66	)
67
68	def __call__(	×
69	self, text: str, tag: str = "str"
70	) -> Union[list[list[str]], str]:
71	import numpy	×
72	import torch	×
73	import ufal.chu_liu_edmonds	×
74
75	w = [	×
76	(t["start"], t["end"], t["entity_group"])
77	for t in self.deprel(text)
78	]
79	z, n = (	×
80	{t["start"]: t["entity"].split("|") for t in self.tagger(text)},
81	len(w),
82	)
83	r, m = (	×
84	[text[s:e] for s, e, p in w],
85	numpy.full((n + 1, n + 1), numpy.nan),
86	)
87	v, c = self.tokenizer(r, add_special_tokens=False)["input_ids"], []	×
88	for i, t in enumerate(v):	×
89	q = (	×
90	[self.tokenizer.cls_token_id]
91	+ t
92	+ [self.tokenizer.sep_token_id]
93	)
94	c.append(	×
95	[q]
96	+ v[0:i]
97	+ [[self.tokenizer.mask_token_id]]
98	+ v[i + 1 :]
99	+ [[q[-1]]]
100	)
101	b = [[len(sum(x[0 : j + 1], [])) for j in range(len(x))] for x in c]	×
102	with torch.no_grad():	×
103	d = self.model(	×
104	input_ids=torch.tensor([sum(x, []) for x in c]),
105	token_type_ids=torch.tensor(
106	[[0] * x[0] + [1] * (x[-1] - x[0]) for x in b]
107	),
108	)
109	s, e = d.start_logits.tolist(), d.end_logits.tolist()	×
110	for i in range(n):	×
111	for j in range(n):	×
112	m[i + 1, 0 if i == j else j + 1] = (	×
113	s[i][b[i][j]] + e[i][b[i][j + 1] - 1]
114	)
115	h = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]	×
116	if [0 for i in h if i == 0] != [0]:	×
117	i = ([p for s, e, p in w] + ["root"]).index("root")	×
118	j = i + 1 if i < n else int(numpy.nanargmax(m[:, 0]))	×
119	m[0:j, 0] = m[j + 1 :, 0] = numpy.nan	×
120	h = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]	×
121	u = ""	×
122	if tag == "list":	×
123	_tag_data = []	×
124	for i, (s, e, p) in enumerate(w, 1):	×
125	p = "root" if h[i] == 0 else "dep" if p == "root" else p	×
126	_tag_data.append(	×
127	[
128	str(i),
129	r[i - 1],
130	"_",
131	z[s][0][2:],
132	"_",
133	"|".join(z[s][1:]),
134	str(h[i]),
135	p,
136	"_",
137	"_" if i < n and e < w[i][0] else "SpaceAfter=No",
138	]
139	)
140	return _tag_data	×
141	for i, (s, e, p) in enumerate(w, 1):	×
142	p = "root" if h[i] == 0 else "dep" if p == "root" else p	×
143	u += (	×
144	"\t".join(
145	[
146	str(i),
147	r[i - 1],
148	"_",
149	z[s][0][2:],
150	"_",
151	"|".join(z[s][1:]),
152	str(h[i]),
153	p,
154	"_",
155	"_" if i < n and e < w[i][0] else "SpaceAfter=No",
156	]
157	)
158	+ "\n"
159	)
160	return u + "\n"	×

PyThaiNLP / pythainlp / 23550040738

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous