11625814262

Committed 01 Nov 2024 07:14AM UTC coverage: 20.782% (+20.8%) from 0.0%

Build # 11625814262

Build Type

Pull #952

github

Committed by

web-flow

Commit Message

Merge c8385dcae into 515fe7ced

Pull Request Pull Request #952: Specify a limited test suite

Run Details

45 of 80 new or added lines in 48 files covered. (56.25%)

1537 of 7396 relevant lines covered (20.78%)

0.21 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/pythainlp/soundex/sound.py

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
from typing import List

import panphon
import panphon.distance

from pythainlp.tokenize import word_tokenize
from pythainlp.transliterate import pronunciate, transliterate

# Module-level panphon feature table, created once at import time.
# audio_vector() uses it to turn an IPA string into feature vectors.
_ft = panphon.FeatureTable()
# Module-level panphon distance helper, created once at import time.
# word_approximation() uses it for the weighted feature edit distance.
_dst = panphon.distance.Distance()

def _clean_ipa(ipa: str) -> str:
    """
    Clean IPA by removing tones and space between phonetic codes

    :param str ipa: IPA text
    :return: IPA with tones removed from the text
    :rtype: str
    """
    return ipa.replace("˩˩˦","").replace("˥˩","").replace("˨˩","").replace("˦˥","").replace("˧","").replace("˧","").replace(" .",".").replace(". ",".").strip()

def word2audio(word: str) -> str:
    """
    Convert word to IPA

    The input is split with ``word_tokenize``, each token is passed to
    ``pronunciate`` (engine ``w2p``), the result is transliterated with
    the ``thaig2p`` engine, tone marks are stripped by ``_clean_ipa``,
    and the per-token strings are joined with ``"."``.

    :param str word: Thai word
    :return: IPA with tones removed from the text
    :rtype: str

    :Example:
    ::

        from pythainlp.soundex.sound import word2audio

        word2audio("น้ำ")
        # output : 'n aː m .'
    """
    # Pronounce every token first, then transliterate, as two passes.
    pronunciations = [
        pronunciate(token, engine="w2p") for token in word_tokenize(word)
    ]
    return ".".join(
        _clean_ipa(transliterate(spoken, engine="thaig2p"))
        for spoken in pronunciations
    )

def audio_vector(word: str) -> List[List[int]]:
    """
    Convert audio to vector list

    The word is first converted to tone-less IPA with ``word2audio``;
    that IPA string is then handed to the panphon feature table's
    ``word_to_vector_list`` with ``numeric=True``.

    :param str word: Thai word
    :return: List of features from panphon
    :rtype: List[List[int]]

    :Example:
    ::

        from pythainlp.soundex.sound import audio_vector

        audio_vector("น้ำ")
        # output : [[-1, 1, 1, -1, -1, -1, ...]]
    """
    ipa_text = word2audio(word)
    return _ft.word_to_vector_list(ipa_text, numeric=True)

def word_approximation(word: str, list_word: List[str]) -> List[float]:
    """
    Thai Word Approximation

    Both ``word`` and every entry of ``list_word`` are converted to
    tone-less IPA with ``word2audio``; the panphon weighted feature edit
    distance is then computed between ``word`` and each candidate.

    :param str word: Thai word to compare against
    :param List[str] list_word: list of Thai words to compare with ``word``
    :return: List of distances, one per entry of ``list_word`` and in the
        same order (The smaller the value, the closer)
    :rtype: List[float]

    :Example:
    ::

        from pythainlp.soundex.sound import word_approximation

        word_approximation("รถ", ["รด", "รส", "รม", "น้ำ"])
        # output : [0.0, 0.0, 3.875, 8.375]
    """
    # Convert the reference word once; it is reused for every comparison.
    target_ipa = word2audio(word)
    candidate_ipas = [word2audio(candidate) for candidate in list_word]
    return [
        _dst.weighted_feature_edit_distance(target_ipa, candidate_ipa)
        for candidate_ipa in candidate_ipas
    ]

1	# -*- coding: utf-8 -*-
2	# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
3	# SPDX-License-Identifier: Apache-2.0
4	from typing import List	×
5
6	import panphon	×
7	import panphon.distance	×
8
9	from pythainlp.tokenize import word_tokenize	×
NEW 10	from pythainlp.transliterate import pronunciate, transliterate	×
11
12	_ft = panphon.FeatureTable()	×
13	_dst = panphon.distance.Distance()	×
14
15	def _clean_ipa(ipa: str) -> str:	×
16	"""
17	Clean IPA by removing tones and space between phonetic codes
18
19	:param str ipa: IPA text
20	:return: IPA with tones removed from the text
21	:rtype: str
22	"""
23	return ipa.replace("˩˩˦","").replace("˥˩","").replace("˨˩","").replace("˦˥","").replace("˧","").replace("˧","").replace(" .",".").replace(". ",".").strip()	×
24
25	def word2audio(word: str) -> str:	×
26	"""
27	Convert word to IPA
28
29	:param str word: Thai word
30	:return: IPA with tones removed from the text
31	:rtype: str
32
33	:Example:
34	::
35
36	from pythainlp.soundex.sound import word2audio
37
38	word2audio("น้ำ")
39	# output : 'n aː m .'
40	"""
41	_word = word_tokenize(word)	×
42	_phone = [pronunciate(w, engine="w2p") for w in _word]	×
43	_ipa = [_clean_ipa(transliterate(phone, engine="thaig2p")) for phone in _phone]	×
44	return '.'.join(_ipa)	×
45
46	def audio_vector(word:str) -> List[List[int]]:	×
47	"""
48	Convert audio to vector list
49
50	:param str word: Thai word
51	:return: List of features from panphon
52	:rtype: List[List[int]]
53
54	:Example:
55	::
56
57	from pythainlp.soundex.sound import audio_vector
58
59	audio_vector("น้ำ")
60	# output : [[-1, 1, 1, -1, -1, -1, ...]]
61	"""
62	return _ft.word_to_vector_list(word2audio(word), numeric=True)	×
63
64	def word_approximation(word:str, list_word:List[str]):	×
65	"""
66	Thai Word Approximation
67
68	:param str word: Thai word
69	:param str list_word: Thai word
70	:return: List of approximation of words (The smaller the value, the closer)
71	:rtype: List[str]
72
73	:Example:
74	::
75
76	from pythainlp.soundex.sound import word_approximation
77
78	word_approximation("รถ", ["รด", "รส", "รม", "น้ำ"])
79	# output : [0.0, 0.0, 3.875, 8.375]
80	"""
81	_word = word2audio(word)	×
82	_list_word = [word2audio(w) for w in list_word]	×
83	_distance = [_dst.weighted_feature_edit_distance(_word, w) for w in _list_word]	×
84	return _distance	×

PyThaiNLP / pythainlp / 11625814262

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous