Skip to content

Commit 52ff97e

Browse files
authored
Merge pull request #793 from HRNPH/dev
Fix mishandling Karun in Kavee Matra Checker
2 parents 462a83e + dd4f0a1 commit 52ff97e

File tree

2 files changed

+22
-7
lines changed

2 files changed

+22
-7
lines changed

pythainlp/khavee/core.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
from pythainlp.tokenize import subword_tokenize
1717
from pythainlp.util import sound_syllable
1818

19-
2019
class KhaveeVerifier:
2120
def __init__(self):
2221
"""
@@ -215,11 +214,7 @@ def check_marttra(self, word: str) -> str:
215214
"""
216215
if word[-1] == 'ร' and word[-2] in ['ต','ท'] :
217216
word = word[:-1]
218-
if '์' in word[-1]:
219-
if 'ิ' in word[-2] or 'ุ' in word[-2]:
220-
word = word[:-3]
221-
else:
222-
word = word[:-2]
217+
word = self.handle_karun_sound_silence(word)
223218
if 'ำ' in word or ('ํ' in word and 'า' in word) or 'ไ' in word or 'ใ' in word:
224219
return 'กา'
225220
elif word[-1] in ['า','ะ','ิ','ี','ุ','ู','อ'] or ('ี' in word and 'ย' in word[-1]) or ('ื' in word and 'อ' in word[-1]):
@@ -417,7 +412,6 @@ def check_klon(self, text: str,k_type: int=8) -> Union[List[str], str]:
417412
def check_aek_too(self, text: Union[List[str], str], dead_syllable_as_aek:bool = False) -> Union[List[bool], List[str], bool, str]:
418413
"""
419414
Thai tonal word checker
420-
421415
:param Union[List[str], str] text: Thai word or list of Thai words
422416
:param bool dead_syllable_as_aek: if True, dead syllable will be considered as aek
423417
:return: the check if the word is aek or too or False(not both) or list of the check if input is list
@@ -453,3 +447,22 @@ def check_aek_too(self, text: Union[List[str], str], dead_syllable_as_aek:bool =
453447
return 'aek'
454448
else:
455449
return False
450+
451+
def handle_karun_sound_silence(self, word: str) -> str:
    """
    Strip the silenced tail of a Thai word that ends with a karun ('์').

    The karun mark silences the character(s) written just before it.
    This method removes the trailing karun together with the silenced
    characters and returns what is left of the word.

    Either one or two characters before the karun are removed. Two are
    removed when the character three positions before the karun is a
    Thai consonant (e.g. 'จันทร์' -> 'จัน', 'สิทธิ์' -> 'สิท'); otherwise
    only one is removed (e.g. 'การันต์' -> 'การัน').

    Words that do not end with a karun are returned unchanged. Inputs
    too short to have anything left before the silenced part (such as
    a lone karun, or a single consonant followed by a karun) yield an
    empty string instead of raising ``IndexError`` or silently wrapping
    around to the end of the string.

    :param str word: Thai word
    :return: Thai word with the silenced characters and karun stripped
    :rtype: str
    """
    if not word.endswith('์'):
        return word

    thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ"

    # The karun is the last character, so the character it sits on
    # (a consonant, or a vowel mark such as 'ิ' / 'ุ') is right before it.
    locate_silenced = len(word) - 2

    # Only probe inside the string: a negative index would wrap around
    # to the end of the word (or raise) and give a meaningless answer.
    probe = locate_silenced - 2
    can_silence_two = probe >= 0 and word[probe] in thai_consonants
    cut_off = 2 if can_silence_two else 1

    # Clamp at 0 so a very short word cannot produce a negative slice end.
    return word[:max(locate_silenced + 1 - cut_off, 0)]

pythainlp/khavee/example.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,3 +66,5 @@
6666
# -> False, aek, too
6767
print(kv.check_aek_too(['เอง', 'เอ่ง', 'เอ้ง'])) # ใช้ List ได้เหมือนกัน
6868
# -> [False, 'aek', 'too']
69+
print(kv.check_aek_too(['ห๊ะ', 'เอ่ง', 'เอ้ง'], dead_syllable_as_aek=True)) # ใช้ List ได้เหมือนกัน และสามารถตั้งค่า ให้นับคำที่เสียงตายเป็นเอกได้ ตามการเช็คคฉันทลักษณ์กลอน
70+
# -> ['aek', 'aek', 'too']

0 commit comments

Comments
 (0)