Skip to content

Commit ef1fb09

Browse files
authored
Merge pull request #896 from PyThaiNLP/add-pythainlp.morpheme
Add pythainlp.morpheme
2 parents a8993d3 + dd64c25 commit ef1fb09

File tree

8 files changed

+414
-129
lines changed

8 files changed

+414
-129
lines changed

docs/api/morpheme.rst

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
.. currentmodule:: pythainlp.morpheme
2+
3+
pythainlp.morpheme
4+
==================
5+
6+
The `pythainlp.morpheme` module collects functions for morpheme analysis, word formation, and more for the Thai language.
7+
8+
.. autofunction:: nighit
9+
10+
.. autofunction:: is_native_thai
11+
:noindex:
12+
13+
The `is_native_thai` function checks whether a word is a "native Thai word" (คำไทยแท้), as opposed to a loanword. It is based on a simple heuristic, so its result is not entirely reliable. It aids in etymology-aware text analysis and word categorization tasks.

docs/api/util.rst

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -77,11 +77,6 @@ Modules
7777

7878
The `ipa_to_rtgs` function focuses on converting International Phonetic Alphabet (IPA) transcriptions into Royal Thai General System of Transcription (RTGS) format. This is valuable for phonetic analysis and pronunciation guides.
7979

80-
.. autofunction:: is_native_thai
81-
:noindex:
82-
83-
The `is_native_thai` function is a language detection tool that identifies whether text is predominantly in the Thai language or not. It aids in language identification and text categorization tasks.
84-
8580
.. autofunction:: isthai
8681
:noindex:
8782

notebooks/create_words.ipynb

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"from pythainlp.transliterate import pronunciate\n",
10+
"from pythainlp import thai_consonants"
11+
]
12+
},
13+
{
14+
"cell_type": "code",
15+
"execution_count": 2,
16+
"metadata": {},
17+
"outputs": [
18+
{
19+
"data": {
20+
"text/plain": [
21+
"'พุด-ทะ'"
22+
]
23+
},
24+
"execution_count": 2,
25+
"metadata": {},
26+
"output_type": "execute_result"
27+
}
28+
],
29+
"source": [
30+
"pronunciate(\"พุทธ\")"
31+
]
32+
},
33+
{
34+
"cell_type": "code",
35+
"execution_count": 3,
36+
"metadata": {},
37+
"outputs": [
38+
{
39+
"data": {
40+
"text/plain": [
41+
"'บู-ชา'"
42+
]
43+
},
44+
"execution_count": 3,
45+
"metadata": {},
46+
"output_type": "execute_result"
47+
}
48+
],
49+
"source": [
50+
"pronunciate(\"บูชา\")"
51+
]
52+
},
53+
{
54+
"cell_type": "code",
55+
"execution_count": 4,
56+
"metadata": {},
57+
"outputs": [
58+
{
59+
"data": {
60+
"text/plain": [
61+
"'อะ-นุก'"
62+
]
63+
},
64+
"execution_count": 4,
65+
"metadata": {},
66+
"output_type": "execute_result"
67+
}
68+
],
69+
"source": [
70+
"pronunciate(\"อนุค\")"
71+
]
72+
},
73+
{
74+
"cell_type": "code",
75+
"execution_count": 5,
76+
"metadata": {},
77+
"outputs": [],
78+
"source": [
79+
"def nighit(w1,w2): # read: https://www.trueplookpanya.com/learning/detail/1180\n",
80+
" if not str(w1).endswith('ํ') and len(w1)!=2:\n",
81+
" raise NotImplementedError(f\"The function doesn't support {w1}.\")\n",
82+
" list_w1 = list(w1)\n",
83+
" list_w2 = list(w2)\n",
84+
" newword = list()\n",
85+
" newword.append(list_w1[0])\n",
86+
" newword.append(\"\")\n",
87+
" consonant_start = [i for i in list_w2 if i in set(thai_consonants)][0]\n",
88+
" if consonant_start in [\"\",\"\",\"\",\"\",\"\"]:\n",
89+
" newword.append(\"\")\n",
90+
" elif consonant_start in [\"\",\"\",\"\",\"\"]:\n",
91+
" newword.append(\"\")\n",
92+
" elif consonant_start in [\"\",\"\",\"\",\"\"]:\n",
93+
" newword.append(\"\")\n",
94+
" elif consonant_start in [\"\",\"\",\"\",\"\",\"\"]:\n",
95+
" newword.append(\"\")\n",
96+
" elif consonant_start in [\"\",\"\",\"\",\"\"]:\n",
97+
" newword.append(\"\")\n",
98+
" elif consonant_start in [\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\"]:\n",
99+
" newword.append(\"\")\n",
100+
" else:\n",
101+
" raise NotImplementedError(f\"The function doesn't support {w1} and {w2}.\")\n",
102+
" newword.extend(list_w2)\n",
103+
" return ''.join(newword)"
104+
]
105+
},
106+
{
107+
"cell_type": "code",
108+
"execution_count": 6,
109+
"metadata": {},
110+
"outputs": [],
111+
"source": [
112+
"assert nighit(\"สํ\",\"คีต\")==\"สังคีต\"\n",
113+
"assert nighit(\"สํ\",\"จร\")==\"สัญจร\"\n",
114+
"assert nighit(\"สํ\",\"ฐาน\")==\"สัณฐาน\"\n",
115+
"assert nighit(\"สํ\",\"นิษฐาน\")==\"สันนิษฐาน\"\n",
116+
"assert nighit(\"สํ\",\"ปทา\")==\"สัมปทา\"\n",
117+
"assert nighit(\"สํ\",\"โยค\")==\"สังโยค\""
118+
]
119+
},
120+
{
121+
"cell_type": "code",
122+
"execution_count": null,
123+
"metadata": {},
124+
"outputs": [],
125+
"source": []
126+
}
127+
],
128+
"metadata": {
129+
"kernelspec": {
130+
"display_name": "Python 3.8.13 ('base')",
131+
"language": "python",
132+
"name": "python3"
133+
},
134+
"language_info": {
135+
"codemirror_mode": {
136+
"name": "ipython",
137+
"version": 3
138+
},
139+
"file_extension": ".py",
140+
"mimetype": "text/x-python",
141+
"name": "python",
142+
"nbconvert_exporter": "python",
143+
"pygments_lexer": "ipython3",
144+
"version": "3.8.13"
145+
},
146+
"orig_nbformat": 4,
147+
"vscode": {
148+
"interpreter": {
149+
"hash": "a1d6ff38954a1cdba4cf61ffa51e42f4658fc35985cd256cd89123cae8466a39"
150+
}
151+
}
152+
},
153+
"nbformat": 4,
154+
"nbformat_minor": 2
155+
}

pythainlp/morpheme/__init__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# -*- coding: utf-8 -*-
2+
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
"""
6+
PyThaiNLP morpheme
7+
"""
8+
__all__ = [
9+
"nighit",
10+
"is_native_thai"
11+
]
12+
from pythainlp.morpheme.word_formation import nighit
13+
from pythainlp.morpheme.thaiwordcheck import is_native_thai
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
# -*- coding: utf-8 -*-
2+
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
3+
# SPDX-License-Identifier: Apache-2.0
4+
"""
5+
Check if a word is a "native Thai word"
6+
7+
Adapted from
8+
https://github.com/wannaphong/open-thai-nlp-document/blob/master/check_thai_word.md
9+
10+
References
11+
- ทีมงานทรูปลูกปัญญา 2015. ลักษณะของคำไทยแท้ \
12+
http://www.trueplookpanya.com/learning/detail/30589-043067
13+
- วารุณี บำรุงรส 2010. คำไทยแท้ https://www.gotoknow.org/posts/377619
14+
"""
15+
import re
16+
17+
# Thanthakhat: the mark that cancels (silences) the sound of a letter.
_THANTHAKHAT_CHAR = "\u0e4c"

# Characters whose presence marks a word as non-native
# (the Thanthakhat mark is included alongside the consonants).
_TH_NON_NATIVE_CHARS = set("ฆณฌฎฏฐฑฒธศษฬ" + _THANTHAKHAT_CHAR)

# Final consonants accepted as native endings.
_TH_NATIVE_FINALS = set("กดบนงมยว")

# Words accepted as native outright; several of them contain characters
# from _TH_NON_NATIVE_CHARS and would otherwise be rejected.
_TH_NATIVE_WORDS = {
    "ฆ่า", "เฆี่ยน", "ศึก", "ศอก", "เศิก", "เศร้า",
    "ธ", "ณ", "ฯพณฯ",
    "ใหญ่", "หญ้า", "ควาย", "ความ", "กริ่งเกรง", "ผลิ",
}

# Diphthong prefixes that can start a native Thai word.
_TH_PREFIX_DIPHTHONG = {"กะ", "กระ", "ปะ", "ประ"}

# Matches Thai consonants. O ANG (U+0E2D) is deliberately left out of the
# character class because it can be considered a vowel.
_TH_CONSONANTS_PATTERN = re.compile(r"[ก-ฬฮ]", re.U)


def is_native_thai(word: str) -> bool:
    """
    Tell whether a word looks like a "native Thai word" (Thai: "คำไทยแท้").

    The decision is a simple heuristic and cannot be entirely reliable.
    After surrounding whitespace is stripped, the checks run in this order:

    1. a word listed as a known native word is accepted;
    2. a word containing any non-native character is rejected;
    3. a word without any Thai consonant is rejected;
    4. a word with exactly one Thai consonant is accepted;
    5. a word whose last character is a native final consonant is accepted;
    6. a word that *is* one of the diphthong prefixes is accepted;
    7. anything else is rejected.

    :param str word: word to check
    :return: True if the word is judged to be native Thai, False otherwise
        (also False for non-string or blank input)
    :rtype: bool

    :Example:

    English word::

        from pythainlp.morpheme import is_native_thai

        is_native_thai("Avocado")
        # output: False

    Native Thai word::

        is_native_thai("มะม่วง")
        # output: True
        is_native_thai("ตะวัน")
        # output: True

    Non-native Thai word::

        is_native_thai("สามารถ")
        # output: False
        is_native_thai("อิสริยาภรณ์")
        # output: False
    """
    if not isinstance(word, str):
        return False

    text = word.strip()
    if not text:
        return False

    # Known native words win over every other rule.
    if text in _TH_NATIVE_WORDS:
        return True

    # Any non-native character disqualifies the word.
    if not _TH_NON_NATIVE_CHARS.isdisjoint(text):
        return False

    consonant_count = len(_TH_CONSONANTS_PATTERN.findall(text))

    # No Thai consonant at all -> it cannot be Thai.
    if consonant_count == 0:
        return False

    # NOTE: the last test compares the whole word against the prefix set, so
    # it only accepts a bare prefix such as "กระ"; recognising words that
    # merely start with a prefix would need prefix-aware tokenization.
    return (
        consonant_count == 1
        or text[-1] in _TH_NATIVE_FINALS
        or text in _TH_PREFIX_DIPHTHONG
    )
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# -*- coding: utf-8 -*-
2+
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
3+
# SPDX-License-Identifier: Apache-2.0
4+
from pythainlp import thai_consonants
5+
6+
7+
# Nasal consonant that replaces the nighit, grouped by the consonant class
# of the first consonant of the following word. The first five rows are the
# Pali consonant classes, whose own nasal is used; the last row holds the
# unclassed consonants, which take "ง".
_NIGHIT_NASAL_GROUPS = (
    ("กขคฆง", "ง"),
    ("จฉชฌญ", "ญ"),
    ("ฎฏฐฑฒณ", "ณ"),
    ("ดตถทธน", "น"),
    ("ปผพภม", "ม"),
    ("ยรลฬวศษสห", "ง"),
)

# Flattened lookup: first consonant of the second word -> nasal to insert.
_NIGHIT_NASALS = {
    consonant: nasal
    for group, nasal in _NIGHIT_NASAL_GROUPS
    for consonant in group
}


def nighit(w1: str, w2: str) -> str:
    """
    Join two Pali-rooted words with the nighit rule.

    Nighit (นิคหิต or ํ ) is the niggahita sign. When a prefix written with
    a nighit is joined to another word, the nighit becomes a nasal consonant
    chosen by the first consonant of that word, and the prefix takes the
    vowel mai han-akat ( ั ). This function applies that simple rule.

    Read more: https://www.trueplookpanya.com/learning/detail/1180

    :param str w1: A two-character Thai prefix: one consonant followed by
        a nighit, e.g. "สํ".
    :param str w2: A Thai word. Its first Thai consonant (leading vowels
        are skipped) selects the nasal.
    :return: The combined Thai word.
    :rtype: str
    :raises NotImplementedError: if ``w1`` is not exactly one character
        followed by a nighit, or if ``w2`` has no Thai consonant or starts
        with a consonant the rule does not cover.

    :Example:
    ::

        from pythainlp.morpheme import nighit

        assert nighit("สํ","คีต")=="สังคีต"
        assert nighit("สํ","จร")=="สัญจร"
        assert nighit("สํ","ฐาน")=="สัณฐาน"
        assert nighit("สํ","นิษฐาน")=="สันนิษฐาน"
        assert nighit("สํ","ปทา")=="สัมปทา"
        assert nighit("สํ","โยค")=="สังโยค"
    """
    # Both conditions are required, so failing either one must reject w1.
    if len(w1) != 2 or not w1.endswith("ํ"):
        raise NotImplementedError(f"The function doesn't support {w1}.")

    # Build the membership set once instead of once per character.
    consonants = set(thai_consonants)
    consonant_start = next((ch for ch in w2 if ch in consonants), None)

    # A missing consonant (None) and an uncovered consonant are both
    # reported the same way.
    nasal = _NIGHIT_NASALS.get(consonant_start)
    if nasal is None:
        raise NotImplementedError(
            f"The function doesn't support {w1} and {w2}."
        )

    return "".join([w1[0], "ั", nasal, w2])

0 commit comments

Comments
 (0)