Skip to content

Commit e99825b

Browse files
committed
Add pythainlp.util.abbreviation_to_full_text
This function converts Thai text containing abbreviations to full text.
1 parent 71719f9 commit e99825b

File tree

5 files changed

+66
-0
lines changed

5 files changed

+66
-0
lines changed

docs/api/util.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ The :class:`pythainlp.util` contains utility functions, like text conversion and
77
Modules
88
-------
99

10+
.. autofunction:: abbreviation_to_full_text
1011
.. autofunction:: arabic_digit_to_thai_digit
1112
.. autofunction:: bahttext
1213
.. autofunction:: convert_years

pythainlp/util/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
__all__ = [
2020
"Trie",
21+
"abbreviation_to_full_text",
2122
"arabic_digit_to_thai_digit",
2223
"bahttext",
2324
"convert_years",
@@ -125,3 +126,4 @@
125126
from pythainlp.util.phoneme import nectec_to_ipa, ipa_to_rtgs, remove_tone_ipa
126127
from pythainlp.util.encoding import tis620_to_utf8
127128
import pythainlp.util.spell_words as spell_words
129+
from pythainlp.util.abbreviation import abbreviation_to_full_text

pythainlp/util/abbreviation.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# -*- coding: utf-8 -*-
2+
# Copyright (C) 2016-2023 PyThaiNLP Project
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""
16+
Thai abbreviation tools
17+
"""
18+
from typing import List, Tuple, Union
19+
20+
21+
def abbreviation_to_full_text(text: str) -> List[Tuple[str, Union[float, None]]]:
    """
    Convert Thai text containing abbreviations to full text.

    This function uses KhamYo to handle abbreviations.
    See more `KhamYo <https://github.com/wannaphong/KhamYo>`_.

    :param str text: Thai text
    :return: Thai full text that handles abbreviations as full text.
    :rtype: List[Tuple[str, Union[float, None]]]
    :raises ImportError: if the ``khamyo`` package cannot be imported

    :Example:
    ::

        from pythainlp.util import abbreviation_to_full_text

        text = "รร.ของเราน่าอยู่"

        abbreviation_to_full_text(text)
        # output: [
        # ('โรงเรียนของเราน่าอยู่', tensor(0.3734)),
        # ('โรงแรมของเราน่าอยู่', tensor(0.2438))
        # ]
    """
    # NOTE(review): the example output above shows tensor scores, while the
    # annotation declares float/None -- confirm what khamyo actually returns.
    try:
        # Imported inside the function so that importing this module does
        # not itself require khamyo.
        from khamyo import replace as _replace
    except ImportError as err:
        # Chain explicitly so the original import failure stays visible.
        raise ImportError(
            """
            This function needs to use khamyo.
            You can install by pip install khamyo or
            pip install pythainlp[abbreviation].
            """
        ) from err
    return _replace(text)

setup.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,9 @@
128128
"el":{
129129
"multiel>=0.5"
130130
},
131+
"abbreviation":{
132+
"khamyo>=0.2.0"
133+
},
131134
"full": [
132135
"PyYAML>=5.3.1",
133136
"attacut>=1.0.4",
@@ -162,6 +165,7 @@
162165
"ufal.chu-liu-edmonds>=1.0.2",
163166
"panphon>=0.20.0",
164167
"sentence-transformers>=2.2.2",
168+
"khamyo>=0.2.0",
165169
],
166170
}
167171

tests/test_util.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from pythainlp.corpus.common import _THAI_WORDS_FILENAME
1212
from pythainlp.util import (
1313
Trie,
14+
abbreviation_to_full_text,
1415
arabic_digit_to_thai_digit,
1516
bahttext,
1617
collate,
@@ -851,3 +852,6 @@ def test_spell_word(self):
851852
self.assertEqual(spell_word("เสื้อ"),['สอ', 'เอือ', 'ไม้โท', 'เสื้อ'])
852853
self.assertEqual(spell_word("คน"),['คอ', 'นอ', 'คน'])
853854
self.assertEqual(spell_word("คนดี"),['คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี'])
855+
856+
def test_abbreviation_to_full_text(self):
    # Check that the function returns a list. The expected type is the
    # second argument of assertIsInstance, not of the function under test
    # (which accepts only the text).
    self.assertIsInstance(
        abbreviation_to_full_text("รร.ของเราน่าอยู่"), list
    )

0 commit comments

Comments
 (0)