Skip to content

Commit 218fd27

Browse files
authored
Merge pull request #703 from PyThaiNLP/dev
Update add-word_detokenize from dev
2 parents ae452a2 + 7479baa commit 218fd27

File tree

10 files changed

+197
-13
lines changed

10 files changed

+197
-13
lines changed

.github/workflows/windows-test.yml

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
name: Windows Unit test and code coverage
2+
3+
on:
4+
push:
5+
paths-ignore:
6+
- '**.md'
7+
- 'docs/**'
8+
pull_request:
9+
branches:
10+
- dev
11+
paths-ignore:
12+
- '**.md'
13+
- 'docs/**'
14+
15+
jobs:
16+
build:
17+
18+
runs-on: ${{ matrix.os }}
19+
strategy:
20+
fail-fast: false
21+
matrix:
22+
os: [windows-latest]
23+
python-version: [3.8]
24+
25+
steps:
26+
- uses: actions/checkout@v2
27+
- uses: conda-incubator/setup-miniconda@v2
28+
with:
29+
python-version: ${{ matrix.python-version }}
30+
auto-activate-base: true
31+
auto-update-conda: true
32+
- shell: powershell
33+
run: |
34+
conda info
35+
conda list
36+
- name: Install PyTorch
37+
shell: powershell
38+
run: |
39+
pip install torch==1.8.1
40+
- name: Install dependencies
41+
shell: powershell
42+
run: |
43+
python -m pip install --disable-pip-version-check --user --upgrade pip setuptools
44+
python -m pip --version
45+
python -m pip install pytest coverage coveralls
46+
conda install -y -c conda-forge fairseq
47+
python -m pip install https://www.dropbox.com/s/o6p2sj5z50iim1e/PyICU-2.3.1-cp38-cp38-win_amd64.whl?dl=1
48+
python -m pip install -r docker_requirements.txt
49+
python -m pip install .[full]
50+
python -m nltk.downloader omw-1.4
51+
python -m pip install spacy deepcut
52+
- name: Test
53+
shell: powershell
54+
env:
55+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
56+
COVERALLS_SERVICE_NAME: github
57+
run: |
58+
coverage run -m unittest discover
59+
coveralls

docs/api/soundex.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ Modules
1111
.. autofunction:: lk82
1212
.. autofunction:: udom83
1313
.. autofunction:: metasound
14+
.. autofunction:: prayut_and_somchaip
1415

1516
References
1617
----------
@@ -23,3 +24,5 @@ References
2324
Master Thesis. Chulalongkorn University, Thailand.
2425
2526
.. [#lk82] วิชิต หล่อจีระชุณห์กุล และ เจริญ คุวินทร์พันธุ์. `โปรแกรมการสืบค้นคำไทยตามเสียงอ่าน (Thai Soundex) <http://guru.sanook.com/1520/>`_.
27+
28+
.. [#prayut_and_somchaip] Prayut Suwanvisat, Somchai Prasitjutrakul. Thai-English Cross-Language Transliterated Word Retrieval using Soundex Technique. In 1998 [cited 2022 Sep 8]. Available from: https://www.cp.eng.chula.ac.th/~somchai/spj/papers/ThaiText/ncsec98-clir.pdf

pythainlp/corpus/core.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def get_corpus_db_detail(name: str, version: str = None) -> dict:
4747
:return: details about a corpus
4848
:rtype: dict
4949
"""
50-
with open(corpus_db_path(), "r", encoding="utf-8") as f:
50+
with open(corpus_db_path(), "r", encoding="utf-8-sig") as f:
5151
local_db = json.load(f)
5252

5353
if version is None:
@@ -378,7 +378,7 @@ def download(
378378

379379
# check if corpus is available
380380
if name in corpus_db:
381-
with open(corpus_db_path(), "r", encoding="utf-8") as f:
381+
with open(corpus_db_path(), "r", encoding="utf-8-sig") as f:
382382
local_db = json.load(f)
383383

384384
corpus = corpus_db[name]
@@ -509,7 +509,7 @@ def remove(name: str) -> bool:
509509
if _CHECK_MODE == "1":
510510
print("PyThaiNLP is read-only mode. It can't remove corpus.")
511511
return False
512-
with open(corpus_db_path(), "r", encoding="utf-8") as f:
512+
with open(corpus_db_path(), "r", encoding="utf-8-sig") as f:
513513
db = json.load(f)
514514
data = [
515515
corpus for corpus in db["_default"].values() if corpus["name"] == name

pythainlp/soundex/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,13 @@
1010
"lk82",
1111
"metasound",
1212
"udom83",
13+
"prayut_and_somchaip",
1314
]
1415

1516
from pythainlp.soundex.lk82 import lk82
1617
from pythainlp.soundex.metasound import metasound
1718
from pythainlp.soundex.udom83 import udom83
19+
from pythainlp.soundex.prayut_and_somchaip import prayut_and_somchaip
1820

1921
DEFAULT_SOUNDEX_ENGINE = "udom83"
2022

pythainlp/soundex/core.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,21 @@
77
from pythainlp.soundex.lk82 import lk82
88
from pythainlp.soundex.metasound import metasound
99
from pythainlp.soundex.udom83 import udom83
10+
from pythainlp.soundex.prayut_and_somchaip import prayut_and_somchaip
1011
from pythainlp.soundex import DEFAULT_SOUNDEX_ENGINE
1112

1213
# Other Thai soundex systems (not implemented yet): Arun91, KSS97
1314
# [KSS97] https://linux.thai.net/~thep/soundex/soundex.html
1415

1516

16-
def soundex(text: str, engine: str = DEFAULT_SOUNDEX_ENGINE) -> str:
17+
def soundex(text: str, engine: str = DEFAULT_SOUNDEX_ENGINE, length: int = 4) -> str:
1718
"""
1819
This function converts Thai text into phonetic code.
1920
2021
:param str text: word
2122
:param str engine: soundex engine
23+
:param int length: preferred length of the Soundex code (default is 4)\
24+
for metasound and prayut_and_somchaip only
2225
:return: Soundex code
2326
:rtype: str
2427
@@ -29,6 +32,8 @@ def soundex(text: str, engine: str = DEFAULT_SOUNDEX_ENGINE) -> str:
2932
Wannee Udompanich [#lk82]_
3033
* *metasound* - Thai soundex algorithm based on a combination
3134
of Metaphone and Soundex proposed by Snae & Brückner [#metasound]_
35+
* *prayut_and_somchaip* - Thai-English Cross-Language Transliterated
36+
Word Retrieval using Soundex Technique [#prayut_and_somchaip]_
3237
3338
:Example:
3439
::
@@ -54,12 +59,18 @@ def soundex(text: str, engine: str = DEFAULT_SOUNDEX_ENGINE) -> str:
5459
soundex("ปัจจุบัน"), soundex("ปัจจุบัน", engine='lk82'), \\
5560
soundex("ปัจจุบัน", engine='metasound')
5661
# output: ('ป775300', 'ป3E54', 'ป223')
62+
63+
soundex("vp", engine="prayut_and_somchaip")
64+
# output: '11'
65+
soundex("วีพี", engine="prayut_and_somchaip")
66+
# output: '11'
5767
"""
5868
if engine == "lk82":
59-
_soundex = lk82
69+
_soundex = lk82(text)
70+
elif engine == "prayut_and_somchaip":
71+
_soundex = prayut_and_somchaip(text, length=length)
6072
elif engine == "metasound":
61-
_soundex = metasound
73+
_soundex = metasound(text, length=length)
6274
else: # default, use "udom83"
63-
_soundex = udom83
64-
65-
return _soundex(text)
75+
_soundex = udom83(text)
76+
return _soundex

pythainlp/soundex/metasound.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def metasound(text: str, length: int = 4) -> str:
3737
:Example:
3838
::
3939
40-
from pythainlp.metasound import metasound
40+
from pythainlp.soundex.metasound import metasound
4141
4242
metasound("ลัก")
4343
# output: 'ล100'
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Thai-English Cross-Language Transliterated Word Retrieval
4+
using Soundex Technique
5+
6+
References:
7+
Prayut Suwanvisat, Somchai Prasitjutrakul. Thai-English Cross-Language Transliterated Word Retrieval using Soundex Technique. In 1998 [cited 2022 Sep 8]. Available from: https://www.cp.eng.chula.ac.th/~somchai/spj/papers/ThaiText/ncsec98-clir.pdf
8+
"""
9+
from pythainlp import thai_characters
10+
_C0 = "AEIOUHWYอ"
11+
_C1 = "BFPVบฝฟปผพภว"
12+
_C2 = "CGJKQSXZขฃคฅฆฉขฌกจซศษส"
13+
_C3 = "DTฎดฏตฐฑฒถทธ"
14+
_C4 = "Lลฬ"
15+
_C5 = "MNมณน"
16+
_C6 = "Rร"
17+
_C7 = "AEIOUอ"
18+
_C8 = "Hหฮ"
19+
_C1_1 = "Wว"
20+
_C9 = "Yยญ"
21+
_C52 = "ง"
22+
23+
24+
def prayut_and_somchaip(text: str, length: int = 4) -> str:
    """
    Convert an English or Thai transliterated word into a phonetic code
    with the matching technique called **Soundex**
    [#prayut_and_somchaip]_.

    :param str text: English-Thai Cross-Language Transliterated Word
    :param int length: preferred length of the Soundex code (default is 4);
        the trailing ``length`` characters of the full code are kept

    :return: Soundex for the given text, or an empty string when ``text``
        is empty or not a string, or when ``length`` is not positive
    :rtype: str

    :Example:
    ::

        from pythainlp.soundex.prayut_and_somchaip import prayut_and_somchaip

        prayut_and_somchaip("king", 2)
        # output: '52'

        prayut_and_somchaip("คิง", 2)
        # output: '52'
    """
    if not text or not isinstance(text, str):
        return ""
    # A slice such as seq[-0:] yields the whole sequence, so a non-positive
    # length must be handled explicitly instead of returning the full code.
    if length <= 0:
        return ""

    # Keep Thai characters and A-Z letters only; everything else (digits,
    # punctuation, whitespace) is dropped before positions are counted.
    allowed = thai_characters + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    letters = [ch for ch in text.upper() if ch in allowed]

    # Groups are tested in this exact order; the first match wins.
    any_position = (
        (_C1, "1"),
        (_C2, "2"),
        (_C3, "3"),
        (_C4, "4"),
        (_C5, "5"),
        (_C6, "6"),
        (_C52, "52"),
    )
    # These groups only produce a code when the letter is not the first one.
    non_initial = (
        (_C7, "7"),
        (_C8, "8"),
        (_C1_1, "1"),
        (_C9, "9"),
    )

    codes = []
    for position, ch in enumerate(letters):
        if position == 0:
            # A leading letter from the _C0 group is always coded "0".
            if ch in _C0:
                codes.append("0")
                continue
            groups = any_position
        else:
            groups = any_position + non_initial
        for group, code in groups:
            if ch in group:
                codes.append(code)
                break
        # Letters that match no group contribute nothing to the code.

    # "52" is a two-character code, so truncate on the joined string
    # rather than on the list of per-letter codes.
    return "".join(codes)[-length:]

pythainlp/tag/_tag_perceptron.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ def load(self, loc: str) -> None:
202202
:param str loc: model path
203203
"""
204204
try:
205-
with open(loc, "r", encoding='utf-8') as f:
205+
with open(loc, "r", encoding='utf-8-sig') as f:
206206
w_td_c = json.load(f)
207207
except IOError:
208208
msg = "Missing trontagger.json file."

pythainlp/tokenize/crfcut.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,10 @@ def segment(text: str) -> List[str]:
189189
:param str text: text to be tokenized to sentences
190190
:return: list of words, tokenized from the text
191191
"""
192-
toks = word_tokenize(text)
192+
if isinstance(text, str):
193+
toks = word_tokenize(text)
194+
else:
195+
toks = text
193196
feat = extract_features(toks)
194197
labs = _tagger.tag(feat)
195198
labs[-1] = "E" # make sure it cuts the last sentence

tests/test_soundex.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,18 @@
22

33
import unittest
44

5-
from pythainlp.soundex import lk82, metasound, soundex, udom83
5+
from pythainlp.soundex import lk82, metasound, soundex, udom83, prayut_and_somchaip
66

77

88
class TestSoundexPackage(unittest.TestCase):
99
def test_soundex(self):
1010
self.assertIsNotNone(soundex("a", engine="lk82"))
1111
self.assertIsNotNone(soundex("a", engine="udom83"))
1212
self.assertIsNotNone(soundex("a", engine="metasound"))
13+
self.assertEqual(
14+
soundex("vp", engine="prayut_and_somchaip"),
15+
soundex("วีพี", engine="prayut_and_somchaip")
16+
)
1317
self.assertIsNotNone(soundex("a", engine="XXX"))
1418

1519
self.assertEqual(lk82(None), "")
@@ -54,3 +58,18 @@ def test_soundex(self):
5458
self.assertIsNotNone(metasound("กาโวกาโว"))
5559
self.assertIsNotNone(metasound("สุวรรณา"))
5660
self.assertIsNotNone(metasound("ดอยบอย"))
61+
62+
self.assertEqual(prayut_and_somchaip(None), "")
63+
self.assertEqual(prayut_and_somchaip(""), "")
64+
self.assertEqual(prayut_and_somchaip("vp"), "11")
65+
self.assertIsNotNone(prayut_and_somchaip("บา"))
66+
self.assertIsNotNone(prayut_and_somchaip("go"))
67+
self.assertIsNotNone(prayut_and_somchaip("อด"))
68+
self.assertIsNotNone(prayut_and_somchaip("ลน"))
69+
self.assertIsNotNone(prayut_and_somchaip("มอ"))
70+
self.assertIsNotNone(prayut_and_somchaip("รอ"))
71+
self.assertIsNotNone(prayut_and_somchaip("ขอ"))
72+
self.assertIsNotNone(prayut_and_somchaip("บน"))
73+
self.assertIsNotNone(prayut_and_somchaip("ณาญ"))
74+
self.assertIsNotNone(prayut_and_somchaip("กาง"))
75+
self.assertIsNotNone(prayut_and_somchaip("ว้าว"))

0 commit comments

Comments
 (0)