Skip to content

Commit 1954736

Browse files
authored
Merge pull request #200 from bact/dev
Improvement in romanization
2 parents 2ca00e3 + 47cd40a commit 1954736

File tree

17 files changed

+164
-112
lines changed

17 files changed

+164
-112
lines changed

README-pypi.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
![PyThaiNLP Logo](https://avatars0.githubusercontent.com/u/32934255?s=200&v=4)
22

3-
# PyThaiNLP 2.0.3
3+
# PyThaiNLP
44

55
PyThaiNLP is a Python library for natural language processing (NLP) of Thai language.
66

77
PyThaiNLP includes Thai word tokenizers, transliterators, soundex converters, part-of-speech taggers, and spell checkers.
88

99
📫 follow us on Facebook [PyThaiNLP](https://www.facebook.com/pythainlp/)
1010

11-
## What's new in version 2.0 ?
11+
## What's new in 2.0 ?
1212

1313
- Terminate Python 2 support. Remove all Python 2 compatibility code.
1414
- Improved `word_tokenize` ("newmm" and "mm" engine) and `dict_word_tokenize`

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@ $ pip install pythainlp[extra1,extra2,...]
6565
where ```extras``` can be
6666
- ```artagger``` (to support artagger part-of-speech tagger)*
6767
- ```deepcut``` (to support deepcut machine-learnt tokenizer)
68-
- ```icu``` (for ICU support in transliteration and tokenization)
69-
- ```ipa``` (for International Phonetic Alphabet support in transliteration)
68+
- ```icu``` (for ICU, International Components for Unicode, support in transliteration and tokenization)
69+
- ```ipa``` (for IPA, International Phonetic Alphabet, support in transliteration)
7070
- ```ml``` (to support fastai 1.0.22 ULMFiT models)
7171
- ```ner``` (for named-entity recognizer)
7272
- ```thai2fit``` (for Thai word vector)

notebooks/pythainlp-get-started.ipynb

Lines changed: 39 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -386,7 +386,7 @@
386386
"cell_type": "markdown",
387387
"metadata": {},
388388
"source": [
389-
"### Thai Character Cluster (TCC)\n",
389+
"### Subword and Thai Character Cluster (TCC)\n",
390390
"\n",
391391
"According to [Character Cluster Based Thai Information Retrieval](https://www.researchgate.net/publication/2853284_Character_Cluster_Based_Thai_Information_Retrieval) (Theeramunkong et al. 2004)."
392392
]
@@ -408,31 +408,11 @@
408408
}
409409
],
410410
"source": [
411-
"from pythainlp.tokenize import subword_tokenize\n",
411+
"from pythainlp import subword_tokenize\n",
412412
"\n",
413413
"subword_tokenize(\"ประเทศไทย\")"
414414
]
415415
},
416-
{
417-
"cell_type": "code",
418-
"execution_count": 16,
419-
"metadata": {},
420-
"outputs": [
421-
{
422-
"data": {
423-
"text/plain": [
424-
"False"
425-
]
426-
},
427-
"execution_count": 16,
428-
"metadata": {},
429-
"output_type": "execute_result"
430-
}
431-
],
432-
"source": [
433-
"isinstance(subword_tokenize(\"ประเทศไทย\", engine=\"etcc\"), str)"
434-
]
435-
},
436416
{
437417
"cell_type": "markdown",
438418
"metadata": {},
@@ -442,7 +422,7 @@
442422
},
443423
{
444424
"cell_type": "code",
445-
"execution_count": 17,
425+
"execution_count": 16,
446426
"metadata": {},
447427
"outputs": [
448428
{
@@ -451,20 +431,20 @@
451431
"['ป', 'ระ', 'เท', 'ศ', 'ไท', 'ย']"
452432
]
453433
},
454-
"execution_count": 17,
434+
"execution_count": 16,
455435
"metadata": {},
456436
"output_type": "execute_result"
457437
}
458438
],
459439
"source": [
460-
"from pythainlp import tcc\n",
440+
"from pythainlp.tokenize import tcc\n",
461441
"\n",
462442
"tcc.segment(\"ประเทศไทย\")"
463443
]
464444
},
465445
{
466446
"cell_type": "code",
467-
"execution_count": 18,
447+
"execution_count": 17,
468448
"metadata": {},
469449
"outputs": [
470450
{
@@ -473,7 +453,7 @@
473453
"{1, 3, 5, 6, 8, 9}"
474454
]
475455
},
476-
"execution_count": 18,
456+
"execution_count": 17,
477457
"metadata": {},
478458
"output_type": "execute_result"
479459
}
@@ -484,7 +464,7 @@
484464
},
485465
{
486466
"cell_type": "code",
487-
"execution_count": 19,
467+
"execution_count": 18,
488468
"metadata": {},
489469
"outputs": [
490470
{
@@ -509,7 +489,7 @@
509489
},
510490
{
511491
"cell_type": "code",
512-
"execution_count": 20,
492+
"execution_count": 19,
513493
"metadata": {},
514494
"outputs": [
515495
{
@@ -518,7 +498,7 @@
518498
"'maeo'"
519499
]
520500
},
521-
"execution_count": 20,
501+
"execution_count": 19,
522502
"metadata": {},
523503
"output_type": "execute_result"
524504
}
@@ -531,21 +511,34 @@
531511
},
532512
{
533513
"cell_type": "code",
534-
"execution_count": 21,
514+
"execution_count": 20,
535515
"metadata": {},
536516
"outputs": [
537517
{
538-
"name": "stdout",
539-
"output_type": "stream",
540-
"text": [
541-
"mɛːw\n"
542-
]
518+
"data": {
519+
"text/plain": [
520+
"'mɛːw'"
521+
]
522+
},
523+
"execution_count": 20,
524+
"metadata": {},
525+
"output_type": "execute_result"
543526
}
544527
],
545528
"source": [
546529
"from pythainlp.transliterate import transliterate\n",
547530
"\n",
548-
"print(transliterate(\"แมว\"))"
531+
"transliterate(\"แมว\")"
532+
]
533+
},
534+
{
535+
"cell_type": "code",
536+
"execution_count": 21,
537+
"metadata": {},
538+
"outputs": [],
539+
"source": [
540+
"#!pip3 install pythainlp[icu]\n",
541+
"#transliterate(\"แมว\", engine=\"icu\")"
549542
]
550543
},
551544
{
@@ -736,15 +729,15 @@
736729
{
737730
"data": {
738731
"text/plain": [
739-
"[('งวงช้าง', 12),\n",
740-
" ('เทิบทาบ', 7),\n",
741-
" ('กริน', 3),\n",
742-
" ('นาภี', 2),\n",
743-
" ('แด่วๆ', 3),\n",
744-
" ('คู่ใจ', 7),\n",
745-
" ('คุณพ่อ', 732),\n",
746-
" ('สิ้น', 755),\n",
747-
" ('เยาะ', 150)]"
732+
"[('ลุ่น', 4),\n",
733+
" ('คั่น', 53),\n",
734+
" ('ไก่ป่า', 29),\n",
735+
" ('ปริพาชก', 4),\n",
736+
" ('สิกขาบท', 4),\n",
737+
" ('คัดลายมือ', 2),\n",
738+
" ('เลียบ', 53),\n",
739+
" ('เกือบๆ', 6),\n",
740+
" ('จันทรคติ', 6)]"
748741
]
749742
},
750743
"execution_count": 28,

pythainlp/__init__.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,11 @@
2727
from pythainlp.soundex import soundex
2828
from pythainlp.spell import correct, spell
2929
from pythainlp.tag import pos_tag
30-
from pythainlp.tokenize import sent_tokenize, tcc, word_tokenize, Tokenizer
30+
from pythainlp.tokenize import (
31+
Tokenizer,
32+
sent_tokenize,
33+
subword_tokenize,
34+
word_tokenize,
35+
)
3136
from pythainlp.transliterate import romanize, transliterate
3237
from pythainlp.util import collate, thai_strftime

pythainlp/soundex/__init__.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# [KSS97] https://linux.thai.net/~thep/soundex/soundex.html
1313

1414

15-
def soundex(text: str, engine="udom83") -> str:
15+
def soundex(text: str, engine: str = "udom83") -> str:
1616
"""
1717
Thai Soundex
1818
@@ -24,9 +24,7 @@ def soundex(text: str, engine="udom83") -> str:
2424
* metasound
2525
:return: soundex code
2626
"""
27-
if engine == "udom83":
28-
_soundex = udom83
29-
elif engine == "lk82":
27+
if engine == "lk82":
3028
_soundex = lk82
3129
elif engine == "metasound":
3230
_soundex = metasound

pythainlp/spell/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
__all__ = ["DEFAULT_SPELL_CHECKER", "correct", "spell", "NorvigSpellChecker"]
1111

1212

13-
def spell(word: str, engine="pn") -> List[str]:
13+
def spell(word: str, engine: str = "pn") -> List[str]:
1414
"""
1515
:param str word: word to check spelling
1616
:param str engine:
@@ -21,7 +21,7 @@ def spell(word: str, engine="pn") -> List[str]:
2121
return DEFAULT_SPELL_CHECKER.spell(word)
2222

2323

24-
def correct(word: str, engine="pn") -> str:
24+
def correct(word: str, engine: str = "pn") -> str:
2525
"""
2626
:param str word: word to correct spelling
2727
:param str engine:

pythainlp/summarize/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,16 @@
33
Summarization
44
"""
55

6+
from typing import List
7+
68
from pythainlp.tokenize import sent_tokenize
79

810
from .freq import FrequencySummarizer
911

1012

11-
def summarize(text, n, engine="frequency", tokenizer="newmm"):
13+
def summarize(
14+
text: str, n: int, engine: str = "frequency", tokenizer: str = "newmm"
15+
) -> List[str]:
1216
"""
1317
Thai text summarization
1418

pythainlp/summarize/freq.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from collections import defaultdict
66
from heapq import nlargest
77
from string import punctuation
8+
from typing import List
89

910
from pythainlp.corpus import thai_stopwords
1011
from pythainlp.tokenize import sent_tokenize, word_tokenize
@@ -36,7 +37,7 @@ def __compute_frequencies(self, word_tokenized_sents):
3637
def __rank(self, ranking, n: int):
3738
return nlargest(n, ranking, key=ranking.get)
3839

39-
def summarize(self, text: str, n: int, tokenizer: str):
40+
def summarize(self, text: str, n: int, tokenizer: str) -> List[str]:
4041
sents = sent_tokenize(text)
4142
word_tokenized_sents = [word_tokenize(sent, tokenizer) for sent in sents]
4243
self.__freq = self.__compute_frequencies(word_tokenized_sents)

pythainlp/tokenize/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def word_tokenize(
2323
:Parameters for engine:
2424
* newmm (default) - dictionary-based, Maximum Matching + Thai Character Cluster
2525
* longest - dictionary-based, Longest Matching
26-
* icu - wrapper for ICU, dictionary-based
26+
* icu - wrapper for ICU (International Components for Unicode, using PyICU), dictionary-based
2727
* deepcut - wrapper for deepcut, language-model-based https://github.com/rkcosmos/deepcut
2828
* ulmfit - use newmm engine with a specific dictionary for use with thai2vec
2929
:return: list of words, tokenized from the text

pythainlp/tokenize/pyicu.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# -*- coding: utf-8 -*-
22
"""
3-
Wrapper for ICU word segmentation
3+
Wrapper for PyICU word segmentation
4+
https://github.com/ovalhub/pyicu
45
"""
56
import re
67
from typing import List

0 commit comments

Comments (0)