Skip to content

Commit d460708

Browse files
authored
Merge pull request #418 from PyThaiNLP/unittest-main
Add unit test to CLI functions
2 parents ee67869 + 461034a commit d460708

File tree

13 files changed

+445
-224
lines changed

13 files changed

+445
-224
lines changed

pythainlp/__main__.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,33 +5,35 @@
55
from pythainlp import cli
66

77

8-
def main(args=None):
8+
def main(argv=None):
99
"""ThaiNLP command line."""
10-
if args is None:
11-
args = sys.argv[1:]
10+
if not argv:
11+
argv = sys.argv
1212

1313
parser = argparse.ArgumentParser(
14-
"thainlp", usage="thainlp <command> [options]"
14+
prog="thainlp",
15+
description="Thai natural language processing.",
16+
usage=(
17+
"thainlp <command> [options]\n\n"
18+
"Example:\n\n"
19+
"thainlp data catalog\n\n"
20+
"--"
21+
),
1522
)
16-
1723
parser.add_argument(
1824
"command",
1925
type=str,
2026
choices=cli.COMMANDS,
2127
help="text processing action",
2228
)
2329

24-
args = parser.parse_args(sys.argv[1:2])
25-
30+
args = parser.parse_args(argv[1:2])
2631
cli.exit_if_empty(args.command, parser)
2732

2833
if hasattr(cli, args.command):
2934
command = getattr(cli, args.command)
30-
command.App(sys.argv)
31-
else:
32-
print(f"Command not available: {args.command}")
33-
print("Please run with --help for alternatives")
35+
command.App(argv)
3436

3537

3638
if __name__ == "__main__":
37-
main()
39+
main(argv=sys.argv)

pythainlp/cli/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import sys
33
from argparse import ArgumentParser
44

5-
from . import data, soundex, tag, tokenize
5+
from pythainlp.cli import data, soundex, tag, tokenize
66

77
# a command should be a verb when possible
88
COMMANDS = sorted(["data", "soundex", "tag", "tokenize"])

pythainlp/cli/data.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,26 @@
1010
class App:
1111
def __init__(self, argv):
1212
parser = argparse.ArgumentParser(
13-
prog="data", usage="thainlp data <subcommand>",
13+
prog="data",
14+
description="Manage dataset/corpus.",
15+
usage=(
16+
"thainlp data <subcommand>\n\n"
17+
"subcommands:\n\n"
18+
"catalog show list of available datasets\n"
19+
"info <dataset_name> show information about the dataset\n"
20+
"get <dataset_name> download the dataset\n"
21+
"rm <dataset_name> remove the dataset\n"
22+
"path show full path to data directory\n\n"
23+
"Example:\n\n"
24+
"thainlp data get thai2fit_wv\n\n"
25+
"Current data path:\n\n"
26+
f"{get_pythainlp_data_path()}\n\n"
27+
"To change PyThaiNLP data path, set the operating system's\n"
28+
"PYTHAINLP_DATA_DIR environment variable.\n\n"
29+
"For more information about corpora that PyThaiNLP use, see:\n"
30+
"https://github.com/PyThaiNLP/pythainlp-corpus/\n\n"
31+
"--"
32+
),
1433
)
1534
parser.add_argument(
1635
"subcommand",
@@ -78,8 +97,10 @@ def catalog(self, argv):
7897
else:
7998
print()
8099

81-
print("\nUse subcommand 'get' to download dataset.")
82-
print("Example: thainlp data get crfcut")
100+
print(
101+
"\nUse subcommand 'get' to download a dataset.\n\n"
102+
"Example: thainlp data get crfcut\n"
103+
)
83104

84105
def path(self, argv):
85106
"""Print path for local dataset."""

pythainlp/cli/soundex.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,18 @@ def __init__(self, argv):
1414
parser = argparse.ArgumentParser(
1515
prog="soundex",
1616
description="Convert a text to its sound-based index.",
17+
usage=(
18+
"thainlp soundex [-a algorithm] <text>\n\n"
19+
"algorithms:\n\n"
20+
"udom83\n"
21+
"lk82\n"
22+
"metasound\n\n"
23+
f"Default soundex algorithm is {DEFAULT_SOUNDEX_ENGINE}.\n\n"
24+
"<text> should be inside double quotes.\n\n"
25+
"Example:\n\n"
26+
'thainlp soundex -a lk82 "มอเตอร์ไซค์"\n\n'
27+
"--"
28+
),
1729
)
1830
parser.add_argument(
1931
"-a",

pythainlp/cli/tag.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,10 @@
99

1010
class SubAppBase:
1111
def __init__(self, name, argv):
12-
parser = argparse.ArgumentParser(name)
12+
parser = argparse.ArgumentParser(**cli.make_usage("tag " + name))
1313
parser.add_argument(
1414
"text", type=str, help="input text",
1515
)
16-
1716
parser.add_argument(
1817
"-s",
1918
"--sep",
@@ -43,17 +42,30 @@ def __init__(self, *args, **kwargs):
4342

4443
class App:
4544
def __init__(self, argv):
46-
parser = argparse.ArgumentParser(**cli.make_usage("tag"))
47-
parser.add_argument("subcommand", type=str, help="[pos]")
45+
parser = argparse.ArgumentParser(
46+
prog="tag",
47+
description="Annotate a text with linguistic information",
48+
usage=(
49+
'thainlp tag <tag_type> [--sep "<separator>"] "<text>"\n\n'
50+
"tag_type:\n\n"
51+
"pos part-of-speech\n\n"
52+
"<separator> and <text> should be inside double quotes.\n"
53+
"<text> should be a tokenized text, "
54+
"with tokens separated by <separator>.\n\n"
55+
"Example:\n\n"
56+
'thainlp tag pos -s " " "แรงดึงดูด เก็บ หัว คุณ ลง"\n\n'
57+
"--"
58+
),
59+
)
60+
parser.add_argument("tag_type", type=str, help="[pos]")
4861

4962
args = parser.parse_args(argv[2:3])
50-
51-
cli.exit_if_empty(args.subcommand, parser)
52-
subcommand = str.lower(args.subcommand)
63+
cli.exit_if_empty(args.tag_type, parser)
64+
tag_type = str.lower(args.tag_type)
5365

5466
argv = argv[3:]
5567

56-
if subcommand == "pos":
68+
if tag_type == "pos":
5769
POSTaggingApp("Part-of-Speech tagging", argv)
5870
else:
59-
print(f"Tag type not available: {subcommand}")
71+
print(f"Tag type not available: {tag_type}")

pythainlp/cli/tokenize.py

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -108,25 +108,45 @@ def __init__(self, *args, **kwargs):
108108

109109
class App:
110110
def __init__(self, argv):
111-
parser = argparse.ArgumentParser(**cli.make_usage("tokenize"))
111+
parser = argparse.ArgumentParser(
112+
prog="tokenize",
113+
description="Break a text into small units (tokens).",
114+
usage=(
115+
'thainlp tokenize <token_type> [options] "<text>"\n\n'
116+
"token_type:\n\n"
117+
"subword subword (may not be a linguistic unit)\n"
118+
"syllable syllable\n"
119+
"word word\n"
120+
"sent sentence\n\n"
121+
"options:\n\n"
122+
"--sep or -s <separator> specify custom separator\n"
123+
" (default is a space)\n"
124+
"--algo or -a <algorithm> tokenization algorithm\n"
125+
" (see API doc for more info)\n"
126+
"--keep-whitespace or -w keep whitespaces in output\n"
127+
" (default)\n\n"
128+
"<separator> and <text> should be inside double quotes.\n\n"
129+
"Example:\n\n"
130+
'thainlp tokenize word -s "|" "ใต้แสงนีออนเปลี่ยวเหงา"\n\n'
131+
"--"
132+
),
133+
)
112134
parser.add_argument(
113-
"subcommand", type=str, help="[subword|syllable|word|sent]",
135+
"token_type", type=str, help="[subword|syllable|word|sent]",
114136
)
115137

116138
args = parser.parse_args(argv[2:3])
117-
118-
cli.exit_if_empty(args.subcommand, parser)
119-
subcommand = str.lower(args.subcommand)
139+
cli.exit_if_empty(args.token_type, parser)
140+
token_type = str.lower(args.token_type)
120141

121142
argv = argv[3:]
122-
123-
if subcommand.startswith("w"):
143+
if token_type.startswith("w"):
124144
WordTokenizationApp("word", argv)
125-
elif subcommand.startswith("sy"):
145+
elif token_type.startswith("sy"):
126146
SyllableTokenizationApp("syllable", argv)
127-
elif subcommand.startswith("su"):
147+
elif token_type.startswith("su"):
128148
SubwordTokenizationApp("subword", argv)
129-
elif subcommand.startswith("se"):
149+
elif token_type.startswith("se"):
130150
SentenceTokenizationApp("sent", argv)
131151
else:
132-
print(f"Subcommand not available: {subcommand}")
152+
print(f"Token type not available: {token_type}")

pythainlp/corpus/corpus_license.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# Corpus License
22

3+
For more information about corpora that PyThaiNLP uses,
4+
see [https://github.com/PyThaiNLP/pythainlp-corpus/](https://github.com/PyThaiNLP/pythainlp-corpus/).
5+
36
## tha-wn.db
47

58
```

tests/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
Unit test.
44
55
Each file in tests/ is for each main package.
6-
#TODO Test for CLI
76
"""
87
import sys
98
import unittest

tests/test_cli.py

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
# -*- coding: utf-8 -*-
2+
3+
import unittest
4+
from argparse import ArgumentError
5+
from types import ModuleType
6+
7+
from pythainlp import __main__, cli
8+
9+
10+
class TestMainPackage(unittest.TestCase):
11+
def test_cli_main(self):
12+
# call with no argument, should exit with 2
13+
with self.assertRaises(SystemExit) as ex:
14+
__main__.main()
15+
self.assertEqual(ex.exception.code, 2)
16+
17+
with self.assertRaises((ArgumentError, SystemExit)):
18+
self.assertIsNone(__main__.main(["thainlp"]))
19+
20+
with self.assertRaises((ArgumentError, SystemExit)):
21+
self.assertIsNone(
22+
__main__.main(["thainlp", "NOT_EXIST", "command"])
23+
)
24+
25+
self.assertIsNone(__main__.main(["thainlp", "data", "path"]))
26+
27+
def test_cli_data(self):
28+
self.assertIsInstance(getattr(cli, "data"), ModuleType)
29+
30+
with self.assertRaises(SystemExit) as ex:
31+
cli.data.App(["thainlp", "data"])
32+
self.assertEqual(ex.exception.code, 2)
33+
34+
self.assertIsNotNone(cli.data.App(["thainlp", "data", "catalog"]))
35+
self.assertIsNotNone(cli.data.App(["thainlp", "data", "path"]))
36+
self.assertIsNotNone(cli.data.App(["thainlp", "data", "get", "test"]))
37+
self.assertIsNotNone(cli.data.App(["thainlp", "data", "info", "test"]))
38+
self.assertIsNotNone(cli.data.App(["thainlp", "data", "rm", "test"]))
39+
self.assertIsNotNone(
40+
cli.data.App(["thainlp", "data", "get", "NOT_EXIST"])
41+
)
42+
self.assertIsNotNone(
43+
cli.data.App(["thainlp", "data", "info", "NOT_EXIST"])
44+
)
45+
self.assertIsNotNone(
46+
cli.data.App(["thainlp", "data", "rm", "NOT_EXIST"])
47+
)
48+
49+
def test_cli_soundex(self):
50+
self.assertIsInstance(getattr(cli, "soundex"), ModuleType)
51+
52+
with self.assertRaises(SystemExit) as ex:
53+
cli.data.App(["thainlp", "soundex"])
54+
self.assertEqual(ex.exception.code, 2)
55+
56+
self.assertIsNotNone(cli.soundex.App(["thainlp", "soundex", "ทดสอบ"]))
57+
58+
def test_cli_tag(self):
59+
self.assertIsInstance(getattr(cli, "tag"), ModuleType)
60+
61+
with self.assertRaises(SystemExit) as ex:
62+
cli.data.App(["thainlp", "tag"])
63+
self.assertEqual(ex.exception.code, 2)
64+
65+
self.assertIsNotNone(
66+
cli.tag.App(
67+
[
68+
"thainlp",
69+
"tag",
70+
"pos",
71+
"-s",
72+
" ",
73+
"มอเตอร์ไซค์ ความว่างเปล่า",
74+
]
75+
)
76+
)
77+
self.assertIsNotNone(
78+
cli.tag.App(
79+
[
80+
"thainlp",
81+
"tag",
82+
"role",
83+
"-s",
84+
" ",
85+
"มอเตอร์ไซค์ ความว่างเปล่า",
86+
]
87+
)
88+
)
89+
90+
def test_cli_tokenize(self):
91+
self.assertIsInstance(getattr(cli, "tokenize"), ModuleType)
92+
93+
with self.assertRaises(SystemExit) as ex:
94+
cli.data.App(["thainlp", "tokenize"])
95+
self.assertEqual(ex.exception.code, 2)
96+
97+
self.assertIsNotNone(
98+
cli.tokenize.App(
99+
["thainlp", "tokenize", "NOT_EXIST", "ไม่มีอยู่ จริง"]
100+
)
101+
)
102+
self.assertIsNotNone(
103+
cli.tokenize.App(
104+
[
105+
"thainlp",
106+
"tokenize",
107+
"subword",
108+
"-s",
109+
"|",
110+
"ถ้าฉันยิงกระต่ายได้ ฉันก็ยิงฟาสซิสต์ได้",
111+
]
112+
)
113+
)
114+
self.assertIsNotNone(
115+
cli.tokenize.App(
116+
[
117+
"thainlp",
118+
"tokenize",
119+
"syllable",
120+
"-s",
121+
"|",
122+
"-w",
123+
"ถ้าฉันยิงกระต่ายได้ ฉันก็ยิงฟาสซิสต์ได้",
124+
]
125+
)
126+
)
127+
self.assertIsNotNone(
128+
cli.tokenize.App(
129+
[
130+
"thainlp",
131+
"tokenize",
132+
"word",
133+
"-nw",
134+
"-a",
135+
"newmm",
136+
"-s",
137+
"|",
138+
"ถ้าฉันยิงกระต่ายได้ ฉันก็ยิงฟาสซิสต์ได้",
139+
]
140+
)
141+
)
142+
self.assertIsNotNone(
143+
cli.tokenize.App(
144+
[
145+
"thainlp",
146+
"tokenize",
147+
"sent",
148+
"-s",
149+
"|",
150+
(
151+
"ถ้าฉันยิงกระต่ายได้ ฉันก็ยิงฟาสซิสต์ได้"
152+
"กระสุนสำหรับสมองของคุณวันนี้"
153+
"แต่คุณก็จะลืมมันไปทั้งหมดอีกครั้ง"
154+
),
155+
]
156+
)
157+
)

0 commit comments

Comments
 (0)