Skip to content

Commit 6c75b10

Browse files
authored
Merge pull request #170 from PyThaiNLP/1.7
PyThaiNLP 1.7.2
2 parents 5c1020e + 7a60a2d commit 6c75b10

File tree

11 files changed: +788 -11 lines changed
59 KB
Binary file not shown.
57.7 KB
Binary file not shown.

docs/_build/html/_modules/index.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,7 @@ <h1>All modules for which code is available</h1>
171171
<li><a href="pythainlp/summarize.html">pythainlp.summarize</a></li>
172172
<li><a href="pythainlp/tag.html">pythainlp.tag</a></li>
173173
<li><a href="pythainlp/tokenize.html">pythainlp.tokenize</a></li>
174+
<li><a href="pythainlp/ulmfit/utils.html">pythainlp.ulmfit.utils</a></li>
174175
<li><a href="pythainlp/word_vector/thai2vec.html">pythainlp.word_vector.thai2vec</a></li>
175176
</ul>
176177

docs/_build/html/_modules/pythainlp/ulmfit/utils.html

Lines changed: 500 additions & 0 deletions
Large diffs are not rendered by default.

docs/_build/html/api/ulmfit.html

Lines changed: 243 additions & 0 deletions
Large diffs are not rendered by default.

docs/_build/html/genindex.html

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -183,8 +183,12 @@ <h1 id="index">Index</h1>
183183
<h2 id="A">A</h2>
184184
<table style="width: 100%" class="indextable genindextable"><tr>
185185
<td style="width: 33%; vertical-align: top;"><ul>
186-
<li><a href="api/word_vector.html#pythainlp.word_vector.thai2vec.about">about() (in module pythainlp.word_vector.thai2vec)</a>
186+
<li><a href="api/ulmfit.html#pythainlp.ulmfit.utils.about">about() (in module pythainlp.ulmfit.utils)</a>
187+
188+
<ul>
189+
<li><a href="api/word_vector.html#pythainlp.word_vector.thai2vec.about">(in module pythainlp.word_vector.thai2vec)</a>
187190
</li>
191+
</ul></li>
188192
</ul></td>
189193
</tr></table>
190194

@@ -207,6 +211,8 @@ <h2 id="D">D</h2>
207211
</li>
208212
</ul></td>
209213
<td style="width: 33%; vertical-align: top;"><ul>
214+
<li><a href="api/ulmfit.html#pythainlp.ulmfit.utils.document_vector">document_vector() (in module pythainlp.ulmfit.utils)</a>
215+
</li>
210216
<li><a href="api/word_vector.html#pythainlp.word_vector.thai2vec.doesnt_match">doesnt_match() (in module pythainlp.word_vector.thai2vec)</a>
211217
</li>
212218
</ul></td>
@@ -215,11 +221,15 @@ <h2 id="D">D</h2>
215221
<h2 id="G">G</h2>
216222
<table style="width: 100%" class="indextable genindextable"><tr>
217223
<td style="width: 33%; vertical-align: top;"><ul>
224+
<li><a href="api/ulmfit.html#pythainlp.ulmfit.utils.get_all">get_all() (in module pythainlp.ulmfit.utils)</a>
225+
</li>
218226
<li><a href="api/word_vector.html#pythainlp.word_vector.thai2vec.get_model">get_model() (in module pythainlp.word_vector.thai2vec)</a>
219227
</li>
220228
</ul></td>
221229
<td style="width: 33%; vertical-align: top;"><ul>
222230
<li><a href="api/ner.html#pythainlp.ner.thainer.get_ner">get_ner() (pythainlp.ner.thainer method)</a>
231+
</li>
232+
<li><a href="api/ulmfit.html#pythainlp.ulmfit.utils.get_texts">get_texts() (in module pythainlp.ulmfit.utils)</a>
223233
</li>
224234
</ul></td>
225235
</tr></table>
@@ -242,6 +252,10 @@ <h2 id="L">L</h2>
242252

243253
<h2 id="M">M</h2>
244254
<table style="width: 100%" class="indextable genindextable"><tr>
255+
<td style="width: 33%; vertical-align: top;"><ul>
256+
<li><a href="api/ulmfit.html#pythainlp.ulmfit.utils.merge_wgts">merge_wgts() (in module pythainlp.ulmfit.utils)</a>
257+
</li>
258+
</ul></td>
245259
<td style="width: 33%; vertical-align: top;"><ul>
246260
<li><a href="api/word_vector.html#pythainlp.word_vector.thai2vec.most_similar_cosmul">most_similar_cosmul() (in module pythainlp.word_vector.thai2vec)</a>
247261
</li>
@@ -258,6 +272,8 @@ <h2 id="N">N</h2>
258272
</ul></td>
259273
<td style="width: 33%; vertical-align: top;"><ul>
260274
<li><a href="api/number.html#pythainlp.number.num_to_thai_num">num_to_thai_num() (in module pythainlp.number)</a>
275+
</li>
276+
<li><a href="api/ulmfit.html#pythainlp.ulmfit.utils.numericalizer">numericalizer() (in module pythainlp.ulmfit.utils)</a>
261277
</li>
262278
<li><a href="api/number.html#pythainlp.number.numtowords">numtowords() (in module pythainlp.number)</a>
263279
</li>
@@ -268,12 +284,24 @@ <h2 id="P">P</h2>
268284
<table style="width: 100%" class="indextable genindextable"><tr>
269285
<td style="width: 33%; vertical-align: top;"><ul>
270286
<li><a href="api/tag.html#pythainlp.tag.pos_tag">pos_tag() (in module pythainlp.tag)</a>
287+
</li>
288+
<li><a href="api/ulmfit.html#pythainlp.ulmfit.utils.ThaiTokenizer.proc_all">proc_all() (pythainlp.ulmfit.utils.ThaiTokenizer static method)</a>
289+
</li>
290+
</ul></td>
291+
<td style="width: 33%; vertical-align: top;"><ul>
292+
<li><a href="api/ulmfit.html#pythainlp.ulmfit.utils.ThaiTokenizer.proc_all_mp">proc_all_mp() (pythainlp.ulmfit.utils.ThaiTokenizer static method)</a>
293+
</li>
294+
<li><a href="api/ulmfit.html#pythainlp.ulmfit.utils.ThaiTokenizer.proc_text">proc_text() (pythainlp.ulmfit.utils.ThaiTokenizer method)</a>
271295
</li>
272296
</ul></td>
273297
</tr></table>
274298

275299
<h2 id="R">R</h2>
276300
<table style="width: 100%" class="indextable genindextable"><tr>
301+
<td style="width: 33%; vertical-align: top;"><ul>
302+
<li><a href="api/ulmfit.html#pythainlp.ulmfit.utils.ThaiTokenizer.replace_rep">replace_rep() (pythainlp.ulmfit.utils.ThaiTokenizer static method)</a>
303+
</li>
304+
</ul></td>
277305
<td style="width: 33%; vertical-align: top;"><ul>
278306
<li><a href="api/romanization.html#pythainlp.romanization.romanization">romanization() (in module pythainlp.romanization)</a>
279307

@@ -293,11 +321,13 @@ <h2 id="S">S</h2>
293321
</li>
294322
<li><a href="api/sentiment.html#pythainlp.sentiment.sentiment">sentiment() (in module pythainlp.sentiment)</a>
295323
</li>
296-
</ul></td>
297-
<td style="width: 33%; vertical-align: top;"><ul>
298324
<li><a href="api/word_vector.html#pythainlp.word_vector.thai2vec.similarity">similarity() (in module pythainlp.word_vector.thai2vec)</a>
299325
</li>
326+
</ul></td>
327+
<td style="width: 33%; vertical-align: top;"><ul>
300328
<li><a href="api/spell.html#pythainlp.spell.spell">spell() (in module pythainlp.spell)</a>
329+
</li>
330+
<li><a href="api/ulmfit.html#pythainlp.ulmfit.utils.ThaiTokenizer.sub_br">sub_br() (pythainlp.ulmfit.utils.ThaiTokenizer method)</a>
301331
</li>
302332
<li><a href="api/tokenizer.html#pythainlp.tokenize.subword_tokenize">subword_tokenize() (in module pythainlp.tokenize)</a>
303333
</li>
@@ -315,15 +345,19 @@ <h2 id="T">T</h2>
315345
</li>
316346
<li><a href="api/change.html#pythainlp.change.texttothai">texttothai() (in module pythainlp.change)</a>
317347
</li>
318-
</ul></td>
319-
<td style="width: 33%; vertical-align: top;"><ul>
320348
<li><a href="api/romanization.html#pythainlp.romanization.thai2rom.thai2rom">thai2rom (class in pythainlp.romanization.thai2rom)</a>
321349
</li>
350+
</ul></td>
351+
<td style="width: 33%; vertical-align: top;"><ul>
322352
<li><a href="api/number.html#pythainlp.number.thai_num_to_num">thai_num_to_num() (in module pythainlp.number)</a>
323353
</li>
324354
<li><a href="api/number.html#pythainlp.number.thai_num_to_text">thai_num_to_text() (in module pythainlp.number)</a>
325355
</li>
326356
<li><a href="api/ner.html#pythainlp.ner.thainer">thainer (class in pythainlp.ner)</a>
357+
</li>
358+
<li><a href="api/ulmfit.html#pythainlp.ulmfit.utils.ThaiTokenizer">ThaiTokenizer (class in pythainlp.ulmfit.utils)</a>
359+
</li>
360+
<li><a href="api/ulmfit.html#pythainlp.ulmfit.utils.ThaiTokenizer.tokenize">tokenize() (pythainlp.ulmfit.utils.ThaiTokenizer method)</a>
327361
</li>
328362
</ul></td>
329363
</tr></table>

docs/_build/html/objects.inv

115 Bytes
Binary file not shown.

docs/_build/html/searchindex.js

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pythainlp/corpus/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from tqdm import tqdm
1212

1313
CORPUS_DB_URL = (
14-
"https://raw.githubusercontent.com/PyThaiNLP/pythainlp-corpus/master/db.json"
14+
"https://github.com/PyThaiNLP/pythainlp-corpus/raw/1.7/db.json"
1515
)
1616

1717
# __all__ = ["thaipos", "thaiword","alphabet","tone","country","wordnet"]

pythainlp/tokenize/__init__.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import codecs
55
import re
66

7-
import nltk
87
from pythainlp.corpus.thaisyllable import get_data as syllable_dict
98
from pythainlp.corpus.thaiword import get_data as word_dict
109
from six.moves import zip
@@ -104,9 +103,9 @@ def sent_tokenize(text, engine="whitespace+newline"):
104103
:return: a list of text, split by whitespace or new line.
105104
"""
106105
if engine == "whitespace":
107-
sentences = nltk.tokenize.WhitespaceTokenizer().tokenize(text)
106+
sentences = re.split(r' +', text, re.U)
108107
else:
109-
sentences = re.sub(r"\n+|\s+", "|", text, re.U).split("|")
108+
sentences = text.split()
110109

111110
return sentences
112111

0 commit comments

Comments
 (0)