|
5 | 5 | Credit: Korakot Chaovavanich |
6 | 6 | https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1 |
7 | 7 | """ |
8 | | -import re |
9 | 8 | from typing import List, Tuple |
10 | 9 |
|
11 | | -import requests |
12 | 10 | from pythainlp.corpus import get_corpus |
13 | 11 |
|
14 | | -__all__ = ["word_freq", "word_freqs"] |
| 12 | +__all__ = ["word_freqs"] |
15 | 13 |
|
16 | 14 | _FILENAME = "tnc_freq.txt" |
17 | 15 |
|
18 | | - |
def word_freq(word: str, domain: str = "all", *, timeout: float = 10.0) -> int:
    """
    Get the frequency of a word in the Thai National Corpus (TNC), by domain.

    .. note::
        **Not officially supported.**
        This function sends a query to the web server of the
        Thai National Corpus, so an Internet connection is required.

    .. warning::
        As of 29 April 2019 this function was reported as likely to
        return 0 regardless of the word, because the service had moved to
        http://www.arts.chula.ac.th/~ling/tnc3/ and the code had not been
        updated. NOTE(review): the request below already targets that URL,
        but the response parsing may still not match the page -- confirm.

    :param str word: word to look up
    :param str domain: name of the corpus domain to restrict the query to;
        one of ``"all"``, ``"imaginative"``, ``"natural-pure-science"``,
        ``"applied-science"``, ``"social-science"``,
        ``"world-affairs-history"``, ``"commerce-finance"``, ``"arts"``,
        ``"belief-thought"``, ``"leisure"``, ``"others"``
    :param float timeout: seconds to wait for the server before giving up
    :return: the total reported by the server, or 0 if no total could be
        found in the response
    :rtype: int
    :raises KeyError: if ``domain`` is not one of the names listed above
    :raises ValueError: if the total found in the response is not an integer
    :raises requests.exceptions.RequestException: if the HTTP request fails
        or does not complete within ``timeout``
    """
    # Domain name -> code submitted in the "domain[]" form field.
    domain_codes = {
        "all": "",
        "imaginative": "1",
        "natural-pure-science": "2",
        "applied-science": "3",
        "social-science": "4",
        "world-affairs-history": "5",
        "commerce-finance": "6",
        "arts": "7",
        "belief-thought": "8",
        "leisure": "9",
        "others": "0",
    }

    # Resolve the domain before any network I/O so that an unknown domain
    # fails immediately instead of after a round trip.
    payload = {
        "genre[]": "",
        "domain[]": domain_codes[domain],
        "sortby": "perc",
        "p": word,
    }

    # Without a timeout an unresponsive server would block the caller forever.
    response = requests.post(
        "http://www.arts.chula.ac.th/~ling/tnc3/",
        data=payload,
        timeout=timeout,
    )

    # The count is scraped from the HTML: the first white-coloured <font>
    # element that follows the "TOTAL" label. DOTALL lets ".*?" span lines.
    found = re.search(
        r'TOTAL</font>.*?#ffffff">(.*?)</font>',
        response.text,
        flags=re.DOTALL,
    )
    if found is None:
        return 0

    # Drop thousands separators in case the page renders large counts as
    # "1,234" (unconfirmed -- the page format is not visible from here).
    return int(found.group(1).strip().replace(",", ""))
64 | | - |
65 | | - |
66 | 16 | def word_freqs() -> List[Tuple[str, int]]: |
67 | 17 | """ |
68 | 18 | Get word frequency from Thai National Corpus (TNC) |
|
0 commit comments