From cc283e9b053725c0c791170797d55e42d812c091 Mon Sep 17 00:00:00 2001 From: Benjamin Phillips Date: Sat, 6 Jun 2015 14:40:22 -0400 Subject: [PATCH 1/9] Updated requirements.txt to include requests, beautifulsoup, and ipython notebook --- requirements.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/requirements.txt b/requirements.txt index 473a3b2..1cf8a0e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,7 @@ scipy pandas numpy matplotlib +ipython[notebook] +pytest +requests +beautifulsoup4 From c05dc28716e53b4b44089092c476b8322503d9ed Mon Sep 17 00:00:00 2001 From: Benjamin Phillips Date: Sat, 6 Jun 2015 14:42:52 -0400 Subject: [PATCH 2/9] Added .idea from PyCharm to .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index f00dbf2..1f808f3 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,9 @@ # Temporary data .ipynb_checkpoints/ +### PyCharm ### +.idea + # Created by https://www.gitignore.io ### Python ### From d5b1b6d668649d8b0e013a74f3997f6e74864200 Mon Sep 17 00:00:00 2001 From: Benjamin Phillips Date: Sat, 6 Jun 2015 14:43:43 -0400 Subject: [PATCH 3/9] moved test.csv --- test.csv | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 test.csv diff --git a/test.csv b/test.csv deleted file mode 100644 index 7d007aa..0000000 --- a/test.csv +++ /dev/null @@ -1,32 +0,0 @@ -1,clojure -2,clojure -3,clojure -4,clojure -5,python -6,python -7,python -8,python -9,javascript -10,javascript -11,javascript -12,javascript -13,ruby -14,ruby -15,ruby -16,haskell -17,haskell -18,haskell -19,scheme -20,scheme -21,scheme -22,java -23,java -24,scala -25,scala -26,tcl -27,tcl -28,php -29,php -30,php -31,ocaml -32,ocaml From 5bf564a22f94d82b6fce43cf9555a69da8e203b3 Mon Sep 17 00:00:00 2001 From: Benjamin Phillips Date: Sat, 6 Jun 2015 14:44:52 -0400 Subject: [PATCH 4/9] Created programming language classifier module, with get_data.py, plc.py, crawler_scarper.py, and tests --- programming_language_classifier/__init__.py | 0 .../crawler_scraper.py | 34 +++++++++++ programming_language_classifier/get_data.py | 43 ++++++++++++++ programming_language_classifier/plc.py | 56 +++++++++++++++++++ .../tests/__init__.py | 0 .../tests/function_testfiles/test1.gcc | 1 + .../tests/function_testfiles/test2.js | 1 + .../tests/function_testfiles/test3.yarv | 1 + .../tests/function_testfiles/test4.python3 | 1 + .../tests/test.csv | 32 +++++++++++ .../tests/test_get_data.py | 32 +++++++++++ .../tests/test_plc.py | 56 +++++++++++++++++++ 12 files changed, 257 insertions(+) create mode 100644 programming_language_classifier/__init__.py create mode 100644 programming_language_classifier/crawler_scraper.py create mode 100644 programming_language_classifier/get_data.py create mode 100644 programming_language_classifier/plc.py create mode 100644 programming_language_classifier/tests/__init__.py create mode 100644 programming_language_classifier/tests/function_testfiles/test1.gcc create mode 100644 programming_language_classifier/tests/function_testfiles/test2.js create mode 100644 programming_language_classifier/tests/function_testfiles/test3.yarv create mode 100644 programming_language_classifier/tests/function_testfiles/test4.python3 create mode 100644 programming_language_classifier/tests/test.csv create mode 100644 programming_language_classifier/tests/test_get_data.py create mode 100644 programming_language_classifier/tests/test_plc.py diff --git a/programming_language_classifier/__init__.py b/programming_language_classifier/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/programming_language_classifier/crawler_scraper.py b/programming_language_classifier/crawler_scraper.py new file mode 100644 index 0000000..ae06e66 --- /dev/null +++ b/programming_language_classifier/crawler_scraper.py @@ -0,0 +1,34 @@ +import bs4 +import requests +import sys +import re + + +languages = {'tcl': '.tcl'} + +def rosetta_scraper(seed, path): + response = requests.get(seed) + soup = bs4.BeautifulSoup(response.text) + divs = soup.select("div") + for div in divs: + if div.attrs.get("id") and div.attrs.get("id") == "mw-pages": + all_a = div.select('a') + links = ["http://rosettacode.org" + a.attrs.get("href") + for a in all_a + if a.attrs.get("href") and "wiki" in a.attrs.get("href")] + count = 1 + for link in links[:50]: + response = requests.get(link) + soup = bs4.BeautifulSoup(response.text) + code = soup.select('pre') + for block in code: + for key in languages: + if block.attrs.get('class') is not None and key in block.attrs.get('class'): + soup = bs4.BeautifulSoup(re.sub(r'
', "\n", str(block))) + with open(path + str(count) + languages[key], "w+") as file: + file.write(soup.text) + count += 1 + + +if __name__ == '__main__': + rosetta_scraper(sys.argv[1], sys.argv[2]) diff --git a/programming_language_classifier/get_data.py b/programming_language_classifier/get_data.py new file mode 100644 index 0000000..27a32cc --- /dev/null +++ b/programming_language_classifier/get_data.py @@ -0,0 +1,43 @@ +import os +import sys +import pandas as pd + +extensions = {".gcc": "C", + ".c": "C", + ".csharp": "C#", + ".sbcl": "Common Lisp", + ".clojure": "Clojure", + ".ghc": "Haskell", + ".java": "Java", + ".javascript": "JavaScript", + ".js": "JavaScript", + ".ocaml": "OCaml", + ".perl": "Perl", + ".hack": "PHP", + ".php": "PHP", + ".py": "Python", + ".python3": "Python", + ".jruby": "Ruby", + ".yarv": "Ruby", + ".scala": "Scala", + ".racket": "Scheme", + ".tcl": "TCL"} + +def get_content(directory): + content = [] + for file in os.listdir(directory): + extension = os.path.splitext(file)[1] + if extension in extensions: + with open(directory + file) as fh: + + content.append([extensions[extension], fh.read()]) + return content + + +def make_dataframe(content_list): + return pd.DataFrame(content_list) + + +if __name__ == '__main__': + content_list = get_content(sys.argv[1]) + print(make_dataframe(content_list)) \ No newline at end of file diff --git a/programming_language_classifier/plc.py b/programming_language_classifier/plc.py new file mode 100644 index 0000000..a816eee --- /dev/null +++ b/programming_language_classifier/plc.py @@ -0,0 +1,56 @@ +import re +import itertools + + +def percent_elements(text): + elements = ")}];:.,\/-_#*!$%|<>& " + results = [] + for element in elements: + total = max(1, len(text)) + results.append(text.count(element)/total) + return results + + +def number_elements(text): + elements = [r'\bbegin\b', r'\bend\b', r'\bdo\b', r'\bvar\b', r'\bdefine\b', r'\bdefn\b', r'\bfunction\b', + r'\bclass\b', r'\bmy\b', r'\brequire\b', r'\bvoid\b', r'\bval\b', r'\bpublic\b', r'\blet\b', + r'\bwhere\b', r'\busing\b', r'\bextend\b', r'\bfunction\b'] + results = [] + for element in elements: + results.append(len(re.findall(element, text))) + return results + + +def longest_run(text): + elements = [r'[)]+',r'[}]+', r'[\]]+', r'[=]+'] + results = [] + for element in elements: + runs = sorted(re.findall(element, text), key=len) + if runs: + results.append(len(runs[-1])) + else: + results.append(0) + return results + + +def line_enders(text): + elements = [r'[)]$', r';$', r'}$', r']$', r'\):$'] + results = [] + for element in elements: + results.append(len(re.findall(element, text, re.MULTILINE))) + return results + + +class Featurizer: + def __init__(self, *feature_makers): + self.feature_makers = feature_makers + + def fit(self, X, y): + return self + + def transform(self, X): + feature_vectors = [] + for item in X: + vector = list(itertools.chain.from_iterable([function(item) for function in self.feature_makers])) + feature_vectors.append(vector) + return feature_vectors diff --git a/programming_language_classifier/tests/__init__.py b/programming_language_classifier/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/programming_language_classifier/tests/function_testfiles/test1.gcc b/programming_language_classifier/tests/function_testfiles/test1.gcc new file mode 100644 index 0000000..233261a --- /dev/null +++ b/programming_language_classifier/tests/function_testfiles/test1.gcc @@ -0,0 +1 @@ +This is a C file diff --git a/programming_language_classifier/tests/function_testfiles/test2.js b/programming_language_classifier/tests/function_testfiles/test2.js new file mode 100644 index 0000000..773f533 --- /dev/null +++ b/programming_language_classifier/tests/function_testfiles/test2.js @@ -0,0 +1 @@ +This is a javascript file diff --git a/programming_language_classifier/tests/function_testfiles/test3.yarv b/programming_language_classifier/tests/function_testfiles/test3.yarv new file mode 100644 index 0000000..6d3b574 --- /dev/null +++ b/programming_language_classifier/tests/function_testfiles/test3.yarv @@ -0,0 +1 @@ +This is a Ruby file diff --git a/programming_language_classifier/tests/function_testfiles/test4.python3 b/programming_language_classifier/tests/function_testfiles/test4.python3 new file mode 100644 index 0000000..0d5c729 --- /dev/null +++ b/programming_language_classifier/tests/function_testfiles/test4.python3 @@ -0,0 +1 @@ +This is a Python file diff --git a/programming_language_classifier/tests/test.csv b/programming_language_classifier/tests/test.csv new file mode 100644 index 0000000..7d007aa --- /dev/null +++ b/programming_language_classifier/tests/test.csv @@ -0,0 +1,32 @@ +1,clojure +2,clojure +3,clojure +4,clojure +5,python +6,python +7,python +8,python +9,javascript +10,javascript +11,javascript +12,javascript +13,ruby +14,ruby +15,ruby +16,haskell +17,haskell +18,haskell +19,scheme +20,scheme +21,scheme +22,java +23,java +24,scala +25,scala +26,tcl +27,tcl +28,php +29,php +30,php +31,ocaml +32,ocaml diff --git a/programming_language_classifier/tests/test_get_data.py b/programming_language_classifier/tests/test_get_data.py new file mode 100644 index 0000000..d3b0e5d --- /dev/null +++ b/programming_language_classifier/tests/test_get_data.py @@ -0,0 +1,32 @@ +from programming_language_classifier import get_data as gd + + +def test_get_content(): + assert gd.get_content("tests/function_testfiles/") == [["C", "This is a C file\n"], + ["JavaScript", "This is a javascript file\n"], + ["Ruby", "This is a Ruby file\n"], + ["Python", "This is a Python file\n"]] + + +def test_make_dataframe(): + test_list = gd.get_content("tests/function_testfiles/") + assert gd.make_dataframe(test_list)[0][0] == "C" + assert gd.make_dataframe(test_list)[1][0] == "This is a C file\n" + assert gd.make_dataframe(test_list)[1][2] == "This is a Ruby file\n" + + +'javascript': '.js', + 'haskell': '.haskell', + 'scala': '.scala', + 'ocaml': '.ocaml', + 'ruby': '.jruby', + 'php': '.php', + 'clojure': '.clojure', + 'perl': '.perl', + 'csharp': '.csharp', + 'java': '.java', + 'c': '.gcc', + 'scheme': '.racket', + 'python': '.py', + 'lisp': '.sbcl', + diff --git a/programming_language_classifier/tests/test_plc.py b/programming_language_classifier/tests/test_plc.py new file mode 100644 index 0000000..62ae668 --- /dev/null +++ b/programming_language_classifier/tests/test_plc.py @@ -0,0 +1,56 @@ +from programming_language_classifier import plc + + +def test_percent_elements(): + """element order: ) } ] ; : . , \ / - _ # * ! $ % | """ + a_string = "..oooooOO}" + assert plc.percent_elements(a_string) == [0, 0.1, 0, 0, 0, .2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + a_string = "]]]*%!,,:M" + assert plc.percent_elements(a_string) == [0, 0, 0.3, 0, 0.1, 0, 0.2, 0, 0, 0, 0, 0, 0.1, 0.1, 0, 0.1, 0, 0, 0, 0, 0] + a_string = "" + assert plc.percent_elements(a_string) == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + + +def test_number_elements(): + """element order: begin end do""" + a_string = "begin: words!!! end begin itbeginq" + assert plc.number_elements(a_string) == [2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + a_string = "dobeginend do do end, Mend :begin:" + assert plc.number_elements(a_string) == [1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + a_string = "" + assert plc.number_elements(a_string) == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + + +def test_longest_run(): + """element order: ) } ] =""" + a_string = ")))))[]]]}]]]]}}=" + assert plc.longest_run(a_string) == [5, 2, 4, 1] + a_string = "Adn;ksenfas]]]]]((()====" + assert plc.longest_run(a_string) == [1, 0, 5, 4] + + +def test_line_enders(): + a_string = "....)\n ....;\n....;\n" + assert plc.line_enders(a_string) == [1, 2, 0, 0, 0] + + +def test_featurizer_transform(): + tf = plc.Featurizer(plc.percent_elements, plc.number_elements, plc.longest_run) + test_list = ["begin }}} . end", "do end %%__=====", ""] + array = tf.transform(test_list) + test_array = [[0, 0.2, 0, 0, 0, 0.06666667, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, + 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.125, 0, 0, 0, 0, 0.125, 0, 0, 0, 0, 0.125, + 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 5], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0]] + def rounder(array): + for collection in array: + for index in range(len(collection)): + collection[index] = round(collection[index], 3) + return array + + assert rounder(array) == rounder(test_array) \ No newline at end of file From c6d55580cd8438ad80f84b639e872e502a76e626 Mon Sep 17 00:00:00 2001 From: Benjamin Phillips Date: Sat, 6 Jun 2015 14:48:33 -0400 Subject: [PATCH 5/9] Made crawler_scraper more general use and removed some stray text from test_get_data --- .../crawler_scraper.py | 18 ++++++++++++++++-- .../tests/test_get_data.py | 15 +-------------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/programming_language_classifier/crawler_scraper.py b/programming_language_classifier/crawler_scraper.py index ae06e66..67535d4 100644 --- a/programming_language_classifier/crawler_scraper.py +++ b/programming_language_classifier/crawler_scraper.py @@ -4,7 +4,21 @@ import re -languages = {'tcl': '.tcl'} +languages = {'javascript': '.js', + 'haskell': '.haskell', + 'scala': '.scala', + 'ocaml': '.ocaml', + 'ruby': '.jruby', + 'php': '.php', + 'clojure': '.clojure', + 'perl': '.perl', + 'csharp': '.csharp', + 'java': '.java', + 'c': '.gcc', + 'scheme': '.racket', + 'python': '.py', + 'lisp': '.sbcl', + 'tcl': '.tcl'} def rosetta_scraper(seed, path): response = requests.get(seed) @@ -17,7 +31,7 @@ def rosetta_scraper(seed, path): for a in all_a if a.attrs.get("href") and "wiki" in a.attrs.get("href")] count = 1 - for link in links[:50]: + for link in links: response = requests.get(link) soup = bs4.BeautifulSoup(response.text) code = soup.select('pre') diff --git a/programming_language_classifier/tests/test_get_data.py b/programming_language_classifier/tests/test_get_data.py index d3b0e5d..cf77a25 100644 --- a/programming_language_classifier/tests/test_get_data.py +++ b/programming_language_classifier/tests/test_get_data.py @@ -15,18 +15,5 @@ def test_make_dataframe(): assert gd.make_dataframe(test_list)[1][2] == "This is a Ruby file\n" -'javascript': '.js', - 'haskell': '.haskell', - 'scala': '.scala', - 'ocaml': '.ocaml', - 'ruby': '.jruby', - 'php': '.php', - 'clojure': '.clojure', - 'perl': '.perl', - 'csharp': '.csharp', - 'java': '.java', - 'c': '.gcc', - 'scheme': '.racket', - 'python': '.py', - 'lisp': '.sbcl', + From 9386217686abd035a7078f304f3929efd1df0974 Mon Sep 17 00:00:00 2001 From: Benjamin Phillips Date: Sat, 6 Jun 2015 16:50:10 -0400 Subject: [PATCH 6/9] Fixed some formatting --- programming_language_classifier/get_data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/programming_language_classifier/get_data.py b/programming_language_classifier/get_data.py index 27a32cc..0bbac97 100644 --- a/programming_language_classifier/get_data.py +++ b/programming_language_classifier/get_data.py @@ -29,7 +29,6 @@ def get_content(directory): extension = os.path.splitext(file)[1] if extension in extensions: with open(directory + file) as fh: - content.append([extensions[extension], fh.read()]) return content From 871d015c02fc0d6dbca908a50a1686b62a40ef3f Mon Sep 17 00:00:00 2001 From: Benjamin Phillips Date: Sat, 6 Jun 2015 16:52:34 -0400 Subject: [PATCH 7/9] Split the training and prediction duties of plc.py into two files and added the pickled bayesian classifier file that plc_predict.py reads --- programming_language_classifier/classifier | Bin 0 -> 13258 bytes .../plc_predict.py | 23 ++++++++++++++++++ .../{plc.py => plc_trainer.py} | 15 ++++++++++++ .../{test_plc.py => test_plc_trainer.py} | 2 +- 4 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 programming_language_classifier/classifier create mode 100644 programming_language_classifier/plc_predict.py rename programming_language_classifier/{plc.py => plc_trainer.py} (74%) rename programming_language_classifier/tests/{test_plc.py => test_plc_trainer.py} (97%) diff --git a/programming_language_classifier/classifier b/programming_language_classifier/classifier new file mode 100644 index 0000000000000000000000000000000000000000..988a290a5a1ca8bb986c5f5734f96db19662fdbb GIT binary patch literal 13258 zcma)j30zIv`~J~9(P=sjXWEBKDk*cQRZ&V*l7uuLDov+}LPaE*5@`_41Byx-4EMZ} z2uTW=8c1f!5HkID&fe$V+x_0({jbkw_3U?8&sux!cMW@WcAThp*xCTDXK0W?h+had zz%Pg^<3L{UC=)jGMs4DW=(xyID3l=2K(3E_SU5K%j3>H@C#K^fj%&R!`yk&-a!$8AyEe2 z!J%A(z~LgEp`o5pGVY;V@8F=Y@X!eFa2cMAz)L2`M_}d2+ETXhq0GfC|G#P2cijJ{^RRCKo`do1kLL(HhvJ!w=aFgHci1nG$ivq` zc*gL&5YK*i4#RT@o~`lhNyuw)9OfIYXE={c3vk7-ZzO>u?SGX=`YyqZ-gq8v^Kkvc zb%*15IN9TQxXr`d!)^Xwb>r&a$pQa6+~#5YuQcp4d_Ti={$HHKJj1?Gcph$(FP_Oh z!!Q4l9C6KX|9jzixXk~R$on3?x8XJncC;%#~?U2 z{OFuM4B{5O$|`DO!;??ehwTstVmlV>-*^rKv)=a)zS(1-lxp?VawZ1`yeBW{tH!`T z!vFd677VBpJ}=vr%ZBSQCpR%CaA3Off-y<97$EkR#UjoesQ1x-KEs0pT^o#PiE}w% z>$e^;oj9;#cBkTY2@b?DX&3XKV&MB|T3wzo2Ml%Wk^fg!_{xfTe=tcM-X1g8rZy`< zXu)W%MSzNsZ^-%R7v~3~h4R~A=Vqs`G@*PUvCJr9B}*u4kCPip7n26Nl4RcVU?rj6 zYD-i|@emtGB%`32pgzTLQBMH|>8+-Dx+QFQ6>!R>O#_3l(t;Z<$1$*7qg$TggTdD^3udH-W1wGt za*0DC1|M}KUOAq|V9uxF)MQT#M(tb*vrb}gEjTu<41Zp0F0W-oRbg=1x?=UVBN!A( z@6>*<9fOUDDKi~2G1!0eDW|OhgYHq5nn5zE!gBSa_KzFvq(Q=L7e$A38qn00qw5}u z3+*JoOFFwHrI-Z)4=vVU((*vcBpzIxBGdX+7J}4QtKZXAg@=-nlQ&G1{0EP(LG25> z`?5kkS)Nvx@7q%;50nefwtkCJgodHgu%%NJp;_wwEO!qo+}xEOGt?sjzE1bNLK~m) zpTA39IXCkspXydJz|B_^@?}}HS{Z$2!9M3}HB-lNp!ytTFdM(`U+JBS6OUrBsw}YrzV@a49BH72Ar3zvmYsahrMTnf_ zI)B}5MYv*{eEDHSA76UL`TFiiB_WUDl9;iw=Z5(2Eepl3?PNok^}J(--b`5fcHaq3 zJU;GEM|4ZaU@-N(#fdO!4Dy{Xt#~>KgU=5yYfW{;KtViwx{C`2vU~Fs)59?62rfQO z3&x;%>66j@=UK2m@F8vTMhv!3Ee)is#^5ohL$N~zgB!V$;TLdzkup)oLL7^(R!NIT zVW6j2v8^f^1BULXiR@Sm>>gkbjPdg3a-rT6HepcEG;O}|Xf;sCm@0FuRTW&1r}iEm zD+7I|R4H1lEZBD|`G=-yKoYBA1_DB=;AN7#+B@Dgo*Ly;TpZAwlswOd^$^E#Ey8#mvs|zzj7oL=559JJTx2soF0{i2 zn{98Xz&YvosOJZn5L2J8j}mM;kFCioEWzV9 z%3V&*B8dYB^Hx{yEXLqeSVVjoo~1G6S?S$Z*y z1I6J6fkg!z2%Y_QqSq=8td*{)yPm-T&v_@Q)`j@IqJOEsGn)f7Y7TMf$sBmL^rBWp z2tNNE-{ft+1)sm%^X4St_WJpTseAiWL3#PTi#`Wvkl7pNztUL&;;wDrmPM$+l^F`= zt~lSnVO*RTKhbB*KkQ?kXbcSL$iT}suOh2c>R?yzwBn$Zq|k4|0gl3Y3k;Ndyz9sF znL<5T&%3M3115x1Vb)>=J54_&FcbY!U2muaScU=Ld4d8&ms-wgag_pWfx5#n!^eEr zTIt(O8bkbMgLO~NKV=E|#3ElzFtTF6XSIB}7c=qr9Q?W};yWJan-9CMF=K$!*7J!8 zkxZCMn{~gR&wyHBOX_PeKuqqth4^(Q_)LkKC)34%%rH-nhFB()IA1iHkk5o(*!KNd zHxsnhvMY_bOt>~z%xIwXPksERdC?6InXsL&sK-cQg63fEA8%8cFmAe2gywlB46RPF z8sEl*X-PxDJX4yGkL0C3Q6Jdis00+$npCUzi%;rF%&XRIrG-iSqa0}`F>O^+%-s;x z|MVmMN#3ToGhc%i$wADi(c2GqP+^9iNk^O}6>3rkA9o!a4LV1wy?S@cLb;0A`!7we z_`7zuht9hAgMV)5bszVLlCZuWhm}Lt)l8_c8Rc(hitoFWXVxYSu%P{-6;+GIf)cHC zAA>Fyls`Qr{@8&D17*%?yWg>Z?IvPwHpqhXKg>7P4zXY`_h3wUAPeqVO?w}8iU}X~ zJr&=&kPUg?%Bz1SvOv>gP08a{7R0XlFmVyCZ#}HKY6`{%M%1LOwK(5*%e{WSKUfgm zr&cF_l?Cd>t&}D6*w8y~?4yvCG-yzqrRw@c4z3KXx_q%i8WI|JCS5g=g1s7-zF%2{ z@83q?xjD-Ry!p!i(EpcT$FyauH2VAbq~GGUr&*LlhLDHkJvcSv=R~}JNjZ+*d9d`h z0wnh6xx4S8g2WSzW$Lb@pkMiCs-M{hzQrUI_o-3=NF@2abd094MOd)qmRjf4b8MJh zrt^1EJ=Kys;9Qgd2R*^ey#X?#D{}H=d&OcD;IoU z+Ox(v@+2FYi!nVpGdA4wv%auuHU>VCEp2QG46^Dcl-Ju}P~WlZ#6Fsx$HT zg)FbIR(#)7=-So(hK}!7XK|O$e!&KX!gzx;6AVh@>n7(1(qKo8OI5NfzJ6bmSxD=} z_x&Q<59?)9LGhWkzuO*lIOyVgtXV`^=yz3KuXUooG_+^WmpOQaCiEkD>s$~`40^)&u)r(8)`p7d)Ue@kll2c}R@^49z5#m6Mb!7lDieZK}Oe0IJOyH}eE zSM0XxIW3okO7mBHPnYBS-smEaM_N66&8+t3n;k#+2fN>wBu}IZ`Mw7+r&QznpQ+-X zOiy7fK;f)qcl20L<8ajEx(E~2)EISpq_d!Ny7KxRgG{*X%|Bg+ub;PLw|sKUWP!;F zm&eshSYWhqDlb2X1+VA6yZ)+)2?s*j+J?R};il8~v-SsBpluLxHVu#Snz)iX;n{RZ z=l98&*Rf#C^{>vO^I0%^>#9?)?yx|zsYyL|6$?H-i>r%%!vbt*SA^0k8mQ7zHom>4 z06vRrRJ*&V;N~E`f__sK+&lKWbmHTWv#;ul$~1;hCiNtSpUjy2VAmHuDQBfUuX~cJ z@vr`*U;D(mXz@!b|M06k%r#ypHwN6U#H@5Fq=IhvjHr)H1(1S+oAY$!!Qo!%`wLqX zz#}uv*kw*1U-L?L8sp3_zQfcvv)icQf0mN4oTI7Tpty??Warw1)2^rs^&~He8YP>i z_3EeriT~wKmisq9Qcv<#OYPeI@q#!UQRhc2IjjgrpUz2X7Lx_3!-GC47o~u5cE|h8 zQ(y2&yUUte2VY!Nf*5!I*(bJ+f`Ynlm#Xc)^54AM86S_&Kec{FzLyHv@MGuL1LyGh zw@f1@cb+l6f8QA(SA?G@Ouyc;oQ>}f-%VaLt@|1ie6_Z$;67u+3X}TU2fFyYw7+P7 zODh{FIs4k*b}-?D_YoSio(0)oOhPAaXM;-(r_4fK~aln>NI_Muro4fy)?2`}ts!nmv0vlz} zHMhK7kRuJD#v8|QPK$s|M9TF(1qK+)ltfQ-V#20*^R&19VnIaB#Av4|HYi=nyf8at!o8XF>>tZg;I^=hTC`4oP>l-{@Y?)^G8tP#1H z6_~+>TczzGzk=D|`EGGu&22VdU-#GNYSCb0o}mVlst7)G$xTxn6+w1wGc$E54f@hP zKAu=E0u>Uwze>De3vt@|8^)2I8qn)zWX#JSE!2 zc;8;?Ss|B52a-4RqRi)c&a#kjFD$QlB^8{9mTfrPFAq|$ny4Mq$H4b>Dk~L5KJY0S zj|*(x%0TkTuvu4lpZT3SoVU4dDnj0I@l>>K5T7TuXZF9I&xVhe79Ib#l?ArnrcSi+ zWkE-4rB}2D8#K3Gqy^Nl;B2T&nz=C>RC9WwA{E&1B6X#)WwNLD1`DPe&|b3pSg_mucBw6{|DY{fmTJX@DIGrgpK<)$fGxT=o(+-{&i&{> z`15=C>%ll#e7srI7p39tQFBO7-n5PbRX6=KUKmOvA_d)7Oxe=XB*4)AX>CwGC>*?}B9yxy|@0y{q@c!LaPH4H3E-IAC z`bZv% z0?76RH_8ev zJe$~b=69eLx02i-cpnBeq!pO3d_T+U0B1IaI&C$yoX zj2~CBXSAXV8{1rEO$X2p$28`TpXF%Oqqe=OtPaGZtn~|x=tHGC_wMp9RiS#B=y@3` z_t1*ma%tB)?I^!XmLdM)JyJNP?feJrJSxa89sK#W580s41Nqqp5P5K);GJR^wz&*- zM{IN*ADfOc5^H~!ZoYz6kLro+wQoh$N=|aY(p9L|aNx=AG(9sxxv>I_n@7mA38V;8 zj_2QH%H$C;F2z;wMhA*u`0Wgi)7BkX^_I^8{~Wjxe4{gKR?b0x@o^+42(_*!IcCUR_oP8n)ntC+EFb3BTz z*``msqa$oD<#*C0WPJge@d)|1@h^Xf{)452o$OB%JqSI)Zx%?%f1wz@G$Z{;Jz3t# z?6*c355#kfTP)C1qXbxxEx??30a7pmDHMu)t*+0nkEn^gJJ2XB0m&ctzP<6%P2_EF zV7Z-n3)v`?bDkE|qOzJd8_yWEBD?B0)%BU>=ym_3+zSzdp; z#VQR&7a2}EVw#GQs|{uxa!N%Cv7?6!LiVD4hs=*!~9 z>x}9|&!FPABl+Kps?i}2{nhSkbI=#6w5gbCA>x+Ly%Jvj9If0j+qpM#56U)y$&wAA&h$$pN88_WA#x@$*1d&DpQe>=A2;pxhY8bhjvThkuB(Duf)_<%l07k z{%p@TJCl&@agptnon*X_?e`M2AIR}fj^`2c5oIcIpA;gI{i#goNnX-UASN6Kgdc@K zDFO*zp&wa4>1QhNb1V{I+(M!p?!WiTmu_@S8SQPlx&zJBT;fL*Ijwc`QvD{(YMW8B95b%0cE$mFT6zE zTw}?rjTOi&-Fm6Rfn3yAc{}{}>T>kBcmL%rj5Fv4!!A2=^eOb|`>}q7)5*v){It>d z;41X}502%4BOmEk++3g1c?~@X_~aLFl7t!y&dOaXBm0SnSIQK@{p17__eIXfWL%TC zNzM~wzms~hpM)qh{JzNXHv5ejSAZ0<-${Rh7aR#Y(H@EwAuk{>jzCQTlIu*I522?} zqK{~GU$B3U=D&RqPHC36N{>C^wQ?GI-(%ezcR%JCvv z--|8q_DVww&FbQGUPSp3e$)^`PxKFU-3UMO`3d{~?*wrx34fx$ z<7^3enEmlwLdOD9D!#QgqK{in$$$9Kf^KP5d^sWC zjmBX9q6a0;p%>e=yZ(68jwA}&4q2}3K`|R-in7&H(c8(BL9!8$A|>tG1FrNsU=I=Tw=83mLq4LyavScIG| zn4E`f_XSRPRlXA?=4(9f{7NI+P5FBw)?ths2YKr;bX1UQ`MEhB=A&*mwqQjN+CO6zc z$|44-%2UpxQLkjbCS>%Y8_%{DPb#ZL-!r@%GCHae#@GM<>*{DC(y&`x=Vx*XHF!0%G>h&3C%63{_dqxs_{IjFR5oRkX>iMN=A# zJ`<5q_W+8gIUpbOD{a{j0MOCtA4|6&%ZNBCjn zK9O`F#{nr5{;Gu6{8tuBHO%t=djrO>vNLmE;A&K|tw-i?K(Pw$rqzz8@ zk%g{ow#=tn=)*RBL$#QPDE~ypXG`r0R5th|p(yDxYC4+l7e789%^kOMa6_%A*<-7c zh+BP?=*_i|tb1%g=YAcupJ;pmF}x%^X5PGxu13o^Dw*|Xyik58V%{O+ zi$boam;)`{!rMWJKYwRwKYpO>UB?@7irrbeI%eHLM z6uE_Z0+S7q{~6@g79PbZOF+D1ZTntGeL}a6U8Oelmm}`ZDs$`C)d(hx&zG^yLvzP& zqHH(4gl-Kv>@xqDgfbt!79Wge5$&EOz&JBP9@(EG`iU|kQDcd6Lj6ctX!!fZ*#GTE z_IsQ;VMhcK`%eln9w;dS{mn4K{^z=NWvT?EP3c3CyH4EQVze7|6jrs*a%o0CcMP&b z(l#TF0`ZO?${k2OqPktT_%_PEbtID;(S?)-H)K(l<)ZuZ^v87nIFAf6wF0L|okqHv z^|8gG_YtONR&^t?37O@IX(WC=kGe~Z(-V5u3)h9kAG6;zSy!Os2)%+(=R&kOvpiFS zk%{K$c{o}*m!Q%)zCIPnzUW->rNtp>V~KWCh(yZ&(ci-OCgc1+{R#em^AqaLemllF z2~hZ+K+O9TV!zo<#ydIwFru8N0I?7=gO|08Ci*KRWnH@Gn|3qm>eM>6GOHU|R{EML z`}2^(GsE*8xy@+yb$|a?k3OQ&lMg7qgRA< zS3X8{6{Ram2gzs^#f^s^0i1Jef*zF-ea$khUN5?+5O^40i zp#E-K`lqU!=z*&9V$JM7(7+pRa%E^Ks;X~aKgFjNDOm6CERA`BNUEQdS@M8f zoZ`p6D6T{LX`YkR)pC*Za@LF1bxTo!>c>md&)K1XA^ZI11vzNukxj47X2zj)qL&WQ z9ZsT~snNd6mrO&754OfvtC8_0TwjU%Am7J`{-8z>dJ2&UJt+#$)yRD#h43fO4+ux_ zC-lOCBV>X%$8Ut5jF0I8J(fz?b^h*W=0(W;$o@(Fd{t3CdL3BdS3YV0X?U$aopksG za$bMF=Y!XAR1~~c`&!UFRCscM>)4cQ=r71ITrd z>}MM@-js0OR4*4QUaz3Jg5RU#H|EZpMmBl+aFb^^@@5iEHvZk@-a9h8ZPi4o1#K_ z@&^a}~76kSp|(|i;x*7j>yS|$$e+@^&O^QE9={wog)Ycb%>3g^xCitzgV{R#0i z`Y_%cZ+G`VPro2{cNweSKdtwR=7!4fOz^K&OgHh+-~JB~=(&~~8pboz`Lj$2H`JSp z|8s&HzzxJPOonIvrxE|M#fuyI7n{YOwt(QEHQcap_t1zS8Q$E#=<$zm+_^zMcttY2 pdEq?EFt3Q;;UxI<#%dLBzK-u4U(*DhwXe9ZWCG8I8{uW}{{USz0Q3L= literal 0 HcmV?d00001 diff --git a/programming_language_classifier/plc_predict.py b/programming_language_classifier/plc_predict.py new file mode 100644 index 0000000..5453821 --- /dev/null +++ b/programming_language_classifier/plc_predict.py @@ -0,0 +1,23 @@ +import os +import sys +import pickle +import get_data as gd +from plc_trainer import Featurizer, percent_elements, number_elements, longest_run, line_enders + +def predict(classifier, directory): + content = [] + for filename in os.listdir(directory): + with open(directory + filename) as fh: + content.append([filename, fh.read()]) + test_data = gd.make_dataframe(content) + predictions = list(classifier.predict(test_data[1])) + buffer = max([len(item) for item in test_data[0]]) + 5 + for index in range(len(predictions)): + print(test_data[0][index].ljust(buffer) + "| " + predictions[index]) + + + +if __name__ == '__main__': + with open("./classifier", "rb") as file: + predictor = pickle.load(file) + predict(predictor, sys.argv[1]) diff --git a/programming_language_classifier/plc.py b/programming_language_classifier/plc_trainer.py similarity index 74% rename from programming_language_classifier/plc.py rename to programming_language_classifier/plc_trainer.py index a816eee..f39d7d4 100644 --- a/programming_language_classifier/plc.py +++ b/programming_language_classifier/plc_trainer.py @@ -1,5 +1,9 @@ import re import itertools +import get_data as gd +from sklearn.naive_bayes import MultinomialNB +from sklearn.pipeline import Pipeline +import pickle def percent_elements(text): @@ -54,3 +58,14 @@ def transform(self, X): vector = list(itertools.chain.from_iterable([function(item) for function in self.feature_makers])) feature_vectors.append(vector) return feature_vectors + + + +if __name__ == '__main__': + content_list = gd.get_content("./train/") + train_data = gd.make_dataframe(content_list) + classifier = Pipeline([('features', Featurizer(percent_elements, number_elements, longest_run, line_enders)), + ('bayes', MultinomialNB())]) + classifier.fit(train_data[1], train_data[0]) + with open("./classifier", "wb") as file: + pickle.dump(classifier, file) diff --git a/programming_language_classifier/tests/test_plc.py b/programming_language_classifier/tests/test_plc_trainer.py similarity index 97% rename from programming_language_classifier/tests/test_plc.py rename to programming_language_classifier/tests/test_plc_trainer.py index 62ae668..065f76c 100644 --- a/programming_language_classifier/tests/test_plc.py +++ b/programming_language_classifier/tests/test_plc_trainer.py @@ -1,4 +1,4 @@ -from programming_language_classifier import plc +import programming_language_classifier.plc_trainer as plc def test_percent_elements(): From dae4b924cee3a858a9ba8f957535b3599029f4a4 Mon Sep 17 00:00:00 2001 From: Benjamin Phillips Date: Sat, 6 Jun 2015 20:04:53 -0400 Subject: [PATCH 8/9] Fixed import statement in plc_trainer.py --- programming_language_classifier/plc_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programming_language_classifier/plc_trainer.py b/programming_language_classifier/plc_trainer.py index f39d7d4..c083e68 100644 --- a/programming_language_classifier/plc_trainer.py +++ b/programming_language_classifier/plc_trainer.py @@ -1,6 +1,6 @@ import re import itertools -import get_data as gd +from programming_language_classifier import get_data as gd from sklearn.naive_bayes import MultinomialNB from sklearn.pipeline import Pipeline import pickle From c9899c09beb7a4de8fe66845fd8ac5904bb82bd0 Mon Sep 17 00:00:00 2001 From: Benjamin Phillips Date: Sat, 6 Jun 2015 20:14:01 -0400 Subject: [PATCH 9/9] Added ipython notebook to demonstrate plc predictor --- Programming Language Guesser.ipynb | 277 +++++++++++++++++++++++++++++ 1 file changed, 277 insertions(+) create mode 100644 Programming Language Guesser.ipynb diff --git a/Programming Language Guesser.ipynb b/Programming Language Guesser.ipynb new file mode 100644 index 0000000..3122ca9 --- /dev/null +++ b/Programming Language Guesser.ipynb @@ -0,0 +1,277 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.cross_validation import train_test_split\n", + "from programming_language_classifier import get_data as gd\n", + "from programming_language_classifier import plc_trainer as plc\n", + "from sklearn.metrics import classification_report, confusion_matrix\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "content_list = gd.get_content(\"programming_language_classifier/train/\")\n", + "train_data = gd.make_dataframe(content_list)\n", + "x_train, x_test, y_train, y_test = train_test_split(train_data[1], train_data[0], test_size=0.2)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "classifier = Pipeline([('features', plc.Featurizer(plc.percent_elements, plc.number_elements,\n", + " plc.longest_run, plc.line_enders)),\n", + " ('bayes', MultinomialNB())])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(steps=[('features', ), ('bayes', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classifier.fit(x_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Confusion Matrix for Training Data\n", + "\n", + "[[46 0 0 0 0 0 0 0 4 0 0 0 0 0 1]\n", + " [ 0 31 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 0 31 0 0 0 0 0 0 0 0 0 0 1 0]\n", + " [ 0 0 0 21 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 0 0 0 29 0 1 0 0 0 0 0 0 0 1]\n", + " [ 0 7 0 0 0 45 0 0 0 0 0 0 0 0 0]\n", + " [ 0 0 0 0 0 0 17 0 1 0 0 0 0 0 0]\n", + " [ 0 0 0 0 0 0 0 23 0 0 0 0 0 0 0]\n", + " [ 1 0 0 0 0 0 1 0 34 0 0 0 0 0 0]\n", + " [ 0 0 0 0 0 0 0 0 0 26 0 0 0 0 0]\n", + " [ 0 0 0 0 0 0 0 0 0 0 32 0 0 0 1]\n", + " [ 0 0 0 0 0 0 0 0 0 0 0 56 0 0 0]\n", + " [ 0 0 0 0 0 0 2 0 0 0 0 0 34 0 1]\n", + " [ 0 0 0 0 0 0 0 0 0 0 0 0 0 18 0]\n", + " [ 0 0 0 0 0 0 0 0 0 0 0 1 0 0 38]]\n", + "\n", + "Train Score: 0.954365079365\n" + ] + } + ], + "source": [ + "print(\"Confusion Matrix for Training Data\\n\")\n", + "print(confusion_matrix(classifier.predict(x_train), y_train))\n", + "print(\"\\nTrain Score: \" + str(classifier.score(x_train, y_train)))" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Confusion Matrix for Test Data\n", + "\n", + "[[11 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 3 0 0 0 1 0 0 0 0 0 0 0 0 0]\n", + " [ 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 0 0 13 0 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 0 0 0 0 5 0 0 0 0 0 0 0 0 0]\n", + " [ 0 0 0 0 0 0 4 0 0 0 0 0 1 0 0]\n", + " [ 0 0 0 0 0 0 0 11 0 0 0 0 0 0 0]\n", + " [ 0 0 0 0 0 0 0 0 14 0 0 0 0 0 0]\n", + " [ 0 0 0 0 0 0 0 0 0 8 0 0 0 0 0]\n", + " [ 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0]\n", + " [ 0 0 0 0 0 0 0 0 0 0 0 15 0 0 1]\n", + " [ 0 0 0 0 0 0 0 0 0 0 0 0 7 0 1]\n", + " [ 0 0 0 0 0 0 0 0 0 0 0 0 0 10 0]\n", + " [ 0 0 0 0 0 0 0 0 0 0 0 1 0 0 6]]\n", + "\n", + "Test Score: 0.96062992126\n" + ] + } + ], + "source": [ + "print(\"Confusion Matrix for Test Data\\n\")\n", + "print(confusion_matrix(classifier.predict(x_test), y_test))\n", + "print(\"\\nTest Score: \" + str(classifier.score(x_test, y_test)))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "content = []\n", + "for file in sorted(os.listdir(\"test/\"), key=int):\n", + " with open(\"test/\" + file) as fh:\n", + " content.append([fh.read()])\n", + "test_data = gd.make_dataframe(content)\n", + "test_labels = ['Clojure', 'Clojure', 'Clojure', 'Clojure', 'Python', 'Python',\n", + " 'Python', 'Python', 'JavaScript', 'JavaScript', 'JavaScript',\n", + " 'JavaScript', 'Ruby', 'Ruby', 'Ruby', 'Haskell', 'Haskell',\n", + " 'Haskell', 'Scheme', 'Scheme', 'Scheme', 'Java', 'Java', 'Scala',\n", + " 'Scala', 'TCL', 'TCL', 'PHP', 'PHP', 'PHP', 'OCaml', 'OCaml']" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Confusion Matrix for New Test Data\n", + "\n", + "[[3 0 0 0 0 0 0 0 0 0 0]\n", + " [0 3 0 0 0 0 0 0 0 0 0]\n", + " [0 0 2 0 0 0 0 0 0 0 0]\n", + " [0 0 0 4 0 0 0 0 0 0 0]\n", + " [0 0 0 0 2 0 0 0 0 0 0]\n", + " [0 0 0 0 0 3 0 0 0 0 0]\n", + " [1 0 0 0 0 0 4 0 0 0 0]\n", + " [0 0 0 0 0 0 0 3 0 0 0]\n", + " [0 0 0 0 0 0 0 0 2 0 0]\n", + " [0 0 0 0 0 0 0 0 0 3 0]\n", + " [0 0 0 0 0 0 0 0 0 0 2]]\n", + "\n", + "New Test Score: 0.96875\n" + ] + } + ], + "source": [ + "print(\"Confusion Matrix for New Test Data\\n\")\n", + "print(confusion_matrix(classifier.predict(test_data[0]), test_labels))\n", + "print(\"\\nNew Test Score: \" + str(classifier.score(test_data[0], test_labels)))" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " Clojure 0.75 1.00 0.86 3\n", + " Haskell 1.00 1.00 1.00 3\n", + " Java 1.00 1.00 1.00 2\n", + " JavaScript 1.00 1.00 1.00 4\n", + " OCaml 1.00 1.00 1.00 2\n", + " PHP 1.00 1.00 1.00 3\n", + " Python 1.00 0.80 0.89 5\n", + " Ruby 1.00 1.00 1.00 3\n", + " Scala 1.00 1.00 1.00 2\n", + " Scheme 1.00 1.00 1.00 3\n", + " TCL 1.00 1.00 1.00 2\n", + "\n", + "avg / total 0.98 0.97 0.97 32\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(classifier.predict(test_data[0]), test_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}