From 62b795cbbf88839f36ac82c1bc2a82d8b203c4c3 Mon Sep 17 00:00:00 2001 From: sovello Date: Fri, 5 Jun 2015 12:29:59 -0400 Subject: [PATCH 1/6] initial upload with working scraper and classifier classes --- Classifier.ipynb | 124 ++++++ Learner.py | 47 +++ Scrapper.py | 56 +++ scraperstart.py | 957 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 1184 insertions(+) create mode 100644 Classifier.ipynb create mode 100644 Learner.py create mode 100644 Scrapper.py create mode 100644 scraperstart.py diff --git a/Classifier.ipynb b/Classifier.ipynb new file mode 100644 index 0000000..d837cf5 --- /dev/null +++ b/Classifier.ipynb @@ -0,0 +1,124 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from Scrapper import scrape\n", + "from Learner import Learner" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "df = scrape(num_links=200, drop_less_than=50)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01
0adawith Ada.Characters.Latin_1; use Ada.Characte...
1adadeclare F1, F2 : File_Type;begin Open (F1,...
2autohotkeyimg := ppm_read(\"lena50.ppm\") ; x := img[4,4...
4cimage get_ppm(FILE *pf);
5c#include \"imglib.h\" #define PPMREADBUFLEN 256i...
\n", + "
" + ], + "text/plain": [ + " 0 1\n", + "0 ada with Ada.Characters.Latin_1; use Ada.Characte...\n", + "1 ada declare F1, F2 : File_Type;begin Open (F1,...\n", + "2 autohotkey img := ppm_read(\"lena50.ppm\") ; x := img[4,4...\n", + "4 c image get_ppm(FILE *pf);\n", + "5 c #include \"imglib.h\" #define PPMREADBUFLEN 256i..." + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/Learner.py b/Learner.py new file mode 100644 index 0000000..3cbf6e7 --- /dev/null +++ b/Learner.py @@ -0,0 +1,47 @@ +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +import seaborn as sns +from sklearn import linear_model +from sklearn.preprocessing import PolynomialFeatures +from sklearn.linear_model import LinearRegression +from sklearn.pipeline import Pipeline +from sklearn.cross_validation import train_test_split +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.naive_bayes import MultinomialNB +from sklearn.naive_bayes import GaussianNB +from sklearn.naive_bayes import BernoulliNB +from sklearn.ensemble import RandomForestClassifier +from sklearn.neighbors import KNeighborsClassifier + +class Learner(object): + '''Takes a dataframe with outcomes on first column and predictor second column + Makes available the score and predict methods + ''' + def __init__(self, dataframe, alg='NBayes'): + self.outcome, 
self.predictor = self.split_data(dataframe) + self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.predictor, self.outcome) + + self.pipe = Pipeline([('bag_of_words', CountVectorizer()), + ('bayes', MultinomialNB())]) + self.fit() + + + def fit(self): + self.pipe.fit(self.X_train, self.y_train) + + + def test_score(self): + return self.pipe.score(self.X_test, self.y_test) + + + def train_score(self): + return self.pipe.score(self.X_train, self.y_train) + + + def predict(self, string): + return self.pipe.predict([string]) + + + def split_data(self, data): + return data.loc[:,0], data.loc[:,1] diff --git a/Scrapper.py b/Scrapper.py new file mode 100644 index 0000000..dc6a31f --- /dev/null +++ b/Scrapper.py @@ -0,0 +1,56 @@ +from bs4 import BeautifulSoup +import requests +import urllib +from re import findall +import pandas as pd +import random +import pickle + +def get_text(url): + """Takes a url and returns text""" + req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'}) + content = urllib.request.urlopen(req).read() + page_text=BeautifulSoup(content) + return page_text.get_text() + +def scrape_data(url): + req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'}) + content = urllib.request.urlopen(req).read() + soup = BeautifulSoup(content) + return soup.find_all( "pre", class_="highlighted_source") + #pre is an html tag. 
We want all text from pre with class highlighted_source + #returns a list of soup objects + + +def pull_code_from_soup(soup_list): + return [[soup_list[i]['class'][0], soup_list[i].get_text()] for i in range(len(soup_list))] + + +def make_data(url_list): + code_snippets = pd.DataFrame(columns=([0, 1])) + for url in url_list: + soup_list = scrape_data(url) + url_df = pd.DataFrame(pull_code_from_soup(soup_list)) + code_snippets = code_snippets.append(url_df, ignore_index=True) + return code_snippets + + +def scrape_links(): + req = urllib.request.Request('http://rosettacode.org/wiki/Category:Programming_Tasks', headers={'User-Agent': 'Mozilla/5.0'}) + content = urllib.request.urlopen(req).read() + soup = BeautifulSoup(content) + link_list = [link.get('href') for link in soup.find_all('a')] + return ["http://www.rosettacode.org{}".format(link) for link in link_list[1:] if link.startswith('/wiki/')] + + +def make_links_list(num_links=30): + return random.sample(scrape_links(), num_links) + + +def scrape(num_links=30, drop_less_than=0, save=False): + df = make_data(make_links_list(num_links)) + ndf = df[df[0] != 'text'] + ndf = ndf.groupby(0).filter( lambda x: len(x) >= drop_less_than) + if save: + ndf.to_pickle('filename_.pkl') + return ndf diff --git a/scraperstart.py b/scraperstart.py new file mode 100644 index 0000000..c77c6d1 --- /dev/null +++ b/scraperstart.py @@ -0,0 +1,957 @@ +from bs4 import BeautifulSoup +import requests +import urllib +from re import findall +import pandas as pd + + +# C (.gcc, .c) +# C# +# Common Lisp (.sbcl) +# Clojure +# Haskell +# Java +# JavaScript +# OCaml +# Perl +# PHP (.hack, .php) +# Python +# Ruby (.jruby, .yarv) +# Scala +# Scheme (.racket) + +languages_list = ['ACL2', + 'Ada', + 'Aime', + 'ALGOL 68', + 'AppleScript', + 'AutoHotkey', + 'AutoIt', + 'AWK', + 'BASIC', + 'BBC BASIC', + 'bc', + 'Brat', + 'C', + 'C++', + 'C#', + 'Clojure', + 'COBOL', + 'CMake', + 'CoffeeScript', + 'Common Lisp', + 'D', + 'Delphi', + 'DWScript', + 'E', + 
'Eiffel', + 'Erlang', + 'ERRE', + 'Euphoria', + 'Factor', + 'Fantom', + 'Forth', + 'Fortran', + 'Frink', + 'F#', + 'FunL', + 'GAP', + 'Go', + 'Groovy', + 'Haskell', + 'Icon and Unicon', + 'Inform 6', + 'J', + 'Java', + 'JavaScript', + 'Joy', + 'Julia', + 'LabVIEW', + 'Lasso', + 'Liberty BASIC', + 'Logo', + 'Lua', + 'M4', + 'Mathematica', + 'MATLAB', + 'Maxima', + 'Modula-3', + 'MUMPS', + 'Nemerle', + 'NetRexx', + 'Nim', + 'Objective-C', + 'OCaml', + 'Oforth', + 'Oz', + 'PARI/GP', + 'Pascal', + 'Perl', + 'Perl 6', + 'PHP', + 'PicoLisp', + 'PL/I', + 'PowerShell', + 'PureBasic', + 'Python', + 'R', + 'Racket', + 'REBOL', + 'REXX', + 'Ruby', + 'Run BASIC', + 'Rust', + 'Scala', + 'Scratch', + 'Seed7', + 'Sidef', + 'Smalltalk', + 'SNOBOL4', + 'Swift', + 'Tcl', + 'TI-83 BASIC', + 'TUSCRIPT', + 'UNIX Shell', + 'Ursala', + 'VBScript', + 'Vedit macro language', + 'zkl'] + +task_list = ['24 game', + '24 game/Solve9', + '9 billion names of God the integer', + '99 Bottles of BeerA', + 'A+B', + 'ABC Problem', + 'Abstract type', + 'Abundant, deficient and perfect number classifications', + 'Accumulator factory', + 'Ackermann function', + 'Active Directory/Connect', + 'Active Directory/Search for a user', + 'Active object', + 'Add a variable to a class instance at runtime', + 'Address of a variable', + 'AKS test for primes', + 'Align columns', + 'Aliquot sequence classifications', + 'Almost prime', + 'Amb', + 'Amicable pairs', + 'Anagrams', + 'Anagrams/Deranged anagrams', + 'Animate a pendulum', + 'Animation', + 'Anonymous recursion', + 'Append a record to the end of a text file', + 'Apply a callback to an array', + 'Arbitrary-precision integers (included)', + 'Arena storage pool', + 'Arithmetic evaluation', + 'Arithmetic-geometric mean', + 'Arithmetic-geometric mean/Calculate Pi', + 'Arithmetic/Complex', + 'Arithmetic/Integer', + 'Arithmetic/Rational', + 'Array concatenation', + 'Arrays', + 'Assertions', + 'Associative array/Creation', + 'Associative array/Iteration', + 'Atomic 
updates', + 'Average loop length', + 'Averages/Arithmetic mean', + 'Averages/Mean angle', + 'Averages/Mean time of day', + 'Averages/Median', + 'Averages/Mode', + 'Averages/Pythagorean means', + 'Averages/Root mean square', + 'Averages/Simple moving averageB', + 'Balanced brackets', + 'Balanced ternary', + "Benford's law", + 'Bernoulli numbers', + 'Best shuffle', + 'Binary digits', + 'Binary search', + 'Binary strings', + 'Bitcoin/address validation', + 'Bitcoin/public point to address', + 'Bitmap', + "Bitmap/Bresenham's line algorithm", + 'Bitmap/Bézier curves/Cubic', + 'Bitmap/Bézier curves/Quadratic', + 'Bitmap/Flood fill', + 'Bitmap/Histogram', + 'Bitmap/Midpoint circle algorithm', + 'Bitmap/PPM conversion through a pipe', + 'Bitmap/Read a PPM file', + 'Bitmap/Read an image through a pipe', + 'Bitmap/Write a PPM file', + 'Bitwise IO', + 'Bitwise operations', + 'Boolean values', + 'Box the compass', + 'Break OO privacy', + 'Brownian tree', + 'Bulls and cows', + 'Bulls and cows/PlayerC', + 'Caesar cipher', + 'Calendar', + 'Calendar - for "REAL" programmers', + 'Call a foreign-language function', + 'Call a function', + 'Call a function in a shared library', + 'Call an object method', + 'Canny edge detector', + 'Carmichael 3 strong pseudoprimes', + 'Case-sensitivity of identifiers', + 'Casting out nines', + 'Catalan numbers', + "Catalan numbers/Pascal's triangle", + 'Catamorphism', + 'Catmull–Clark subdivision surface', + 'Character codes', + 'Chat server', + 'Check Machin-like formulas', + 'Check that file exists', + 'Checkpoint synchronization', + 'Chinese remainder theorem', + 'Cholesky decomposition', + 'Circles of given radius through two points', + 'Classes', + 'Closest-pair problem', + 'Closures/Value capture', + 'Collections', + 'Color of a screen pixel', + 'Color quantization', + 'Colour bars/Display', + 'Colour pinstripe/Display', + 'Colour pinstripe/Printer', + 'Combinations', + 'Combinations and permutations', + 'Combinations with repetitions', + 'Comma 
quibbling', + 'Command-line arguments', + 'Comments', + "Compare sorting algorithms' performance", + 'Compile-time calculation', + 'Compound data type', + 'Concurrent computing', + 'Conditional structures', + 'Conjugate transpose', + 'Constrained genericity', + 'Constrained random points on a circle', + 'Continued fraction', + 'Continued fraction/Arithmetic/Construct from rational number', + 'Continued fraction/Arithmetic/G(matrix NG, Contined Fraction N)', + 'Continued fraction/Arithmetic/G(matrix NG, Contined Fraction N1, Contined Fraction N2)', + 'Convert decimal number to rational', + "Conway's Game of Life", + 'Copy a string', + 'Count in factors', + 'Count in octal', + 'Count occurrences of a substring', + 'Count the coins', + 'CRC-32', + 'Create a file', + 'Create a file on magnetic tape', + 'Create a two-dimensional array at runtime', + 'Create an HTML table', + 'Create an object at a given address', + 'CSV data manipulation', + 'CSV to HTML translation', + 'Currying', + 'Cut a rectangleD', + 'Date format', + 'Date manipulation', + 'Day of the week', + 'Deal cards for FreeCell', + 'Death Star', + 'Deconvolution/1D', + 'Deconvolution/2D+', + 'Deepcopy', + 'Define a primitive data type', + 'Delegates', + 'Delete a file', + 'Detect division by zero', + 'Determine if a string is numeric', + 'Determine if only one instance is running', + 'Digital root', + 'Digital root/Multiplicative digital root', + "Dinesman's multiple-dwelling problem", + 'Dining philosophers', + 'Discordian date', + 'Distributed programming', + 'DNS query', + 'Documentation', + 'Dot product', + 'Doubly-linked list/Definition', + 'Doubly-linked list/Element definition', + 'Doubly-linked list/Element insertion', + 'Doubly-linked list/Traversal', + 'Dragon curve', + 'Draw a clock', + 'Draw a cuboid', + 'Draw a sphere', + 'Dutch national flag problem', + 'Dynamic variable namesE', + 'Echo server', + 'Element-wise operations', + 'Empty directory', + 'Empty program', + 'Empty string', + 'Enforced 
immutability', + 'Entropy', + 'Enumerations', + 'Environment variables', + 'Equilibrium index', + 'Ethiopian multiplication', + 'Euler method', + "Euler's sum of powers conjecture", + 'Evaluate binomial coefficients', + 'Even or odd', + 'Events', + 'Evolutionary algorithm', + 'Exceptions', + 'Exceptions/Catch an exception thrown in a nested call', + 'Executable library', + 'Execute a Markov algorithm', + 'Execute a system command', + 'Execute Brain****', + 'Execute HQ9+', + 'Execute SNUSP', + 'Exponentiation operator', + 'Extend your language', + 'Extensible prime generator', + 'Extreme floating point valuesF', + 'Factorial', + 'Factors of a Mersenne number', + 'Factors of an integer', + 'Fast Fourier transform', + 'Fibonacci n-step number sequences', + 'Fibonacci sequence', + 'Fibonacci word', + 'Fibonacci word/fractal', + 'File input/output', + 'File modification time', + 'File size', + 'Filter', + 'Find common directory path', + 'Find largest left truncatable prime in a given base', + 'Find limit of recursion', + 'Find the last Sunday of each month', + 'Find the missing permutation', + 'First class environments', + 'First-class functions', + 'First-class functions/Use numbers analogously', + 'Five weekends', + 'FizzBuzz', + 'Flatten a list', + 'Flipping bits game', + 'Flow-control structures', + "Floyd's triangle", + 'Forest fire', + 'Fork', + 'Formal power series', + 'Formatted numeric output', + 'Forward difference', + 'Four bit adder', + 'Fractal tree', + 'Fractran', + 'Function composition', + 'Function definition', + 'Function frequency', + 'Function prototypeG', + 'Galton box animation', + 'Gamma function', + 'Gaussian elimination', + 'Generate Chess960 starting position', + 'Generate lower case ASCII alphabet', + 'Generator/Exponential', + 'Generic swap', + 'Globally replace text in several files', + 'Go Fish', + 'G cont.', + 'Gray code', + 'Grayscale image', + 'Greatest common divisor', + 'Greatest element of a list', + 'Greatest subsequential sum', + 
'Greyscale bars/Display', + 'Guess the number', + 'Guess the number/With feedback', + 'Guess the number/With feedback (player)', + 'GUI component interaction', + 'GUI enabling/disabling of controls', + 'GUI/Maximum window dimensionsH', + 'Hailstone sequence', + 'Hamming numbers', + 'Handle a signal', + 'Happy numbers', + 'Harshad or Niven series', + 'Hash from two arrays', + 'Hash join', + 'Haversine formula', + 'Hello world/Graphical', + 'Hello world/Line printer', + 'Hello world/Newbie', + 'Hello world/Newline omission', + 'Hello world/Standard error', + 'Hello world/Text', + 'Hello world/Web server', + 'Here document', + 'Heronian triangles', + 'Hickerson series of almost integers', + 'Higher-order functions', + 'History variables', + 'Hofstadter Figure-Figure sequences', + 'Hofstadter Q sequence', + 'Hofstadter-Conway $10,000 sequence', + 'Holidays related to Easter', + 'Honeycombs', + 'Horizontal sundial calculations', + "Horner's rule for polynomial evaluation", + 'Host introspection', + 'Hostname', + 'Hough transform', + 'HTTP', + 'HTTPS', + 'HTTPS/Authenticated', + 'HTTPS/Client-authenticated', + 'Huffman codingI', + 'I before E except after C', + 'IBAN', + 'Identity matrix', + 'Image convolution', + 'Image noise', + 'Include a file', + 'Increment a numerical string', + 'Infinity', + 'Inheritance/Multiple', + 'Inheritance/Single', + 'Input loop', + 'Integer comparison', + 'Integer overflow', + 'Integer sequence', + 'Interactive programming', + 'Introspection', + 'Inverted index', + 'Inverted syntax', + 'Iterated digits squaringJ', + "Jensen's Device", + 'JortSort', + 'Josephus problem', + 'Joystick position', + 'JSON', + 'Jump anywhereK', + 'K-d tree', + 'K-means++ clustering', + 'Kaprekar numbers', + 'Keyboard input/Flush the keyboard buffer', + 'Keyboard input/Keypress check', + 'Keyboard input/Obtain a Y or N response', + 'Keyboard macros', + 'Knapsack problem/0-1', + 'Knapsack problem/Bounded', + 'Knapsack problem/Continuous', + 'Knapsack 
problem/Unbounded', + "Knight's tour", + 'Knuth shuffle', + "Knuth's algorithm SL", + "Langton's ant", + 'Largest int from concatenated ints', + 'Last Friday of each month', + 'Last letter-first letter', + 'Leap year', + 'Least common multiple', + 'Left factorials', + 'Letter frequency', + 'Levenshtein distance', + 'Linear congruential generator', + 'List comprehensions', + 'Literals/Floating point', + 'Literals/Integer', + 'Literals/String', + 'Logical operations', + 'Long multiplication', + 'Longest common subsequence', + 'Longest increasing subsequence', + 'Longest string challenge', + 'Look-and-say sequence', + 'Loop over multiple arrays simultaneously', + 'Loops/Break', + 'Loops/Continue', + 'Loops/Do-while', + 'Loops/Downward for', + 'Loops/For', + 'Loops/For with a specified step', + 'Loops/Foreach', + 'Loops/Infinite', + 'Loops/N plus one half', + 'Loops/Nested', + 'Loops/While', + 'LU decomposition', + 'Lucas-Lehmer test', + 'Ludic numbers', + 'Luhn test of credit card numbers', + 'LZW compressionM', + 'Machine code', + 'Mad Libs', + 'Magic squares of odd order', + 'Main step of GOST 28147-89', + 'Make directory path', + 'Man or boy test', + 'Mandelbrot set', + 'Map range', + 'Matrix arithmetic', + 'Matrix multiplication', + 'Matrix transposition', + 'Matrix-exponentiation operator', + 'Maximum triangle path sum', + 'Maze generation', + 'Maze solving', + 'MD4', + 'MD5', + 'MD5/Implementation', + 'Median filter', + 'Memory allocation', + 'Memory layout of a data structure', + 'Menu', + 'Metaprogramming', + 'Metered concurrency', + 'Metronome', + 'Middle three digits', + 'Miller-Rabin primality test', + 'Minesweeper game', + 'Modular exponentiation', + 'Modular inverse', + 'Monte Carlo methods', + 'Monty Hall problem', + 'Morse code', + 'Mouse position', + 'Move-to-front algorithm', + 'Multifactorial', + 'Multiple distinct objects', + 'Multiple regression', + 'Multiplication tables', + 'Multiplicative order', + 'Multisplit', + 'Munching squares', + 'Mutual 
recursionN', + "N'th", + 'N-queens problem', + 'Named parameters', + 'Narcissist', + 'Narcissistic decimal number', + 'Natural sorting', + 'Nautical bell', + 'Non-continuous subsequences', + 'Non-decimal radices/Convert', + 'Non-decimal radices/Input', + 'Non-decimal radices/Output', + 'Nth root', + 'Null object', + 'Number names', + 'Number reversal game', + 'Numeric error propagation', + 'Numerical integration', + 'Numerical integration/Gauss-Legendre QuadratureO', + 'Object serialization', + 'Odd word problem', + 'Old lady swallowed a fly', + 'OLE Automation', + 'One of n lines in a file', + 'One-dimensional cellular automata', + 'OpenGL', + 'Operator precedence', + 'Optional parameters', + 'Order disjoint list items', + 'Order two numerical lists', + 'Ordered Partitions', + 'Ordered wordsP', + 'Palindrome detection', + 'Pangram checker', + 'Paraffins', + 'Parallel calculations', + 'Parametric polymorphism', + 'Parametrized SQL statement', + 'Parse an IP Address', + 'Parsing/RPN calculator algorithm', + 'Parsing/RPN to infix conversion', + 'Parsing/Shunting-yard algorithm', + 'Partial function application', + 'Pascal matrix generation', + "Pascal's triangle", + "Pascal's triangle/Puzzle", + 'Pattern matching', + "Penney's game", + 'Percentage difference between images', + 'Percolation/Bond percolation', + 'Percolation/Mean cluster density', + 'Percolation/Mean run density', + 'Percolation/Site percolation', + 'Perfect numbers', + 'Permutation test', + 'Permutations', + 'Permutations by swapping', + 'Permutations/Derangements', + 'Permutations/Rank of a permutation', + 'Pernicious numbers', + 'Phrase reversals', + 'Pi', + 'Pick random element', + 'Pig the dice game', + 'Pig the dice game/Player', + 'Pinstripe/Display', + 'Pinstripe/Printer', + 'Play recorded sounds', + 'Playing cards', + 'Plot coordinate pairs', + 'Pointers and references', + 'Polymorphic copy', + 'Polymorphism', + 'Polynomial long division', + 'Polynomial regression', + 'Power set', + 'Pragmatic 
directives', + 'Price fraction', + 'Primality by trial division', + 'Prime decomposition', + 'Primes - allocate descendants to their ancestors', + 'Priority queue', + 'Probabilistic choice', + 'Problem of Apollonius', + 'Program name', + 'Program termination', + 'Pythagorean triplesQ', + 'QR decomposition', + 'Quaternion type', + 'Queue/Definition', + 'Queue/Usage', + 'Quickselect algorithm', + 'Q cont.', + 'QuineR', + 'Random number generator (device)', + 'Random number generator (included)', + 'Random numbers', + 'Range expansion', + 'Range extraction', + 'Ranking methods', + 'Rate counter', + 'Ray-casting algorithm', + 'RCRPG', + 'Read a configuration file', + 'Read a file line by line', + 'Read a specific line from a file', + 'Read entire file', + 'Real constants and functions', + 'Record sound', + 'Reduced row echelon form', + 'Regular expressions', + 'Remove duplicate elements', + 'Remove lines from a file', + 'Rename a file', + 'Rendezvous', + 'Rep-string', + 'Repeat a string', + 'Resistor mesh', + 'Respond to an unknown method call', + 'Return multiple values', + 'Reverse a string', + 'Reverse words in a string', + 'RIPEMD-160', + 'Rock-paper-scissors', + 'Roman numerals/Decode', + 'Roman numerals/Encode', + 'Roots of a function', + 'Roots of a quadratic function', + 'Roots of unity', + 'Rosetta Code/Count examples', + 'Rosetta Code/Find bare lang tags', + 'Rosetta Code/Find unimplemented tasks', + 'Rosetta Code/Fix code tags', + 'Rosetta Code/Rank languages by popularity', + 'Rot-13', + 'RSA code', + 'Run-length encoding', + 'Runge-Kutta method', + 'Runtime evaluation', + 'Runtime evaluation/In an environmentS', + 'S-Expressions', + 'Safe addition', + 'Sailors, coconuts and a monkey problem', + 'Same Fringe', + 'Scope modifiers', + 'Scope/Function names and labels', + 'Search a list', + 'Secure temporary file', + 'SEDOLs', + 'Self-describing numbers', + 'Self-referential sequence', + 'Semiprime', + 'Semordnilap', + 'Send an unknown method call', + 'Send 
email', + 'Sequence of non-squares', + 'Sequence of primes by Trial Division', + 'Set', + 'Set consolidation', + 'Set of real numbers', + 'Set puzzle', + 'Seven-sided dice from five-sided dice', + 'SHA-1', + 'SHA-256', + 'Shell one-liner', + 'Short-circuit evaluation', + 'Show the epoch', + 'Sierpinski carpet', + 'Sierpinski triangle', + 'Sierpinski triangle/Graphical', + 'Sieve of Eratosthenes', + 'Simple database', + 'Simple windowed application', + 'Simulate input/Keyboard', + 'Simulate input/Mouse', + 'Singleton', + 'Singly-linked list/Element definition', + 'Singly-linked list/Element insertion', + 'Singly-linked list/Traversal', + 'Sleep', + 'SOAP', + 'Sockets', + 'Sokoban', + 'Solve a Hidato puzzle', + "Solve a Holy Knight's tour", + 'Solve a Hopido puzzle', + 'Solve a Numbrix puzzle', + 'Solve the no connection puzzle', + 'Sort an array of composite structures', + 'Sort an integer array', + 'Sort disjoint sublist', + 'Sort stability', + 'Sort using a custom comparator', + 'Sorting algorithms/Bead sort', + 'Sorting algorithms/Bogosort', + 'Sorting algorithms/Bubble sort', + 'Sorting algorithms/Cocktail sort', + 'Sorting algorithms/Comb sort', + 'Sorting algorithms/Counting sort', + 'Sorting algorithms/Gnome sort', + 'Sorting algorithms/Heapsort', + 'Sorting algorithms/Insertion sort', + 'Sorting algorithms/Merge sort', + 'Sorting algorithms/Pancake sort', + 'Sorting algorithms/Permutation sort', + 'Sorting algorithms/Quicksort', + 'Sorting algorithms/Radix sort', + 'Sorting algorithms/Selection sort', + 'Sorting algorithms/Shell sort', + 'Sorting algorithms/Sleep sort', + 'Sorting algorithms/Stooge sort', + 'Sorting algorithms/Strand sort', + 'Soundex', + 'Sparkline in unicode', + 'Special characters', + 'Special variables', + 'Speech synthesis', + 'Spiral matrix', + 'SQL-based authentication', + 'Stable marriage problem', + 'Stack', + 'Stack traces', + 'Stair-climbing puzzle', + 'Standard deviation', + 'Start from a main routine', + 'State name puzzle', + 
'Statistics/Basic', + 'Stem-and-leaf plot', + 'Stern-Brocot sequence', + 'String append', + 'String case', + 'String comparison', + 'String concatenation', + 'String interpolation (included)', + 'String length', + 'String matching', + 'String prepend', + 'Strip a set of characters from a string', + 'Strip block comments', + 'Strip comments from a string', + 'Strip control codes and extended characters from a string', + 'Strip whitespace from a string/Top and tail', + 'Subleq', + 'Substring', + 'Substring/Top and tail', + 'Subtractive generator', + 'Sudoku', + 'Sum and product of an array', + 'Sum digits of an integer', + 'Sum multiples of 3 and 5', + 'Sum of a series', + 'Sum of squares', + 'Sutherland-Hodgman polygon clipping', + 'Symmetric difference', + 'Synchronous concurrency', + 'System timeT', + 'Table creation/Postal addresses', + 'Take notes on the command line', + 'Temperature conversion', + 'Terminal control/Clear the screen', + 'Terminal control/Coloured text', + 'Terminal control/Cursor movement', + 'Terminal control/Cursor positioning', + 'Terminal control/Dimensions', + 'Terminal control/Display an extended character', + 'Terminal control/Hiding the cursor', + 'Terminal control/Inverse video', + 'Terminal control/Positional read', + 'Terminal control/Preserve screen', + 'Terminal control/Ringing the terminal bell', + 'Terminal control/Unicode output', + 'Ternary logic', + 'Test a function', + 'Text processing/1', + 'Text processing/2', + 'Text processing/Max licenses in use', + 'Textonyms', + 'The ISAAC Cipher', + 'The Twelve Days of Christmas', + "Thiele's interpolation formula", + 'Tic-tac-toe', + 'Time a function', + 'Tokenize a string', + 'Top rank per group', + 'Topic variable', + 'Topological sort', + 'Topswops', + 'Total circles area', + 'Towers of Hanoi', + 'Trabb Pardo–Knuth algorithm', + 'Tree traversal', + 'Trigonometric functions', + 'Truncatable primes', + 'Truncate a file', + 'Twelve statementsU', + 'Ulam spiral (for primes)', + 'Unbias 
a random generator', + 'Undefined values', + 'Unicode strings', + 'Unicode variable names', + 'Universal Turing machine', + 'Unix/ls', + 'Update a configuration file', + 'URL decoding', + 'URL encoding', + 'Use another language to call a function', + 'User input/Graphical', + 'User input/TextV', + 'Vampire number', + 'Van der Corput sequence', + 'Variable size/Get', + 'Variable size/Set', + 'Variable-length quantity', + 'Variables', + 'Variadic function', + 'Vector products', + 'Verify distribution uniformity/Chi-squared test', + 'Verify distribution uniformity/Naive', + 'Video display modes', + 'Vigenère cipher', + 'Vigenère cipher/Cryptanalysis', + 'Visualize a tree', + "Vogel's approximation method", + 'Voronoi diagramW', + 'Walk a directory/Non-recursively', + 'Walk a directory/Recursively', + 'Web scraping', + 'Window creation', + 'Window creation/X11', + 'Window management', + 'Wireworld', + 'Word wrap', + 'World Cup group stage', + 'Write float arrays to a text file', + 'Write language name in 3D ASCII', + 'Write to Windows event logX', + "Xiaolin Wu's line algorithm", + 'XML/DOM serialization', + 'XML/Input', + 'XML/Output', + 'XML/XPathY', + 'Y combinator', + 'Yahoo! 
search interface', + 'Yin and yangZ', + 'Zebra puzzle', + 'Zeckendorf arithmetic', + 'Zeckendorf number representation', + 'Zero to the zero power', + 'Zhang-Suen thinning algorithm', + 'Zig-zag matrix'] + + + + + +def get_text(url): + """Takes a url and returns text""" + req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'}) + content = urllib.request.urlopen(req).read() + page_text=BeautifulSoup(content) + return page_text.get_text() + +# def scrape_text(text): +# data_crop = findall("[EDIT] \n.+\n", text) +# return data_crop + + +def scrape_text(text): + """Takes text from get_text and returns a list of tuples with + language in [0] and code in [1]""" + data_crop = findall(r"edit] (.+)\n(.+)\n", text) + return data_crop + ##Should maybe grab all of the text + +def scrape_links(): + """Creates list of links to use with create_url to gather code.""" + with open ("links_list.txt", "r") as myfile: + data=myfile.read() + return findall(r"wiki/(.+)\" ti", data) + + +def create_url_for_scraping(task_string): + return "http://www.rosettacode.org/wiki/{}".format(task_string) + +language_start = ["C", "C#", "Common Lisp", "Clojure", "Haskell", + "Java", "JavaScript", "OCaml", "Perl", "PHP", + "Python", "Ruby", "Scala", "Scheme"] + + +#def make_data(languages=language_start, num_links=50) + #grab data for all of the links in the task list + #go through for each of the languages and grab the associated + #code + #return a df with the code you need in a column and the type of + #code as the index + + +def scrape_data(url): + req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'}) + content = urllib.request.urlopen(req).read() + soup = BeautifulSoup(content) + return soup.find_all( "pre", class_="highlighted_source") + #pre is an html tag. 
We want all text from pre with class highlighted_source + #returns a list of soup objects + + +def pull_code_from_soup(soup_list): + return [[soup_list[i]['class'][0], soup_list[i].get_text()] for i in range(len(soup_list))] + + +def make_data(url_list): + code_snippets = pd.DataFrame(columns=([0, 1])) + for program in url_list: + soup_list = scrape_data(create_url_for_scraping(program)) + code_snippets = code_snippets.append(pd.DataFrame(pull_code_from_soup(soup_list)), ignore_index=True) + return code_snippets \ No newline at end of file From cee96ea2100f54fbffe6b371c5164455e1f1516e Mon Sep 17 00:00:00 2001 From: sovello Date: Sat, 6 Jun 2015 08:52:28 -0400 Subject: [PATCH 2/6] implements the feature vectorizer --- FeatureVectorizer.py | 47 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 FeatureVectorizer.py diff --git a/FeatureVectorizer.py b/FeatureVectorizer.py new file mode 100644 index 0000000..af87a28 --- /dev/null +++ b/FeatureVectorizer.py @@ -0,0 +1,47 @@ +from sklearn.pipeline import make_pipeline, make_union +from sklearn.base import TransformerMixin +import re +import itertools + + +def longest_run_of_capitol_letters_feature(text): + """Find the longest run of capitol letters and return their length.""" + runs = sorted(re.findall(r"[A-Z]+", text), key=len) + if runs: + return [len(runs[-1])] + else: + return 0 + +def percent_character_feature(char_list): + def feature_fn(text): + return [text.count(i)/len(text) for i in char_list] + return feature_fn + +def longest_run_of_character_feature(text): + chars = ['~+', '\.+', '\|+', ';+', '\:+', '\$+', '\(+', '\)+', '\-+'] + runs = [] + for i in chars: + run = sorted(re.findall(r'{}'.format(i), text), key=len) + if runs: + runs.append(len(run[-1])) + else: + runs.append(0) + return runs + +class FunctionFeaturizer(TransformerMixin): + def __init__(self, *featurizers): + self.featurizers = featurizers + + def fit(self, X, y=None): + """All SciKit-Learn 
compatible transformers and classifiers have the + same interface. `fit` always returns the same object.""" + return self + + def transform(self, X): + """Given a list of original data, return a list of feature vectors.""" + fvs = [] + for datum in X: + fv = [f(datum) for f in self.featurizers] + a = list(itertools.chain(*fv)) + fvs.append(a) + return fvs From 03dbbdb595be2ae1c04e7de62ed7ca9594bd23ee Mon Sep 17 00:00:00 2001 From: sovello Date: Sun, 7 Jun 2015 23:00:42 -0400 Subject: [PATCH 3/6] finishes writing all code and modifying presentation notebooks --- Classifier.ipynb | 128 ++++++++------- FeatureVectorizer.py | 47 ++++-- Featurizer.ipynb | 364 +++++++++++++++++++++++++++++++++++++++++++ Learner.py | 22 ++- Scrapper.py | 28 +++- 5 files changed, 523 insertions(+), 66 deletions(-) create mode 100644 Featurizer.ipynb diff --git a/Classifier.ipynb b/Classifier.ipynb index d837cf5..139c251 100644 --- a/Classifier.ipynb +++ b/Classifier.ipynb @@ -2,97 +2,119 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from Scrapper import scrape\n", + "from Scrapper import load_data\n", "from Learner import Learner" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df2 = load_data(200)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "classifier = Learner(df2)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.36491387126019947" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classifier.test_score()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { 
+ "data": { + "text/plain": [ + "0.57978241160471444" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classifier.train_score()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ - "df = scrape(num_links=200, drop_less_than=50)" + "from FeatureVectorizer import *" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
01
0adawith Ada.Characters.Latin_1; use Ada.Characte...
1adadeclare F1, F2 : File_Type;begin Open (F1,...
2autohotkeyimg := ppm_read(\"lena50.ppm\") ; x := img[4,4...
4cimage get_ppm(FILE *pf);
5c#include \"imglib.h\" #define PPMREADBUFLEN 256i...
\n", - "
" - ], "text/plain": [ - " 0 1\n", - "0 ada with Ada.Characters.Latin_1; use Ada.Characte...\n", - "1 ada declare F1, F2 : File_Type;begin Open (F1,...\n", - "2 autohotkey img := ppm_read(\"lena50.ppm\") ; x := img[4,4...\n", - "4 c image get_ppm(FILE *pf);\n", - "5 c #include \"imglib.h\" #define PPMREADBUFLEN 256i..." + "[0]" ] }, - "execution_count": 13, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.head()" + "longest_run_of_character_feature('hello . work.dl. in. the junghle.')" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": { "collapsed": true }, diff --git a/FeatureVectorizer.py b/FeatureVectorizer.py index af87a28..e17ad50 100644 --- a/FeatureVectorizer.py +++ b/FeatureVectorizer.py @@ -4,21 +4,17 @@ import itertools -def longest_run_of_capitol_letters_feature(text): - """Find the longest run of capitol letters and return their length.""" +def longest_run_of_capital_letters_feature(text): + """Find the longest run of capital letters and return their length.""" runs = sorted(re.findall(r"[A-Z]+", text), key=len) if runs: return [len(runs[-1])] else: - return 0 - -def percent_character_feature(char_list): - def feature_fn(text): - return [text.count(i)/len(text) for i in char_list] - return feature_fn + return [0] + def longest_run_of_character_feature(text): - chars = ['~+', '\.+', '\|+', ';+', '\:+', '\$+', '\(+', '\)+', '\-+'] + chars = ['~+', '\.+', '\|+', ';+', '\:+', ';+', '\$+', '\(+', '\)+', '\-+', '\s+', '\t+'] runs = [] for i in chars: run = sorted(re.findall(r'{}'.format(i), text), key=len) @@ -28,6 +24,39 @@ def longest_run_of_character_feature(text): runs.append(0) return runs + +def percent_character_feature(text): + """Return percentage of text that is a particular char compared to total text length.""" + chars = [".", "|", "$", "_", "!", "#", "@", "%", "^", "&", "*", "(", ")","+", "=", "{", "}", "[", "]", ":", ";", "?", "<", ">"] + + return 
[text.count(i)/len(text) for i in chars] + + +def percent_character_combinations(text): + """Return percentage of text that is a particular char compared to total text length.""" + chars = ["==", "\->+", ":\-+", "\+=", "\n\t+if", "\n+", "\n\$+", "\n\t+", "\ndef", "%{", "~=", "\|\|", "\n\t+\(\w+", "^\$", "\.=", "\{:", "===", "!==", "\*\w+", "__", "__name__", "__main__", "^\#", "^def", "^@w+", "^@end", "^begin", "^end", "^functions", "^loop\n", "^procedure", "^func","\+\+"] + runs = [] + for i in chars: + run = re.findall(r'{}'.format(i), text) + if run: + runs.append(len(run)/len(text)) + else: + runs.append(0) + return runs + +def binary_character_combinations(text): + '''Return binary of text that is particular char to total length of text''' + chars = ["==", "\->+", ":\-+", "\+=", "\n\t+if", "\n+", "\n\$+", "\n\t+", "\ndef", "%{", "~=", "\|\|","\n\t+\(\w+", "^\$", "\.=", "\{:", "===", "!==", "\*\w+", "__", "__name__", "__main__", "^\#", "^def", "^@w+", "^@end", "^begin", "^end", "^functions", "^loop\n", "^procedure", "^func","\+\+"] + runs = [] + for i in chars: + run = re.findall(r'{}'.format(i), text) + if run: + runs.append(1) + else: + runs.append(0) + return runs + + class FunctionFeaturizer(TransformerMixin): def __init__(self, *featurizers): self.featurizers = featurizers diff --git a/Featurizer.ipynb b/Featurizer.ipynb new file mode 100644 index 0000000..24ebfd1 --- /dev/null +++ b/Featurizer.ipynb @@ -0,0 +1,364 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from Scrapper import load_data\n", + "from Scrapper import *\n", + "from Learner import Learner\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.cross_validation import train_test_split\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from 
FeatureVectorizer import *\n", + "from sklearn.tree import DecisionTreeClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df = load_data(500)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "y = df.loc[:,0]\n", + "X = df.loc[:,1]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1 #include #include #include ...\n", + "2  #include #include #include...\n", + "4 import std.stdio, std.array, std.conv, std.alg...\n", + "5 import std.stdio, std.conv, std.ascii, std.arr...\n", + "6  -module( solve_hidato_puzzle ). -export( [cre...\n", + "Name: 1, dtype: object" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "language_featurizer = make_union(CountVectorizer(), FunctionFeaturizer(longest_run_of_capital_letters_feature, longest_run_of_character_feature,\n", + " percent_character_combinations, binary_character_combinations,\n", + " percent_character_feature))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pipe = make_pipeline(language_featurizer, RandomForestClassifier())" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,\n", + " 
transformer_list=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", + " dtype=, encoding='utf-8', input='content',\n", + " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", + " ...n_jobs=1,\n", + " oob_score=False, random_state=None, verbose=0,\n", + " warm_start=False))])" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipe.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.71267707954958226" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipe.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['go'], dtype=object)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipe.predict(['~='])" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "multi_pipe = make_pipeline(language_featurizer, MultinomialNB())" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,\n", + " transformer_list=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", + " dtype=, encoding='utf-8', input='content',\n", + " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", + " ...ormer_weights=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "multi_pipe.fit(X_train, 
y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.59898292771521977" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "multi_pipe.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "language_featurizer2 = make_union(CountVectorizer(), FunctionFeaturizer(longest_run_of_capital_letters_feature, longest_run_of_character_feature,\n", + " percent_character_combinations, binary_character_combinations,\n", + " percent_character_feature))" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "pipe2 = make_pipeline(language_featurizer2, RandomForestClassifier())" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,\n", + " transformer_list=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", + " dtype=, encoding='utf-8', input='content',\n", + " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", + " ...n_jobs=1,\n", + " oob_score=False, random_state=None, verbose=0,\n", + " warm_start=False))])" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipe2.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.71758082092262987" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipe2.score(X_test, y_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "### Making a smaller data set with smaller number of languages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dfilter = scrape_filter(500, 50, True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "df_small=" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/Learner.py b/Learner.py index 3cbf6e7..93d0075 100644 --- a/Learner.py +++ b/Learner.py @@ -13,17 +13,25 @@ from sklearn.naive_bayes import BernoulliNB from sklearn.ensemble import RandomForestClassifier from sklearn.neighbors import KNeighborsClassifier +from sklearn.metrics import classification_report, confusion_matrix class Learner(object): + algorithms = {'m':MultinomialNB(), + 'b':BernoulliNB(), + 'r':RandomForestClassifier(), + 'f':RandomForestClassifier(), + 'k':KNeighborsClassifier(), + 'n':KNeighborsClassifier(), + 'p':MultinomialNB() + } + '''Takes a dataframe with outcomes on first column and predictor second column Makes available the score and predict methods ''' def __init__(self, dataframe, alg='NBayes'): self.outcome, self.predictor = self.split_data(dataframe) self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.predictor, self.outcome) - - self.pipe = Pipeline([('bag_of_words', CountVectorizer()), - ('bayes', MultinomialNB())]) + self.pipe = Pipeline([('bag_of_words', CountVectorizer()), ('bayes', self.get_algorithm(alg))]) self.fit() @@ -45,3 +53,11 @@ def predict(self, string): def 
split_data(self, data): return data.loc[:,0], data.loc[:,1] + + + def classification_report(self): + return classification_report(self.pipe.predict(self.X_test), self.y_test) + + def get_algorithm(self, algorithmchoice): + return self.algorithms[algorithmchoice[0].lower()] + diff --git a/Scrapper.py b/Scrapper.py index dc6a31f..9281b81 100644 --- a/Scrapper.py +++ b/Scrapper.py @@ -52,5 +52,31 @@ def scrape(num_links=30, drop_less_than=0, save=False): ndf = df[df[0] != 'text'] ndf = ndf.groupby(0).filter( lambda x: len(x) >= drop_less_than) if save: - ndf.to_pickle('filename_.pkl') + ndf.to_pickle('data/{}.pkl'.format(num_links)) return ndf + + +def scrape_filter(num_links=30, drop_less_than=0, save=False): + df = make_data(make_links_list(num_links)) + df = df[df[0] != 'text'] + df = df[(df[0] == 'ada') | (df[0] == 'clojure') | (df[0] == "algol68") | (df[0] == "awk") + | (df[0] == "bash") | (df[0] == "haskell") | (df[0] == "java") | (df[0] == "javascript") | (df[0] == "lisp") | (df[0] == "objc") | (df[0] == "ocaml") | (df[0] == "php") | (df[0] == "python") | (df[0] == "ruby") | (df[0] == "scala") | (df[0] == "scheme") | (df[0] == "tcl")] + df = df.groupby(0).filter(lambda x: len(x) >= drop_less_than) + if save: + name = "data/filtered_{}_of_{}.pkl".format(drop_less_than, num_links) + df.to_pickle(name) + return df + + +def scraper_filter_small(num_links=30, drop_less_than=0, save=False): + df = make_data(make_links_list(num_links)) + df = df[df[0] != 'text'] + df = df[(df[0] == 'clojure') | (df[0] == "haskell") | (df[0] == "java") | (df[0] == "javascript") | (df[0] == "ocaml") | (df[0] == "php") | (df[0] == "python") | (df[0] == "ruby") | (df[0] == "scala") | (df[0] == "scheme") | (df[0] == "tcl")] + df = df.groupby(0).filter(lambda x: len(x) >= drop_less_than) + if save: + name = "data/smaller_{}_of_{}.pkl".format(drop_less_than, num_links) + df.to_pickle(name) + return df + +def load_data(file_name): + return pd.read_pickle('data/{}.pkl'.format(file_name)) From 
4689a33757036534d1e5af833bbc5cf58582be31 Mon Sep 17 00:00:00 2001 From: Sovello Hildebrand Date: Mon, 8 Jun 2015 08:19:56 -0400 Subject: [PATCH 4/6] Updates to README.md on how the classifier works --- README.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/README.md b/README.md index 394f93b..df9f9a4 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,36 @@ +## Programming Language Classifier + +Given a code snippet, the classifier will try to predict what language it is. + +## Steps taken to develop the classifier +### Scrapper +1. Created a scrapper that would collect different code snippets from the web, specifically from [Rosetta Code](http://www.rosettacode.org/wiki/Rosetta_Code) +2. The scrapper collects these code snippets by different tasks for each of the programming languages. +3. For the scrapper to work, give it a minimum number of links to scrape and if you want to limit the languages retrieved by frequency, you can give it the minimum number of observations for the language to be in the classifier. Decide if you want to save the data frame or not + +`from Scrapper import scrape` + +`scrape(num_links, drop_less_than, save=(True,False))` + +You can decide to also load that dataframe into a variable so you can use it in the next steps or you can load the `.pkl` file using the number of links as the file name. + +### Classifier +To use the classifier, import the Learner class and use the Learner class to define a classifier object + +`from Learner import Learner` + +`from Scrapper import load_data` + +`dataframe = load_data(500)` + +`classifier = Learner(dataframe, alg)` + +`alg` can be any of MultinomialNB, RandomForestClassifier, KNeighborsClassifier, or Bernoulli + +Learner splits (`test_size .33`) and fits the data and makes available the methods `test_score()`, `train_score()`, `predict(string)` and `classification_report()` which you can call on the learner object directly. 
+ +### VectorFeaturizer + # Classify code snippets into programming languages ## Description From 31d48f0cc8df42c05fdcf7c8972805531534fc3c Mon Sep 17 00:00:00 2001 From: sovello Date: Mon, 8 Jun 2015 08:21:31 -0400 Subject: [PATCH 5/6] adds decisiontree classifier --- Featurizer.ipynb | 59 ++- Learner.py | 7 +- scraperstart.py | 957 ----------------------------------------------- 3 files changed, 61 insertions(+), 962 deletions(-) delete mode 100644 scraperstart.py diff --git a/Featurizer.ipynb b/Featurizer.ipynb index 24ebfd1..7c7e747 100644 --- a/Featurizer.ipynb +++ b/Featurizer.ipynb @@ -319,7 +319,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "metadata": { "collapsed": false }, @@ -330,14 +330,67 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 45, "metadata": { "collapsed": true }, "outputs": [], "source": [ - "df_small=" + "df_small= scraper_filter_small(700, 100, True)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "URLError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py\u001b[0m in \u001b[0;36mdo_open\u001b[0;34m(self, http_class, req, **http_conn_args)\u001b[0m\n\u001b[1;32m 1181\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1182\u001b[0;31m \u001b[0mh\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_method\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselector\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1183\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# timeout error\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, method, url, body, headers)\u001b[0m\n\u001b[1;32m 1087\u001b[0m \u001b[0;34m\"\"\"Send a complete request to the server.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1088\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1089\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py\u001b[0m in \u001b[0;36m_send_request\u001b[0;34m(self, method, url, body, headers)\u001b[0m\n\u001b[1;32m 1125\u001b[0m \u001b[0mbody\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'iso-8859-1'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1126\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mendheaders\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1127\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py\u001b[0m in 
\u001b[0;36mendheaders\u001b[0;34m(self, message_body)\u001b[0m\n\u001b[1;32m 1083\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mCannotSendHeader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1084\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_output\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessage_body\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1085\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py\u001b[0m in \u001b[0;36m_send_output\u001b[0;34m(self, message_body)\u001b[0m\n\u001b[1;32m 921\u001b[0m \u001b[0mmessage_body\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 922\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 923\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmessage_body\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 856\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauto_open\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 857\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 858\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py\u001b[0m in 
\u001b[0;36mconnect\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 833\u001b[0m self.sock = self._create_connection((self.host,self.port),\n\u001b[0;32m--> 834\u001b[0;31m self.timeout, self.source_address)\n\u001b[0m\u001b[1;32m 835\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/socket.py\u001b[0m in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address)\u001b[0m\n\u001b[1;32m 511\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merr\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 512\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 513\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/socket.py\u001b[0m in \u001b[0;36mcreate_connection\u001b[0;34m(address, timeout, source_address)\u001b[0m\n\u001b[1;32m 502\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource_address\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 503\u001b[0;31m \u001b[0msock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msa\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 504\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msock\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mOSError\u001b[0m: [Errno 50] Network is down", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mURLError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m 
\u001b[0mscrape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m700\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/Users/sovello/tiy2015/programming-language-classifier/Scrapper.py\u001b[0m in \u001b[0;36mscrape\u001b[0;34m(num_links, drop_less_than, save)\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mscrape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnum_links\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m30\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdrop_less_than\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msave\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 51\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmake_links_list\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnum_links\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 52\u001b[0m \u001b[0mndf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m'text'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[0mndf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mndf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfilter\u001b[0m\u001b[0;34m(\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>=\u001b[0m 
\u001b[0mdrop_less_than\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/sovello/tiy2015/programming-language-classifier/Scrapper.py\u001b[0m in \u001b[0;36mmake_data\u001b[0;34m(url_list)\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0mcode_snippets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0murl\u001b[0m \u001b[0;32min\u001b[0m \u001b[0murl_list\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0msoup_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mscrape_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0murl_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpull_code_from_soup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msoup_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0mcode_snippets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcode_snippets\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl_df\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_index\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/sovello/tiy2015/programming-language-classifier/Scrapper.py\u001b[0m in \u001b[0;36mscrape_data\u001b[0;34m(url)\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;32mdef\u001b[0m 
\u001b[0mscrape_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mreq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0murllib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mRequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'User-Agent'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Mozilla/5.0'\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 18\u001b[0;31m \u001b[0mcontent\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0murllib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0murlopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 19\u001b[0m \u001b[0msoup\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mBeautifulSoup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcontent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msoup\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind_all\u001b[0m\u001b[0;34m(\u001b[0m \u001b[0;34m\"pre\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclass_\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"highlighted_source\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(url, data, timeout, cafile, capath, cadefault, context)\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[0mopener\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0m_opener\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 161\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mopener\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 162\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0minstall_opener\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopener\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py\u001b[0m in \u001b[0;36mopen\u001b[0;34m(self, fullurl, data, timeout)\u001b[0m\n\u001b[1;32m 461\u001b[0m \u001b[0mreq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmeth\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 462\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 463\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 464\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 465\u001b[0m \u001b[0;31m# post-process response\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py\u001b[0m in \u001b[0;36m_open\u001b[0;34m(self, req, data)\u001b[0m\n\u001b[1;32m 479\u001b[0m \u001b[0mprotocol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 480\u001b[0m result = self._call_chain(self.handle_open, protocol, 
protocol +\n\u001b[0;32m--> 481\u001b[0;31m '_open', req)\n\u001b[0m\u001b[1;32m 482\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 483\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py\u001b[0m in \u001b[0;36m_call_chain\u001b[0;34m(self, chain, kind, meth_name, *args)\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mhandler\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mhandlers\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 440\u001b[0m \u001b[0mfunc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhandler\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmeth_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 441\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 442\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 443\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py\u001b[0m in \u001b[0;36mhttp_open\u001b[0;34m(self, req)\u001b[0m\n\u001b[1;32m 1208\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1209\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mhttp_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1210\u001b[0;31m 
\u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdo_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhttp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclient\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mHTTPConnection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1211\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1212\u001b[0m \u001b[0mhttp_request\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAbstractHTTPHandler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdo_request_\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py\u001b[0m in \u001b[0;36mdo_open\u001b[0;34m(self, http_class, req, **http_conn_args)\u001b[0m\n\u001b[1;32m 1182\u001b[0m \u001b[0mh\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_method\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselector\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1183\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# timeout error\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1184\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mURLError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1185\u001b[0m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetresponse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1186\u001b[0m 
\u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mURLError\u001b[0m: " + ] + } + ], + "source": [ + "scrape(700, 100, True)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/Learner.py b/Learner.py index 93d0075..4734ac0 100644 --- a/Learner.py +++ b/Learner.py @@ -14,6 +14,8 @@ from sklearn.ensemble import RandomForestClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.metrics import classification_report, confusion_matrix +from sklearn.tree import DecisionTreeClassifier + class Learner(object): algorithms = {'m':MultinomialNB(), @@ -22,7 +24,8 @@ class Learner(object): 'f':RandomForestClassifier(), 'k':KNeighborsClassifier(), 'n':KNeighborsClassifier(), - 'p':MultinomialNB() + 'p':MultinomialNB(), + 'd':DecisionTreeClassifier() } '''Takes a dataframe with outcomes on first column and predictor second column @@ -30,7 +33,7 @@ class Learner(object): ''' def __init__(self, dataframe, alg='NBayes'): self.outcome, self.predictor = self.split_data(dataframe) - self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.predictor, self.outcome) + self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.predictor, self.outcome, test_size=0.33) self.pipe = Pipeline([('bag_of_words', CountVectorizer()), ('bayes', self.get_algorithm(alg))]) self.fit() diff --git a/scraperstart.py b/scraperstart.py deleted file mode 100644 index c77c6d1..0000000 --- a/scraperstart.py +++ /dev/null @@ -1,957 +0,0 @@ -from bs4 import BeautifulSoup -import requests -import urllib -from re import findall -import pandas as pd - - -# C (.gcc, .c) -# C# -# Common Lisp (.sbcl) -# Clojure -# Haskell -# Java -# JavaScript -# OCaml -# Perl -# PHP (.hack, .php) -# Python -# Ruby (.jruby, .yarv) -# Scala -# Scheme (.racket) - -languages_list = ['ACL2', - 'Ada', - 'Aime', - 'ALGOL 
68', - 'AppleScript', - 'AutoHotkey', - 'AutoIt', - 'AWK', - 'BASIC', - 'BBC BASIC', - 'bc', - 'Brat', - 'C', - 'C++', - 'C#', - 'Clojure', - 'COBOL', - 'CMake', - 'CoffeeScript', - 'Common Lisp', - 'D', - 'Delphi', - 'DWScript', - 'E', - 'Eiffel', - 'Erlang', - 'ERRE', - 'Euphoria', - 'Factor', - 'Fantom', - 'Forth', - 'Fortran', - 'Frink', - 'F#', - 'FunL', - 'GAP', - 'Go', - 'Groovy', - 'Haskell', - 'Icon and Unicon', - 'Inform 6', - 'J', - 'Java', - 'JavaScript', - 'Joy', - 'Julia', - 'LabVIEW', - 'Lasso', - 'Liberty BASIC', - 'Logo', - 'Lua', - 'M4', - 'Mathematica', - 'MATLAB', - 'Maxima', - 'Modula-3', - 'MUMPS', - 'Nemerle', - 'NetRexx', - 'Nim', - 'Objective-C', - 'OCaml', - 'Oforth', - 'Oz', - 'PARI/GP', - 'Pascal', - 'Perl', - 'Perl 6', - 'PHP', - 'PicoLisp', - 'PL/I', - 'PowerShell', - 'PureBasic', - 'Python', - 'R', - 'Racket', - 'REBOL', - 'REXX', - 'Ruby', - 'Run BASIC', - 'Rust', - 'Scala', - 'Scratch', - 'Seed7', - 'Sidef', - 'Smalltalk', - 'SNOBOL4', - 'Swift', - 'Tcl', - 'TI-83 BASIC', - 'TUSCRIPT', - 'UNIX Shell', - 'Ursala', - 'VBScript', - 'Vedit macro language', - 'zkl'] - -task_list = ['24 game', - '24 game/Solve9', - '9 billion names of God the integer', - '99 Bottles of BeerA', - 'A+B', - 'ABC Problem', - 'Abstract type', - 'Abundant, deficient and perfect number classifications', - 'Accumulator factory', - 'Ackermann function', - 'Active Directory/Connect', - 'Active Directory/Search for a user', - 'Active object', - 'Add a variable to a class instance at runtime', - 'Address of a variable', - 'AKS test for primes', - 'Align columns', - 'Aliquot sequence classifications', - 'Almost prime', - 'Amb', - 'Amicable pairs', - 'Anagrams', - 'Anagrams/Deranged anagrams', - 'Animate a pendulum', - 'Animation', - 'Anonymous recursion', - 'Append a record to the end of a text file', - 'Apply a callback to an array', - 'Arbitrary-precision integers (included)', - 'Arena storage pool', - 'Arithmetic evaluation', - 'Arithmetic-geometric mean', - 
'Arithmetic-geometric mean/Calculate Pi', - 'Arithmetic/Complex', - 'Arithmetic/Integer', - 'Arithmetic/Rational', - 'Array concatenation', - 'Arrays', - 'Assertions', - 'Associative array/Creation', - 'Associative array/Iteration', - 'Atomic updates', - 'Average loop length', - 'Averages/Arithmetic mean', - 'Averages/Mean angle', - 'Averages/Mean time of day', - 'Averages/Median', - 'Averages/Mode', - 'Averages/Pythagorean means', - 'Averages/Root mean square', - 'Averages/Simple moving averageB', - 'Balanced brackets', - 'Balanced ternary', - "Benford's law", - 'Bernoulli numbers', - 'Best shuffle', - 'Binary digits', - 'Binary search', - 'Binary strings', - 'Bitcoin/address validation', - 'Bitcoin/public point to address', - 'Bitmap', - "Bitmap/Bresenham's line algorithm", - 'Bitmap/Bézier curves/Cubic', - 'Bitmap/Bézier curves/Quadratic', - 'Bitmap/Flood fill', - 'Bitmap/Histogram', - 'Bitmap/Midpoint circle algorithm', - 'Bitmap/PPM conversion through a pipe', - 'Bitmap/Read a PPM file', - 'Bitmap/Read an image through a pipe', - 'Bitmap/Write a PPM file', - 'Bitwise IO', - 'Bitwise operations', - 'Boolean values', - 'Box the compass', - 'Break OO privacy', - 'Brownian tree', - 'Bulls and cows', - 'Bulls and cows/PlayerC', - 'Caesar cipher', - 'Calendar', - 'Calendar - for "REAL" programmers', - 'Call a foreign-language function', - 'Call a function', - 'Call a function in a shared library', - 'Call an object method', - 'Canny edge detector', - 'Carmichael 3 strong pseudoprimes', - 'Case-sensitivity of identifiers', - 'Casting out nines', - 'Catalan numbers', - "Catalan numbers/Pascal's triangle", - 'Catamorphism', - 'Catmull–Clark subdivision surface', - 'Character codes', - 'Chat server', - 'Check Machin-like formulas', - 'Check that file exists', - 'Checkpoint synchronization', - 'Chinese remainder theorem', - 'Cholesky decomposition', - 'Circles of given radius through two points', - 'Classes', - 'Closest-pair problem', - 'Closures/Value capture', - 
'Collections', - 'Color of a screen pixel', - 'Color quantization', - 'Colour bars/Display', - 'Colour pinstripe/Display', - 'Colour pinstripe/Printer', - 'Combinations', - 'Combinations and permutations', - 'Combinations with repetitions', - 'Comma quibbling', - 'Command-line arguments', - 'Comments', - "Compare sorting algorithms' performance", - 'Compile-time calculation', - 'Compound data type', - 'Concurrent computing', - 'Conditional structures', - 'Conjugate transpose', - 'Constrained genericity', - 'Constrained random points on a circle', - 'Continued fraction', - 'Continued fraction/Arithmetic/Construct from rational number', - 'Continued fraction/Arithmetic/G(matrix NG, Contined Fraction N)', - 'Continued fraction/Arithmetic/G(matrix NG, Contined Fraction N1, Contined Fraction N2)', - 'Convert decimal number to rational', - "Conway's Game of Life", - 'Copy a string', - 'Count in factors', - 'Count in octal', - 'Count occurrences of a substring', - 'Count the coins', - 'CRC-32', - 'Create a file', - 'Create a file on magnetic tape', - 'Create a two-dimensional array at runtime', - 'Create an HTML table', - 'Create an object at a given address', - 'CSV data manipulation', - 'CSV to HTML translation', - 'Currying', - 'Cut a rectangleD', - 'Date format', - 'Date manipulation', - 'Day of the week', - 'Deal cards for FreeCell', - 'Death Star', - 'Deconvolution/1D', - 'Deconvolution/2D+', - 'Deepcopy', - 'Define a primitive data type', - 'Delegates', - 'Delete a file', - 'Detect division by zero', - 'Determine if a string is numeric', - 'Determine if only one instance is running', - 'Digital root', - 'Digital root/Multiplicative digital root', - "Dinesman's multiple-dwelling problem", - 'Dining philosophers', - 'Discordian date', - 'Distributed programming', - 'DNS query', - 'Documentation', - 'Dot product', - 'Doubly-linked list/Definition', - 'Doubly-linked list/Element definition', - 'Doubly-linked list/Element insertion', - 'Doubly-linked list/Traversal', - 
'Dragon curve', - 'Draw a clock', - 'Draw a cuboid', - 'Draw a sphere', - 'Dutch national flag problem', - 'Dynamic variable namesE', - 'Echo server', - 'Element-wise operations', - 'Empty directory', - 'Empty program', - 'Empty string', - 'Enforced immutability', - 'Entropy', - 'Enumerations', - 'Environment variables', - 'Equilibrium index', - 'Ethiopian multiplication', - 'Euler method', - "Euler's sum of powers conjecture", - 'Evaluate binomial coefficients', - 'Even or odd', - 'Events', - 'Evolutionary algorithm', - 'Exceptions', - 'Exceptions/Catch an exception thrown in a nested call', - 'Executable library', - 'Execute a Markov algorithm', - 'Execute a system command', - 'Execute Brain****', - 'Execute HQ9+', - 'Execute SNUSP', - 'Exponentiation operator', - 'Extend your language', - 'Extensible prime generator', - 'Extreme floating point valuesF', - 'Factorial', - 'Factors of a Mersenne number', - 'Factors of an integer', - 'Fast Fourier transform', - 'Fibonacci n-step number sequences', - 'Fibonacci sequence', - 'Fibonacci word', - 'Fibonacci word/fractal', - 'File input/output', - 'File modification time', - 'File size', - 'Filter', - 'Find common directory path', - 'Find largest left truncatable prime in a given base', - 'Find limit of recursion', - 'Find the last Sunday of each month', - 'Find the missing permutation', - 'First class environments', - 'First-class functions', - 'First-class functions/Use numbers analogously', - 'Five weekends', - 'FizzBuzz', - 'Flatten a list', - 'Flipping bits game', - 'Flow-control structures', - "Floyd's triangle", - 'Forest fire', - 'Fork', - 'Formal power series', - 'Formatted numeric output', - 'Forward difference', - 'Four bit adder', - 'Fractal tree', - 'Fractran', - 'Function composition', - 'Function definition', - 'Function frequency', - 'Function prototypeG', - 'Galton box animation', - 'Gamma function', - 'Gaussian elimination', - 'Generate Chess960 starting position', - 'Generate lower case ASCII 
alphabet', - 'Generator/Exponential', - 'Generic swap', - 'Globally replace text in several files', - 'Go Fish', - 'G cont.', - 'Gray code', - 'Grayscale image', - 'Greatest common divisor', - 'Greatest element of a list', - 'Greatest subsequential sum', - 'Greyscale bars/Display', - 'Guess the number', - 'Guess the number/With feedback', - 'Guess the number/With feedback (player)', - 'GUI component interaction', - 'GUI enabling/disabling of controls', - 'GUI/Maximum window dimensionsH', - 'Hailstone sequence', - 'Hamming numbers', - 'Handle a signal', - 'Happy numbers', - 'Harshad or Niven series', - 'Hash from two arrays', - 'Hash join', - 'Haversine formula', - 'Hello world/Graphical', - 'Hello world/Line printer', - 'Hello world/Newbie', - 'Hello world/Newline omission', - 'Hello world/Standard error', - 'Hello world/Text', - 'Hello world/Web server', - 'Here document', - 'Heronian triangles', - 'Hickerson series of almost integers', - 'Higher-order functions', - 'History variables', - 'Hofstadter Figure-Figure sequences', - 'Hofstadter Q sequence', - 'Hofstadter-Conway $10,000 sequence', - 'Holidays related to Easter', - 'Honeycombs', - 'Horizontal sundial calculations', - "Horner's rule for polynomial evaluation", - 'Host introspection', - 'Hostname', - 'Hough transform', - 'HTTP', - 'HTTPS', - 'HTTPS/Authenticated', - 'HTTPS/Client-authenticated', - 'Huffman codingI', - 'I before E except after C', - 'IBAN', - 'Identity matrix', - 'Image convolution', - 'Image noise', - 'Include a file', - 'Increment a numerical string', - 'Infinity', - 'Inheritance/Multiple', - 'Inheritance/Single', - 'Input loop', - 'Integer comparison', - 'Integer overflow', - 'Integer sequence', - 'Interactive programming', - 'Introspection', - 'Inverted index', - 'Inverted syntax', - 'Iterated digits squaringJ', - "Jensen's Device", - 'JortSort', - 'Josephus problem', - 'Joystick position', - 'JSON', - 'Jump anywhereK', - 'K-d tree', - 'K-means++ clustering', - 'Kaprekar numbers', - 
'Keyboard input/Flush the keyboard buffer', - 'Keyboard input/Keypress check', - 'Keyboard input/Obtain a Y or N response', - 'Keyboard macros', - 'Knapsack problem/0-1', - 'Knapsack problem/Bounded', - 'Knapsack problem/Continuous', - 'Knapsack problem/Unbounded', - "Knight's tour", - 'Knuth shuffle', - "Knuth's algorithm SL", - "Langton's ant", - 'Largest int from concatenated ints', - 'Last Friday of each month', - 'Last letter-first letter', - 'Leap year', - 'Least common multiple', - 'Left factorials', - 'Letter frequency', - 'Levenshtein distance', - 'Linear congruential generator', - 'List comprehensions', - 'Literals/Floating point', - 'Literals/Integer', - 'Literals/String', - 'Logical operations', - 'Long multiplication', - 'Longest common subsequence', - 'Longest increasing subsequence', - 'Longest string challenge', - 'Look-and-say sequence', - 'Loop over multiple arrays simultaneously', - 'Loops/Break', - 'Loops/Continue', - 'Loops/Do-while', - 'Loops/Downward for', - 'Loops/For', - 'Loops/For with a specified step', - 'Loops/Foreach', - 'Loops/Infinite', - 'Loops/N plus one half', - 'Loops/Nested', - 'Loops/While', - 'LU decomposition', - 'Lucas-Lehmer test', - 'Ludic numbers', - 'Luhn test of credit card numbers', - 'LZW compressionM', - 'Machine code', - 'Mad Libs', - 'Magic squares of odd order', - 'Main step of GOST 28147-89', - 'Make directory path', - 'Man or boy test', - 'Mandelbrot set', - 'Map range', - 'Matrix arithmetic', - 'Matrix multiplication', - 'Matrix transposition', - 'Matrix-exponentiation operator', - 'Maximum triangle path sum', - 'Maze generation', - 'Maze solving', - 'MD4', - 'MD5', - 'MD5/Implementation', - 'Median filter', - 'Memory allocation', - 'Memory layout of a data structure', - 'Menu', - 'Metaprogramming', - 'Metered concurrency', - 'Metronome', - 'Middle three digits', - 'Miller-Rabin primality test', - 'Minesweeper game', - 'Modular exponentiation', - 'Modular inverse', - 'Monte Carlo methods', - 'Monty Hall 
problem', - 'Morse code', - 'Mouse position', - 'Move-to-front algorithm', - 'Multifactorial', - 'Multiple distinct objects', - 'Multiple regression', - 'Multiplication tables', - 'Multiplicative order', - 'Multisplit', - 'Munching squares', - 'Mutual recursionN', - "N'th", - 'N-queens problem', - 'Named parameters', - 'Narcissist', - 'Narcissistic decimal number', - 'Natural sorting', - 'Nautical bell', - 'Non-continuous subsequences', - 'Non-decimal radices/Convert', - 'Non-decimal radices/Input', - 'Non-decimal radices/Output', - 'Nth root', - 'Null object', - 'Number names', - 'Number reversal game', - 'Numeric error propagation', - 'Numerical integration', - 'Numerical integration/Gauss-Legendre QuadratureO', - 'Object serialization', - 'Odd word problem', - 'Old lady swallowed a fly', - 'OLE Automation', - 'One of n lines in a file', - 'One-dimensional cellular automata', - 'OpenGL', - 'Operator precedence', - 'Optional parameters', - 'Order disjoint list items', - 'Order two numerical lists', - 'Ordered Partitions', - 'Ordered wordsP', - 'Palindrome detection', - 'Pangram checker', - 'Paraffins', - 'Parallel calculations', - 'Parametric polymorphism', - 'Parametrized SQL statement', - 'Parse an IP Address', - 'Parsing/RPN calculator algorithm', - 'Parsing/RPN to infix conversion', - 'Parsing/Shunting-yard algorithm', - 'Partial function application', - 'Pascal matrix generation', - "Pascal's triangle", - "Pascal's triangle/Puzzle", - 'Pattern matching', - "Penney's game", - 'Percentage difference between images', - 'Percolation/Bond percolation', - 'Percolation/Mean cluster density', - 'Percolation/Mean run density', - 'Percolation/Site percolation', - 'Perfect numbers', - 'Permutation test', - 'Permutations', - 'Permutations by swapping', - 'Permutations/Derangements', - 'Permutations/Rank of a permutation', - 'Pernicious numbers', - 'Phrase reversals', - 'Pi', - 'Pick random element', - 'Pig the dice game', - 'Pig the dice game/Player', - 
'Pinstripe/Display', - 'Pinstripe/Printer', - 'Play recorded sounds', - 'Playing cards', - 'Plot coordinate pairs', - 'Pointers and references', - 'Polymorphic copy', - 'Polymorphism', - 'Polynomial long division', - 'Polynomial regression', - 'Power set', - 'Pragmatic directives', - 'Price fraction', - 'Primality by trial division', - 'Prime decomposition', - 'Primes - allocate descendants to their ancestors', - 'Priority queue', - 'Probabilistic choice', - 'Problem of Apollonius', - 'Program name', - 'Program termination', - 'Pythagorean triplesQ', - 'QR decomposition', - 'Quaternion type', - 'Queue/Definition', - 'Queue/Usage', - 'Quickselect algorithm', - 'Q cont.', - 'QuineR', - 'Random number generator (device)', - 'Random number generator (included)', - 'Random numbers', - 'Range expansion', - 'Range extraction', - 'Ranking methods', - 'Rate counter', - 'Ray-casting algorithm', - 'RCRPG', - 'Read a configuration file', - 'Read a file line by line', - 'Read a specific line from a file', - 'Read entire file', - 'Real constants and functions', - 'Record sound', - 'Reduced row echelon form', - 'Regular expressions', - 'Remove duplicate elements', - 'Remove lines from a file', - 'Rename a file', - 'Rendezvous', - 'Rep-string', - 'Repeat a string', - 'Resistor mesh', - 'Respond to an unknown method call', - 'Return multiple values', - 'Reverse a string', - 'Reverse words in a string', - 'RIPEMD-160', - 'Rock-paper-scissors', - 'Roman numerals/Decode', - 'Roman numerals/Encode', - 'Roots of a function', - 'Roots of a quadratic function', - 'Roots of unity', - 'Rosetta Code/Count examples', - 'Rosetta Code/Find bare lang tags', - 'Rosetta Code/Find unimplemented tasks', - 'Rosetta Code/Fix code tags', - 'Rosetta Code/Rank languages by popularity', - 'Rot-13', - 'RSA code', - 'Run-length encoding', - 'Runge-Kutta method', - 'Runtime evaluation', - 'Runtime evaluation/In an environmentS', - 'S-Expressions', - 'Safe addition', - 'Sailors, coconuts and a monkey 
problem', - 'Same Fringe', - 'Scope modifiers', - 'Scope/Function names and labels', - 'Search a list', - 'Secure temporary file', - 'SEDOLs', - 'Self-describing numbers', - 'Self-referential sequence', - 'Semiprime', - 'Semordnilap', - 'Send an unknown method call', - 'Send email', - 'Sequence of non-squares', - 'Sequence of primes by Trial Division', - 'Set', - 'Set consolidation', - 'Set of real numbers', - 'Set puzzle', - 'Seven-sided dice from five-sided dice', - 'SHA-1', - 'SHA-256', - 'Shell one-liner', - 'Short-circuit evaluation', - 'Show the epoch', - 'Sierpinski carpet', - 'Sierpinski triangle', - 'Sierpinski triangle/Graphical', - 'Sieve of Eratosthenes', - 'Simple database', - 'Simple windowed application', - 'Simulate input/Keyboard', - 'Simulate input/Mouse', - 'Singleton', - 'Singly-linked list/Element definition', - 'Singly-linked list/Element insertion', - 'Singly-linked list/Traversal', - 'Sleep', - 'SOAP', - 'Sockets', - 'Sokoban', - 'Solve a Hidato puzzle', - "Solve a Holy Knight's tour", - 'Solve a Hopido puzzle', - 'Solve a Numbrix puzzle', - 'Solve the no connection puzzle', - 'Sort an array of composite structures', - 'Sort an integer array', - 'Sort disjoint sublist', - 'Sort stability', - 'Sort using a custom comparator', - 'Sorting algorithms/Bead sort', - 'Sorting algorithms/Bogosort', - 'Sorting algorithms/Bubble sort', - 'Sorting algorithms/Cocktail sort', - 'Sorting algorithms/Comb sort', - 'Sorting algorithms/Counting sort', - 'Sorting algorithms/Gnome sort', - 'Sorting algorithms/Heapsort', - 'Sorting algorithms/Insertion sort', - 'Sorting algorithms/Merge sort', - 'Sorting algorithms/Pancake sort', - 'Sorting algorithms/Permutation sort', - 'Sorting algorithms/Quicksort', - 'Sorting algorithms/Radix sort', - 'Sorting algorithms/Selection sort', - 'Sorting algorithms/Shell sort', - 'Sorting algorithms/Sleep sort', - 'Sorting algorithms/Stooge sort', - 'Sorting algorithms/Strand sort', - 'Soundex', - 'Sparkline in unicode', - 
'Special characters', - 'Special variables', - 'Speech synthesis', - 'Spiral matrix', - 'SQL-based authentication', - 'Stable marriage problem', - 'Stack', - 'Stack traces', - 'Stair-climbing puzzle', - 'Standard deviation', - 'Start from a main routine', - 'State name puzzle', - 'Statistics/Basic', - 'Stem-and-leaf plot', - 'Stern-Brocot sequence', - 'String append', - 'String case', - 'String comparison', - 'String concatenation', - 'String interpolation (included)', - 'String length', - 'String matching', - 'String prepend', - 'Strip a set of characters from a string', - 'Strip block comments', - 'Strip comments from a string', - 'Strip control codes and extended characters from a string', - 'Strip whitespace from a string/Top and tail', - 'Subleq', - 'Substring', - 'Substring/Top and tail', - 'Subtractive generator', - 'Sudoku', - 'Sum and product of an array', - 'Sum digits of an integer', - 'Sum multiples of 3 and 5', - 'Sum of a series', - 'Sum of squares', - 'Sutherland-Hodgman polygon clipping', - 'Symmetric difference', - 'Synchronous concurrency', - 'System timeT', - 'Table creation/Postal addresses', - 'Take notes on the command line', - 'Temperature conversion', - 'Terminal control/Clear the screen', - 'Terminal control/Coloured text', - 'Terminal control/Cursor movement', - 'Terminal control/Cursor positioning', - 'Terminal control/Dimensions', - 'Terminal control/Display an extended character', - 'Terminal control/Hiding the cursor', - 'Terminal control/Inverse video', - 'Terminal control/Positional read', - 'Terminal control/Preserve screen', - 'Terminal control/Ringing the terminal bell', - 'Terminal control/Unicode output', - 'Ternary logic', - 'Test a function', - 'Text processing/1', - 'Text processing/2', - 'Text processing/Max licenses in use', - 'Textonyms', - 'The ISAAC Cipher', - 'The Twelve Days of Christmas', - "Thiele's interpolation formula", - 'Tic-tac-toe', - 'Time a function', - 'Tokenize a string', - 'Top rank per group', - 'Topic 
variable', - 'Topological sort', - 'Topswops', - 'Total circles area', - 'Towers of Hanoi', - 'Trabb Pardo–Knuth algorithm', - 'Tree traversal', - 'Trigonometric functions', - 'Truncatable primes', - 'Truncate a file', - 'Twelve statementsU', - 'Ulam spiral (for primes)', - 'Unbias a random generator', - 'Undefined values', - 'Unicode strings', - 'Unicode variable names', - 'Universal Turing machine', - 'Unix/ls', - 'Update a configuration file', - 'URL decoding', - 'URL encoding', - 'Use another language to call a function', - 'User input/Graphical', - 'User input/TextV', - 'Vampire number', - 'Van der Corput sequence', - 'Variable size/Get', - 'Variable size/Set', - 'Variable-length quantity', - 'Variables', - 'Variadic function', - 'Vector products', - 'Verify distribution uniformity/Chi-squared test', - 'Verify distribution uniformity/Naive', - 'Video display modes', - 'Vigenère cipher', - 'Vigenère cipher/Cryptanalysis', - 'Visualize a tree', - "Vogel's approximation method", - 'Voronoi diagramW', - 'Walk a directory/Non-recursively', - 'Walk a directory/Recursively', - 'Web scraping', - 'Window creation', - 'Window creation/X11', - 'Window management', - 'Wireworld', - 'Word wrap', - 'World Cup group stage', - 'Write float arrays to a text file', - 'Write language name in 3D ASCII', - 'Write to Windows event logX', - "Xiaolin Wu's line algorithm", - 'XML/DOM serialization', - 'XML/Input', - 'XML/Output', - 'XML/XPathY', - 'Y combinator', - 'Yahoo! 
search interface', - 'Yin and yangZ', - 'Zebra puzzle', - 'Zeckendorf arithmetic', - 'Zeckendorf number representation', - 'Zero to the zero power', - 'Zhang-Suen thinning algorithm', - 'Zig-zag matrix'] - - - - - -def get_text(url): - """Takes a url and returns text""" - req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'}) - content = urllib.request.urlopen(req).read() - page_text=BeautifulSoup(content) - return page_text.get_text() - -# def scrape_text(text): -# data_crop = findall("[EDIT] \n.+\n", text) -# return data_crop - - -def scrape_text(text): - """Takes text from get_text and returns a list of tuples with - language in [0] and code in [1]""" - data_crop = findall(r"edit] (.+)\n(.+)\n", text) - return data_crop - ##Should maybe grab all of the text - -def scrape_links(): - """Creates list of links to use with create_url to gather code.""" - with open ("links_list.txt", "r") as myfile: - data=myfile.read() - return findall(r"wiki/(.+)\" ti", data) - - -def create_url_for_scraping(task_string): - return "http://www.rosettacode.org/wiki/{}".format(task_string) - -language_start = ["C", "C#", "Common Lisp", "Clojure", "Haskell", - "Java", "JavaScript", "OCaml", "Perl", "PHP", - "Python", "Ruby", "Scala", "Scheme"] - - -#def make_data(languages=language_start, num_links=50) - #grab data for all of the links in the task list - #go through for each of the languages and grab the associated - #code - #return a df with the code you need in a column and the type of - #code as the index - - -def scrape_data(url): - req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'}) - content = urllib.request.urlopen(req).read() - soup = BeautifulSoup(content) - return soup.find_all( "pre", class_="highlighted_source") - #pre is an html tag. 
We want all text from pre with class highlighted_source - #returns a list of soup objects - - -def pull_code_from_soup(soup_list): - return [[soup_list[i]['class'][0], soup_list[i].get_text()] for i in range(len(soup_list))] - - -def make_data(url_list): - code_snippets = pd.DataFrame(columns=([0, 1])) - for program in url_list: - soup_list = scrape_data(create_url_for_scraping(program)) - code_snippets = code_snippets.append(pd.DataFrame(pull_code_from_soup(soup_list)), ignore_index=True) - return code_snippets \ No newline at end of file From e0879fbc2f856efdd1c72effb734c1deffac190f Mon Sep 17 00:00:00 2001 From: Sovello Hildebrand Date: Tue, 9 Jun 2015 10:27:10 -0400 Subject: [PATCH 6/6] Update README.md --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index df9f9a4..543101e 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,10 @@ To use the classifier, import the Learner class and use the Learner class to def Learner splits (`test_size .33`) and fits the data and makes availabe the methods `test_score()`, `train_score()`, `predict(string)` and `classification_report()` which you can call on the learner object directly. ### VectorFeaturizer +The vector featurizer creates more specific feature vectors to increase the accuracy of the classifier. It mainly adds specific character classes that distinguish different languages more explicitly because most other general features are common among languages. + +### Passing the tests +Need to import the test files and test them against the data scraped from RosettaCode # Classify code snippets into programming languages