Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3,953 changes: 3,953 additions & 0 deletions Ideas.ipynb

Large diffs are not rendered by default.

58 changes: 58 additions & 0 deletions More ideas.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#from stack overflow\n",
"import urllib2\n",
"from BeautifulSoup import BeautifulSoup\n",
"# or if you're using BeautifulSoup4:\n",
"# from bs4 import BeautifulSoup\n",
"\n",
"soup = BeautifulSoup(urllib2.urlopen('http://example.com').read())\n",
"\n",
"for row in soup('table', {'class': 'spad'})[0].tbody('tr'):\n",
" tds = row('td')\n",
" print tds[0].string, tds[1].string\n",
" # will print date and sunrise"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
114 changes: 114 additions & 0 deletions New_Trial.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import csv\n",
"import re\n",
"import numpy as np\n",
"import random\n",
"\n",
"from collections import Counter\n",
"\n",
"from sklearn.pipeline import make_pipeline, make_union\n",
"from sklearn.base import TransformerMixin\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.cross_validation import train_test_split\n",
"from sklearn.metrics import classification_report, confusion_matrix"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def longest_run_of_capitol_letters_feature(char, text):\n",
"    \"\"\"Return the length of the longest run of the given character in text.\"\"\"\n",
" if char == '~':\n",
" runs = sorted(re.findall(r\"~+\", text), key=len)\n",
" elif char == '.':\n",
" runs = sorted(re.findall(r\"\\.+\", text), key=len)\n",
" elif char == '|':\n",
" runs = sorted(re.findall(r\"\\|+\", text), key=len)\n",
" elif char == ':':\n",
" runs = sorted(re.findall(r\"\\:+\", text), key=len)\n",
" elif char == ';':\n",
" runs = sorted(re.findall(r\";+\", text), key=len)\n",
" elif char == '$':\n",
" runs = sorted(re.findall(r\"\\$+\", text), key=len)\n",
" elif char == '(':\n",
" runs = sorted(re.findall(r\"\\(+\", text), key=len)\n",
" elif char == ')':\n",
" runs = sorted(re.findall(r\"\\)+\", text), key=len)\n",
" elif char == '-':\n",
" runs = sorted(re.findall(r\"\\-+\", text), key=len)\n",
" if runs:\n",
" return len(runs[-1])\n",
" else:\n",
" return 0\n",
"\n",
"def longest_run_of_character_feature(text):\n",
"    \"\"\"Find the longest run of capital letters and return its length.\"\"\"\n",
" runs = sorted(re.findall(r\"[A-Z]+\", text), key=len)\n",
" if runs:\n",
" return len(runs[-1])\n",
" else:\n",
" return 0 \n",
" \n",
"def percent_character_feature(char, text):\n",
" \"\"\"Return percentage of text that is a particular char compared to total text length.\"\"\"\n",
" def feature_fn(text):\n",
" periods = text.count(char)\n",
" return periods / len(text)\n",
" return feature_fn\n",
"\n",
"class FunctionFeaturizer(TransformerMixin):\n",
" def __init__(self, *featurizers):\n",
" self.featurizers = featurizers\n",
" \n",
" def fit(self, X, y=None):\n",
" \"\"\"All SciKit-Learn compatible transformers and classifiers have the\n",
" same interface. `fit` always returns the same object.\"\"\"\n",
" return self\n",
" \n",
" def transform(self, X):\n",
" \"\"\"Given a list of original data, return a list of feature vectors.\"\"\"\n",
" fvs = []\n",
" for datum in X:\n",
" fv = [f(datum) for f in self.featurizers]\n",
" fvs.append(fv)\n",
" return np.array(fvs)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
47 changes: 47 additions & 0 deletions Untitled.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from programming_language_classifier import *"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = scrape_and_clean"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Empty file added data_scrape.py
Empty file.
Empty file added final.py
Empty file.
129 changes: 129 additions & 0 deletions programming_language_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import random
import urllib
import urllib.request
from re import findall

from bs4 import BeautifulSoup
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import cross_val_score

# C (.gcc, .c)
# C#
# Common Lisp (.sbcl)
# Clojure
# Haskell
# Java
# JavaScript
# OCaml
# Perl
# PHP (.hack, .php)
# Python
# Ruby (.jruby, .yarv)
# Scala
# Scheme (.racket)

# def get_text(url):
# """Takes a url and returns text"""
# req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
# content = urllib.request.urlopen(req).read()
# page_text=BeautifulSoup(content)
# return page_text.get_text()

# def scrape_text(text):
# data_crop = findall("[EDIT] \n.+\n", text)
# return data_crop


# def scrape_text(text):
# """Takes text from get_text and returns a list of tuples with
# language in [0] and code in [1]"""
# data_crop = findall(r"edit] (.+)\n(.+)\n", text)
# return data_crop
# ##Should maybe grab all of the text
#
# def scrape_links():
# """Creates list of links to use with create_url to gather code."""
# with open ("links_list.txt", "r") as myfile:
# data=myfile.read()
# return findall(r"wiki/(.+)\" ti", data)


# def create_url_for_scraping(task_string):
# return "http://www.rosettacode.org{}".format(task_string)

# Labels of the languages this project aims to classify (matches the
# comment block above).  NOTE(review): not referenced by any function in
# this module — presumably consumed by the notebooks; confirm before use.
language_start = ["C", "C#", "Common Lisp", "Clojure", "Haskell",
                  "Java", "JavaScript", "OCaml", "Perl", "PHP",
                  "Python", "Ruby", "Scala", "Scheme"]


def scrape_data(url):
    """Fetch *url* and return its highlighted source-code blocks.

    Returns a list of BeautifulSoup Tag objects, one per
    ``<pre class="highlighted_source">`` element on the page.
    """
    # Rosetta Code rejects requests without a browser-like User-Agent.
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    content = urllib.request.urlopen(req).read()
    # Name the parser explicitly: BeautifulSoup(content) alone emits a
    # "no parser specified" warning and picks whichever parser is installed.
    soup = BeautifulSoup(content, 'html.parser')
    return soup.find_all("pre", class_="highlighted_source")


def pull_code_from_soup(soup_list):
    """Extract ``[language, code]`` pairs from highlighted-source tags.

    Each tag's first CSS class names the language (e.g. ``python``) and
    its text content is the code snippet.  Returns a list of two-element
    lists, one per tag.
    """
    # Iterate the tags directly instead of indexing via range(len(...)).
    return [[tag['class'][0], tag.get_text()] for tag in soup_list]


def make_data(url_list):
    """Scrape every URL in *url_list* and collect snippets in one DataFrame.

    Column 0 holds the language label, column 1 the code text.
    """
    frames = [pd.DataFrame(pull_code_from_soup(scrape_data(url)))
              for url in url_list]
    if not frames:
        # pd.concat raises on an empty list; keep the original empty result.
        return pd.DataFrame(columns=[0, 1])
    # DataFrame.append was removed in pandas 2.0, and appending in a loop
    # recopied the accumulated frame each iteration; a single concat is O(n).
    return pd.concat(frames, ignore_index=True)


def scrape_links():
    """Return absolute URLs of all programming-task pages on Rosetta Code."""
    req = urllib.request.Request(
        'http://rosettacode.org/wiki/Category:Programming_Tasks',
        headers={'User-Agent': 'Mozilla/5.0'})
    content = urllib.request.urlopen(req).read()
    soup = BeautifulSoup(content, 'html.parser')
    hrefs = [link.get('href') for link in soup.find_all('a')]
    # Skip the first anchor and keep only /wiki/ task pages.  An <a> tag
    # with no href yields None from .get(), which would crash .startswith,
    # so filter out falsy entries first.
    return ["http://www.rosettacode.org{}".format(href)
            for href in hrefs[1:] if href and href.startswith('/wiki/')]


def make_links_list(num_links=30):
    """Return *num_links* task URLs picked at random, without replacement."""
    all_links = scrape_links()
    return random.sample(all_links, num_links)


def scrape_and_clean(num_links=30):
    """Scrape *num_links* random task pages and drop plain-text snippets.

    Rows whose language column (0) equals 'text' are prose rather than
    code, so they are filtered out before returning.
    """
    snippets = make_data(make_links_list(num_links))
    return snippets[snippets[0] != 'text']


def scrape_clean_cut(num_links=100, min_examples=40):
    """Scrape snippets, then keep only languages with enough examples.

    Drops rows labelled 'text', then removes every language having fewer
    than *min_examples* snippets so classes are reasonably populated.
    """
    snippets = make_data(make_links_list(num_links))
    snippets = snippets[snippets[0] != 'text']
    well_represented = snippets.groupby(0).filter(
        lambda grp: len(grp) >= min_examples)
    return well_represented

def pipeline_runner(dataframe, estimator):
    """Train a bag-of-words Naive Bayes pipeline and report its accuracy.

    Parameters:
        dataframe: DataFrame with labels in column 0 and code text in column 1.
        estimator: one of 'Multinomial', 'Gaussian', or 'Bernoulli'.

    Returns:
        (train_score, test_score) accuracy tuple.

    Raises:
        ValueError: if *estimator* is not one of the recognized names.
    """
    y = dataframe.loc[:, 0]
    X = dataframe.loc[:, 1]
    # Hold out a test split so the second score reflects generalization.
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    # Vectorize then classify in one pipeline so the vocabulary is fit
    # only on the training data.
    if estimator == 'Multinomial':
        estimator_pipe = Pipeline([('bag_of_words', CountVectorizer()),
                                   ('mnb', MultinomialNB())])
    elif estimator == 'Gaussian':
        # NOTE(review): GaussianNB does not accept the sparse matrix that
        # CountVectorizer produces — this branch likely needs a densify
        # step; confirm before relying on it.
        estimator_pipe = Pipeline([('bag_of_words', CountVectorizer()),
                                   ('gnb', GaussianNB())])
    elif estimator == 'Bernoulli':
        # BernoulliNB models token presence/absence, so binarize counts.
        estimator_pipe = Pipeline([('bag_of_words', CountVectorizer(binary=True)),
                                   ('bnb', BernoulliNB())])
    else:
        # The original recursed on itself here with identical arguments,
        # guaranteeing a RecursionError; fail fast with a clear message.
        raise ValueError("unknown estimator: {!r}".format(estimator))
    estimator_pipe.fit(X_train, y_train)
    return estimator_pipe.score(X_train, y_train), estimator_pipe.score(X_test, y_test)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ scipy
pandas
numpy
matplotlib
beautifulsoup4