Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3,953 changes: 3,953 additions & 0 deletions Ideas.ipynb

Large diffs are not rendered by default.

58 changes: 58 additions & 0 deletions More ideas.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#from stack overflow\n",
"import urllib2\n",
"from BeautifulSoup import BeautifulSoup\n",
"# or if you're using BeautifulSoup4:\n",
"# from bs4 import BeautifulSoup\n",
"\n",
"soup = BeautifulSoup(urllib2.urlopen('http://example.com').read())\n",
"\n",
"for row in soup('table', {'class': 'spad'})[0].tbody('tr'):\n",
" tds = row('td')\n",
" print tds[0].string, tds[1].string\n",
" # will print date and sunrise"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
114 changes: 114 additions & 0 deletions New_Trial.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import csv\n",
"import re\n",
"import numpy as np\n",
"import random\n",
"\n",
"from collections import Counter\n",
"\n",
"from sklearn.pipeline import make_pipeline, make_union\n",
"from sklearn.base import TransformerMixin\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.cross_validation import train_test_split\n",
"from sklearn.metrics import classification_report, confusion_matrix"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def longest_run_of_capitol_letters_feature(char, text):\n",
"    \"\"\"Return the length of the longest run of the given character in text.\"\"\"\n",
" if char == '~':\n",
" runs = sorted(re.findall(r\"~+\", text), key=len)\n",
" elif char == '.':\n",
" runs = sorted(re.findall(r\"\\.+\", text), key=len)\n",
" elif char == '|':\n",
" runs = sorted(re.findall(r\"\\|+\", text), key=len)\n",
" elif char == ':':\n",
" runs = sorted(re.findall(r\"\\:+\", text), key=len)\n",
" elif char == ';':\n",
" runs = sorted(re.findall(r\";+\", text), key=len)\n",
" elif char == '$':\n",
" runs = sorted(re.findall(r\"\\$+\", text), key=len)\n",
" elif char == '(':\n",
" runs = sorted(re.findall(r\"\\(+\", text), key=len)\n",
" elif char == ')':\n",
" runs = sorted(re.findall(r\"\\)+\", text), key=len)\n",
" elif char == '-':\n",
" runs = sorted(re.findall(r\"\\-+\", text), key=len)\n",
" if runs:\n",
" return len(runs[-1])\n",
" else:\n",
" return 0\n",
"\n",
"def longest_run_of_character_feature(text):\n",
"    \"\"\"Find the longest run of capital letters and return its length.\"\"\"\n",
" runs = sorted(re.findall(r\"[A-Z]+\", text), key=len)\n",
" if runs:\n",
" return len(runs[-1])\n",
" else:\n",
" return 0 \n",
" \n",
"def percent_character_feature(char, text):\n",
" \"\"\"Return percentage of text that is a particular char compared to total text length.\"\"\"\n",
" def feature_fn(text):\n",
" periods = text.count(char)\n",
" return periods / len(text)\n",
" return feature_fn\n",
"\n",
"class FunctionFeaturizer(TransformerMixin):\n",
" def __init__(self, *featurizers):\n",
" self.featurizers = featurizers\n",
" \n",
" def fit(self, X, y=None):\n",
" \"\"\"All SciKit-Learn compatible transformers and classifiers have the\n",
" same interface. `fit` always returns the same object.\"\"\"\n",
" return self\n",
" \n",
" def transform(self, X):\n",
" \"\"\"Given a list of original data, return a list of feature vectors.\"\"\"\n",
" fvs = []\n",
" for datum in X:\n",
" fv = [f(datum) for f in self.featurizers]\n",
" fvs.append(fv)\n",
" return np.array(fvs)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
47 changes: 47 additions & 0 deletions Untitled.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from programming_language_classifier import *"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = scrape_and_clean"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Empty file added data_scrape.py
Empty file.
Empty file added final.py
Empty file.
129 changes: 129 additions & 0 deletions programming_language_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import random
import urllib
import urllib.request
from re import findall

from bs4 import BeautifulSoup
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import cross_val_score

# C (.gcc, .c)
# C#
# Common Lisp (.sbcl)
# Clojure
# Haskell
# Java
# JavaScript
# OCaml
# Perl
# PHP (.hack, .php)
# Python
# Ruby (.jruby, .yarv)
# Scala
# Scheme (.racket)

# def get_text(url):
# """Takes a url and returns text"""
# req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
# content = urllib.request.urlopen(req).read()
# page_text=BeautifulSoup(content)
# return page_text.get_text()

# def scrape_text(text):
# data_crop = findall("[EDIT] \n.+\n", text)
# return data_crop


# def scrape_text(text):
# """Takes text from get_text and returns a list of tuples with
# language in [0] and code in [1]"""
# data_crop = findall(r"edit] (.+)\n(.+)\n", text)
# return data_crop
# ##Should maybe grab all of the text
#
# def scrape_links():
# """Creates list of links to use with create_url to gather code."""
# with open ("links_list.txt", "r") as myfile:
# data=myfile.read()
# return findall(r"wiki/(.+)\" ti", data)


# def create_url_for_scraping(task_string):
# return "http://www.rosettacode.org{}".format(task_string)

# Labels of the languages this project aims to classify (matches the
# comment block above).  NOTE(review): not referenced by any function in
# this module — presumably consumed by the notebooks; confirm before use.
language_start = ["C", "C#", "Common Lisp", "Clojure", "Haskell",
                  "Java", "JavaScript", "OCaml", "Perl", "PHP",
                  "Python", "Ruby", "Scala", "Scheme"]


def scrape_data(url):
    """Fetch *url* and return its highlighted source-code blocks.

    Returns a list of BeautifulSoup Tag objects, one per
    ``<pre class="highlighted_source">`` element on the page.
    """
    # Rosetta Code rejects requests without a browser-like User-Agent.
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    content = urllib.request.urlopen(req).read()
    # Name the parser explicitly: BeautifulSoup(content) alone emits a
    # "no parser specified" warning and picks whichever parser is installed.
    soup = BeautifulSoup(content, 'html.parser')
    return soup.find_all("pre", class_="highlighted_source")


def pull_code_from_soup(soup_list):
    """Extract ``[language, code]`` pairs from highlighted-source tags.

    Each tag's first CSS class names the language (e.g. ``python``) and
    its text content is the code snippet.  Returns a list of two-element
    lists, one per tag.
    """
    # Iterate the tags directly instead of indexing via range(len(...)).
    return [[tag['class'][0], tag.get_text()] for tag in soup_list]


def make_data(url_list):
    """Scrape every URL in *url_list* and collect snippets in one DataFrame.

    Column 0 holds the language label, column 1 the code text.
    """
    frames = [pd.DataFrame(pull_code_from_soup(scrape_data(url)))
              for url in url_list]
    if not frames:
        # pd.concat raises on an empty list; keep the original empty result.
        return pd.DataFrame(columns=[0, 1])
    # DataFrame.append was removed in pandas 2.0, and appending in a loop
    # recopied the accumulated frame each iteration; a single concat is O(n).
    return pd.concat(frames, ignore_index=True)


def scrape_links():
    """Return absolute URLs of all programming-task pages on Rosetta Code."""
    req = urllib.request.Request(
        'http://rosettacode.org/wiki/Category:Programming_Tasks',
        headers={'User-Agent': 'Mozilla/5.0'})
    content = urllib.request.urlopen(req).read()
    soup = BeautifulSoup(content, 'html.parser')
    hrefs = [link.get('href') for link in soup.find_all('a')]
    # Skip the first anchor and keep only /wiki/ task pages.  An <a> tag
    # with no href yields None from .get(), which would crash .startswith,
    # so filter out falsy entries first.
    return ["http://www.rosettacode.org{}".format(href)
            for href in hrefs[1:] if href and href.startswith('/wiki/')]


def make_links_list(num_links=30):
    """Return *num_links* task URLs picked at random, without replacement."""
    all_links = scrape_links()
    return random.sample(all_links, num_links)


def scrape_and_clean(num_links=30):
    """Scrape *num_links* random task pages and drop plain-text snippets.

    Rows whose language column (0) equals 'text' are prose rather than
    code, so they are filtered out before returning.
    """
    snippets = make_data(make_links_list(num_links))
    return snippets[snippets[0] != 'text']


def scrape_clean_cut(num_links=100, min_examples=40):
    """Scrape snippets, then keep only languages with enough examples.

    Drops rows labelled 'text', then removes every language having fewer
    than *min_examples* snippets so classes are reasonably populated.
    """
    snippets = make_data(make_links_list(num_links))
    snippets = snippets[snippets[0] != 'text']
    well_represented = snippets.groupby(0).filter(
        lambda grp: len(grp) >= min_examples)
    return well_represented

def pipeline_runner(dataframe, estimator):
    """Train a bag-of-words Naive Bayes pipeline and report its accuracy.

    Parameters:
        dataframe: DataFrame with labels in column 0 and code text in column 1.
        estimator: one of 'Multinomial', 'Gaussian', or 'Bernoulli'.

    Returns:
        (train_score, test_score) accuracy tuple.

    Raises:
        ValueError: if *estimator* is not one of the recognized names.
    """
    y = dataframe.loc[:, 0]
    X = dataframe.loc[:, 1]
    # Hold out a test split so the second score reflects generalization.
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    # Vectorize then classify in one pipeline so the vocabulary is fit
    # only on the training data.
    if estimator == 'Multinomial':
        estimator_pipe = Pipeline([('bag_of_words', CountVectorizer()),
                                   ('mnb', MultinomialNB())])
    elif estimator == 'Gaussian':
        # NOTE(review): GaussianNB does not accept the sparse matrix that
        # CountVectorizer produces — this branch likely needs a densify
        # step; confirm before relying on it.
        estimator_pipe = Pipeline([('bag_of_words', CountVectorizer()),
                                   ('gnb', GaussianNB())])
    elif estimator == 'Bernoulli':
        # BernoulliNB models token presence/absence, so binarize counts.
        estimator_pipe = Pipeline([('bag_of_words', CountVectorizer(binary=True)),
                                   ('bnb', BernoulliNB())])
    else:
        # The original recursed on itself here with identical arguments,
        # guaranteeing a RecursionError; fail fast with a clear message.
        raise ValueError("unknown estimator: {!r}".format(estimator))
    estimator_pipe.fit(X_train, y_train)
    return estimator_pipe.score(X_train, y_train), estimator_pipe.score(X_test, y_test)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ scipy
pandas
numpy
matplotlib
beautifulsoup4