diff --git a/Ashar_MongoDB_Assign.ipynb b/Ashar_MongoDB_Assign.ipynb new file mode 100644 index 0000000..770dd88 --- /dev/null +++ b/Ashar_MongoDB_Assign.ipynb @@ -0,0 +1,456 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import ijson" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "file_name = \"dataset.json\"\n", + "with open(file_name, 'r') as f:\n", + " objects = ijson.items(f, 'meta.view.columns.item')\n", + " columns = list(objects)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{u'name': u'sid', u'format': {}, u'dataTypeName': u'meta_data', u'fieldName': u':sid', u'renderTypeName': u'meta_data', u'position': 0, u'id': -1, u'flags': [u'hidden']}\n" + ] + } + ], + "source": [ + "print columns[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "column_names = [col[\"fieldName\"] for col in columns]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[u':sid',\n", + " u':id',\n", + " u':position',\n", + " u':created_at',\n", + " u':created_meta',\n", + " u':updated_at',\n", + " u':updated_meta',\n", + " u':meta',\n", + " u'date_of_stop',\n", + " u'time_of_stop',\n", + " u'agency',\n", + " u'subagency',\n", + " u'description',\n", + " u'location',\n", + " u'latitude',\n", + " u'longitude',\n", + " u'accident',\n", + " u'belts',\n", + " u'personal_injury',\n", + " u'property_damage',\n", + " u'fatal',\n", + " u'commercial_license',\n", + " u'hazmat',\n", + " u'commercial_vehicle',\n", + " u'alcohol',\n", + " u'work_zone',\n", + " u'state',\n", + " u'vehicle_type',\n", + " u'year',\n", + " u'make',\n", + " u'model',\n", + " u'color',\n", + " u'violation_type',\n", + " u'charge',\n", + " u'article',\n", + " u'contributed_to_accident',\n", + " u'race',\n", + " u'gender',\n", + " u'driver_city',\n", + " u'driver_state',\n", + " u'dl_state',\n", + " u'arrest_type',\n", + " u'geolocation']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "column_names" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "good_columns = [\n", + " \"date_of_stop\", \n", + " \"time_of_stop\", \n", + " \"agency\", \n", + " \"subagency\",\n", + " \"description\",\n", + " \"location\", \n", + " \"latitude\", \n", + " \"longitude\", \n", + " \"vehicle_type\", \n", + " \"year\", \n", + " \"make\", \n", + " \"model\", \n", + " \"color\", \n", + " \"violation_type\",\n", + " \"race\", \n", + " \"gender\", \n", + " \"driver_state\", \n", + " \"driver_city\", \n", + " \"dl_state\",\n", + " \"arrest_type\"\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "data = []\n", + "with open(file_name, 'r') as f:\n", + " objects = ijson.items(f, 'data.item')\n", + " for row in objects:\n", + " selected_row = []\n", + " for item in good_columns:\n", + " selected_row.append(row[column_names.index(item)])\n", + " data.append(selected_row)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[u'2013-09-24T00:00:00',\n", + " u'17:11:00',\n", + " u'MCP',\n", + " u'3rd district, Silver Spring',\n", + " u'DRIVING VEHICLE ON HIGHWAY WITH SUSPENDED REGISTRATION',\n", + " u'8804 FLOWER AVE',\n", + " None,\n", + " None,\n", + " u'02 - Automobile',\n", + " u'2008',\n", + " u'FORD',\n", + " u'4S',\n", + " u'BLACK',\n", + " u'Citation',\n", + " u'BLACK',\n", + " u'M',\n", + " u'MD',\n", + " u'TAKOMA PARK',\n", + " u'MD',\n", + " u'A - Marked Patrol']" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "dataframe = pd.DataFrame(data, columns=good_columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "dataframe.date_of_stop=pd.to_datetime(dataframe.date_of_stop)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from pymongo import MongoClient\n", + "import json\n", + "client = MongoClient()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "client = MongoClient('localhost', 27017)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "db = client['traffic_violations']\n", + "datarec = db.datarec" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "datarec.insert_many(dataframe.to_dict('records'))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "stopsByColor=datarec.aggregate([{\"$group\" : {\"_id\":\"$color\", \"count\":{\"$sum\":1}}}])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{u'count': 23, u'_id': u'CHROME'}\n", + "{u'count': 357, u'_id': u'COPPER'}\n", + "{u'count': 4247, u'_id': u'YELLOW'}\n", + "{u'count': 2588, u'_id': u'BRONZE'}\n", + "{u'count': 5245, u'_id': u'BROWN'}\n", + "{u'count': 6491, u'_id': u'GREEN, LGT'}\n", + "{u'count': 19330, u'_id': u'MAROON'}\n", + "{u'count': 2211, u'_id': u'PURPLE'}\n", + "{u'count': 35901, u'_id': u'GOLD'}\n", + "{u'count': 167, u'_id': u'PINK'}\n", + "{u'count': 746, u'_id': u'CREAM'}\n", + "{u'count': 23864, u'_id': u'TAN'}\n", + "{u'count': 12812, u'_id': u'GREEN, DK'}\n", + "{u'count': 14942, u'_id': u'BLUE, LIGHT'}\n", + "{u'count': 14254, u'_id': nan}\n", + "{u'count': 43944, u'_id': u'GREEN'}\n", + "{u'count': 3829, u'_id': u'ORANGE'}\n", + "{u'count': 164915, u'_id': u'WHITE'}\n", + "{u'count': 20, u'_id': u'CAMOUFLAGE'}\n", + "{u'count': 924, u'_id': u'MULTICOLOR'}\n", + "{u'count': 80581, u'_id': u'BLUE'}\n", + "{u'count': 199971, u'_id': u'SILVER'}\n", + "{u'count': 23227, u'_id': u'BLUE, DARK'}\n", + "{u'count': 86613, u'_id': u'RED'}\n", + "{u'count': 116635, u'_id': u'GRAY'}\n", + "{u'count': 216292, u'_id': u'BLACK'}\n", + "{u'count': 13160, u'_id': u'BEIGE'}\n" + ] + } + ], + "source": [ + "for doc in stopsByColor:\n", + " print doc" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "stopsByArrestType=datarec.aggregate([{\"$group\" : {\"_id\":\"$arrest_type\", \"count\":{\"$sum\":1}}}])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{u'count': 44, u'_id': u'K - Aircraft Assist'}\n", + "{u'count': 299, u'_id': u'J - Unmarked Moving Radar (Moving)'}\n", + "{u'count': 260, u'_id': u'P - Mounted Patrol'}\n", + "{u'count': 561, u'_id': u'H - Unmarked Moving Radar (Stationary)'}\n", + "{u'count': 1545, u'_id': u'I - Marked Moving Radar (Moving)'}\n", + "{u'count': 399, u'_id': u'C - Marked VASCAR'}\n", + "{u'count': 232, u'_id': u'D - Unmarked VASCAR'}\n", + "{u'count': 6882, u'_id': u'E - Marked Stationary Radar'}\n", + "{u'count': 12875, u'_id': u'S - License Plate Recognition'}\n", + "{u'count': 1632, u'_id': u'M - Marked (Off-Duty)'}\n", + "{u'count': 5083, u'_id': u'R - Unmarked Laser'}\n", + "{u'count': 161, u'_id': u'N - Unmarked (Off-Duty)'}\n", + "{u'count': 10336, u'_id': u'L - Motorcycle'}\n", + "{u'count': 110653, u'_id': u'Q - Marked Laser'}\n", + "{u'count': 4077, u'_id': u'G - Marked Moving Radar (Stationary)'}\n", + "{u'count': 11397, u'_id': u'O - Foot Patrol'}\n", + "{u'count': 35195, u'_id': u'B - Unmarked Patrol'}\n", + "{u'count': 772, u'_id': u'F - Unmarked Stationary Radar'}\n", + "{u'count': 890886, u'_id': u'A - Marked Patrol'}\n" + ] + } + ], + "source": [ + "for doc in stopsByArrestType:\n", + " print doc" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "data = pd.DataFrame(list(collections.find()))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "client.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 187270\n", + "2 176103\n", + "4 173063\n", + "3 168624\n", + "0 147816\n", + "5 127443\n", + "6 112970\n", + "Name: date_of_stop, dtype: int64\n" + ] + } + ], + "source": [ + "print data.date_of_stop.dt.dayofweek.value_counts()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11+" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}