diff --git a/Saba_KHI_MongoDBAssignment.ipynb b/Saba_KHI_MongoDBAssignment.ipynb new file mode 100644 index 0000000..64ed25a --- /dev/null +++ b/Saba_KHI_MongoDBAssignment.ipynb @@ -0,0 +1,576 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import ijson\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "filename = \"rows.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "with open(filename, 'r') as f:\n", + " objects = ijson.items(f, 'meta.view.columns.item')\n", + " columns = list(objects)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'renderTypeName': 'meta_data', 'dataTypeName': 'meta_data', 'format': {}, 'name': 'sid', 'flags': ['hidden'], 'id': -1, 'position': 0, 'fieldName': ':sid'}\n" + ] + } + ], + "source": [ + "print(columns[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "column_names = [col[\"fieldName\"] for col in columns]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[':sid',\n", + " ':id',\n", + " ':position',\n", + " ':created_at',\n", + " ':created_meta',\n", + " ':updated_at',\n", + " ':updated_meta',\n", + " ':meta',\n", + " 'date_of_stop',\n", + " 'time_of_stop',\n", + " 'agency',\n", + " 'subagency',\n", + " 'description',\n", + " 'location',\n", + " 'latitude',\n", + " 'longitude',\n", + " 'accident',\n", + " 'belts',\n", + " 'personal_injury',\n", + " 'property_damage',\n", + " 'fatal',\n", + " 'commercial_license',\n", + " 'hazmat',\n", + " 'commercial_vehicle',\n", + " 'alcohol',\n", + " 'work_zone',\n", + " 'state',\n", + " 'vehicle_type',\n", + " 'year',\n", + " 'make',\n", + " 'model',\n", + " 'color',\n", + " 'violation_type',\n", + " 'charge',\n", + " 'article',\n", + " 'contributed_to_accident',\n", + " 'race',\n", + " 'gender',\n", + " 'driver_city',\n", + " 'driver_state',\n", + " 'dl_state',\n", + " 'arrest_type',\n", + " 'geolocation']" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Total Columns\n", + "column_names" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "#Only important columns\n", + "important_columns = [\n", + " \"date_of_stop\", \n", + " \"color\", \n", + " \"arrest_type\"\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = []\n", + "with open(filename, 'r') as f:\n", + " objects = ijson.items(f, 'data.item')\n", + " for row in objects:\n", + " selected_row = []\n", + " for item in important_columns:\n", + " selected_row.append(row[column_names.index(item)])\n", + " data.append(selected_row)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df_TV = pd.DataFrame(data, columns=important_columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "df_TV['day_of_stop'] = pd.to_datetime(df_TV['date_of_stop'], errors='coerce')" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df_TV['day_of_stop']=df_TV['day_of_stop'].dt.weekday_name" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from pymongo import MongoClient" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "client = MongoClient('localhost', 27017)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "mongo_db = client['tf_vn']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mongo_db.collections.insert_many(df_TV.to_dict('records'))" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "StopByColor=mongo_db.collections.aggregate([{\"$group\" : {\"_id\":\"$color\", \"count\":{\"$sum\":1}}}])" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'_id': 'CHROME', 'count': 23}\n", + "{'_id': 'COPPER', 'count': 357}\n", + "{'_id': 'YELLOW', 'count': 4247}\n", + "{'_id': 'BRONZE', 'count': 2588}\n", + "{'_id': 'BROWN', 'count': 5245}\n", + "{'_id': 'GREEN, LGT', 'count': 6491}\n", + "{'_id': 'MAROON', 'count': 19330}\n", + "{'_id': 'PURPLE', 'count': 2211}\n", + "{'_id': 'GOLD', 'count': 35901}\n", + "{'_id': 'PINK', 'count': 167}\n", + "{'_id': 'CREAM', 'count': 746}\n", + "{'_id': 'TAN', 'count': 23864}\n", + "{'_id': 'GREEN, DK', 'count': 12812}\n", + "{'_id': 'BLUE, LIGHT', 'count': 14942}\n", + "{'_id': 'GREEN', 'count': 43944}\n", + "{'_id': 'ORANGE', 'count': 3829}\n", + "{'_id': 'WHITE', 'count': 164915}\n", + "{'_id': 'CAMOUFLAGE', 'count': 20}\n", + "{'_id': 'MULTICOLOR', 'count': 924}\n", + "{'_id': 'BLUE', 'count': 80581}\n", + "{'_id': 'SILVER', 'count': 199971}\n", + "{'_id': 'N/A', 'count': 12508}\n", + "{'_id': 'GRAY', 'count': 116635}\n", + "{'_id': 'BLUE, DARK', 'count': 23227}\n", + "{'_id': 'RED', 'count': 86613}\n", + "{'_id': None, 'count': 1746}\n", + "{'_id': 'BLACK', 'count': 216292}\n", + "{'_id': 'BEIGE', 'count': 13160}\n" + ] + } + ], + "source": [ + "#How many stops are made by car color\n", + "for stop in StopByColor:\n", + " print (stop)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "BLACK 216292\n", + "SILVER 199971\n", + "WHITE 164915\n", + "GRAY 116635\n", + "RED 86613\n", + "BLUE 80581\n", + "GREEN 43944\n", + "GOLD 35901\n", + "TAN 23864\n", + "BLUE, DARK 23227\n", + "MAROON 19330\n", + "BLUE, LIGHT 14942\n", + "BEIGE 13160\n", + "GREEN, DK 12812\n", + "N/A 12508\n", + "GREEN, LGT 6491\n", + "BROWN 5245\n", + "YELLOW 4247\n", + "ORANGE 3829\n", + "BRONZE 2588\n", + "PURPLE 2211\n", + "MULTICOLOR 924\n", + "CREAM 746\n", + "COPPER 357\n", + "PINK 167\n", + "CHROME 23\n", + "CAMOUFLAGE 20\n", + "Name: color, dtype: int64" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_TV[\"color\"].value_counts() #verify with pandas\n" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#Find the value counts of arrest type\n", + "value_counts=mongo_db.collections.aggregate([{\"$group\" : {\"_id\":\"$arrest_type\", \"count\":{\"$sum\":1}}}])" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'_id': 'K - Aircraft Assist', 'count': 44}\n", + "{'_id': 'J - Unmarked Moving Radar (Moving)', 'count': 299}\n", + "{'_id': 'P - Mounted Patrol', 'count': 260}\n", + "{'_id': 'H - Unmarked Moving Radar (Stationary)', 'count': 561}\n", + "{'_id': 'I - Marked Moving Radar (Moving)', 'count': 1545}\n", + "{'_id': 'C - Marked VASCAR', 'count': 399}\n", + "{'_id': 'D - Unmarked VASCAR', 'count': 232}\n", + "{'_id': 'E - Marked Stationary Radar', 'count': 6882}\n", + "{'_id': 'S - License Plate Recognition', 'count': 12875}\n", + "{'_id': 'M - Marked (Off-Duty)', 'count': 1632}\n", + "{'_id': 'R - Unmarked Laser', 'count': 5083}\n", + "{'_id': 'N - Unmarked (Off-Duty)', 'count': 161}\n", + "{'_id': 'L - Motorcycle', 'count': 10336}\n", + "{'_id': 'Q - Marked Laser', 'count': 110653}\n", + "{'_id': 'G - Marked Moving Radar (Stationary)', 'count': 4077}\n", + "{'_id': 'B - Unmarked Patrol', 'count': 35195}\n", + "{'_id': 'O - Foot Patrol', 'count': 11397}\n", + "{'_id': 'F - Unmarked Stationary Radar', 'count': 772}\n", + "{'_id': 'A - Marked Patrol', 'count': 890886}\n" + ] + } + ], + "source": [ + "for counts in value_counts:\n", + " print (counts)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "A - Marked Patrol 890886\n", + "Q - Marked Laser 110653\n", + "B - Unmarked Patrol 35195\n", + "S - License Plate Recognition 12875\n", + "O - Foot Patrol 11397\n", + "L - Motorcycle 10336\n", + "E - Marked Stationary Radar 6882\n", + "R - Unmarked Laser 5083\n", + "G - Marked Moving Radar (Stationary) 4077\n", + "M - Marked (Off-Duty) 1632\n", + "I - Marked Moving Radar (Moving) 1545\n", + "F - Unmarked Stationary Radar 772\n", + "H - Unmarked Moving Radar (Stationary) 561\n", + "C - Marked VASCAR 399\n", + "J - Unmarked Moving Radar (Moving) 299\n", + "P - Mounted Patrol 260\n", + "D - Unmarked VASCAR 232\n", + "N - Unmarked (Off-Duty) 161\n", + "K - Aircraft Assist 44\n", + "Name: arrest_type, dtype: int64" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_TV[\"arrest_type\"].value_counts() # verify with pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#Find which days result in the most traffic stop\n", + "day_stop=mongo_db.collections.aggregate([{\"$group\" : {\"_id\":\"$day_of_stop\", \"count\":{\"$sum\":1}}},{\"$sort\":{\"count\":-1}}]) " + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'_id': 'Tuesday', 'count': 187270}\n", + "{'_id': 'Wednesday', 'count': 176103}\n", + "{'_id': 'Friday', 'count': 173063}\n", + "{'_id': 'Thursday', 'count': 168624}\n", + "{'_id': 'Monday', 'count': 147816}\n", + "{'_id': 'Saturday', 'count': 127443}\n", + "{'_id': 'Sunday', 'count': 112970}\n" + ] + } + ], + "source": [ + "for day_week in day_stop:\n", + " if (day_week[\"_id\"])!=None:\n", + " print (day_week)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Tuesday 187270\n", + "Wednesday 176103\n", + "Friday 173063\n", + "Thursday 168624\n", + "Monday 147816\n", + "Saturday 127443\n", + "Sunday 112970\n", + "Name: day_of_stop, dtype: int64" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_TV['day_of_stop'].value_counts() # verify with pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "client.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [conda env:faizi]", + "language": "python", + "name": "conda-env-faizi-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}