From cbd8000c774178e0244a46dffeaf630125872fb6 Mon Sep 17 00:00:00 2001 From: IBC-RashidAli Date: Fri, 12 May 2017 19:04:31 +0500 Subject: [PATCH] Add files via upload --- MongoDB-Assignment.ipynb | 568 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 568 insertions(+) create mode 100644 MongoDB-Assignment.ipynb diff --git a/MongoDB-Assignment.ipynb b/MongoDB-Assignment.ipynb new file mode 100644 index 0000000..b3bfc55 --- /dev/null +++ b/MongoDB-Assignment.ipynb @@ -0,0 +1,568 @@ +{ + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + }, + "name": "", + "signature": "sha256:0c6c4467653d0895554fc98e7499c4a9604207a119afb7e1725d50d0e3018a3c" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "code", + "collapsed": true, + "input": [ + "import pandas as pd\n", + "import ijson" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 13 + }, + { + "cell_type": "code", + "collapsed": true, + "input": [ + "filename = \"MongoDBAssignment.json\"\n", + "with open(filename, 'r') as f:\n", + " objects = ijson.items(f, 'meta.view.columns.item')\n", + " columns = list(objects)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 2 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "print columns[0]" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "{u'name': u'sid', u'format': {}, u'dataTypeName': u'meta_data', u'fieldName': u':sid', u'renderTypeName': u'meta_data', u'position': 0, u'id': -1, u'flags': [u'hidden']}\n" + ] + } + ], + "prompt_number": 3 + }, + { + "cell_type": "code", + "collapsed": true, + "input": [ + "column_names = [col[\"fieldName\"] for col in columns]" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 4 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "column_names" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 5, + "text": [ + "[u':sid',\n", + " u':id',\n", + " u':position',\n", + " u':created_at',\n", + " u':created_meta',\n", + " u':updated_at',\n", + " u':updated_meta',\n", + " u':meta',\n", + " u'date_of_stop',\n", + " u'time_of_stop',\n", + " u'agency',\n", + " u'subagency',\n", + " u'description',\n", + " u'location',\n", + " u'latitude',\n", + " u'longitude',\n", + " u'accident',\n", + " u'belts',\n", + " u'personal_injury',\n", + " u'property_damage',\n", + " u'fatal',\n", + " u'commercial_license',\n", + " u'hazmat',\n", + " u'commercial_vehicle',\n", + " u'alcohol',\n", + " u'work_zone',\n", + " u'state',\n", + " u'vehicle_type',\n", + " u'year',\n", + " u'make',\n", + " u'model',\n", + " u'color',\n", + " u'violation_type',\n", + " u'charge',\n", + " u'article',\n", + " u'contributed_to_accident',\n", + " u'race',\n", + " u'gender',\n", + " u'driver_city',\n", + " u'driver_state',\n", + " u'dl_state',\n", + " u'arrest_type',\n", + " u'geolocation']" + ] + } + ], + "prompt_number": 5 + }, + { + "cell_type": "code", + "collapsed": true, + "input": [ + "good_columns = [\n", + " \"date_of_stop\", \n", + " \"time_of_stop\", \n", + " \"agency\", \n", + " \"subagency\",\n", + " \"description\",\n", + " \"location\", \n", + " \"latitude\", \n", + " \"longitude\", \n", + " \"vehicle_type\", \n", + " \"year\", \n", + " \"make\", \n", + " \"model\", \n", + " \"color\", \n", + " \"violation_type\",\n", + " \"race\", \n", + " \"gender\", \n", + " \"driver_state\", \n", + " \"driver_city\", \n", + " \"dl_state\",\n", + " \"arrest_type\"\n", + "]" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 6 + }, + { + "cell_type": "code", + "collapsed": true, + "input": [ + "data = []\n", + "with open(filename, 'r') as f:\n", + " objects = ijson.items(f, 'data.item')\n", + " for row in objects:\n", + " selected_row = []\n", + " for item in good_columns:\n", + " selected_row.append(row[column_names.index(item)])\n", + " data.append(selected_row)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 7 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "data[0]" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 8, + "text": [ + "[u'2013-09-24T00:00:00',\n", + " u'17:11:00',\n", + " u'MCP',\n", + " u'3rd district, Silver Spring',\n", + " u'DRIVING VEHICLE ON HIGHWAY WITH SUSPENDED REGISTRATION',\n", + " u'8804 FLOWER AVE',\n", + " None,\n", + " None,\n", + " u'02 - Automobile',\n", + " u'2008',\n", + " u'FORD',\n", + " u'4S',\n", + " u'BLACK',\n", + " u'Citation',\n", + " u'BLACK',\n", + " u'M',\n", + " u'MD',\n", + " u'TAKOMA PARK',\n", + " u'MD',\n", + " u'A - Marked Patrol']" + ] + } + ], + "prompt_number": 8 + }, + { + "cell_type": "code", + "collapsed": true, + "input": [ + "\n", + "df = pd.DataFrame(data, columns=good_columns)\n", + "\n" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 9 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "print df.shape" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "(1093289, 20)\n" + ] + } + ], + "prompt_number": 12 + }, + { + "cell_type": "code", + "collapsed": true, + "input": [ + "df.date_of_stop=pd.to_datetime(df.date_of_stop)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 13 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "print(df.shape)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "(1093289, 20)\n" + ] + } + ], + "prompt_number": 14 + }, + { + "cell_type": "code", + "collapsed": true, + "input": [ + "from pymongo import MongoClient\n", + "import json\n", + "mg_lc = MongoClient()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 1 + }, + { + "cell_type": "code", + "collapsed": true, + "input": [ + "mg_lc = MongoClient('localhost', 27017)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 2 + }, + { + "cell_type": "code", + "collapsed": true, + "input": [ + "db = mg_lc['traffic_violations']\n", + "collections = db.collections\n" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 3 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "collections.insert_many(df.to_dict('records'))" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 18, + "text": [ + "" + ] + } + ], + "prompt_number": 18 + }, + { + "cell_type": "code", + "collapsed": true, + "input": [ + "mg_lc.close()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 5 + }, + { + "cell_type": "code", + "collapsed": true, + "input": [ + "mg_lc = MongoClient('localhost', 27017)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 6 + }, + { + "cell_type": "code", + "collapsed": true, + "input": [ + "db = mg_lc['traffic_violations']\n", + "collections = db.collections" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 7 + }, + { + "cell_type": "code", + "collapsed": true, + "input": [ + "resultColor=collections.aggregate([{\"$group\" : {\"_id\":\"$color\", \"count\":{\"$sum\":1}}}])" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 4 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "for doc in resultColor:\n", + " print doc" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "{u'count': 23, u'_id': u'CHROME'}\n", + "{u'count': 357, u'_id': u'COPPER'}\n", + "{u'count': 4247, u'_id': u'YELLOW'}\n", + "{u'count': 2588, u'_id': u'BRONZE'}\n", + "{u'count': 5245, u'_id': u'BROWN'}\n", + "{u'count': 6491, u'_id': u'GREEN, LGT'}\n", + "{u'count': 19330, u'_id': u'MAROON'}\n", + "{u'count': 2211, u'_id': u'PURPLE'}\n", + "{u'count': 35901, u'_id': u'GOLD'}\n", + "{u'count': 167, u'_id': u'PINK'}\n", + "{u'count': 746, u'_id': u'CREAM'}\n", + "{u'count': 23864, u'_id': u'TAN'}\n", + "{u'count': 12812, u'_id': u'GREEN, DK'}\n", + "{u'count': 14942, u'_id': u'BLUE, LIGHT'}\n", + "{u'count': 14254, u'_id': nan}\n", + "{u'count': 43944, u'_id': u'GREEN'}\n", + "{u'count': 3829, u'_id': u'ORANGE'}\n", + "{u'count': 164915, u'_id': u'WHITE'}\n", + "{u'count': 20, u'_id': u'CAMOUFLAGE'}\n", + "{u'count': 924, u'_id': u'MULTICOLOR'}\n", + "{u'count': 80581, u'_id': u'BLUE'}\n", + "{u'count': 199971, u'_id': u'SILVER'}\n", + "{u'count': 23227, u'_id': u'BLUE, DARK'}\n", + "{u'count': 86613, u'_id': u'RED'}\n", + "{u'count': 116635, u'_id': u'GRAY'}\n", + "{u'count': 216292, u'_id': u'BLACK'}\n", + "{u'count': 13160, u'_id': u'BEIGE'}\n" + ] + } + ], + "prompt_number": 5 + }, + { + "cell_type": "code", + "collapsed": true, + "input": [ + "mg_results=collections.aggregate([{\"$group\" : {\"_id\":\"$arrest_type\", \"count\":{\"$sum\":1}}}])" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 6 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "for doc in resultArrest:\n", + " print doc" + ], + "language": "python", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "{u'count': 44, u'_id': u'K - Aircraft Assist'}\n", + "{u'count': 299, u'_id': u'J - Unmarked Moving Radar (Moving)'}\n", + "{u'count': 260, u'_id': u'P - Mounted Patrol'}\n", + "{u'count': 561, u'_id': u'H - Unmarked Moving Radar (Stationary)'}\n", + "{u'count': 1545, u'_id': u'I - Marked Moving Radar (Moving)'}\n", + "{u'count': 399, u'_id': u'C - Marked VASCAR'}\n", + "{u'count': 232, u'_id': u'D - Unmarked VASCAR'}\n", + "{u'count': 6882, u'_id': u'E - Marked Stationary Radar'}\n", + "{u'count': 12875, u'_id': u'S - License Plate Recognition'}\n", + "{u'count': 1632, u'_id': u'M - Marked (Off-Duty)'}\n", + "{u'count': 5083, u'_id': u'R - Unmarked Laser'}\n", + "{u'count': 161, u'_id': u'N - Unmarked (Off-Duty)'}\n", + "{u'count': 10336, u'_id': u'L - Motorcycle'}\n", + "{u'count': 110653, u'_id': u'Q - Marked Laser'}\n", + "{u'count': 4077, u'_id': u'G - Marked Moving Radar (Stationary)'}\n", + "{u'count': 11397, u'_id': u'O - Foot Patrol'}\n", + "{u'count': 35195, u'_id': u'B - Unmarked Patrol'}\n", + "{u'count': 772, u'_id': u'F - Unmarked Stationary Radar'}\n", + "{u'count': 890886, u'_id': u'A - Marked Patrol'}\n" + ] + } + ], + "prompt_number": 7 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "resultDay=collections.aggregate(\n", + " [\n", + " {\n", + " \"$project\":\n", + " {\n", + " \"dayRes\": { \"$dayOfWeek\": \"$date_of_stop\" },\n", + " } \n", + " \n", + " },{\n", + " \"$group\": {\n", + " \"_id\":\"$dayRes\", \"count\":{\"$sum\":1}\n", + " }\n", + " }\n", + " ]\n", + ")" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 11 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "for doc in resultDay:\n", + " print doc\n", + "# 1 -> Sunday\n", + "# 2 -> Monday\n", + "# 3 -> Tuesday\n", + "# 4 -> Wednesday\n", + "# 5 -> Thursday\n", + "# 6 -> Friday\n", + "# 7 -> Saturday" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "{u'count': 127443, u'_id': 7}\n", + "{u'count': 147816, u'_id': 2}\n", + "{u'count': 173063, u'_id': 6}\n", + "{u'count': 176103, u'_id': 4}\n", + "{u'count': 112970, u'_id': 1}\n", + "{u'count': 168624, u'_id': 5}\n", + "{u'count': 187270, u'_id': 3}\n" + ] + } + ], + "prompt_number": 12 + }, + { + "cell_type": "code", + "collapsed": true, + "input": [ + "mg_lc.close()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 15 + } + ], + "metadata": {} + } + ] +} \ No newline at end of file