From 47e8b925d33c8d69118998dc095f0209bf0e49a7 Mon Sep 17 00:00:00 2001 From: hak Date: Fri, 12 May 2017 17:31:20 +0500 Subject: [PATCH 1/2] Files added --- HammadMongo.ipynb | 845 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 845 insertions(+) create mode 100644 HammadMongo.ipynb diff --git a/HammadMongo.ipynb b/HammadMongo.ipynb new file mode 100644 index 0000000..f74ee79 --- /dev/null +++ b/HammadMongo.ipynb @@ -0,0 +1,845 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import ijson" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "#Retrieve data from json file\n", + "file = \"dataset.json\"\n", + "with open(file, 'r') as IO:\n", + " objects = ijson.items(IO, 'meta.view.columns.item.fieldName')\n", + " columnNames = list(objects)" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[':sid',\n", + " ':id',\n", + " ':position',\n", + " ':created_at',\n", + " ':created_meta',\n", + " ':updated_at',\n", + " ':updated_meta',\n", + " ':meta',\n", + " 'date_of_stop',\n", + " 'time_of_stop',\n", + " 'agency',\n", + " 'subagency',\n", + " 'description',\n", + " 'location',\n", + " 'latitude',\n", + " 'longitude',\n", + " 'accident',\n", + " 'belts',\n", + " 'personal_injury',\n", + " 'property_damage',\n", + " 'fatal',\n", + " 'commercial_license',\n", + " 'hazmat',\n", + " 'commercial_vehicle',\n", + " 'alcohol',\n", + " 'work_zone',\n", + " 'state',\n", + " 'vehicle_type',\n", + " 'year',\n", + " 'make',\n", + " 'model',\n", + " 'color',\n", + " 'violation_type',\n", + " 'charge',\n", + " 'article',\n", + " 'contributed_to_accident',\n", + " 'race',\n", + " 'gender',\n", + " 'driver_city',\n", + " 'driver_state',\n", + " 'dl_state',\n", + " 'arrest_type',\n", + " 
'geolocation']" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Get all column names\n", + "columnNames" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['date_of_stop',\n", + " 'time_of_stop',\n", + " 'agency',\n", + " 'subagency',\n", + " 'description',\n", + " 'location',\n", + " 'latitude',\n", + " 'longitude',\n", + " 'accident',\n", + " 'belts',\n", + " 'personal_injury',\n", + " 'property_damage',\n", + " 'fatal',\n", + " 'commercial_license',\n", + " 'hazmat',\n", + " 'commercial_vehicle',\n", + " 'alcohol',\n", + " 'work_zone',\n", + " 'state',\n", + " 'vehicle_type',\n", + " 'year',\n", + " 'make',\n", + " 'model',\n", + " 'color',\n", + " 'violation_type',\n", + " 'charge',\n", + " 'article',\n", + " 'contributed_to_accident',\n", + " 'race',\n", + " 'gender',\n", + " 'driver_city',\n", + " 'driver_state',\n", + " 'dl_state',\n", + " 'arrest_type',\n", + " 'geolocation']" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Extract clean column names from columnNames\n", + "clean_names = [name for name in columnNames if (name[0] not in \":\")]\n", + "clean_names" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [], + "source": [ + "with open(file, 'r') as IO:\n", + " objects = ijson.items(IO, 'data.item')\n", + " info = list(objects)" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[2118167,\n", + " 'EE8BC302-660F-48C4-B422-17427ECE821F',\n", + " 2118167,\n", + " 1482239054,\n", + " '498050',\n", + " 1482239054,\n", + " '498050',\n", + " None,\n", + " '2013-09-24T00:00:00',\n", + " '17:11:00',\n", + " 'MCP',\n", + " '3rd district, Silver Spring',\n", + " 'DRIVING VEHICLE ON HIGHWAY WITH SUSPENDED REGISTRATION',\n", + " '8804 
FLOWER AVE',\n", + " None,\n", + " None,\n", + " 'No',\n", + " 'No',\n", + " 'No',\n", + " 'No',\n", + " 'No',\n", + " 'No',\n", + " 'No',\n", + " 'No',\n", + " 'No',\n", + " 'No',\n", + " 'MD',\n", + " '02 - Automobile',\n", + " '2008',\n", + " 'FORD',\n", + " '4S',\n", + " 'BLACK',\n", + " 'Citation',\n", + " '13-401(h)',\n", + " 'Transportation Article',\n", + " 'No',\n", + " 'BLACK',\n", + " 'M',\n", + " 'TAKOMA PARK',\n", + " 'MD',\n", + " 'MD',\n", + " 'A - Marked Patrol',\n", + " [None, None, None, None, None]]" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "info[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "dataframe = pd.DataFrame(data, columns=clean_names)" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(842, 35)" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataframe.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "dataframe.date_of_stop = pd.to_datetime(dataframe.date_of_stop)" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connected.\n" + ] + } + ], + "source": [ + "#Server connection with MongoClient\n", + "from pymongo import MongoClient\n", + "import json\n", + "server_conn = MongoClient(host='localhost', port=27017)\n", + "try:\n", + " server_conn.admin.command('ismaster')\n", + " print(\"Connected.\")\n", + "except ConnectionFailure:\n", + " print(\"Server not available\")" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'traffic'), 'collections')\n" + ] + } + ], + "source": [ + "db = server_conn['traffic']\n", + "collections = db.collections\n", + "print(collections)" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 118, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "collections.insert_many(dataframe.to_dict('records'))" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [], + "source": [ + "queryColor = collections.aggregate(\n", + " [{\n", + " \"$group\" : \n", + " {\n", + " \"_id\":\"$color\", \"count\":\n", + " {\n", + " \"$sum\":1\n", + " }\n", + " }\n", + " }])" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'count': 1, '_id': 'BRONZE'}\n", + "{'count': 2, '_id': 'BROWN'}\n", + "{'count': 3, '_id': 'GREEN, LGT'}\n", + "{'count': 13, '_id': 'MAROON'}\n", + "{'count': 5, '_id': 'PURPLE'}\n", + "{'count': 29, '_id': 'GOLD'}\n", + "{'count': 3, '_id': 'CREAM'}\n", + "{'count': 33, '_id': 'TAN'}\n", + "{'count': 13, '_id': 'GREEN, DK'}\n", + "{'count': 20, '_id': 'BLUE, LIGHT'}\n", + "{'count': 43, '_id': 'GREEN'}\n", + "{'count': 4, '_id': 'ORANGE'}\n", + "{'count': 8, '_id': 'N/A'}\n", + "{'count': 111, '_id': 'WHITE'}\n", + "{'count': 67, '_id': 'BLUE'}\n", + "{'count': 145, '_id': 'SILVER'}\n", + "{'count': 21, '_id': 'BLUE, DARK'}\n", + "{'count': 73, '_id': 'RED'}\n", + "{'count': 76, '_id': 'GRAY'}\n", + "{'count': 1, '_id': None}\n", + "{'count': 159, '_id': 'BLACK'}\n", + "{'count': 12, '_id': 'BEIGE'}\n" + ] + } + ], + "source": [ + "for cursor in queryColor:\n", + " print(cursor)" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": { + 
"collapsed": true + }, + "outputs": [], + "source": [ + "queryArrest = collections.aggregate(\n", + " [{\n", + " \"$group\" : \n", + " {\"_id\":\"$arrest_type\", \"count\":\n", + " {\n", + " \"$sum\":1\n", + " }\n", + " }\n", + " }])" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'count': 2, '_id': 'E - Marked Stationary Radar'}\n", + "{'count': 2, '_id': 'S - License Plate Recognition'}\n", + "{'count': 1, '_id': 'R - Unmarked Laser'}\n", + "{'count': 1, '_id': 'L - Motorcycle'}\n", + "{'count': 4, '_id': 'M - Marked (Off-Duty)'}\n", + "{'count': 35, '_id': 'Q - Marked Laser'}\n", + "{'count': 13, '_id': 'B - Unmarked Patrol'}\n", + "{'count': 3, '_id': 'O - Foot Patrol'}\n", + "{'count': 781, '_id': 'A - Marked Patrol'}\n" + ] + } + ], + "source": [ + "for cursor in queryArrest:\n", + " print(cursor)" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "queryDay=collections.aggregate(\n", + " [\n", + " {\n", + " \"$project\":\n", + " {\n", + " \"dayOfWeek\": { \"$dayOfWeek\": \"$date_of_stop\" },\n", + " }\n", + " }\n", + " ])" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 133, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_chunk = pd.DataFrame(list(collections.find()))\n", + "data_chunk.head" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "#Close Server Connection\n", + "server_conn.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4 133\n", + "0 130\n", + "1 129\n", + "3 128\n", + "6 
113\n", + "5 113\n", + "2 96\n", + "Name: date_of_stop, dtype: int64\n" + ] + } + ], + "source": [ + "print(data_chunk.date_of_stop.dt.dayofweek.value_counts())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 620aee54625031d876d08882e7382a017ec5d3d0 Mon Sep 17 00:00:00 2001 From: HAK-CODE Date: Sat, 13 May 2017 00:11:35 +0500 Subject: [PATCH 2/2] Updated readme --- README.md | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index baa5f34..4ea673c 100644 --- a/README.md +++ b/README.md @@ -1 +1,38 @@ -# Database_Assignment \ No newline at end of file +# Database Assignment + +Database assignment for a JSON dataset. + +# Disclaimer + +This assignment uses only 10,000 rows of the JSON data because the machine was not capable of processing the whole file. Query results may therefore differ from the answers for the full dataset. + +## Getting Started + +Start a mongo client using the command: + +``` +$mongo +``` + +Create a database using the command: + +``` +$use traffic +``` + +Create a collection: + +``` +$db.createCollection(providedfile) +``` + +### Prerequisites + +The following libraries are needed: +1. ijson +2. pymongo +3. pandas + +## Author + +Hammad Ali Khan