From 9c05ebae0da6758dd63017eae6d2d90a8e65527f Mon Sep 17 00:00:00 2001 From: Rabin Sapkota <123710399+theRubyPheonix@users.noreply.github.com> Date: Tue, 21 Mar 2023 02:11:55 +0000 Subject: [PATCH] Created using Colaboratory --- Prog4BigData_Lab.ipynb | 835 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 835 insertions(+) create mode 100644 Prog4BigData_Lab.ipynb diff --git a/Prog4BigData_Lab.ipynb b/Prog4BigData_Lab.ipynb new file mode 100644 index 0000000..760a27c --- /dev/null +++ b/Prog4BigData_Lab.ipynb @@ -0,0 +1,835 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "collapsed_sections": [ + "d8JHFIbyXyxd", + "bJhNw4oSXUad", + "xuKBEjeCXKH0" + ], + "authorship_tag": "ABX9TyNOGT36Pd2vBFmqsDCwxjFs", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "**Apache Spark Machine Learning using Dataframes in Google Colab**" + ], + "metadata": { + "id": "xVUbDSjPJatr" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "uz32p9PrJf2G" + }, + "execution_count": 53, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Setup apache instance in Google Colab \n", + "# Latest version 3.3.2 from https://archive.apache.org/dist/spark\n", + "!apt-get install openjdk-8-jdk-headless -qq > /dev/null\n", + "!wget -q https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop2.tgz\n", + "!tar xf spark-3.3.2-bin-hadoop2.tgz\n", + "!pip install -q findspark " + ], + "metadata": { + "id": "K34_Wvx5J2b-" + }, + "execution_count": 54, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Set the java and spark environment location \n", + "import os\n", + 
"os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-8-openjdk-amd64\"\n", + "os.environ[\"SPARK_HOME\"] = \"/content/spark-3.3.2-bin-hadoop2\"" + ], + "metadata": { + "id": "iRLSHEx-KG5O" + }, + "execution_count": 55, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Local spark session \n", + "import findspark\n", + "findspark.init()" + ], + "metadata": { + "id": "GH3pt5LLKMCK" + }, + "execution_count": 56, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from pyspark.sql import SparkSession\n", + "spark = SparkSession.builder\\\n", + " .master(\"local\")\\\n", + " .appName(\"Colab\")\\\n", + " .config(\"spark.ui.port\",\"4050\")\\\n", + " .getOrCreate()\n", + "spark" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 216 + }, + "id": "AwDevvXcKPau", + "outputId": "b3890014-00f6-4988-f1f3-69cfd9cefdde" + }, + "execution_count": 57, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + "
\n", + "

SparkSession - in-memory

\n", + " \n", + "
\n", + "

SparkContext

\n", + "\n", + "

Spark UI

\n", + "\n", + "
\n", + "
Version
\n", + "
v3.3.2
\n", + "
Master
\n", + "
local
\n", + "
AppName
\n", + "
Colab
\n", + "
\n", + "
\n", + " \n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 57 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Getting the Iris dataset\n", + "!wget https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data -O sample_data/iris.data" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "sO-MJ0XWRyfr", + "outputId": "ef5bd0e5-f05f-4d6c-f25d-5300cbf63137" + }, + "execution_count": 58, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--2023-03-20 23:27:31-- https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data\n", + "Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252\n", + "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 4551 (4.4K) [application/x-httpd-php]\n", + "Saving to: ‘sample_data/iris.data’\n", + "\n", + "sample_data/iris.da 100%[===================>] 4.44K --.-KB/s in 0s \n", + "\n", + "2023-03-20 23:27:32 (171 MB/s) - ‘sample_data/iris.data’ saved [4551/4551]\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Import the dataset into dataframe \n", + "df = spark.read.csv('sample_data/iris.data', inferSchema=True).toDF('SepalLength','SepalWidth','PetalLength','PetalWidth','Class')" + ], + "metadata": { + "id": "tpT9PvaUW61w" + }, + "execution_count": 60, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Feature Extraction" + ], + "metadata": { + "id": "rHAEMVhAXtSU" + } + }, + { + "cell_type": "code", + "source": [ + "df.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "0ccOU0rcbsLK", + "outputId": "5dbd6306-478b-4fa9-cd1c-3479124a7c88" + }, + "execution_count": 61, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+-----------+----------+-----------+----------+-----------+\n", +
"|SepalLength|SepalWidth|PetalLength|PetalWidth| Class|\n", + "+-----------+----------+-----------+----------+-----------+\n", + "| 5.1| 3.5| 1.4| 0.2|Iris-setosa|\n", + "| 4.9| 3.0| 1.4| 0.2|Iris-setosa|\n", + "| 4.7| 3.2| 1.3| 0.2|Iris-setosa|\n", + "| 4.6| 3.1| 1.5| 0.2|Iris-setosa|\n", + "| 5.0| 3.6| 1.4| 0.2|Iris-setosa|\n", + "| 5.4| 3.9| 1.7| 0.4|Iris-setosa|\n", + "| 4.6| 3.4| 1.4| 0.3|Iris-setosa|\n", + "| 5.0| 3.4| 1.5| 0.2|Iris-setosa|\n", + "| 4.4| 2.9| 1.4| 0.2|Iris-setosa|\n", + "| 4.9| 3.1| 1.5| 0.1|Iris-setosa|\n", + "| 5.4| 3.7| 1.5| 0.2|Iris-setosa|\n", + "| 4.8| 3.4| 1.6| 0.2|Iris-setosa|\n", + "| 4.8| 3.0| 1.4| 0.1|Iris-setosa|\n", + "| 4.3| 3.0| 1.1| 0.1|Iris-setosa|\n", + "| 5.8| 4.0| 1.2| 0.2|Iris-setosa|\n", + "| 5.7| 4.4| 1.5| 0.4|Iris-setosa|\n", + "| 5.4| 3.9| 1.3| 0.4|Iris-setosa|\n", + "| 5.1| 3.5| 1.4| 0.3|Iris-setosa|\n", + "| 5.7| 3.8| 1.7| 0.3|Iris-setosa|\n", + "| 5.1| 3.8| 1.5| 0.3|Iris-setosa|\n", + "+-----------+----------+-----------+----------+-----------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, +
{ + "cell_type": "code", + "source": [ + "# Spark ML can only deal with one features column - so we need to vectorise the multiple columns into one:\n", + "from pyspark.ml.linalg import Vectors\n", + "from pyspark.ml.feature import VectorAssembler\n", + "vector_assembler = VectorAssembler(\\\n", + " inputCols=['SepalLength','SepalWidth','PetalLength','PetalWidth'],\\\n", + " outputCol='features')\n", + "df_temp=vector_assembler.transform(df)\n", + "df_temp.show(3)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fAC5bZo5bvs-", + "outputId": "b2f0d902-9ff7-431f-f2b8-55a467ac65d3" + }, + "execution_count": 62, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+-----------+----------+-----------+----------+-----------+-----------------+\n", + "|SepalLength|SepalWidth|PetalLength|PetalWidth| Class| features|\n", + "+-----------+----------+-----------+----------+-----------+-----------------+\n", + "| 5.1| 3.5| 1.4| 0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|\n", + "| 4.9| 3.0| 1.4| 0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|\n", + "| 4.7| 3.2| 1.3| 0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|\n", + "+-----------+----------+-----------+----------+-----------+-----------------+\n", + "only showing top 3 rows\n", + "\n" + ] + } + ] + }, +
{ + "cell_type": "code", + "source": [ + "df_temp = df_temp.drop('SepalLength','SepalWidth','PetalLength','PetalWidth') # keep only Class and the assembled features column\n", + "df_temp.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MQT4KjiHcy8B", + "outputId": "601906b3-a5d7-4c32-a686-d00b5d8c673b" + }, + "execution_count": 63, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+-----------+-----------------+\n", + "| Class| features|\n", + "+-----------+-----------------+\n", + "|Iris-setosa|[5.1,3.5,1.4,0.2]|\n", + "|Iris-setosa|[4.9,3.0,1.4,0.2]|\n", + "|Iris-setosa|[4.7,3.2,1.3,0.2]|\n", + "|Iris-setosa|[4.6,3.1,1.5,0.2]|\n", + "|Iris-setosa|[5.0,3.6,1.4,0.2]|\n", + "|Iris-setosa|[5.4,3.9,1.7,0.4]|\n", + "|Iris-setosa|[4.6,3.4,1.4,0.3]|\n", + "|Iris-setosa|[5.0,3.4,1.5,0.2]|\n", + "|Iris-setosa|[4.4,2.9,1.4,0.2]|\n", + "|Iris-setosa|[4.9,3.1,1.5,0.1]|\n", + "|Iris-setosa|[5.4,3.7,1.5,0.2]|\n", + "|Iris-setosa|[4.8,3.4,1.6,0.2]|\n", + "|Iris-setosa|[4.8,3.0,1.4,0.1]|\n", + "|Iris-setosa|[4.3,3.0,1.1,0.1]|\n", + "|Iris-setosa|[5.8,4.0,1.2,0.2]|\n", + "|Iris-setosa|[5.7,4.4,1.5,0.4]|\n", + "|Iris-setosa|[5.4,3.9,1.3,0.4]|\n", + "|Iris-setosa|[5.1,3.5,1.4,0.3]|\n", + "|Iris-setosa|[5.7,3.8,1.7,0.3]|\n", + "|Iris-setosa|[5.1,3.8,1.5,0.3]|\n", + "+-----------+-----------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, +
{ + "cell_type": "code", + "source": [ + "# The final data preparation step is to index the Class column - to use numeric rather than text values \n", + "# - run the following command and display your output of Class, features & ClassIndex columns\n", + "from pyspark.ml.feature import StringIndexer\n", + "l_indexer=StringIndexer(inputCol='Class',outputCol='ClassIndex')\n", + "df_temp=l_indexer.fit(df_temp).transform(df_temp)\n", + "df_temp.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dtHWFZngdGZW", + "outputId": "074a0923-5e7c-43d1-e33f-222b1261c012" + }, + "execution_count": 64, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+-----------+-----------------+----------+\n", + "| Class| features|ClassIndex|\n", + "+-----------+-----------------+----------+\n", + "|Iris-setosa|[5.1,3.5,1.4,0.2]| 0.0|\n", + "|Iris-setosa|[4.9,3.0,1.4,0.2]| 0.0|\n", + "|Iris-setosa|[4.7,3.2,1.3,0.2]| 0.0|\n", + "|Iris-setosa|[4.6,3.1,1.5,0.2]| 0.0|\n", + "|Iris-setosa|[5.0,3.6,1.4,0.2]| 0.0|\n", + "|Iris-setosa|[5.4,3.9,1.7,0.4]| 0.0|\n", + "|Iris-setosa|[4.6,3.4,1.4,0.3]| 0.0|\n", + "|Iris-setosa|[5.0,3.4,1.5,0.2]| 0.0|\n", + "|Iris-setosa|[4.4,2.9,1.4,0.2]| 0.0|\n", + "|Iris-setosa|[4.9,3.1,1.5,0.1]| 0.0|\n", + "|Iris-setosa|[5.4,3.7,1.5,0.2]| 0.0|\n", + "|Iris-setosa|[4.8,3.4,1.6,0.2]| 0.0|\n", + "|Iris-setosa|[4.8,3.0,1.4,0.1]| 0.0|\n", + "|Iris-setosa|[4.3,3.0,1.1,0.1]| 0.0|\n", + "|Iris-setosa|[5.8,4.0,1.2,0.2]| 0.0|\n", + "|Iris-setosa|[5.7,4.4,1.5,0.4]| 0.0|\n", + "|Iris-setosa|[5.4,3.9,1.3,0.4]| 0.0|\n", + "|Iris-setosa|[5.1,3.5,1.4,0.3]| 0.0|\n", + "|Iris-setosa|[5.7,3.8,1.7,0.3]| 0.0|\n", + "|Iris-setosa|[5.1,3.8,1.5,0.3]| 0.0|\n", + "+-----------+-----------------+----------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, +
{ + "cell_type": "markdown", + "source": [ + "# Split Training and Test" + ], + "metadata": { + "id": "d8JHFIbyXyxd" + } + }, + { + "cell_type": "code", + "source": [ + "# Splitting the data into training and test dataset\n", + "(trainingData, testData) = df_temp.randomSplit([0.7,0.3])" + ], + "metadata": { + "id": "j2C2Ij4DdL11" + }, + "execution_count": 65, + "outputs": [] +
}, + { + "cell_type": "markdown", + "source": [ + "# Decision Tree " + ], + "metadata": { + "id": "bJhNw4oSXUad" + } + }, + { + "cell_type": "code", + "source": [ + "# Deision Tree Classifier\n", + "from pyspark.ml.classification import DecisionTreeClassifier\n", + "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", + "\n", + "dt=DecisionTreeClassifier(labelCol='ClassIndex',featuresCol='features')\n", + "model=dt.fit(trainingData)" + ], + "metadata": { + "id": "QBel9HANfPx0" + }, + "execution_count": 66, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Test your model with your test dataset: \n", + "predictions=model.transform(testData)\n", + "predictions.select('prediction','ClassIndex').show(15)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bN_LZai3gXxc", + "outputId": "4476a3de-d21f-4bd8-e888-5471b07f78b0" + }, + "execution_count": 67, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+----------+----------+\n", + "|prediction|ClassIndex|\n", + "+----------+----------+\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "+----------+----------+\n", + "only showing top 15 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Evaluation fn\n", + "evaluator=MulticlassClassificationEvaluator(labelCol='ClassIndex',predictionCol='prediction',metricName='accuracy')\n", + "accuracy=evaluator.evaluate(predictions)\n", + "print('Test Error = %g'%(1.0-accuracy))\n", + "print('Test Set Accuracy =' + str(accuracy))" + ], + "metadata": { + "id": "qr26Bs56is6A", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "1361979f-dde4-4efa-bed9-002700a53124" + }, + "execution_count": 71, + 
"outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test Error = 0.0208333\n", + "Test Set Accuracy =0.9791666666666666\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Random Forest" + ], + "metadata": { + "id": "xuKBEjeCXKH0" + } + }, + { + "cell_type": "code", + "source": [ + "# Random Forest Classifier\n", + "from pyspark.ml.classification import RandomForestClassifier\n", + "\n", + "rf=RandomForestClassifier(labelCol='ClassIndex',featuresCol='features',numTrees=10)\n", + "model=rf.fit(trainingData)\n", + "predictions=model.transform(testData)\n", + "predictions.select('prediction','ClassIndex').show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mlxf8-wgXd_f", + "outputId": "6a378d93-3196-4a99-96b9-16668b0aea70" + }, + "execution_count": 74, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"/content/spark-3.3.2-bin-hadoop2/python/pyspark/ml/wrapper.py\", line 53, in __del__\n", + " if SparkContext._active_spark_context and self._java_obj is not None:\n", + "AttributeError: 'RandomForestClassifier' object has no attribute '_java_obj'\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+----------+----------+\n", + "|prediction|ClassIndex|\n", + "+----------+----------+\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "| 0.0| 0.0|\n", + "+----------+----------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Evaluation Fn\n", + 
"evaluator=MulticlassClassificationEvaluator(labelCol='ClassIndex',predictionCol='prediction',metricName='accuracy')\n", + "accuracy=evaluator.evaluate(predictions)\n", + "print('Test Error = %g' % (1.0-accuracy))\n", + "print('Test Set Accuracy = '+ str(accuracy))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "0wRgRb7FYfDv", + "outputId": "257884b0-7fec-4a51-ba3f-827e4889ae04" + }, + "execution_count": 76, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test Error = 0.0208333\n", + "Test Set Accuracy = 0.9791666666666666\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Naive bayes Classifier" + ], + "metadata": { + "id": "5ht5N9Rpa-NR" + } + }, + { + "cell_type": "code", + "source": [ + "# Naive Base Classifier\n", + "from pyspark.ml.classification import NaiveBayes\n", + "\n", + "nb=NaiveBayes(labelCol='ClassIndex',featuresCol='features',smoothing=1.0,modelType='multinomial')\n", + "model=nb.fit(trainingData)\n", + "predictions=model.transform(testData)\n", + "predictions.select('Class','ClassIndex','probability','prediction').show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qzfkVcMQbBA4", + "outputId": "8f2b6ca4-059e-41ec-9c6f-4665349c71de" + }, + "execution_count": 77, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+-----------+----------+--------------------+----------+\n", + "| Class|ClassIndex| probability|prediction|\n", + "+-----------+----------+--------------------+----------+\n", + "|Iris-setosa| 0.0|[0.65854431190304...| 0.0|\n", + "|Iris-setosa| 0.0|[0.59601222050315...| 0.0|\n", + "|Iris-setosa| 0.0|[0.62416333661554...| 0.0|\n", + "|Iris-setosa| 0.0|[0.58353510118447...| 0.0|\n", + "|Iris-setosa| 0.0|[0.73347005973151...| 0.0|\n", + "|Iris-setosa| 0.0|[0.63851500041302...| 0.0|\n", + "|Iris-setosa| 0.0|[0.58445564581932...| 0.0|\n", + "|Iris-setosa| 0.0|[0.56430482772553...| 
0.0|\n", + "|Iris-setosa| 0.0|[0.56269476579405...| 0.0|\n", + "|Iris-setosa| 0.0|[0.63074902380229...| 0.0|\n", + "|Iris-setosa| 0.0|[0.66959617354636...| 0.0|\n", + "|Iris-setosa| 0.0|[0.64492936285987...| 0.0|\n", + "|Iris-setosa| 0.0|[0.50846896200676...| 0.0|\n", + "|Iris-setosa| 0.0|[0.49918864496678...| 0.0|\n", + "|Iris-setosa| 0.0|[0.64960562084001...| 0.0|\n", + "|Iris-setosa| 0.0|[0.67965272979208...| 0.0|\n", + "|Iris-setosa| 0.0|[0.56889671619589...| 0.0|\n", + "|Iris-setosa| 0.0|[0.67120169377758...| 0.0|\n", + "|Iris-setosa| 0.0|[0.69743996582468...| 0.0|\n", + "|Iris-setosa| 0.0|[0.60017434060471...| 0.0|\n", + "+-----------+----------+--------------------+----------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Evaluation Fn\n", + "evaluator=MulticlassClassificationEvaluator(labelCol='ClassIndex',predictionCol='prediction',metricName='accuracy')\n", + "accuracy=evaluator.evaluate(predictions)\n", + "print('Test Error = %g' % (1.0-accuracy))\n", + "print('Test Set Accuracy = '+ str(accuracy))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "O50-b6mkcCeA", + "outputId": "8f6a6939-8325-4d46-b41d-314459f4c0a8" + }, + "execution_count": 78, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test Error = 0.145833\n", + "Test Set Accuracy = 0.8541666666666666\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "6G-w20Z0c1aa" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Protein Data\n" + ], + "metadata": { + "id": "CWBkOMtzeCGe" + } + }, + { + "cell_type": "code", + "source": [ + "!wget https://raw.githubusercontent.com/OpenDrugAI/AttentiveFP/master/data/SAMPL.csv -O sample_data/SAMPL.csv" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OmXK0xz7e3_0", + "outputId": 
"f3cdad4b-7fd5-4e39-b9f9-1f6c7f59ecee" + }, + "execution_count": 82, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--2023-03-21 00:45:28-- https://raw.githubusercontent.com/OpenDrugAI/AttentiveFP/master/data/SAMPL.csv\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 32060 (31K) [text/plain]\n", + "Saving to: ‘sample_data/SAMPL.csv’\n", + "\n", + "\rsample_data/SAMPL.c 0%[ ] 0 --.-KB/s \rsample_data/SAMPL.c 100%[===================>] 31.31K --.-KB/s in 0s \n", + "\n", + "2023-03-21 00:45:28 (114 MB/s) - ‘sample_data/SAMPL.csv’ saved [32060/32060]\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "df = spark.read.option('delimiter',',').option('header',True).csv('sample_data/SAMPL.csv')" + ], + "metadata": { + "id": "0cINbK53fH74" + }, + "execution_count": 113, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "voW4I_E9gbxW", + "outputId": "ef0a9143-ebb8-4755-f05c-2ffcffb20347" + }, + "execution_count": 114, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+--------------------+--------------------+------+------+\n", + "| iupac| smiles| expt| calc|\n", + "+--------------------+--------------------+------+------+\n", + "|4-methoxy-N,N-dim...|CN(C)C(=O)c1ccc(c...|-11.01|-9.625|\n", + "|methanesulfonyl c...| CS(=O)(=O)Cl| -4.87|-6.219|\n", + "| 3-methylbut-1-ene| CC(C)C=C| 1.83| 2.452|\n", + "| 2-ethylpyrazine| CCc1cnccn1| -5.45|-5.809|\n", + "| heptan-1-ol| CCCCCCCO| -4.21|-2.917|\n", + "| 3,5-dimethylphenol| Cc1cc(cc(c1)O)C| -6.27|-5.444|\n", + "| 2,3-dimethylbutane| CC(C)C(C)C| 2.34| 2.468|\n", + "| 
2-methylpentan-2-ol| CCCC(C)(C)O| -3.92|-2.779|\n", + "|1,2-dimethylcyclo...|C[C@@H]1CCCC[C@@H]1C| 1.58| 1.685|\n", + "| butan-2-ol| CC[C@H](C)O| -4.62|-3.145|\n", + "| dibromomethane| C(Br)Br| -1.96|-0.405|\n", + "| 2-methylpentan-3-ol| CC[C@H](C(C)C)O| -3.88|-2.416|\n", + "| 2-ethylpyridine| CCc1ccccn1| -4.33| -3.31|\n", + "| ethyl pentanoate| CCCCC(=O)OCC| -2.49| -3.11|\n", + "| benzenethiol| c1ccc(cc1)S| -2.55|-1.501|\n", + "|(2Z)-3,7-dimethyl...|CC(=CCC/C(=C\\CO)/C)C| -4.78|-2.597|\n", + "| indane| c1ccc2c(c1)CCC2| -1.46|-1.752|\n", + "| ethoxybenzene| CCOc1ccccc1| -2.22|-2.254|\n", + "| 4-bromophenol| c1cc(ccc1O)Br| -5.85|-5.833|\n", + "| 2,2-dimethylpentane| CCCC(C)(C)C| 2.88| 2.686|\n", + "+--------------------+--------------------+------+------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "eZ4vv9EMoe5y" + }, + "execution_count": 115, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "LG1zUjtkwRP1" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file