From 9c05ebae0da6758dd63017eae6d2d90a8e65527f Mon Sep 17 00:00:00 2001
From: Rabin Sapkota <123710399+theRubyPheonix@users.noreply.github.com>
Date: Tue, 21 Mar 2023 02:11:55 +0000
Subject: [PATCH] Created using Colaboratory

---
 Prog4BigData_Lab.ipynb | 835 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 835 insertions(+)
 create mode 100644 Prog4BigData_Lab.ipynb
diff --git a/Prog4BigData_Lab.ipynb b/Prog4BigData_Lab.ipynb
new file mode 100644
index 0000000..760a27c
--- /dev/null
+++ b/Prog4BigData_Lab.ipynb
@@ -0,0 +1,835 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "collapsed_sections": [
+        "d8JHFIbyXyxd",
+        "bJhNw4oSXUad",
+        "xuKBEjeCXKH0"
+      ],
+      "authorship_tag": "ABX9TyNOGT36Pd2vBFmqsDCwxjFs",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/github/theRubyPheonix/AttentiveFP/blob/proxyzzz/Prog4BigData_Lab.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "**Apache Spark Machine Learning using Dataframes in Google Colab**"
+      ],
+      "metadata": {
+        "id": "xVUbDSjPJatr"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "uz32p9PrJf2G"
+      },
+      "execution_count": 53,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Set up an Apache Spark instance in Google Colab\n",
+        "# Latest version 3.3.2 from https://archive.apache.org/dist/spark\n",
+        "!apt-get install openjdk-8-jdk-headless -qq > /dev/null\n",
+        "!wget -q https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop2.tgz\n",
+        "!tar xf spark-3.3.2-bin-hadoop2.tgz\n",
+        "!pip install -q findspark "
+      ],
+      "metadata": {
+        "id": "K34_Wvx5J2b-"
+      },
+      "execution_count": 54,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Set the java and spark environment location \n",
+        "import os\n",
+        "os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-8-openjdk-amd64\"\n",
+        "os.environ[\"SPARK_HOME\"] = \"/content/spark-3.3.2-bin-hadoop2\""
+      ],
+      "metadata": {
+        "id": "iRLSHEx-KG5O"
+      },
+      "execution_count": 55,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Locate the Spark installation (via SPARK_HOME) so that pyspark can be imported\n",
+        "import findspark\n",
+        "findspark.init()"
+      ],
+      "metadata": {
+        "id": "GH3pt5LLKMCK"
+      },
+      "execution_count": 56,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from pyspark.sql import SparkSession\n",
+        "spark = SparkSession.builder\\\n",
+        "        .master(\"local\")\\\n",
+        "        .appName(\"Colab\")\\\n",
+        "        .config(\"spark.ui.port\",\"4050\")\\\n",
+        "        .getOrCreate()\n",
+        "spark"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 216
+        },
+        "id": "AwDevvXcKPau",
+        "outputId": "b3890014-00f6-4988-f1f3-69cfd9cefdde"
+      },
+      "execution_count": 57,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "<pyspark.sql.session.SparkSession at 0x7fc2459e9280>"
+            ],
+            "text/html": [
+              "\n",
+              "            <div>\n",
+              "                <p><b>SparkSession - in-memory</b></p>\n",
+              "                \n",
+              "        <div>\n",
+              "            <p><b>SparkContext</b></p>\n",
+              "\n",
+              "            <p><a href=\"http://794412ef3bbe:4050\">Spark UI</a></p>\n",
+              "\n",
+              "            <dl>\n",
+              "              <dt>Version</dt>\n",
+              "                <dd><code>v3.3.2</code></dd>\n",
+              "              <dt>Master</dt>\n",
+              "                <dd><code>local</code></dd>\n",
+              "              <dt>AppName</dt>\n",
+              "                <dd><code>Colab</code></dd>\n",
+              "            </dl>\n",
+              "        </div>\n",
+              "        \n",
+              "            </div>\n",
+              "        "
+            ]
+          },
+          "metadata": {},
+          "execution_count": 57
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Getting the Iris dataset\n",
+        "!wget https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data -O sample_data/iris.data"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "sO-MJ0XWRyfr",
+        "outputId": "ef5bd0e5-f05f-4d6c-f25d-5300cbf63137"
+      },
+      "execution_count": 58,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "--2023-03-20 23:27:31--  https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data\n",
+            "Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252\n",
+            "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.\n",
+            "HTTP request sent, awaiting response... 200 OK\n",
+            "Length: 4551 (4.4K) [application/x-httpd-php]\n",
+            "Saving to: ‘sample_data/iris.data’\n",
+            "\n",
+            "sample_data/iris.da 100%[===================>]   4.44K  --.-KB/s    in 0s      \n",
+            "\n",
+            "2023-03-20 23:27:32 (171 MB/s) - ‘sample_data/iris.data’ saved [4551/4551]\n",
+            "\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Import the dataset into dataframe \n",
+        "df = spark.read.csv('sample_data/iris.data', inferSchema=True).toDF('SepalLength','SepalWidth','PetalLength','PetalWidth','Class')"
+      ],
+      "metadata": {
+        "id": "tpT9PvaUW61w"
+      },
+      "execution_count": 60,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Feature Extraction"
+      ],
+      "metadata": {
+        "id": "rHAEMVhAXtSU"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "df.show()"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "0ccOU0rcbsLK",
+        "outputId": "5dbd6306-478b-4fa9-cd1c-3479124a7c88"
+      },
+      "execution_count": 61,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "+-----------+----------+-----------+----------+-----------+\n",
+            "|SepalLength|SepalWidth|PetalLength|PetalWidth|      Class|\n",
+            "+-----------+----------+-----------+----------+-----------+\n",
+            "|        5.1|       3.5|        1.4|       0.2|Iris-setosa|\n",
+            "|        4.9|       3.0|        1.4|       0.2|Iris-setosa|\n",
+            "|        4.7|       3.2|        1.3|       0.2|Iris-setosa|\n",
+            "|        4.6|       3.1|        1.5|       0.2|Iris-setosa|\n",
+            "|        5.0|       3.6|        1.4|       0.2|Iris-setosa|\n",
+            "|        5.4|       3.9|        1.7|       0.4|Iris-setosa|\n",
+            "|        4.6|       3.4|        1.4|       0.3|Iris-setosa|\n",
+            "|        5.0|       3.4|        1.5|       0.2|Iris-setosa|\n",
+            "|        4.4|       2.9|        1.4|       0.2|Iris-setosa|\n",
+            "|        4.9|       3.1|        1.5|       0.1|Iris-setosa|\n",
+            "|        5.4|       3.7|        1.5|       0.2|Iris-setosa|\n",
+            "|        4.8|       3.4|        1.6|       0.2|Iris-setosa|\n",
+            "|        4.8|       3.0|        1.4|       0.1|Iris-setosa|\n",
+            "|        4.3|       3.0|        1.1|       0.1|Iris-setosa|\n",
+            "|        5.8|       4.0|        1.2|       0.2|Iris-setosa|\n",
+            "|        5.7|       4.4|        1.5|       0.4|Iris-setosa|\n",
+            "|        5.4|       3.9|        1.3|       0.4|Iris-setosa|\n",
+            "|        5.1|       3.5|        1.4|       0.3|Iris-setosa|\n",
+            "|        5.7|       3.8|        1.7|       0.3|Iris-setosa|\n",
+            "|        5.1|       3.8|        1.5|       0.3|Iris-setosa|\n",
+            "+-----------+----------+-----------+----------+-----------+\n",
+            "only showing top 20 rows\n",
+            "\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Spark ML can only deal with one features column - so we need to vectorise the multiple columns into one:\n",
+        "from pyspark.ml.linalg import Vectors\n",
+        "from pyspark.ml.feature import VectorAssembler\n",
+        "vector_assembler = VectorAssembler(\\\n",
+        "                                   inputCols=['SepalLength','SepalWidth','PetalLength','PetalWidth'],\\\n",
+        "                                   outputCol='features')\n",
+        "df_temp=vector_assembler.transform(df)\n",
+        "df_temp.show(3)"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "fAC5bZo5bvs-",
+        "outputId": "b2f0d902-9ff7-431f-f2b8-55a467ac65d3"
+      },
+      "execution_count": 62,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "+-----------+----------+-----------+----------+-----------+-----------------+\n",
+            "|SepalLength|SepalWidth|PetalLength|PetalWidth|      Class|         features|\n",
+            "+-----------+----------+-----------+----------+-----------+-----------------+\n",
+            "|        5.1|       3.5|        1.4|       0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|\n",
+            "|        4.9|       3.0|        1.4|       0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|\n",
+            "|        4.7|       3.2|        1.3|       0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|\n",
+            "+-----------+----------+-----------+----------+-----------+-----------------+\n",
+            "only showing top 3 rows\n",
+            "\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "df_temp = df_temp.drop('SepalLength','SepalWidth','PetalLength','PetalWidth')\n",
+        "df_temp.show()"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "MQT4KjiHcy8B",
+        "outputId": "601906b3-a5d7-4c32-a686-d00b5d8c673b"
+      },
+      "execution_count": 63,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "+-----------+-----------------+\n",
+            "|      Class|         features|\n",
+            "+-----------+-----------------+\n",
+            "|Iris-setosa|[5.1,3.5,1.4,0.2]|\n",
+            "|Iris-setosa|[4.9,3.0,1.4,0.2]|\n",
+            "|Iris-setosa|[4.7,3.2,1.3,0.2]|\n",
+            "|Iris-setosa|[4.6,3.1,1.5,0.2]|\n",
+            "|Iris-setosa|[5.0,3.6,1.4,0.2]|\n",
+            "|Iris-setosa|[5.4,3.9,1.7,0.4]|\n",
+            "|Iris-setosa|[4.6,3.4,1.4,0.3]|\n",
+            "|Iris-setosa|[5.0,3.4,1.5,0.2]|\n",
+            "|Iris-setosa|[4.4,2.9,1.4,0.2]|\n",
+            "|Iris-setosa|[4.9,3.1,1.5,0.1]|\n",
+            "|Iris-setosa|[5.4,3.7,1.5,0.2]|\n",
+            "|Iris-setosa|[4.8,3.4,1.6,0.2]|\n",
+            "|Iris-setosa|[4.8,3.0,1.4,0.1]|\n",
+            "|Iris-setosa|[4.3,3.0,1.1,0.1]|\n",
+            "|Iris-setosa|[5.8,4.0,1.2,0.2]|\n",
+            "|Iris-setosa|[5.7,4.4,1.5,0.4]|\n",
+            "|Iris-setosa|[5.4,3.9,1.3,0.4]|\n",
+            "|Iris-setosa|[5.1,3.5,1.4,0.3]|\n",
+            "|Iris-setosa|[5.7,3.8,1.7,0.3]|\n",
+            "|Iris-setosa|[5.1,3.8,1.5,0.3]|\n",
+            "+-----------+-----------------+\n",
+            "only showing top 20 rows\n",
+            "\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# The final data preparation step is to index the Class column - to use numeric rather than text values \n",
+        "# - run the following command and display your output of Class, features & ClassIndex columns\n",
+        "from pyspark.ml.feature import StringIndexer\n",
+        "l_indexer=StringIndexer(inputCol='Class',outputCol='ClassIndex')\n",
+        "df_temp=l_indexer.fit(df_temp).transform(df_temp)\n",
+        "df_temp.show()"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "dtHWFZngdGZW",
+        "outputId": "074a0923-5e7c-43d1-e33f-222b1261c012"
+      },
+      "execution_count": 64,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "+-----------+-----------------+----------+\n",
+            "|      Class|         features|ClassIndex|\n",
+            "+-----------+-----------------+----------+\n",
+            "|Iris-setosa|[5.1,3.5,1.4,0.2]|       0.0|\n",
+            "|Iris-setosa|[4.9,3.0,1.4,0.2]|       0.0|\n",
+            "|Iris-setosa|[4.7,3.2,1.3,0.2]|       0.0|\n",
+            "|Iris-setosa|[4.6,3.1,1.5,0.2]|       0.0|\n",
+            "|Iris-setosa|[5.0,3.6,1.4,0.2]|       0.0|\n",
+            "|Iris-setosa|[5.4,3.9,1.7,0.4]|       0.0|\n",
+            "|Iris-setosa|[4.6,3.4,1.4,0.3]|       0.0|\n",
+            "|Iris-setosa|[5.0,3.4,1.5,0.2]|       0.0|\n",
+            "|Iris-setosa|[4.4,2.9,1.4,0.2]|       0.0|\n",
+            "|Iris-setosa|[4.9,3.1,1.5,0.1]|       0.0|\n",
+            "|Iris-setosa|[5.4,3.7,1.5,0.2]|       0.0|\n",
+            "|Iris-setosa|[4.8,3.4,1.6,0.2]|       0.0|\n",
+            "|Iris-setosa|[4.8,3.0,1.4,0.1]|       0.0|\n",
+            "|Iris-setosa|[4.3,3.0,1.1,0.1]|       0.0|\n",
+            "|Iris-setosa|[5.8,4.0,1.2,0.2]|       0.0|\n",
+            "|Iris-setosa|[5.7,4.4,1.5,0.4]|       0.0|\n",
+            "|Iris-setosa|[5.4,3.9,1.3,0.4]|       0.0|\n",
+            "|Iris-setosa|[5.1,3.5,1.4,0.3]|       0.0|\n",
+            "|Iris-setosa|[5.7,3.8,1.7,0.3]|       0.0|\n",
+            "|Iris-setosa|[5.1,3.8,1.5,0.3]|       0.0|\n",
+            "+-----------+-----------------+----------+\n",
+            "only showing top 20 rows\n",
+            "\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Split Training and Test"
+      ],
+      "metadata": {
+        "id": "d8JHFIbyXyxd"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Splitting the data into training and test datasets (fixed seed so the split and the reported accuracies are reproducible)\n",
+        "(trainingData, testData) = df_temp.randomSplit([0.7,0.3], seed=42)"
+      ],
+      "metadata": {
+        "id": "j2C2Ij4DdL11"
+      },
+      "execution_count": 65,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Decision Tree "
+      ],
+      "metadata": {
+        "id": "bJhNw4oSXUad"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Decision Tree Classifier\n",
+        "from pyspark.ml.classification import DecisionTreeClassifier\n",
+        "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n",
+        "\n",
+        "dt=DecisionTreeClassifier(labelCol='ClassIndex',featuresCol='features')\n",
+        "model=dt.fit(trainingData)"
+      ],
+      "metadata": {
+        "id": "QBel9HANfPx0"
+      },
+      "execution_count": 66,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Test your model with your test dataset: \n",
+        "predictions=model.transform(testData)\n",
+        "predictions.select('prediction','ClassIndex').show(15)"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "bN_LZai3gXxc",
+        "outputId": "4476a3de-d21f-4bd8-e888-5471b07f78b0"
+      },
+      "execution_count": 67,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "+----------+----------+\n",
+            "|prediction|ClassIndex|\n",
+            "+----------+----------+\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "+----------+----------+\n",
+            "only showing top 15 rows\n",
+            "\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Evaluation fn\n",
+        "evaluator=MulticlassClassificationEvaluator(labelCol='ClassIndex',predictionCol='prediction',metricName='accuracy')\n",
+        "accuracy=evaluator.evaluate(predictions)\n",
+        "print('Test Error = %g'%(1.0-accuracy))\n",
+        "print('Test Set Accuracy =' + str(accuracy))"
+      ],
+      "metadata": {
+        "id": "qr26Bs56is6A",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "1361979f-dde4-4efa-bed9-002700a53124"
+      },
+      "execution_count": 71,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Test Error = 0.0208333\n",
+            "Test Set Accuracy =0.9791666666666666\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Random Forest"
+      ],
+      "metadata": {
+        "id": "xuKBEjeCXKH0"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Random Forest Classifier\n",
+        "from pyspark.ml.classification import RandomForestClassifier\n",
+        "\n",
+        "rf=RandomForestClassifier(labelCol='ClassIndex',featuresCol='features',numTrees=10)\n",
+        "model=rf.fit(trainingData)\n",
+        "predictions=model.transform(testData)\n",
+        "predictions.select('prediction','ClassIndex').show()"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "mlxf8-wgXd_f",
+        "outputId": "6a378d93-3196-4a99-96b9-16668b0aea70"
+      },
+      "execution_count": 74,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "Exception ignored in: <function JavaWrapper.__del__ at 0x7fc244746820>\n",
+            "Traceback (most recent call last):\n",
+            "  File \"/content/spark-3.3.2-bin-hadoop2/python/pyspark/ml/wrapper.py\", line 53, in __del__\n",
+            "    if SparkContext._active_spark_context and self._java_obj is not None:\n",
+            "AttributeError: 'RandomForestClassifier' object has no attribute '_java_obj'\n"
+          ]
+        },
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "+----------+----------+\n",
+            "|prediction|ClassIndex|\n",
+            "+----------+----------+\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "|       0.0|       0.0|\n",
+            "+----------+----------+\n",
+            "only showing top 20 rows\n",
+            "\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Evaluation Fn\n",
+        "evaluator=MulticlassClassificationEvaluator(labelCol='ClassIndex',predictionCol='prediction',metricName='accuracy')\n",
+        "accuracy=evaluator.evaluate(predictions)\n",
+        "print('Test Error = %g' % (1.0-accuracy))\n",
+        "print('Test Set Accuracy = '+ str(accuracy))"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "0wRgRb7FYfDv",
+        "outputId": "257884b0-7fec-4a51-ba3f-827e4889ae04"
+      },
+      "execution_count": 76,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Test Error = 0.0208333\n",
+            "Test Set Accuracy = 0.9791666666666666\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Naive Bayes Classifier"
+      ],
+      "metadata": {
+        "id": "5ht5N9Rpa-NR"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Naive Bayes Classifier\n",
+        "from pyspark.ml.classification import NaiveBayes\n",
+        "\n",
+        "nb=NaiveBayes(labelCol='ClassIndex',featuresCol='features',smoothing=1.0,modelType='multinomial')\n",
+        "model=nb.fit(trainingData)\n",
+        "predictions=model.transform(testData)\n",
+        "predictions.select('Class','ClassIndex','probability','prediction').show()"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "qzfkVcMQbBA4",
+        "outputId": "8f2b6ca4-059e-41ec-9c6f-4665349c71de"
+      },
+      "execution_count": 77,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "+-----------+----------+--------------------+----------+\n",
+            "|      Class|ClassIndex|         probability|prediction|\n",
+            "+-----------+----------+--------------------+----------+\n",
+            "|Iris-setosa|       0.0|[0.65854431190304...|       0.0|\n",
+            "|Iris-setosa|       0.0|[0.59601222050315...|       0.0|\n",
+            "|Iris-setosa|       0.0|[0.62416333661554...|       0.0|\n",
+            "|Iris-setosa|       0.0|[0.58353510118447...|       0.0|\n",
+            "|Iris-setosa|       0.0|[0.73347005973151...|       0.0|\n",
+            "|Iris-setosa|       0.0|[0.63851500041302...|       0.0|\n",
+            "|Iris-setosa|       0.0|[0.58445564581932...|       0.0|\n",
+            "|Iris-setosa|       0.0|[0.56430482772553...|       0.0|\n",
+            "|Iris-setosa|       0.0|[0.56269476579405...|       0.0|\n",
+            "|Iris-setosa|       0.0|[0.63074902380229...|       0.0|\n",
+            "|Iris-setosa|       0.0|[0.66959617354636...|       0.0|\n",
+            "|Iris-setosa|       0.0|[0.64492936285987...|       0.0|\n",
+            "|Iris-setosa|       0.0|[0.50846896200676...|       0.0|\n",
+            "|Iris-setosa|       0.0|[0.49918864496678...|       0.0|\n",
+            "|Iris-setosa|       0.0|[0.64960562084001...|       0.0|\n",
+            "|Iris-setosa|       0.0|[0.67965272979208...|       0.0|\n",
+            "|Iris-setosa|       0.0|[0.56889671619589...|       0.0|\n",
+            "|Iris-setosa|       0.0|[0.67120169377758...|       0.0|\n",
+            "|Iris-setosa|       0.0|[0.69743996582468...|       0.0|\n",
+            "|Iris-setosa|       0.0|[0.60017434060471...|       0.0|\n",
+            "+-----------+----------+--------------------+----------+\n",
+            "only showing top 20 rows\n",
+            "\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Evaluation Fn\n",
+        "evaluator=MulticlassClassificationEvaluator(labelCol='ClassIndex',predictionCol='prediction',metricName='accuracy')\n",
+        "accuracy=evaluator.evaluate(predictions)\n",
+        "print('Test Error = %g' % (1.0-accuracy))\n",
+        "print('Test Set Accuracy = '+ str(accuracy))"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "O50-b6mkcCeA",
+        "outputId": "8f6a6939-8325-4d46-b41d-314459f4c0a8"
+      },
+      "execution_count": 78,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Test Error = 0.145833\n",
+            "Test Set Accuracy = 0.8541666666666666\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "6G-w20Z0c1aa"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# SAMPL Molecule Data\n"
+      ],
+      "metadata": {
+        "id": "CWBkOMtzeCGe"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!wget https://raw.githubusercontent.com/OpenDrugAI/AttentiveFP/master/data/SAMPL.csv -O sample_data/SAMPL.csv"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "OmXK0xz7e3_0",
+        "outputId": "f3cdad4b-7fd5-4e39-b9f9-1f6c7f59ecee"
+      },
+      "execution_count": 82,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "--2023-03-21 00:45:28--  https://raw.githubusercontent.com/OpenDrugAI/AttentiveFP/master/data/SAMPL.csv\n",
+            "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
+            "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
+            "HTTP request sent, awaiting response... 200 OK\n",
+            "Length: 32060 (31K) [text/plain]\n",
+            "Saving to: ‘sample_data/SAMPL.csv’\n",
+            "\n",
+            "\rsample_data/SAMPL.c   0%[                    ]       0  --.-KB/s               \rsample_data/SAMPL.c 100%[===================>]  31.31K  --.-KB/s    in 0s      \n",
+            "\n",
+            "2023-03-21 00:45:28 (114 MB/s) - ‘sample_data/SAMPL.csv’ saved [32060/32060]\n",
+            "\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "df = spark.read.option('delimiter',',').option('header',True).option('inferSchema',True).csv('sample_data/SAMPL.csv')  # inferSchema so expt/calc load as numeric columns instead of strings"
+      ],
+      "metadata": {
+        "id": "0cINbK53fH74"
+      },
+      "execution_count": 113,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "df.show()"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "voW4I_E9gbxW",
+        "outputId": "ef0a9143-ebb8-4755-f05c-2ffcffb20347"
+      },
+      "execution_count": 114,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "+--------------------+--------------------+------+------+\n",
+            "|               iupac|              smiles|  expt|  calc|\n",
+            "+--------------------+--------------------+------+------+\n",
+            "|4-methoxy-N,N-dim...|CN(C)C(=O)c1ccc(c...|-11.01|-9.625|\n",
+            "|methanesulfonyl c...|        CS(=O)(=O)Cl| -4.87|-6.219|\n",
+            "|   3-methylbut-1-ene|            CC(C)C=C|  1.83| 2.452|\n",
+            "|     2-ethylpyrazine|          CCc1cnccn1| -5.45|-5.809|\n",
+            "|         heptan-1-ol|            CCCCCCCO| -4.21|-2.917|\n",
+            "|  3,5-dimethylphenol|     Cc1cc(cc(c1)O)C| -6.27|-5.444|\n",
+            "|  2,3-dimethylbutane|          CC(C)C(C)C|  2.34| 2.468|\n",
+            "| 2-methylpentan-2-ol|         CCCC(C)(C)O| -3.92|-2.779|\n",
+            "|1,2-dimethylcyclo...|C[C@@H]1CCCC[C@@H]1C|  1.58| 1.685|\n",
+            "|          butan-2-ol|         CC[C@H](C)O| -4.62|-3.145|\n",
+            "|      dibromomethane|             C(Br)Br| -1.96|-0.405|\n",
+            "| 2-methylpentan-3-ol|     CC[C@H](C(C)C)O| -3.88|-2.416|\n",
+            "|     2-ethylpyridine|          CCc1ccccn1| -4.33| -3.31|\n",
+            "|    ethyl pentanoate|        CCCCC(=O)OCC| -2.49| -3.11|\n",
+            "|        benzenethiol|         c1ccc(cc1)S| -2.55|-1.501|\n",
+            "|(2Z)-3,7-dimethyl...|CC(=CCC/C(=C\\CO)/C)C| -4.78|-2.597|\n",
+            "|              indane|     c1ccc2c(c1)CCC2| -1.46|-1.752|\n",
+            "|       ethoxybenzene|         CCOc1ccccc1| -2.22|-2.254|\n",
+            "|       4-bromophenol|       c1cc(ccc1O)Br| -5.85|-5.833|\n",
+            "| 2,2-dimethylpentane|         CCCC(C)(C)C|  2.88| 2.686|\n",
+            "+--------------------+--------------------+------+------+\n",
+            "only showing top 20 rows\n",
+            "\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "eZ4vv9EMoe5y"
+      },
+      "execution_count": 115,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "LG1zUjtkwRP1"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file