From 9c05ebae0da6758dd63017eae6d2d90a8e65527f Mon Sep 17 00:00:00 2001
From: Rabin Sapkota <123710399+theRubyPheonix@users.noreply.github.com>
Date: Tue, 21 Mar 2023 02:11:55 +0000
Subject: [PATCH] Created using Colaboratory
---
Prog4BigData_Lab.ipynb | 835 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 835 insertions(+)
create mode 100644 Prog4BigData_Lab.ipynb
diff --git a/Prog4BigData_Lab.ipynb b/Prog4BigData_Lab.ipynb
new file mode 100644
index 0000000..760a27c
--- /dev/null
+++ b/Prog4BigData_Lab.ipynb
@@ -0,0 +1,835 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "collapsed_sections": [
+ "d8JHFIbyXyxd",
+ "bJhNw4oSXUad",
+ "xuKBEjeCXKH0"
+ ],
+ "authorship_tag": "ABX9TyNOGT36Pd2vBFmqsDCwxjFs",
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Apache Spark Machine Learning using Dataframes in Google Colab**"
+ ],
+ "metadata": {
+ "id": "xVUbDSjPJatr"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "id": "uz32p9PrJf2G"
+ },
+ "execution_count": 53,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Set up Apache Spark in Google Colab\n",
+ "# Latest version 3.3.2 from https://archive.apache.org/dist/spark\n",
+ "!apt-get install openjdk-8-jdk-headless -qq > /dev/null\n",
+ "!wget -q https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop2.tgz\n",
+ "!tar xf spark-3.3.2-bin-hadoop2.tgz\n",
+ "!pip install -q findspark "
+ ],
+ "metadata": {
+ "id": "K34_Wvx5J2b-"
+ },
+ "execution_count": 54,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Set the java and spark environment location \n",
+ "import os\n",
+ "os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-8-openjdk-amd64\"\n",
+ "os.environ[\"SPARK_HOME\"] = \"/content/spark-3.3.2-bin-hadoop2\""
+ ],
+ "metadata": {
+ "id": "iRLSHEx-KG5O"
+ },
+ "execution_count": 55,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Local spark session \n",
+ "import findspark\n",
+ "findspark.init()"
+ ],
+ "metadata": {
+ "id": "GH3pt5LLKMCK"
+ },
+ "execution_count": 56,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from pyspark.sql import SparkSession\n",
+ "spark = SparkSession.builder\\\n",
+ " .master(\"local\")\\\n",
+ " .appName(\"Colab\")\\\n",
+ " .config(\"spark.ui.port\",\"4050\")\\\n",
+ " .getOrCreate()\n",
+ "spark"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 216
+ },
+ "id": "AwDevvXcKPau",
+ "outputId": "b3890014-00f6-4988-f1f3-69cfd9cefdde"
+ },
+ "execution_count": 57,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
SparkSession - in-memory
\n",
+ " \n",
+ "
\n",
+ "
SparkContext
\n",
+ "\n",
+ "
Spark UI
\n",
+ "\n",
+ "
\n",
+ " - Version
\n",
+ " v3.3.2 \n",
+ " - Master
\n",
+ " local \n",
+ " - AppName
\n",
+ " Colab \n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 57
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Getting the Iris dataset\n",
+ "!wget https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data -O sample_data/iris.data"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "sO-MJ0XWRyfr",
+ "outputId": "ef5bd0e5-f05f-4d6c-f25d-5300cbf63137"
+ },
+ "execution_count": 58,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "--2023-03-20 23:27:31-- https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data\n",
+ "Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252\n",
+ "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 4551 (4.4K) [application/x-httpd-php]\n",
+ "Saving to: ‘sample_data/iris.data’\n",
+ "\n",
+ "sample_data/iris.da 100%[===================>] 4.44K --.-KB/s in 0s \n",
+ "\n",
+ "2023-03-20 23:27:32 (171 MB/s) - ‘sample_data/iris.data’ saved [4551/4551]\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Import the dataset into dataframe \n",
+ "df = spark.read.csv('sample_data/iris.data', inferSchema=True).toDF('SepalLength','SepalWidth','PetalLength','PetalWidth','Class')"
+ ],
+ "metadata": {
+ "id": "tpT9PvaUW61w"
+ },
+ "execution_count": 60,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Feature Extraction"
+ ],
+ "metadata": {
+ "id": "rHAEMVhAXtSU"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "0ccOU0rcbsLK",
+ "outputId": "5dbd6306-478b-4fa9-cd1c-3479124a7c88"
+ },
+ "execution_count": 61,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+-----------+----------+-----------+----------+-----------+\n",
+ "|SepalLength|SepalWidth|PetalLength|PetalWidth| Class|\n",
+ "+-----------+----------+-----------+----------+-----------+\n",
+ "| 5.1| 3.5| 1.4| 0.2|Iris-setosa|\n",
+ "| 4.9| 3.0| 1.4| 0.2|Iris-setosa|\n",
+ "| 4.7| 3.2| 1.3| 0.2|Iris-setosa|\n",
+ "| 4.6| 3.1| 1.5| 0.2|Iris-setosa|\n",
+ "| 5.0| 3.6| 1.4| 0.2|Iris-setosa|\n",
+ "| 5.4| 3.9| 1.7| 0.4|Iris-setosa|\n",
+ "| 4.6| 3.4| 1.4| 0.3|Iris-setosa|\n",
+ "| 5.0| 3.4| 1.5| 0.2|Iris-setosa|\n",
+ "| 4.4| 2.9| 1.4| 0.2|Iris-setosa|\n",
+ "| 4.9| 3.1| 1.5| 0.1|Iris-setosa|\n",
+ "| 5.4| 3.7| 1.5| 0.2|Iris-setosa|\n",
+ "| 4.8| 3.4| 1.6| 0.2|Iris-setosa|\n",
+ "| 4.8| 3.0| 1.4| 0.1|Iris-setosa|\n",
+ "| 4.3| 3.0| 1.1| 0.1|Iris-setosa|\n",
+ "| 5.8| 4.0| 1.2| 0.2|Iris-setosa|\n",
+ "| 5.7| 4.4| 1.5| 0.4|Iris-setosa|\n",
+ "| 5.4| 3.9| 1.3| 0.4|Iris-setosa|\n",
+ "| 5.1| 3.5| 1.4| 0.3|Iris-setosa|\n",
+ "| 5.7| 3.8| 1.7| 0.3|Iris-setosa|\n",
+ "| 5.1| 3.8| 1.5| 0.3|Iris-setosa|\n",
+ "+-----------+----------+-----------+----------+-----------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Spark ML can only deal with one features column - so we need to vectorise the multiple columns into one:\n",
+ "from pyspark.ml.linalg import Vectors\n",
+ "from pyspark.ml.feature import VectorAssembler\n",
+ "vector_assembler = VectorAssembler(\\\n",
+ " inputCols=['SepalLength','SepalWidth','PetalLength','PetalWidth'],\\\n",
+ " outputCol='features')\n",
+ "df_temp=vector_assembler.transform(df)\n",
+ "df_temp.show(3)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "fAC5bZo5bvs-",
+ "outputId": "b2f0d902-9ff7-431f-f2b8-55a467ac65d3"
+ },
+ "execution_count": 62,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+-----------+----------+-----------+----------+-----------+-----------------+\n",
+ "|SepalLength|SepalWidth|PetalLength|PetalWidth| Class| features|\n",
+ "+-----------+----------+-----------+----------+-----------+-----------------+\n",
+ "| 5.1| 3.5| 1.4| 0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|\n",
+ "| 4.9| 3.0| 1.4| 0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|\n",
+ "| 4.7| 3.2| 1.3| 0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|\n",
+ "+-----------+----------+-----------+----------+-----------+-----------------+\n",
+ "only showing top 3 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_temp = df_temp.drop('SepalLength','SepalWidth','PetalLength','PetalWidth')\n",
+ "df_temp.show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "MQT4KjiHcy8B",
+ "outputId": "601906b3-a5d7-4c32-a686-d00b5d8c673b"
+ },
+ "execution_count": 63,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+-----------+-----------------+\n",
+ "| Class| features|\n",
+ "+-----------+-----------------+\n",
+ "|Iris-setosa|[5.1,3.5,1.4,0.2]|\n",
+ "|Iris-setosa|[4.9,3.0,1.4,0.2]|\n",
+ "|Iris-setosa|[4.7,3.2,1.3,0.2]|\n",
+ "|Iris-setosa|[4.6,3.1,1.5,0.2]|\n",
+ "|Iris-setosa|[5.0,3.6,1.4,0.2]|\n",
+ "|Iris-setosa|[5.4,3.9,1.7,0.4]|\n",
+ "|Iris-setosa|[4.6,3.4,1.4,0.3]|\n",
+ "|Iris-setosa|[5.0,3.4,1.5,0.2]|\n",
+ "|Iris-setosa|[4.4,2.9,1.4,0.2]|\n",
+ "|Iris-setosa|[4.9,3.1,1.5,0.1]|\n",
+ "|Iris-setosa|[5.4,3.7,1.5,0.2]|\n",
+ "|Iris-setosa|[4.8,3.4,1.6,0.2]|\n",
+ "|Iris-setosa|[4.8,3.0,1.4,0.1]|\n",
+ "|Iris-setosa|[4.3,3.0,1.1,0.1]|\n",
+ "|Iris-setosa|[5.8,4.0,1.2,0.2]|\n",
+ "|Iris-setosa|[5.7,4.4,1.5,0.4]|\n",
+ "|Iris-setosa|[5.4,3.9,1.3,0.4]|\n",
+ "|Iris-setosa|[5.1,3.5,1.4,0.3]|\n",
+ "|Iris-setosa|[5.7,3.8,1.7,0.3]|\n",
+ "|Iris-setosa|[5.1,3.8,1.5,0.3]|\n",
+ "+-----------+-----------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# The final data preparation step is to index the Class column - to use numeric rather than text values \n",
+ "# - run the following command and display your output of Class, features & ClassIndex columns\n",
+ "from pyspark.ml.feature import StringIndexer\n",
+ "l_indexer=StringIndexer(inputCol='Class',outputCol='ClassIndex')\n",
+ "df_temp=l_indexer.fit(df_temp).transform(df_temp)\n",
+ "df_temp.show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "dtHWFZngdGZW",
+ "outputId": "074a0923-5e7c-43d1-e33f-222b1261c012"
+ },
+ "execution_count": 64,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+-----------+-----------------+----------+\n",
+ "| Class| features|ClassIndex|\n",
+ "+-----------+-----------------+----------+\n",
+ "|Iris-setosa|[5.1,3.5,1.4,0.2]| 0.0|\n",
+ "|Iris-setosa|[4.9,3.0,1.4,0.2]| 0.0|\n",
+ "|Iris-setosa|[4.7,3.2,1.3,0.2]| 0.0|\n",
+ "|Iris-setosa|[4.6,3.1,1.5,0.2]| 0.0|\n",
+ "|Iris-setosa|[5.0,3.6,1.4,0.2]| 0.0|\n",
+ "|Iris-setosa|[5.4,3.9,1.7,0.4]| 0.0|\n",
+ "|Iris-setosa|[4.6,3.4,1.4,0.3]| 0.0|\n",
+ "|Iris-setosa|[5.0,3.4,1.5,0.2]| 0.0|\n",
+ "|Iris-setosa|[4.4,2.9,1.4,0.2]| 0.0|\n",
+ "|Iris-setosa|[4.9,3.1,1.5,0.1]| 0.0|\n",
+ "|Iris-setosa|[5.4,3.7,1.5,0.2]| 0.0|\n",
+ "|Iris-setosa|[4.8,3.4,1.6,0.2]| 0.0|\n",
+ "|Iris-setosa|[4.8,3.0,1.4,0.1]| 0.0|\n",
+ "|Iris-setosa|[4.3,3.0,1.1,0.1]| 0.0|\n",
+ "|Iris-setosa|[5.8,4.0,1.2,0.2]| 0.0|\n",
+ "|Iris-setosa|[5.7,4.4,1.5,0.4]| 0.0|\n",
+ "|Iris-setosa|[5.4,3.9,1.3,0.4]| 0.0|\n",
+ "|Iris-setosa|[5.1,3.5,1.4,0.3]| 0.0|\n",
+ "|Iris-setosa|[5.7,3.8,1.7,0.3]| 0.0|\n",
+ "|Iris-setosa|[5.1,3.8,1.5,0.3]| 0.0|\n",
+ "+-----------+-----------------+----------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Split Training and Test"
+ ],
+ "metadata": {
+ "id": "d8JHFIbyXyxd"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Split the data into training (70%) and test (30%) sets; the fixed seed keeps the split reproducible across runs\n",
+ "(trainingData, testData) = df_temp.randomSplit([0.7,0.3], seed=42)"
+ ],
+ "metadata": {
+ "id": "j2C2Ij4DdL11"
+ },
+ "execution_count": 65,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Decision Tree "
+ ],
+ "metadata": {
+ "id": "bJhNw4oSXUad"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Decision Tree Classifier\n",
+ "from pyspark.ml.classification import DecisionTreeClassifier\n",
+ "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n",
+ "\n",
+ "dt=DecisionTreeClassifier(labelCol='ClassIndex',featuresCol='features')\n",
+ "model=dt.fit(trainingData)"
+ ],
+ "metadata": {
+ "id": "QBel9HANfPx0"
+ },
+ "execution_count": 66,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Test your model with your test dataset: \n",
+ "predictions=model.transform(testData)\n",
+ "predictions.select('prediction','ClassIndex').show(15)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "bN_LZai3gXxc",
+ "outputId": "4476a3de-d21f-4bd8-e888-5471b07f78b0"
+ },
+ "execution_count": 67,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+----------+----------+\n",
+ "|prediction|ClassIndex|\n",
+ "+----------+----------+\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "+----------+----------+\n",
+ "only showing top 15 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Evaluation fn\n",
+ "evaluator=MulticlassClassificationEvaluator(labelCol='ClassIndex',predictionCol='prediction',metricName='accuracy')\n",
+ "accuracy=evaluator.evaluate(predictions)\n",
+ "print('Test Error = %g'%(1.0-accuracy))\n",
+ "print('Test Set Accuracy =' + str(accuracy))"
+ ],
+ "metadata": {
+ "id": "qr26Bs56is6A",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "1361979f-dde4-4efa-bed9-002700a53124"
+ },
+ "execution_count": 71,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Test Error = 0.0208333\n",
+ "Test Set Accuracy =0.9791666666666666\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Random Forest"
+ ],
+ "metadata": {
+ "id": "xuKBEjeCXKH0"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Random Forest Classifier\n",
+ "from pyspark.ml.classification import RandomForestClassifier\n",
+ "\n",
+ "rf=RandomForestClassifier(labelCol='ClassIndex',featuresCol='features',numTrees=10)\n",
+ "model=rf.fit(trainingData)\n",
+ "predictions=model.transform(testData)\n",
+ "predictions.select('prediction','ClassIndex').show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "mlxf8-wgXd_f",
+ "outputId": "6a378d93-3196-4a99-96b9-16668b0aea70"
+ },
+ "execution_count": 74,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "Exception ignored in: \n",
+ "Traceback (most recent call last):\n",
+ " File \"/content/spark-3.3.2-bin-hadoop2/python/pyspark/ml/wrapper.py\", line 53, in __del__\n",
+ " if SparkContext._active_spark_context and self._java_obj is not None:\n",
+ "AttributeError: 'RandomForestClassifier' object has no attribute '_java_obj'\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+----------+----------+\n",
+ "|prediction|ClassIndex|\n",
+ "+----------+----------+\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "| 0.0| 0.0|\n",
+ "+----------+----------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Evaluation Fn\n",
+ "evaluator=MulticlassClassificationEvaluator(labelCol='ClassIndex',predictionCol='prediction',metricName='accuracy')\n",
+ "accuracy=evaluator.evaluate(predictions)\n",
+ "print('Test Error = %g' % (1.0-accuracy))\n",
+ "print('Test Set Accuracy = '+ str(accuracy))"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "0wRgRb7FYfDv",
+ "outputId": "257884b0-7fec-4a51-ba3f-827e4889ae04"
+ },
+ "execution_count": 76,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Test Error = 0.0208333\n",
+ "Test Set Accuracy = 0.9791666666666666\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Naive Bayes Classifier"
+ ],
+ "metadata": {
+ "id": "5ht5N9Rpa-NR"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Naive Bayes Classifier\n",
+ "from pyspark.ml.classification import NaiveBayes\n",
+ "\n",
+ "nb=NaiveBayes(labelCol='ClassIndex',featuresCol='features',smoothing=1.0,modelType='multinomial')\n",
+ "model=nb.fit(trainingData)\n",
+ "predictions=model.transform(testData)\n",
+ "predictions.select('Class','ClassIndex','probability','prediction').show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "qzfkVcMQbBA4",
+ "outputId": "8f2b6ca4-059e-41ec-9c6f-4665349c71de"
+ },
+ "execution_count": 77,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+-----------+----------+--------------------+----------+\n",
+ "| Class|ClassIndex| probability|prediction|\n",
+ "+-----------+----------+--------------------+----------+\n",
+ "|Iris-setosa| 0.0|[0.65854431190304...| 0.0|\n",
+ "|Iris-setosa| 0.0|[0.59601222050315...| 0.0|\n",
+ "|Iris-setosa| 0.0|[0.62416333661554...| 0.0|\n",
+ "|Iris-setosa| 0.0|[0.58353510118447...| 0.0|\n",
+ "|Iris-setosa| 0.0|[0.73347005973151...| 0.0|\n",
+ "|Iris-setosa| 0.0|[0.63851500041302...| 0.0|\n",
+ "|Iris-setosa| 0.0|[0.58445564581932...| 0.0|\n",
+ "|Iris-setosa| 0.0|[0.56430482772553...| 0.0|\n",
+ "|Iris-setosa| 0.0|[0.56269476579405...| 0.0|\n",
+ "|Iris-setosa| 0.0|[0.63074902380229...| 0.0|\n",
+ "|Iris-setosa| 0.0|[0.66959617354636...| 0.0|\n",
+ "|Iris-setosa| 0.0|[0.64492936285987...| 0.0|\n",
+ "|Iris-setosa| 0.0|[0.50846896200676...| 0.0|\n",
+ "|Iris-setosa| 0.0|[0.49918864496678...| 0.0|\n",
+ "|Iris-setosa| 0.0|[0.64960562084001...| 0.0|\n",
+ "|Iris-setosa| 0.0|[0.67965272979208...| 0.0|\n",
+ "|Iris-setosa| 0.0|[0.56889671619589...| 0.0|\n",
+ "|Iris-setosa| 0.0|[0.67120169377758...| 0.0|\n",
+ "|Iris-setosa| 0.0|[0.69743996582468...| 0.0|\n",
+ "|Iris-setosa| 0.0|[0.60017434060471...| 0.0|\n",
+ "+-----------+----------+--------------------+----------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Evaluation Fn\n",
+ "evaluator=MulticlassClassificationEvaluator(labelCol='ClassIndex',predictionCol='prediction',metricName='accuracy')\n",
+ "accuracy=evaluator.evaluate(predictions)\n",
+ "print('Test Error = %g' % (1.0-accuracy))\n",
+ "print('Test Set Accuracy = '+ str(accuracy))"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "O50-b6mkcCeA",
+ "outputId": "8f6a6939-8325-4d46-b41d-314459f4c0a8"
+ },
+ "execution_count": 78,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Test Error = 0.145833\n",
+ "Test Set Accuracy = 0.8541666666666666\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "id": "6G-w20Z0c1aa"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Protein Data\n"
+ ],
+ "metadata": {
+ "id": "CWBkOMtzeCGe"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!wget https://raw.githubusercontent.com/OpenDrugAI/AttentiveFP/master/data/SAMPL.csv -O sample_data/SAMPL.csv"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "OmXK0xz7e3_0",
+ "outputId": "f3cdad4b-7fd5-4e39-b9f9-1f6c7f59ecee"
+ },
+ "execution_count": 82,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "--2023-03-21 00:45:28-- https://raw.githubusercontent.com/OpenDrugAI/AttentiveFP/master/data/SAMPL.csv\n",
+ "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
+ "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 32060 (31K) [text/plain]\n",
+ "Saving to: ‘sample_data/SAMPL.csv’\n",
+ "\n",
+ "\rsample_data/SAMPL.c 0%[ ] 0 --.-KB/s \rsample_data/SAMPL.c 100%[===================>] 31.31K --.-KB/s in 0s \n",
+ "\n",
+ "2023-03-21 00:45:28 (114 MB/s) - ‘sample_data/SAMPL.csv’ saved [32060/32060]\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df = spark.read.option('delimiter',',').option('header',True).csv('sample_data/SAMPL.csv')"
+ ],
+ "metadata": {
+ "id": "0cINbK53fH74"
+ },
+ "execution_count": 113,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "voW4I_E9gbxW",
+ "outputId": "ef0a9143-ebb8-4755-f05c-2ffcffb20347"
+ },
+ "execution_count": 114,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "+--------------------+--------------------+------+------+\n",
+ "| iupac| smiles| expt| calc|\n",
+ "+--------------------+--------------------+------+------+\n",
+ "|4-methoxy-N,N-dim...|CN(C)C(=O)c1ccc(c...|-11.01|-9.625|\n",
+ "|methanesulfonyl c...| CS(=O)(=O)Cl| -4.87|-6.219|\n",
+ "| 3-methylbut-1-ene| CC(C)C=C| 1.83| 2.452|\n",
+ "| 2-ethylpyrazine| CCc1cnccn1| -5.45|-5.809|\n",
+ "| heptan-1-ol| CCCCCCCO| -4.21|-2.917|\n",
+ "| 3,5-dimethylphenol| Cc1cc(cc(c1)O)C| -6.27|-5.444|\n",
+ "| 2,3-dimethylbutane| CC(C)C(C)C| 2.34| 2.468|\n",
+ "| 2-methylpentan-2-ol| CCCC(C)(C)O| -3.92|-2.779|\n",
+ "|1,2-dimethylcyclo...|C[C@@H]1CCCC[C@@H]1C| 1.58| 1.685|\n",
+ "| butan-2-ol| CC[C@H](C)O| -4.62|-3.145|\n",
+ "| dibromomethane| C(Br)Br| -1.96|-0.405|\n",
+ "| 2-methylpentan-3-ol| CC[C@H](C(C)C)O| -3.88|-2.416|\n",
+ "| 2-ethylpyridine| CCc1ccccn1| -4.33| -3.31|\n",
+ "| ethyl pentanoate| CCCCC(=O)OCC| -2.49| -3.11|\n",
+ "| benzenethiol| c1ccc(cc1)S| -2.55|-1.501|\n",
+ "|(2Z)-3,7-dimethyl...|CC(=CCC/C(=C\\CO)/C)C| -4.78|-2.597|\n",
+ "| indane| c1ccc2c(c1)CCC2| -1.46|-1.752|\n",
+ "| ethoxybenzene| CCOc1ccccc1| -2.22|-2.254|\n",
+ "| 4-bromophenol| c1cc(ccc1O)Br| -5.85|-5.833|\n",
+ "| 2,2-dimethylpentane| CCCC(C)(C)C| 2.88| 2.686|\n",
+ "+--------------------+--------------------+------+------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "id": "eZ4vv9EMoe5y"
+ },
+ "execution_count": 115,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "id": "LG1zUjtkwRP1"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file