From d8cb86a26af657771b0fcbcb176368766f304c60 Mon Sep 17 00:00:00 2001 From: Josh Mandel Date: Fri, 11 Sep 2020 15:00:46 -0500 Subject: [PATCH] Minimal example with spark sql --- sql/spark-sql-example.ipynb | 216 ++++++++++++++++++++++++++++++++++++ 1 file changed, 216 insertions(+) create mode 100644 sql/spark-sql-example.ipynb diff --git a/sql/spark-sql-example.ipynb b/sql/spark-sql-example.ipynb new file mode 100644 index 0000000..288264a --- /dev/null +++ b/sql/spark-sql-example.ipynb @@ -0,0 +1,216 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Intitializing Scala interpreter ..." + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Spark Web UI available at http://51d2d7955110:4040\n", + "SparkContext available as 'sc' (version = 3.0.0, master = local[*], app id = local-1599854411921)\n", + "SparkSession available as 'spark'\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- address: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- city: string (nullable = true)\n", + " | | |-- city_aa: string (nullable = true)\n", + " | | |-- country: string (nullable = true)\n", + " | | |-- country_aa: string (nullable = true)\n", + " | | |-- extension: array (nullable = true)\n", + " | | | |-- element: struct (containsNull = true)\n", + " | | | | |-- parent: string (nullable = true)\n", + " | | | | |-- url: string (nullable = true)\n", + " | | | | |-- valueDecimal: double (nullable = true)\n", + " | | |-- line: array (nullable = true)\n", + " | | | |-- element: string (containsNull = true)\n", + " | | |-- line_aa: array (nullable = true)\n", + " | | | |-- element: string (containsNull = true)\n", + " | | |-- postalCode: string (nullable = true)\n", + " | | |-- postalCode_aa: string (nullable = 
true)\n", + " | | |-- state: string (nullable = true)\n", + " | | |-- state_aa: string (nullable = true)\n", + " |-- birthDate: string (nullable = true)\n", + " |-- birthDate_aa: struct (nullable = true)\n", + " | |-- end: string (nullable = true)\n", + " | |-- start: string (nullable = true)\n", + " |-- communication: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- language: struct (nullable = true)\n", + " | | | |-- coding: array (nullable = true)\n", + " | | | | |-- element: struct (containsNull = true)\n", + " | | | | | |-- code: string (nullable = true)\n", + " | | | | | |-- display: string (nullable = true)\n", + " | | | | | |-- display_aa: string (nullable = true)\n", + " | | | | | |-- system: string (nullable = true)\n", + " | | | |-- text: string (nullable = true)\n", + " | | | |-- text_aa: string (nullable = true)\n", + " |-- deceasedDateTime: timestamp (nullable = true)\n", + " |-- deceasedDateTime_aa: struct (nullable = true)\n", + " | |-- end: string (nullable = true)\n", + " | |-- start: string (nullable = true)\n", + " |-- extension: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- parent: string (nullable = true)\n", + " | | |-- url: string (nullable = true)\n", + " | | |-- valueAddress: struct (nullable = true)\n", + " | | | |-- city: string (nullable = true)\n", + " | | | |-- city_aa: string (nullable = true)\n", + " | | | |-- country: string (nullable = true)\n", + " | | | |-- country_aa: string (nullable = true)\n", + " | | | |-- state: string (nullable = true)\n", + " | | | |-- state_aa: string (nullable = true)\n", + " | | |-- valueCode: string (nullable = true)\n", + " | | |-- valueCoding: struct (nullable = true)\n", + " | | | |-- code: string (nullable = true)\n", + " | | | |-- display: string (nullable = true)\n", + " | | | |-- display_aa: string (nullable = true)\n", + " | | | |-- system: string (nullable = true)\n", + " | | |-- valueDecimal: double 
(nullable = true)\n", + " | | |-- valueString: string (nullable = true)\n", + " | | |-- valueString_aa: string (nullable = true)\n", + " |-- gender: string (nullable = true)\n", + " |-- id: string (nullable = true)\n", + " |-- id_prev_aa: string (nullable = true)\n", + " |-- identifier: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- system: string (nullable = true)\n", + " | | |-- type: struct (nullable = true)\n", + " | | | |-- coding: array (nullable = true)\n", + " | | | | |-- element: struct (containsNull = true)\n", + " | | | | | |-- code: string (nullable = true)\n", + " | | | | | |-- display: string (nullable = true)\n", + " | | | | | |-- display_aa: string (nullable = true)\n", + " | | | | | |-- system: string (nullable = true)\n", + " | | | |-- text: string (nullable = true)\n", + " | | | |-- text_aa: string (nullable = true)\n", + " | | |-- value: string (nullable = true)\n", + " | | |-- value_aa: string (nullable = true)\n", + " |-- maritalStatus: struct (nullable = true)\n", + " | |-- coding: array (nullable = true)\n", + " | | |-- element: struct (containsNull = true)\n", + " | | | |-- code: string (nullable = true)\n", + " | | | |-- display: string (nullable = true)\n", + " | | | |-- display_aa: string (nullable = true)\n", + " | | | |-- system: string (nullable = true)\n", + " | |-- text: string (nullable = true)\n", + " | |-- text_aa: string (nullable = true)\n", + " |-- multipleBirthBoolean: boolean (nullable = true)\n", + " |-- multipleBirthInteger: long (nullable = true)\n", + " |-- name: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- family: string (nullable = true)\n", + " | | |-- family_aa: string (nullable = true)\n", + " | | |-- given: array (nullable = true)\n", + " | | | |-- element: string (containsNull = true)\n", + " | | |-- given_aa: array (nullable = true)\n", + " | | | |-- element: string (containsNull = true)\n", + " | | |-- prefix: array 
(nullable = true)\n", + " | | | |-- element: string (containsNull = true)\n", + " | | |-- prefix_aa: array (nullable = true)\n", + " | | | |-- element: string (containsNull = true)\n", + " | | |-- suffix: array (nullable = true)\n", + " | | | |-- element: string (containsNull = true)\n", + " | | |-- suffix_aa: array (nullable = true)\n", + " | | | |-- element: string (containsNull = true)\n", + " | | |-- use: string (nullable = true)\n", + " |-- resourceType: string (nullable = true)\n", + " |-- telecom: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- system: string (nullable = true)\n", + " | | |-- use: string (nullable = true)\n", + " | | |-- value: string (nullable = true)\n", + " | | |-- value_aa: string (nullable = true)\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "import spark.implicits._\n", + "path: String = /home/jovyan/work/etl/output/annotated/Patient.ndjson\n", + "Patient: org.apache.spark.sql.DataFrame = [address: array<struct<city:string,city_aa:string,country:string,country_aa:string,extension:array<struct<parent:string,url:string,valueDecimal:double>>,line:array<string>,line_aa:array<string>,postalCode:string,postalCode_aa:string,state:string,state_aa:string>>, birthDate: string ... 
15 more fields]\n" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "// git clone https://github.com/sync-for-science/a3 \n", + "// docker run --rm -p 8888:8888 -v $(pwd):/home/jovyan/work jupyter/all-spark-notebook\n", + "\n", + "import spark.implicits._\n", + "val path = \"/home/jovyan/work/etl/output/annotated/Patient.ndjson\"\n", + "val Patient = spark.read.json(path)\n", + "Patient.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "res1: org.apache.spark.sql.DataFrame = [count(1): bigint]\n" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Patient.createOrReplaceTempView(\"Patient\")\n", + "\n", + "spark.sql(\"create or replace temporary view pv as (SELECT count(*) FROM Patient)\")\n", + "spark.sql(\"SELECT * FROM pv\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "spylon-kernel", + "language": "scala", + "name": "spylon-kernel" + }, + "language_info": { + "codemirror_mode": "text/x-scala", + "file_extension": ".scala", + "help_links": [ + { + "text": "MetaKernel Magics", + "url": "https://metakernel.readthedocs.io/en/latest/source/README.html" + } + ], + "mimetype": "text/x-scala", + "name": "scala", + "pygments_lexer": "scala", + "version": "0.4.1" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}