From 74b50d6e9515771145ea63731e3b8ea44faa7d95 Mon Sep 17 00:00:00 2001
From: MadalenaBotelho
Date: Tue, 4 Nov 2025 08:22:04 +0000
Subject: [PATCH 1/9] feat: install jupyter lab in Dockerfile

---
 Docker/Dockerfile | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/Docker/Dockerfile b/Docker/Dockerfile
index 5d94ed2..d5e10f0 100644
--- a/Docker/Dockerfile
+++ b/Docker/Dockerfile
@@ -86,35 +86,41 @@ RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoo
     wget https://repo1.maven.org/maven2/com/mysql/mysql-connector-j/8.3.0/mysql-connector-j-8.3.0.jar
 
 ###########################################
-# Stage 4: Final runtime image for K8s
+# Stage 4: Final runtime image for K8s + Jupyter
 ###########################################
 FROM spark-base AS final
 
-# Set environment variables for PySpark
+# Set environment variables for PySpark + Jupyter
 ENV PYSPARK_PYTHON=/usr/local/bin/python3.11
 ENV PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3.11
-ENV PYTHONPATH=""
 ENV PYTHONPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.10.9.7-src.zip:${PYTHONPATH}"
+ENV JUPYTER_PORT=8888
+ENV JUPYTER_DIR=/opt/spark/work-dir/notebooks
 
-# Install matching PySpark version and dependencies
+# Install PySpark, JupyterLab and common Python libs
 RUN pip install --no-cache-dir \
     pyspark==3.5.6 \
     pandas \
-    numpy
+    numpy \
+    jupyterlab==4.2.5
 
-# Create non-root user for running Spark (matches official image)
+# Create non-root user for running Spark/Jupyter
 RUN groupadd -r -g 185 spark && \
     useradd -r -u 185 -g 185 spark
 
-# Create directory for Spark logs & local storage
-RUN mkdir -p /opt/spark/work-dir && \
+# Prepare work directory
+RUN mkdir -p ${JUPYTER_DIR} && \
     chown -R spark:spark /opt/spark
 
-# Switch to non-root user
 USER 185
+WORKDIR ${JUPYTER_DIR}
 
-WORKDIR /opt/spark/work-dir
-RUN mkdir src
-COPY src/ ./src/
+# Copy code
+RUN mkdir -p /opt/spark/src
+COPY src/ /opt/spark/src/
 
-ENTRYPOINT ["/opt/entrypoint.sh"]
+# Expose Jupyter port
+EXPOSE 8888
+
+# Entrypoint — by default launches JupyterLab
+ENTRYPOINT ["bash", "-lc", "jupyter lab --ip=0.0.0.0 --port=${JUPYTER_PORT} --no-browser --ServerApp.token='' --ServerApp.password='' --allow-root --NotebookApp.notebook_dir=${JUPYTER_DIR}"]
\ No newline at end of file
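
With this first patch applied, a quick local smoke test of the JupyterLab entrypoint might look like the sketch below; the image tag nau-spark-jupyter is hypothetical, and the build is assumed to run from the repository root:

    docker build -f Docker/Dockerfile -t nau-spark-jupyter .
    docker run --rm -p 8888:8888 nau-spark-jupyter
    # JupyterLab should then answer on http://localhost:8888 (token and password are disabled)
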
From 0400dab74dc4420525bbd3a45aabae77ea76bbc9 Mon Sep 17 00:00:00 2001
From: MadalenaBotelho
Date: Tue, 4 Nov 2025 10:07:43 +0000
Subject: [PATCH 2/9] fix: Dockerfile: create $HOME and Jupyter directories
 with correct permissions

---
 Docker/Dockerfile | 47 ++++++++++++++++++-----------------------------
 1 file changed, 18 insertions(+), 29 deletions(-)

diff --git a/Docker/Dockerfile b/Docker/Dockerfile
index d5e10f0..e2dac88 100644
--- a/Docker/Dockerfile
+++ b/Docker/Dockerfile
@@ -90,37 +90,26 @@ RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoo
 ###########################################
 FROM spark-base AS final
 
-# Set environment variables for PySpark + Jupyter
-ENV PYSPARK_PYTHON=/usr/local/bin/python3.11
-ENV PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3.11
-ENV PYTHONPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.10.9.7-src.zip:${PYTHONPATH}"
-ENV JUPYTER_PORT=8888
-ENV JUPYTER_DIR=/opt/spark/work-dir/notebooks
-
-# Install PySpark, JupyterLab and common Python libs
-RUN pip install --no-cache-dir \
-    pyspark==3.5.6 \
-    pandas \
-    numpy \
-    jupyterlab==4.2.5
-
-# Create non-root user for running Spark/Jupyter
+# Create non-root user WITH home directory
 RUN groupadd -r -g 185 spark && \
-    useradd -r -u 185 -g 185 spark
-
-# Prepare work directory
-RUN mkdir -p ${JUPYTER_DIR} && \
-    chown -R spark:spark /opt/spark
+    useradd -m -r -u 185 -g 185 -d /home/spark spark
+
+# Set HOME and Jupyter runtime dir
+ENV HOME=/home/spark \
+    JUPYTER_PORT=8888 \
+    JUPYTER_DIR=/opt/spark/work-dir/notebooks \
+    PYSPARK_PYTHON=/usr/local/bin/python3.11 \
+    PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3.11 \
+    PYTHONPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.10.9.7-src.zip:${PYTHONPATH}"
+
+# Prepare directories Jupyter expects and fix ownership
+RUN mkdir -p ${JUPYTER_DIR} \
+    && mkdir -p ${HOME}/.local/share/jupyter/runtime \
+    && mkdir -p ${HOME}/.jupyter \
+    && chown -R spark:spark /home/spark /opt/spark
 
 USER 185
 WORKDIR ${JUPYTER_DIR}
 
-# Copy code
-RUN mkdir -p /opt/spark/src
-COPY src/ /opt/spark/src/
-
-# Expose Jupyter port
-EXPOSE 8888
-
-# Entrypoint — by default launches JupyterLab
-ENTRYPOINT ["bash", "-lc", "jupyter lab --ip=0.0.0.0 --port=${JUPYTER_PORT} --no-browser --ServerApp.token='' --ServerApp.password='' --allow-root --NotebookApp.notebook_dir=${JUPYTER_DIR}"]
\ No newline at end of file
+# EntryPoint: use ServerApp.root_dir (notebook_dir is deprecated)
+ENTRYPOINT ["bash","-lc","jupyter lab --ip=0.0.0.0 --port=${JUPYTER_PORT} --no-browser --ServerApp.root_dir=${JUPYTER_DIR} --ServerApp.token='' --ServerApp.password=''"]
\ No newline at end of file

From 7b146f78e232285eb3cce225a730e9da67dd7815 Mon Sep 17 00:00:00 2001
From: MadalenaBotelho
Date: Tue, 4 Nov 2025 10:36:41 +0000
Subject: [PATCH 3/9] fix: installing JupyterLab explicitly on stage 4

---
 Docker/Dockerfile | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/Docker/Dockerfile b/Docker/Dockerfile
index e2dac88..5893441 100644
--- a/Docker/Dockerfile
+++ b/Docker/Dockerfile
@@ -85,6 +85,9 @@ RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoo
     wget https://repo1.maven.org/maven2/io/delta/delta-kernel-api/${DELTA_VERSION}/delta-kernel-api-${DELTA_VERSION}.jar && \
     wget https://repo1.maven.org/maven2/com/mysql/mysql-connector-j/8.3.0/mysql-connector-j-8.3.0.jar
 
+###########################################
+# Stage 4: Final runtime image for K8s + Jupyter
+###########################################
 ###########################################
 # Stage 4: Final runtime image for K8s + Jupyter
 ###########################################
@@ -102,6 +105,13 @@ ENV HOME=/home/spark \
     PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3.11 \
     PYTHONPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.10.9.7-src.zip:${PYTHONPATH}"
 
+# ✅ Install PySpark + JupyterLab + common libs
+RUN pip install --no-cache-dir \
+    pyspark==3.5.6 \
+    pandas \
+    numpy \
+    jupyterlab==4.2.5
+
 # Prepare directories Jupyter expects and fix ownership
 RUN mkdir -p ${JUPYTER_DIR} \
     && mkdir -p ${HOME}/.local/share/jupyter/runtime \
     && mkdir -p ${HOME}/.jupyter \
@@ -111,5 +121,8 @@ RUN mkdir -p ${JUPYTER_DIR} \
 USER 185
 WORKDIR ${JUPYTER_DIR}
 
+# Expose port for Jupyter
+EXPOSE 8888
+
 # EntryPoint: use ServerApp.root_dir (notebook_dir is deprecated)
 ENTRYPOINT ["bash","-lc","jupyter lab --ip=0.0.0.0 --port=${JUPYTER_PORT} --no-browser --ServerApp.root_dir=${JUPYTER_DIR} --ServerApp.token='' --ServerApp.password=''"]
\ No newline at end of file
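
To confirm that JupyterLab and PySpark actually landed in the final stage, one hedged check (same hypothetical nau-spark-jupyter tag as above) is to bypass the entrypoint and ask pip directly:

    docker run --rm --entrypoint pip nau-spark-jupyter show jupyterlab pyspark
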
From cde0933dbdb0670b138f4df89afcbc49080903e2 Mon Sep 17 00:00:00 2001
From: MadalenaBotelho
Date: Tue, 4 Nov 2025 10:37:29 +0000
Subject: [PATCH 4/9] fix: remove duplicated comment

---
 Docker/Dockerfile | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/Docker/Dockerfile b/Docker/Dockerfile
index 5893441..cf793ca 100644
--- a/Docker/Dockerfile
+++ b/Docker/Dockerfile
@@ -85,9 +85,6 @@ RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoo
     wget https://repo1.maven.org/maven2/io/delta/delta-kernel-api/${DELTA_VERSION}/delta-kernel-api-${DELTA_VERSION}.jar && \
     wget https://repo1.maven.org/maven2/com/mysql/mysql-connector-j/8.3.0/mysql-connector-j-8.3.0.jar
 
-###########################################
-# Stage 4: Final runtime image for K8s + Jupyter
-###########################################
 ###########################################
 # Stage 4: Final runtime image for K8s + Jupyter
 ###########################################

From b4f18030b9b7123896705351ea53436bedcdf083 Mon Sep 17 00:00:00 2001
From: MadalenaBotelho
Date: Mon, 10 Nov 2025 07:56:22 +0000
Subject: [PATCH 5/9] feat: test airflow dag to trigger hello spark job

---
 src/jobs/hello_spark_job.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 src/jobs/hello_spark_job.py

diff --git a/src/jobs/hello_spark_job.py b/src/jobs/hello_spark_job.py
new file mode 100644
index 0000000..e59a28b
--- /dev/null
+++ b/src/jobs/hello_spark_job.py
@@ -0,0 +1,15 @@
+from pyspark.sql import SparkSession
+
+def main():
+    spark = SparkSession.builder.appName("hello_spark_job").getOrCreate()
+
+    data = [("Madalena", 1), ("Vitor", 2), ("Beatriz", 3)]
+    df = spark.createDataFrame(data, ["name", "value"])
+
+    print("### Hello from Spark on Kubernetes via Airflow ###")
+    df.show(truncate=False)
+
+    spark.stop()
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From c35376e4248a2fb45ef26a695c89b16412a034f0 Mon Sep 17 00:00:00 2001
From: MadalenaBotelho
Date: Mon, 10 Nov 2025 08:16:58 +0000
Subject: [PATCH 6/9] feat: add DAG

---
 dags/spark_hello_world_k8s.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 dags/spark_hello_world_k8s.py

diff --git a/dags/spark_hello_world_k8s.py b/dags/spark_hello_world_k8s.py
new file mode 100644
index 0000000..2ebf9b9
--- /dev/null
+++ b/dags/spark_hello_world_k8s.py
@@ -0,0 +1,25 @@
+from datetime import datetime, timedelta
+
+from airflow import DAG
+from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import KubernetesPodOperator
+
+# Simple DAG just to test the end-to-end flow
+with DAG(
+    dag_id="spark_hello_world_k8s",
+    start_date=datetime(2025, 1, 1),
+    schedule_interval=None,  # manual trigger via the UI
+    catchup=False,
+    dagrun_timeout=timedelta(minutes=30),
+    tags=["demo", "spark", "kubernetes"],
+) as dag:
+
+    spark_hello = KubernetesPodOperator(
+        task_id="spark_hello",
+        name="spark-hello-world",
+        namespace="data-platform",
+        image="nauedu/nau-analytics-external-data-product:feat_add_jupyter_to_dockerfile",
+        cmds=["bash", "-lc"],
+        arguments=["python /opt/spark/src/jobs/hello_spark_job.py"],
+        is_delete_operator_pod=True,
+        in_cluster=True,
+    )
\ No newline at end of file
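
Once the DAG file is deployed, the flow can also be exercised without the UI; a minimal sketch, assuming the Airflow CLI is available on the scheduler and kubectl has access to the data-platform namespace:

    airflow dags trigger spark_hello_world_k8s
    kubectl get pods -n data-platform -w   # watch for the spark-hello-world pod to appear and complete
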
From 42f90cd9bab7b57036cde28fd6039b7c8c88a948 Mon Sep 17 00:00:00 2001
From: MadalenaBotelho
Date: Mon, 10 Nov 2025 08:31:03 +0000
Subject: [PATCH 7/9] fix: buildx failed with: ERROR: failed to build: failed to solve: process "/bin/sh -c wget

---
 Docker/Dockerfile | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Docker/Dockerfile b/Docker/Dockerfile
index cf793ca..9bf9402 100644
--- a/Docker/Dockerfile
+++ b/Docker/Dockerfile
@@ -32,13 +32,13 @@ RUN ln -sf ${PREFIX}/bin/python3.11 /usr/local/bin/python \
 ###########################################
 # Stage 2: Get entrypoint from official Spark
 ###########################################
-FROM apache/spark:3.5.6 AS spark-official
+FROM apache/spark:3.5.5 AS spark-official
 
 ###########################################
 # Stage 3: Spark + Delta + Cloud connectors
 ###########################################
 FROM ubuntu:22.04 AS spark-base
-ARG SPARK_VERSION=3.5.6
+ARG SPARK_VERSION=3.5.5
 ARG HADOOP_VERSION=3
 ARG DELTA_VERSION=3.2.1
 ENV DEBIAN_FRONTEND=noninteractive
@@ -99,12 +99,12 @@ ENV HOME=/home/spark \
     JUPYTER_PORT=8888 \
     JUPYTER_DIR=/opt/spark/work-dir/notebooks \
     PYSPARK_PYTHON=/usr/local/bin/python3.11 \
-    PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3.11 \
-    PYTHONPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.10.9.7-src.zip:${PYTHONPATH}"
-
+    PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3.11
+# Define PYTHONPATH separately (without depending on the previous value)
+ENV PYTHONPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.10.9.7-src.zip"
 # ✅ Install PySpark + JupyterLab + common libs
 RUN pip install --no-cache-dir \
-    pyspark==3.5.6 \
+    pyspark==3.5.5 \
     pandas \
     numpy \
     jupyterlab==4.2.5
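
The PYTHONPATH split above matters because, in a Dockerfile, variables assigned within a single multi-variable ENV instruction cannot reference values set earlier in that same instruction. One way to inspect what actually gets baked into the image is the sketch below (the nau-spark-jupyter tag is hypothetical):

    docker run --rm --entrypoint python nau-spark-jupyter -c 'import os; print(os.environ.get("PYTHONPATH"))'
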
From fbc6a16354f5c05ce630837b0584e57784e44488 Mon Sep 17 00:00:00 2001
From: MadalenaBotelho
Date: Mon, 10 Nov 2025 09:24:14 +0000
Subject: [PATCH 8/9] fix: build failed regarding spark image

---
 Docker/Dockerfile | 42 +++++++++++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/Docker/Dockerfile b/Docker/Dockerfile
index 9bf9402..600fb0c 100644
--- a/Docker/Dockerfile
+++ b/Docker/Dockerfile
@@ -1,4 +1,5 @@
 # syntax=docker/dockerfile:1
+
 ###########################################
 # Stage 1: Build Python 3.11.6 from source
 ###########################################
@@ -6,6 +7,7 @@ FROM ubuntu:22.04 AS python-build
 ENV DEBIAN_FRONTEND=noninteractive
 ENV PYTHON_VERSION=3.11.6
 ENV PREFIX=/usr/local
+
 RUN apt-get update && apt-get install -y \
     build-essential \
     wget \
@@ -19,31 +21,38 @@ RUN apt-get update && apt-get install -y \
     libsqlite3-dev \
     libbz2-dev \
     && rm -rf /var/lib/apt/lists/*
+
 WORKDIR /usr/src
+
 RUN wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \
     && tar -xzf Python-${PYTHON_VERSION}.tgz
+
 WORKDIR /usr/src/Python-${PYTHON_VERSION}
+
 RUN ./configure --enable-optimizations --prefix=${PREFIX} \
     && make -j"$(nproc)" \
     && make altinstall
+
 RUN ln -sf ${PREFIX}/bin/python3.11 /usr/local/bin/python \
     && ln -sf ${PREFIX}/bin/pip3.11 /usr/local/bin/pip
 
 ###########################################
 # Stage 2: Get entrypoint from official Spark
 ###########################################
-FROM apache/spark:3.5.5 AS spark-official
+FROM apache/spark:3.5.6 AS spark-official
 
 ###########################################
 # Stage 3: Spark + Delta + Cloud connectors
 ###########################################
 FROM ubuntu:22.04 AS spark-base
+
-ARG SPARK_VERSION=3.5.5
+ARG SPARK_VERSION=3.5.6
 ARG HADOOP_VERSION=3
 ARG DELTA_VERSION=3.2.1
+
 ENV DEBIAN_FRONTEND=noninteractive
 ENV SPARK_HOME=/opt/spark
-ENV PATH=$SPARK_HOME/bin:$PATH
+ENV PATH="$SPARK_HOME/bin:$PATH"
 
 # Install Java + basic utilities
 RUN apt-get update && apt-get install -y \
@@ -56,7 +65,7 @@ RUN apt-get update && apt-get install -y \
     procps \
     && rm -rf /var/lib/apt/lists/*
 
-# Copy compiled Python
+# Copy compiled Python from builder
 COPY --from=python-build /usr/local /usr/local
 
 # Copy entrypoint script from official Spark image
@@ -95,31 +104,30 @@ RUN groupadd -r -g 185 spark && \
     useradd -m -r -u 185 -g 185 -d /home/spark spark
 
 # Set HOME and Jupyter runtime dir
-ENV HOME=/home/spark \
-    JUPYTER_PORT=8888 \
-    JUPYTER_DIR=/opt/spark/work-dir/notebooks \
-    PYSPARK_PYTHON=/usr/local/bin/python3.11 \
-    PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3.11
-# Define PYTHONPATH separately (without depending on the previous value)
+ENV HOME=/home/spark
+ENV JUPYTER_PORT=8888
+ENV JUPYTER_DIR=/opt/spark/work-dir/notebooks
+ENV PYSPARK_PYTHON=/usr/local/bin/python3.11
+ENV PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3.11
+# Explicit PYTHONPATH for PySpark
 ENV PYTHONPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.10.9.7-src.zip"
-# ✅ Install PySpark + JupyterLab + common libs
+
+# Install PySpark + JupyterLab + common libs
 RUN pip install --no-cache-dir \
-    pyspark==3.5.5 \
+    pyspark==3.5.6 \
     pandas \
     numpy \
     jupyterlab==4.2.5
 
 # Prepare directories Jupyter expects and fix ownership
-RUN mkdir -p ${JUPYTER_DIR} \
-    && mkdir -p ${HOME}/.local/share/jupyter/runtime \
-    && mkdir -p ${HOME}/.jupyter \
+RUN mkdir -p "${JUPYTER_DIR}" \
+    && mkdir -p "${HOME}/.local/share/jupyter/runtime" \
+    && mkdir -p "${HOME}/.jupyter" \
     && chown -R spark:spark /home/spark /opt/spark
 
 USER 185
 WORKDIR ${JUPYTER_DIR}
 
-# Expose port for Jupyter
 EXPOSE 8888
 
-# EntryPoint: use ServerApp.root_dir (notebook_dir is deprecated)
 ENTRYPOINT ["bash","-lc","jupyter lab --ip=0.0.0.0 --port=${JUPYTER_PORT} --no-browser --ServerApp.root_dir=${JUPYTER_DIR} --ServerApp.token='' --ServerApp.password=''"]
\ No newline at end of file
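
Since the Spark tarball, the apache/spark base tag, and the pyspark wheel are pinned in three separate places, a drift check inside the built image can save a rebuild cycle; a hedged sketch, again using the hypothetical nau-spark-jupyter tag:

    docker run --rm --entrypoint bash nau-spark-jupyter -c \
      'spark-submit --version 2>&1 | head -n 5; python -c "import pyspark; print(pyspark.__version__)"'
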
From c1d6d9db6ca9dc06b5fa1d8d701da11122944281 Mon Sep 17 00:00:00 2001
From: MadalenaBotelho
Date: Mon, 10 Nov 2025 11:49:52 +0000
Subject: [PATCH 9/9] fix: update spark version

---
 Docker/Dockerfile | 41 +++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/Docker/Dockerfile b/Docker/Dockerfile
index 600fb0c..f415200 100644
--- a/Docker/Dockerfile
+++ b/Docker/Dockerfile
@@ -4,6 +4,7 @@
 # Stage 1: Build Python 3.11.6 from source
 ###########################################
 FROM ubuntu:22.04 AS python-build
+
 ENV DEBIAN_FRONTEND=noninteractive
 ENV PYTHON_VERSION=3.11.6
 ENV PREFIX=/usr/local
@@ -39,22 +40,22 @@ RUN ln -sf ${PREFIX}/bin/python3.11 /usr/local/bin/python \
 ###########################################
 # Stage 2: Get entrypoint from official Spark
 ###########################################
-FROM apache/spark:3.5.6 AS spark-official
+FROM apache/spark:3.5.7 AS spark-official
 
 ###########################################
 # Stage 3: Spark + Delta + Cloud connectors
 ###########################################
 FROM ubuntu:22.04 AS spark-base
 
-ARG SPARK_VERSION=3.5.6
+ARG SPARK_VERSION=3.5.7
 ARG HADOOP_VERSION=3
 ARG DELTA_VERSION=3.2.1
 
 ENV DEBIAN_FRONTEND=noninteractive
 ENV SPARK_HOME=/opt/spark
-ENV PATH="$SPARK_HOME/bin:$PATH"
+ENV PATH="${SPARK_HOME}/bin:${PATH}"
 
-# Install Java + basic utilities
+# Java + utils
 RUN apt-get update && apt-get install -y \
     openjdk-11-jdk \
     curl \
@@ -65,10 +66,10 @@ RUN apt-get update && apt-get install -y \
     procps \
     && rm -rf /var/lib/apt/lists/*
 
-# Copy compiled Python from builder
+# Copy Python from build stage
 COPY --from=python-build /usr/local /usr/local
 
-# Copy entrypoint script from official Spark image
+# Copy entrypoint scripts from official Spark image
 COPY --from=spark-official /opt/entrypoint.sh /opt/entrypoint.sh
 COPY --from=spark-official /opt/decom.sh /opt/decom.sh
 RUN chmod +x /opt/entrypoint.sh /opt/decom.sh
@@ -80,8 +81,8 @@ RUN wget https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VER
     && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark \
     && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
 
-# Add useful connectors (Delta, AWS, Azure, MySQL)
-WORKDIR $SPARK_HOME/jars
+# Add connectors (Delta, AWS, Azure, MySQL)
+WORKDIR ${SPARK_HOME}/jars
 RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar && \
     wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.375/aws-java-sdk-bundle-1.12.375.jar && \
     wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar && \
@@ -99,27 +100,26 @@ RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoo
 ###########################################
 FROM spark-base AS final
 
-# Create non-root user WITH home directory
+# Non-root user with home dir
 RUN groupadd -r -g 185 spark && \
     useradd -m -r -u 185 -g 185 -d /home/spark spark
 
-# Set HOME and Jupyter runtime dir
-ENV HOME=/home/spark
-ENV JUPYTER_PORT=8888
-ENV JUPYTER_DIR=/opt/spark/work-dir/notebooks
-ENV PYSPARK_PYTHON=/usr/local/bin/python3.11
-ENV PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3.11
-# Explicit PYTHONPATH for PySpark
-ENV PYTHONPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.10.9.7-src.zip"
+# Env for Jupyter + PySpark
+ENV HOME=/home/spark \
+    JUPYTER_PORT=8888 \
+    JUPYTER_DIR=/opt/spark/work-dir/notebooks \
+    PYSPARK_PYTHON=/usr/local/bin/python3.11 \
+    PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3.11 \
+    PYTHONPATH="${SPARK_HOME}/python"
 
-# Install PySpark + JupyterLab + common libs
+# PySpark + JupyterLab + libs
 RUN pip install --no-cache-dir \
-    pyspark==3.5.6 \
+    pyspark==3.5.7 \
     pandas \
     numpy \
     jupyterlab==4.2.5
 
-# Prepare directories Jupyter expects and fix ownership
+# Dirs for Jupyter + notebooks
 RUN mkdir -p "${JUPYTER_DIR}" \
     && mkdir -p "${HOME}/.local/share/jupyter/runtime" \
     && mkdir -p "${HOME}/.jupyter" \
@@ -130,4 +130,5 @@ WORKDIR ${JUPYTER_DIR}
 
 EXPOSE 8888
 
+# Default: start JupyterLab (the K8s manifest can override this to run spark-submit only)
 ENTRYPOINT ["bash","-lc","jupyter lab --ip=0.0.0.0 --port=${JUPYTER_PORT} --no-browser --ServerApp.root_dir=${JUPYTER_DIR} --ServerApp.token='' --ServerApp.password=''"]
\ No newline at end of file
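
As the final comment notes, the Kubernetes manifest can override this entrypoint to run a plain batch job instead of JupyterLab. A minimal sketch of such an override, reusing the image tag from the DAG above and assuming the job file is present in the image (or mounted into it):

    kubectl run spark-hello --rm -it --restart=Never \
      -n data-platform \
      --image=nauedu/nau-analytics-external-data-product:feat_add_jupyter_to_dockerfile \
      --command -- bash -lc 'python /opt/spark/src/jobs/hello_spark_job.py'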