From 21bdb00058a43f36b5f5c444370fc56e7f55f82a Mon Sep 17 00:00:00 2001
From: "Xu, Qinying (H&B, Herston)"
Date: Tue, 28 Oct 2025 14:44:47 +1000
Subject: [PATCH 01/14] release VS in docker

---
 dev/docker/Dockerfile              | 48 ++++++++++++++
 dev/docker/compute_local_fdr.py    | 42 +++++++++++++
 dev/docker/note.md                 | 95 ++++++++++++++++++++++++++++++
 dev/docker/requirements.txt        | 17 ++++++
 dev/docker/run_importance_chr22.py | 38 ++++++++++++
 5 files changed, 240 insertions(+)
 create mode 100644 dev/docker/Dockerfile
 create mode 100644 dev/docker/compute_local_fdr.py
 create mode 100644 dev/docker/note.md
 create mode 100644 dev/docker/requirements.txt
 create mode 100644 dev/docker/run_importance_chr22.py

diff --git a/dev/docker/Dockerfile b/dev/docker/Dockerfile
new file mode 100644
index 00000000..eb3d0b05
--- /dev/null
+++ b/dev/docker/Dockerfile
@@ -0,0 +1,48 @@
+# Use Ubuntu 20.04 as the base image
+FROM --platform=linux/amd64 ubuntu:20.04
+
+# Set working directory
+WORKDIR /app
+
+# Avoid interactive prompts during package installation
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install Python 3.8, pip, Java 8, and Git
+RUN apt-get update && apt-get install -y \
+    python3.8 \
+    python3-pip \
+    openjdk-8-jdk \
+    git wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set Python 3.8 as default python
+RUN ln -s /usr/bin/python3.8 /usr/bin/python
+
+# Set JAVA_HOME environment variable.
+# On linux/amd64, openjdk-8-jdk installs java-8-openjdk-amd64.
+ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
+ENV PATH=$JAVA_HOME/bin:$PATH
+
+# Install Apache Spark 3.1.2 with Hadoop 3.2, only works on linux/amd64
+RUN wget -q https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz && \
+    tar -xzf spark-3.1.2-bin-hadoop3.2.tgz -C /opt && \
+    rm spark-3.1.2-bin-hadoop3.2.tgz && \
+    ln -s /opt/spark-3.1.2-bin-hadoop3.2 /opt/spark
+
+# Set Spark environment variables
+ENV SPARK_HOME=/opt/spark
+ENV PATH=$SPARK_HOME/bin:$PATH
+# ENV PYSPARK_PYTHON=python3.8
+# ENV PYSPARK_DRIVER_PYTHON=python3.8
+
+# Install VariantSpark
+RUN pip3 install --no-cache-dir variant-spark
+
+# Clone the VariantSpark repository
+# RUN git clone https://github.com/aehrc/VariantSpark.git
+
+# Set working directory to the cloned repository
+WORKDIR /app/VariantSpark
+
+# Command to run
+CMD ["bash"]
diff --git a/dev/docker/compute_local_fdr.py b/dev/docker/compute_local_fdr.py
new file mode 100644
index 00000000..d7f6dd38
--- /dev/null
+++ b/dev/docker/compute_local_fdr.py
@@ -0,0 +1,42 @@
+import hail as hl
+import varspark.hail as vshl
+from matplotlib import pyplot as plt
+vshl.init()
+
+vds = hl.import_vcf('./data/chr22_1000.vcf')
+labels = hl.import_table('./data/chr22-labels-hail.csv', impute = True, delimiter=",").key_by('sample')
+
+vds = vds.annotate_cols(label = labels[vds.s])
+vds.cols().show(3)
+
+rf_model = vshl.random_forest_model(y=vds.label['x22_16050408'],
+    x=vds.GT.n_alt_alleles(), seed = 13, mtry_fraction = 0.05, min_node_size = 5, max_depth = 10)
+rf_model.fit_trees(300, 50)
+
+print("OOB error: %s" % rf_model.oob_error())
+impTable = rf_model.variable_importance()
+impTable.order_by(hl.desc(impTable.importance)).show(10)
+
+fdrCalc = rf_model.get_lfdr()
+
+fig, ax1 = plt.subplots(figsize=(10, 5), layout='constrained')
+fdrCalc.plot_log_densities(ax1, cutoff_list=[1, 2, 3, 4, 5, 10, 15, 20], find_automatic_best=True)
+plt.show()
+
+fig, ax2 = plt.subplots(figsize=(10, 5), layout='constrained')
+fdrCalc.plot_log_hist(ax2, split_count=2)
+plt.show()
+
+pvalsDF, fdr = fdrCalc.compute_fdr(countThreshold = 2, local_fdr_cutoff = 0.05)
+pvalsDF, fdr
+
+fig, ax3 = plt.subplots(figsize=(10, 5), layout='constrained')
+fdrCalc.plot(ax3)
+plt.show()
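+
+# Optionally persist the results -- a minimal sketch, assuming pvalsDF is a
+# pandas DataFrame (as compute_fdr() appears to return); the path is arbitrary.
+pvalsDF.to_csv('./data/chr22_local_fdr.csv', index=False)
+print("FDR at the chosen local FDR cutoff: %s" % str(fdr))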
+
+hl.stop()
diff --git a/dev/docker/note.md b/dev/docker/note.md
new file mode 100644
index 00000000..6f34e390
--- /dev/null
+++ b/dev/docker/note.md
@@ -0,0 +1,95 @@
+
+# try https://variantspark.readthedocs.io/en/latest/getting_started.html
+## ask gpt to create dockerfile
+  - any Python base image, e.g. FROM python:3.8, doesn't support openjdk package installation.
+  - try Ubuntu, which sets Python 3.8 as the default.
+
+## test it interactively locally
+  - docker build -t vsapp .
+  - docker run -it --name vsrun1 vsapp
+  ```
+  python --version # Should show Python 3.8.x
+  java -version # Should show OpenJDK 8
+  pip3 show variant-spark # To find where variant-spark is installed
+  ```
+  - docker cp variantspark_script.py vsrun2:/app/VariantSpark/variantspark_script.py # copy file from local to docker
+  - vs works with only pip install inside the docker container (no mvn install needed)
+  - variant-spark importance -if gitHub/VariantSpark/data/chr22_1000.vcf -ff gitHub/VariantSpark/data/chr22-labels.csv -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13
+  ```
+  root@16542009db87:/app/VariantSpark# variant-spark importance -if gitHub/VariantSpark/data/chr22_1000.vcf -ff gitHub/VariantSpark/data/chr22-labels.csv -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13
+  25/10/27 08:41:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+  log4j:WARN No appenders could be found for logger (au.csiro.variantspark.cli.ImportanceCmd).
+  ...
+  Last build trees: 20, time: 779 ms, timePerTree: 38 ms
+  Finished trees: 500, current oobError: 0.016483516483516484, totalTime: 36.185 s, avg timePerTree: 0.07237 s
+  Last build trees: 20, time: 675 ms, timePerTree: 33 ms
+  Random forest oob accuracy: 0.016483516483516484, took: 36.4 s
+  variable,importance
+  22_16050408_T_C,18.484457676767143
+  22_16051480_T_C,17.593204808682323
+  ...
+  ```
+  - variant-spark --spark --master 'local[*]' -- importance -if gitHub/VariantSpark/data/chr22_1000.vcf -ff gitHub/VariantSpark/data/chr22-labels.csv -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13
+  - pip3 install --no-cache-dir -r /app/VariantSpark/requirements.txt # install python dependency
+
+## about pip install variant-spark
+- variant-spark_2.12-0.5.5-all.jar : installed by pip install variant-spark
+  - jar tf variant-spark_2.12-0.5.5-all.jar
+  - jar is included inside the python package:
+    - /usr/local/lib/python3.8/dist-packages/varspark/jars/variant-spark_2.12-0.5.5-all.jar
+  - but this jar is not a fat jar: it does not include au.csiro.aehrc.third.hail-is
+- hail-all-spark.jar : installed by pip3 install hail==0.2.74 from requirements.txt
+  - is used by the Python hail package at runtime.
+  - jar tf hail-all-spark.jar | grep hail | grep SparkBackend
+
+- mvn install with hail
+  - Maven will try to download a JAR matching hail_2.12_3.1:0.2.74 from repo: au.csiro.aehrc.third.hail-is based on pom.xml
+  - the JAR is stored in your local Maven repository (~/.m2/repository/au/csiro/aehrc/third/hail-is/hail_2.12_3.1/0.2.74/).
+
+- refer to src/main/scala/au/csiro/variantspark/hail/methods/RFModel.scala
+  - ~/.m2/repository/au/.../hail_2.12_3.1/0.2.74/ is used during mvn test or when running pure scala code
+  - python: vshl.init() adds hail-all-spark.jar to the Spark classpath.
+  - python: spark = SparkSession.builder.config('spark.jars', vs.find_jar()).getOrCreate() adds spark.jars to the Spark classpath
+  - python: vshl.random_forest_model(...) calls scala RFModel.scala based on the Spark classpath
+  - summary: python calls scala via hail-all-spark.jar, not the mvn-installed hail jar
+
+- which variant-spark # to find variant-spark bash script
+  - original from https://github.com/aehrc/VariantSpark/tree/master/bin/variant-spark
+  - it requires Spark to be set up
+
+##
+
+
+## Docker Build on ARM vs. AMD64
+- With `docker build -t vsapp .` on your Mac (with an ARM-based chip like M1/M2), Docker builds the image for the native architecture, which is linux/arm64.
+- With `docker build --platform linux/amd64 -t vsapp .` you instruct Docker to build the image for the linux/amd64 architecture, even on your ARM-based Mac.
+- The openjdk-8-jdk package in Ubuntu's repositories is architecture-specific. For linux/arm64, it installs java-8-openjdk-arm64; for linux/amd64, it installs java-8-openjdk-amd64.
+
+- `uname -m` # shows x86_64 for AMD64; or aarch64 for ARM64
+
+
+# optimize dockerfile with a two-stage layout
+
+# to do list
+- pip3 show variant-spark shows Version: 0.5.5 but author Piotr Szul et al. is wrong
+- pip3 install variant-spark does not automatically install pyspark as a dependency, got error
+  ```
+  from pyspark import SparkConf
+  ModuleNotFoundError: No module named 'pyspark'
+  ```
+  - pip3 show Jinja2 pandas typedecorator hail pyspark scipy numpy patsy statsmodels seaborn # only typedecorator installed
+  ```
+  root@16542009db87:/app/VariantSpark# pip3 show Jinja2 pandas typedecorator hail pyspark scipy numpy patsy statsmodels seaborn
+  WARNING: Package(s) not found: Jinja2, hail, numpy, pandas, patsy, pyspark, scipy, seaborn, statsmodels
+  Name: typedecorator
+  Version: 0.0.5
+  Summary: Decorator-based type checking library for Python 2 and 3
+  Home-page: https://github.com/dobarkod/typedecorator/
+  Author: Senko Rasic
+  Author-email: senko.rasic@goodcode.io
+  License: MIT
+  Location: /usr/local/lib/python3.8/dist-packages
+  Requires:
+  Required-by: variant-spark
+
+  ```
\ No newline at end of file
diff --git a/dev/docker/requirements.txt b/dev/docker/requirements.txt
new file mode 100644
index 00000000..dd71ac96
--- /dev/null
+++ b/dev/docker/requirements.txt
@@ -0,0 +1,17 @@
+# varspark dependencies
+Jinja2==3.0.3
+pandas==1.1.4
+typedecorator==0.0.5
+pyspark==3.1.3
+scipy==1.6.3
+numpy==1.21.2
+patsy==0.5.2
+statsmodels==0.13.2
+seaborn==0.11.2
+hail==0.2.74
+# the dependencies below are pinned to versions compatible with hail
+chardet==3.0.4
+google-auth==1.35.0
+google-cloud-core==1.7.3
+# hail 0.2.74 requires google-cloud-storage==1.25.*
+google-cloud-storage==1.25.0
diff --git a/dev/docker/run_importance_chr22.py b/dev/docker/run_importance_chr22.py
new file mode 100644
index 00000000..d3d4fe3b
--- /dev/null
+++ b/dev/docker/run_importance_chr22.py
@@ -0,0 +1,38 @@
+# variantspark_script.py
+import varspark as vs
+from pyspark.sql import SparkSession
+
+# Step 1: Create a Spark session with the VariantSpark JAR attached
+spark = SparkSession.builder.config('spark.jars', vs.find_jar()).getOrCreate()
+
+# Step 2: Create a VarsparkContext
+vc = vs.VarsparkContext(spark, silent=True)
+
+# Step 3: Load features and labels
+features = vc.import_vcf('/app/VariantSpark/data/chr22_1000.vcf')
+labels = vc.load_label('/app/VariantSpark/data/chr22-labels.csv', '22_16050408')
+
+# Optional: Print some information to verify
+print("Features loaded:", features)
+print("Labels loaded:", labels)
+
+# Step 4: Run the importance analysis and retrieve the top important variables
+ia = features.importance_analysis(labels, seed = 13, n_trees=500, batch_size=20)
+top_variables = ia.important_variables()
+
+# Step 5: Display the results
+print("%s\t%s" % ('Variable', 'Importance'))
+for var_and_imp in top_variables:
+    print("%s\t%s" % var_and_imp)
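+
+# Optionally persist the ranking -- a minimal sketch, assuming top_variables
+# is a list of (variable, importance) tuples, as the loop above suggests;
+# the output path is arbitrary.
+import csv
+with open('/app/VariantSpark/importance_chr22.csv', 'w', newline='') as f:
+    writer = csv.writer(f)
+    writer.writerow(['variable', 'importance'])
+    writer.writerows(top_variables)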
loaded:", labels) + +# Step 4: Run the importance analysis and retrieve top important variables: +ia = features.importance_analysis(labels, seed = 13, n_trees=500, batch_size=20) +top_variables = ia.important_variables() + +# Step 5: Display the results. +print("%s\t%s" % ('Variable', 'Importance')) +for var_and_imp in top_variables: + print("%s\t%s" % var_and_imp) + +# Stop the Spark session +spark.stop() + From 2605b6eb7e113d7eb6a109e2358a82d1799d5883 Mon Sep 17 00:00:00 2001 From: "Christina.xu" Date: Tue, 28 Oct 2025 15:22:10 +1000 Subject: [PATCH 02/14] Update note.md --- dev/docker/note.md | 59 +++++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/dev/docker/note.md b/dev/docker/note.md index 6f34e390..d380d4ec 100644 --- a/dev/docker/note.md +++ b/dev/docker/note.md @@ -1,36 +1,36 @@ -# try https://variantspark.readthedocs.io/en/latest/getting_started.html +try https://variantspark.readthedocs.io/en/latest/getting_started.html ## ask gpt to create dockerfile - any python base image eg. FROM python:3.8, don't support openjdk package installation. - try ubuntu which set python 3.8 as default. ## test it interactively locl - - docker build -t vsapp . - - docker run -it --name vsrun1 vsapp +- docker build -t vsapp . +- docker run -it --name vsrun1 vsapp +``` + python --version # Should show Python 3.8.x + java -version # Should show OpenJDK 8 + pip3 show variant-spark # To find where variant-spark is installed +``` +- docker cp variantspark_script.py vsrun2:/app/VariantSpark/variantspark_script.py # copy file from local to docker +- vs works without mvn install but only pip install inside docker container + - variant-spark importance -if gitHub/VariantSpark/data/chr22_1000.vcf -ff gitHub/VariantSpark/data/chr22-labels.csv -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13 ``` - python --version # Should show Python 3.8.x - java -version # Should show OpenJDK 8 - pip3 show variant-spark # To find where variant-spark is installed + root@16542009db87:/app/VariantSpark# variant-spark importance -if gitHub/VariantSpark/data/chr22_1000.vcf -ff gitHub/VariantSpark/data/chr22-labels.csv -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13 + 25/10/27 08:41:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable + log4j:WARN No appenders could be found for logger (au.csiro.variantspark.cli.ImportanceCmd). + ... + Last build trees: 20, time: 779 ms, timePerTree: 38 ms + Finished trees: 500, current oobError: 0.016483516483516484, totalTime: 36.185 s, avg timePerTree: 0.07237 s + Last build trees: 20, time: 675 ms, timePerTree: 33 ms + Random forest oob accuracy: 0.016483516483516484, took: 36.4 s + variable,importance + 22_16050408_T_C,18.484457676767143 + 22_16051480_T_C,17.593204808682323 + ... ``` - - docker cp variantspark_script.py vsrun2:/app/VariantSpark/variantspark_script.py # copy file from local to docker - - vs works without mvn install but only pip install inside docker container - - variant-spark importance -if gitHub/VariantSpark/data/chr22_1000.vcf -ff gitHub/VariantSpark/data/chr22-labels.csv -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13 - ``` - root@16542009db87:/app/VariantSpark# variant-spark importance -if gitHub/VariantSpark/data/chr22_1000.vcf -ff gitHub/VariantSpark/data/chr22-labels.csv -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13 - 25/10/27 08:41:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
   log4j:WARN No appenders could be found for logger (au.csiro.variantspark.cli.ImportanceCmd).
   ...
   Last build trees: 20, time: 779 ms, timePerTree: 38 ms
   Finished trees: 500, current oobError: 0.016483516483516484, totalTime: 36.185 s, avg timePerTree: 0.07237 s
   Last build trees: 20, time: 675 ms, timePerTree: 33 ms
   Random forest oob accuracy: 0.016483516483516484, took: 36.4 s
   variable,importance
   22_16050408_T_C,18.484457676767143
   22_16051480_T_C,17.593204808682323
   ...
   ```
   - variant-spark --spark --master 'local[*]' -- importance -if gitHub/VariantSpark/data/chr22_1000.vcf -ff gitHub/VariantSpark/data/chr22-labels.csv -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13
-  - pip3 install --no-cache-dir -r /app/VariantSpark/requirements.txt # install python dependency
+- pip3 install --no-cache-dir -r /app/VariantSpark/requirements.txt # install python dependency
@@ -39,8 +39,9 @@ try https://variantspark.readthedocs.io/en/latest/getting_started.html
     - /usr/local/lib/python3.8/dist-packages/varspark/jars/variant-spark_2.12-0.5.5-all.jar
   - but this jar is not a fat jar: it does not include au.csiro.aehrc.third.hail-is
 - hail-all-spark.jar : installed by pip3 install hail==0.2.74 from requirements.txt
-  - is used by the Python hail package at runtime.
-  - jar tf hail-all-spark.jar | grep hail | grep SparkBackend
+    - is used by the Python hail package at runtime. 
+    - /usr/local/lib/python3.8/dist-packages/hail/backend/hail-all-spark.jar
+    - jar tf hail-all-spark.jar | grep hail | grep SparkBackend
 
 - mvn install with hail
   - Maven will try to download a JAR matching hail_2.12_3.1:0.2.74 from repo: au.csiro.aehrc.third.hail-is based on pom.xml
@@ -57,7 +58,7 @@ try https://variantspark.readthedocs.io/en/latest/getting_started.html
   - original from https://github.com/aehrc/VariantSpark/tree/master/bin/variant-spark
   - it requires Spark to be set up
 
-##
+
 
 
 ## Docker Build on ARM vs. AMD64
@@ -92,4 +93,4 @@ try https://variantspark.readthedocs.io/en/latest/getting_started.html
 Requires:
 Required-by: variant-spark
 
-  ```
\ No newline at end of file
+  ```
From 1cda03a7a11162e8acdfd9196d5c4bc1a080cb6e Mon Sep 17 00:00:00 2001
From: "Christina.xu"
Date: Tue, 28 Oct 2025 15:23:11 +1000
Subject: [PATCH 03/14] Fix formatting and update notes on Hail package

---
 dev/docker/note.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dev/docker/note.md b/dev/docker/note.md
index d380d4ec..8089d634 100644
--- a/dev/docker/note.md
+++ b/dev/docker/note.md
@@ -39,9 +39,9 @@ try https://variantspark.readthedocs.io/en/latest/getting_started.html
     - /usr/local/lib/python3.8/dist-packages/varspark/jars/variant-spark_2.12-0.5.5-all.jar
   - but this jar is not a fat jar: it does not include au.csiro.aehrc.third.hail-is
 - hail-all-spark.jar : installed by pip3 install hail==0.2.74 from requirements.txt
-    - is used by the Python hail package at runtime. 
-    - /usr/local/lib/python3.8/dist-packages/hail/backend/hail-all-spark.jar
-    - jar tf hail-all-spark.jar | grep hail | grep SparkBackend
+  - is used by the Python hail package at runtime.
+  - /usr/local/lib/python3.8/dist-packages/hail/backend/hail-all-spark.jar
+  - jar tf hail-all-spark.jar | grep hail | grep SparkBackend
 
 - mvn install with hail
   - Maven will try to download a JAR matching hail_2.12_3.1:0.2.74 from repo: au.csiro.aehrc.third.hail-is based on pom.xml
From 129783649093c2e510e49b6bb6e22d8d2ebc8dfa Mon Sep 17 00:00:00 2001
From: "Christina.xu"
Date: Tue, 28 Oct 2025 16:02:11 +1000
Subject: [PATCH 04/14] Update note.md

---
 dev/docker/note.md | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/dev/docker/note.md b/dev/docker/note.md
index 8089d634..60df09c1 100644
--- a/dev/docker/note.md
+++ b/dev/docker/note.md
@@ -32,12 +32,15 @@ try https://variantspark.readthedocs.io/en/latest/getting_started.html
   - variant-spark --spark --master 'local[*]' -- importance -if gitHub/VariantSpark/data/chr22_1000.vcf -ff gitHub/VariantSpark/data/chr22-labels.csv -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13
 - pip3 install --no-cache-dir -r /app/VariantSpark/requirements.txt # install python dependency
 
-## about pip install variant-spark
-- variant-spark_2.12-0.5.5-all.jar : installed by pip install variant-spark
-  - jar tf variant-spark_2.12-0.5.5-all.jar
-  - jar is included inside the python package:
-    - /usr/local/lib/python3.8/dist-packages/varspark/jars/variant-spark_2.12-0.5.5-all.jar
+## pip install variant-spark
+- /usr/local/lib/python3.8/dist-packages/varspark is installed, includes
+  - /usr/local/lib/python3.8/dist-packages/varspark/jars/variant-spark_2.12-0.5.5-all.jar
   - but this jar is not a fat jar: it does not include au.csiro.aehrc.third.hail-is
+  - /usr/local/lib/python3.8/dist-packages/varspark: from variant-spark-0.5.5.tar.gz/varspark
+- /usr/local/share/variant-spark/data/chr22*.vcf: from variant-spark-0.5.5.tar.gz/target/data
+- /usr/local/bin/jvariant-spark and variant-spark etc: from variant-spark-0.5.5.tar.gz/target/bin
+  -
+## p
 - hail-all-spark.jar : installed by pip3 install hail==0.2.74 from requirements.txt
   - is used by the Python hail package at runtime.
   - /usr/local/lib/python3.8/dist-packages/hail/backend/hail-all-spark.jar
From 2ae125ba9f4647f1f2f6a1a544c290e3e7644520 Mon Sep 17 00:00:00 2001
From: "Christina.xu"
Date: Tue, 28 Oct 2025 16:05:26 +1000
Subject: [PATCH 05/14] Update note.md

---
 dev/docker/note.md | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/dev/docker/note.md b/dev/docker/note.md
index 60df09c1..cb97b3d5 100644
--- a/dev/docker/note.md
+++ b/dev/docker/note.md
@@ -30,8 +30,12 @@ try https://variantspark.readthedocs.io/en/latest/getting_started.html
   ...
   ```
   - variant-spark --spark --master 'local[*]' -- importance -if gitHub/VariantSpark/data/chr22_1000.vcf -ff gitHub/VariantSpark/data/chr22-labels.csv -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13
-- pip3 install --no-cache-dir -r /app/VariantSpark/requirements.txt # install python dependency
+- `which variant-spark` # to find variant-spark bash script
+  - seek bash script: https://github.com/aehrc/VariantSpark/tree/master/bin/variant-spark
+  - it requires Spark to be set up
+
+- pip3 install --no-cache-dir -r /app/VariantSpark/requirements.txt # install python dependency
 
 ## pip install variant-spark
 - /usr/local/lib/python3.8/dist-packages/varspark is installed, includes
@@ -42,10 +46,10 @@ try https://variantspark.readthedocs.io/en/latest/getting_started.html
   - /usr/local/lib/python3.8/dist-packages/varspark: from variant-spark-0.5.5.tar.gz/varspark
 - /usr/local/share/variant-spark/data/chr22*.vcf: from variant-spark-0.5.5.tar.gz/target/data
 - /usr/local/bin/jvariant-spark and variant-spark etc: from variant-spark-0.5.5.tar.gz/target/bin
-  -
-## p
+
+## pip install hail==0.2.74
 - hail-all-spark.jar : installed by pip3 install hail==0.2.74 from requirements.txt
   - is used by the Python hail package at runtime.
   - /usr/local/lib/python3.8/dist-packages/hail/backend/hail-all-spark.jar
   - jar tf hail-all-spark.jar | grep hail | grep SparkBackend
 
-- mvn install with hail
+## mvn install
   - Maven will try to download a JAR matching hail_2.12_3.1:0.2.74 from repo: au.csiro.aehrc.third.hail-is based on pom.xml
   - the JAR is stored in your local Maven repository (~/.m2/repository/au/csiro/aehrc/third/hail-is/hail_2.12_3.1/0.2.74/).
@@ -57,9 +61,7 @@ try https://variantspark.readthedocs.io/en/latest/getting_started.html
   - python: spark = SparkSession.builder.config('spark.jars', vs.find_jar()).getOrCreate() adds spark.jars to the Spark classpath
   - python: vshl.random_forest_model(...) calls scala RFModel.scala based on the Spark classpath
   - summary: python calls scala via hail-all-spark.jar, not the mvn-installed hail jar
 
-- which variant-spark # to find variant-spark bash script
-  - original from https://github.com/aehrc/VariantSpark/tree/master/bin/variant-spark
-  - it requires Spark to be set up
+
 
From 6be96920bf4495b950c3cd3a6b94eb97c01c9d5e Mon Sep 17 00:00:00 2001
From: "Christina.xu"
Date: Tue, 28 Oct 2025 16:06:53 +1000
Subject: [PATCH 06/14] Update notes on variant-spark installation

Clarified installation notes for variant-spark dependencies.

---
 dev/docker/note.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dev/docker/note.md b/dev/docker/note.md
index cb97b3d5..25ad375c 100644
--- a/dev/docker/note.md
+++ b/dev/docker/note.md
@@ -37,6 +37,7 @@ try https://variantspark.readthedocs.io/en/latest/getting_started.html
 
 - pip3 install --no-cache-dir -r /app/VariantSpark/requirements.txt # install python dependency
 ## pip install variant-spark
+- it only installs compulsory dependencies like typedecorator, but does not include hail and pyspark.
 - /usr/local/lib/python3.8/dist-packages/varspark is installed, includes
   - /usr/local/lib/python3.8/dist-packages/varspark/jars/variant-spark_2.12-0.5.5-all.jar
   - but this jar is not a fat jar: it does not include au.csiro.aehrc.third.hail-is
From 545858a0075318c0650417bdb6e5e144dd3f5e92 Mon Sep 17 00:00:00 2001
From: "Christina.xu"
Date: Tue, 28 Oct 2025 16:36:45 +1000
Subject: [PATCH 07/14] Update note.md

---
 dev/docker/note.md | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/dev/docker/note.md b/dev/docker/note.md
index 25ad375c..a8061a05 100644
--- a/dev/docker/note.md
+++ b/dev/docker/note.md
@@ -13,9 +13,11 @@ try https://variantspark.readthedocs.io/en/latest/getting_started.html
   pip3 show variant-spark # To find where variant-spark is installed
 ```
 - docker cp variantspark_script.py vsrun2:/app/VariantSpark/variantspark_script.py # copy file from local to docker
-- vs works with only pip install inside the docker container (no mvn install needed)
-  - variant-spark importance -if gitHub/VariantSpark/data/chr22_1000.vcf -ff gitHub/VariantSpark/data/chr22-labels.csv -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13
+- vs works with only `pip install variant-spark`:
+  - `which variant-spark` # to find https://github.com/aehrc/VariantSpark/tree/master/bin/variant-spark
+  - to run the scala importance command, hail doesn't need to be installed (no mvn install, no pip install hail)
 ```
+  root@16542009db87:/app/VariantSpark# variant-spark importance -if gitHub/VariantSpark/data/chr22_1000.vcf -ff gitHub/VariantSpark/data/chr22-labels.csv -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13
   root@16542009db87:/app/VariantSpark# variant-spark importance -if gitHub/VariantSpark/data/chr22_1000.vcf -ff gitHub/VariantSpark/data/chr22-labels.csv -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13
   25/10/27 08:41:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
@@ -30,12 +32,11 @@ try https://variantspark.readthedocs.io/en/latest/getting_started.html
   ...
   ```
   - variant-spark --spark --master 'local[*]' -- importance -if gitHub/VariantSpark/data/chr22_1000.vcf -ff gitHub/VariantSpark/data/chr22-labels.csv -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13
-
-- `which variant-spark` # to find variant-spark bash script
-  - seek bash script: https://github.com/aehrc/VariantSpark/tree/master/bin/variant-spark
-  - it requires Spark to be set up
-
-- pip3 install --no-cache-dir -r /app/VariantSpark/requirements.txt # install python dependency
+  - `python compute_local_fdr.py` failed with error "ModuleNotFoundError: No module named 'hail'"
+  - `python run_importance_chr22.py` failed with error "ModuleNotFoundError: No module named 'pyspark'"
+  - pip3 install --no-cache-dir -r /app/VariantSpark/requirements.txt # install python dependency
+  - now hail and pyspark are installed in the container: /usr/local/lib/python3.8/dist-packages/hail/
+  - now `python compute_local_fdr.py` and `python run_importance_chr22.py` work
 ## pip install variant-spark
 - it only installs compulsory dependencies like typedecorator, but does not include hail and pyspark.
 - /usr/local/lib/python3.8/dist-packages/varspark is installed, includes
   - /usr/local/lib/python3.8/dist-packages/varspark/jars/variant-spark_2.12-0.5.5-all.jar
   - but this jar is not a fat jar: it does not include au.csiro.aehrc.third.hail-is
From d0e7f4e29f592de4c32a143a490b818c91e189ce Mon Sep 17 00:00:00 2001
From: "Xu, Qinying (H&B, Herston)"
Date: Tue, 28 Oct 2025 18:08:47 +1000
Subject: [PATCH 08/14] update

---
 dev/docker/Dockerfile                   | 22 ++++++++++++--------
 dev/docker/local_run-importance-ch22.sh |  8 ++++++++
 dev/docker/requirements.txt             |  4 +++-
 3 files changed, 25 insertions(+), 9 deletions(-)
 create mode 100644 dev/docker/local_run-importance-ch22.sh

diff --git a/dev/docker/Dockerfile b/dev/docker/Dockerfile
index eb3d0b05..87b10a1a 100644
--- a/dev/docker/Dockerfile
+++ b/dev/docker/Dockerfile
@@ -7,9 +7,9 @@ WORKDIR /app
 # Avoid interactive prompts during package installation
 ENV DEBIAN_FRONTEND=noninteractive
 
-# Install Python 3.8, pip, Java 8, and Git
+# Install Python 3.7, pip, Java 8, and Git
 RUN apt-get update && apt-get install -y \
-    python3.8 \
+    python3.7 \
     python3-pip \
     openjdk-8-jdk \
     git wget \
@@ -23,11 +23,11 @@ RUN ln -s /usr/bin/python3.8 /usr/bin/python
 ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
 ENV PATH=$JAVA_HOME/bin:$PATH
 
-# Install Apache Spark 3.1.2 with Hadoop 3.2, only works on linux/amd64
-RUN wget -q https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz && \
-    tar -xzf spark-3.1.2-bin-hadoop3.2.tgz -C /opt && \
-    rm spark-3.1.2-bin-hadoop3.2.tgz && \
-    ln -s /opt/spark-3.1.2-bin-hadoop3.2 /opt/spark
+# Install Apache Spark 3.1.1 with Hadoop 3.2, only works on linux/amd64
+RUN wget -q https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz && \
+    tar -xzf spark-3.1.1-bin-hadoop3.2.tgz -C /opt && \
+    rm spark-3.1.1-bin-hadoop3.2.tgz && \
+    ln -s /opt/spark-3.1.1-bin-hadoop3.2 /opt/spark
 
 # Set Spark environment variables
 ENV SPARK_HOME=/opt/spark
@@ -38,11 +38,17 @@ ENV PATH=$SPARK_HOME/bin:$PATH
 # Install VariantSpark
 RUN pip3 install --no-cache-dir variant-spark
 
+# Install matching PySpark + Hail
+RUN pip install --no-cache-dir pyspark==3.1.1 "variant-spark[hail,deps]" hail==0.2.74
+
 # Clone the VariantSpark repository
 # RUN git clone https://github.com/aehrc/VariantSpark.git
 
 # Set working directory to the cloned repository
 WORKDIR /app/VariantSpark
+RUN ln -s /usr/local/share/variant-spark/data/ .
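+# (The symlink above assumes pip placed the bundled sample data under
+# /usr/local/share/variant-spark/data, as noted in note.md.)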
# Command to run -CMD ["bash"] +CMD ["bash variant-spark -h"] diff --git a/dev/docker/local_run-importance-ch22.sh b/dev/docker/local_run-importance-ch22.sh new file mode 100644 index 00000000..d95988f2 --- /dev/null +++ b/dev/docker/local_run-importance-ch22.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +PWD=$(cd `dirname "$0"`; pwd) +PATH=${PATH}:${PWD}/bin +export VS_ECHO_CMDLINE=YES + +variant-spark --spark --master 'local[*]' -- \ + importance -if ${PWD}/data/chr22_1000.vcf -ff ${PWD}/data/chr22-labels.csv -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13 "$@" diff --git a/dev/docker/requirements.txt b/dev/docker/requirements.txt index dd71ac96..13a5a979 100644 --- a/dev/docker/requirements.txt +++ b/dev/docker/requirements.txt @@ -1,8 +1,10 @@ +# python 3.7 # varspark dependencies Jinja2==3.0.3 pandas==1.1.4 typedecorator==0.0.5 -pyspark==3.1.3 +# 3.1.1 most suits hail 0.2.74 +pyspark==3.1.1 scipy==1.6.3 numpy==1.21.2 patsy==0.5.2 From cb21238e26fbb4714937c51b2627a36f10ab192d Mon Sep 17 00:00:00 2001 From: "Christina.xu" Date: Tue, 28 Oct 2025 23:56:16 +1000 Subject: [PATCH 09/14] Update note.md --- dev/docker/note.md | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/dev/docker/note.md b/dev/docker/note.md index a8061a05..695bd0b0 100644 --- a/dev/docker/note.md +++ b/dev/docker/note.md @@ -65,8 +65,47 @@ try https://variantspark.readthedocs.io/en/latest/getting_started.html +## python 3.7 vs 3.8 +- Ubuntu 20.04 or older, or Debian, where Python 3.7 reached End of Life (EOL: June 2023) and was removed from repos +- Ubuntu 20.04 contains python3.8 as default (python3). + - python3.8 with importance call : `import varspark as vs` + ``` + root@a874b29b622c:/app/VariantSpark# python3 run_importance_chr22.py + 25/10/28 13:47:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable + Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties + Setting default log level to "WARN". + To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). + Features loaded: + Labels loaded: au.csiro.variantspark.input.CsvLabelSource@47be15fa + + Variable Importance + 22_16050408_T_C 0.0008041915634907004 + 22_16051480_T_C 0.0007654163908573393 + 22_16050678_C_T 0.0006921965571074235 + 22_16053197_G_T 0.00065148141258399 + 22_16053435_G_T 0.0006144056480311232 + 22_16051107_C_A 0.0006139653108376215 + 22_16051882_C_T 0.0005007281009782979 + 22_16053797_T_C 0.0004618498469961836 + 22_16052838_T_A 0.0004613601158382499 + 22_16053509_A_G 0.0004548314795407337 + + ``` + - python3.8 call compute_local_fdr.py : `import varspark as vs` + ``` + root@a874b29b622c:/app/VariantSpark# python3 compute_local_fdr.py + An error occurred: module 'importlib.metadata' has no attribute 'packages_distributions' + /usr/local/lib/python3.8/dist-packages/google/api_core/_python_version_support.py:237: FutureWarning: You are using a non-supported Python version (3.8.10). Google will not post any further updates to google.api_core supporting this Python version. Please upgrade to the latest Python version, or at least Python 3.10, and then update google.api_core. 
+  warnings.warn(message, FutureWarning)
+  /usr/local/lib/python3.8/dist-packages/scipy/__init__.py:138: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.24.4)
+  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion} is required for this version of "
+  Traceback (most recent call last):
+    File "compute_local_fdr.py", line 1, in <module>
+      import hail as hl
+    File "/usr/local/lib/python3.8/dist-packages/hail/__init__.py", line 48, in <module>
+  ```
 ## Docker Build on ARM vs. AMD64
 - With `docker build -t vsapp .` on your Mac (with an ARM-based chip like M1/M2), Docker builds the image for the native architecture, which is linux/arm64.
 - With `docker build --platform linux/amd64 -t vsapp .` you instruct Docker to build the image for the linux/amd64 architecture, even on your ARM-based Mac.
 - The openjdk-8-jdk package in Ubuntu's repositories is architecture-specific. For linux/arm64, it installs java-8-openjdk-arm64; for linux/amd64, it installs java-8-openjdk-amd64.
 
 - `uname -m` # shows x86_64 for AMD64; or aarch64 for ARM64
@@ -73,15 +112,14 @@ try https://variantspark.readthedocs.io/en/latest/getting_started.html
 
 
-# optimize dockerfile with a two-stage layout
 
 # to do list
 - pip3 show variant-spark shows Version: 0.5.5 but author Piotr Szul et al. is wrong
 - pip3 install variant-spark does not automatically install pyspark as a dependency, got error
   ```
   from pyspark import SparkConf
   ModuleNotFoundError: No module named 'pyspark'
   ```
-  - pip3 show Jinja2 pandas typedecorator hail pyspark scipy numpy patsy statsmodels seaborn # only typedecorator installed
+- pip3 show Jinja2 pandas typedecorator hail pyspark scipy numpy patsy statsmodels seaborn # only typedecorator installed
   ```
   root@16542009db87:/app/VariantSpark# pip3 show Jinja2 pandas typedecorator hail pyspark scipy numpy patsy statsmodels seaborn
   WARNING: Package(s) not found: Jinja2, hail, numpy, pandas, patsy, pyspark, scipy, seaborn, statsmodels
From 4d1e0cb029f8a2a990092b4002c8b0c19149902b Mon Sep 17 00:00:00 2001
From: "Christina.xu"
Date: Tue, 28 Oct 2025 23:57:12 +1000
Subject: [PATCH 10/14] Update note.md

---
 dev/docker/note.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/docker/note.md b/dev/docker/note.md
index 695bd0b0..27e3121e 100644
--- a/dev/docker/note.md
+++ b/dev/docker/note.md
@@ -91,7 +91,7 @@ try https://variantspark.readthedocs.io/en/latest/getting_started.html
   22_16053509_A_G	0.0004548314795407337
 
   ```
-  - python3.8 running compute_local_fdr.py : `import varspark as vs`
+  - python3.8 running compute_local_fdr.py : `import hail as hl; import varspark.hail as vshl`
   ```
   root@a874b29b622c:/app/VariantSpark# python3 compute_local_fdr.py
   An error occurred: module 'importlib.metadata' has no attribute 'packages_distributions'
From ce9fdd40b55d80ec6370cd76de9e3103e92ecc22 Mon Sep 17 00:00:00 2001
From: "Christina.xu"
Date: Wed, 29 Oct 2025 00:14:27 +1000
Subject: [PATCH 11/14] Update note.md

---
 dev/docker/note.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dev/docker/note.md b/dev/docker/note.md
index 27e3121e..b3032257 100644
--- a/dev/docker/note.md
+++ b/dev/docker/note.md
@@ -91,7 +91,7 @@ try https://variantspark.readthedocs.io/en/latest/getting_started.html
   22_16053509_A_G	0.0004548314795407337
 
   ```
-  - python3.8 running compute_local_fdr.py : `import hail as hl; import varspark.hail as vshl`
+  - python3.8 running compute_local_fdr.py : `import hail as hl; import varspark.hail as vshl` hits version conflicts, because `RUN pip install --no-cache-dir pyspark==3.1.1 "variant-spark[hail,deps]" hail==0.2.74` installs a list of dependencies under /usr/local/lib/python3.8/dist-packages/, all at incorrect versions
   ```
   root@a874b29b622c:/app/VariantSpark# python3 compute_local_fdr.py
   An error occurred: module 'importlib.metadata' has no attribute 'packages_distributions'
@@ -101,3 +101,4 @@ try https://variantspark.readthedocs.io/en/latest/getting_started.html
       import hail as hl
     File "/usr/local/lib/python3.8/dist-packages/hail/__init__.py", line 48, in <module>
   ```
+  - python3.8 running compute_local_fdr.py after `pip install -r requirements.txt` to install the correct versions: now `import hail as hl` works.
From d9db7d4d487473275002156c8f1d009d04c6c8fb Mon Sep 17 00:00:00 2001
From: "Christina.xu"
Date: Wed, 29 Oct 2025 00:16:54 +1000
Subject: [PATCH 12/14] Update note.md

---
 dev/docker/note.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dev/docker/note.md b/dev/docker/note.md
index b3032257..36f47b49 100644
--- a/dev/docker/note.md
+++ b/dev/docker/note.md
@@ -140,3 +140,4 @@ try https://variantspark.readthedocs.io/en/latest/getting_started.html
 Required-by: variant-spark
 
   ```
+- try micromamba with python 3.7
From f2b9ceace337655fe5c0bd16e6b573f2fde9de90 Mon Sep 17 00:00:00 2001
From: "Xu, Qinying (H&B, Herston)"
Date: Tue, 4 Nov 2025 09:33:48 +1000
Subject: [PATCH 13/14] install VS in docker

---
 Dockerfile | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 Dockerfile

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..b6a43ab1
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,65 @@
+# Use Ubuntu 20.04 as the base image
+FROM --platform=linux/amd64 ubuntu:20.04
+
+# Set working directory
+WORKDIR /app
+
+# Avoid interactive prompts during package installation
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install pip (pulls in Python 3.8), Java 8, and Git
+RUN apt-get update && apt-get install -y \
+    python3-pip \
+    openjdk-8-jdk \
+    git wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set Python 3.8 as default python
+RUN ln -s /usr/bin/python3.8 /usr/bin/python
+
+# Set JAVA_HOME environment variable.
+# On linux/amd64, openjdk-8-jdk installs java-8-openjdk-amd64.
+ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
+ENV PATH=$JAVA_HOME/bin:$PATH
+
+# Install Apache Spark 3.1.1 with Hadoop 3.2, only works on linux/amd64
+RUN wget -q https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz && \
+    tar -xzf spark-3.1.1-bin-hadoop3.2.tgz -C /opt && \
+    rm spark-3.1.1-bin-hadoop3.2.tgz && \
+    ln -s /opt/spark-3.1.1-bin-hadoop3.2 /opt/spark
+
+# Set Spark environment variables
+ENV SPARK_HOME=/opt/spark
+ENV PATH=$SPARK_HOME/bin:$PATH
+# ENV PYSPARK_PYTHON=python3.8
+# ENV PYSPARK_DRIVER_PYTHON=python3.8
+
+# Install VariantSpark
+RUN pip3 install --no-cache-dir variant-spark
+
+# Install matching PySpark + Hail
+RUN pip install --no-cache-dir pyspark==3.1.1 variant-spark hail==0.2.74 \
+Jinja2==3.0.3 \
+pandas==1.1.4 \
+typedecorator==0.0.5 \
+scipy==1.6.3 \
+numpy==1.21.2 \
+patsy==0.5.2 \
+statsmodels==0.13.2 \
+seaborn==0.11.2 \
+chardet==3.0.4 \
+google-auth==1.35.0 \
+google-cloud-core==1.7.3 \
+google-cloud-storage==1.25.0 \
+
+# Clone the VariantSpark repository
+# RUN git clone https://github.com/aehrc/VariantSpark.git
+
+# Set working directory to the cloned repository
+WORKDIR /app/VariantSpark
+RUN ln -s /usr/local/share/variant-spark/data/ .
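+# NB: exec-form CMD needs the executable and each flag as separate array
+# elements; a single string would be looked up as one executable name.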
+
+# Command to run
+CMD ["variant-spark", "-h"]
From d5461cba97eece0828a88895561f44e9cc8fef2a Mon Sep 17 00:00:00 2001
From: "Xu, Qinying (H&B, Herston)"
Date: Fri, 7 Nov 2025 10:58:31 +1000
Subject: [PATCH 14/14] add variantspark version

---
 Dockerfile | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index b6a43ab1..00dcfbb1 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -35,10 +35,11 @@ ENV PATH=$SPARK_HOME/bin:$PATH
 # ENV PYSPARK_DRIVER_PYTHON=python3.8
 
 # Install VariantSpark
-RUN pip3 install --no-cache-dir variant-spark
+RUN pip3 install --no-cache-dir variant-spark==0.5.5
 
 # Install matching PySpark + Hail
-RUN pip install --no-cache-dir pyspark==3.1.1 variant-spark hail==0.2.74 \
+RUN pip install --no-cache-dir pyspark==3.1.1 \
+hail==0.2.74 \
 Jinja2==3.0.3 \
 pandas==1.1.4 \
 typedecorator==0.0.5 \
 scipy==1.6.3 \
 numpy==1.21.2 \
 patsy==0.5.2 \
 statsmodels==0.13.2 \
 seaborn==0.11.2 \
 chardet==3.0.4 \
 google-auth==1.35.0 \
 google-cloud-core==1.7.3 \
-google-cloud-storage==1.25.0 \
+google-cloud-storage==1.25.0
+
+RUN pip install variant-spark==0.5.5
 
 # Clone the VariantSpark repository
 # RUN git clone https://github.com/aehrc/VariantSpark.git