66 changes: 66 additions & 0 deletions Dockerfile
@@ -0,0 +1,66 @@
# Use Ubuntu 20.04 as the base image
FROM --platform=linux/amd64 ubuntu:20.04

# Set working directory
WORKDIR /app

# Avoid interactive prompts during package installation
ENV DEBIAN_FRONTEND=noninteractive

# Install pip, Java 8, Git, and wget (Ubuntu 20.04 ships Python 3.8 as python3)
RUN apt-get update && apt-get install -y \
python3-pip \
openjdk-8-jdk \
git wget \
&& rm -rf /var/lib/apt/lists/*

# Set Python 3.8 as default python
RUN ln -s /usr/bin/python3.8 /usr/bin/python

# Set JAVA_HOME environment variable;
# on linux/amd64, openjdk-8-jdk installs java-8-openjdk-amd64.
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
ENV PATH=$JAVA_HOME/bin:$PATH

# Install Apache Spark 3.1.1 with Hadoop 3.2; only works for linux/amd64
RUN wget -q https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz && \
tar -xzf spark-3.1.1-bin-hadoop3.2.tgz -C /opt && \
rm spark-3.1.1-bin-hadoop3.2.tgz && \
ln -s /opt/spark-3.1.1-bin-hadoop3.2 /opt/spark

# Set Spark environment variables
ENV SPARK_HOME=/opt/spark
ENV PATH=$SPARK_HOME/bin:$PATH
# ENV PYSPARK_PYTHON=python3.8
# ENV PYSPARK_DRIVER_PYTHON=python3.8

# Install VariantSpark
RUN pip3 install --no-cache-dir variant-spark==0.5.5

# Install matching PySpark + Hail
RUN pip install --no-cache-dir pyspark==3.1.1 \
hail==0.2.74 \
Jinja2==3.0.3 \
pandas==1.1.4 \
typedecorator==0.0.5 \
scipy==1.6.3 \
numpy==1.21.2 \
patsy==0.5.2 \
statsmodels==0.13.2 \
seaborn==0.11.2 \
chardet==3.0.4 \
google-auth==1.35.0 \
google-cloud-core==1.7.3 \
google-cloud-storage==1.25.0

RUN pip install variant-spark==0.5.5

# Clone the VariantSpark repository
# RUN git clone https://github.com/aehrc/VariantSpark.git

# Set working directory to the cloned repository
WORKDIR /app/VariantSpark
RUN ln -s /usr/local/share/variant-spark/data/ .

# Command to run
CMD ["variant-spark -h"]
52 changes: 52 additions & 0 deletions dev/docker/Dockerfile
@@ -0,0 +1,52 @@
# Use Ubuntu 20.04 as the base image
FROM --platform=linux/amd64 ubuntu:20.04

# Set working directory
WORKDIR /app

# Avoid interactive prompts during package installation
ENV DEBIAN_FRONTEND=noninteractive

# Install Python 3.7, pip, Java 8, Git, and wget
RUN apt-get update && apt-get install -y \
python3.7 \
python3-pip \
openjdk-8-jdk \
git wget \
&& rm -rf /var/lib/apt/lists/*

# Set Python 3.8 as default python
RUN ln -s /usr/bin/python3.8 /usr/bin/python

# Set JAVA_HOME environment variable;
# on linux/amd64, openjdk-8-jdk installs java-8-openjdk-amd64.
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
ENV PATH=$JAVA_HOME/bin:$PATH

# Install Apache Spark 3.1.1 with Hadoop 3.2; only works for linux/amd64
RUN wget -q https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz && \
tar -xzf spark-3.1.1-bin-hadoop3.2.tgz -C /opt && \
rm spark-3.1.1-bin-hadoop3.2.tgz && \
ln -s /opt/spark-3.1.1-bin-hadoop3.2 /opt/spark

# Set Spark environment variables
ENV SPARK_HOME=/opt/spark
ENV PATH=$SPARK_HOME/bin:$PATH
# ENV PYSPARK_PYTHON=python3.8
# ENV PYSPARK_DRIVER_PYTHON=python3.8

# Install VariantSpark
RUN pip3 install --no-cache-dir variant-spark

# Install matching PySpark + Hail
RUN pip install --no-cache-dir pyspark==3.1.1 "variant-spark[hail,deps]" hail==0.2.74

# Clone the VariantSpark repository
# RUN git clone https://github.com/aehrc/VariantSpark.git

# Set working directory to the cloned repository
WORKDIR /app/VariantSpark
RUN ln -s /usr/local/share/variant-spark/data/ .

# Command to run
CMD ["bash variant-spark -h"]
37 changes: 37 additions & 0 deletions dev/docker/compute_local_fdr.py
@@ -0,0 +1,37 @@
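# compute_local_fdr.py: train a VariantSpark random forest on the chr22 sample data via Hail
# and estimate the local FDR of the variable importances.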
import hail as hl
import varspark.hail as vshl
from matplotlib import pyplot as plt
vshl.init()

vds = hl.import_vcf('./data/chr22_1000.vcf')
labels = hl.import_table('./data/chr22-labels-hail.csv', impute = True, delimiter=",").key_by('sample')

vds = vds.annotate_cols(label = labels[vds.s])
vds.cols().show(3)

rf_model = vshl.random_forest_model(y=vds.label['x22_16050408'],
x=vds.GT.n_alt_alleles(), seed = 13, mtry_fraction = 0.05, min_node_size = 5, max_depth = 10)
rf_model.fit_trees(300, 50)

print("OOB error: %s" % rf_model.oob_error())
impTable = rf_model.variable_importance()
impTable.order_by(hl.desc(impTable.importance)).show(10)

fdrCalc = rf_model.get_lfdr()

fig, ax1 = plt.subplots(figsize=(10, 5), layout='constrained')
fdrCalc.plot_log_densities(ax1, cutoff_list=[1, 2, 3, 4, 5, 10, 15, 20], find_automatic_best=True)
plt.show()

fig, ax2 = plt.subplots(figsize=(10, 5), layout='constrained')
fdrCalc.plot_log_hist(ax2, split_count=2)
plt.show()

pvalsDF, fdr = fdrCalc.compute_fdr(countThreshold=2, local_fdr_cutoff=0.05)
print(fdr)
print(pvalsDF)

fig, ax3 = plt.subplots(figsize=(10, 5), layout='constrained')
fdrCalc.plot(ax3)
plt.show()

hl.stop()
8 changes: 8 additions & 0 deletions dev/docker/local_run-importance-ch22.sh
@@ -0,0 +1,8 @@
#!/bin/bash

# Resolve the script's directory and run the VariantSpark CLI importance analysis
# on the bundled chr22 sample data.
DIR=$(cd "$(dirname "$0")"; pwd)
PATH=${PATH}:${DIR}/bin
export VS_ECHO_CMDLINE=YES

variant-spark --spark --master 'local[*]' -- \
    importance -if "${DIR}/data/chr22_1000.vcf" -ff "${DIR}/data/chr22-labels.csv" -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13 "$@"
143 changes: 143 additions & 0 deletions dev/docker/note.md
@@ -0,0 +1,143 @@

try https://variantspark.readthedocs.io/en/latest/getting_started.html
## ask gpt to create dockerfile
- any python base image, e.g. FROM python:3.8, doesn't support installing the openjdk package.
- try ubuntu, which ships python 3.8 as the default.

## test it interactively locally
- docker build -t vsapp .
- docker run -it --name vsrun1 vsapp
```
python --version # Should show Python 3.8.x
java -version # Should show OpenJDK 8
pip3 show variant-spark # To find where variant-spark is installed
```
- docker cp variantspark_script.py vsrun2:/app/VariantSpark/variantspark_script.py # copy file from local to docker
- variant-spark works with only `pip install variant-spark`:
- `which variant-spark` # to find https://github.com/aehrc/VariantSpark/tree/master/bin/variant-spark
- to run the scala importance command, hail does not need to be installed (no mvn install, no pip install hail)
```
root@16542009db87:/app/VariantSpark# variant-spark importance -if gitHub/VariantSpark/data/chr22_1000.vcf -ff gitHub/VariantSpark/data/chr22-labels.csv -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13
root@16542009db87:/app/VariantSpark# variant-spark importance -if gitHub/VariantSpark/data/chr22_1000.vcf -ff gitHub/VariantSpark/data/chr22-labels.csv -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13
25/10/27 08:41:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
log4j:WARN No appenders could be found for logger (au.csiro.variantspark.cli.ImportanceCmd).
...
Last build trees: 20, time: 779 ms, timePerTree: 38 ms
Finished trees: 500, current oobError: 0.016483516483516484, totalTime: 36.185 s, avg timePerTree: 0.07237 s
Last build trees: 20, time: 675 ms, timePerTree: 33 ms
Random forest oob accuracy: 0.016483516483516484, took: 36.4 s
variable,importance
22_16050408_T_C,18.484457676767143
22_16051480_T_C,17.593204808682323
...
```
- `python compute_local_fdr.py` failed with error "ModuleNotFoundError: No module named 'hail'"
- `python run_importance_chr22.py` failed with error "ModuleNotFoundError: No module named 'pyspark'"

- pip3 install --no-cache-dir -r /app/VariantSpark/requirements.txt # install the python dependencies
- now hail and pyspark are installed in the container: /usr/local/lib/python3.8/dist-packages/hail/
- now `python compute_local_fdr.py` and `python run_importance_chr22.py` work
## pip install variant-spark
- it only installs compulsory dependencies like typedecorator, but does not include hail and pyspark (a quick check of what it puts on disk is sketched after this list)
- /usr/local/lib/python3.8/dist-packages/varspark is installed, and includes
- /usr/local/lib/python3.8/dist-packages/varspark/jars/variant-spark_2.12-0.5.5-all.jar
- but this jar is not a fat jar: it does not include au.csiro.aehrc.third.hail-is
- /usr/local/lib/python3.8/dist-packages/varspark: from variant-spark-0.5.5.tar.gz/varspark
- /usr/local/share/variant-spark/data/chr22*.vcf: from variant-spark-0.5.5.tar.gz/target/data
- /usr/local/bin/jvariant-spark and variant-spark etc: from variant-spark-0.5.5.tar.gz/target/bin
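- a minimal sketch (paths assumed from the dist-packages layout above) to confirm what the plain install put on disk:
```
import os
import varspark as vs

print(vs.find_jar())  # expect .../varspark/jars/variant-spark_2.12-0.5.5-all.jar
print(os.path.exists('/usr/local/share/variant-spark/data/chr22_1000.vcf'))  # bundled sample data
print(os.path.exists('/usr/local/bin/variant-spark'))                        # CLI wrapper
```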

## pip install hail==0.2.74
- hail-all-spark.jar: installed by pip3 install hail==0.2.74 from requirements.txt (a quick way to locate it is sketched after this list)
- is used by the Python hail package at runtime.
- /usr/local/lib/python3.8/dist-packages/hail/backend/hail-all-spark.jar
- jar tf hail-all-spark.jar | grep hail | grep SparkBackend
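- a small sketch to locate the jar from the installed hail package (backend/ layout assumed from the path above):
```
import os
import hail

backend_dir = os.path.join(os.path.dirname(hail.__file__), 'backend')
print([f for f in os.listdir(backend_dir) if f.endswith('.jar')])  # expect hail-all-spark.jar
```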

## mvn install
- Maven will try to download a JAR for au.csiro.aehrc.third.hail-is:hail_2.12_3.1:0.2.74, based on pom.xml
- the JAR is stored in your local Maven repository (~/.m2/repository/au/csiro/aehrc/third/hail-is/hail_2.12_3.1/0.2.74/).

- refer to src/main/scala/au/csiro/variantspark/hail/methods/RFModel.scala
- ~/.m2/repository/au/.../hail_2.12_3.1/0.2.74/ is used during mvn test or when running pure scala code
- python: vshl.init() adds hail-all-spark.jar to the Spark classpath.
- python: spark = SparkSession.builder.config('spark.jars', vs.find_jar()).getOrCreate() adds spark.jars to spark classpath
- python: vshl.random_forest_model(...) calls the scala RFModel based on the spark classpath
- summary: the python-to-scala calls depend on hail-all-spark.jar, not on the mvn-installed hail; both entry points are sketched below
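- the two entry points side by side (a sketch only; a real script uses one or the other, as run_importance_chr22.py and compute_local_fdr.py do):
```
# pure varspark path: attach the bundled all-in-one jar to the Spark session explicitly
import varspark as vs
from pyspark.sql import SparkSession
spark = SparkSession.builder.config('spark.jars', vs.find_jar()).getOrCreate()

# hail path: vshl.init() starts Hail and puts hail-all-spark.jar on the classpath,
# which is what vshl.random_forest_model(...) needs to reach the scala RFModel
import varspark.hail as vshl
vshl.init()
```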



## python 3.7 vs 3.8
- on Ubuntu 20.04 or older, and on Debian, Python 3.7 has reached End of Life (EOL: June 2023) and was removed from the repos
- Ubuntu 20.04 ships python3.8 as the default (python3).
- python3.8 with the importance call: `import varspark as vs`
```
root@a874b29b622c:/app/VariantSpark# python3 run_importance_chr22.py
25/10/28 13:47:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Features loaded: <varspark.core.FeatureSource object at 0x7fffe9a84d30>
Labels loaded: au.csiro.variantspark.input.CsvLabelSource@47be15fa

Variable Importance
22_16050408_T_C 0.0008041915634907004
22_16051480_T_C 0.0007654163908573393
22_16050678_C_T 0.0006921965571074235
22_16053197_G_T 0.00065148141258399
22_16053435_G_T 0.0006144056480311232
22_16051107_C_A 0.0006139653108376215
22_16051882_C_T 0.0005007281009782979
22_16053797_T_C 0.0004618498469961836
22_16052838_T_A 0.0004613601158382499
22_16053509_A_G 0.0004548314795407337

```
- calling compute_local_fdr.py with python3.8 (`import hail as hl; import varspark.hail as vshl`) hits version conflicts, because `RUN pip install --no-cache-dir pyspark==3.1.1 "variant-spark[hail,deps]" hail==0.2.74` pulls in a list of dependencies under /usr/local/lib/python3.8/dist-packages/, all at the wrong versions
```

root@a874b29b622c:/app/VariantSpark# python3 compute_local_fdr.py
An error occurred: module 'importlib.metadata' has no attribute 'packages_distributions'
/usr/local/lib/python3.8/dist-packages/google/api_core/_python_version_support.py:237: FutureWarning: You are using a non-supported Python version (3.8.10). Google will not post any further updates to google.api_core supporting this Python version. Please upgrade to the latest Python version, or at least Python 3.10, and then update google.api_core.
warnings.warn(message, FutureWarning)
/usr/local/lib/python3.8/dist-packages/scipy/__init__.py:138: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.24.4)
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion} is required for this version of "
Traceback (most recent call last):
File "compute_local_fdr.py", line 1, in <module>
import hail as hl
File "/usr/local/lib/python3.8/dist-packages/hail/__init__.py", line 48, in <module>

```
- calling compute_local_fdr.py with python3.8 after `pip install -r requirements.txt` installs the correct versions; now `import hail as hl` works (a quick version check is sketched below)
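- a quick sanity check (standard-library sketch) that the pins from requirements.txt are the versions actually installed:
```
from importlib.metadata import version

for pkg in ('pyspark', 'hail', 'numpy', 'scipy', 'pandas', 'variant-spark'):
    print(pkg, version(pkg))
```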

## Docker Build on ARM vs. AMD64
- with `docker build -t vsapp .` on a Mac with an ARM-based chip (M1/M2), Docker builds the image for the native architecture, which is linux/arm64.
- with `docker build --platform linux/amd64 -t vsapp .` you instruct Docker to build the image for linux/amd64, even on an ARM-based Mac.
- The openjdk-8-jdk package in Ubuntu’s repositories is architecture-specific. For linux/arm64, it installs java-8-openjdk-arm64; for linux/amd64, it installs java-8-openjdk-amd64.

- `uname -m` # shows x86_64 for AMD64; or aarch64 for ARM64
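- the same check from inside Python, if that is more convenient:
```
import platform
print(platform.machine())  # x86_64 on linux/amd64, aarch64 on linux/arm64
```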



# to do list
- pip3 show variant-spark shows Version: 0.5.5, but the author field (Piotr Szul et al.) is wrong
- pip3 install variant-spark does not automatically install pyspark as a dependency; got this error:
```
from pyspark import SparkConf
ModuleNotFoundError: No module named 'pyspark'
```
- pip3 show Jinja2 pandas typedecorator hail pyspark scipy numpy patsy statsmodels seaborn # only typedecorator installed
```
root@16542009db87:/app/VariantSpark# pip3 show Jinja2 pandas typedecorator hail pyspark scipy numpy patsy statsmodels seaborn
WARNING: Package(s) not found: Jinja2, hail, numpy, pandas, patsy, pyspark, scipy, seaborn, statsmodels
Name: typedecorator
Version: 0.0.5
Summary: Decorator-based type checking library for Python 2 and 3
Home-page: https://github.com/dobarkod/typedecorator/
Author: Senko Rasic
Author-email: senko.rasic@goodcode.io
License: MIT
Location: /usr/local/lib/python3.8/dist-packages
Requires:
Required-by: variant-spark

```
- try micromamba with python 3.7
19 changes: 19 additions & 0 deletions dev/docker/requirements.txt
@@ -0,0 +1,19 @@
# python 3.7
# varspark dependencies
Jinja2==3.0.3
pandas==1.1.4
typedecorator==0.0.5
# 3.1.1 best matches hail 0.2.74
pyspark==3.1.1
scipy==1.6.3
numpy==1.21.2
patsy==0.5.2
statsmodels==0.13.2
seaborn==0.11.2
hail==0.2.74
# the dependencies below are pinned to versions compatible with hail
chardet==3.0.4
google-auth==1.35.0
google-cloud-core==1.7.3
# hail 0.2.74 requires google-cloud-storage==1.25.*
google-cloud-storage==1.25.0
30 changes: 30 additions & 0 deletions dev/docker/run_importance_chr22.py
@@ -0,0 +1,30 @@
# run_importance_chr22.py
import varspark as vs
from pyspark.sql import SparkSession

# Step 1: Create a Spark session with VariantSpark JAR attached
spark = SparkSession.builder.config('spark.jars', vs.find_jar()).getOrCreate()

# Step 2: Create a VarsparkContext
vc = vs.VarsparkContext(spark, silent=True)

# Step 3: Load features and labels
features = vc.import_vcf('/app/VariantSpark/data/chr22_1000.vcf')
labels = vc.load_label('/app/VariantSpark/data/chr22-labels.csv', '22_16050408')

# Optional: Print some information to verify
print("Features loaded:", features)
print("Labels loaded:", labels)

# Step 4: Run the importance analysis and retrieve top important variables:
ia = features.importance_analysis(labels, seed = 13, n_trees=500, batch_size=20)
top_variables = ia.important_variables()

# Step 5: Display the results.
print("%s\t%s" % ('Variable', 'Importance'))
for var_and_imp in top_variables:
print("%s\t%s" % var_and_imp)

# Stop the Spark session
spark.stop()