66 changes: 66 additions & 0 deletions Dockerfile
@@ -0,0 +1,66 @@
# Use Ubuntu 20.04 as the base image
FROM --platform=linux/amd64 ubuntu:20.04

# Set working directory
WORKDIR /app

# Avoid interactive prompts during package installation
ENV DEBIAN_FRONTEND=noninteractive

# Install pip, Java 8, Git, and wget (Ubuntu 20.04 ships Python 3.8 as python3)
RUN apt-get update && apt-get install -y \
python3-pip \
openjdk-8-jdk \
git wget \
&& rm -rf /var/lib/apt/lists/*

# Set Python 3.8 as default python
RUN ln -s /usr/bin/python3.8 /usr/bin/python

# Set JAVA_HOME environment variable;
# on linux/amd64, openjdk-8-jdk installs java-8-openjdk-amd64.
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
ENV PATH=$JAVA_HOME/bin:$PATH

# Install Apache Spark 3.1.1 with Hadoop 3.2; only works for linux/amd64
RUN wget -q https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz && \
tar -xzf spark-3.1.1-bin-hadoop3.2.tgz -C /opt && \
rm spark-3.1.1-bin-hadoop3.2.tgz && \
ln -s /opt/spark-3.1.1-bin-hadoop3.2 /opt/spark

# Set Spark environment variables
ENV SPARK_HOME=/opt/spark
ENV PATH=$SPARK_HOME/bin:$PATH
# ENV PYSPARK_PYTHON=python3.8
# ENV PYSPARK_DRIVER_PYTHON=python3.8

# Install VariantSpark
RUN pip3 install --no-cache-dir variant-spark==0.5.5

# Install matching PySpark + Hail
RUN pip install --no-cache-dir pyspark==3.1.1 \
hail==0.2.74 \
Jinja2==3.0.3 \
pandas==1.1.4 \
typedecorator==0.0.5 \
scipy==1.6.3 \
numpy==1.21.2 \
patsy==0.5.2 \
statsmodels==0.13.2 \
seaborn==0.11.2 \
chardet==3.0.4 \
google-auth==1.35.0 \
google-cloud-core==1.7.3 \
google-cloud-storage==1.25.0

RUN pip install variant-spark==0.5.5

# Clone the VariantSpark repository
# RUN git clone https://github.com/aehrc/VariantSpark.git

# Set working directory to the cloned repository
WORKDIR /app/VariantSpark
RUN ln -s /usr/local/share/variant-spark/data/ .

# Command to run
CMD ["variant-spark -h"]
52 changes: 52 additions & 0 deletions dev/docker/Dockerfile
@@ -0,0 +1,52 @@
# Use Ubuntu 20.04 as the base image
FROM --platform=linux/amd64 ubuntu:20.04

# Set working directory
WORKDIR /app

# Avoid interactive prompts during package installation
ENV DEBIAN_FRONTEND=noninteractive

# Install Python 3.7, pip, Java 8, Git, and wget
RUN apt-get update && apt-get install -y \
python3.7 \
python3-pip \
openjdk-8-jdk \
git wget \
&& rm -rf /var/lib/apt/lists/*

# Set Python 3.8 as default python
RUN ln -s /usr/bin/python3.8 /usr/bin/python

# Set JAVA_HOME environment variable;
# on linux/amd64, openjdk-8-jdk installs java-8-openjdk-amd64.
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
ENV PATH=$JAVA_HOME/bin:$PATH

# Install Apache Spark 3.1.1 with Hadoop 3.2; only works for linux/amd64
RUN wget -q https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz && \
tar -xzf spark-3.1.1-bin-hadoop3.2.tgz -C /opt && \
rm spark-3.1.1-bin-hadoop3.2.tgz && \
ln -s /opt/spark-3.1.1-bin-hadoop3.2 /opt/spark

# Set Spark environment variables
ENV SPARK_HOME=/opt/spark
ENV PATH=$SPARK_HOME/bin:$PATH
# ENV PYSPARK_PYTHON=python3.8
# ENV PYSPARK_DRIVER_PYTHON=python3.8

# Install VariantSpark
RUN pip3 install --no-cache-dir variant-spark

# Install matching PySpark + Hail
RUN pip install --no-cache-dir pyspark==3.1.1 "variant-spark[hail,deps]" hail==0.2.74

# Clone the VariantSpark repository
# RUN git clone https://github.com/aehrc/VariantSpark.git

# Set working directory to the cloned repository
WORKDIR /app/VariantSpark
RUN ln -s /usr/local/share/variant-spark/data/ .

# Command to run
CMD ["bash variant-spark -h"]
37 changes: 37 additions & 0 deletions dev/docker/compute_local_fdr.py
@@ -0,0 +1,37 @@
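# compute_local_fdr.py: train a VariantSpark random forest on the chr22 sample data via Hail
# and estimate the local FDR of the variable importances.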
import hail as hl
import varspark.hail as vshl
from matplotlib import pyplot as plt
vshl.init()

vds = hl.import_vcf('./data/chr22_1000.vcf')
labels = hl.import_table('./data/chr22-labels-hail.csv', impute = True, delimiter=",").key_by('sample')

vds = vds.annotate_cols(label = labels[vds.s])
vds.cols().show(3)

rf_model = vshl.random_forest_model(y=vds.label['x22_16050408'],
x=vds.GT.n_alt_alleles(), seed = 13, mtry_fraction = 0.05, min_node_size = 5, max_depth = 10)
rf_model.fit_trees(300, 50)

print("OOB error: %s" % rf_model.oob_error())
impTable = rf_model.variable_importance()
impTable.order_by(hl.desc(impTable.importance)).show(10)

fdrCalc = rf_model.get_lfdr()

fig, ax1 = plt.subplots(figsize=(10, 5), layout='constrained')
fdrCalc.plot_log_densities(ax1, cutoff_list=[1, 2, 3, 4, 5, 10, 15, 20], find_automatic_best=True)
plt.show()

fig, ax2 = plt.subplots(figsize=(10, 5), layout='constrained')
fdrCalc.plot_log_hist(ax2, split_count=2)
plt.show()

pvalsDF, fdr = fdrCalc.compute_fdr(countThreshold=2, local_fdr_cutoff=0.05)
print(fdr)
print(pvalsDF)

fig, ax3 = plt.subplots(figsize=(10, 5), layout='constrained')
fdrCalc.plot(ax3)
plt.show()

hl.stop()
8 changes: 8 additions & 0 deletions dev/docker/local_run-importance-ch22.sh
@@ -0,0 +1,8 @@
#!/bin/bash

# Resolve the script's directory and run the VariantSpark CLI importance analysis
# on the bundled chr22 sample data.
DIR=$(cd "$(dirname "$0")"; pwd)
PATH=${PATH}:${DIR}/bin
export VS_ECHO_CMDLINE=YES

variant-spark --spark --master 'local[*]' -- \
    importance -if "${DIR}/data/chr22_1000.vcf" -ff "${DIR}/data/chr22-labels.csv" -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13 "$@"
143 changes: 143 additions & 0 deletions dev/docker/note.md
@@ -0,0 +1,143 @@

try https://variantspark.readthedocs.io/en/latest/getting_started.html
## ask gpt to create dockerfile
- any python base image, e.g. FROM python:3.8, doesn't support installing the openjdk package.
- try ubuntu, which ships python 3.8 as the default.

## test it interactively locally
- docker build -t vsapp .
- docker run -it --name vsrun1 vsapp
```
python --version # Should show Python 3.8.x
java -version # Should show OpenJDK 8
pip3 show variant-spark # To find where variant-spark is installed
```
- docker cp variantspark_script.py vsrun2:/app/VariantSpark/variantspark_script.py # copy file from local to docker
- variant-spark works with only `pip install variant-spark`:
- `which variant-spark` # to find https://github.com/aehrc/VariantSpark/tree/master/bin/variant-spark
- to run the scala importance command, hail does not need to be installed (no mvn install, no pip install hail)
```
root@16542009db87:/app/VariantSpark# variant-spark importance -if gitHub/VariantSpark/data/chr22_1000.vcf -ff gitHub/VariantSpark/data/chr22-labels.csv -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13
root@16542009db87:/app/VariantSpark# variant-spark importance -if gitHub/VariantSpark/data/chr22_1000.vcf -ff gitHub/VariantSpark/data/chr22-labels.csv -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13
25/10/27 08:41:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
log4j:WARN No appenders could be found for logger (au.csiro.variantspark.cli.ImportanceCmd).
...
Last build trees: 20, time: 779 ms, timePerTree: 38 ms
Finished trees: 500, current oobError: 0.016483516483516484, totalTime: 36.185 s, avg timePerTree: 0.07237 s
Last build trees: 20, time: 675 ms, timePerTree: 33 ms
Random forest oob accuracy: 0.016483516483516484, took: 36.4 s
variable,importance
22_16050408_T_C,18.484457676767143
22_16051480_T_C,17.593204808682323
...
```
- `python compute_local_fdr.py` failed with error "ModuleNotFoundError: No module named 'hail'"
- `python run_importance_chr22.py` failed with error "ModuleNotFoundError: No module named 'pyspark'"

- pip3 install --no-cache-dir -r /app/VariantSpark/requirements.txt # install the python dependencies
- now hail and pyspark are installed in the container: /usr/local/lib/python3.8/dist-packages/hail/
- now `python compute_local_fdr.py` and `python run_importance_chr22.py` work
## pip install variant-spark
- it only installs compulsory dependencies like typedecorator, but does not include hail and pyspark (a quick check of what it puts on disk is sketched after this list)
- /usr/local/lib/python3.8/dist-packages/varspark is installed, and includes
- /usr/local/lib/python3.8/dist-packages/varspark/jars/variant-spark_2.12-0.5.5-all.jar
- but this jar is not a fat jar: it does not include au.csiro.aehrc.third.hail-is
- /usr/local/lib/python3.8/dist-packages/varspark: from variant-spark-0.5.5.tar.gz/varspark
- /usr/local/share/variant-spark/data/chr22*.vcf: from variant-spark-0.5.5.tar.gz/target/data
- /usr/local/bin/jvariant-spark and variant-spark etc: from variant-spark-0.5.5.tar.gz/target/bin
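- a minimal sketch (paths assumed from the dist-packages layout above) to confirm what the plain install put on disk:
```
import os
import varspark as vs

print(vs.find_jar())  # expect .../varspark/jars/variant-spark_2.12-0.5.5-all.jar
print(os.path.exists('/usr/local/share/variant-spark/data/chr22_1000.vcf'))  # bundled sample data
print(os.path.exists('/usr/local/bin/variant-spark'))                        # CLI wrapper
```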

## pip install hail==0.2.74
- hail-all-spark.jar: installed by pip3 install hail==0.2.74 from requirements.txt (a quick way to locate it is sketched after this list)
- is used by the Python hail package at runtime.
- /usr/local/lib/python3.8/dist-packages/hail/backend/hail-all-spark.jar
- jar tf hail-all-spark.jar | grep hail | grep SparkBackend
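- a small sketch to locate the jar from the installed hail package (backend/ layout assumed from the path above):
```
import os
import hail

backend_dir = os.path.join(os.path.dirname(hail.__file__), 'backend')
print([f for f in os.listdir(backend_dir) if f.endswith('.jar')])  # expect hail-all-spark.jar
```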

## mvn install
- Maven will try to download a JAR for au.csiro.aehrc.third.hail-is:hail_2.12_3.1:0.2.74, based on pom.xml
- the JAR is stored in your local Maven repository (~/.m2/repository/au/csiro/aehrc/third/hail-is/hail_2.12_3.1/0.2.74/).

- refer to src/main/scala/au/csiro/variantspark/hail/methods/RFModel.scala
- ~/.m2/repository/au/.../hail_2.12_3.1/0.2.74/ is used during mvn test or when running pure scala code
- python: vshl.init() adds hail-all-spark.jar to the Spark classpath.
- python: spark = SparkSession.builder.config('spark.jars', vs.find_jar()).getOrCreate() adds spark.jars to spark classpath
- python: vshl.random_forest_model(...) calls the scala RFModel based on the spark classpath
- summary: the python-to-scala calls depend on hail-all-spark.jar, not on the mvn-installed hail; both entry points are sketched below
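- the two entry points side by side (a sketch only; a real script uses one or the other, as run_importance_chr22.py and compute_local_fdr.py do):
```
# pure varspark path: attach the bundled all-in-one jar to the Spark session explicitly
import varspark as vs
from pyspark.sql import SparkSession
spark = SparkSession.builder.config('spark.jars', vs.find_jar()).getOrCreate()

# hail path: vshl.init() starts Hail and puts hail-all-spark.jar on the classpath,
# which is what vshl.random_forest_model(...) needs to reach the scala RFModel
import varspark.hail as vshl
vshl.init()
```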



## python 3.7 vs 3.8
- on Ubuntu 20.04 or older, and on Debian, Python 3.7 has reached End of Life (EOL: June 2023) and was removed from the repos
- Ubuntu 20.04 ships python3.8 as the default (python3).
- python3.8 with the importance call: `import varspark as vs`
```
root@a874b29b622c:/app/VariantSpark# python3 run_importance_chr22.py
25/10/28 13:47:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Features loaded: <varspark.core.FeatureSource object at 0x7fffe9a84d30>
Labels loaded: au.csiro.variantspark.input.CsvLabelSource@47be15fa

Variable Importance
22_16050408_T_C 0.0008041915634907004
22_16051480_T_C 0.0007654163908573393
22_16050678_C_T 0.0006921965571074235
22_16053197_G_T 0.00065148141258399
22_16053435_G_T 0.0006144056480311232
22_16051107_C_A 0.0006139653108376215
22_16051882_C_T 0.0005007281009782979
22_16053797_T_C 0.0004618498469961836
22_16052838_T_A 0.0004613601158382499
22_16053509_A_G 0.0004548314795407337

```
- calling compute_local_fdr.py with python3.8 (`import hail as hl; import varspark.hail as vshl`) hits version conflicts, because `RUN pip install --no-cache-dir pyspark==3.1.1 "variant-spark[hail,deps]" hail==0.2.74` pulls in a list of dependencies under /usr/local/lib/python3.8/dist-packages/, all at the wrong versions
```

root@a874b29b622c:/app/VariantSpark# python3 compute_local_fdr.py
An error occurred: module 'importlib.metadata' has no attribute 'packages_distributions'
/usr/local/lib/python3.8/dist-packages/google/api_core/_python_version_support.py:237: FutureWarning: You are using a non-supported Python version (3.8.10). Google will not post any further updates to google.api_core supporting this Python version. Please upgrade to the latest Python version, or at least Python 3.10, and then update google.api_core.
warnings.warn(message, FutureWarning)
/usr/local/lib/python3.8/dist-packages/scipy/__init__.py:138: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.24.4)
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion} is required for this version of "
Traceback (most recent call last):
File "compute_local_fdr.py", line 1, in <module>
import hail as hl
File "/usr/local/lib/python3.8/dist-packages/hail/__init__.py", line 48, in <module>

```
- calling compute_local_fdr.py with python3.8 after `pip install -r requirements.txt` installs the correct versions; now `import hail as hl` works (a quick version check is sketched below)
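- a quick sanity check (standard-library sketch) that the pins from requirements.txt are the versions actually installed:
```
from importlib.metadata import version

for pkg in ('pyspark', 'hail', 'numpy', 'scipy', 'pandas', 'variant-spark'):
    print(pkg, version(pkg))
```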

## Docker Build on ARM vs. AMD64
- with `docker build -t vsapp .` on a Mac with an ARM-based chip (M1/M2), Docker builds the image for the native architecture, which is linux/arm64.
- with `docker build --platform linux/amd64 -t vsapp .` you instruct Docker to build the image for linux/amd64, even on an ARM-based Mac.
- The openjdk-8-jdk package in Ubuntu’s repositories is architecture-specific. For linux/arm64, it installs java-8-openjdk-arm64; for linux/amd64, it installs java-8-openjdk-amd64.

- `uname -m` # shows x86_64 for AMD64; or aarch64 for ARM64
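- the same check from inside Python, if that is more convenient:
```
import platform
print(platform.machine())  # x86_64 on linux/amd64, aarch64 on linux/arm64
```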



# to do list
- pip3 show variant-spark shows Version: 0.5.5, but the author field (Piotr Szul et al.) is wrong
- pip3 install variant-spark does not automatically install pyspark as a dependency; got this error:
```
from pyspark import SparkConf
ModuleNotFoundError: No module named 'pyspark'
```
- pip3 show Jinja2 pandas typedecorator hail pyspark scipy numpy patsy statsmodels seaborn # only typedecorator installed
```
root@16542009db87:/app/VariantSpark# pip3 show Jinja2 pandas typedecorator hail pyspark scipy numpy patsy statsmodels seaborn
WARNING: Package(s) not found: Jinja2, hail, numpy, pandas, patsy, pyspark, scipy, seaborn, statsmodels
Name: typedecorator
Version: 0.0.5
Summary: Decorator-based type checking library for Python 2 and 3
Home-page: https://github.com/dobarkod/typedecorator/
Author: Senko Rasic
Author-email: senko.rasic@goodcode.io
License: MIT
Location: /usr/local/lib/python3.8/dist-packages
Requires:
Required-by: variant-spark

```
- try micromamba with python 3.7
19 changes: 19 additions & 0 deletions dev/docker/requirements.txt
@@ -0,0 +1,19 @@
# python 3.7
# varspark dependencies
Jinja2==3.0.3
pandas==1.1.4
typedecorator==0.0.5
# 3.1.1 best matches hail 0.2.74
pyspark==3.1.1
scipy==1.6.3
numpy==1.21.2
patsy==0.5.2
statsmodels==0.13.2
seaborn==0.11.2
hail==0.2.74
# the dependencies below are pinned to versions compatible with hail
chardet==3.0.4
google-auth==1.35.0
google-cloud-core==1.7.3
# hail 0.2.74 requires google-cloud-storage==1.25.*
google-cloud-storage==1.25.0
30 changes: 30 additions & 0 deletions dev/docker/run_importance_chr22.py
@@ -0,0 +1,30 @@
# run_importance_chr22.py
import varspark as vs
from pyspark.sql import SparkSession

# Step 1: Create a Spark session with VariantSpark JAR attached
spark = SparkSession.builder.config('spark.jars', vs.find_jar()).getOrCreate()

# Step 2: Create a VarsparkContext
vc = vs.VarsparkContext(spark, silent=True)

# Step 3: Load features and labels
features = vc.import_vcf('/app/VariantSpark/data/chr22_1000.vcf')
labels = vc.load_label('/app/VariantSpark/data/chr22-labels.csv', '22_16050408')

# Optional: Print some information to verify
print("Features loaded:", features)
print("Labels loaded:", labels)

# Step 4: Run the importance analysis and retrieve top important variables:
ia = features.importance_analysis(labels, seed = 13, n_trees=500, batch_size=20)
top_variables = ia.important_variables()

# Step 5: Display the results.
print("%s\t%s" % ('Variable', 'Importance'))
for var_and_imp in top_variables:
print("%s\t%s" % var_and_imp)

# Stop the Spark session
spark.stop()