diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..00dcfbb1
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,66 @@
+# Use Ubuntu 20.04 as the base image
+FROM --platform=linux/amd64 ubuntu:20.04
+
+# Set working directory
+WORKDIR /app
+
+# Avoid interactive prompts during package installation
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install Python 3 and pip, Java 8, Git, and wget
+RUN apt-get update && apt-get install -y \
+    python3-pip \
+    openjdk-8-jdk \
+    git wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set Python 3.8 as the default python
+RUN ln -s /usr/bin/python3.8 /usr/bin/python
+
+# Set the JAVA_HOME environment variable;
+# on linux/amd64, openjdk-8-jdk installs java-8-openjdk-amd64.
+ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
+ENV PATH=$JAVA_HOME/bin:$PATH
+
+# Install Apache Spark 3.1.1 with Hadoop 3.2; this only works for linux/amd64
+RUN wget -q https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz && \
+    tar -xzf spark-3.1.1-bin-hadoop3.2.tgz -C /opt && \
+    rm spark-3.1.1-bin-hadoop3.2.tgz && \
+    ln -s /opt/spark-3.1.1-bin-hadoop3.2 /opt/spark
+
+# Set Spark environment variables
+ENV SPARK_HOME=/opt/spark
+ENV PATH=$SPARK_HOME/bin:$PATH
+# ENV PYSPARK_PYTHON=python3.8
+# ENV PYSPARK_DRIVER_PYTHON=python3.8
+
+# Install VariantSpark
+RUN pip3 install --no-cache-dir variant-spark==0.5.5
+
+# Install matching PySpark + Hail
+RUN pip install --no-cache-dir pyspark==3.1.1 \
+hail==0.2.74 \
+Jinja2==3.0.3 \
+pandas==1.1.4 \
+typedecorator==0.0.5 \
+scipy==1.6.3 \
+numpy==1.21.2 \
+patsy==0.5.2 \
+statsmodels==0.13.2 \
+seaborn==0.11.2 \
+chardet==3.0.4 \
+google-auth==1.35.0 \
+google-cloud-core==1.7.3 \
+google-cloud-storage==1.25.0
+
+RUN pip install variant-spark==0.5.5
+
+# Clone the VariantSpark repository
+# RUN git clone https://github.com/aehrc/VariantSpark.git
+
+# Set working directory to the cloned repository
+WORKDIR /app/VariantSpark
+RUN ln -s /usr/local/share/variant-spark/data/ .
+
+# Command to run
+CMD ["variant-spark", "-h"]
diff --git a/dev/docker/Dockerfile b/dev/docker/Dockerfile
new file mode 100644
index 00000000..87b10a1a
--- /dev/null
+++ b/dev/docker/Dockerfile
@@ -0,0 +1,52 @@
+# Use Ubuntu 20.04 as the base image
+FROM --platform=linux/amd64 ubuntu:20.04
+
+# Set working directory
+WORKDIR /app
+
+# Avoid interactive prompts during package installation
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install Python 3.7, pip, Java 8, Git, and wget
+RUN apt-get update && apt-get install -y \
+    python3.7 \
+    python3-pip \
+    openjdk-8-jdk \
+    git wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set Python 3.8 as the default python
+RUN ln -s /usr/bin/python3.8 /usr/bin/python
+
+# Set the JAVA_HOME environment variable;
+# on linux/amd64, openjdk-8-jdk installs java-8-openjdk-amd64.
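+# Note: ENV cannot run shell commands, so the JDK path is hard-coded here.
+# If needed, it can be confirmed inside the image with, e.g.:
+#   readlink -f "$(which javac)" | sed 's:/bin/javac::'   # -> /usr/lib/jvm/java-8-openjdk-amd64
+# (On linux/arm64 the same package installs java-8-openjdk-arm64 instead.)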
+ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
+ENV PATH=$JAVA_HOME/bin:$PATH
+
+# Install Apache Spark 3.1.1 with Hadoop 3.2; this only works for linux/amd64
+RUN wget -q https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz && \
+    tar -xzf spark-3.1.1-bin-hadoop3.2.tgz -C /opt && \
+    rm spark-3.1.1-bin-hadoop3.2.tgz && \
+    ln -s /opt/spark-3.1.1-bin-hadoop3.2 /opt/spark
+
+# Set Spark environment variables
+ENV SPARK_HOME=/opt/spark
+ENV PATH=$SPARK_HOME/bin:$PATH
+# ENV PYSPARK_PYTHON=python3.8
+# ENV PYSPARK_DRIVER_PYTHON=python3.8
+
+# Install VariantSpark
+RUN pip3 install --no-cache-dir variant-spark
+
+# Install matching PySpark + Hail
+RUN pip install --no-cache-dir pyspark==3.1.1 "variant-spark[hail,deps]" hail==0.2.74
+
+# Clone the VariantSpark repository
+# RUN git clone https://github.com/aehrc/VariantSpark.git
+
+# Set working directory to the cloned repository
+WORKDIR /app/VariantSpark
+RUN ln -s /usr/local/share/variant-spark/data/ .
+
+# Command to run
+CMD ["variant-spark", "-h"]
diff --git a/dev/docker/compute_local_fdr.py b/dev/docker/compute_local_fdr.py
new file mode 100644
index 00000000..d7f6dd38
--- /dev/null
+++ b/dev/docker/compute_local_fdr.py
@@ -0,0 +1,37 @@
+import hail as hl
+import varspark.hail as vshl
+from matplotlib import pyplot as plt
+vshl.init()
+
+vds = hl.import_vcf('./data/chr22_1000.vcf')
+labels = hl.import_table('./data/chr22-labels-hail.csv', impute=True, delimiter=",").key_by('sample')
+
+vds = vds.annotate_cols(label=labels[vds.s])
+vds.cols().show(3)
+
+rf_model = vshl.random_forest_model(y=vds.label['x22_16050408'],
+                                    x=vds.GT.n_alt_alleles(), seed=13, mtry_fraction=0.05, min_node_size=5, max_depth=10)
+rf_model.fit_trees(300, 50)
+
+print("OOB error: %s" % rf_model.oob_error())
+impTable = rf_model.variable_importance()
+impTable.order_by(hl.desc(impTable.importance)).show(10)
+
+fdrCalc = rf_model.get_lfdr()
+
+fig, ax1 = plt.subplots(figsize=(10, 5), layout='constrained')
+fdrCalc.plot_log_densities(ax1, cutoff_list=[1, 2, 3, 4, 5, 10, 15, 20], find_automatic_best=True)
+plt.show()
+
+fig, ax2 = plt.subplots(figsize=(10, 5), layout='constrained')
+fdrCalc.plot_log_hist(ax2, split_count=2)
+plt.show()
+
+pvalsDF, fdr = fdrCalc.compute_fdr(countThreshold=2, local_fdr_cutoff=0.05)
+print(pvalsDF, fdr)
+
+fig, ax3 = plt.subplots(figsize=(10, 5), layout='constrained')
+fdrCalc.plot(ax3)
+plt.show()
+
+hl.stop()
diff --git a/dev/docker/local_run-importance-ch22.sh b/dev/docker/local_run-importance-ch22.sh
new file mode 100644
index 00000000..d95988f2
--- /dev/null
+++ b/dev/docker/local_run-importance-ch22.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+PWD=$(cd `dirname "$0"`; pwd)
+PATH=${PATH}:${PWD}/bin
+export VS_ECHO_CMDLINE=YES
+
+variant-spark --spark --master 'local[*]' -- \
+    importance -if ${PWD}/data/chr22_1000.vcf -ff ${PWD}/data/chr22-labels.csv -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13 "$@"
diff --git a/dev/docker/note.md b/dev/docker/note.md
new file mode 100644
index 00000000..36f47b49
--- /dev/null
+++ b/dev/docker/note.md
@@ -0,0 +1,143 @@
+
+Try https://variantspark.readthedocs.io/en/latest/getting_started.html
+## Ask GPT to create the Dockerfile
+ - Any Python base image (e.g. `FROM python:3.8`) does not support installing the openjdk package.
+ - Try Ubuntu, which sets Python 3.8 as the default.
+
+## Test it interactively locally
+- docker build -t vsapp .
+- docker run -it --name vsrun1 vsapp
+```
+  python --version          # should show Python 3.8.x
+  java -version             # should show OpenJDK 8
+  pip3 show variant-spark   # to find where variant-spark is installed
+```
+- docker cp variantspark_script.py vsrun2:/app/VariantSpark/variantspark_script.py  # copy a file from the local machine into the container
+- VariantSpark works with only `pip install variant-spark`:
+  - `which variant-spark`  # the installed script corresponds to https://github.com/aehrc/VariantSpark/tree/master/bin/variant-spark
+  - Running the Scala `importance` command does not need Hail to be installed (no `mvn install`, no `pip install hail`):
+  ```
+  root@16542009db87:/app/VariantSpark# variant-spark importance -if gitHub/VariantSpark/data/chr22_1000.vcf -ff gitHub/VariantSpark/data/chr22-labels.csv -fc 22_16050408 -v -rn 500 -rbs 20 -ro -sr 13
+  25/10/27 08:41:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+  log4j:WARN No appenders could be found for logger (au.csiro.variantspark.cli.ImportanceCmd).
+  ...
+  Last build trees: 20, time: 779 ms, timePerTree: 38 ms
+  Finished trees: 500, current oobError: 0.016483516483516484, totalTime: 36.185 s, avg timePerTree: 0.07237 s
+  Last build trees: 20, time: 675 ms, timePerTree: 33 ms
+  Random forest oob accuracy: 0.016483516483516484, took: 36.4 s
+  variable,importance
+  22_16050408_T_C,18.484457676767143
+  22_16051480_T_C,17.593204808682323
+  ...
+  ```
+  - `python compute_local_fdr.py` failed with "ModuleNotFoundError: No module named 'hail'"
+  - `python run_importance_chr22.py` failed with "ModuleNotFoundError: No module named 'pyspark'"
+
+- pip3 install --no-cache-dir -r /app/VariantSpark/requirements.txt  # install the Python dependencies
+  - Now hail and pyspark are installed in the container: /usr/local/lib/python3.8/dist-packages/hail/
+  - Now `python compute_local_fdr.py` and `python run_importance_chr22.py` work.
+
+## pip install variant-spark
+- It only installs the compulsory dependencies such as typedecorator; it does not include hail or pyspark.
+- /usr/local/lib/python3.8/dist-packages/varspark is installed and includes:
+  - /usr/local/lib/python3.8/dist-packages/varspark/jars/variant-spark_2.12-0.5.5-all.jar
+  - but this jar is not a fat jar: it does not include au.csiro.aehrc.third.hail-is
+  - /usr/local/lib/python3.8/dist-packages/varspark: from variant-spark-0.5.5.tar.gz/varspark
+- /usr/local/share/variant-spark/data/chr22*.vcf: from variant-spark-0.5.5.tar.gz/target/data
+- /usr/local/bin/jvariant-spark, variant-spark, etc.: from variant-spark-0.5.5.tar.gz/target/bin
+
+## pip install hail==0.2.74
+- hail-all-spark.jar: installed by `pip3 install hail==0.2.74` from requirements.txt
+  - is used by the Python hail package at runtime
+  - /usr/local/lib/python3.8/dist-packages/hail/backend/hail-all-spark.jar
+  - jar tf hail-all-spark.jar | grep hail | grep SparkBackend
+
+## mvn install
+  - Maven will try to download a JAR matching hail_2.12_3.1:0.2.74 from the au.csiro.aehrc.third.hail-is repo, based on pom.xml.
+  - The JAR is stored in your local Maven repository (~/.m2/repository/au/csiro/aehrc/third/hail-is/hail_2.12_3.1/0.2.74/).
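+  - (illustrative check; paths taken from the notes above) to confirm which Hail jars are actually present in the container:
+  ```
+  ls /usr/local/lib/python3.8/dist-packages/hail/backend/hail-all-spark.jar
+  ls ~/.m2/repository/au/csiro/aehrc/third/hail-is/hail_2.12_3.1/0.2.74/ 2>/dev/null \
+    || echo "no mvn-installed hail jar (not needed for the Python workflow)"
+  ```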
+
+- Refer to src/main/scala/au/csiro/variantspark/hail/methods/RFModel.scala
+  - ~/.m2/repository/au/.../hail_2.12_3.1/0.2.74/ is used during `mvn test` or when running pure Scala code
+  - python: vshl.init() adds hail-all-spark.jar to the Spark classpath
+  - python: spark = SparkSession.builder.config('spark.jars', vs.find_jar()).getOrCreate() adds spark.jars to the Spark classpath
+  - python: vshl.random_forest_model(...) calls the Scala RFModel.scala via the Spark classpath
+  - summary: Python calls Scala through hail-all-spark.jar, not through the Maven-installed Hail
+
+## Python 3.7 vs 3.8
+- On Ubuntu 20.04 or older, and on Debian, Python 3.7 reached end of life (EOL: June 2023) and was removed from the repos.
+- Ubuntu 20.04 ships python3.8 as the default (python3).
+  - python3.8 with the importance call: `import varspark as vs`
+  ```
+  root@a874b29b622c:/app/VariantSpark# python3 run_importance_chr22.py
+  25/10/28 13:47:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+  Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
+  Setting default log level to "WARN".
+  To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+  Features loaded:
+  Labels loaded: au.csiro.variantspark.input.CsvLabelSource@47be15fa
+
+  Variable          Importance
+  22_16050408_T_C   0.0008041915634907004
+  22_16051480_T_C   0.0007654163908573393
+  22_16050678_C_T   0.0006921965571074235
+  22_16053197_G_T   0.00065148141258399
+  22_16053435_G_T   0.0006144056480311232
+  22_16051107_C_A   0.0006139653108376215
+  22_16051882_C_T   0.0005007281009782979
+  22_16053797_T_C   0.0004618498469961836
+  22_16052838_T_A   0.0004613601158382499
+  22_16053509_A_G   0.0004548314795407337
+  ```
+  - python3.8 calling compute_local_fdr.py (`import hail as hl; import varspark.hail as vshl`) hit version conflicts, because `RUN pip install --no-cache-dir pyspark==3.1.1 "variant-spark[hail,deps]" hail==0.2.74` pulls a list of dependencies into /usr/local/lib/python3.8/dist-packages/ with incorrect versions:
+  ```
+  root@a874b29b622c:/app/VariantSpark# python3 compute_local_fdr.py
+  An error occurred: module 'importlib.metadata' has no attribute 'packages_distributions'
+  /usr/local/lib/python3.8/dist-packages/google/api_core/_python_version_support.py:237: FutureWarning: You are using a non-supported Python version (3.8.10). Google will not post any further updates to google.api_core supporting this Python version. Please upgrade to the latest Python version, or at least Python 3.10, and then update google.api_core.
+  warnings.warn(message, FutureWarning)
+  /usr/local/lib/python3.8/dist-packages/scipy/__init__.py:138: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.24.4)
+  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion} is required for this version of "
+  Traceback (most recent call last):
+    File "compute_local_fdr.py", line 1, in <module>
+      import hail as hl
+    File "/usr/local/lib/python3.8/dist-packages/hail/__init__.py", line 48, in <module>
+  ```
+  - Calling compute_local_fdr.py with python3.8 after `pip install -r requirements.txt` installs the correct versions; now `import hail as hl` works.
+
+## Docker Build on ARM vs. AMD64
+- With `docker build -t vsapp .` on your Mac (with an ARM-based chip like M1/M2), Docker builds the image for the native architecture, which is linux/arm64.
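+  - (illustrative) to check which architecture an existing image was built for:
+  ```
+  docker inspect vsapp --format '{{.Architecture}}'   # amd64 or arm64
+  ```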
+- With `docker build --platform linux/amd64 -t vsapp .` you instruct Docker to build the image for the linux/amd64 architecture, even on an ARM-based Mac.
+- The openjdk-8-jdk package in Ubuntu's repositories is architecture-specific. For linux/arm64 it installs java-8-openjdk-arm64; for linux/amd64 it installs java-8-openjdk-amd64.
+- `uname -m`  # shows x86_64 for AMD64, or aarch64 for ARM64
+
+## To-do list
+- `pip3 show variant-spark` shows Version: 0.5.5, but the author field (Piotr Szul et al.) is wrong.
+- `pip3 install variant-spark` does not automatically install pyspark as a dependency; got this error:
+  ```
+  from pyspark import SparkConf
+  ModuleNotFoundError: No module named 'pyspark'
+  ```
+- pip3 show Jinja2 pandas typedecorator hail pyspark scipy numpy patsy statsmodels seaborn  # only typedecorator is installed
+  ```
+  root@16542009db87:/app/VariantSpark# pip3 show Jinja2 pandas typedecorator hail pyspark scipy numpy patsy statsmodels seaborn
+  WARNING: Package(s) not found: Jinja2, hail, numpy, pandas, patsy, pyspark, scipy, seaborn, statsmodels
+  Name: typedecorator
+  Version: 0.0.5
+  Summary: Decorator-based type checking library for Python 2 and 3
+  Home-page: https://github.com/dobarkod/typedecorator/
+  Author: Senko Rasic
+  Author-email: senko.rasic@goodcode.io
+  License: MIT
+  Location: /usr/local/lib/python3.8/dist-packages
+  Requires:
+  Required-by: variant-spark
+  ```
+- Try micromamba with Python 3.7.
diff --git a/dev/docker/requirements.txt b/dev/docker/requirements.txt
new file mode 100644
index 00000000..13a5a979
--- /dev/null
+++ b/dev/docker/requirements.txt
@@ -0,0 +1,19 @@
+# python 3.7
+# varspark dependencies
+Jinja2==3.0.3
+pandas==1.1.4
+typedecorator==0.0.5
+# 3.1.1 best suits hail 0.2.74
+pyspark==3.1.1
+scipy==1.6.3
+numpy==1.21.2
+patsy==0.5.2
+statsmodels==0.13.2
+seaborn==0.11.2
+hail==0.2.74
+# the dependencies below are pinned to versions compatible with hail
+chardet==3.0.4
+google-auth==1.35.0
+google-cloud-core==1.7.3
+# hail 0.2.74 requires google-cloud-storage==1.25.*
+google-cloud-storage==1.25.0
diff --git a/dev/docker/run_importance_chr22.py b/dev/docker/run_importance_chr22.py
new file mode 100644
index 00000000..d3d4fe3b
--- /dev/null
+++ b/dev/docker/run_importance_chr22.py
@@ -0,0 +1,30 @@
+# variantspark_script.py
+import varspark as vs
+from pyspark.sql import SparkSession
+
+# Step 1: Create a Spark session with the VariantSpark JAR attached
+spark = SparkSession.builder.config('spark.jars', vs.find_jar()).getOrCreate()
+
+# Step 2: Create a VarsparkContext
+vc = vs.VarsparkContext(spark, silent=True)
+
+# Step 3: Load features and labels
+features = vc.import_vcf('/app/VariantSpark/data/chr22_1000.vcf')
+labels = vc.load_label('/app/VariantSpark/data/chr22-labels.csv', '22_16050408')
+
+# Optional: Print some information to verify
+print("Features loaded:", features)
+print("Labels loaded:", labels)
+
+# Step 4: Run the importance analysis and retrieve the top important variables
+ia = features.importance_analysis(labels, seed=13, n_trees=500, batch_size=20)
+top_variables = ia.important_variables()
+
+# Step 5: Display the results
+print("%s\t%s" % ('Variable', 'Importance'))
+for var_and_imp in top_variables:
+    print("%s\t%s" % var_and_imp)
+
+# Stop the Spark session
+spark.stop()
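+
+# Usage (illustrative, following dev/docker/note.md): copy this script into the
+# running container, e.g. `docker cp run_importance_chr22.py <container>:/app/VariantSpark/`,
+# then run `python3 run_importance_chr22.py` there after the Python dependencies
+# from requirements.txt have been installed.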