From 2b5fa83ad423b8511332c10940903dba53fbe3e5 Mon Sep 17 00:00:00 2001
From: Jan Domanski <jan.domanski@omsf.io>
Date: Wed, 7 Jan 2026 10:37:37 -0700
Subject: [PATCH 1/2] build: fix the blackwell dockerfile

---
 docker/Build_instructions_blackwell.md | 27 +++++++++----
 docker/Dockerfile.blackwell            | 56 ++++++++++----------------
 2 files changed, 42 insertions(+), 41 deletions(-)

diff --git a/docker/Build_instructions_blackwell.md b/docker/Build_instructions_blackwell.md
index c86b0cfd..5dad84ad 100644
--- a/docker/Build_instructions_blackwell.md
+++ b/docker/Build_instructions_blackwell.md
@@ -12,21 +12,34 @@ This will create a Docker image named `openfold-3-blackwell` with the `latest` t
 ## test Pytorch and CUDA
 
 ```bash
-docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 openfold-3-blackwell:latest   python -c "import torch; print('CUDA:', torch.version.cuda); print('PyTorch:', torch.__version__)"  
+docker run \
+    --gpus all \
+    --ipc=host \
+    --ulimit memlock=-1 \
+    openfold-3-blackwell:latest \
+    python -c "import torch; print('CUDA:', torch.version.cuda); print('PyTorch:', torch.__version__)"
 ```
 
 Should print something like:
-CUDA: 12.8
-PyTorch: 2.7.0a0+ecf3bae40a.nv25.02
 
+```
+CUDA: 13.1
+PyTorch: 2.10.0a0+b4e4ee81d3.nv25.12
+```
 
 ## test run_openfold inference example
 
-docker run --gpus all -it --ipc=host --ulimit memlock=-1 \
-    -v $(pwd):/output \
+```bash
+docker run \
+    --gpus all -it \
+    --ipc=host \
+    --ulimit memlock=-1 \
+    -v "$HOME/.openfold3":/root/.openfold3 \
+    -v $(pwd)/output:/output \
     -w /output openfold-3-blackwell:latest \
     run_openfold predict \
-    --query_json=/opt/openfold-3/examples/example_inference_inputs/query_ubiquitin.json \
+    --query_json=/opt/openfold3/examples/example_inference_inputs/query_ubiquitin.json \
     --num_diffusion_samples=1 \
     --num_model_seeds=1 \
-    --use_templates=false 
\ No newline at end of file
+    --use_templates=false 
+```
\ No newline at end of file
diff --git a/docker/Dockerfile.blackwell b/docker/Dockerfile.blackwell
index 8d5e04ca..1ed66a00 100644
--- a/docker/Dockerfile.blackwell
+++ b/docker/Dockerfile.blackwell
@@ -1,5 +1,5 @@
 # Simple OpenFold3 Dockerfile using NVIDIA PyTorch container
-FROM nvcr.io/nvidia/pytorch:25.02-py3
+FROM nvcr.io/nvidia/pytorch:25.12-py3
 
 # Install system dependencies
 RUN apt-get update && apt-get install -y \
@@ -13,15 +13,21 @@ RUN apt-get update && apt-get install -y \
     libxft2 \
     && rm -rf /var/lib/apt/lists/*
 
-# Clone OpenFold3 source and modify environment file
+# Install CUTLASS for DeepSpeed Evoformer attention kernel
+# We need only the headers for DeepSpeed JIT, don't need the pip package with bindings
 WORKDIR /opt
-RUN git clone https://github.com/aqlaboratory/openfold-3.git && \
-    cd openfold-3 && \
-    cp -p environments/production-linux-64.yml environments/production.yml.backup && \
-    grep -v "pytorch::pytorch" environments/production.yml > environments/production.yml.tmp && \
-    mv environments/production.yml.tmp environments/production.yml
+RUN git clone https://github.com/NVIDIA/cutlass --branch v3.6.0 --depth 1
+
+# Create the Triton autotune cache directory (/root/.triton/autotune) up front.
+# NOTE: this step only creates the directory; no kernels are compiled here.
+RUN python3 -c "import os; os.makedirs('/root/.triton/autotune', exist_ok=True)"
 
-WORKDIR /opt/openfold-3
+# Set environment variables including CUDA architecture for Blackwell
+ENV PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    KMP_AFFINITY=none \
+    CUTLASS_PATH=/opt/cutlass \
+    TORCH_CUDA_ARCH_LIST="12.1"
 
 # Install Python dependencies
 RUN pip install --no-cache-dir \
@@ -46,36 +52,18 @@ RUN pip install --no-cache-dir \
     awscli \
     memory_profiler \
     func_timeout \
-    biotite==1.2.0 \
-    "nvidia-cutlass<4" \
-    "cuda-python<12.9.1"
+    biotite==1.2.0
 
-# Install CUTLASS for DeepSpeed Evoformer attention kernel
-WORKDIR /opt
-RUN git clone https://github.com/NVIDIA/cutlass --branch v3.6.0 --depth 1
+COPY pyproject.toml /opt/openfold3/
+COPY openfold3/__init__.py /opt/openfold3/openfold3/
+COPY scripts/ /opt/openfold3/scripts/
 
 # Install OpenFold3 package itself (provides run_openfold command)
-WORKDIR /opt/openfold-3
-RUN python3 -m pip install --editable --no-deps .
-
-# Set environment variables including CUDA architecture for Blackwell
-ENV PYTHONUNBUFFERED=1 \
-    PYTHONDONTWRITEBYTECODE=1 \
-    KMP_AFFINITY=none \
-    CUTLASS_PATH=/opt/cutlass \
-    TORCH_CUDA_ARCH_LIST="12.0"
-
-# Pre-compile DeepSpeed operations for Blackwell GPUs to avoid runtime compilation
-# Create necessary cache directories
-RUN python3 -c "import os; os.makedirs('/root/.triton/autotune', exist_ok=True)"
+WORKDIR /opt/openfold3
+RUN python3 -m pip install --no-deps --editable .
 
-# Create a Python sitecustomize.py to set TORCH_CUDA_ARCH_LIST before any imports
-# This ensures the variable is set before PyTorch's cpp_extension checks it
-RUN mkdir -p /usr/local/lib/python3.12/site-packages && \
-    echo 'import os' > /usr/local/lib/python3.12/site-packages/sitecustomize.py && \
-    echo 'os.environ.setdefault("TORCH_CUDA_ARCH_LIST", "12.0")' >> /usr/local/lib/python3.12/site-packages/sitecustomize.py && \
-    echo 'os.environ.setdefault("CUTLASS_PATH", "/opt/cutlass")' >> /usr/local/lib/python3.12/site-packages/sitecustomize.py && \
-    echo 'os.environ.setdefault("KMP_AFFINITY", "none")' >> /usr/local/lib/python3.12/site-packages/sitecustomize.py
+# Copy the entire source tree directly (at the very end for optimal caching)
+COPY . /opt/openfold3
 
 # Default command
 CMD ["/bin/bash"]

From da12523a721ab47e2ebcda0bdb18003d5f0cedcb Mon Sep 17 00:00:00 2001
From: Jan Domanski <jan.domanski@omsf.io>
Date: Fri, 9 Jan 2026 07:25:39 -0700
Subject: [PATCH 2/2] working example of blackwell running without docker

Lots and lots of ENV magic and overrides... not great
---
 docker/Build_instructions_blackwell.md    |  4 +-
 environments/production-linux-aarch64.yml | 54 +++++++++++++++++++++++
 2 files changed, 56 insertions(+), 2 deletions(-)
 create mode 100644 environments/production-linux-aarch64.yml

diff --git a/docker/Build_instructions_blackwell.md b/docker/Build_instructions_blackwell.md
index 5dad84ad..5ca0504f 100644
--- a/docker/Build_instructions_blackwell.md
+++ b/docker/Build_instructions_blackwell.md
@@ -38,8 +38,8 @@ docker run \
     -v $(pwd)/output:/output \
     -w /output openfold-3-blackwell:latest \
     run_openfold predict \
-    --query_json=/opt/openfold3/examples/example_inference_inputs/query_ubiquitin.json \
+    --query_json=examples/example_inference_inputs/query_ubiquitin.json \
     --num_diffusion_samples=1 \
     --num_model_seeds=1 \
     --use_templates=false 
-```
\ No newline at end of file
+```
diff --git a/environments/production-linux-aarch64.yml b/environments/production-linux-aarch64.yml
new file mode 100644
index 00000000..ac0b58df
--- /dev/null
+++ b/environments/production-linux-aarch64.yml
@@ -0,0 +1,54 @@
+# Blackwell (sm_121) on aarch64 environment
+name: openfold3-env
+variables:
+  CUDA_HOME: /usr/local/cuda
+  PATH: /usr/local/cuda/bin:${PATH}
+  LD_LIBRARY_PATH: /usr/local/cuda/lib64:${LD_LIBRARY_PATH}
+  # Triton bundles its own ptxas, which does not support sm_121.
+  # This forces Triton to use the system ptxas compiler, which is aware of sm_121.
+  TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas
+  # Requires: git clone https://github.com/NVIDIA/cutlass --branch v3.6.0 --depth 1 ~/workspace/cutlass (then set CUTLASS_PATH below to the absolute path of that clone)
+  CUTLASS_PATH: /home/jandom/workspace/cutlass
+  # Note: OMP_NUM_THREADS=1 is required to avoid threading conflicts
+  OMP_NUM_THREADS: "1"
+
+channels:
+  - conda-forge
+  - bioconda
+  - nvidia
+dependencies:
+  - python
+  - awscli
+  - setuptools
+  - pip
+  - conda-forge::uv
+  - pytorch-lightning
+  - biopython
+  - numpy
+  - pandas
+  - PyYAML
+  - requests
+  - scipy
+  - tqdm
+  - typing-extensions
+  - wandb
+  - modelcif
+  - ml-collections
+  - rdkit=2025.09.3
+  - mmseqs2
+  - bioconda::hmmer
+  - bioconda::hhsuite
+  - bioconda::kalign2
+  - bioconda::snakemake
+  - memory_profiler
+  - func_timeout
+  - boto3
+  - conda-forge::python-lmdb=1.6
+  - conda-forge::ijson
+  - pip:
+      # PyTorch stable cu130 for aarch64 - works on Blackwell via PTX JIT
+      - --extra-index-url https://download.pytorch.org/whl/cu130
+      - torch>=2.9.0
+      - biotite==1.2.0
+      - deepspeed
+      - pdbeccdutils