From 2b5fa83ad423b8511332c10940903dba53fbe3e5 Mon Sep 17 00:00:00 2001 From: Jan Domanski Date: Wed, 7 Jan 2026 10:37:37 -0700 Subject: [PATCH 1/2] build: fix the blackwell dockerfile --- docker/Build_instructions_blackwell.md | 27 +++++++++---- docker/Dockerfile.blackwell | 56 ++++++++++---------------- 2 files changed, 42 insertions(+), 41 deletions(-) diff --git a/docker/Build_instructions_blackwell.md b/docker/Build_instructions_blackwell.md index c86b0cfd..5dad84ad 100644 --- a/docker/Build_instructions_blackwell.md +++ b/docker/Build_instructions_blackwell.md @@ -12,21 +12,34 @@ This will create a Docker image named `openfold-3-blackwell` with the `latest` t ## test Pytorch and CUDA ```bash -docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 openfold-3-blackwell:latest python -c "import torch; print('CUDA:', torch.version.cuda); print('PyTorch:', torch.__version__)" +docker run \ + --gpus all \ + --ipc=host \ + --ulimit memlock=-1 \ + openfold-3-blackwell:latest \ + python -c "import torch; print('CUDA:', torch.version.cuda); print('PyTorch:', torch.__version__)" ``` Should print something like: -CUDA: 12.8 -PyTorch: 2.7.0a0+ecf3bae40a.nv25.02 +``` +CUDA: 13.1 +PyTorch: 2.10.0a0+b4e4ee81d3.nv25.12 +``` ## test run_openfold inference example -docker run --gpus all -it --ipc=host --ulimit memlock=-1 \ - -v $(pwd):/output \ +```bash +docker run \ + --gpus all -it \ + --ipc=host \ + --ulimit memlock=-1 \ + -v /home/jandom/.openfold3:/root/.openfold3 \ + -v $(pwd)/output:/output \ -w /output openfold-3-blackwell:latest \ run_openfold predict \ - --query_json=/opt/openfold-3/examples/example_inference_inputs/query_ubiquitin.json \ + --query_json=/opt/openfold3/examples/example_inference_inputs/query_ubiquitin.json \ --num_diffusion_samples=1 \ --num_model_seeds=1 \ - --use_templates=false \ No newline at end of file + --use_templates=false +``` \ No newline at end of file diff --git a/docker/Dockerfile.blackwell 
b/docker/Dockerfile.blackwell index 8d5e04ca..1ed66a00 100644 --- a/docker/Dockerfile.blackwell +++ b/docker/Dockerfile.blackwell @@ -1,5 +1,5 @@ # Simple OpenFold3 Dockerfile using NVIDIA PyTorch container -FROM nvcr.io/nvidia/pytorch:25.02-py3 +FROM nvcr.io/nvidia/pytorch:25.12-py3 # Install system dependencies RUN apt-get update && apt-get install -y \ @@ -13,15 +13,21 @@ RUN apt-get update && apt-get install -y \ libxft2 \ && rm -rf /var/lib/apt/lists/* -# Clone OpenFold3 source and modify environment file +# Install CUTLASS for DeepSpeed Evoformer attention kernel +# We need only the headers for DeepSpeed JIT, don't need the pip package with bindings WORKDIR /opt -RUN git clone https://github.com/aqlaboratory/openfold-3.git && \ - cd openfold-3 && \ - cp -p environments/production-linux-64.yml environments/production.yml.backup && \ - grep -v "pytorch::pytorch" environments/production.yml > environments/production.yml.tmp && \ - mv environments/production.yml.tmp environments/production.yml +RUN git clone https://github.com/NVIDIA/cutlass --branch v3.6.0 --depth 1 + +# Pre-compile DeepSpeed operations for Blackwell GPUs to avoid runtime compilation +# Create necessary cache directories +RUN python3 -c "import os; os.makedirs('/root/.triton/autotune', exist_ok=True)" -WORKDIR /opt/openfold-3 +# Set environment variables including CUDA architecture for Blackwell +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + KMP_AFFINITY=none \ + CUTLASS_PATH=/opt/cutlass \ + TORCH_CUDA_ARCH_LIST="12.1" # Install Python dependencies RUN pip install --no-cache-dir \ @@ -46,36 +52,18 @@ RUN pip install --no-cache-dir \ awscli \ memory_profiler \ func_timeout \ - biotite==1.2.0 \ - "nvidia-cutlass<4" \ - "cuda-python<12.9.1" + biotite==1.2.0 -# Install CUTLASS for DeepSpeed Evoformer attention kernel -WORKDIR /opt -RUN git clone https://github.com/NVIDIA/cutlass --branch v3.6.0 --depth 1 +COPY pyproject.toml /opt/openfold3/ +COPY openfold3/__init__.py 
/opt/openfold3/openfold3/ +COPY scripts/ /opt/openfold3/scripts/ # Install OpenFold3 package itself (provides run_openfold command) -WORKDIR /opt/openfold-3 -RUN python3 -m pip install --editable --no-deps . - -# Set environment variables including CUDA architecture for Blackwell -ENV PYTHONUNBUFFERED=1 \ - PYTHONDONTWRITEBYTECODE=1 \ - KMP_AFFINITY=none \ - CUTLASS_PATH=/opt/cutlass \ - TORCH_CUDA_ARCH_LIST="12.0" - -# Pre-compile DeepSpeed operations for Blackwell GPUs to avoid runtime compilation -# Create necessary cache directories -RUN python3 -c "import os; os.makedirs('/root/.triton/autotune', exist_ok=True)" +WORKDIR /opt/openfold3 +RUN python3 -m pip install --no-deps --editable . -# Create a Python sitecustomize.py to set TORCH_CUDA_ARCH_LIST before any imports -# This ensures the variable is set before PyTorch's cpp_extension checks it -RUN mkdir -p /usr/local/lib/python3.12/site-packages && \ - echo 'import os' > /usr/local/lib/python3.12/site-packages/sitecustomize.py && \ - echo 'os.environ.setdefault("TORCH_CUDA_ARCH_LIST", "12.0")' >> /usr/local/lib/python3.12/site-packages/sitecustomize.py && \ - echo 'os.environ.setdefault("CUTLASS_PATH", "/opt/cutlass")' >> /usr/local/lib/python3.12/site-packages/sitecustomize.py && \ - echo 'os.environ.setdefault("KMP_AFFINITY", "none")' >> /usr/local/lib/python3.12/site-packages/sitecustomize.py +# Copy the entire source tree directly (at the very end for optimal caching) +COPY . /opt/openfold3 # Default command CMD ["/bin/bash"] From da12523a721ab47e2ebcda0bdb18003d5f0cedcb Mon Sep 17 00:00:00 2001 From: Jan Domanski Date: Fri, 9 Jan 2026 07:25:39 -0700 Subject: [PATCH 2/2] working example of blackwell running without docker Lots and lots of ENV magic and overrides... 
not great --- docker/Build_instructions_blackwell.md | 4 +- environments/production-linux-aarch64.yml | 54 +++++++++++++++++++++++ 2 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 environments/production-linux-aarch64.yml diff --git a/docker/Build_instructions_blackwell.md b/docker/Build_instructions_blackwell.md index 5dad84ad..5ca0504f 100644 --- a/docker/Build_instructions_blackwell.md +++ b/docker/Build_instructions_blackwell.md @@ -38,8 +38,8 @@ docker run \ -v $(pwd)/output:/output \ -w /output openfold-3-blackwell:latest \ run_openfold predict \ - --query_json=/opt/openfold3/examples/example_inference_inputs/query_ubiquitin.json \ + --query_json=examples/example_inference_inputs/query_ubiquitin.json \ --num_diffusion_samples=1 \ --num_model_seeds=1 \ --use_templates=false -``` \ No newline at end of file +``` diff --git a/environments/production-linux-aarch64.yml b/environments/production-linux-aarch64.yml new file mode 100644 index 00000000..ac0b58df --- /dev/null +++ b/environments/production-linux-aarch64.yml @@ -0,0 +1,54 @@ +# Blackwell (sm_121) on aarch64 environment +name: openfold3-env +variables: + CUDA_HOME: /usr/local/cuda + PATH: /usr/local/cuda/bin:${PATH} + LD_LIBRARY_PATH: /usr/local/cuda/lib64:${LD_LIBRARY_PATH} + # Triton bundles its own ptxas which does not support sm_121 + # This forces Triton to use the system ptxas compiler, aware of sm_121 + TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas + # Requires: git clone https://github.com/NVIDIA/cutlass --branch v3.6.0 --depth 1 ~/workspace/cutlass + CUTLASS_PATH: /home/jandom/workspace/cutlass + # Note: OMP_NUM_THREADS=1 is required to avoid threading conflicts + OMP_NUM_THREADS: "1" + +channels: + - conda-forge + - bioconda + - nvidia +dependencies: + - python + - awscli + - setuptools + - pip + - conda-forge::uv + - pytorch-lightning + - biopython + - numpy + - pandas + - PyYAML + - requests + - scipy + - tqdm + - typing-extensions + - wandb + - modelcif + - ml-collections 
+ - rdkit=2025.09.3 + - mmseqs2 + - bioconda::hmmer + - bioconda::hhsuite + - bioconda::kalign2 + - bioconda::snakemake + - memory_profiler + - func_timeout + - boto3 + - conda-forge::python-lmdb=1.6 + - conda-forge::ijson + - pip: + # PyTorch stable cu130 for aarch64 - works on Blackwell via PTX JIT + - --extra-index-url https://download.pytorch.org/whl/cu130 + - torch>=2.9.0 + - biotite==1.2.0 + - deepspeed + - pdbeccdutils