13 changes: 13 additions & 0 deletions docs/source/Installation.md
@@ -30,6 +30,19 @@ to install GPU accelerated {doc}`cuEquivariance attention kernels <kernels>`, us
pip install openfold3[cuequivariance]
```

To use AMD ROCm-compatible Triton kernels, first install the ROCm PyTorch wheel (which bundles ROCm Triton), then install openfold3:

```bash
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm7.2
pip install openfold3
```

After installation, verify your ROCm environment is correctly configured:

```bash
validate-openfold3-rocm
```
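
If `validate-openfold3-rocm` is unavailable or reports problems, a minimal manual check can help narrow things down. This is a sketch, not part of the openfold3 tooling; it relies on the fact that `torch.version.hip` is populated only on ROCm builds of PyTorch:

```python
import importlib.util

# Confirm a PyTorch build is importable and, if so, whether it is a ROCm build.
spec = importlib.util.find_spec("torch")
if spec is None:
    print("torch is not installed")
else:
    import torch
    print("HIP runtime:", torch.version.hip)        # None on CUDA/CPU-only builds
    print("AMD GPU visible:", torch.cuda.is_available())
```

If the HIP runtime prints as `None`, the CUDA (or CPU-only) wheel was installed instead of the ROCm one; reinstall from the ROCm index URL shown above.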

(installation-environment-variables)=
### Environment variables

31 changes: 31 additions & 0 deletions docs/source/inference.md
@@ -197,6 +197,7 @@ We provide several example runner files in our [examples directory](https://gith
- Using low memory settings
- Customizing output formats
- Enabling cuEquivariance kernels
- Enabling AMD ROCm Triton kernels
- Saving MSA and Template processing outputs
- And more

@@ -297,6 +298,36 @@ model_update:

---

#### 🔴 AMD ROCm Inference with Triton Kernels

On AMD GPUs, OpenFold3 can use native Triton kernels for the Evoformer attention and TriangleMultiplicativeUpdate layers instead of the default CUDA-specific kernels.

First, install PyTorch for ROCm and openfold3 (see [Installation](https://github.com/aqlaboratory/openfold-3/blob/main/docs/source/Installation.md)).
Then enable the Triton kernels in your `runner.yml` using the provided [`triton.yml`](https://github.com/aqlaboratory/openfold-3/blob/main/examples/example_runner_yamls/triton.yml) example:

```yaml
model_update:
presets:
- predict
custom:
settings:
memory:
eval:
use_triton_triangle_kernels: true
use_deepspeed_evo_attention: false
use_cueq_triangle_kernels: false
```

```bash
run_openfold predict \
--query-json /path/to/query.json \
--output-dir /path/to/output/ \
--runner-yaml examples/example_runner_yamls/triton.yml
```
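
The example above enables the Triton backend and explicitly disables the two alternatives. A minimal sanity check that exactly one backend flag is set can catch misconfigured runner files before a job is submitted. This is a sketch (assuming PyYAML, which the environment files already list); the flag names are copied from the example above:

```python
import yaml  # PyYAML; listed in the openfold3 environment files

# Inline copy of the kernel flags from examples/example_runner_yamls/triton.yml.
RUNNER_YAML = """
model_update:
  custom:
    settings:
      memory:
        eval:
          use_triton_triangle_kernels: true
          use_deepspeed_evo_attention: false
          use_cueq_triangle_kernels: false
"""

cfg = yaml.safe_load(RUNNER_YAML)
flags = cfg["model_update"]["custom"]["settings"]["memory"]["eval"]
enabled = sorted(k for k, v in flags.items() if v)

# Exactly one kernel backend should be enabled at a time.
assert enabled == ["use_triton_triangle_kernels"], f"conflicting backends: {enabled}"
print("active backend:", enabled[0])
```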

> **Note on first-run compilation**: Triton JIT-compiles kernels on first use and caches them to `~/.triton/cache`. Compilation is a one-time cost per unique sequence length per machine; subsequent runs at the same length incur no recompilation overhead.
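
On clusters where `$HOME` is shared or read-only, the cache can be redirected with Triton's standard `TRITON_CACHE_DIR` environment variable. The path below is illustrative:

```bash
# Point Triton's JIT cache at a writable per-user directory so compiled
# kernels are reused across runs without writing to $HOME.
export TRITON_CACHE_DIR="${TMPDIR:-/tmp}/${USER}-triton-cache"
mkdir -p "$TRITON_CACHE_DIR"
echo "Triton cache: $TRITON_CACHE_DIR"
```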

---

### 3.4 Customized ColabFold MSA Server Settings Using `runner.yml`

39 changes: 39 additions & 0 deletions environments/production-amd-linux-64.yml
@@ -0,0 +1,39 @@
name: openfold3-env
channels:
- conda-forge
- bioconda
- pytorch
dependencies:
- python
- awscli
- setuptools
- pip
- conda-forge::uv
- pytorch-lightning
- biopython
- numpy
- pandas
- PyYAML
- requests
- scipy
- tqdm
- typing-extensions
- wandb
- modelcif
- ml-collections
- mkl
- rdkit=2025.09.3
- biotite==1.2.0
- bioconda::hmmer
- bioconda::hhsuite
- bioconda::kalign2
- memory_profiler
- func_timeout
- boto3
- conda-forge::python-lmdb=1.6
- conda-forge::ijson
- pip:
- pdbeccdutils
- --extra-index-url https://download.pytorch.org/whl/rocm7.2
- torch
- torchvision
11 changes: 11 additions & 0 deletions examples/example_runner_yamls/triton.yml
@@ -0,0 +1,11 @@
model_update:
presets:
- predict
- low_mem # to use low memory settings
custom:
settings:
memory:
eval:
use_triton_triangle_kernels: true
use_deepspeed_evo_attention: false
use_cueq_triangle_kernels: false