diff --git a/.drone.yml b/.drone.yml index ec6cc9258..cfa6a113d 100644 --- a/.drone.yml +++ b/.drone.yml @@ -11,7 +11,7 @@ steps: - name: get version -- branch image: ubuntu:latest commands: - - echo $(cat alphafold/version.py | grep version | grep -oP "\d+\.\d+\.\d+")-$(echo $DRONE_COMMIT_BRANCH | sed 's/[/_-]//g')-$DRONE_BUILD_NUMBER > .tags + - echo $(cat VERSION)-alphafold-$(cat alphafold/version.py | grep version | grep -oP "\d+\.\d+\.\d+")-$(echo $DRONE_COMMIT_BRANCH | sed 's/[/_-]//g')-$DRONE_BUILD_NUMBER > .tags - echo $(cat .tags) when: event: @@ -20,30 +20,17 @@ steps: - name: get version -- tag image: ubuntu:latest commands: - - echo $(cat alphafold/version.py | grep version | grep -oP "\d+\.\d+\.\d+")-cuda-$(cat docker/Dockerfile | grep CUDA | grep -oP '(?<=CUDA=)\d+(\.\d+)*')-$(cat docker/Dockerfile | grep nvidia/cuda | grep -oP 'ubuntu\K[0-9]+\.[0-9]+' | sed 's/^/ubuntu /' | sed 's/ /-/') > .tags + - echo $(cat VERSION)-alphafold-$(cat alphafold/version.py | grep version | grep -oP "\d+\.\d+\.\d+") > .tags - echo $(cat .tags) when: event: - tag - - name: release server image -- branch + - name: release server image image: plugins/gcr settings: repo: cyrus-containers/alphafold debug: true dockerfile: docker/Dockerfile json_key: - from_secret: dockerconfigjson - when: - event: push - - - name: release server image -- tag - image: plugins/gcr - settings: - repo: cyrus-containers/alphafold - debug: true - dockerfile: docker/Dockerfile - json_key: - from_secret: dockerconfigjson - when: - event: tag \ No newline at end of file + from_secret: dockerconfigjson \ No newline at end of file diff --git a/VERSION b/VERSION new file mode 100644 index 000000000..afaf360d3 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +1.0.0 \ No newline at end of file diff --git a/alphafold/data/mmcif_parsing.py b/alphafold/data/mmcif_parsing.py index 61cf149c0..74c3ad50d 100644 --- a/alphafold/data/mmcif_parsing.py +++ b/alphafold/data/mmcif_parsing.py @@ -315,6 +315,7 @@ def 
_get_header(parsed_info: MmCIFDict) -> PdbHeader: try: raw_resolution = parsed_info[res_key][0] header['resolution'] = float(raw_resolution) + break except ValueError: logging.debug('Invalid resolution format: %s', parsed_info[res_key]) diff --git a/alphafold/data/pipeline.py b/alphafold/data/pipeline.py index a90eb5776..bfc587b4c 100644 --- a/alphafold/data/pipeline.py +++ b/alphafold/data/pipeline.py @@ -124,7 +124,8 @@ def __init__(self, use_small_bfd: bool, mgnify_max_hits: int = 501, uniref_max_hits: int = 10000, - use_precomputed_msas: bool = False): + use_precomputed_msas: bool = False, + excluded_pdbs: list = []): """Initializes the data pipeline.""" self._use_small_bfd = use_small_bfd self.jackhmmer_uniref90_runner = jackhmmer.Jackhmmer( @@ -146,6 +147,7 @@ def __init__(self, self.mgnify_max_hits = mgnify_max_hits self.uniref_max_hits = uniref_max_hits self.use_precomputed_msas = use_precomputed_msas + self.excluded_pdbs = excluded_pdbs def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict: """Runs alignment tools on the input sequence and creates features.""" @@ -200,6 +202,15 @@ def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict: pdb_template_hits = self.template_searcher.get_template_hits( output_string=pdb_templates_result, input_sequence=input_sequence) + pdb_templates_out_path = os.path.join(msa_output_dir, 'pdb_templates_used.csv') + with open(pdb_templates_out_path, 'w') as f: + f.write('PDB ID,Chain\n') + for hit in pdb_template_hits: + pdbid, chain = templates._get_pdb_id_and_chain(hit) + if pdbid in self.excluded_pdbs: + pdb_template_hits.remove(hit) + else: + f.write(f'{pdbid},{chain}\n') if self._use_small_bfd: bfd_out_path = os.path.join(msa_output_dir, 'small_bfd_hits.sto') diff --git a/alphafold/model/utils.py b/alphafold/model/utils.py index 3e5ac625c..634f0388a 100644 --- a/alphafold/model/utils.py +++ b/alphafold/model/utils.py @@ -163,7 +163,7 @@ def inner(key, shape, **kwargs): keys = 
grid_keys(key, shape) signature = ( '()->()' - if isinstance(keys, jax.random.PRNGKeyArray) + if jax.dtypes.issubdtype(keys.dtype, jax.dtypes.prng_key) else '(2)->()' ) return jnp.vectorize( diff --git a/cut_version_tag.sh b/cut_version_tag.sh index 3b0596c80..ca0dfddd4 100755 --- a/cut_version_tag.sh +++ b/cut_version_tag.sh @@ -5,15 +5,20 @@ git checkout main echo "pulling latest changes" git pull -VERSION=`cat alphafold/version.py | grep version | grep -o "\d*\.\d*\.\d*"` +VERSION=`cat VERSION` +ALPHAFOLD_VERSION=`cat alphafold/version.py | grep version | grep -o "\d*\.\d*\.\d*"` CUDA_VERSION=`cat docker/Dockerfile | grep CUDA | ggrep -oP '(?<=CUDA=)\d*(\.\d+)*'` -OS_VERSION=`cat docker/Dockerfile | grep nvidia/cuda | ggrep -oP 'ubuntu\K[0-9]+\.[0-9]+' | sed 's/^/ubuntu /' | sed 's/ /-/'` -FULL_TAG="$VERSION-cuda-$CUDA_VERSION-$OS_VERSION" +OS_VERSION=`cat docker/Dockerfile | grep nvidia/cuda | ggrep -oP 'ubuntu\K[0-9]+\.[0-9]+' | sed 's/^/Ubuntu /'` echo "current git HEAD is \"$(git log --oneline |head -1)\"" -read -p "Would you like to create and push the tag ${FULL_TAG} at the current head of the master branch? (y/n)" proceed +read -p "Would you like to create and push the tag $VERSION at the current head of the master branch? (y/n)" proceed if [[ ${proceed} == "y" ]]; then - git tag "${FULL_TAG}" + git tag "$VERSION" -m "AF2 – Levitate Bio + + AlphaFold2: v$ALPHAFOLD_VERSION + CUDA Toolkit: v$CUDA_VERSION + OS: $OS_VERSION" + git push --tags fi \ No newline at end of file diff --git a/docker/Dockerfile b/docker/Dockerfile index def50978c..4acf20e2f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG CUDA=11.4.3 -FROM nvidia/cuda:${CUDA}-cudnn8-runtime-ubuntu18.04 +ARG CUDA=12.2.2 +FROM nvidia/cuda:${CUDA}-cudnn8-runtime-ubuntu20.04 # FROM directive resets ARGS, so we specify again (the value is retained if # previously set). 
ARG CUDA @@ -54,14 +54,10 @@ RUN wget -q -P /tmp \ # Install conda packages. ENV PATH="/opt/conda/bin:$PATH" ENV LD_LIBRARY_PATH="/opt/conda/lib:$LD_LIBRARY_PATH" -RUN conda install -qy conda==24.1.2 \ - && conda install -y -c conda-forge \ - openmm=7.7.0 \ - cudatoolkit==${CUDA_VERSION} \ - pdbfixer \ - pip \ - python=3.10 \ - && conda clean --all --force-pkgs-dirs --yes +RUN conda install -qy conda==24.1.2 pip python=3.11 \ + && conda install -y -c nvidia/label/cuda-${CUDA_VERSION} cuda --strict-channel-priority \ + && conda install -y -c conda-forge openmm=8.0.0 pdbfixer ncurses \ + && conda clean --all --force-pkgs-dirs --yes COPY . /app/alphafold RUN wget -q -P /app/alphafold/alphafold/common/ \ @@ -71,13 +67,16 @@ RUN wget -q -P /app/alphafold/alphafold/common/ \ RUN pip3 install --upgrade pip --no-cache-dir \ && pip3 install -r /app/alphafold/requirements.txt --no-cache-dir \ && pip3 install --upgrade --no-cache-dir \ - jax==0.3.25 \ - jaxlib==0.3.25+cuda11.cudnn805 \ + jax==0.4.26 \ + jaxlib==0.4.26+cuda12.cudnn89 \ -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html # Add SETUID bit to the ldconfig binary so that non-root users can run it. RUN chmod u+s /sbin/ldconfig.real +# Currently needed to avoid undefined_symbol error. +RUN ln -sf /usr/lib/x86_64-linux-gnu/libffi.so.7 /opt/conda/lib/libffi.so.7 + # We need to run `ldconfig` first to ensure GPUs are visible, due to some quirk # with Debian. See https://github.com/NVIDIA/nvidia-docker/issues/1399 for # details. 
diff --git a/notebooks/AlphaFold.ipynb b/notebooks/AlphaFold.ipynb index 11ff88164..353cf4820 100644 --- a/notebooks/AlphaFold.ipynb +++ b/notebooks/AlphaFold.ipynb @@ -112,7 +112,7 @@ " %shell conda install -qy conda==24.1.2 \\\n", " \u0026\u0026 conda install -qy -c conda-forge \\\n", " python=3.10 \\\n", - " openmm=7.7.0 \\\n", + " openmm=8.0.0 \\\n", " pdbfixer\n", " pbar.update(80)\n", "\n", diff --git a/requirements.txt b/requirements.txt index d4ee2c63d..08e470e0d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,13 @@ absl-py==1.0.0 biopython==1.79 -chex==0.0.7 -dm-haiku==0.0.10 +chex==0.1.86 +dm-haiku==0.0.12 dm-tree==0.1.8 docker==5.0.0 immutabledict==2.0.0 -jax==0.4.14 +jax==0.4.26 ml-collections==0.1.0 numpy==1.24.3 pandas==2.0.3 scipy==1.11.1 -tensorflow-cpu==2.13.0 +tensorflow-cpu==2.16.1 diff --git a/run_alphafold.py b/run_alphafold.py index 86a629d9e..25ba0fe5d 100644 --- a/run_alphafold.py +++ b/run_alphafold.py @@ -142,6 +142,8 @@ class ModelsToRelax(enum.Enum): 'Relax on GPU can be much faster than CPU, so it is ' 'recommended to enable if possible. 
GPUs must be available' ' if this setting is enabled.') +flags.DEFINE_list('excluded_pdbs', [], 'PDB IDs to exclude from' + ' templates.') FLAGS = flags.FLAGS @@ -494,7 +496,8 @@ def main(argv): template_searcher=template_searcher, template_featurizer=template_featurizer, use_small_bfd=use_small_bfd, - use_precomputed_msas=FLAGS.use_precomputed_msas) + use_precomputed_msas=FLAGS.use_precomputed_msas, + excluded_pdbs=FLAGS.excluded_pdbs) if run_multimer_system: num_predictions_per_model = FLAGS.num_multimer_predictions_per_model diff --git a/server/README.md b/server/README.md new file mode 100644 index 000000000..99b2fd851 --- /dev/null +++ b/server/README.md @@ -0,0 +1,286 @@ +# JSON file format for AlphaFold Server jobs + +You can +[download an example JSON file here](https://github.com/google-deepmind/alphafold/blob/main/server/example.json); +here we describe the contents of this example JSON file. + +This JSON file consists of a list of dictionaries (even in the case of a single +dictionary, a single-element list must be used), with each dictionary containing +a job description. Therefore, you can specify multiple jobs in one JSON file. + +Each job description contains a job name, a list of PRNG seeds (which can be an +empty list for automated random seed assignment), and a list of entities +(molecules) to be modeled. + +AlphaFold Server JSON files are especially useful for automation of repetitive +modeling jobs (e.g. to screen interactions of one protein with a small number of +others). The easiest way to construct an initial JSON file is to run a modeling +job via AlphaFold Server GUI and use it as a template. AlphaFold Server will +produce a zip file containing modeling results. Inside the zip file you will +find a JSON file named `_job_request.json` containing the job inputs. 
+These files offer a convenient starting point for generating new jobs as they +are easily editable in standard text editors or in programming environments like +Google Colab notebooks. + +Note that comments are not allowed in JSON files. + +## Job name, seeds and sequences + +* `name` is a string with the job name. This is how the job will appear in + the job history table. +* `modelSeeds` is a list of strings of uint32 seed values (e.g. + `["1593933729", "4273"]`). Seeds are used to run the modeling. We recommend + providing an empty list, in which case a single random seed will be used. + This is the recommended option. +* `sequences` is a list of dictionaries that carry descriptions of the + entities (molecules) for modeling. + +```json +{ + "name": "Test Fold Job Number One", + "modelSeeds": [], + "sequences": [...] +} +``` + +## Entity types + +Valid entity types mirror those available in the AlphaFold Server web interface: + +* `proteinChain` – used for proteins +* `dnaSequence` – used for DNA (single strand) +* `rnaSequence` – used for RNA (single strand) +* `ligand` – used for allowed ligands +* `ion` – used for allowed ions + +### Protein chains + +`sequence` is a string containing the protein sequence; the same limitations as in +the UI are in place, e.g. only letters corresponding to amino acids are allowed, +as defined by IUPAC. Only the 20 standard amino acid types are supported. + +`count` is the number of copies of this protein chain (integer). + +`glycans` is an optional list of dictionaries that carries descriptions of the +protein glycosylation. + +* `residues` is a string defining the glycan. Please refer to the + [FAQ](https://alphafoldserver.com/faq) for the format description and + allowed glycans. +* `position` is a position of the amino acid to which the glycan is attached + (integer, 1-based indexing). + +`modifications` is an optional list of dictionaries that carries descriptions of +the post-translational modifications.
+ +* `ptmType` is a string containing the + [CCD code](https://www.wwpdb.org/data/ccd) of the modification; the same + codes are allowed as in the UI. +* `ptmPosition` is the position of the modified amino acid (integer). +* Allowed modifications: `CCD_SEP`, `CCD_TPO`, `CCD_PTR`, `CCD_NEP`, + `CCD_HIP`, `CCD_ALY`, `CCD_MLY`, `CCD_M3L`, `CCD_MLZ`, `CCD_2MR`, `CCD_AGM`, + `CCD_MCS`, `CCD_HYP`, `CCD_HY3`, `CCD_LYZ`, `CCD_AHB`, `CCD_P1L`, `CCD_SNN`, + `CCD_SNC`, `CCD_TRF`, `CCD_KCR`, `CCD_CIR`, `CCD_YHA` + +```json +{ + "proteinChain": { + "sequence": "PREACHINGS", + + "glycans": [ + { + "residues": "NAG(NAG)(BMA)", + "position": 8 + }, + { + "residues": "BMA", + "position": 10 + } + ], + + "modifications": [ + { + "ptmType": "CCD_HY3", + "ptmPosition": 1 + }, + { + "ptmType": "CCD_P1L", + "ptmPosition": 5 + } + ], + + "count": 1 + } +}, +{ + "proteinChain": { + "sequence": "REACHER", + "count": 1 + } +} +``` + +### DNA chains + +Please note that the `dnaSequence` type refers to single-stranded DNA. If you +wish to model double-stranded DNA, please add a second `dnaSequence` entry carrying +the sequence of the reverse complement strand. + +`sequence` is a string containing a DNA sequence; the same limitations as in the +UI are in place, i.e. only letters A, T, G, C are allowed. + +`count` is the number of copies of this DNA chain (integer). + +`modifications` is an optional list of dictionaries that carries descriptions of +the DNA chemical modifications. + +* `modificationType` is a string containing the + [CCD code](https://www.wwpdb.org/data/ccd) of the modification; the same codes + are allowed as in the UI. +* `basePosition` is the position of the modified nucleotide (integer).
+* Allowed modifications: `CCD_5CM`, `CCD_C34`, `CCD_5HC`, `CCD_6OG`, + `CCD_6MA`, `CCD_1CC`, `CCD_8OG`, `CCD_5FC`, `CCD_3DR` + +```json +{ + "dnaSequence": { + "sequence": "GATTACA", + + "modifications": [ + { + "modificationType": "CCD_6OG", + "basePosition": 1 + }, + { + "modificationType": "CCD_6MA", + "basePosition": 2 + } + ], + + "count": 1 + } +}, +{ + "dnaSequence": { + "sequence": "TGTAATC", + "count": 1 + } +} +``` + +### RNA chains + +`sequence` is a string containing RNA sequence (single strand); the same +limitations as in the UI are in place, e.g. only letters A, U, G, C are allowed. + +`count` is a number of copies of this RNA chain (integer). + +`modifications` is an optional list of dictionaries that carries descriptions of +the RNA chemical modifications. + +* `modificationType` is a string containing + [CCD code](https://www.wwpdb.org/data/ccd) of modification; the same codes + are allowed as in the UI. +* `basePosition` is a position of the modified nucleotide (integer). +* Allowed modifications: `CCD_PSU`, `CCD_5MC`, `CCD_OMC`, `CCD_4OC`, + `CCD_5MU`, `CCD_OMU`, `CCD_UR3`, `CCD_A2M`, `CCD_MA6`, `CCD_6MZ`, `CCD_2MG`, + `CCD_OMG`, `CCD_7MG`, `CCD_RSQ` + +```json +{ + "rnaSequence": { + "sequence": "GUAC", + + "modifications": [ + { + "modificationType": "CCD_2MG", + "basePosition": 1 + }, + { + "modificationType": "CCD_5MC", + "basePosition": 4 + } + ], + + "count": 1 + } +} +``` + +### Ligands + +`ligand` is a string containing the [CCD code](https://www.wwpdb.org/data/ccd) +of the ligand; the same codes are allowed as in the UI. + +`count` is the number of copies of this ligand (integer). 
+ +Allowed ligands: `CCD_ADP`, `CCD_ATP`, `CCD_AMP`, `CCD_GTP`, `CCD_GDP`, +`CCD_FAD`, `CCD_NAD`, `CCD_NAP`, `CCD_NDP`, `CCD_HEM`, `CCD_HEC`, `CCD_PLM`, +`CCD_OLA`, `CCD_MYR`, `CCD_CIT`, `CCD_CLA`, `CCD_CHL`, `CCD_BCL`, `CCD_BCB` + +```json +{ + "ligand": { + "ligand": "CCD_ATP", + "count": 1 + } +}, +{ + "ligand": { + "ligand": "CCD_HEM", + "count": 2 + } +} +``` + +### Ions + +`ion` is a string containing [CCD code](https://www.wwpdb.org/data/ccd) of the +ion; the same codes are allowed as in the UI. The ion charge is implicitly +specified by the CCD code. + +`count` is a number of copies of this ion (integer). + +Allowed ions: `MG`, `ZN`, `CL`, `CA`, `NA`, `MN`, `K`, `FE`, `CU`, `CO` + +```json +{ + "ion": { + "ion": "MG", + "count": 2 + } +}, +{ + "ion": { + "ion": "NA", + "count": 3 + } +} +``` + +# Additional modeling jobs + +You may specify multiple jobs in one JSON file. This is an example of a simple +job request for one protein chain and two copies of the palindromic DNA +sequence: + +```json +{ + "name": "Test Fold Job Number Two", + "modelSeeds": [], + "sequences": [ + { + "proteinChain": { + "sequence": "TEACHINGS", + "count": 1 + } + }, + { + "dnaSequence": { + "sequence": "TAGCTA", + "count": 2 + } + } + ] +} +``` diff --git a/server/example.json b/server/example.json new file mode 100644 index 000000000..e0c9cedfb --- /dev/null +++ b/server/example.json @@ -0,0 +1,120 @@ +[ + { + "name": "Test Fold Job Number One", + "modelSeeds": [], + "sequences": [ + { + "proteinChain": { + "sequence": "PREACHINGS", + "glycans": [ + { + "residues": "NAG(NAG)(BMA)", + "position": 8 + }, + { + "residues": "BMA", + "position": 10 + } + ], + "modifications": [ + { + "ptmType": "CCD_HY3", + "ptmPosition": 1 + }, + { + "ptmType": "CCD_P1L", + "ptmPosition": 5 + } + ], + "count": 1 + } + }, + { + "proteinChain": { + "sequence": "REACHER", + "count": 1 + } + }, + { + "dnaSequence": { + "sequence": "GATTACA", + "modifications": [ + { + "modificationType": "CCD_6OG", + 
"basePosition": 1 + }, + { + "modificationType": "CCD_6MA", + "basePosition": 2 + } + ], + "count": 1 + } + }, + { + "dnaSequence": { + "sequence": "TGTAATC", + "count": 1 + } + }, + { + "rnaSequence": { + "sequence": "GUAC", + "modifications": [ + { + "modificationType": "CCD_2MG", + "basePosition": 1 + }, + { + "modificationType": "CCD_5MC", + "basePosition": 4 + } + ], + "count": 1 + } + }, + { + "ligand": { + "ligand": "CCD_ATP", + "count": 1 + } + }, + { + "ligand": { + "ligand": "CCD_HEM", + "count": 2 + } + }, + { + "ion": { + "ion": "MG", + "count": 2 + } + }, + { + "ion": { + "ion": "NA", + "count": 3 + } + } + ] + }, + { + "name": "Test Fold Job Number Two", + "modelSeeds": [], + "sequences": [ + { + "proteinChain": { + "sequence": "TEACHINGS", + "count": 1 + } + }, + { + "dnaSequence": { + "sequence": "TAGGACA", + "count": 1 + } + } + ] + } +]