{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "accelerator": "GPU", "colab": { "name": "AlphaFold2.ipynb", "provenance": [], "machine_shape": "hm", "gpuType": "T4" }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "G4yBrceuFbf3" }, "source": [ "\n", "\n", "##ColabFold v1.5.5: AlphaFold2 using MMseqs2\n", "\n", "Easy to use protein structure and complex prediction using [AlphaFold2](https://www.nature.com/articles/s41586-021-03819-2) and [Alphafold2-multimer](https://www.biorxiv.org/content/10.1101/2021.10.04.463034v1). Sequence alignments/templates are generated through [MMseqs2](mmseqs.com) and [HHsearch](https://github.com/soedinglab/hh-suite). For more details, see bottom of the notebook, checkout the [ColabFold GitHub](https://github.com/sokrypton/ColabFold) and read our manuscript.\n", "Old versions: [v1.4](https://colab.research.google.com/github/sokrypton/ColabFold/blob/v1.4.0/AlphaFold2.ipynb), [v1.5.1](https://colab.research.google.com/github/sokrypton/ColabFold/blob/v1.5.1/AlphaFold2.ipynb), [v1.5.2](https://colab.research.google.com/github/sokrypton/ColabFold/blob/v1.5.2/AlphaFold2.ipynb), [v1.5.3-patch](https://colab.research.google.com/github/sokrypton/ColabFold/blob/56c72044c7d51a311ca99b953a71e552fdc042e1/AlphaFold2.ipynb)\n", "\n", "[Mirdita M, Schütze K, Moriwaki Y, Heo L, Ovchinnikov S, Steinegger M. ColabFold: Making protein folding accessible to all.\n", "*Nature Methods*, 2022](https://www.nature.com/articles/s41592-022-01488-1)" ] }, { "cell_type": "code", "metadata": { "id": "kOblAo-xetgx", "cellView": "form", "outputId": "861d3e4f-ba01-4c52-fadf-e5f0a65509d9", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "#@title Input protein sequence(s), then hit `Runtime` -> `Run all`\n", "from google.colab import files\n", "import os\n", "import re\n", "import hashlib\n", "import random\n", "\n", "from sys import version_info\n", "python_version = f\"{version_info.major}.{version_info.minor}\"\n", "\n", "def add_hash(x,y):\n", " return x+\"_\"+hashlib.sha1(y.encode()).hexdigest()[:5]\n", "\n", "query_sequence = 'AAVALLPAVLLALLAVTDQLGEDFFAVDLEAFLQEFGLLPEKE' #@param {type:\"string\"}\n", "#@markdown - Use `:` to specify inter-protein chainbreaks for **modeling complexes** (supports homo- and hetro-oligomers). For example **PI...SK:PI...SK** for a homodimer\n", "jobname = 'test' #@param {type:\"string\"}\n", "# number of models to use\n", "num_relax = 0 #@param [0, 1, 5] {type:\"raw\"}\n", "#@markdown - specify how many of the top ranked structures to relax using amber\n", "template_mode = \"none\" #@param [\"none\", \"pdb100\",\"custom\"]\n", "#@markdown - `none` = no template information is used. `pdb100` = detect templates in pdb100 (see [notes](#pdb100)). `custom` - upload and search own templates (PDB or mmCIF format, see [notes](#custom_templates))\n", "\n", "use_amber = num_relax > 0\n", "\n", "# remove whitespaces\n", "query_sequence = \"\".join(query_sequence.split())\n", "\n", "basejobname = \"\".join(jobname.split())\n", "basejobname = re.sub(r'\\W+', '', basejobname)\n", "jobname = add_hash(basejobname, query_sequence)\n", "\n", "# check if directory with jobname exists\n", "def check(folder):\n", " if os.path.exists(folder):\n", " return False\n", " else:\n", " return True\n", "if not check(jobname):\n", " n = 0\n", " while not check(f\"{jobname}_{n}\"): n += 1\n", " jobname = f\"{jobname}_{n}\"\n", "\n", "# make directory to save results\n", "os.makedirs(jobname, exist_ok=True)\n", "\n", "# save queries\n", "queries_path = os.path.join(jobname, f\"{jobname}.csv\")\n", "with open(queries_path, \"w\") as text_file:\n", " text_file.write(f\"id,sequence\\n{jobname},{query_sequence}\")\n", "\n", "if template_mode == \"pdb100\":\n", " use_templates = True\n", " custom_template_path = None\n", "elif template_mode == \"custom\":\n", " custom_template_path = os.path.join(jobname,f\"template\")\n", " os.makedirs(custom_template_path, exist_ok=True)\n", " uploaded = files.upload()\n", " use_templates = True\n", " for fn in uploaded.keys():\n", " os.rename(fn,os.path.join(custom_template_path,fn))\n", "else:\n", " custom_template_path = None\n", " use_templates = False\n", "\n", "print(\"jobname\",jobname)\n", "print(\"sequence\",query_sequence)\n", "print(\"length\",len(query_sequence.replace(\":\",\"\")))\n", "\n" ], "execution_count": 1, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "jobname test_21d06\n", "sequence AAVALLPAVLLALLAVTDQLGEDFFAVDLEAFLQEFGLLPEKE\n", "length 43\n" ] } ] }, { "cell_type": "code", "source": [ "#@title Install dependencies\n", "%%time\n", "import os\n", "USE_AMBER = use_amber\n", "USE_TEMPLATES = use_templates\n", "PYTHON_VERSION = python_version\n", "\n", "if not os.path.isfile(\"COLABFOLD_READY\"):\n", " print(\"installing colabfold...\")\n", " os.system(\"pip install -q --no-warn-conflicts 'colabfold[alphafold-minus-jax] @ git+https://github.com/sokrypton/ColabFold'\")\n", " if os.environ.get('TPU_NAME', False) != False:\n", " os.system(\"pip uninstall -y jax jaxlib\")\n", " os.system(\"pip install --no-warn-conflicts --upgrade dm-haiku==0.0.10 'jax[cuda12_pip]'==0.3.25 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html\")\n", " os.system(\"ln -s /usr/local/lib/python3.*/dist-packages/colabfold colabfold\")\n", " os.system(\"ln -s /usr/local/lib/python3.*/dist-packages/alphafold alphafold\")\n", " os.system(\"touch COLABFOLD_READY\")\n", "\n", "if USE_AMBER or USE_TEMPLATES:\n", " if not os.path.isfile(\"CONDA_READY\"):\n", " print(\"installing conda...\")\n", " os.system(\"wget -qnc https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh\")\n", " os.system(\"bash Mambaforge-Linux-x86_64.sh -bfp /usr/local\")\n", " os.system(\"mamba config --set auto_update_conda false\")\n", " os.system(\"touch CONDA_READY\")\n", "\n", "if USE_TEMPLATES and not os.path.isfile(\"HH_READY\") and USE_AMBER and not os.path.isfile(\"AMBER_READY\"):\n", " print(\"installing hhsuite and amber...\")\n", " os.system(f\"mamba install -y -c conda-forge -c bioconda kalign2=2.04 hhsuite=3.3.0 openmm=7.7.0 python='{PYTHON_VERSION}' pdbfixer\")\n", " os.system(\"touch HH_READY\")\n", " os.system(\"touch AMBER_READY\")\n", "else:\n", " if USE_TEMPLATES and not os.path.isfile(\"HH_READY\"):\n", " print(\"installing hhsuite...\")\n", " os.system(f\"mamba install -y -c conda-forge -c bioconda kalign2=2.04 hhsuite=3.3.0 python='{PYTHON_VERSION}'\")\n", " os.system(\"touch HH_READY\")\n", " if USE_AMBER and not os.path.isfile(\"AMBER_READY\"):\n", " print(\"installing amber...\")\n", " os.system(f\"mamba install -y -c conda-forge openmm=7.7.0 python='{PYTHON_VERSION}' pdbfixer\")\n", " os.system(\"touch AMBER_READY\")" ], "metadata": { "cellView": "form", "id": "AzIKiDiCaHAn", "outputId": "17095677-05ce-4e08-efa1-c70bd9f81012", "colab": { "base_uri": "https://localhost:8080/" } }, "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "installing colabfold...\n", "CPU times: user 238 ms, sys: 28.3 ms, total: 266 ms\n", "Wall time: 1min 2s\n" ] } ] }, { "cell_type": "code", "source": [ "#@markdown ### MSA options (custom MSA upload, single sequence, pairing mode)\n", "msa_mode = \"mmseqs2_uniref_env\" #@param [\"mmseqs2_uniref_env\", \"mmseqs2_uniref\",\"single_sequence\",\"custom\"]\n", "pair_mode = \"unpaired_paired\" #@param [\"unpaired_paired\",\"paired\",\"unpaired\"] {type:\"string\"}\n", "#@markdown - \"unpaired_paired\" = pair sequences from same species + unpaired MSA, \"unpaired\" = seperate MSA for each chain, \"paired\" - only use paired sequences.\n", "\n", "# decide which a3m to use\n", "if \"mmseqs2\" in msa_mode:\n", " a3m_file = os.path.join(jobname,f\"{jobname}.a3m\")\n", "\n", "elif msa_mode == \"custom\":\n", " a3m_file = os.path.join(jobname,f\"{jobname}.custom.a3m\")\n", " if not os.path.isfile(a3m_file):\n", " custom_msa_dict = files.upload()\n", " custom_msa = list(custom_msa_dict.keys())[0]\n", " header = 0\n", " import fileinput\n", " for line in fileinput.FileInput(custom_msa,inplace=1):\n", " if line.startswith(\">\"):\n", " header = header + 1\n", " if not line.rstrip():\n", " continue\n", " if line.startswith(\">\") == False and header == 1:\n", " query_sequence = line.rstrip()\n", " print(line, end='')\n", "\n", " os.rename(custom_msa, a3m_file)\n", " queries_path=a3m_file\n", " print(f\"moving {custom_msa} to {a3m_file}\")\n", "\n", "else:\n", " a3m_file = os.path.join(jobname,f\"{jobname}.single_sequence.a3m\")\n", " with open(a3m_file, \"w\") as text_file:\n", " text_file.write(\">1\\n%s\" % query_sequence)" ], "metadata": { "cellView": "form", "id": "C2_sh2uAonJH" }, "execution_count": 3, "outputs": [] }, { "cell_type": "code", "source": [ "#@markdown ### Advanced settings\n", "model_type = \"auto\" #@param [\"auto\", \"alphafold2_ptm\", \"alphafold2_multimer_v1\", \"alphafold2_multimer_v2\", \"alphafold2_multimer_v3\", \"deepfold_v1\"]\n", "#@markdown - if `auto` selected, will use `alphafold2_ptm` for monomer prediction and `alphafold2_multimer_v3` for complex prediction.\n", "#@markdown Any of the mode_types can be used (regardless if input is monomer or complex).\n", "num_recycles = \"3\" #@param [\"auto\", \"0\", \"1\", \"3\", \"6\", \"12\", \"24\", \"48\"]\n", "#@markdown - if `auto` selected, will use `num_recycles=20` if `model_type=alphafold2_multimer_v3`, else `num_recycles=3` .\n", "recycle_early_stop_tolerance = \"auto\" #@param [\"auto\", \"0.0\", \"0.5\", \"1.0\"]\n", "#@markdown - if `auto` selected, will use `tol=0.5` if `model_type=alphafold2_multimer_v3` else `tol=0.0`.\n", "relax_max_iterations = 200 #@param [0, 200, 2000] {type:\"raw\"}\n", "#@markdown - max amber relax iterations, `0` = unlimited (AlphaFold2 default, can take very long)\n", "pairing_strategy = \"greedy\" #@param [\"greedy\", \"complete\"] {type:\"string\"}\n", "#@markdown - `greedy` = pair any taxonomically matching subsets, `complete` = all sequences have to match in one line.\n", "\n", "\n", "#@markdown #### Sample settings\n", "#@markdown - enable dropouts and increase number of seeds to sample predictions from uncertainty of the model.\n", "#@markdown - decrease `max_msa` to increase uncertainity\n", "max_msa = \"auto\" #@param [\"auto\", \"512:1024\", \"256:512\", \"64:128\", \"32:64\", \"16:32\"]\n", "num_seeds = 1 #@param [1,2,4,8,16] {type:\"raw\"}\n", "use_dropout = False #@param {type:\"boolean\"}\n", "\n", "num_recycles = None if num_recycles == \"auto\" else int(num_recycles)\n", "recycle_early_stop_tolerance = None if recycle_early_stop_tolerance == \"auto\" else float(recycle_early_stop_tolerance)\n", "if max_msa == \"auto\": max_msa = None\n", "\n", "#@markdown #### Save settings\n", "save_all = False #@param {type:\"boolean\"}\n", "save_recycles = False #@param {type:\"boolean\"}\n", "save_to_google_drive = False #@param {type:\"boolean\"}\n", "#@markdown - if the save_to_google_drive option was selected, the result zip will be uploaded to your Google Drive\n", "dpi = 200 #@param {type:\"integer\"}\n", "#@markdown - set dpi for image resolution\n", "\n", "if save_to_google_drive:\n", " from pydrive2.drive import GoogleDrive\n", " from pydrive2.auth import GoogleAuth\n", " from google.colab import auth\n", " from oauth2client.client import GoogleCredentials\n", " auth.authenticate_user()\n", " gauth = GoogleAuth()\n", " gauth.credentials = GoogleCredentials.get_application_default()\n", " drive = GoogleDrive(gauth)\n", " print(\"You are logged into Google Drive and are good to go!\")\n", "\n", "#@markdown Don't forget to hit `Runtime` -> `Run all` after updating the form." ], "metadata": { "cellView": "form", "id": "ADDuaolKmjGW" }, "execution_count": 4, "outputs": [] }, { "cell_type": "code", "source": [ "#@title Run Prediction\n", "display_images = True #@param {type:\"boolean\"}\n", "\n", "import sys\n", "import warnings\n", "warnings.simplefilter(action='ignore', category=FutureWarning)\n", "from Bio import BiopythonDeprecationWarning\n", "warnings.simplefilter(action='ignore', category=BiopythonDeprecationWarning)\n", "from pathlib import Path\n", "from colabfold.download import download_alphafold_params, default_data_dir\n", "from colabfold.utils import setup_logging\n", "from colabfold.batch import get_queries, run, set_model_type\n", "from colabfold.plot import plot_msa_v2\n", "\n", "import os\n", "import numpy as np\n", "try:\n", " K80_chk = os.popen('nvidia-smi | grep \"Tesla K80\" | wc -l').read()\n", "except:\n", " K80_chk = \"0\"\n", " pass\n", "if \"1\" in K80_chk:\n", " print(\"WARNING: found GPU Tesla K80: limited to total length < 1000\")\n", " if \"TF_FORCE_UNIFIED_MEMORY\" in os.environ:\n", " del os.environ[\"TF_FORCE_UNIFIED_MEMORY\"]\n", " if \"XLA_PYTHON_CLIENT_MEM_FRACTION\" in os.environ:\n", " del os.environ[\"XLA_PYTHON_CLIENT_MEM_FRACTION\"]\n", "\n", "from colabfold.colabfold import plot_protein\n", "from pathlib import Path\n", "import matplotlib.pyplot as plt\n", "\n", "# For some reason we need that to get pdbfixer to import\n", "if use_amber and f\"/usr/local/lib/python{python_version}/site-packages/\" not in sys.path:\n", " sys.path.insert(0, f\"/usr/local/lib/python{python_version}/site-packages/\")\n", "\n", "def input_features_callback(input_features):\n", " if display_images:\n", " plot_msa_v2(input_features)\n", " plt.show()\n", " plt.close()\n", "\n", "def prediction_callback(protein_obj, length,\n", " prediction_result, input_features, mode):\n", " model_name, relaxed = mode\n", " if not relaxed:\n", " if display_images:\n", " fig = plot_protein(protein_obj, Ls=length, dpi=150)\n", " plt.show()\n", " plt.close()\n", "\n", "result_dir = jobname\n", "log_filename = os.path.join(jobname,\"log.txt\")\n", "setup_logging(Path(log_filename))\n", "\n", "queries, is_complex = get_queries(queries_path)\n", "model_type = set_model_type(is_complex, model_type)\n", "\n", "if \"multimer\" in model_type and max_msa is not None:\n", " use_cluster_profile = False\n", "else:\n", " use_cluster_profile = True\n", "\n", "download_alphafold_params(model_type, Path(\".\"))\n", "results = run(\n", " queries=queries,\n", " result_dir=result_dir,\n", " use_templates=use_templates,\n", " custom_template_path=custom_template_path,\n", " num_relax=num_relax,\n", " msa_mode=msa_mode,\n", " model_type=model_type,\n", " num_models=5,\n", " num_recycles=num_recycles,\n", " relax_max_iterations=relax_max_iterations,\n", " recycle_early_stop_tolerance=recycle_early_stop_tolerance,\n", " num_seeds=num_seeds,\n", " use_dropout=use_dropout,\n", " model_order=[1,2,3,4,5],\n", " is_complex=is_complex,\n", " data_dir=Path(\".\"),\n", " keep_existing_results=False,\n", " rank_by=\"auto\",\n", " pair_mode=pair_mode,\n", " pairing_strategy=pairing_strategy,\n", " stop_at_score=float(100),\n", " prediction_callback=prediction_callback,\n", " dpi=dpi,\n", " zip_results=False,\n", " save_all=save_all,\n", " max_msa=max_msa,\n", " use_cluster_profile=use_cluster_profile,\n", " input_features_callback=input_features_callback,\n", " save_recycles=save_recycles,\n", " user_agent=\"colabfold/google-colab-main\",\n", ")\n", "results_zip = f\"{jobname}.result.zip\"\n", "os.system(f\"zip -r {results_zip} {jobname}\")" ], "metadata": { "cellView": "form", "id": "mbaIO9pWjaN0", "collapsed": true, "outputId": "ff931b5c-704e-40bd-d54a-eb527361150f", "colab": { "base_uri": "https://localhost:8080/" } }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "Downloading alphafold2_ptm weights to .: 83%|████████▎ | 2.86G/3.47G [02:18<00:28, 22.9MB/s]" ] } ] }, { "cell_type": "code", "metadata": { "id": "KK7X9T44pWb7", "cellView": "form" }, "source": [ "#@title Display 3D structure {run: \"auto\"}\n", "import py3Dmol\n", "import glob\n", "import matplotlib.pyplot as plt\n", "from colabfold.colabfold import plot_plddt_legend\n", "from colabfold.colabfold import pymol_color_list, alphabet_list\n", "rank_num = 1 #@param [\"1\", \"2\", \"3\", \"4\", \"5\"] {type:\"raw\"}\n", "color = \"lDDT\" #@param [\"chain\", \"lDDT\", \"rainbow\"]\n", "show_sidechains = False #@param {type:\"boolean\"}\n", "show_mainchains = False #@param {type:\"boolean\"}\n", "\n", "tag = results[\"rank\"][0][rank_num - 1]\n", "jobname_prefix = \".custom\" if msa_mode == \"custom\" else \"\"\n", "pdb_filename = f\"{jobname}/{jobname}{jobname_prefix}_unrelaxed_{tag}.pdb\"\n", "pdb_file = glob.glob(pdb_filename)\n", "\n", "def show_pdb(rank_num=1, show_sidechains=False, show_mainchains=False, color=\"lDDT\"):\n", " model_name = f\"rank_{rank_num}\"\n", " view = py3Dmol.view(js='https://3dmol.org/build/3Dmol.js',)\n", " view.addModel(open(pdb_file[0],'r').read(),'pdb')\n", "\n", " if color == \"lDDT\":\n", " view.setStyle({'cartoon': {'colorscheme': {'prop':'b','gradient': 'roygb','min':50,'max':90}}})\n", " elif color == \"rainbow\":\n", " view.setStyle({'cartoon': {'color':'spectrum'}})\n", " elif color == \"chain\":\n", " chains = len(queries[0][1]) + 1 if is_complex else 1\n", " for n,chain,color in zip(range(chains),alphabet_list,pymol_color_list):\n", " view.setStyle({'chain':chain},{'cartoon': {'color':color}})\n", "\n", " if show_sidechains:\n", " BB = ['C','O','N']\n", " view.addStyle({'and':[{'resn':[\"GLY\",\"PRO\"],'invert':True},{'atom':BB,'invert':True}]},\n", " {'stick':{'colorscheme':f\"WhiteCarbon\",'radius':0.3}})\n", " view.addStyle({'and':[{'resn':\"GLY\"},{'atom':'CA'}]},\n", " {'sphere':{'colorscheme':f\"WhiteCarbon\",'radius':0.3}})\n", " view.addStyle({'and':[{'resn':\"PRO\"},{'atom':['C','O'],'invert':True}]},\n", " {'stick':{'colorscheme':f\"WhiteCarbon\",'radius':0.3}})\n", " if show_mainchains:\n", " BB = ['C','O','N','CA']\n", " view.addStyle({'atom':BB},{'stick':{'colorscheme':f\"WhiteCarbon\",'radius':0.3}})\n", "\n", " view.zoomTo()\n", " return view\n", "\n", "show_pdb(rank_num, show_sidechains, show_mainchains, color).show()\n", "if color == \"lDDT\":\n", " plot_plddt_legend().show()" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "11l8k--10q0C", "cellView": "form" }, "source": [ "#@title Plots {run: \"auto\"}\n", "from IPython.display import display, HTML\n", "import base64\n", "from html import escape\n", "\n", "# see: https://stackoverflow.com/a/53688522\n", "def image_to_data_url(filename):\n", " ext = filename.split('.')[-1]\n", " prefix = f'data:image/{ext};base64,'\n", " with open(filename, 'rb') as f:\n", " img = f.read()\n", " return prefix + base64.b64encode(img).decode('utf-8')\n", "\n", "pae = \"\"\n", "pae_file = os.path.join(jobname,f\"{jobname}{jobname_prefix}_pae.png\")\n", "if os.path.isfile(pae_file):\n", " pae = image_to_data_url(pae_file)\n", "cov = image_to_data_url(os.path.join(jobname,f\"{jobname}{jobname_prefix}_coverage.png\"))\n", "plddt = image_to_data_url(os.path.join(jobname,f\"{jobname}{jobname_prefix}_plddt.png\"))\n", "display(HTML(f\"\"\"\n", "\n", "