diff --git a/Useful tools/Structure prediction with AlphaFold2.ipynb b/Useful tools/Structure prediction with AlphaFold2.ipynb new file mode 100644 index 0000000..f84adea --- /dev/null +++ b/Useful tools/Structure prediction with AlphaFold2.ipynb @@ -0,0 +1,669 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "accelerator": "GPU", + "colab": { + "name": "AlphaFold2.ipynb", + "provenance": [], + "machine_shape": "hm", + "gpuType": "T4" + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "G4yBrceuFbf3" + }, + "source": [ + "\n", + "\n", + "## ColabFold v1.5.5: AlphaFold2 using MMseqs2\n", + "\n", + "Easy to use protein structure and complex prediction using [AlphaFold2](https://www.nature.com/articles/s41586-021-03819-2) and [AlphaFold2-multimer](https://www.biorxiv.org/content/10.1101/2021.10.04.463034v1). Sequence alignments/templates are generated through [MMseqs2](https://mmseqs.com) and [HHsearch](https://github.com/soedinglab/hh-suite). 
For more details, see the bottom of the notebook, check out the [ColabFold GitHub](https://github.com/sokrypton/ColabFold) and read our manuscript.\n", + "Old versions: [v1.4](https://colab.research.google.com/github/sokrypton/ColabFold/blob/v1.4.0/AlphaFold2.ipynb), [v1.5.1](https://colab.research.google.com/github/sokrypton/ColabFold/blob/v1.5.1/AlphaFold2.ipynb), [v1.5.2](https://colab.research.google.com/github/sokrypton/ColabFold/blob/v1.5.2/AlphaFold2.ipynb), [v1.5.3-patch](https://colab.research.google.com/github/sokrypton/ColabFold/blob/56c72044c7d51a311ca99b953a71e552fdc042e1/AlphaFold2.ipynb)\n", + "\n", + "[Mirdita M, Schütze K, Moriwaki Y, Heo L, Ovchinnikov S, Steinegger M. ColabFold: Making protein folding accessible to all.\n", + "*Nature Methods*, 2022](https://www.nature.com/articles/s41592-022-01488-1)" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "kOblAo-xetgx", + "cellView": "form", + "outputId": "861d3e4f-ba01-4c52-fadf-e5f0a65509d9", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "source": [ + "#@title Input protein sequence(s), then hit `Runtime` -> `Run all`\n", + "from google.colab import files\n", + "import os\n", + "import re\n", + "import hashlib\n", + "import random\n", + "\n", + "from sys import version_info\n", + "python_version = f\"{version_info.major}.{version_info.minor}\"\n", + "\n", + "def add_hash(x,y):\n", + " return x+\"_\"+hashlib.sha1(y.encode()).hexdigest()[:5]\n", + "\n", + "query_sequence = 'AAVALLPAVLLALLAVTDQLGEDFFAVDLEAFLQEFGLLPEKE' #@param {type:\"string\"}\n", + "#@markdown - Use `:` to specify inter-protein chainbreaks for **modeling complexes** (supports homo- and hetero-oligomers). 
For example **PI...SK:PI...SK** for a homodimer\n", + "jobname = 'test' #@param {type:\"string\"}\n", + "# number of models to use\n", + "num_relax = 0 #@param [0, 1, 5] {type:\"raw\"}\n", + "#@markdown - specify how many of the top ranked structures to relax using amber\n", + "template_mode = \"none\" #@param [\"none\", \"pdb100\",\"custom\"]\n", + "#@markdown - `none` = no template information is used. `pdb100` = detect templates in pdb100 (see [notes](#pdb100)). `custom` - upload and search own templates (PDB or mmCIF format, see [notes](#custom_templates))\n", + "\n", + "use_amber = num_relax > 0\n", + "\n", + "# remove whitespaces\n", + "query_sequence = \"\".join(query_sequence.split())\n", + "\n", + "basejobname = \"\".join(jobname.split())\n", + "basejobname = re.sub(r'\\W+', '', basejobname)\n", + "jobname = add_hash(basejobname, query_sequence)\n", + "\n", + "# check if directory with jobname exists\n", + "def check(folder):\n", + " if os.path.exists(folder):\n", + " return False\n", + " else:\n", + " return True\n", + "if not check(jobname):\n", + " n = 0\n", + " while not check(f\"{jobname}_{n}\"): n += 1\n", + " jobname = f\"{jobname}_{n}\"\n", + "\n", + "# make directory to save results\n", + "os.makedirs(jobname, exist_ok=True)\n", + "\n", + "# save queries\n", + "queries_path = os.path.join(jobname, f\"{jobname}.csv\")\n", + "with open(queries_path, \"w\") as text_file:\n", + " text_file.write(f\"id,sequence\\n{jobname},{query_sequence}\")\n", + "\n", + "if template_mode == \"pdb100\":\n", + " use_templates = True\n", + " custom_template_path = None\n", + "elif template_mode == \"custom\":\n", + " custom_template_path = os.path.join(jobname,f\"template\")\n", + " os.makedirs(custom_template_path, exist_ok=True)\n", + " uploaded = files.upload()\n", + " use_templates = True\n", + " for fn in uploaded.keys():\n", + " os.rename(fn,os.path.join(custom_template_path,fn))\n", + "else:\n", + " custom_template_path = None\n", + " use_templates = False\n", + 
"\n", + "print(\"jobname\",jobname)\n", + "print(\"sequence\",query_sequence)\n", + "print(\"length\",len(query_sequence.replace(\":\",\"\")))\n", + "\n" + ], + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "jobname test_21d06\n", + "sequence AAVALLPAVLLALLAVTDQLGEDFFAVDLEAFLQEFGLLPEKE\n", + "length 43\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#@title Install dependencies\n", + "%%time\n", + "import os\n", + "USE_AMBER = use_amber\n", + "USE_TEMPLATES = use_templates\n", + "PYTHON_VERSION = python_version\n", + "\n", + "if not os.path.isfile(\"COLABFOLD_READY\"):\n", + " print(\"installing colabfold...\")\n", + " os.system(\"pip install -q --no-warn-conflicts 'colabfold[alphafold-minus-jax] @ git+https://github.com/sokrypton/ColabFold'\")\n", + " if os.environ.get('TPU_NAME', False) != False:\n", + " os.system(\"pip uninstall -y jax jaxlib\")\n", + " os.system(\"pip install --no-warn-conflicts --upgrade dm-haiku==0.0.10 'jax[cuda12_pip]'==0.3.25 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html\")\n", + " os.system(\"ln -s /usr/local/lib/python3.*/dist-packages/colabfold colabfold\")\n", + " os.system(\"ln -s /usr/local/lib/python3.*/dist-packages/alphafold alphafold\")\n", + " os.system(\"touch COLABFOLD_READY\")\n", + "\n", + "if USE_AMBER or USE_TEMPLATES:\n", + " if not os.path.isfile(\"CONDA_READY\"):\n", + " print(\"installing conda...\")\n", + " os.system(\"wget -qnc https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh\")\n", + " os.system(\"bash Mambaforge-Linux-x86_64.sh -bfp /usr/local\")\n", + " os.system(\"mamba config --set auto_update_conda false\")\n", + " os.system(\"touch CONDA_READY\")\n", + "\n", + "if USE_TEMPLATES and not os.path.isfile(\"HH_READY\") and USE_AMBER and not os.path.isfile(\"AMBER_READY\"):\n", + " print(\"installing hhsuite and amber...\")\n", + " os.system(f\"mamba install -y -c conda-forge -c 
bioconda kalign2=2.04 hhsuite=3.3.0 openmm=7.7.0 python='{PYTHON_VERSION}' pdbfixer\")\n", + " os.system(\"touch HH_READY\")\n", + " os.system(\"touch AMBER_READY\")\n", + "else:\n", + " if USE_TEMPLATES and not os.path.isfile(\"HH_READY\"):\n", + " print(\"installing hhsuite...\")\n", + " os.system(f\"mamba install -y -c conda-forge -c bioconda kalign2=2.04 hhsuite=3.3.0 python='{PYTHON_VERSION}'\")\n", + " os.system(\"touch HH_READY\")\n", + " if USE_AMBER and not os.path.isfile(\"AMBER_READY\"):\n", + " print(\"installing amber...\")\n", + " os.system(f\"mamba install -y -c conda-forge openmm=7.7.0 python='{PYTHON_VERSION}' pdbfixer\")\n", + " os.system(\"touch AMBER_READY\")" + ], + "metadata": { + "cellView": "form", + "id": "AzIKiDiCaHAn", + "outputId": "17095677-05ce-4e08-efa1-c70bd9f81012", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "installing colabfold...\n", + "CPU times: user 238 ms, sys: 28.3 ms, total: 266 ms\n", + "Wall time: 1min 2s\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#@markdown ### MSA options (custom MSA upload, single sequence, pairing mode)\n", + "msa_mode = \"mmseqs2_uniref_env\" #@param [\"mmseqs2_uniref_env\", \"mmseqs2_uniref\",\"single_sequence\",\"custom\"]\n", + "pair_mode = \"unpaired_paired\" #@param [\"unpaired_paired\",\"paired\",\"unpaired\"] {type:\"string\"}\n", + "#@markdown - \"unpaired_paired\" = pair sequences from same species + unpaired MSA, \"unpaired\" = separate MSA for each chain, \"paired\" - only use paired sequences.\n", + "\n", + "# decide which a3m to use\n", + "if \"mmseqs2\" in msa_mode:\n", + " a3m_file = os.path.join(jobname,f\"{jobname}.a3m\")\n", + "\n", + "elif msa_mode == \"custom\":\n", + " a3m_file = os.path.join(jobname,f\"{jobname}.custom.a3m\")\n", + " if not os.path.isfile(a3m_file):\n", + " custom_msa_dict = files.upload()\n", + " custom_msa = 
list(custom_msa_dict.keys())[0]\n", + " header = 0\n", + " import fileinput\n", + " for line in fileinput.FileInput(custom_msa,inplace=1):\n", + " if line.startswith(\">\"):\n", + " header = header + 1\n", + " if not line.rstrip():\n", + " continue\n", + " if line.startswith(\">\") == False and header == 1:\n", + " query_sequence = line.rstrip()\n", + " print(line, end='')\n", + "\n", + " os.rename(custom_msa, a3m_file)\n", + " queries_path=a3m_file\n", + " print(f\"moving {custom_msa} to {a3m_file}\")\n", + "\n", + "else:\n", + " a3m_file = os.path.join(jobname,f\"{jobname}.single_sequence.a3m\")\n", + " with open(a3m_file, \"w\") as text_file:\n", + " text_file.write(\">1\\n%s\" % query_sequence)" + ], + "metadata": { + "cellView": "form", + "id": "C2_sh2uAonJH" + }, + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#@markdown ### Advanced settings\n", + "model_type = \"auto\" #@param [\"auto\", \"alphafold2_ptm\", \"alphafold2_multimer_v1\", \"alphafold2_multimer_v2\", \"alphafold2_multimer_v3\", \"deepfold_v1\"]\n", + "#@markdown - if `auto` selected, will use `alphafold2_ptm` for monomer prediction and `alphafold2_multimer_v3` for complex prediction.\n", + "#@markdown Any of the model types can be used (regardless of whether the input is a monomer or a complex).\n", + "num_recycles = \"3\" #@param [\"auto\", \"0\", \"1\", \"3\", \"6\", \"12\", \"24\", \"48\"]\n", + "#@markdown - if `auto` selected, will use `num_recycles=20` if `model_type=alphafold2_multimer_v3`, else `num_recycles=3` .\n", + "recycle_early_stop_tolerance = \"auto\" #@param [\"auto\", \"0.0\", \"0.5\", \"1.0\"]\n", + "#@markdown - if `auto` selected, will use `tol=0.5` if `model_type=alphafold2_multimer_v3` else `tol=0.0`.\n", + "relax_max_iterations = 200 #@param [0, 200, 2000] {type:\"raw\"}\n", + "#@markdown - max amber relax iterations, `0` = unlimited (AlphaFold2 default, can take very long)\n", + "pairing_strategy = \"greedy\" #@param [\"greedy\", \"complete\"] 
{type:\"string\"}\n", + "#@markdown - `greedy` = pair any taxonomically matching subsets, `complete` = all sequences have to match in one line.\n", + "\n", + "\n", + "#@markdown #### Sample settings\n", + "#@markdown - enable dropouts and increase number of seeds to sample predictions from uncertainty of the model.\n", + "#@markdown - decrease `max_msa` to increase uncertainty\n", + "max_msa = \"auto\" #@param [\"auto\", \"512:1024\", \"256:512\", \"64:128\", \"32:64\", \"16:32\"]\n", + "num_seeds = 1 #@param [1,2,4,8,16] {type:\"raw\"}\n", + "use_dropout = False #@param {type:\"boolean\"}\n", + "\n", + "num_recycles = None if num_recycles == \"auto\" else int(num_recycles)\n", + "recycle_early_stop_tolerance = None if recycle_early_stop_tolerance == \"auto\" else float(recycle_early_stop_tolerance)\n", + "if max_msa == \"auto\": max_msa = None\n", + "\n", + "#@markdown #### Save settings\n", + "save_all = False #@param {type:\"boolean\"}\n", + "save_recycles = False #@param {type:\"boolean\"}\n", + "save_to_google_drive = False #@param {type:\"boolean\"}\n", + "#@markdown - if the save_to_google_drive option was selected, the result zip will be uploaded to your Google Drive\n", + "dpi = 200 #@param {type:\"integer\"}\n", + "#@markdown - set dpi for image resolution\n", + "\n", + "if save_to_google_drive:\n", + " from pydrive2.drive import GoogleDrive\n", + " from pydrive2.auth import GoogleAuth\n", + " from google.colab import auth\n", + " from oauth2client.client import GoogleCredentials\n", + " auth.authenticate_user()\n", + " gauth = GoogleAuth()\n", + " gauth.credentials = GoogleCredentials.get_application_default()\n", + " drive = GoogleDrive(gauth)\n", + " print(\"You are logged into Google Drive and are good to go!\")\n", + "\n", + "#@markdown Don't forget to hit `Runtime` -> `Run all` after updating the form." 
+ ], + "metadata": { + "cellView": "form", + "id": "ADDuaolKmjGW" + }, + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#@title Run Prediction\n", + "display_images = True #@param {type:\"boolean\"}\n", + "\n", + "import sys\n", + "import warnings\n", + "warnings.simplefilter(action='ignore', category=FutureWarning)\n", + "from Bio import BiopythonDeprecationWarning\n", + "warnings.simplefilter(action='ignore', category=BiopythonDeprecationWarning)\n", + "from pathlib import Path\n", + "from colabfold.download import download_alphafold_params, default_data_dir\n", + "from colabfold.utils import setup_logging\n", + "from colabfold.batch import get_queries, run, set_model_type\n", + "from colabfold.plot import plot_msa_v2\n", + "\n", + "import os\n", + "import numpy as np\n", + "try:\n", + " K80_chk = os.popen('nvidia-smi | grep \"Tesla K80\" | wc -l').read()\n", + "except:\n", + " K80_chk = \"0\"\n", + " pass\n", + "if \"1\" in K80_chk:\n", + " print(\"WARNING: found GPU Tesla K80: limited to total length < 1000\")\n", + " if \"TF_FORCE_UNIFIED_MEMORY\" in os.environ:\n", + " del os.environ[\"TF_FORCE_UNIFIED_MEMORY\"]\n", + " if \"XLA_PYTHON_CLIENT_MEM_FRACTION\" in os.environ:\n", + " del os.environ[\"XLA_PYTHON_CLIENT_MEM_FRACTION\"]\n", + "\n", + "from colabfold.colabfold import plot_protein\n", + "from pathlib import Path\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# For some reason we need that to get pdbfixer to import\n", + "if use_amber and f\"/usr/local/lib/python{python_version}/site-packages/\" not in sys.path:\n", + " sys.path.insert(0, f\"/usr/local/lib/python{python_version}/site-packages/\")\n", + "\n", + "def input_features_callback(input_features):\n", + " if display_images:\n", + " plot_msa_v2(input_features)\n", + " plt.show()\n", + " plt.close()\n", + "\n", + "def prediction_callback(protein_obj, length,\n", + " prediction_result, input_features, mode):\n", + " model_name, relaxed = mode\n", + " 
if not relaxed:\n", + " if display_images:\n", + " fig = plot_protein(protein_obj, Ls=length, dpi=150)\n", + " plt.show()\n", + " plt.close()\n", + "\n", + "result_dir = jobname\n", + "log_filename = os.path.join(jobname,\"log.txt\")\n", + "setup_logging(Path(log_filename))\n", + "\n", + "queries, is_complex = get_queries(queries_path)\n", + "model_type = set_model_type(is_complex, model_type)\n", + "\n", + "if \"multimer\" in model_type and max_msa is not None:\n", + " use_cluster_profile = False\n", + "else:\n", + " use_cluster_profile = True\n", + "\n", + "download_alphafold_params(model_type, Path(\".\"))\n", + "results = run(\n", + " queries=queries,\n", + " result_dir=result_dir,\n", + " use_templates=use_templates,\n", + " custom_template_path=custom_template_path,\n", + " num_relax=num_relax,\n", + " msa_mode=msa_mode,\n", + " model_type=model_type,\n", + " num_models=5,\n", + " num_recycles=num_recycles,\n", + " relax_max_iterations=relax_max_iterations,\n", + " recycle_early_stop_tolerance=recycle_early_stop_tolerance,\n", + " num_seeds=num_seeds,\n", + " use_dropout=use_dropout,\n", + " model_order=[1,2,3,4,5],\n", + " is_complex=is_complex,\n", + " data_dir=Path(\".\"),\n", + " keep_existing_results=False,\n", + " rank_by=\"auto\",\n", + " pair_mode=pair_mode,\n", + " pairing_strategy=pairing_strategy,\n", + " stop_at_score=float(100),\n", + " prediction_callback=prediction_callback,\n", + " dpi=dpi,\n", + " zip_results=False,\n", + " save_all=save_all,\n", + " max_msa=max_msa,\n", + " use_cluster_profile=use_cluster_profile,\n", + " input_features_callback=input_features_callback,\n", + " save_recycles=save_recycles,\n", + " user_agent=\"colabfold/google-colab-main\",\n", + ")\n", + "results_zip = f\"{jobname}.result.zip\"\n", + "os.system(f\"zip -r {results_zip} {jobname}\")" + ], + "metadata": { + "cellView": "form", + "id": "mbaIO9pWjaN0", + "collapsed": true, + "outputId": "ff931b5c-704e-40bd-d54a-eb527361150f", + "colab": { + "base_uri": 
"https://localhost:8080/" + } + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Downloading alphafold2_ptm weights to .: 83%|████████▎ | 2.86G/3.47G [02:18<00:28, 22.9MB/s]" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "KK7X9T44pWb7", + "cellView": "form" + }, + "source": [ + "#@title Display 3D structure {run: \"auto\"}\n", + "import py3Dmol\n", + "import glob\n", + "import matplotlib.pyplot as plt\n", + "from colabfold.colabfold import plot_plddt_legend\n", + "from colabfold.colabfold import pymol_color_list, alphabet_list\n", + "rank_num = 1 #@param [\"1\", \"2\", \"3\", \"4\", \"5\"] {type:\"raw\"}\n", + "color = \"lDDT\" #@param [\"chain\", \"lDDT\", \"rainbow\"]\n", + "show_sidechains = False #@param {type:\"boolean\"}\n", + "show_mainchains = False #@param {type:\"boolean\"}\n", + "\n", + "tag = results[\"rank\"][0][rank_num - 1]\n", + "jobname_prefix = \".custom\" if msa_mode == \"custom\" else \"\"\n", + "pdb_filename = f\"{jobname}/{jobname}{jobname_prefix}_unrelaxed_{tag}.pdb\"\n", + "pdb_file = glob.glob(pdb_filename)\n", + "\n", + "def show_pdb(rank_num=1, show_sidechains=False, show_mainchains=False, color=\"lDDT\"):\n", + " model_name = f\"rank_{rank_num}\"\n", + " view = py3Dmol.view(js='https://3dmol.org/build/3Dmol.js',)\n", + " view.addModel(open(pdb_file[0],'r').read(),'pdb')\n", + "\n", + " if color == \"lDDT\":\n", + " view.setStyle({'cartoon': {'colorscheme': {'prop':'b','gradient': 'roygb','min':50,'max':90}}})\n", + " elif color == \"rainbow\":\n", + " view.setStyle({'cartoon': {'color':'spectrum'}})\n", + " elif color == \"chain\":\n", + " chains = len(queries[0][1]) + 1 if is_complex else 1\n", + " for n,chain,color in zip(range(chains),alphabet_list,pymol_color_list):\n", + " view.setStyle({'chain':chain},{'cartoon': {'color':color}})\n", + "\n", + " if show_sidechains:\n", + " BB = ['C','O','N']\n", + " 
view.addStyle({'and':[{'resn':[\"GLY\",\"PRO\"],'invert':True},{'atom':BB,'invert':True}]},\n", + " {'stick':{'colorscheme':f\"WhiteCarbon\",'radius':0.3}})\n", + " view.addStyle({'and':[{'resn':\"GLY\"},{'atom':'CA'}]},\n", + " {'sphere':{'colorscheme':f\"WhiteCarbon\",'radius':0.3}})\n", + " view.addStyle({'and':[{'resn':\"PRO\"},{'atom':['C','O'],'invert':True}]},\n", + " {'stick':{'colorscheme':f\"WhiteCarbon\",'radius':0.3}})\n", + " if show_mainchains:\n", + " BB = ['C','O','N','CA']\n", + " view.addStyle({'atom':BB},{'stick':{'colorscheme':f\"WhiteCarbon\",'radius':0.3}})\n", + "\n", + " view.zoomTo()\n", + " return view\n", + "\n", + "show_pdb(rank_num, show_sidechains, show_mainchains, color).show()\n", + "if color == \"lDDT\":\n", + " plot_plddt_legend().show()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "11l8k--10q0C", + "cellView": "form" + }, + "source": [ + "#@title Plots {run: \"auto\"}\n", + "from IPython.display import display, HTML\n", + "import base64\n", + "from html import escape\n", + "\n", + "# see: https://stackoverflow.com/a/53688522\n", + "def image_to_data_url(filename):\n", + " ext = filename.split('.')[-1]\n", + " prefix = f'data:image/{ext};base64,'\n", + " with open(filename, 'rb') as f:\n", + " img = f.read()\n", + " return prefix + base64.b64encode(img).decode('utf-8')\n", + "\n", + "pae = \"\"\n", + "pae_file = os.path.join(jobname,f\"{jobname}{jobname_prefix}_pae.png\")\n", + "if os.path.isfile(pae_file):\n", + " pae = image_to_data_url(pae_file)\n", + "cov = image_to_data_url(os.path.join(jobname,f\"{jobname}{jobname_prefix}_coverage.png\"))\n", + "plddt = image_to_data_url(os.path.join(jobname,f\"{jobname}{jobname_prefix}_plddt.png\"))\n", + "display(HTML(f\"\"\"\n", + "\n", + "