{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"accelerator": "GPU",
"colab": {
"name": "AlphaFold2.ipynb",
"provenance": [],
"machine_shape": "hm",
"gpuType": "T4"
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "G4yBrceuFbf3"
},
"source": [
"\n",
"\n",
"## ColabFold v1.5.5: AlphaFold2 using MMseqs2\n",
"\n",
"Easy-to-use protein structure and complex prediction using [AlphaFold2](https://www.nature.com/articles/s41586-021-03819-2) and [AlphaFold2-multimer](https://www.biorxiv.org/content/10.1101/2021.10.04.463034v1). Sequence alignments/templates are generated through [MMseqs2](https://mmseqs.com) and [HHsearch](https://github.com/soedinglab/hh-suite). For more details, see the bottom of the notebook, check out the [ColabFold GitHub](https://github.com/sokrypton/ColabFold) and read our manuscript.\n",
"Old versions: [v1.4](https://colab.research.google.com/github/sokrypton/ColabFold/blob/v1.4.0/AlphaFold2.ipynb), [v1.5.1](https://colab.research.google.com/github/sokrypton/ColabFold/blob/v1.5.1/AlphaFold2.ipynb), [v1.5.2](https://colab.research.google.com/github/sokrypton/ColabFold/blob/v1.5.2/AlphaFold2.ipynb), [v1.5.3-patch](https://colab.research.google.com/github/sokrypton/ColabFold/blob/56c72044c7d51a311ca99b953a71e552fdc042e1/AlphaFold2.ipynb)\n",
"\n",
"[Mirdita M, Schütze K, Moriwaki Y, Heo L, Ovchinnikov S, Steinegger M. ColabFold: Making protein folding accessible to all.\n",
"*Nature Methods*, 2022](https://www.nature.com/articles/s41592-022-01488-1)"
]
},
{
"cell_type": "code",
"metadata": {
"id": "kOblAo-xetgx",
"cellView": "form",
"outputId": "861d3e4f-ba01-4c52-fadf-e5f0a65509d9",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"source": [
"#@title Input protein sequence(s), then hit `Runtime` -> `Run all`\n",
"from google.colab import files\n",
"import os\n",
"import re\n",
"import hashlib\n",
"import random\n",
"\n",
"from sys import version_info\n",
"python_version = f\"{version_info.major}.{version_info.minor}\"\n",
"\n",
"def add_hash(x,y):\n",
"  \"\"\"Append an underscore and the first five SHA-1 hex digits of `y` to `x`.\"\"\"\n",
"  digest = hashlib.sha1(y.encode()).hexdigest()\n",
"  return \"_\".join((x, digest[:5]))\n",
"\n",
"query_sequence = 'AAVALLPAVLLALLAVTDQLGEDFFAVDLEAFLQEFGLLPEKE' #@param {type:\"string\"}\n",
"#@markdown - Use `:` to specify inter-protein chainbreaks for **modeling complexes** (supports homo- and hetero-oligomers). For example **PI...SK:PI...SK** for a homodimer\n",
"jobname = 'test' #@param {type:\"string\"}\n",
"# number of models to use\n",
"num_relax = 0 #@param [0, 1, 5] {type:\"raw\"}\n",
"#@markdown - specify how many of the top ranked structures to relax using amber\n",
"template_mode = \"none\" #@param [\"none\", \"pdb100\",\"custom\"]\n",
"#@markdown - `none` = no template information is used. `pdb100` = detect templates in pdb100 (see [notes](#pdb100)). `custom` - upload and search own templates (PDB or mmCIF format, see [notes](#custom_templates))\n",
"\n",
"# amber is only needed when at least one structure is going to be relaxed\n",
"use_amber = num_relax > 0\n",
"\n",
"# remove whitespaces\n",
"query_sequence = \"\".join(query_sequence.split())\n",
"\n",
"# drop whitespace and every other non-word character from the job name,\n",
"# then append a short hash of the query sequence\n",
"basejobname = re.sub(r'\\W+', '', \"\".join(jobname.split()))\n",
"jobname = add_hash(basejobname, query_sequence)\n",
"\n",
"# check if directory with jobname exists\n",
"def check(folder):\n",
"  \"\"\"Return True if `folder` does not exist yet, else False.\"\"\"\n",
"  return not os.path.exists(folder)\n",
"# when the plain jobname is taken, use the first free \"<jobname>_<n>\" instead\n",
"if not check(jobname):\n",
"  n = 0\n",
"  while True:\n",
"    candidate = f\"{jobname}_{n}\"\n",
"    if check(candidate):\n",
"      break\n",
"    n += 1\n",
"  jobname = candidate\n",
"\n",
"# create the directory that receives the results\n",
"os.makedirs(jobname, exist_ok=True)\n",
"\n",
"# store the query as a CSV with a header line and a single record\n",
"queries_path = os.path.join(jobname, f\"{jobname}.csv\")\n",
"with open(queries_path, \"w\") as text_file:\n",
"  text_file.write(f\"id,sequence\\n{jobname},{query_sequence}\")\n",
"\n",
"# templates are used for \"pdb100\" and \"custom\"; only \"custom\" needs a local folder\n",
"use_templates = template_mode in (\"pdb100\", \"custom\")\n",
"custom_template_path = None\n",
"if template_mode == \"custom\":\n",
"  custom_template_path = os.path.join(jobname, \"template\")\n",
"  os.makedirs(custom_template_path, exist_ok=True)\n",
"  uploaded = files.upload()\n",
"  # move every uploaded file into the template folder\n",
"  for fn in uploaded:\n",
"    os.rename(fn, os.path.join(custom_template_path, fn))\n",
"\n",
"# summary of the job that was set up\n",
"for label, value in (\n",
"  (\"jobname\", jobname),\n",
"  (\"sequence\", query_sequence),\n",
"  (\"length\", len(query_sequence.replace(\":\",\"\"))),\n",
"):\n",
"  print(label, value)\n",
"\n"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"jobname test_21d06\n",
"sequence AAVALLPAVLLALLAVTDQLGEDFFAVDLEAFLQEFGLLPEKE\n",
"length 43\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"#@title Install dependencies\n",
"%%time\n",
"import os\n",
"# copy the settings chosen in the first cell under the names used below\n",
"USE_AMBER = use_amber\n",
"USE_TEMPLATES = use_templates\n",
"PYTHON_VERSION = python_version\n",
"\n",
"# every finished step leaves a *_READY marker file, so re-running this cell skips it\n",
"if not os.path.isfile(\"COLABFOLD_READY\"):\n",
"  print(\"installing colabfold...\")\n",
"  os.system(\"pip install -q --no-warn-conflicts 'colabfold[alphafold-minus-jax] @ git+https://github.com/sokrypton/ColabFold'\")\n",
"  # when TPU_NAME is set, replace jax/jaxlib with the pinned versions\n",
"  if 'TPU_NAME' in os.environ:\n",
"    for cmd in (\n",
"      \"pip uninstall -y jax jaxlib\",\n",
"      \"pip install --no-warn-conflicts --upgrade dm-haiku==0.0.10 'jax[cuda12_pip]'==0.3.25 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html\",\n",
"    ):\n",
"      os.system(cmd)\n",
"  # link the installed packages into the working directory and mark the step as done\n",
"  for cmd in (\n",
"    \"ln -s /usr/local/lib/python3.*/dist-packages/colabfold colabfold\",\n",
"    \"ln -s /usr/local/lib/python3.*/dist-packages/alphafold alphafold\",\n",
"    \"touch COLABFOLD_READY\",\n",
"  ):\n",
"    os.system(cmd)\n",
"\n",
"# conda (mamba) is only needed for the amber and template tool chains\n",
"if (USE_AMBER or USE_TEMPLATES) and not os.path.isfile(\"CONDA_READY\"):\n",
"  print(\"installing conda...\")\n",
"  for cmd in (\n",
"    \"wget -qnc https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh\",\n",
"    \"bash Mambaforge-Linux-x86_64.sh -bfp /usr/local\",\n",
"    \"mamba config --set auto_update_conda false\",\n",
"    \"touch CONDA_READY\",\n",
"  ):\n",
"    os.system(cmd)\n",
"\n",
"# install hhsuite and/or amber; when both are missing they share one mamba call\n",
"need_hhsuite = USE_TEMPLATES and not os.path.isfile(\"HH_READY\")\n",
"need_amber = USE_AMBER and not os.path.isfile(\"AMBER_READY\")\n",
"if need_hhsuite and need_amber:\n",
"  print(\"installing hhsuite and amber...\")\n",
"  os.system(f\"mamba install -y -c conda-forge -c bioconda kalign2=2.04 hhsuite=3.3.0 openmm=7.7.0 python='{PYTHON_VERSION}' pdbfixer\")\n",
"  os.system(\"touch HH_READY\")\n",
"  os.system(\"touch AMBER_READY\")\n",
"elif need_hhsuite:\n",
"  print(\"installing hhsuite...\")\n",
"  os.system(f\"mamba install -y -c conda-forge -c bioconda kalign2=2.04 hhsuite=3.3.0 python='{PYTHON_VERSION}'\")\n",
"  os.system(\"touch HH_READY\")\n",
"elif need_amber:\n",
"  print(\"installing amber...\")\n",
"  os.system(f\"mamba install -y -c conda-forge openmm=7.7.0 python='{PYTHON_VERSION}' pdbfixer\")\n",
"  os.system(\"touch AMBER_READY\")"
],
"metadata": {
"cellView": "form",
"id": "AzIKiDiCaHAn",
"outputId": "17095677-05ce-4e08-efa1-c70bd9f81012",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"installing colabfold...\n",
"CPU times: user 238 ms, sys: 28.3 ms, total: 266 ms\n",
"Wall time: 1min 2s\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"#@markdown ### MSA options (custom MSA upload, single sequence, pairing mode)\n",
"msa_mode = \"mmseqs2_uniref_env\" #@param [\"mmseqs2_uniref_env\", \"mmseqs2_uniref\",\"single_sequence\",\"custom\"]\n",
"pair_mode = \"unpaired_paired\" #@param [\"unpaired_paired\",\"paired\",\"unpaired\"] {type:\"string\"}\n",
"#@markdown - \"unpaired_paired\" = pair sequences from same species + unpaired MSA, \"unpaired\" = separate MSA for each chain, \"paired\" = only use paired sequences.\n",
"\n",
"# decide which a3m to use\n",
"if \"mmseqs2\" in msa_mode:\n",
"  # only the target path is set here; nothing is written in this cell\n",
"  a3m_file = os.path.join(jobname,f\"{jobname}.a3m\")\n",
"\n",
"elif msa_mode == \"custom\":\n",
"  a3m_file = os.path.join(jobname,f\"{jobname}.custom.a3m\")\n",
"  if not os.path.isfile(a3m_file):\n",
"    import fileinput\n",
"    custom_msa_dict = files.upload()\n",
"    if not custom_msa_dict:\n",
"      raise ValueError(\"no custom MSA file was uploaded\")\n",
"    custom_msa = list(custom_msa_dict.keys())[0]\n",
"    header = 0\n",
"    # Rewrite the uploaded file in place: with inplace=True everything printed\n",
"    # inside the loop becomes the new file content. The `with` block makes sure\n",
"    # the file is closed and stdout is restored even if the loop raises.\n",
"    with fileinput.FileInput(custom_msa, inplace=True) as msa_lines:\n",
"      for line in msa_lines:\n",
"        if line.startswith(\">\"):\n",
"          header = header + 1\n",
"        if not line.rstrip():\n",
"          # blank lines are dropped from the rewritten file\n",
"          continue\n",
"        if header == 1 and not line.startswith(\">\"):\n",
"          # sequence line(s) of the first record replace the query sequence\n",
"          query_sequence = line.rstrip()\n",
"        print(line, end='')\n",
"\n",
"    os.rename(custom_msa, a3m_file)\n",
"    queries_path=a3m_file\n",
"    print(f\"moving {custom_msa} to {a3m_file}\")\n",
"\n",
"else:\n",
"  # single-sequence mode: write an a3m that contains only the query\n",
"  a3m_file = os.path.join(jobname,f\"{jobname}.single_sequence.a3m\")\n",
"  with open(a3m_file, \"w\") as text_file:\n",
"    text_file.write(\">1\\n%s\" % query_sequence)"
],
"metadata": {
"cellView": "form",
"id": "C2_sh2uAonJH"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@markdown ### Advanced settings\n",
"model_type = \"auto\" #@param [\"auto\", \"alphafold2_ptm\", \"alphafold2_multimer_v1\", \"alphafold2_multimer_v2\", \"alphafold2_multimer_v3\", \"deepfold_v1\"]\n",
"#@markdown - if `auto` selected, will use `alphafold2_ptm` for monomer prediction and `alphafold2_multimer_v3` for complex prediction.\n",
"#@markdown Any of the model_types can be used (regardless of whether the input is a monomer or a complex).\n",
"num_recycles = \"3\" #@param [\"auto\", \"0\", \"1\", \"3\", \"6\", \"12\", \"24\", \"48\"]\n",
"#@markdown - if `auto` selected, will use `num_recycles=20` if `model_type=alphafold2_multimer_v3`, else `num_recycles=3` .\n",
"recycle_early_stop_tolerance = \"auto\" #@param [\"auto\", \"0.0\", \"0.5\", \"1.0\"]\n",
"#@markdown - if `auto` selected, will use `tol=0.5` if `model_type=alphafold2_multimer_v3` else `tol=0.0`.\n",
"relax_max_iterations = 200 #@param [0, 200, 2000] {type:\"raw\"}\n",
"#@markdown - max amber relax iterations, `0` = unlimited (AlphaFold2 default, can take very long)\n",
"pairing_strategy = \"greedy\" #@param [\"greedy\", \"complete\"] {type:\"string\"}\n",
"#@markdown - `greedy` = pair any taxonomically matching subsets, `complete` = all sequences have to match in one line.\n",
"\n",
"\n",
"#@markdown #### Sample settings\n",
"#@markdown - enable dropouts and increase number of seeds to sample predictions from uncertainty of the model.\n",
"#@markdown - decrease `max_msa` to increase uncertainty\n",
"max_msa = \"auto\" #@param [\"auto\", \"512:1024\", \"256:512\", \"64:128\", \"32:64\", \"16:32\"]\n",
"num_seeds = 1 #@param [1,2,4,8,16] {type:\"raw\"}\n",
"use_dropout = False #@param {type:\"boolean\"}\n",
"\n",
"# the form value \"auto\" is passed on as None; anything else is converted to a number\n",
"if num_recycles == \"auto\":\n",
"  num_recycles = None\n",
"else:\n",
"  num_recycles = int(num_recycles)\n",
"\n",
"if recycle_early_stop_tolerance == \"auto\":\n",
"  recycle_early_stop_tolerance = None\n",
"else:\n",
"  recycle_early_stop_tolerance = float(recycle_early_stop_tolerance)\n",
"\n",
"if max_msa == \"auto\":\n",
"  max_msa = None\n",
"\n",
"#@markdown #### Save settings\n",
"save_all = False #@param {type:\"boolean\"}\n",
"save_recycles = False #@param {type:\"boolean\"}\n",
"save_to_google_drive = False #@param {type:\"boolean\"}\n",
"#@markdown - if the save_to_google_drive option was selected, the result zip will be uploaded to your Google Drive\n",
"dpi = 200 #@param {type:\"integer\"}\n",
"#@markdown - set dpi for image resolution\n",
"\n",
"# log into Google Drive only when the result zip should be uploaded there\n",
"if save_to_google_drive:\n",
"  from google.colab import auth\n",
"  from oauth2client.client import GoogleCredentials\n",
"  from pydrive2.auth import GoogleAuth\n",
"  from pydrive2.drive import GoogleDrive\n",
"\n",
"  # authenticate the Colab user, then hand the default credentials to PyDrive2\n",
"  auth.authenticate_user()\n",
"  gauth = GoogleAuth()\n",
"  gauth.credentials = GoogleCredentials.get_application_default()\n",
"  drive = GoogleDrive(gauth)\n",
"  print(\"You are logged into Google Drive and are good to go!\")\n",
"\n",
"#@markdown Don't forget to hit `Runtime` -> `Run all` after updating the form."
],
"metadata": {
"cellView": "form",
"id": "ADDuaolKmjGW"
},
"execution_count": 4,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Run Prediction\n",
"display_images = True #@param {type:\"boolean\"}\n",
"\n",
"import sys\n",
"import warnings\n",
"warnings.simplefilter(action='ignore', category=FutureWarning)\n",
"from Bio import BiopythonDeprecationWarning\n",
"warnings.simplefilter(action='ignore', category=BiopythonDeprecationWarning)\n",
"from pathlib import Path\n",
"from colabfold.download import download_alphafold_params, default_data_dir\n",
"from colabfold.utils import setup_logging\n",
"from colabfold.batch import get_queries, run, set_model_type\n",
"from colabfold.plot import plot_msa_v2\n",
"\n",
"import os\n",
"import numpy as np\n",
"# count the \"Tesla K80\" lines reported by nvidia-smi\n",
"try:\n",
"  K80_chk = os.popen('nvidia-smi | grep \"Tesla K80\" | wc -l').read()\n",
"except Exception:\n",
"  # best effort only: treat any failure as \"no K80 found\", but do not swallow\n",
"  # KeyboardInterrupt/SystemExit the way a bare `except:` would\n",
"  K80_chk = \"0\"\n",
"if \"1\" in K80_chk:\n",
"  print(\"WARNING: found GPU Tesla K80: limited to total length < 1000\")\n",
"  # remove both memory-related settings from the environment if they are set\n",
"  os.environ.pop(\"TF_FORCE_UNIFIED_MEMORY\", None)\n",
"  os.environ.pop(\"XLA_PYTHON_CLIENT_MEM_FRACTION\", None)\n",
"\n",
"from colabfold.colabfold import plot_protein\n",
"from pathlib import Path\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# pdbfixer only imports when this site-packages directory is on sys.path,\n",
"# so put it first if amber is used and the entry is still missing\n",
"site_packages_dir = f\"/usr/local/lib/python{python_version}/site-packages/\"\n",
"if use_amber and site_packages_dir not in sys.path:\n",
"  sys.path.insert(0, site_packages_dir)\n",
"\n",
"def input_features_callback(input_features):\n",
"  \"\"\"Plot the MSA of `input_features` when `display_images` is enabled.\"\"\"\n",
"  if not display_images:\n",
"    return\n",
"  plot_msa_v2(input_features)\n",
"  plt.show()\n",
"  plt.close()\n",
"\n",
"def prediction_callback(protein_obj, length,\n",
"                        prediction_result, input_features, mode):\n",
"  \"\"\"Plot an unrelaxed prediction when `display_images` is enabled.\n",
"\n",
"  `mode` is unpacked as (model_name, relaxed); relaxed results are not plotted.\n",
"  \"\"\"\n",
"  model_name, relaxed = mode\n",
"  if relaxed or not display_images:\n",
"    return\n",
"  plot_protein(protein_obj, Ls=length, dpi=150)\n",
"  plt.show()\n",
"  plt.close()\n",
"\n",
"# results and the log file both go into the job directory\n",
"result_dir = jobname\n",
"log_filename = os.path.join(jobname,\"log.txt\")\n",
"setup_logging(Path(log_filename))\n",
"\n",
"queries, is_complex = get_queries(queries_path)\n",
"model_type = set_model_type(is_complex, model_type)\n",
"\n",
"# the cluster profile is switched off only for multimer models with an explicit max_msa\n",
"use_cluster_profile = not (\"multimer\" in model_type and max_msa is not None)\n",
"\n",
"download_alphafold_params(model_type, Path(\".\"))\n",
"results = run(\n",
"  # input and output locations\n",
"  queries=queries,\n",
"  result_dir=result_dir,\n",
"  data_dir=Path(\".\"),\n",
"  is_complex=is_complex,\n",
"  keep_existing_results=False,\n",
"  zip_results=False,\n",
"  user_agent=\"colabfold/google-colab-main\",\n",
"  # MSA and templates\n",
"  msa_mode=msa_mode,\n",
"  pair_mode=pair_mode,\n",
"  pairing_strategy=pairing_strategy,\n",
"  max_msa=max_msa,\n",
"  use_cluster_profile=use_cluster_profile,\n",
"  use_templates=use_templates,\n",
"  custom_template_path=custom_template_path,\n",
"  # model and sampling\n",
"  model_type=model_type,\n",
"  num_models=5,\n",
"  model_order=[1,2,3,4,5],\n",
"  num_recycles=num_recycles,\n",
"  recycle_early_stop_tolerance=recycle_early_stop_tolerance,\n",
"  num_seeds=num_seeds,\n",
"  use_dropout=use_dropout,\n",
"  rank_by=\"auto\",\n",
"  stop_at_score=100.0,\n",
"  # relaxation\n",
"  num_relax=num_relax,\n",
"  relax_max_iterations=relax_max_iterations,\n",
"  # saving and plotting\n",
"  save_all=save_all,\n",
"  save_recycles=save_recycles,\n",
"  dpi=dpi,\n",
"  prediction_callback=prediction_callback,\n",
"  input_features_callback=input_features_callback,\n",
")\n",
"# pack the whole job directory into one archive\n",
"results_zip = f\"{jobname}.result.zip\"\n",
"os.system(f\"zip -r {results_zip} {jobname}\")"
],
"metadata": {
"cellView": "form",
"id": "mbaIO9pWjaN0",
"collapsed": true,
"outputId": "ff931b5c-704e-40bd-d54a-eb527361150f",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"Downloading alphafold2_ptm weights to .: 83%|████████▎ | 2.86G/3.47G [02:18<00:28, 22.9MB/s]"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "KK7X9T44pWb7",
"cellView": "form"
},
"source": [
"#@title Display 3D structure {run: \"auto\"}\n",
"import py3Dmol\n",
"import glob\n",
"import matplotlib.pyplot as plt\n",
"from colabfold.colabfold import plot_plddt_legend\n",
"from colabfold.colabfold import pymol_color_list, alphabet_list\n",
"rank_num = 1 #@param [\"1\", \"2\", \"3\", \"4\", \"5\"] {type:\"raw\"}\n",
"color = \"lDDT\" #@param [\"chain\", \"lDDT\", \"rainbow\"]\n",
"show_sidechains = False #@param {type:\"boolean\"}\n",
"show_mainchains = False #@param {type:\"boolean\"}\n",
"\n",
"# model tag of the requested rank (rank_num counts from 1)\n",
"tag = results[\"rank\"][0][rank_num - 1]\n",
"# file names of custom-MSA jobs carry an extra \".custom\" after the jobname\n",
"if msa_mode == \"custom\":\n",
"  jobname_prefix = \".custom\"\n",
"else:\n",
"  jobname_prefix = \"\"\n",
"pdb_filename = f\"{jobname}/{jobname}{jobname_prefix}_unrelaxed_{tag}.pdb\"\n",
"pdb_file = glob.glob(pdb_filename)\n",
"\n",
"def show_pdb(rank_num=1, show_sidechains=False, show_mainchains=False, color=\"lDDT\"):\n",
"  \"\"\"Return a py3Dmol view of the first file listed in the global `pdb_file`.\n",
"\n",
"  rank_num        -- kept for interface compatibility; not used to pick the file\n",
"  show_sidechains -- add the white-carbon side-chain sticks/spheres\n",
"  show_mainchains -- add white-carbon sticks for the C, O, N and CA atoms\n",
"  color           -- \"lDDT\" (B-factor gradient, 50-90), \"rainbow\" or \"chain\"\n",
"\n",
"  Raises FileNotFoundError if `pdb_file` is empty.\n",
"  \"\"\"\n",
"  if not pdb_file:\n",
"    raise FileNotFoundError(\"no PDB file found for the selected rank\")\n",
"\n",
"  view = py3Dmol.view(js='https://3dmol.org/build/3Dmol.js',)\n",
"  # read through a context manager so the file handle is closed right away\n",
"  with open(pdb_file[0],'r') as pdb_handle:\n",
"    view.addModel(pdb_handle.read(),'pdb')\n",
"\n",
"  if color == \"lDDT\":\n",
"    view.setStyle({'cartoon': {'colorscheme': {'prop':'b','gradient': 'roygb','min':50,'max':90}}})\n",
"  elif color == \"rainbow\":\n",
"    view.setStyle({'cartoon': {'color':'spectrum'}})\n",
"  elif color == \"chain\":\n",
"    chains = (len(queries[0][1]) + 1) if is_complex else 1\n",
"    # zip stops at its shortest input, so at most `chains` chains are styled;\n",
"    # the loop variable has its own name so the `color` argument is not overwritten\n",
"    for _, chain, chain_color in zip(range(chains),alphabet_list,pymol_color_list):\n",
"      view.setStyle({'chain':chain},{'cartoon': {'color':chain_color}})\n",
"\n",
"  if show_sidechains:\n",
"    BB = ['C','O','N']\n",
"    view.addStyle({'and':[{'resn':[\"GLY\",\"PRO\"],'invert':True},{'atom':BB,'invert':True}]},\n",
"                  {'stick':{'colorscheme':\"WhiteCarbon\",'radius':0.3}})\n",
"    view.addStyle({'and':[{'resn':\"GLY\"},{'atom':'CA'}]},\n",
"                  {'sphere':{'colorscheme':\"WhiteCarbon\",'radius':0.3}})\n",
"    view.addStyle({'and':[{'resn':\"PRO\"},{'atom':['C','O'],'invert':True}]},\n",
"                  {'stick':{'colorscheme':\"WhiteCarbon\",'radius':0.3}})\n",
"  if show_mainchains:\n",
"    BB = ['C','O','N','CA']\n",
"    view.addStyle({'atom':BB},{'stick':{'colorscheme':\"WhiteCarbon\",'radius':0.3}})\n",
"\n",
"  view.zoomTo()\n",
"  return view\n",
"\n",
"view = show_pdb(rank_num, show_sidechains, show_mainchains, color)\n",
"view.show()\n",
"# the pLDDT legend is shown only together with the lDDT colouring\n",
"if color == \"lDDT\":\n",
"  plot_plddt_legend().show()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "11l8k--10q0C",
"cellView": "form"
},
"source": [
"#@title Plots {run: \"auto\"}\n",
"from IPython.display import display, HTML\n",
"import base64\n",
"from html import escape\n",
"\n",
"# see: https://stackoverflow.com/a/53688522\n",
"def image_to_data_url(filename):\n",
"  \"\"\"Return the bytes of `filename` as a base64 `data:image/<ext>` URL.\n",
"\n",
"  <ext> is the text after the last dot of `filename`.\n",
"  \"\"\"\n",
"  extension = filename.rsplit('.', 1)[-1]\n",
"  with open(filename, 'rb') as image_file:\n",
"    encoded = base64.b64encode(image_file.read()).decode('utf-8')\n",
"  return f'data:image/{extension};base64,{encoded}'\n",
"\n",
"# common path prefix of this job's plot files\n",
"plot_prefix = os.path.join(jobname, f\"{jobname}{jobname_prefix}\")\n",
"\n",
"# the PAE plot may be missing; use an empty string in that case\n",
"pae_file = f\"{plot_prefix}_pae.png\"\n",
"pae = image_to_data_url(pae_file) if os.path.isfile(pae_file) else \"\"\n",
"cov = image_to_data_url(f\"{plot_prefix}_coverage.png\")\n",
"plddt = image_to_data_url(f\"{plot_prefix}_plddt.png\")\n",
"display(HTML(f\"\"\"\n",
"\n",
"