Readme + notebook cleanup

fteufel · Sep 8, 2022 · 36dd9e1 · 36dd9e1
1 parent 4caa27e
commit 36dd9e1
Show file tree

Hide file tree

Showing 3 changed files with 302 additions and 516 deletions.
diff --git a/README.md b/README.md
@@ -1,2 +1,18 @@
 # alphafold-peptide-receptors
 Identifying peptide-receptor interactions using AlphaFold-Multimer
+
+
+## Prerequisites
+
+- Installation of AlphaFold 2.2.0 - we used the docker-free version provided in https://github.com/kalininalab/alphafold_non_docker
+- Because we split MSA generation from prediction, copy `af_scripts/run_alphafold_msaonly.py` into the AlphaFold root directory (the one that contains `run_alphafold.py`). This script runs only the data generation pipeline and skips the neural network execution.
+
+
+## Run AlphaFold
+- Execute `af_scripts/precompute_msas.py` to generate all MSAs. The working directory must be the AlphaFold root directory. To change the data or run parameters, modify the variables on lines 14 to 21.
+- Execute `af_scripts/predict_from_precomputed.py` to predict all pairwise complexes. Modify the variables on lines 18 to 26 if you changed the data or MSA directories. The script is meant to be executed on a GPU node and spawns multiple AlphaFold processes in parallel. Modify `GPU_AVAILABLE` starting from line 34 to match your GPU setup (the default assumes 8 GPUs are available).
+
+
+## Rank receptors
+
+- The function to extract the metrics from a single AlphaFold result is defined in `qc_metrics.py`. In `benchmark.ipynb`, we apply this function to all results, aggregate the metrics, and rank the receptors. The notebook produces the results presented in the manuscript.
diff --git a/af_scripts/precompute_msas.py b/af_scripts/precompute_msas.py
@@ -9,8 +9,11 @@
 from tqdm.auto import tqdm
 from multiprocessing import Pool
 import subprocess
+from Bio import SeqIO
 
-RECEPTOR_CSV = '../alphafold-peptide-receptors/data/human_receptors.csv'
+NUM_PARALLEL = 6
+RECEPTOR_CSV = '../alphafold-peptide-receptors/data/mouse_receptors.csv'
+PEPTIDE_FASTA = '../alphafold-peptide-receptors/data/pdb_benchmark_peptides.fasta'
 MSA_DIR = '../data/msas'
 MAX_SEQ_LEN = 2000
 AF_CONFIG_STR = '--data_dir=../weights --model_preset=multimer --num_multimer_predictions_per_model=1 --max_template_date=1950-11-01 --run_relax=False --uniref90_database_path=../weights/uniref90/uniref90.fasta --mgnify_database_path=../weights/mgnify/mgy_clusters_2018_12.fa --template_mmcif_dir=../weights/pdb_mmcif/mmcif_files --obsolete_pdbs_path=../weights/pdb_mmcif/obsolete.dat --db_preset=reduced_dbs --small_bfd_database_path=../weights/small_bfd/bfd-first_non_consensus_sequences.fasta --pdb_seqres_database_path=../weights/pdb_seqres/pdb_seqres.txt --uniprot_database_path=../weights/uniprot/uniprot.fasta --use_gpu_relax=False --use_precomputed_msas=True'
@@ -58,22 +61,32 @@ def msa_job(x):
 
     # filter receptor list and create missing MSAs.
     df = pd.read_csv(RECEPTOR_CSV)
+    df = df.loc[~df['is_single_tm']]
 
     df = df.loc[df['Sequence'].str.len()<=MAX_SEQ_LEN]
 
+
     already_predicted = os.listdir(MSA_DIR)
 
     jobs = []
     for idx, row in df.iterrows():
+
+        if row['protein_id'] != 'P35456':
+            continue
         seq_name = row['protein_id']
         if seq_name in already_predicted:
             continue
         aa_sequence = row['Sequence']
         jobs.append((seq_name, aa_sequence))
 
-
+    for pep in SeqIO.parse(open(PEPTIDE_FASTA), 'fasta'):
+        _, prot_id, pos = pep.name.split('|')
+        pep_id = prot_id + ':' + pos
+        pep_seq = str(pep.seq)
+        jobs.append((pep_id, pep_seq))
+
 
-    with Pool(processes=5) as pool:
+    with Pool(processes=NUM_PARALLEL) as pool:
         for i in tqdm(pool.imap_unordered(msa_job, jobs), total=len(jobs)):
             pass