add Chainsaw option for embedding mode

jgreener64 · jgreener64 · commit f31594de2a08 · 2026-01-13T16:05:54.000Z
diff --git a/README.md b/README.md
@@ -124,6 +124,7 @@ progres embed -l filepaths.txt -o searchdb.pt
 - `-l` is a text file with information on one structure per line, each of which will be one entry in the output. White space should separate the file path to the structure and the domain name, with optionally any additional text being treated as a note for the notes column of the results.
 - `-o` is the output file path for the PyTorch file containing a dictionary with the embeddings and associated data. It can be read in with `torch.load`.
 - `-f` determines the file format of each structure as above (`guess`, `pdb`, `mmcif`, `mmtf` or `coords`).
+- `-c` splits each structure into domains with Chainsaw so that each domain can be searched against separately. If Chainsaw finds no domains for a structure, that structure will not be added to the output. Only the first chain in each file is considered. Running Chainsaw may take a few seconds.
 
 Again, the structures should correspond to single protein domains.
 The embeddings are stored as Float16, which has no noticeable effect on search performance.
diff --git a/bin/progres b/bin/progres
@@ -77,6 +77,9 @@ parser_embed.add_argument("-o", "--outputfile", required=True,
 parser_embed.add_argument("-f", "--fileformat",
     choices=["guess", "pdb", "mmcif", "mmtf", "coords"], default="guess",
     help="file format of the structures, by default guessed from the file extension")
+parser_embed.add_argument("-c", "--chainsaw", default=False, action="store_true",
+    help=("split each structure into domains with Chainsaw to allow searching "
+          "against each domain separately"))
 parser_embed.add_argument("-d", "--device", default="cpu",
     help="device to run on, default is \"cpu\"")
 
@@ -107,7 +110,7 @@ def main():
     elif args.mode == "embed":
         from progres import progres_embed
         progres_embed(structurelist=args.structurelist, outputfile=args.outputfile,
-                      fileformat=args.fileformat, device=args.device)
+                      fileformat=args.fileformat, chainsaw=args.chainsaw, device=args.device)
     else:
         print("No mode selected, run \"progres -h\" to see help", file=sys.stderr)
 
diff --git a/progres/progres.py b/progres/progres.py
@@ -683,20 +683,34 @@ def progres_score_print(structure1, structure2, fileformat1="guess",
     score = progres_score(structure1, structure2, fileformat1, fileformat2, device)
     print(score)
 
-def progres_embed(structurelist, outputfile, fileformat="guess", device="cpu",
+def progres_embed(structurelist, outputfile, fileformat="guess", chainsaw=False, device="cpu",
                   batch_size=None, float_type=torch.float16):
     download_data_if_required()
 
-    fps, domids, notes = [], [], []
+    fps, domids_fp, notes_fp = [], [], []
     with open(structurelist) as f:
         for line in f.readlines():
             cols = line.strip().split(None, 2)
             fps.append(cols[0])
-            domids.append(cols[1])
-            notes.append(cols[2] if len(cols) > 2 else "-")
+            domids_fp.append(cols[1])
+            notes_fp.append(cols[2] if len(cols) > 2 else "-")
 
     model = load_trained_model(device)
-    data_set = StructureDataset(fps, fileformat, model, device)
+    data_set = StructureDataset(fps, fileformat, model, device, chainsaw)
+    if chainsaw:
+        domids, notes = [], []
+        i, dom_i = 0, 1
+        for fp, domid, note in zip(fps, domids_fp, notes_fp):
+            while data_set.file_paths[i] == fp:
+                domids.append(f"{domid}_D{dom_i}")
+                notes.append(f"{note} - domain {dom_i} ({data_set.res_ranges[i]})")
+                i += 1
+                dom_i += 1
+            dom_i = 1
+    else:
+        domids, notes = domids_fp, notes_fp
+    assert len(domids) == len(notes) == len(data_set)
+
     if batch_size is None:
         batch_size = get_batch_size(device)
     data_loader = DataLoader(