"""Class for working with MMCIF files."""

# BioPandas
# Authors: Arian Jamasb <[email protected]>,
# Authors: Sebastian Raschka <[email protected]>
@@ -69,56 +70,76 @@ def read_mmcif(self, path):
6970 self .code = self .data ["entry" ]["id" ][0 ].lower ()
7071 return self
7172
def fetch_mmcif(
    self,
    pdb_code: Optional[str] = None,
    uniprot_id: Optional[str] = None,
    source: str = "pdb",
):
    """Fetch mmCIF file contents from the Protein Databank at rcsb.org
    or the AlphaFold database at https://alphafold.ebi.ac.uk/.

    Exactly one of `pdb_code` and `uniprot_id` must be given, and it must
    match `source`. On success the fetched location and text are stored on
    `self.mmcif_path` and `self.mmcif_text`, and the data frames built from
    that text are stored on `self._df`.

    Parameters
    ----------
    pdb_code : str, optional
        A 4-letter PDB code, e.g., `"3eiy"` to retrieve structures from
        the PDB. Defaults to `None`.

    uniprot_id : str, optional
        A UniProt Identifier, e.g., `"Q5VSL9"` to retrieve structures from
        the AF2 database. Defaults to `None`.

    source : str
        The source to retrieve the structure from
        (`"pdb"`, `"alphafold2-v3"` or `"alphafold2-v4"`).
        Defaults to `"pdb"`.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If neither or both identifiers are given, if the identifier does
        not match `source`, or if `source` is not a supported value.

    """
    # AlphaFold sources and the database version each one selects.
    af2_versions = {"alphafold2-v3": 3, "alphafold2-v4": 4}

    # Sanitize input: exactly one identifier must be supplied.
    if (pdb_code is None) == (uniprot_id is None):
        raise ValueError(
            "Please provide either a PDB code or a UniProt ID."
        )

    # The identifier kind has to agree with the requested source.
    if uniprot_id is not None and source == "pdb":
        raise ValueError(
            "Please use a 'pdb_code' instead of 'uniprot_id' for source='pdb'."
        )
    elif pdb_code is not None and source in af2_versions:
        raise ValueError(
            f"Please use a 'uniprot_id' instead of 'pdb_code' for source={source}."
        )

    if source == "pdb":
        self.mmcif_path, self.mmcif_text = self._fetch_mmcif(pdb_code)
    elif source in af2_versions:
        af2_version = af2_versions[source]
        self.mmcif_path, self.mmcif_text = self._fetch_af2(
            uniprot_id, af2_version
        )
    else:
        raise ValueError(
            f"Invalid source: {source}."
            " Please use one of 'pdb', 'alphafold2-v3' or 'alphafold2-v4'."
        )

    self._df = self._construct_df(text=self.mmcif_text)
    return self
@@ -129,7 +150,8 @@ def _construct_df(self, text: str):
129150 self .data = data
130151 df : Dict [str , pd .DataFrame ] = {}
131152 full_df = pd .DataFrame .from_dict (
132- data ["atom_site" ], orient = "index" ).transpose ()
153+ data ["atom_site" ], orient = "index"
154+ ).transpose ()
133155 full_df = full_df .astype (mmcif_col_types , errors = "ignore" )
134156 df ["ATOM" ] = pd .DataFrame (full_df [full_df .group_PDB == "ATOM" ])
135157 df ["HETATM" ] = pd .DataFrame (full_df [full_df .group_PDB == "HETATM" ])
@@ -148,8 +170,9 @@ def _fetch_mmcif(pdb_code):
148170 response = urlopen (url )
149171 txt = response .read ()
150172 txt = (
151- txt .decode (
152- "utf-8" ) if sys .version_info [0 ] >= 3 else txt .encode ("ascii" )
173+ txt .decode ("utf-8" )
174+ if sys .version_info [0 ] >= 3
175+ else txt .encode ("ascii" )
153176 )
154177 except HTTPError as e :
155178 print (f"HTTP Error { e .code } " )
@@ -166,11 +189,15 @@ def _fetch_af2(uniprot_id: str, af2_version: int = 3):
166189 try :
167190 response = urlopen (url )
168191 txt = response .read ()
169- txt = txt .decode ('utf-8' ) if sys .version_info [0 ] >= 3 else txt .encode ('ascii' )
192+ txt = (
193+ txt .decode ("utf-8" )
194+ if sys .version_info [0 ] >= 3
195+ else txt .encode ("ascii" )
196+ )
170197 except HTTPError as e :
171- print (f' HTTP Error { e .code } ' )
198+ print (f" HTTP Error { e .code } " )
172199 except URLError as e :
173- print (f' URL Error { e .args } ' )
200+ print (f" URL Error { e .args } " )
174201 return url , txt
175202
176203 @staticmethod
@@ -184,7 +211,8 @@ def _read_mmcif(path):
184211 openf = gzip .open
185212 else :
186213 allowed_formats = ", " .join (
187- (".cif" , ".cif.gz" , ".mmcif" , ".mmcif.gz" ))
214+ (".cif" , ".cif.gz" , ".mmcif" , ".mmcif.gz" )
215+ )
188216 raise ValueError (
189217 f"Wrong file format; allowed file formats are { allowed_formats } "
190218 )
@@ -194,8 +222,9 @@ def _read_mmcif(path):
194222
195223 if path .endswith (".gz" ):
196224 txt = (
197- txt .decode (
198- "utf-8" ) if sys .version_info [0 ] >= 3 else txt .encode ("ascii" )
225+ txt .decode ("utf-8" )
226+ if sys .version_info [0 ] >= 3
227+ else txt .encode ("ascii" )
199228 )
200229 return path , txt
201230
@@ -271,14 +300,19 @@ def _get_mainchain(
def _get_hydrogen(df, invert):
    """Return only hydrogen atom entries from a DataFrame.

    Rows are selected by comparing the "type_symbol" column with "H".
    When `invert` is truthy the selection is flipped and every row whose
    "type_symbol" is not "H" is returned instead.
    """
    if invert:
        return df[df["type_symbol"] != "H"]
    return df[df["type_symbol"] == "H"]
277307
278308 @staticmethod
def _get_heavy(df, invert):
    """Return only heavy atom entries from a DataFrame.

    A row counts as heavy when its "type_symbol" is anything other than
    "H". When `invert` is truthy the selection is flipped and only the
    rows whose "type_symbol" equals "H" are returned.
    """
    if invert:
        return df[df["type_symbol"] == "H"]
    return df[df["type_symbol"] != "H"]
282316
283317 @staticmethod
284318 def _get_calpha (df , invert , atom_col : str = "auth_atom_id" ):
@@ -288,7 +322,11 @@ def _get_calpha(df, invert, atom_col: str = "auth_atom_id"):
288322 @staticmethod
def _get_carbon(df, invert):
    """Return carbon atom entries from a DataFrame.

    Rows are selected by comparing the "type_symbol" column with "C".
    When `invert` is truthy the selection is flipped and every row whose
    "type_symbol" is not "C" is returned instead.
    """
    if invert:
        return df[df["type_symbol"] != "C"]
    return df[df["type_symbol"] == "C"]
292330
293331 def amino3to1 (
294332 self ,
@@ -339,8 +377,9 @@ def amino3to1(
339377 indices .append (ind )
340378 cmp = num
341379
342- transl = tmp .iloc [indices ][residue_col ].map (
343- amino3to1dict ).fillna (fillna )
380+ transl = (
381+ tmp .iloc [indices ][residue_col ].map (amino3to1dict ).fillna (fillna )
382+ )
344383
345384 return pd .concat ((tmp .iloc [indices ][chain_col ], transl ), axis = 1 )
346385
@@ -425,7 +464,9 @@ def distance(self, xyz=(0.00, 0.00, 0.00), records=("ATOM", "HETATM")):
425464
426465 return np .sqrt (
427466 np .sum (
428- df [["Cartn_x" , "Cartn_y" , "Cartn_z" ]].subtract (xyz , axis = 1 ) ** 2 , axis = 1
467+ df [["Cartn_x" , "Cartn_y" , "Cartn_z" ]].subtract (xyz , axis = 1 )
468+ ** 2 ,
469+ axis = 1 ,
429470 )
430471 )
431472
@@ -451,7 +492,9 @@ def distance_df(df, xyz=(0.00, 0.00, 0.00)):
451492 """
452493 return np .sqrt (
453494 np .sum (
454- df [["Cartn_x" , "Cartn_y" , "Cartn_z" ]].subtract (xyz , axis = 1 ) ** 2 , axis = 1
495+ df [["Cartn_x" , "Cartn_y" , "Cartn_z" ]].subtract (xyz , axis = 1 )
496+ ** 2 ,
497+ axis = 1 ,
455498 )
456499 )
457500
@@ -485,7 +528,11 @@ def read_mmcif_from_list(self, mmcif_lines):
485528 self .code = self .data ["entry" ]["id" ][0 ].lower ()
486529 return self
487530
488- def convert_to_pandas_pdb (self , offset_chains : bool = True , records : List [str ] = ["ATOM" , "HETATM" ]) -> PandasPdb :
531+ def convert_to_pandas_pdb (
532+ self ,
533+ offset_chains : bool = True ,
534+ records : List [str ] = ["ATOM" , "HETATM" ],
535+ ) -> PandasPdb :
489536 """Returns a PandasPdb object with the same data as the PandasMmcif
490537 object.
491538
@@ -525,10 +572,15 @@ def convert_to_pandas_pdb(self, offset_chains: bool = True, records: List[str] =
525572
526573 # Update atom numbers
527574 if offset_chains :
528- offsets = pandaspdb .df ["ATOM" ]["chain_id" ].astype (
529- "category" ).cat .codes
530- pandaspdb .df ["ATOM" ]["atom_number" ] = pandaspdb .df ["ATOM" ]["atom_number" ] + offsets
575+ offsets = (
576+ pandaspdb .df ["ATOM" ]["chain_id" ].astype ("category" ).cat .codes
577+ )
578+ pandaspdb .df ["ATOM" ]["atom_number" ] = (
579+ pandaspdb .df ["ATOM" ]["atom_number" ] + offsets
580+ )
531581 hetatom_offset = offsets .max () + 1
532- pandaspdb .df ["HETATM" ]["atom_number" ] = pandaspdb .df ["HETATM" ]["atom_number" ] + hetatom_offset
582+ pandaspdb .df ["HETATM" ]["atom_number" ] = (
583+ pandaspdb .df ["HETATM" ]["atom_number" ] + hetatom_offset
584+ )
533585
534586 return pandaspdb
0 commit comments