Source code for pdb_numpy.format.pdb

#!/usr/bin/env python3
# coding: utf-8

import os
import urllib.request
import logging
import numpy as np
import gzip


from .. import geom as geom
from ..model import Model
from .. import coor
from . import encode_cython as encode

# from . import encode

# Logging
logger = logging.getLogger(__name__)

FIELD_DICT = {"A": "ATOM  ", "H": "HETATM"}

CHAIN_LIST = [
    chr(i)
    for i in list(range(65, 91))
    + list(range(48, 58))
    + list(range(97, 123))
    + list(range(192, 500))
]


[docs] def parse(pdb_lines, pqr_format=False): """Parse the pdb lines and return atom information's as a dictionary Parameters ---------- pdb_lines : list list of pdb lines pqr_format : bool, optional if True, parse pqr format, by default False Returns ------- Coor Coor object """ pdb_coor = coor.Coor() # To parse hexadecimal resid: resid_base = 10 atom_index = 0 uniq_resid = -1 old_resid = -np.inf old_insert_res = " " model_num = 1 index_list = [] field_list = [] # 6 char num_resid_uniqresid_list = [] # int 5 digits (+1 with Chimera) alter_chain_insert_list = [] # 1 char name_resname_elem_list = [] # 4 / 3 char (+1 with Chimera) = 4 xyz_list = [] # real (8.3) occ_beta_list = [] # real (6.2) conection_list = {} transformation = "" symmetry = "" for line in pdb_lines: if line.startswith("CRYST1"): pdb_coor.crystal_pack = line elif line.startswith("REMARK 350 "): transformation += line elif line.startswith("REMARK 290 "): symmetry += line elif line.startswith("MODEL"): # print('Read Model {}'.format(model_num)) model_num += 1 elif line.startswith("CONECT"): # We use an index starting at 0, as pdb files start at 1 index_list = [ encode.hy36decode(5, f"{int(index):5d}") for index in line.split()[1:] ] if index_list[0] not in conection_list: conection_list[index_list[0]] = [] for index in index_list[1:]: conection_list[index_list[0]].append(index) elif line.startswith("ENDMDL") or line.startswith("END"): if len(field_list) > 0: local_model = Model() local_model.atom_dict = { "field": np.array(field_list, dtype="|U1"), "num_resid_uniqresid": np.array( num_resid_uniqresid_list, dtype="int32" ), "name_resname_elem": np.array(name_resname_elem_list, dtype="|U4"), "alterloc_chain_insertres": np.array( alter_chain_insert_list, dtype="|U2" ), "xyz": np.array(xyz_list, dtype="float32"), "occ_beta": np.array(occ_beta_list, dtype="float32"), } if ( len(pdb_coor.models) >= 1 and local_model.len != pdb_coor.models[-1].len ): logger.warning( f"The atom number is not the same in the model {len(pdb_coor.models)-1} and the model {len(pdb_coor.models)}." "\nSkip this model." ) else: pdb_coor.models.append(local_model) atom_index = 0 uniq_resid = -1 old_resid = -np.inf old_insert_res = " " model_num = 1 index_list = [] field_list = [] # 6 char num_resid_uniqresid_list = [] # int 5 digits (+1 with Chimera) alter_chain_insert_list = [] # 1 char name_resname_elem_list = [] # 4 / 3 char (+1 with Chimera) = 4 xyz_list = [] # real (8.3) occ_beta_list = [] # real (6.2) elif line.startswith("ATOM") or line.startswith("HETATM"): field = line[:6].strip() atom_num = encode.hy36decode(5, line[6:11]) atom_name = line[12:16].strip() res_name = line[17:20].strip() chain = line[21] # To parse hexadecimal resid: resid = encode.hy36decode(4, line[22:26]) # If resid is hexadecimal, resid_base is set to 16 if resid >= 9999: resid_base = 16 insert_res = line[26:27].strip() xyz = [float(line[30:38]), float(line[38:46]), float(line[46:54])] if pqr_format: alter_loc = "" res_name = line[16:20].strip() occ, beta = line[54:62].strip(), line[62:70].strip() elem_symbol = "" else: alter_loc = line[16:17].strip() res_name = line[17:21].strip() occ, beta = line[54:60].strip(), line[60:66].strip() elem_symbol = line[76:78].strip() if occ == "": occ = 0.0 else: occ = float(occ) if beta == "": beta = 0.0 else: beta = float(beta) if resid != old_resid or insert_res != old_insert_res: uniq_resid += 1 old_resid = resid old_insert_res = insert_res field_list.append(field[0]) num_resid_uniqresid_list.append([atom_num, resid, uniq_resid]) index_list.append(atom_index) name_resname_elem_list.append([atom_name, res_name, elem_symbol]) alter_chain_insert_list.append([alter_loc, chain, insert_res]) xyz_list.append(xyz) occ_beta_list.append([occ, beta]) atom_index += 1 if len(field_list) > 0: logger.warning("No ENDMDL in the pdb file.") local_model = Model() local_model.atom_dict = { "field": np.array(field_list, dtype="|U1"), "num_resid_uniqresid": np.array(num_resid_uniqresid_list, dtype="int32"), "name_resname_elem": np.array(name_resname_elem_list, dtype="|U4"), "alterloc_chain_insertres": np.array(alter_chain_insert_list, dtype="|U2"), "xyz": np.array(xyz_list, dtype="float32"), "occ_beta": np.array(occ_beta_list, dtype="float32"), } if len(pdb_coor.models) >= 1 and local_model.len != pdb_coor.models[-1].len: logger.warning( f"The atom number is not the same in the model {len(pdb_coor.models)-1} and the model {len(pdb_coor.models)}." "\nSkip this model." ) else: pdb_coor.models.append(local_model) if transformation != "": pdb_coor.transformation = parse_transformation(transformation) if symmetry != "": pdb_coor.symmetry = parse_symmetry(symmetry) pdb_coor.conect = conection_list return pdb_coor
[docs] def parse_transformation(text): """Parse the `REMARK 350 BIOMT` information from a pdb file. Parameters ---------- text : str pdb file Returns ------- symetry_dict : dict symetry information """ transformation_dict = {} for line in text.split("\n"): if line[11:23] == "BIOMOLECULE:": biomol = int(line[24:]) transformation_dict[biomol] = {"chains": [], "matrix": []} elif line[34:41] == "CHAINS:": transformation_dict[biomol]["chains"] += [ chain.strip() for chain in line[42:].split(",") ] elif line.startswith("REMARK 350 BIOMT"): transformation_dict[biomol]["matrix"] += [ [float(x) for x in line[19:].split()] ] return transformation_dict
[docs] def parse_symmetry(text): """Parse the `REMARK 290 SMTRY` information from a pdb file. Parameters ---------- text : str pdb file Returns ------- symetry_dict : dict symetry information """ symmetry_dict = {"matrix": []} for line in text.split("\n"): if line.startswith("REMARK 290 SMTRY"): symmetry_dict["matrix"] += [[float(x) for x in line[19:].split()]] return symmetry_dict
[docs] def fetch(pdb_ID): """Get a pdb file from the PDB using its ID and return a Coor object. Parameters ---------- pdb_ID : str pdb ID Returns ------- Coor Coor object Examples -------- >>> prot_coor = Coor() >>> prot_coor.get_PDB('3EAM') """ # Get the pdb file from the PDB: with urllib.request.urlopen( f"http://files.rcsb.org/download/{pdb_ID}.pdb" ) as response: pdb_lines = response.read().decode("utf-8").splitlines(True) return parse(pdb_lines)
[docs] def fetch_BioAssembly(pdb_ID, index=1): """Get a Bio Assembly pdb file from the PDB using its ID and return a Coor object. Parameters ---------- pdb_ID : str pdb ID index : int Bio Assembly index Returns ------- Coor Coor object Examples -------- >>> prot_coor = Coor() >>> prot_coor.get_PDB('3EAM') """ # Get the pdb file from the PDB: req = urllib.request.Request( f"http://files.rcsb.org/download/{pdb_ID}.pdb{index}.gz" ) req.add_header("Accept-Encoding", "gzip") with urllib.request.urlopen(req) as response: pdb_lines = gzip.decompress(response.read()).decode("utf-8").splitlines(True) return parse(pdb_lines)
[docs] def convert_chain_2_letter(chain: str) -> str: """For Coor coming from `.mmcif` format, chain ID can be 2 letters long. It has to converted to one letter long in `.pdb` format. Parameters ---------- chain : str chain ID Returns ------- str chain ID in one letter long """ # print(f"convert_chain_2_letter() chain: {chain}") count = 0 base = 65 for i, letter in enumerate(chain): if i > 0: base = 64 count += 26**i * (ord(letter) - base) if 0 < count < len(CHAIN_LIST): return CHAIN_LIST[count] else: return "0"
[docs] def get_pdb_string(pdb_coor): """Return a coor object as a pdb string. Parameters ---------- pdb_coor : Coor Coor object Returns ------- str Coor object as a pdb string Examples -------- >>> prot_coor = Coor() >>> prot_coor.read_file(os.path.join(TEST_PATH, '1y0m.pdb'))\ #doctest: +ELLIPSIS Succeed to read file ...1y0m.pdb , 648 atoms found >>> pdb_str = prot_coor.get_structure_string() >>> print(f'Number of caracters: {len(pdb_str)}') Number of caracters: 51264 """ str_out = "" if pdb_coor.crystal_pack != "": str_out += geom.cryst_convert(pdb_coor.crystal_pack, format_out="pdb") elif pdb_coor.data_mmCIF is not None: str_out += geom.cryst_convert_mmCIF(pdb_coor.data_mmCIF, format_out="pdb") for model_index, model in enumerate(pdb_coor.models): str_out += f"MODEL {model_index + 1:4d}\n" out_chain = "" # atom_num_gap = 0 # Replace mmcif `.` altloc by `''` and `?` insertres by `''` alterloc = [ "" if altloc == "." else altloc for altloc in model.atom_dict["alterloc_chain_insertres"][:, 0] ] insertres = [ "" if insert == "?" else insert for insert in model.atom_dict["alterloc_chain_insertres"][:, 2] ] chain_list = [ "" if chain == "?" else chain for chain in model.atom_dict["alterloc_chain_insertres"][:, 1] ] elem_symbol = [ "" if elem == "?" else elem for elem in model.atom_dict["name_resname_elem"][:, 2] ] old_chain = chain_list[0] if len(old_chain) > 1: out_chain = convert_chain_2_letter(old_chain) else: out_chain = old_chain # If resname in 3 letters or less, we add a space at the end mylen = np.vectorize(len) if max(mylen(model.atom_dict["name_resname_elem"][:, 1])) <= 3: resname = np.char.add(model.atom_dict["name_resname_elem"][:, 1], " ") else: resname = model.atom_dict["name_resname_elem"][:, 1] for i in range(model.len): # Atom name should start at column 14, with the type of atom ex: # - with atom type 'C': ' CH3' # for 2 letters atom type, it should start at column 13 ex: # - with atom type 'FE': 'FE1' num = model.atom_dict["num_resid_uniqresid"][i, 0] name = model.atom_dict["name_resname_elem"][i, 0].astype(np.str_) if len(name) <= 3 and name[0] in ["C", "H", "O", "N", "S", "P"]: name = " " + name # chain = model.atom_dict["alterloc_chain_insertres"][i, 1].astype(np.str_) if chain_list[i] != old_chain: # str_out += f"TER {encode.hy36encode(5, i + 1 + atom_num_gap):5s} {resname[i-1]:>4s}{out_chain:1s}{resid:>4s}\n" str_out += f"TER {encode.hy36encode(5, num-1):5s} {resname[i-1]:>4s}{out_chain:1s}{resid:>4s}\n" # atom_num_gap += 1 old_chain = chain_list[i] if len(old_chain) > 1: out_chain = convert_chain_2_letter(old_chain) else: out_chain = old_chain # To use resid > 9999, we need to convert the resid in hexadecimal format resid = model.atom_dict["num_resid_uniqresid"][i, 1] if resid > 9999: resid = encode.hy36encode(4, resid) else: resid = str(resid) # Note : Here we use 4 letter residue name. str_out += ( "{:6s}{:5s} {:4s}{:1s}{:>4s}{:1s}{:>4s}{:1s}" " {:8.3f}{:8.3f}{:8.3f}{:6.2f}{:6.2f}" " {:>2s}\n".format( FIELD_DICT[model.atom_dict["field"][i]], # encode.hy36encode(5, i + 1 + atom_num_gap), encode.hy36encode(5, num), name, alterloc[i], resname[i], out_chain, resid, insertres[i], model.atom_dict["xyz"][i, 0], model.atom_dict["xyz"][i, 1], model.atom_dict["xyz"][i, 2], model.atom_dict["occ_beta"][i, 0], model.atom_dict["occ_beta"][i, 1], elem_symbol[i], ) ) # str_out += f"TER {encode.hy36encode(5, i + 1 + atom_num_gap):5s} {resname[i-1]:>4s}{out_chain:1s}{resid:>4s}\n" str_out += f"TER {encode.hy36encode(5, num):5s} {resname[i-1]:>4s}{out_chain:1s}{resid:>4s}\n" str_out += "ENDMDL\n" if len(pdb_coor.conect) > 0: for atom_index in sorted(pdb_coor.conect.keys()): connected_atoms = pdb_coor.conect[atom_index] # PDB format allows up to 4 bonded atom serial numbers per CONECT line. for i in range(0, len(connected_atoms), 4): line = "CONECT" + f"{encode.hy36encode(5, atom_index):>5s}" for connected_atom in connected_atoms[i : i + 4]: line += f"{encode.hy36encode(5, connected_atom):>5s}" str_out += line + "\n" str_out += "END\n" return str_out
[docs] def write(coor, pdb_out, overwrite=False): """Write a pdb file. Parameters ---------- coor : Coor Coor object pdb_out : str path of the pdb file to write overwrite : bool, optional, default=False flag to overwrite or not if file has already been created. Returns ------- None Examples -------- >>> prot_coor = Coor(os.path.join(TEST_PATH, '1y0m.pdb')) >>> prot_coor.write_pdb(os.path.join(TEST_OUT, 'tmp.pdb')) Succeed to save file tmp.pdb """ if not overwrite and os.path.exists(pdb_out): logger.warning(f"PDB file {pdb_out} already exist, file not saved") return filout = open(pdb_out, "w") filout.write(get_pdb_string(coor)) filout.close() logger.info(f"Succeed to save file {os.path.relpath(pdb_out)}") return