#!/usr/bin/env python3
# coding: utf-8
import numpy as np
from . import geom, select
import logging
# Logging
logger = logging.getLogger(__name__)
[docs]
def add_NH(coor):
"""Add NH atoms to a protein structure.
This function adds NH atoms to a protein structure by computing the coordinates
of the hydrogen atoms using neighboring C and CA atoms. The new NH atoms are added
to the protein structure.
Parameters
----------
coor : Coor
Protein coordinates.
Returns
-------
TO FIX IS SHOULD RETURN A NEW COOR OBJECT
"""
protein = coor.select_atoms("protein")
unique_residues = np.unique(protein.uniq_resid)
for m_i, model in enumerate(coor.models):
for i in range(len(unique_residues) - 1):
if model.resname[model.uniq_resid == unique_residues[i + 1]][0] == b"PRO":
continue
C = model.xyz[
np.logical_and(
model.name == b"C", model.uniq_resid == unique_residues[i]
)
]
CA = model.xyz[
np.logical_and(
model.name == b"CA", model.uniq_resid == unique_residues[i + 1]
)
]
N_index = np.where(
np.logical_and(
model.name == b"N", model.uniq_resid == unique_residues[i + 1]
)
)[0][0]
N = model.xyz[N_index]
C_N = N - C
C_N = C_N / np.linalg.norm(C_N)
CA_N = N - CA
CA_N = CA_N / np.linalg.norm(CA_N)
NH = C_N + CA_N
# NH = (NH / np.linalg.norm(NH)) * 0.9970 # Charmm36
NH = (NH / np.linalg.norm(NH)) * 1.01 # ?
NH += N
model.add_atom(
N_index + 1,
"H",
model.resname[N_index],
model.num[N_index],
model.resid[N_index],
model.uniq_resid[N_index],
model.chain[N_index],
NH,
model.occ[N_index],
model.beta[N_index],
model.alterloc[N_index],
model.insertres[N_index],
model.elem[N_index],
)
[docs]
def get_NH_xyz(model):
"""computes and returns the nitrogen-hydrogen (NH) coordinates of a protein.
Parameters
----------
model : Model
Model object containing protein coordinates.
Returns
-------
NH_list : numpy.ndarray
2D array of shape (n_residues, 3) containing the NH coordinates for
each residue in the protein. The first row contains None values to
align with the residue numbering in the protein.
Notes
-----
The function first selects the protein atoms excluding alternate
locations B, C, D and E. It then extracts the C, CA and N atoms
and the corresponding residue names. It computes the NH vectors for
each residue and adds them to the N atom position. Finally, the NH
coordinates are stored in a 2D numpy array where each row represents a
residue and the columns correspond to the x, y, and z coordinates. The
first row contains None values to align with the residue numbering in
the protein.
"""
protein = model.select_atoms("protein and not altloc B C D E")
unique_residues = np.unique(protein.uniq_resid)
C_array = model.select_atoms("protein and name C and not altloc B C D E").xyz
CA_array = model.select_atoms("protein and name CA and not altloc B C D E").xyz
N_array = model.select_atoms("protein and name N and not altloc B C D E").xyz
assert len(C_array) == len(unique_residues)
assert len(CA_array) == len(unique_residues)
assert len(N_array) == len(unique_residues)
resname_array = model.select_atoms(
"protein and name CA and not altloc B C D E"
).resname
NH_list = []
# Add first NH
NH_list.append([None, None, None])
for i in range(len(unique_residues) - 1):
if resname_array[i + 1] == b"PRO":
NH_list.append([None, None, None])
continue
C = C_array[i]
CA = CA_array[i + 1]
N = N_array[i + 1]
C_N = N - C
C_N = C_N / np.linalg.norm(C_N)
CA_N = N - CA
CA_N = CA_N / np.linalg.norm(CA_N)
NH = C_N + CA_N
# NH = (NH / np.linalg.norm(NH)) * 0.9970 # Charmm36
NH = (NH / np.linalg.norm(NH)) * 1.01 # ?
NH += N
NH_list.append(NH)
return np.array(NH_list)
[docs]
def hbond_energy(vec_N, vec_H, vec_O, vec_C):
r"""Compute HBond energy based on ON, CH, OH and CN distances.
.. math::
E = q_1 q_2 \left( \frac{1}{r(ON)} + \frac{l}{r(CH)} - \frac{l}{r(OH)} - \frac{l}{r(CN)} \right) f
where: :math:`q1 = 0.42 e` and :math:`q2 = 0.20 e` f is a dimensional factor :math:`f = 332`
and :math:`r` is in Angstrom.
The function returns the calculated hydrogen bond energy in units of kcal/mol.
Reference:
----------
Kabsch W and Sander C. Dictionary of protein secondary structure: pattern recognition
of hydrogen-bonded and geometrical features. *Biopolymers*. 1983 22 2577-2637.
Parameters
----------
vec_N : numpy.ndarray
Nitrogen coordinates.
vec_H : numpy.ndarray
Hydrogen coordinates.
vec_O : numpy.ndarray
Oxygen coordinates.
vec_C : numpy.ndarray
Carbon coordinates.
Returns
-------
energy : float
HBond energy (kcal/mol).
"""
rON = np.linalg.norm(vec_N - vec_O)
rCH = np.linalg.norm(vec_H - vec_C)
rOH = np.linalg.norm(vec_H - vec_O)
rCN = np.linalg.norm(vec_N - vec_C)
# 27.888 = 332 * (0.42 * 0.20)
energy = 27.888 * (1 / rON + 1 / rCH - 1 / rOH - 1 / rCN)
return energy
[docs]
def compute_bend(CA_sel):
"""Compute bend for a protein.
Parameters
----------
CA_sel : Model
AtomGroup containing only CA atoms.
Returns
-------
bend : numpy.ndarray
An array of boolean values indicating whether the corresponding
residue has a bend (True) or not (False). The length of the array is
equal to the number of residues in the protein.
Notes
-----
The bend is computed as the angle between the vectors connecting a
residue's CA atom to the CA atoms of the residues separated by two
positions in sequence. A residue is considered to have a bend if the angle
is greater than 70 degrees.
"""
n_res = len(CA_sel.uniq_resid)
bend = np.array([False for _ in range(n_res)])
for i in range(2, n_res - 2):
CA_i = CA_sel.xyz[i]
CA_i_minus_2 = CA_sel.xyz[i - 2]
CA_i_plus_2 = CA_sel.xyz[i + 2]
vec_i_1 = CA_i - CA_i_minus_2
vec_i_2 = CA_i_plus_2 - CA_i
vec_i_1 /= np.linalg.norm(vec_i_1)
vec_i_2 /= np.linalg.norm(vec_i_2)
# print(i, np.degrees(geom.angle_vec(vec_i_1, vec_i_2)))
if np.degrees(geom.angle_vec(vec_i_1, vec_i_2)) > 70.0:
bend[i] = True
return bend
[docs]
def compute_Hbond_matrix(model):
"""Compute Hbond matrix for a protein.
Parameters
----------
model : Model
model containing the protein.
Returns
-------
Hbond_mat : numpy.ndarray
A boolean matrix of shape (n_res, n_res) where n_res is the number of
Notes
-----
The cutoff distance for the Hbond neighbor search is 8 Angstrom.
"""
cutoff = 8.0
CA_array = model.select_atoms("protein and name CA and not altloc B C D E").xyz
n_res = len(CA_array)
dist_mat = geom.distance_matrix(CA_array, CA_array)
Hbond_mat = np.zeros_like(dist_mat, dtype=bool)
O_array = model.select_atoms("protein and name O and not altloc B C D E").xyz
N_array = model.select_atoms("protein and name N and not altloc B C D E").xyz
C_array = model.select_atoms("protein and name C and not altloc B C D E").xyz
H_array = get_NH_xyz(model)
assert len(O_array) == n_res
assert len(N_array) == n_res
assert len(C_array) == n_res
assert len(H_array) == n_res
# Get indexes to check
mask = dist_mat < cutoff
# Remove lower triangle and i, i+1 (k=1)
mask[np.tril_indices_from(mask, k=1)] = False
indexes = np.argwhere(mask)
for i, j in indexes:
O_i = O_array[i]
C_i = C_array[i]
N_i = N_array[i]
H_i = H_array[i]
O_j = O_array[j]
C_j = C_array[j]
N_j = N_array[j]
H_j = H_array[j]
# Compute HBond energies
if H_j[0] is not None:
energy = hbond_energy(N_j, H_j, O_i, C_i)
if energy < -0.5:
Hbond_mat[i, j] = True
if H_i[0] is not None:
energy = hbond_energy(N_i, H_i, O_j, C_j)
if energy < -0.5:
Hbond_mat[j, i] = True
return Hbond_mat
[docs]
def compute_DSSP(coor):
r"""Compute DSSP for a protein.
The compute_DSSP function takes in a coor parameter which represents
the coordinates of a protein. It computes the secondary structure of
the protein based on the input coordinates and returns a sequence of
secondary structure elements. The output sequence consists of the
following elements:
- "H" represents a 4-helix (:math:`\alpha`-helix)
- "B" represents a residue in an isolated :math:`\beta`-bridge
- "E" represents an extended strand that participates in a :math:`\beta`-ladder
- "G" represents a 3-helix (:math:`3_{10}`-helix)
- "I" represents a 5-helix (:math:`\pi`-helix)
- "T" represents an H-bonded turn
- "S" represents a bend
The function first adds NH hydrogen atoms to the protein coordinates and
selects the :math:`\alpha`-carbons of the protein. It then computes a distance
matrix between all residues and uses this to compute the secondary
structure of the protein. It first computes turns, then :math:`\beta`-sheets,
and finally assigns :math:`\alpha`-helices, :math:`\pi`-helices, and :math:`3_{10}`-helices.
Finally, it joins overlapping helices and returns the secondary structure
sequence.
Parameters
----------
coor : Coor
Coor object containing the protein coordinates.
Returns
-------
SS_seq : numpy.ndarray
A numpy array of shape (n_res,) where n_res is the number of residues
.. note::
- 96 % accuracy compared to DSSP (3eam)
- omitted β-bulge annotation
"""
cutoff = 8
# Add hydrogens atoms
# Find a way to deal with PRO
CA_sel = coor.select_atoms("protein and name CA and not altloc B C D E")
unique_residues = np.unique(CA_sel.uniq_resid)
chain_array = CA_sel.chain
n_res = len(unique_residues)
O_array = coor.select_atoms("protein and name O and not altloc B C D E").xyz
N_array = coor.select_atoms("protein and name N and not altloc B C D E").xyz
C_array = coor.select_atoms("protein and name C and not altloc B C D E").xyz
if len(O_array) != n_res or len(N_array) != n_res or len(C_array) != n_res:
logger.warning(
"Incomplete backbone atoms, removing residues with incomplete backbone atoms"
)
coor = select.remove_incomplete_backbone_residues(coor)
CA_sel = coor.select_atoms("protein and name CA and not altloc B C D E")
unique_residues = np.unique(CA_sel.uniq_resid)
chain_array = CA_sel.chain
n_res = len(unique_residues)
max_dist = 0
SS_list = []
for i, model in enumerate(coor.models):
# Compute distance matrix between all residues
Hbond_mat = compute_Hbond_matrix(model)
# Compute secondary structure
SS_seq = np.array([" " for i in range(n_res)])
H_seq = np.array([False for i in range(n_res)])
G_seq = np.array([False for i in range(n_res)])
I_seq = np.array([False for i in range(n_res)])
E_seq = np.array([False for i in range(n_res)])
S_seq = compute_bend(CA_sel.models[i])
# N-turn
for i in range(n_res - 3):
if i < n_res - 4 and Hbond_mat[i, i + 4]:
H_seq[i] = True
if Hbond_mat[i, i + 3]:
G_seq[i] = True
if i < n_res - 5 and Hbond_mat[i, i + 5]:
I_seq[i] = True
# Beta sheet
# PART TO ACCELERATE
for i in range(1, n_res - 1):
for j in range(i + 3, n_res - 1):
if (
(Hbond_mat[i - 1, j] and Hbond_mat[j, i + 1])
or (Hbond_mat[j - 1, i] and Hbond_mat[i, j + 1])
) or (
(Hbond_mat[i, j] and Hbond_mat[j, i])
or (Hbond_mat[i - 1, j + 1] and Hbond_mat[j - 1, i + 1])
):
E_seq[i] = True
E_seq[j] = True
# Assign secondary structure sequence (order follows the list above)
# Bend
for i in range(n_res):
if S_seq[i]:
SS_seq[i] = "S"
for i in range(n_res - 1):
if I_seq[i]:
SS_seq[i + 1 : i + 5] = "T"
if H_seq[i]:
SS_seq[i + 1 : i + 4] = "T"
if G_seq[i]:
SS_seq[i + 1 : i + 3] = "T"
# A minimal helix is defined by two consecutive n-turns
for i in range(1, n_res - 3):
if G_seq[i] and G_seq[i - 1]:
SS_seq[i : i + 3] = "G"
for i in range(1, n_res - 1):
if E_seq[i] and E_seq[i - 1]:
SS_seq[i - 1 : i + 1] = "E"
elif E_seq[i] and not (E_seq[i + 1] and E_seq[i - 1]):
SS_seq[i] = "B"
for i in range(1, n_res - 4):
# A minimal helix is defined by two consecutive n-turns
if H_seq[i] and H_seq[i - 1]:
SS_seq[i : i + 4] = "H"
# In 2012, DSSP was rewritten so that the assignment of π helices
# was given preference over α helices, resulting in better detection of π helices.
for i in range(1, n_res - 5):
if I_seq[i] and I_seq[i - 1]:
SS_seq[i : i + 5] = "I"
# Two overlapping minimal helices offset by two or three residues are joined
# into one helix:
for i in range(1, n_res - 2):
if SS_seq[i - 1] == "H" and (SS_seq[i + 1] == "H" or SS_seq[i + 2] == "H"):
SS_seq[i] = "H"
for i in range(1, n_res - 1):
if SS_seq[i - 1] in ["E", "B"] and (SS_seq[i + 1] in ["E", "B"]):
SS_seq[i - 1 : i + 2] = "E"
seq_dict = {}
for SS, chain in zip(SS_seq, chain_array):
if chain not in seq_dict:
seq_dict[chain] = SS
else:
seq_dict[chain] += SS
SS_list.append(seq_dict)
return SS_list