Source code for pdb_numpy.DSSP

#!/usr/bin/env python3
# coding: utf-8

import numpy as np
from . import geom, select
import logging

# Logging
logger = logging.getLogger(__name__)



[docs]
def add_NH(coor):
    """Add NH atoms to a protein structure.

    This function adds NH atoms to a protein structure by computing the coordinates
    of the hydrogen atoms using neighboring C and CA atoms. The new NH atoms are added
    to the protein structure.

    Parameters
    ----------
    coor : Coor
        Protein coordinates.


    Returns
    -------
    TO FIX IS SHOULD RETURN A NEW COOR OBJECT
    """

    protein = coor.select_atoms("protein")
    unique_residues = np.unique(protein.uniq_resid)

    for m_i, model in enumerate(coor.models):
        for i in range(len(unique_residues) - 1):
            if model.resname[model.uniq_resid == unique_residues[i + 1]][0] == b"PRO":
                continue

            C = model.xyz[
                np.logical_and(
                    model.name == b"C", model.uniq_resid == unique_residues[i]
                )
            ]
            CA = model.xyz[
                np.logical_and(
                    model.name == b"CA", model.uniq_resid == unique_residues[i + 1]
                )
            ]
            N_index = np.where(
                np.logical_and(
                    model.name == b"N", model.uniq_resid == unique_residues[i + 1]
                )
            )[0][0]
            N = model.xyz[N_index]

            C_N = N - C
            C_N = C_N / np.linalg.norm(C_N)
            CA_N = N - CA
            CA_N = CA_N / np.linalg.norm(CA_N)

            NH = C_N + CA_N
            # NH = (NH / np.linalg.norm(NH)) * 0.9970  # Charmm36
            NH = (NH / np.linalg.norm(NH)) * 1.01  # ?
            NH += N

            model.add_atom(
                N_index + 1,
                "H",
                model.resname[N_index],
                model.num[N_index],
                model.resid[N_index],
                model.uniq_resid[N_index],
                model.chain[N_index],
                NH,
                model.occ[N_index],
                model.beta[N_index],
                model.alterloc[N_index],
                model.insertres[N_index],
                model.elem[N_index],
            )




[docs]
def get_NH_xyz(model):
    """computes and returns the nitrogen-hydrogen (NH) coordinates of a protein.

    Parameters
    ----------
    model : Model
        Model object containing protein coordinates.

    Returns
    -------
    NH_list : numpy.ndarray
        2D array of shape (n_residues, 3) containing the NH coordinates for
        each residue in the protein. The first row contains None values to
        align with the residue numbering in the protein.

    Notes
    -----

    The function first selects the protein atoms excluding alternate
    locations B, C, D and E. It then extracts the C, CA and N atoms
    and the corresponding residue names. It computes the NH vectors for
    each residue and adds them to the N atom position. Finally, the NH
    coordinates are stored in a 2D numpy array where each row represents a
    residue and the columns correspond to the x, y, and z coordinates. The
    first row contains None values to align with the residue numbering in
    the protein.

    """

    protein = model.select_atoms("protein and not altloc B C D E")
    unique_residues = np.unique(protein.uniq_resid)

    C_array = model.select_atoms("protein and name C and not altloc B C D E").xyz
    CA_array = model.select_atoms("protein and name CA and not altloc B C D E").xyz
    N_array = model.select_atoms("protein and name N and not altloc B C D E").xyz

    assert len(C_array) == len(unique_residues)
    assert len(CA_array) == len(unique_residues)
    assert len(N_array) == len(unique_residues)

    resname_array = model.select_atoms(
        "protein and name CA and not altloc B C D E"
    ).resname

    NH_list = []
    # Add first NH
    NH_list.append([None, None, None])
    for i in range(len(unique_residues) - 1):
        if resname_array[i + 1] == b"PRO":
            NH_list.append([None, None, None])
            continue

        C = C_array[i]
        CA = CA_array[i + 1]
        N = N_array[i + 1]

        C_N = N - C
        C_N = C_N / np.linalg.norm(C_N)
        CA_N = N - CA
        CA_N = CA_N / np.linalg.norm(CA_N)

        NH = C_N + CA_N
        # NH = (NH / np.linalg.norm(NH)) * 0.9970  # Charmm36
        NH = (NH / np.linalg.norm(NH)) * 1.01  # ?
        NH += N

        NH_list.append(NH)

    return np.array(NH_list)




[docs]
def hbond_energy(vec_N, vec_H, vec_O, vec_C):
    r"""Compute HBond energy based on ON, CH, OH and CN distances.

    .. math::
        E = q_1 q_2 \left( \frac{1}{r(ON)} + \frac{l}{r(CH)} - \frac{l}{r(OH)} - \frac{l}{r(CN)} \right) f

    where: :math:`q1 = 0.42 e` and :math:`q2 = 0.20 e` f is a dimensional factor :math:`f = 332`
    and :math:`r` is in Angstrom.

    The function returns the calculated hydrogen bond energy in units of kcal/mol.

    Reference:
    ----------

    Kabsch W and Sander C. Dictionary of protein secondary structure: pattern recognition
    of hydrogen-bonded and geometrical features. *Biopolymers*. 1983 22 2577-2637.

    Parameters
    ----------
    vec_N : numpy.ndarray
        Nitrogen coordinates.
    vec_H : numpy.ndarray
        Hydrogen coordinates.
    vec_O : numpy.ndarray
        Oxygen coordinates.
    vec_C : numpy.ndarray
        Carbon coordinates.

    Returns
    -------
    energy : float
        HBond energy (kcal/mol).
    """

    rON = np.linalg.norm(vec_N - vec_O)
    rCH = np.linalg.norm(vec_H - vec_C)
    rOH = np.linalg.norm(vec_H - vec_O)
    rCN = np.linalg.norm(vec_N - vec_C)
    # 27.888 = 332 * (0.42 * 0.20)
    energy = 27.888 * (1 / rON + 1 / rCH - 1 / rOH - 1 / rCN)

    return energy




[docs]
def compute_bend(CA_sel):
    """Compute bend for a protein.

    Parameters
    ----------
    CA_sel : Model
        AtomGroup containing only CA atoms.

    Returns
    -------
    bend : numpy.ndarray
       An array of boolean values indicating whether the corresponding
        residue has a bend (True) or not (False). The length of the array is
        equal to the number of residues in the protein.

    Notes
    -----
    The bend is computed as the angle between the vectors connecting a
    residue's CA atom to the CA atoms of the residues separated by two
    positions in sequence. A residue is considered to have a bend if the angle
    is greater than 70 degrees.

    """

    n_res = len(CA_sel.uniq_resid)
    bend = np.array([False for _ in range(n_res)])

    for i in range(2, n_res - 2):
        CA_i = CA_sel.xyz[i]
        CA_i_minus_2 = CA_sel.xyz[i - 2]
        CA_i_plus_2 = CA_sel.xyz[i + 2]

        vec_i_1 = CA_i - CA_i_minus_2
        vec_i_2 = CA_i_plus_2 - CA_i

        vec_i_1 /= np.linalg.norm(vec_i_1)
        vec_i_2 /= np.linalg.norm(vec_i_2)

        # print(i, np.degrees(geom.angle_vec(vec_i_1, vec_i_2)))
        if np.degrees(geom.angle_vec(vec_i_1, vec_i_2)) > 70.0:
            bend[i] = True

    return bend




[docs]
def compute_Hbond_matrix(model):
    """Compute Hbond matrix for a protein.

    Parameters
    ----------
    model : Model
        model containing the protein.

    Returns
    -------
    Hbond_mat : numpy.ndarray
        A boolean matrix of shape (n_res, n_res) where n_res is the number of

    Notes
    -----
    The cutoff distance for the Hbond neighbor search is 8 Angstrom.

    """
    cutoff = 8.0

    CA_array = model.select_atoms("protein and name CA and not altloc B C D E").xyz
    n_res = len(CA_array)

    dist_mat = geom.distance_matrix(CA_array, CA_array)
    Hbond_mat = np.zeros_like(dist_mat, dtype=bool)

    O_array = model.select_atoms("protein and name O and not altloc B C D E").xyz
    N_array = model.select_atoms("protein and name N and not altloc B C D E").xyz
    C_array = model.select_atoms("protein and name C and not altloc B C D E").xyz

    H_array = get_NH_xyz(model)

    assert len(O_array) == n_res
    assert len(N_array) == n_res
    assert len(C_array) == n_res
    assert len(H_array) == n_res

    # Get indexes to check
    mask = dist_mat < cutoff
    # Remove lower triangle and i, i+1 (k=1)
    mask[np.tril_indices_from(mask, k=1)] = False
    indexes = np.argwhere(mask)

    for i, j in indexes:
        O_i = O_array[i]
        C_i = C_array[i]
        N_i = N_array[i]
        H_i = H_array[i]

        O_j = O_array[j]
        C_j = C_array[j]
        N_j = N_array[j]
        H_j = H_array[j]

        # Compute HBond energies
        if H_j[0] is not None:
            energy = hbond_energy(N_j, H_j, O_i, C_i)
            if energy < -0.5:
                Hbond_mat[i, j] = True

        if H_i[0] is not None:
            energy = hbond_energy(N_i, H_i, O_j, C_j)
            if energy < -0.5:
                Hbond_mat[j, i] = True

    return Hbond_mat




[docs]
def compute_DSSP(coor):
    r"""Compute DSSP for a protein.

    The compute_DSSP function takes in a coor parameter which represents
    the coordinates of a protein. It computes the secondary structure of
    the protein based on the input coordinates and returns a sequence of
    secondary structure elements. The output sequence consists of the
    following elements:

    - "H" represents a 4-helix (:math:`\alpha`-helix)
    - "B" represents a residue in an isolated :math:`\beta`-bridge
    - "E" represents an extended strand that participates in a :math:`\beta`-ladder
    - "G" represents a 3-helix (:math:`3_{10}`-helix)
    - "I" represents a 5-helix (:math:`\pi`-helix)
    - "T" represents an H-bonded turn
    - "S" represents a bend

    The function first adds NH hydrogen atoms to the protein coordinates and
    selects the :math:`\alpha`-carbons of the protein. It then computes a distance
    matrix between all residues and uses this to compute the secondary
    structure of the protein. It first computes turns, then :math:`\beta`-sheets,
    and finally assigns :math:`\alpha`-helices, :math:`\pi`-helices, and :math:`3_{10}`-helices.
    Finally, it joins overlapping helices and returns the secondary structure
    sequence.

    Parameters
    ----------
    coor : Coor
        Coor object containing the protein coordinates.

    Returns
    -------
    SS_seq : numpy.ndarray
        A numpy array of shape (n_res,) where n_res is the number of residues


    .. note::
        - 96 % accuracy compared to DSSP (3eam)
        - omitted β-bulge annotation


    """

    cutoff = 8

    # Add hydrogens atoms
    # Find a way to deal with PRO

    CA_sel = coor.select_atoms("protein and name CA and not altloc B C D E")
    unique_residues = np.unique(CA_sel.uniq_resid)
    chain_array = CA_sel.chain
    n_res = len(unique_residues)

    O_array = coor.select_atoms("protein and name O and not altloc B C D E").xyz
    N_array = coor.select_atoms("protein and name N and not altloc B C D E").xyz
    C_array = coor.select_atoms("protein and name C and not altloc B C D E").xyz

    if len(O_array) != n_res or len(N_array) != n_res or len(C_array) != n_res:
        logger.warning(
            "Incomplete backbone atoms, removing residues with incomplete backbone atoms"
        )
        coor = select.remove_incomplete_backbone_residues(coor)
        CA_sel = coor.select_atoms("protein and name CA and not altloc B C D E")
        unique_residues = np.unique(CA_sel.uniq_resid)
        chain_array = CA_sel.chain
        n_res = len(unique_residues)

    max_dist = 0

    SS_list = []

    for i, model in enumerate(coor.models):
        # Compute distance matrix between all residues
        Hbond_mat = compute_Hbond_matrix(model)

        # Compute secondary structure
        SS_seq = np.array([" " for i in range(n_res)])
        H_seq = np.array([False for i in range(n_res)])
        G_seq = np.array([False for i in range(n_res)])
        I_seq = np.array([False for i in range(n_res)])
        E_seq = np.array([False for i in range(n_res)])
        S_seq = compute_bend(CA_sel.models[i])

        # N-turn
        for i in range(n_res - 3):
            if i < n_res - 4 and Hbond_mat[i, i + 4]:
                H_seq[i] = True
            if Hbond_mat[i, i + 3]:
                G_seq[i] = True
            if i < n_res - 5 and Hbond_mat[i, i + 5]:
                I_seq[i] = True

        # Beta sheet
        # PART TO ACCELERATE
        for i in range(1, n_res - 1):
            for j in range(i + 3, n_res - 1):
                if (
                    (Hbond_mat[i - 1, j] and Hbond_mat[j, i + 1])
                    or (Hbond_mat[j - 1, i] and Hbond_mat[i, j + 1])
                ) or (
                    (Hbond_mat[i, j] and Hbond_mat[j, i])
                    or (Hbond_mat[i - 1, j + 1] and Hbond_mat[j - 1, i + 1])
                ):
                    E_seq[i] = True
                    E_seq[j] = True

        # Assign secondary structure sequence (order follows the list above)
        # Bend
        for i in range(n_res):
            if S_seq[i]:
                SS_seq[i] = "S"

        for i in range(n_res - 1):
            if I_seq[i]:
                SS_seq[i + 1 : i + 5] = "T"
            if H_seq[i]:
                SS_seq[i + 1 : i + 4] = "T"
            if G_seq[i]:
                SS_seq[i + 1 : i + 3] = "T"

        # A minimal helix is defined by two consecutive n-turns

        for i in range(1, n_res - 3):
            if G_seq[i] and G_seq[i - 1]:
                SS_seq[i : i + 3] = "G"

        for i in range(1, n_res - 1):
            if E_seq[i] and E_seq[i - 1]:
                SS_seq[i - 1 : i + 1] = "E"
            elif E_seq[i] and not (E_seq[i + 1] and E_seq[i - 1]):
                SS_seq[i] = "B"

        for i in range(1, n_res - 4):
            # A minimal helix is defined by two consecutive n-turns
            if H_seq[i] and H_seq[i - 1]:
                SS_seq[i : i + 4] = "H"
        #  In 2012, DSSP was rewritten so that the assignment of π helices
        #  was given preference over α helices, resulting in better detection of π helices.
        for i in range(1, n_res - 5):
            if I_seq[i] and I_seq[i - 1]:
                SS_seq[i : i + 5] = "I"

        # Two overlapping minimal helices offset by two or three residues are joined
        # into one helix:
        for i in range(1, n_res - 2):
            if SS_seq[i - 1] == "H" and (SS_seq[i + 1] == "H" or SS_seq[i + 2] == "H"):
                SS_seq[i] = "H"

        for i in range(1, n_res - 1):
            if SS_seq[i - 1] in ["E", "B"] and (SS_seq[i + 1] in ["E", "B"]):
                SS_seq[i - 1 : i + 2] = "E"

        seq_dict = {}

        for SS, chain in zip(SS_seq, chain_array):
            if chain not in seq_dict:
                seq_dict[chain] = SS
            else:
                seq_dict[chain] += SS

        SS_list.append(seq_dict)

    return SS_list