Source code for pdb_numpy.alignement

#!/usr/bin/env python3
# coding: utf-8

import os
import numpy as np
from itertools import permutations
import logging

from . import align_cython

from .. import analysis
from .. import geom
from ..data.blosum import BLOSUM62


# Logging
logger = logging.getLogger(__name__)


# def align_seq_C(seq_1, seq_2, gap_cost=-11, gap_extension=-1):
#    """Align two amino acid sequences using the Waterman - Smith Algorithm.
#
#    Parameters
#    ----------
#    seq_1 : str
#        First sequence to align
#    seq_2 : str
#        Second sequence to align
#    gap_cost : int, optional
#        Cost of gap, by default -8
#    gap_extension : int, optional
#        Cost of gap extension, by default -2
#
#    Returns
#    -------
#    str
#        Aligned sequence 1
#    str
#        Aligned sequence 2
#
#    .note:: This function is based on the C implementation of the Waterman - Smith Algorithm.
#
#        To compile the C code, run the following command in the src/pdb_numpy/alignement folder:
#
#        in Linux:
#        ```
#        gcc -shared -o _align.so -fPIC _align.c
#        ```
#
#        in OSX:
#        ```
#        gcc -shared -o _align.dylib -fPIC _align.c
#        ```
#
#    """
#    import platform
#    import ctypes
#
#    dir_path = os.path.dirname(os.path.abspath(__file__))
#    if platform.uname()[0] == "Darwin":
#        so_file = os.path.join(dir_path, "_align.dylib")
#    else:
#        so_file = os.path.join(dir_path, "_align.so")
#
#    my_functions = ctypes.CDLL(so_file)
#
#    class test(ctypes.Structure):
#        _fields_ = [
#            ("seq1", ctypes.c_char_p),
#            ("seq2", ctypes.c_char_p),
#            ("score", ctypes.c_int),
#        ]
#
#    align = my_functions.align
#    align.restype = ctypes.POINTER(test)
#    align.argtypes = [
#        ctypes.c_char_p,
#        ctypes.c_char_p,
#        ctypes.c_char_p,
#        ctypes.c_int,
#        ctypes.c_int,
#    ]
#
#    seq_1_bytes = seq_1.encode("ascii")
#    seq_2_bytes = seq_2.encode("ascii")
#    blosum_file = os.path.join(dir_path, "../data/blosum62.txt")
#
#    alignement_res = align(
#        ctypes.c_char_p(seq_1_bytes),
#        ctypes.c_char_p(seq_2_bytes),
#        ctypes.c_char_p(blosum_file.encode("ascii")),
#        ctypes.c_int(gap_cost),
#        ctypes.c_int(gap_extension),
#    )
#
#    seq_1_aligned = alignement_res.contents.seq1
#    seq_2_aligned = alignement_res.contents.seq2
#
#    # print(seq_1_aligned)
#
#    free_align = my_functions.free_align
#    free_align.argtypes = [ctypes.POINTER(test)]
#    free_align(alignement_res)
#
#    # print(f"Max score: {alignement_res.contents.score}")
#    return seq_1_aligned.decode("ascii"), seq_2_aligned.decode("ascii")


[docs] def align_seq_cython(seq_1, seq_2, gap_cost=-11, gap_extension=-1): return align_cython.align_seq(seq_1, seq_2, gap_cost, gap_extension)
[docs] def align_seq(seq_1, seq_2, gap_cost=-11, gap_extension=-1): """Align two amino acid sequences using the Waterman - Smith Algorithm. Parameters ---------- seq_1 : str First sequence to align seq_2 : str Second sequence to align gap_cost : int, optional Cost of gap, by default -8 gap_extension : int, optional Cost of gap extension, by default -2 Returns ------- str Aligned sequence 1 str Aligned sequence 2 """ seq_1 = seq_1.replace("-", "") seq_2 = seq_2.replace("-", "") len_1 = len(seq_1) len_2 = len(seq_2) # Initialize the matrix matrix = np.zeros((len_1 + 1, len_2 + 1), dtype=int) prev_line = np.zeros((len_2 + 1), dtype=bool) choices = np.zeros((3), dtype=int) # Fill the matrix for i in range(1, len_1 + 1): # print(i) prev = False # insertion matrix[i, j - 1] for j in range(1, len_2 + 1): # Identify the BLOSUM62 score # Match choices[0] = matrix[i - 1, j - 1] + BLOSUM62[(seq_2[j - 1], seq_1[i - 1])] # Delete choices[1] = matrix[i - 1, j] + (gap_extension if prev else gap_cost) # Insert choices[2] = matrix[i, j - 1] + ( gap_extension if prev_line[j] else gap_cost ) max_index = np.argmax(choices) matrix[i, j] = choices[max_index] prev_line[j] = False prev = False if max_index == 1: prev = True elif max_index == 2: prev_line[j] = True # Identify the maximum score min_seq = min(len_1, len_2) max_score = np.max(matrix[min_seq:, min_seq:]) max_index = np.where(matrix == max_score) show_num = 10 # print(matrix[:show_num, :show_num]) # print("Max score:", max_score, max_index) index_list = [] for i in range(len(max_index[0])): if max_index[0][i] >= min_seq and max_index[1][i] >= min_seq: index_list.append([max_index[0][i], max_index[1][i]]) # if len(index_list) > 1: # logger.warning(f"Ambigous alignement, {len(index_list)} solutions exists") i = index_list[0][0] j = index_list[0][1] # print(i,j) # Traceback and compute the alignment align_1 = "" align_2 = "" if i != len_1: align_2 = (len_1 - i) * "-" if j != len_2: align_1 = (len_2 - j) * "-" align_1 += seq_1[i:] align_2 += seq_2[j:] # i -= 1 # j -= 1 while i != 0 and j != 0: if ( matrix[i, j] == matrix[i - 1, j - 1] + BLOSUM62[(seq_1[i - 1], seq_2[j - 1])] ): align_1 = seq_1[i - 1] + align_1 align_2 = seq_2[j - 1] + align_2 i -= 1 j -= 1 elif ( matrix[i, j] == matrix[i - 1, j] + gap_cost or matrix[i, j] == matrix[i - 1, j] + gap_extension ): align_1 = seq_1[i - 1] + align_1 align_2 = "-" + align_2 i -= 1 elif ( matrix[i, j] == matrix[i, j - 1] + gap_cost or matrix[i, j] == matrix[i, j - 1] + gap_extension ): align_1 = "-" + align_1 align_2 = seq_2[j - 1] + align_2 j -= 1 align_1 = seq_1[:i] + align_1 align_2 = seq_2[:j] + align_2 if i != 0: align_2 = i * "-" + align_2 elif j != 0: align_1 = j * "-" + align_1 assert len(align_1) == len(align_2) # print(f"Max score: {max_score}") return align_1, align_2
[docs] def align_seq_WS(seq_1, seq_2, gap_cost=-8, gap_extension=-1): """Align two amino acid sequences using the Waterman - Smith Algorithm. without gap extensions. Parameters ---------- seq_1 : str First sequence to align seq_2 : str Second sequence to align gap_cost : int, optional Cost of gap, by default -8 Returns ------- str Aligned sequence 1 str Aligned sequence 2 """ seq_1 = seq_1.replace("-", "") seq_2 = seq_2.replace("-", "") len_1 = len(seq_1) len_2 = len(seq_2) # Initialize the matrix matrix = np.zeros((len_1 + 1, len_2 + 1)) # Fill the matrix for i in range(1, len_1 + 1): # print(i) for j in range(1, len_2 + 1): # Identify the BLOSUM62 score match = matrix[i - 1, j - 1] + BLOSUM62[(seq_2[j - 1], seq_1[i - 1])] delete = matrix[i - 1, j] + gap_cost insert = matrix[i, j - 1] + gap_cost matrix[i, j] = max(0, match, delete, insert) # if match > delete and match > insert: # matrix[i, j] = match # #print('Match') # elif delete > insert: # matrix[i, j] = delete # #print('Delete') # else: # matrix[i, j] = insert # #print('Insert') for i in range(1, len_1 + 1): for j in range(1, len_2 + 1): print(matrix[i, j], end=" ") print() # Identify the maximum score min_seq = min(len_1, len_2) max_score = np.max(matrix[min_seq:, min_seq:]) max_index = np.where(matrix == max_score) index_list = [] for i in range(len(max_index[0])): if max_index[0][i] >= min_seq and max_index[1][i] >= min_seq: index_list.append([max_index[0][i], max_index[1][i]]) if len(index_list) > 1: logger.warning(f"Ambigous alignement, {len(index_list)} solutions exists") i = index_list[0][0] j = index_list[0][1] # Traceback and compute the alignment align_1 = "" align_2 = "" if i != len_1: align_2 = (len_1 - i) * "-" if j != len_2: align_1 = (len_2 - j) * "-" align_1 += seq_1[i:] align_2 += seq_2[j:] while i != 0 and j != 0: if ( matrix[i, j] == matrix[i - 1, j - 1] + BLOSUM62[(seq_1[i - 1], seq_2[j - 1])] ): align_1 = seq_1[i - 1] + align_1 align_2 = seq_2[j - 1] + align_2 i -= 1 j -= 1 elif ( matrix[i, j] == matrix[i - 1, j] + gap_cost or matrix[i, j] == matrix[i - 1, j] + gap_extension ): align_1 = seq_1[i - 1] + align_1 align_2 = "-" + align_2 i -= 1 elif ( matrix[i, j] == matrix[i, j - 1] + gap_cost or matrix[i, j] == matrix[i, j - 1] + gap_extension ): align_1 = "-" + align_1 align_2 = seq_2[j - 1] + align_2 j -= 1 align_1 = seq_1[:i] + align_1 align_2 = seq_2[:j] + align_2 if i != 0: align_2 = i * "-" + align_2 elif j != 0: align_1 = j * "-" + align_1 assert len(align_1) == len(align_2) return align_1, align_2
# def get_common_atoms( # coor_1, coor_2, chain_1=["A"], chain_2=["A"], back_names=["C", "N", "O", "CA"] # ): # """Get atom selection in common for two atom_dict based on sequence # alignement. # Parameters # ---------- # coor_1 : Coor # First coordinate # coor_2 : Coor # Second coordinate # chain_1 : list, optional # List of chain to consider in the first coordinate, by default ["A"] # chain_2 : list, optional # List of chain to consider in the second coordinate, by default ["A"] # back_names : list, optional # List of backbone atom names, by default ["C", "N", "O", "CA"] # Returns # ------- # sel_index_1 : list # List of index of the first coordinate # sel_index_2 : list # List of index of the second coordinate # """ # def get_uniq_chain_order(array): # uniq_chains = [] # for chain in array: # if chain not in uniq_chains: # uniq_chains.append(chain) # return uniq_chains # def check_same_chain_order(coor, chain_list): # coor_chain_order = get_uniq_chain_order(coor.chain) # index = 0 # founded = 0 # for chain in chain_list: # if chain not in coor_chain_order: # print(f"Chain {chain} not found in the first Coor object. Available chains: {coor_chain_order}") # else: # for i in range(index, len(coor_chain_order)): # if chain == coor_chain_order[i]: # founded += 1 # index = i # break # if founded != len(chain_1): # return False # else: # return True # if isinstance(chain_1, str): # chain_1 = [chain_1] # if isinstance(chain_2, str): # chain_2 = [chain_2] # # check if chain_1 and chain_2 are in the same order as in the Coor object # coor_1_back = coor_1.select_atoms( # f"chain {' '.join(chain_1)} and protein and name {' '.join(back_names)} and not altloc B C D E F" # ) # coor_2_back = coor_2.select_atoms( # f"chain {' '.join(chain_2)} and protein and name {' '.join(back_names)} and not altloc B C D E F" # ) # sel_1_seq = coor_1_back.get_aa_seq() # sel_2_seq = coor_2_back.get_aa_seq() # # Order of the chains in the Coor object should change the order of the chains in the selection # # To have a correct alignement # if len(chain_1) <= 1 or check_same_chain_order(coor_1, chain_1): # sel_index_1 = coor_1.get_index_select( # f"chain {' '.join(chain_1)} and protein and name {' '.join(back_names)} and not altloc B C D E F" # ) # else: # sel_index_1 = [] # for chain in chain_1: # sel_index_1 += coor_1.get_index_select( # f"chain {chain} and protein and name {' '.join(back_names)} and not altloc B C D E F" # ) # if len(chain_2) <= 1 or check_same_chain_order(coor_2, chain_2): # sel_index_2 = coor_2.get_index_select( # f"chain {' '.join(chain_2)} and protein and name {' '.join(back_names)} and not altloc B C D E F" # ) # else: # sel_index_2 = [] # for chain in chain_2: # sel_index_2 += coor_2.get_index_select( # f"chain {chain} and protein and name {' '.join(back_names)} and not altloc B C D E F" # ) # seq_1 = "" # for chain in chain_1: # seq_1 += sel_1_seq[chain].replace("-", "") # seq_2 = "" # for chain in chain_2: # seq_2 += sel_2_seq[chain].replace("-", "") # assert len(sel_index_1) == len(seq_1) * len( # back_names # ), "Incomplete backbone atoms for first Coor object, you might consider using the remove_incomplete_residues method before." # assert len(sel_index_2) == len(seq_2) * len( # back_names # ), "Incomplete backbone atoms for second Coor object, you might consider using the remove_incomplete_residues method before." # align_seq_1, align_seq_2 = align_seq_cython(seq_1, seq_2) # # print_align_seq(align_seq_1, align_seq_2) # align_sel_1 = [] # align_sel_2 = [] # index_sel_1 = 0 # index_sel_2 = 0 # back_num = len(back_names) # for i in range(len(align_seq_1)): # # print(i, index_sel_1, index_sel_2) # if align_seq_1[i] != "-" and align_seq_2[i] != "-": # align_sel_1 += list(sel_index_1[index_sel_1 : index_sel_1 + back_num]) # align_sel_2 += list(sel_index_2[index_sel_2 : index_sel_2 + back_num]) # index_sel_1 += back_num # index_sel_2 += back_num # elif align_seq_1[i] != "-": # index_sel_1 += back_num # else: # index_sel_2 += back_num # assert len(align_sel_1) == len( # align_sel_2 # ), "Two selection don't have the same atom number" # return align_sel_1, align_sel_2
[docs] def get_common_atoms( coor_1, coor_2, chain_1=["A"], chain_2=["A"], back_names=["C", "N", "O", "CA"] ): """Get atom selection in common for two atom_dict based on sequence alignement. Parameters ---------- coor_1 : Coor First coordinate coor_2 : Coor Second coordinate chain_1 : list, optional List of chain to consider in the first coordinate, by default ["A"] chain_2 : list, optional List of chain to consider in the second coordinate, by default ["A"] back_names : list, optional List of backbone atom names, by default ["C", "N", "O", "CA"] Returns ------- sel_index_1 : list List of index of the first coordinate sel_index_2 : list List of index of the second coordinate """ coor_1_back = coor_1.select_atoms( f"chain {' '.join(chain_1)} and protein and name {' '.join(back_names)} and not altloc B C D E F" ) coor_2_back = coor_2.select_atoms( f"chain {' '.join(chain_2)} and protein and name {' '.join(back_names)} and not altloc B C D E F" ) sel_1_seq = coor_1_back.get_aa_seq() sel_2_seq = coor_2_back.get_aa_seq() sel_index_1 = coor_1.get_index_select( f"chain {' '.join(chain_1)} and protein and name {' '.join(back_names)} and not altloc B C D E F" ) sel_index_2 = coor_2.get_index_select( f"chain {' '.join(chain_2)} and protein and name {' '.join(back_names)} and not altloc B C D E F" ) seq_1 = "" for chain in chain_1: seq_1 += sel_1_seq[chain].replace("-", "") seq_2 = "" for chain in chain_2: seq_2 += sel_2_seq[chain].replace("-", "") assert len(sel_index_1) == len(seq_1) * len( back_names ), "Incomplete backbone atoms for first Coor object, you might consider using the remove_incomplete_residues method before." assert len(sel_index_2) == len(seq_2) * len( back_names ), "Incomplete backbone atoms for second Coor object, you might consider using the remove_incomplete_residues method before." align_seq_1, align_seq_2 = align_seq_cython(seq_1, seq_2) # print_align_seq(align_seq_1, align_seq_2) align_sel_1 = [] align_sel_2 = [] index_sel_1 = 0 index_sel_2 = 0 back_num = len(back_names) for i in range(len(align_seq_1)): # print(i, index_sel_1, index_sel_2) if align_seq_1[i] != "-" and align_seq_2[i] != "-": align_sel_1 += list(sel_index_1[index_sel_1 : index_sel_1 + back_num]) align_sel_2 += list(sel_index_2[index_sel_2 : index_sel_2 + back_num]) index_sel_1 += back_num index_sel_2 += back_num elif align_seq_1[i] != "-": index_sel_1 += back_num else: index_sel_2 += back_num assert len(align_sel_1) == len( align_sel_2 ), "Two selection don't have the same atom number" return align_sel_1, align_sel_2
[docs] def coor_align(coor_1, coor_2, index_1, index_2, frame_ref=0): """Align two structure. Parameters ---------- coor_1 : Coor First coordinate coor_2 : Coor Second coordinate index_1 : list List of atom index to align in the first coordinates index_2 : list List of atom index to align in the second coordinates frame_ref : int, optional Frame to use as reference for coor_2, by default 0 Returns ------- None """ assert len(index_1) != 0, "No atom selected in the first structure" assert len(index_1) == len(index_2), "Two structure don't have the same atom number" assert ( 0 <= frame_ref < len(coor_2.models) ), "Reference frame index is larger than the number of frame in the reference structure" self_align = False if id(coor_1) == id(coor_2): logger.info("Same Coor object, self alignement") self_align = True centroid_2 = coor_2.models[frame_ref].xyz[index_2].mean(axis=0) coor_2.models[frame_ref].xyz -= centroid_2 ref_coor = coor_2.models[frame_ref].xyz[index_2] for i, model in enumerate(coor_1.models): centroid_1 = model.xyz[index_1].mean(axis=0) if not (self_align and (i == frame_ref)): model.xyz -= centroid_1 rot_mat = geom.quaternion_rotate(model.xyz[index_1], ref_coor) # from scipy.spatial.transform import Rotation as R # rot_mat = R.align_vectors(model.xyz[index_1], ref_coor)[0].as_matrix() model.xyz = np.dot(model.xyz, rot_mat) if not (self_align and (i == frame_ref)): model.xyz += centroid_2 coor_2.models[frame_ref].xyz += centroid_2
[docs] def align_seq_based( coor_1, coor_2, chain_1=["A"], chain_2=["A"], back_names=["C", "N", "O", "CA"], compute_rmsd=True, frame_ref=0, ): """Align two structure based on sequence alignement. Parameters ---------- coor_1 : Coor First coordinate coor_2 : Coor Second coordinate chain_1 : list, optional List of chain to consider in the first coordinate, by default ["A"] chain_2 : list, optional List of chain to consider in the second coordinate, by default ["A"] back_names : list, optional List of backbone atom names, by default ["C", "N", "O", "CA"] compute_rmsd : bool, optional Compute RMSD between the two structure, by default True frame_ref : int, optional Frame to use as reference for coor_2, by default 0 Returns ------- rmsd : float, optional RMSD between the two structure sel_index_1 : list List of index of the first coordinate sel_index_2 : list List of index of the second coordinate """ assert ( 0 <= frame_ref < len(coor_2.models) ), "Reference frame index is larger than the number of frame in the reference structure" index_1, index_2 = get_common_atoms(coor_1, coor_2, chain_1, chain_2, back_names) coor_align(coor_1, coor_2, index_1, index_2, frame_ref=frame_ref) if compute_rmsd: return analysis.rmsd( coor_1, coor_2, index_list=[index_1, index_2], frame_ref=frame_ref ), [ index_1, index_2, ] else: return None, [index_1, index_2]
[docs] def rmsd_seq_based( coor_1, coor_2, chain_1=["A"], chain_2=["A"], back_names=["C", "N", "O", "CA"], compute_rmsd=True, ): """Align two structure based on sequence alignement. Parameters ---------- coor_1 : Coor First coordinate coor_2 : Coor Second coordinate chain_1 : list, optional List of chain to consider in the first coordinate, by default ["A"] chain_2 : list, optional List of chain to consider in the second coordinate, by default ["A"] back_names : list, optional List of backbone atom names, by default ["C", "N", "O", "CA"] compute_rmsd : bool, optional Compute RMSD between the two structure, by default True Returns ------- rmsd : float, optional RMSD between the two structure sel_index_1 : list List of index of the first coordinate sel_index_2 : list List of index of the second coordinate """ index_1, index_2 = get_common_atoms(coor_1, coor_2, chain_1, chain_2, back_names) return analysis.rmsd(coor_1, coor_2, index_list=[index_1, index_2]), [ index_1, index_2, ]
[docs] def align_chain_permutation( coor_1, coor_2, chain_1=None, chain_2=None, back_names=["CA"] ): """Align two structure based on chain permutation. Parameters ---------- coor_1 : Coor First coordinate coor_2 : Coor Second coordinate chain_1 : list, optional List of chain to consider in the first coordinate, by default None chain_2 : list, optional List of chain to consider in the second coordinate, by default None Returns ------- rmsd : list minimal RMSDs between the two structure index : list List of index of the first coordinate and the second coordinate """ if chain_1 is None: ca_chain1 = coor_1.select_atoms("name CA") chain_1 = np.unique(ca_chain1.chain) if chain_2 is None: ca_chain2 = coor_2.select_atoms("name CA") chain_2 = np.unique(ca_chain2.chain) if len(chain_2) <= len(chain_1): chain_1_perm = list(permutations(chain_1, len(chain_2))) chain_2_perm = [chain_2] * len(chain_1_perm) else: chain_2_perm = list(permutations(chain_2, len(chain_1))) chain_1_perm = [chain_1] * len(chain_2_perm) # Compute atoms in common for all chains combination: index_common = {} for chain_i in chain_1: for chain_j in chain_2: logger.info(f"compute common atoms for {chain_i} and {chain_j}") index_1, index_2 = get_common_atoms( coor_1, coor_2, chain_i, chain_j, back_names=back_names ) index_common[chain_i, chain_j] = [index_1, index_2] # Compute RMSD for all chains permutation rmsd_perm = [] index_perm = [] for perm_1, perm_2 in zip(chain_1_perm, chain_2_perm): logger.info( f'Trying chains permutation: {" ".join(perm_1)} with {" ".join(perm_2)}' ) index_all_1 = [] index_all_2 = [] for chain_i, chain_j in zip(perm_1, perm_2): # index_1, index_2 = get_common_atoms(coor_1, coor_2, chain_i, chain_j) index_1, index_2 = index_common[chain_i, chain_j] index_all_1 += index_1 index_all_2 += index_2 coor_align(coor_1, coor_2, index_all_1, index_all_2) rmsd = analysis.rmsd(coor_1, coor_2, index_list=[index_all_1, index_all_2]) rmsd_perm.append(rmsd) index_perm.append([index_all_1, index_all_2]) min_index = 0 min_rmsd = rmsd_perm[0][0] for i, rmsds in enumerate(rmsd_perm): for rmsd in rmsds: if rmsd < min_rmsd: min_index = i min_rmsd = rmsd # Do the alignement with the best permutation min_index_perm = index_perm[min_index] coor_align(coor_1, coor_2, min_index_perm[0], min_index_perm[1]) return ( rmsd_perm[min_index], index_perm[min_index], )