Source code for pdb_numpy.format.mmcif

#!/usr/bin/env python3
# coding: utf-8

import os
import shlex
from collections import OrderedDict
import urllib.request
import logging
import numpy as np
import gzip


from .. import geom
from ..model import Model
from .. import coor

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Maps the one-letter field code kept in ``atom_dict["field"]`` (the parser
# stores only the first character of ``_atom_site.group_PDB``) back to the
# six-character record name written in the first column of each atom row.
FIELD_DICT = {"A": "ATOM  ", "H": "HETATM"}

# Header of the ``_atom_site`` loop emitted by ``get_mmcif_string`` before the
# atom rows. The order of the column names must stay in sync with the order of
# the values formatted for each atom row in ``get_mmcif_string``.
MMCIF_ATOM_SITE = (
    "# \n"
    "loop_\n"
    "_atom_site.group_PDB \n"
    "_atom_site.id \n"
    "_atom_site.type_symbol \n"
    "_atom_site.label_atom_id \n"
    "_atom_site.label_alt_id \n"
    "_atom_site.label_comp_id \n"
    "_atom_site.label_asym_id \n"
    "_atom_site.label_entity_id \n"
    "_atom_site.label_seq_id \n"
    "_atom_site.pdbx_PDB_ins_code \n"
    "_atom_site.Cartn_x \n"
    "_atom_site.Cartn_y \n"
    "_atom_site.Cartn_z \n"
    "_atom_site.occupancy \n"
    "_atom_site.B_iso_or_equiv \n"
    "_atom_site.pdbx_formal_charge \n"
    "_atom_site.auth_seq_id \n"
    "_atom_site.auth_comp_id \n"
    "_atom_site.auth_asym_id \n"
    "_atom_site.auth_atom_id \n"
    "_atom_site.pdbx_PDB_model_num \n"
)


def parse(mmcif_lines):
    """Parse mmCIF lines and return the structure as a Coor object.

    The ``_atom_site`` loop is converted into one ``Model`` per distinct
    ``pdbx_PDB_model_num`` value. All other categories are kept as raw
    strings in ``Coor.data_mmCIF`` (with ``_atom_site`` replaced by ``None``).

    Parameters
    ----------
    mmcif_lines : list
        list of mmcif lines (each one ending with its newline)

    Returns
    -------
    Coor
        Coor object

    Raises
    ------
    KeyError
        If the file has no ``_atom_site`` category.
    ValueError
        If a required ``_atom_site`` column is missing or a numeric column
        cannot be converted.
    AssertionError
        If ``_atom_site.id`` is not numbered 1..N consecutively.
    """

    data_mmCIF = _parse_raw_mmcif_lines(mmcif_lines)
    atom_site = data_mmCIF["_atom_site"]

    def column(name):
        """Return the raw string values of one ``_atom_site`` column."""
        return atom_site["value"][atom_site["col_names"].index(name)]

    model_array = np.array(column("pdbx_PDB_model_num")).astype(np.int16)
    model_list = np.unique(model_array)

    # field list: only the first letter of ATOM / HETATM is stored
    field_array = np.array(
        [field[0] for field in column("group_PDB")],
        dtype="|U1",
    )

    # "num_resid_uniqresid"
    num_array = np.array(column("id")).astype(np.int32)
    # check that num_array is consecutive (Maybe useless)
    assert np.array_equal(
        num_array, np.arange(1, len(num_array) + 1)
    ), "Atom numbering is not consecutive"

    resid_array = np.array(column("auth_seq_id")).astype(np.int32)

    # Unique residue index: restarts at 0 for every model and is incremented
    # each time the residue number changes between two consecutive atoms.
    uniq_resid_list = []
    uniq_resid = 0
    prev_resid = resid_array[0]
    prev_model = model_array[0]
    for resid, model in zip(resid_array, model_array):
        if model != prev_model:
            uniq_resid = 0
            prev_resid = resid
            prev_model = model
        if resid != prev_resid:
            uniq_resid += 1
            prev_resid = resid
        uniq_resid_list.append(uniq_resid)

    uniq_resid_array = np.array(uniq_resid_list).astype(np.int32)

    num_resid_uniqresid_array = np.column_stack(
        (num_array, resid_array, uniq_resid_array)
    )

    # "name_resname"
    # NOTE(review): values longer than 4 characters are truncated by this
    # dtype (the raw tokens keep their quotes, e.g. "O5'" with its double
    # quotes is 5 characters). An earlier comment mentioned U5 — confirm the
    # intended width before changing it.
    name_array = np.array(column("label_atom_id"), dtype="|U4")
    resname_array = np.array(column("label_comp_id"), dtype="|U4")
    ele_array = np.array(column("type_symbol"), dtype="|U4")

    name_resname_array = np.column_stack((name_array, resname_array, ele_array))

    # "alterloc_chain_insertres"
    alterloc_array = np.array(column("label_alt_id"), dtype="|U2")
    # NOTE(review): the arrays hold unicode strings while the comparison value
    # is bytes, so this mask (and the "?" one below) appears never to match and
    # the "." / "?" placeholders are kept as-is. Left untouched on purpose:
    # changing the stored values alters what the writers receive — confirm the
    # intended convention before fixing.
    alterloc_array[alterloc_array == b"."] = ""
    chain_array = np.array(column("label_asym_id"), dtype="|U2")
    insertres_array = np.array(column("pdbx_PDB_ins_code"), dtype="|U2")
    insertres_array[insertres_array == b"?"] = ""
    alterloc_chain_insertres_array = np.column_stack(
        (alterloc_array, chain_array, insertres_array)
    )

    # "xyz"
    x_array = np.array(column("Cartn_x")).astype(np.float32)
    y_array = np.array(column("Cartn_y")).astype(np.float32)
    z_array = np.array(column("Cartn_z")).astype(np.float32)

    xyz_array = np.column_stack((x_array, y_array, z_array))

    # "occ_beta"
    occ_array = np.array(column("occupancy")).astype(np.float32)
    beta_array = np.array(column("B_iso_or_equiv")).astype(np.float32)

    occ_beta_array = np.column_stack((occ_array, beta_array))

    mmcif_coor = coor.Coor()

    for model in model_list:
        model_mask = model_array == model

        local_model = Model()
        local_model.atom_dict = {
            "field": field_array[model_mask],
            "num_resid_uniqresid": num_resid_uniqresid_array[model_mask],
            "name_resname_elem": name_resname_array[model_mask],
            "alterloc_chain_insertres": alterloc_chain_insertres_array[model_mask],
            "xyz": xyz_array[model_mask],
            "occ_beta": occ_beta_array[model_mask],
        }

        # Compare with the previously stored model as soon as there is one
        # (the former "> 1" test skipped the first/second model comparison).
        if mmcif_coor.models and local_model.len != mmcif_coor.models[-1].len:
            logger.warning(
                f"The atom number is not the same in the model {len(mmcif_coor.models)-1} and the model {len(mmcif_coor.models)}."
            )

        mmcif_coor.models.append(local_model)

    # Delete atom coordinates in the dict (they now live in the models)
    data_mmCIF["_atom_site"] = None
    mmcif_coor.data_mmCIF = data_mmCIF

    if "_pdbx_struct_oper_list" in data_mmCIF:
        mmcif_coor.transformation = parse_transformation(data_mmCIF)
    if "_cell" in data_mmCIF:
        mmcif_coor.crystal_pack = parse_crystal_pack(data_mmCIF)

    return mmcif_coor


def parse_crystal_pack(data_mmCIF):
    """
    Parse crystal packing information from a mmcif file.
    treat the following mmcif tags:
    - `_cell`
    - `_symmetry`

    Parameters
    ----------
    data_mmCIF : dict
        mmcif data, as raw strings indexed by category then attribute

    Returns
    -------
    crystal_pack : str
        crystal packing formatted as a PDB ``CRYST1`` line (newline included)

    Raises
    ------
    KeyError
        If the ``_cell`` category or one of its length/angle attributes is
        missing.
    ValueError
        If a cell length or angle is not a number.
    """

    cell = data_mmCIF["_cell"]
    a = float(cell["length_a"])
    b = float(cell["length_b"])
    c = float(cell["length_c"])
    alpha = float(cell["angle_alpha"])
    beta = float(cell["angle_beta"])
    gamma = float(cell["angle_gamma"])

    # Z_PDB is frequently unknown ("?") or absent; fall back to 1, the value
    # the PDB format prescribes when no crystallographic Z applies, instead
    # of failing the whole parse.
    try:
        z = int(cell["Z_PDB"])
    except (KeyError, ValueError):
        z = 1

    # Same idea for the space group: "P 1" is the PDB default when the
    # `_symmetry` category (or its H-M name) is not provided.
    sGroup = (
        data_mmCIF.get("_symmetry", {})
        .get("space_group_name_H-M", "P 1")
        .replace("'", "")
    )

    crystal_pack = f"CRYST1{a:9.3f}{b:9.3f}{c:9.3f}{alpha:7.2f}{beta:7.2f}{gamma:7.2f} {sGroup:9} {z:3d}\n"
    return crystal_pack


def _oper_list_ids(oper_list):
    """Return the operator ids of a `_pdbx_struct_oper_list` category (loop or key/value)."""
    if "value" in oper_list:
        return list(oper_list["value"][oper_list["col_names"].index("id")])
    return [oper_list["id"]]


def _oper_list_rows(oper_list, position):
    """Return the three ``[id, m1, m2, m3, v]`` float rows of the operator at *position*."""
    row_fields = (
        ("id", "matrix[1][1]", "matrix[1][2]", "matrix[1][3]", "vector[1]"),
        ("id", "matrix[2][1]", "matrix[2][2]", "matrix[2][3]", "vector[2]"),
        ("id", "matrix[3][1]", "matrix[3][2]", "matrix[3][3]", "vector[3]"),
    )
    if "value" in oper_list:
        col_names = oper_list["col_names"]
        return [
            [float(oper_list["value"][col_names.index(field)][position]) for field in fields]
            for fields in row_fields
        ]
    # Single operator stored as plain key/value pairs: there is no position.
    return [[float(oper_list[field]) for field in fields] for fields in row_fields]


def parse_transformation(data_mmCIF):
    """Parse information from a mmcif file.
    treat the following mmcif tags:
    - `_pdbx_struct_assembly_gen`
    - `_pdbx_struct_oper_list`
    - `_pdbx_struct_assembly`

    Parameters
    ----------
    data_mmCIF : dict
        mmcif data

    Returns
    -------
    transformation_dict : dict
        transformation dict. Keys are 1-based indexes; each value is a dict
        with ``"chains"`` (list of chain ids) and ``"matrix"`` (flat list of
        ``[id, m1, m2, m3, v]`` float rows, three rows per operator, in the
        order the operators are listed in `_pdbx_struct_oper_list`).

    Raises
    ------
    KeyError
        If one of the required categories is missing.
    AssertionError
        If `_pdbx_struct_assembly` and `_pdbx_struct_assembly_gen` loops do
        not have the same number of rows.
    """

    transformation_dict = {}
    assembly_gen = data_mmCIF["_pdbx_struct_assembly_gen"]
    oper_list = data_mmCIF["_pdbx_struct_oper_list"]

    # Here with only one transformation (assembly_gen is key/value):
    # every listed operator is applied to the chain list.
    if "asym_id_list" in assembly_gen:
        chain_list = [chain.strip() for chain in assembly_gen["asym_id_list"].split(",")]
        transformation_dict[1] = {"chains": chain_list, "matrix": []}

        for position in range(len(_oper_list_ids(oper_list))):
            transformation_dict[1]["matrix"].extend(_oper_list_rows(oper_list, position))

    # Here with multiple transformation (assembly_gen is a loop)
    else:
        trans_num = len(data_mmCIF["_pdbx_struct_assembly"]["value"][0])
        assert trans_num == len(assembly_gen["value"][0])

        chain_index = assembly_gen["col_names"].index("asym_id_list")
        expression_index = assembly_gen["col_names"].index("oper_expression")

        # The operator list may be a loop or a single key/value operator;
        # ids are always read from the "id" field, wherever it is located.
        oper_ids = _oper_list_ids(oper_list)

        for i in range(trans_num):
            # Extract chain list and matrix indexes
            chain_list = [
                chain.strip()
                for chain in assembly_gen["value"][chain_index][i].split(",")
            ]
            # Only comma separated ids are understood here; range or product
            # expressions such as "(1-60)" will not match any operator id.
            matrix_index_list = [
                oper_id.strip()
                for oper_id in assembly_gen["value"][expression_index][i].split(",")
            ]
            transformation_dict[i + 1] = {"chains": chain_list, "matrix": []}

            # Extract matrix value
            for position, matrix_id in enumerate(oper_ids):
                if matrix_id in matrix_index_list:
                    transformation_dict[i + 1]["matrix"].extend(
                        _oper_list_rows(oper_list, position)
                    )

    return transformation_dict


def parse_symmetry(data_mmCIF):
    """Extract the symmetry operations stored in a mmcif data dict.

    Only the `_symmetry_equiv_pos_as_xyz` category is read.

    Parameters
    ----------
    data_mmCIF : dict
        mmcif data

    Returns
    -------
    symmetry_dict : dict
        dict with a single ``"symmetry"`` key holding the ``"value"`` entry
        of the category, or an empty list when the category is absent
    """

    category = "_symmetry_equiv_pos_as_xyz"
    operations = data_mmCIF[category]["value"] if category in data_mmCIF else []

    return {"symmetry": operations}


def fetch(pdb_ID):
    """Download a mmcif file from the RCSB PDB and parse it.

    Parameters
    ----------
    pdb_ID : str
        pdb ID

    Returns
    -------
    Coor
        Coor object built by ``parse`` from the downloaded file

    Examples
    --------
    >>> prot_coor = fetch('3EAM')
    """

    url = f"http://files.rcsb.org/download/{pdb_ID}.cif"

    # Download the whole file, then split it while keeping line endings,
    # which is the input ``parse`` works on.
    with urllib.request.urlopen(url) as response:
        payload = response.read()

    mmcif_lines = payload.decode("utf-8").splitlines(True)
    return parse(mmcif_lines)


def fetch_BioAssembly(pdb_ID, index=1):
    """Download a Bio Assembly mmcif file from the RCSB PDB and parse it.

    Parameters
    ----------
    pdb_ID : str
        pdb ID
    index : int
        Bio Assembly index

    Returns
    -------
    Coor
        Coor object built by ``parse`` from the downloaded file

    Examples
    --------
    >>> prot_coor = fetch_BioAssembly('5AEF', index=1)
    """

    # Example of address: https://files.rcsb.org/download/5AEF-assembly1.cif.gz
    url = f"http://files.rcsb.org/download/{pdb_ID}-assembly{index}.cif.gz"

    request = urllib.request.Request(url)
    request.add_header("Accept-Encoding", "gzip")

    with urllib.request.urlopen(request) as response:
        compressed = response.read()

    # The assembly is only distributed gzipped: inflate it before decoding.
    text = gzip.decompress(compressed).decode("utf-8")
    cif_lines = text.splitlines(True)

    return parse(cif_lines)


def _parse_raw_mmcif_lines(mmcif_lines):
    """Parse the mmcif lines and return atom information as a dictionary

    Parameters
    ----------
    mmcif_lines : list
        list of pdb lines

    Returns
    -------
    dict
        dictionary with atom information

    """

    data_mmCIF = OrderedDict()
    tabular = False
    mutli_line = ""

    category = "title"
    attribute = "title"

    for i, line in enumerate(mmcif_lines):
        # print(line, end="")

        if line.startswith("#"):
            tabular = False

        elif line.startswith("loop_"):
            tabular = True
            col_names = []

        elif line.startswith("_"):
            token = shlex.split(line, posix=False)
            category, attribute = token[0].split(".")

            if tabular:
                if category not in data_mmCIF:
                    data_mmCIF[category] = {"col_names": [], "value": []}
                data_mmCIF[category]["col_names"].append(attribute)
                data_mmCIF[category]["value"].append([])
                final_token = []
            else:
                if category not in data_mmCIF:
                    data_mmCIF[category] = OrderedDict()
                # Necessary to handle attributes on 2 lines.
                if len(token) == 2:
                    data_mmCIF[category][attribute] = token[1]

        # Fix the issue with token between 2 ";"
        # Opening ";"
        elif line.startswith(";") and not mutli_line:
            mutli_line += line

        # Closing ";"
        elif line.startswith(";") and mutli_line:
            mutli_line += line
            if tabular:
                final_token += [mutli_line]
                # print(len(final_token), len(data_mmCIF[category]['col_names']), final_token)
                if len(final_token) == len(data_mmCIF[category]["col_names"]):
                    # print("finished", final_token)
                    # remove the last "\n"
                    final_token[-1] = final_token[-1][:-1]
                    for i in range(len(data_mmCIF[category]["col_names"])):
                        data_mmCIF[category]["value"][i].append(final_token[i])
                    final_token = []
            else:
                data_mmCIF[category][attribute] = mutli_line
            mutli_line = ""

        elif mutli_line:
            mutli_line += line

        elif tabular:
            token = shlex.split(line, posix=False)
            token_complete = True
            # print(final_token, len(final_token), token, len(token), len(data_mmCIF[category]['col_names']))

            # TO FIX !!
            if len(token) != len(data_mmCIF[category]["col_names"]):
                if len(final_token) == len(data_mmCIF[category]["col_names"]):
                    token = final_token
                elif len(final_token) + len(token) == len(
                    data_mmCIF[category]["col_names"]
                ):
                    # print("token complete", final_token, token)
                    token = final_token + token
                else:
                    token_complete = False
                    final_token += token

            if token_complete:
                # print("token complete")
                for i in range(len(data_mmCIF[category]["col_names"])):
                    data_mmCIF[category]["value"][i].append(token[i])
                final_token = []
        else:
            token = shlex.split(line, posix=False)
            if category not in data_mmCIF:
                data_mmCIF[category] = OrderedDict()
            data_mmCIF[category][attribute] = token[0]

    return data_mmCIF


def _get_float_format_size(array, dec_num=3):
    """Return the float format size for a given array.

    Parameters
    ----------
    array : numpy.ndarray
        array to format

    Returns
    -------
    str
        float format size

    """

    min_array = np.min(array)
    max_array = np.max(array)

    size = max(len(f"{min_array:.{dec_num}f}"), len(f"{max_array:.{dec_num}f}"))

    return size


def read(file_in):
    """Load a mmcif file from disk and parse it.

    Parameters
    ----------
    file_in : str
        path of the mmcif file to read

    Returns
    -------
    Coor
        Coor object built by ``parse`` from the file lines

    """

    with open(file_in, "r") as mmcif_file:
        mmcif_lines = mmcif_file.readlines()

    coor_obj = parse(mmcif_lines)
    return coor_obj


def _cell_from_crystal_pack(crystal_pack):
    """Build the `_cell` category from a crystal pack string, or return None when it is empty."""
    if crystal_pack.startswith("CRYST1"):
        # Fixed-width PDB CRYST1 record.
        line = crystal_pack
        a = float(line[6:15])
        b = float(line[15:24])
        c = float(line[24:33])
        alpha = float(line[33:40])
        beta = float(line[40:47])
        gamma = float(line[47:54])
        try:
            z = int(line[67:70])
        except ValueError:
            z = 1
    elif len(crystal_pack) > 0:
        # Box given as 3 or 9 whitespace separated numbers, in the order:
        #  v1(x) v2(y) v3(z) v1(y) v1(z) v2(x) v2(z) v3(x) v3(y)
        line_split = crystal_pack.split()
        if len(line_split) == 3:
            v1 = np.array([float(line_split[0]), 0.0, 0.0])
            v2 = np.array([0.0, float(line_split[1]), 0.0])
            v3 = np.array([0.0, 0.0, float(line_split[2])])
        elif len(line_split) == 9:
            v1 = np.array(
                [float(line_split[0]), float(line_split[3]), float(line_split[4])]
            )
            v2 = np.array(
                [float(line_split[5]), float(line_split[1]), float(line_split[6])]
            )
            v3 = np.array(
                [float(line_split[7]), float(line_split[8]), float(line_split[2])]
            )
        else:
            raise ValueError(
                f"Cannot read box vectors from crystal pack: {crystal_pack!r}"
            )
        # Lengths are multiplied by 10 (presumably nm to Angstrom — confirm).
        a = sum(v1**2) ** 0.5 * 10
        b = sum(v2**2) ** 0.5 * 10
        c = sum(v3**2) ** 0.5 * 10
        # angle_vec comes from the geom module imported by this file (the
        # bare name is not defined here); it is assumed to return radians.
        alpha = np.rad2deg(geom.angle_vec(v2, v3))
        beta = np.rad2deg(geom.angle_vec(v1, v3))
        gamma = np.rad2deg(geom.angle_vec(v1, v2))
        # Following is wrong, to check !!!
        z = 1
    else:
        return None

    return {
        "length_a": str(a),
        "length_b": str(b),
        "length_c": str(c),
        "angle_alpha": str(alpha),
        "angle_beta": str(beta),
        "angle_gamma": str(gamma),
        "Z_PDB": str(z),
    }


def _atom_site_rows(coor):
    """Return one formatted `_atom_site` row per atom, for every model of *coor*."""
    row_format = (
        "{:6s} {:<{atom_num_size}d} {:{elem_size}s} {:{name_size}s} {:1s} {:{resname_size}s} "
        "{:{chain_size}s} 1 {:<{resnum_size}d} {:1s} {:<{x_size}.3f} {:<{y_size}.3f} "
        "{:<{z_size}.3f} {:<4.2f} {:<{beta_size}.2f} {:1s} {:<{resid_size}d}"
        " {:{resname_size}s} {:{chain_size}s} {:{name_size}s} {:1d}\n"
    )

    # Get column size: numbering widths come from the last model, text and
    # float widths from the first one.
    first = coor.models[0].atom_dict
    last = coor.models[-1].atom_dict
    sizes = {
        "atom_num_size": len(str(last["num_resid_uniqresid"][-1, 0])),
        "resnum_size": len(str(max(last["num_resid_uniqresid"][:, 2]))),
        "resid_size": len(str(max(last["num_resid_uniqresid"][:, 1]))),
        "name_size": len(max(first["name_resname_elem"][:, 0], key=len)),
        "chain_size": len(max(first["alterloc_chain_insertres"][:, 1], key=len)),
        "resname_size": len(max(first["name_resname_elem"][:, 1], key=len)),
        "elem_size": len(max(first["name_resname_elem"][:, 2], key=len)),
        "x_size": _get_float_format_size(first["xyz"][:, 0]),
        "y_size": _get_float_format_size(first["xyz"][:, 1]),
        "z_size": _get_float_format_size(first["xyz"][:, 2]),
        "beta_size": _get_float_format_size(first["occ_beta"][:, 1], dec_num=2),
    }

    rows = []
    for model_num, model in enumerate(coor.models, start=1):
        atoms = model.atom_dict
        for i in range(model.len):
            name = str(atoms["name_resname_elem"][i, 0])
            resname = str(atoms["name_resname_elem"][i, 1])
            elem = str(atoms["name_resname_elem"][i, 2])
            alt_pos = str(atoms["alterloc_chain_insertres"][i, 0])
            chain = str(atoms["alterloc_chain_insertres"][i, 1])
            insert_res = str(atoms["alterloc_chain_insertres"][i, 2])
            # Empty values must be written with the mmcif placeholders,
            # otherwise the column vanishes and the row is shifted.
            if alt_pos == "":
                alt_pos = "."
            if insert_res == "":
                insert_res = "?"

            rows.append(
                row_format.format(
                    FIELD_DICT[atoms["field"][i]],
                    atoms["num_resid_uniqresid"][i, 0],
                    elem,
                    name,
                    alt_pos,
                    resname,
                    chain,
                    atoms["num_resid_uniqresid"][i, 2] + 1,
                    insert_res,
                    atoms["xyz"][i, 0],
                    atoms["xyz"][i, 1],
                    atoms["xyz"][i, 2],
                    atoms["occ_beta"][i, 0],
                    atoms["occ_beta"][i, 1],
                    # pdbx_formal_charge is not stored in the model: unknown.
                    "?",
                    atoms["num_resid_uniqresid"][i, 1],
                    resname,
                    chain,
                    name,
                    model_num,
                    **sizes,
                )
            )
    return rows


def _loop_category_parts(category, table, line_max_len):
    """Return the text pieces of a `loop_` category (column names, then aligned rows)."""
    col_names = table["col_names"]
    columns = table["value"]

    parts = ["loop_\n"]
    raw_width = []
    for i, col_name in enumerate(col_names):
        parts.append(f"{category}.{col_name} \n")
        # Column width: longest word, ignoring ";" delimited text blocks
        # (0 when the column only holds text blocks).
        raw_width.append(
            max(
                (len(elem) for elem in columns[i] if not elem.startswith(";")),
                default=0,
            )
        )

    # Columns before which a row is broken to stay under line_max_len
    tot_width = 0
    break_set = set()
    for i, width in enumerate(raw_width):
        tot_width += width + 1
        if tot_width > line_max_len:
            break_set.add(i)
            tot_width = 0

    for i in range(len(columns[0])):
        for j in range(len(col_names)):
            word = columns[j][i]
            # If the word starts with a ";", we add a new line
            if word[0] == ";":
                # Except if we already are at the start of a line
                if parts[-1].endswith("\n"):
                    parts.append(f"{word}")
                else:
                    parts.append(f"\n{word}")
            # If the line gets too long, we break it
            elif j in break_set:
                parts.append(f"\n{word:{raw_width[j]}} ")
            else:
                parts.append(f"{word:{raw_width[j]}} ")
        parts.append("\n")
    return parts


def _key_value_category_parts(category, content, line_max_len):
    """Return the text pieces of a key/value category, one aligned tag per attribute."""
    max_len = len(max(content, key=len, default="")) + len(category) + 3

    parts = []
    for attribute in content:
        value = content[attribute]
        tag = f"{'.'.join([category, attribute]):{max_len}}"
        if value.startswith(";"):
            # Text blocks already carry their delimiters and final newline.
            parts.append(f"{tag} \n{value}")
        else:
            local_str = f"{tag} {value} \n"
            if len(local_str) > line_max_len:
                # Too long for a single line: put the value on its own line.
                parts.append(f"{tag} \n{value} \n")
            else:
                parts.append(local_str)
    return parts


def get_mmcif_string(coor):
    """Return a coor object as a mmcif string.

    Categories are written in the order of ``coor.data_mmCIF``. When that
    dictionary is empty it is first filled, in place, with a default title
    and entry id, a `_cell` category derived from ``coor.crystal_pack`` (if
    any) and an `_atom_site` placeholder.

    Parameters
    ----------
    coor : Coor
        Coor object to write

    Returns
    -------
    str
        Coor object as a mmcif string

    Raises
    ------
    ValueError
        If ``coor.crystal_pack`` is neither empty, a CRYST1 record, nor a
        list of 3 or 9 numbers.
    """

    line_max_len = 135
    old_category = ""

    if len(coor.data_mmCIF) == 0:
        coor.data_mmCIF = {
            "title": {"title": "untitled"},
            "_entry": {"id": "XXXX"},
        }
        cell = _cell_from_crystal_pack(coor.crystal_pack)
        if cell is not None:
            coor.data_mmCIF["_cell"] = cell
        coor.data_mmCIF["_atom_site"] = None

    # Pieces are collected in a list and joined once at the end.
    parts = []
    for category in coor.data_mmCIF:
        content = coor.data_mmCIF[category]
        if category == "title":
            parts.append(f"{content['title']}\n")
        elif category == "_atom_site":
            # Atoms are taken from the models, not from data_mmCIF.
            parts.append(MMCIF_ATOM_SITE)
            parts.extend(_atom_site_rows(coor))
        else:
            # Add a # for each new category
            if category != old_category:
                parts.append("# \n")
                old_category = category
            if "col_names" in content:
                parts.extend(_loop_category_parts(category, content, line_max_len))
            else:
                parts.extend(
                    _key_value_category_parts(category, content, line_max_len)
                )
    parts.append("#\n")

    return "".join(parts)


def write(coor, mmcif_out, overwrite=False):
    """Write a mmcif file.

    Parameters
    ----------
    coor : Coor
        Coor object
    mmcif_out : str
        path of the mmcif file to write
    overwrite : bool, optional, default=False
        flag to overwrite or not if file has already been created.

    Returns
    -------
    None

    """

    if not overwrite and os.path.exists(mmcif_out):
        logger.info(f"MMCIF file {mmcif_out} already exist, file not saved")
        return

    # Build the content before opening the file, so that a formatting error
    # does not leave an empty or truncated file behind.
    mmcif_string = get_mmcif_string(coor)

    # The context manager closes the file even if the write fails.
    with open(mmcif_out, "w") as filout:
        filout.write(mmcif_string)

    logger.info(f"Succeed to save file {os.path.relpath(mmcif_out)}")
    return