Source code for group_decomposition.utils

"""
utils.py

Various utilities used in fragmenting molecules - retrieving information from molecules
Identifying small parts of molecules, and tools for minor manipulations of SMILES or molecules
"""
import ast
import os

import pandas as pd
from rdkit import Chem  # pylint:disable=import-error
from rdkit.Chem import (  # pylint:disable=import-error # search for rdScaffoldAttachment points * to remove; pylint:disable=import-error; rdDetermineBonds,
    AllChem,
    rdDetermineBonds,
    rdqueries,
)



[docs]
def get_molecules_atomicnum(molecule):
    """Given molecule object, get list of atomic numbers."""
    # a
    atom_num_list = []
    for atom in molecule.GetAtoms():
        atom_num_list.append(atom.GetAtomicNum())
    return atom_num_list




[docs]
def get_molecules_atomsinrings(molecule):
    """Given molecule object, get Boolean list of if atoms are in a ring."""
    is_in_ring_list = []
    for atom in molecule.GetAtoms():
        is_in_ring_list.append(atom.IsInRing())
    return is_in_ring_list




[docs]
def trim_placeholders(rwmol):
    """Given Chem.RWmol, remove atoms with atomic number 0."""
    qa = rdqueries.AtomNumEqualsQueryAtom(0)  # define query for atomic number 0
    if len(rwmol.GetAtomsMatchingQuery(qa)) > 0:  # if there are matches
        query_match = rwmol.GetAtomsMatchingQuery(qa)
        rm_at_idx = []
        for atom in query_match:  # identify atoms to be removed
            rm_at_idx.append(atom.GetIdx())
            # remove starting from highest number so upcoming indices not affected
        rm_at_idx_sort = sorted(rm_at_idx, reverse=True)
        # e.g. need to remove 3 and 5, if don't do this and you remove 3,
        # then the 5 you want to remove is now 4, and you'll remove wrong atom
        for idx in rm_at_idx_sort:  # remove atoms
            rwmol.RemoveAtom(idx)
    return rwmol




[docs]
def mol_with_atom_index(mol):
    """Label a molecule with map numbers for order atoms are in molecule.

    Useful for tracking atoms after fragmenting

    Args:
        mol: an rdkit molecule object

    Returns:
        rdkit molecule object with AtomMapNum set so that atom 0
        has map number 1, atom 1 has map number 2...etc

    """
    # from https://www.rdkit.org/docs/Cookbook.html
    for atom in mol.GetAtoms():
        if atom.GetAtomicNum() != 0:
            atom.SetAtomMapNum(atom.GetIdx() + 1)
    return mol



def _get_charge_from_cml(cml_file):
    with open(cml_file, encoding="utf-8") as file:
        for line in file:
            if "formalCharge" in line:
                split_line = line.split(" ")
                for word in split_line:
                    if "formalCharge" in word:
                        charge = int(
                            word.replace("formalCharge=", "")
                            .replace(">\n", "")
                            .replace('"', "")
                        )
                        break
    return charge


def mol_from_xyzfile(xyz_file: str, cml_file):
    """Attempt to create a molecule from an xyz file by automatically determining bond orders"""
    # raw_mol = Chem.MolFromXYZFile('DUDE_67368827_adrb2_decoys_C19H25N3O4_CIR.xyz')
    # mol = Chem.Mol(raw_mol)
    # rdDetermineBonds.DetermineConnectivity(mol)
    # rdDetermineBonds.DetermineBondOrders(mol)
    mol = Chem.Mol(Chem.MolFromXYZFile(xyz_file))
    try:
        rdDetermineBonds.DetermineBonds(mol, charge=_get_charge_from_cml(cml_file))
    except:  # pylint:disable=bare-except
        if os.path.isfile("error_log.txt"):
            with open("error_log.txt", "a", encoding="utf-8") as er_file:
                er_file.write(
                    f"Could not determine bond orders from xyz for {cml_file}\n"
                )
        else:
            with open("error_log.txt", "w", encoding="utf-8") as er_file:
                er_file.write(f"Could not determine bond orders for {cml_file}\n")
        return None

    atomic_symbols = []
    xyz_coordinates = []
    ats_read = 0
    num_atoms = mol.GetNumAtoms()
    with open(xyz_file, encoding="utf-8") as file:
        for line_number, line in enumerate(file):
            if ats_read < num_atoms and line_number > 1:
                ats_read += 1
                atomic_symbol, x, y, z = line.split()[:4]
                atomic_symbols.append(atomic_symbol)
                xyz_coordinates.append([float(x), float(y), float(z)])
            elif ats_read == num_atoms:
                break
    return {
        "Molecule": mol_with_atom_index(mol),
        "xyz_pos": xyz_coordinates,
        "atomic_symbols": atomic_symbols,
    }


# from https://github.com/rdkit/rdkit/issues/2413
# conf = m.GetConformer()


# in principal, you should check that the atoms match
# for i in range(m.GetNumAtoms()):
#     print(i)
#     x,y,z = xyz_coordinates[i]
#     conf.SetAtomPosition(i,Point3D(x,y,z))



[docs]
def get_cml_atom_types(cml_file):
    """Extract atom types from cml file

    Args:
        cml_file: cml file name

    Returns:
        Data Frame column whose elements are tuples of
        atom types as defined by Retrievium. (atom number, type, valence)

    """
    n_atl = 0
    type_list = []
    idx_list = []
    with open(cml_file, encoding="utf-8") as file:
        for line in file:
            if "atomTypeList" in line:
                n_atl += 1
                if n_atl == 2:
                    break
            elif n_atl == 1:
                split_line = line.split()
                idx_list.append(
                    int(split_line[1].split("=")[1].replace('"', "").replace("a", ""))
                    - 1
                )
                at_label = split_line[2].split("=")[1].replace('"', "")
                at_type = int(split_line[3].split("=")[1].replace('"', ""))
                at_valence = int(
                    split_line[4].split("=")[1].split("/")[0].replace('"', "")
                )
                type_list.append((at_label, at_type, at_valence))
    temp_frame = pd.DataFrame(list(zip(idx_list, type_list)), columns=["idx", "type"])
    temp_frame.sort_values(by="idx", inplace=True)
    return temp_frame["type"]



def add_cml_atoms_bonds(el_list, bond_list):
    """create a molecule from cml file, building it one atom at a time then
    adding in bond orders.

    Note: Bond orders from Retrievium cml are problematic at times.

    Recommended construction is to build atoms and connectivity from cml file
    Then assign bond orders based on template smiles also found in cml file"""
    flag = 1
    for atom in el_list:
        if flag:
            mol = Chem.MolFromSmiles(atom)
            rwmol = Chem.RWMol(mol)
            rwmol.BeginBatchEdit()
            flag = 0
        else:
            rwmol.AddAtom(Chem.Atom(atom))
    # mw.AddBond(6,7,Chem.BondType.SINGLE)
    for bond in bond_list:
        if bond[2] == "S":
            rwmol.AddBond(bond[0] - 1, bond[1] - 1, Chem.BondType.SINGLE)
        elif bond[2] == "D":
            rwmol.AddBond(bond[0] - 1, bond[1] - 1, Chem.BondType.DOUBLE)
        elif bond[2] == "T":
            rwmol.AddBond(bond[0] - 1, bond[1] - 1, Chem.BondType.TRIPLE)
        elif bond[2] == "A":
            rwmol.AddBond(bond[0] - 1, bond[1] - 1, Chem.BondType.AROMATIC)
    rwmol.CommitBatchEdit()
    return rwmol


def _add_cml_single_atoms_bonds(el_list, bond_list):
    """Build a mol from list of atoms and bonds, one atom at a time
    Bonds assigned are only single bonds.

    Adjust bonds after with _modAssignBondOrdersFromTemplate"""
    rwmol = Chem.RWMol(Chem.Mol())
    for atom in el_list:
        rwmol.AddAtom(Chem.Atom(atom))
    # mw.AddBond(6,7,Chem.BondType.SINGLE)
    for bond in bond_list:
        rwmol.AddBond(bond[0] - 1, bond[1] - 1, Chem.BondType.SINGLE)
    rwmol.CommitBatchEdit()
    return rwmol



[docs]
def smiles_from_cml(cml_file, smile_tag="retrievium:inputSMILES"):
    """Finds the Retreivium  SMILES in a cml file with a given label

    Args:
        cml_file: cml file name
        smile_tag: the label fo the SMILEs in the cml file. Defaults to input SMILEs

    Returns:
        string of the input SMILES code tagged in the file as retrievium:inputSMILES

    Note:
        Must be used on .cml files from the Retrievium database https://retrievium.ca

    """
    flag = 0
    with open(cml_file, encoding="utf-8") as file:
        for line in file:
            if smile_tag in line:
                flag = 1
            elif flag == 1:
                smile = line.split(">")[1].split("<")[0]
                break
    return smile




[docs]
def mol_from_cml(cml_file, input_type="cmlfile"):
    """Creates a molecule from a cml file and returns atoms, xyz and types

    Builds molecule one atom at a time connected by single bonds
    Then determines bond orders by mapping to a template smiles in the cml
    Finally, updates property cache, initializes ring info, and sanitizes mol
    If no match between SMILEs and connectivity, returns None and writes
    error to file

    Args:
        cml_file: cml file name or dictionary containing cml data
        input_type: cmlfile or cmldict. Use cmlfile if just raw cml file, use cmldict if dictionary

    Returns:
        list with elements: [Molecule, list of atom symbols in molecule,
        list of xyz coords of atoms in molecule,
        list of atom types of atoms in molecule]

    Note:
        list order matches mol numbering in cml

    """

    if input_type == "cmlfile":
        xyz_coords, at_types, bond_list, el_list, _ = data_from_cml(cml_file, True)
        smile = smiles_from_cml(cml_file)
    elif input_type == "cmldict":
        xyz_coords = cml_file["geom"]
        at_types = cml_file["atom_types"]
        bond_list = cml_file["bonds"]
        el_list = cml_file["labels"]
        # charge = cml_file["charge"]
        smile = cml_file["smiles"]
    rwmol = _add_cml_single_atoms_bonds(el_list, bond_list)
    for atom in rwmol.GetAtoms():
        atom.SetNoImplicit(True)

    rwmol2 = Chem.RemoveHs(rwmol, implicitOnly=True, updateExplicitCount=False)
    template = AllChem.MolFromSmiles(smile)
    bond_mol = _modAssignBondOrdersFromTemplate(template, rwmol2, cml_file)
    # need rings for aromaticity check
    # if os.path.isfile('error_log.txt'):
    #     er_file = open('error_log.txt','a')
    #     er_file.write(f'Could not sanitize {cml_file}\n')
    #     er_file.close()
    # else:
    #     er_file = open('error_log.txt','w')
    #     er_file.write(f'Could not sanitize {cml_file}\n')
    #     er_file.close()
    if bond_mol:
        bond_mol.UpdatePropertyCache()
        Chem.GetSymmSSSR(bond_mol)
        try:
            Chem.SanitizeMol(bond_mol)
        except:  # pylint:disable=bare-except
            _write_error(f"Could not sanitize {cml_file}\n")
            return [None, None, None, None]
        return [mol_with_atom_index(bond_mol), el_list, xyz_coords, at_types]
        # if os.path.isfile('error_log.txt'):
        #     er_file = open('error_log.txt','a')
        #     er_file.write(f'No match between template smiles and connected geom for {cml_file}\n')
        #     er_file.close()
        # else:
        #     er_file = open('error_log.txt','w')
        #     er_file.write(f'No match between template smiles and connected geom for {cml_file}\n')
        #     er_file.close()
    return [None, None, None, None]



def _write_error(errormessage):
    """writes errormessage to errorlog.txt"""
    if os.path.isfile("error_log.txt"):
        with open("error_log.txt", "a", encoding="utf-8") as er_file:
            er_file.write(errormessage)
    else:
        with open("error_log.txt", "w", encoding="utf-8") as er_file:
            er_file.write(errormessage)


def _modAssignBondOrdersFromTemplate(refmol, mol, cml_file):
    """This is originally from RDKit AllChem module.
      Modified here by Kevin Lefrancois-Gagnon(KLG) to disallow implicit hydrogens on all
      molcule objects used. This corresponds to the 4 for loops after creation of mol
      objects.
      Also Returns None for no match instead of value error, for use on large number of
      molecules, a single error won't stop the whole thing
      All other code is unmodified from the original.
      This was required by KLG for the following reason:
        using the original AssignBondOrdersFromTemplate on a molecule with
        only explicit hydrogens with only connectivity, no  bond types other than
        single bonds. When a charged atom like nitrogen was present, extra H were being added
        For example when a part was expected as C-[NH2+]-C, the following was seen
        C-[NH2+]([H])([H])-C, essentially doubling the expected # hydrogens
        These could not be deleted by Chem.RemoveHs(mol,implicitOnly=True) as for
        some reason these were explicit. The reason for why these were added is unclear

        Beyond the obviously problematic extra hydrogens, it also produced an error case
        when the later part of the code would clean up bond orders, like for some ring ketones
        in a ring labeled aromatic by the SMILEs.
        Adjusting implicit Hs to be disallowed for all molecules and their copies at the start
        resulted in the normal expected outputs.
    Resume original documentation:
      assigns bond orders to a molecule based on the
        bond orders in a template molecule

        Arguments
          - refmol: the template molecule
          - mol: the molecule to assign bond orders to

        An example, start by generating a template from a SMILES
        and read in the PDB structure of the molecule

        >>> import os
        >>> from rdkit.Chem import AllChem
        >>> template = AllChem.MolFromSmiles("CN1C(=NC(C1=O)(c2ccccc2)c3ccccc3)N")
        >>> mol = AllChem.MolFromPDBFile(os.path.join(RDConfig.RDCodeDir, 'Chem', 'test_data', '4DJU_lig.pdb'))
        >>> len([1 for b in template.GetBonds() if b.GetBondTypeAsDouble() == 1.0])
        8
        >>> len([1 for b in mol.GetBonds() if b.GetBondTypeAsDouble() == 1.0])
        22

        Now assign the bond orders based on the template molecule

        >>> newMol = AllChem.AssignBondOrdersFromTemplate(template, mol)
        >>> len([1 for b in newMol.GetBonds() if b.GetBondTypeAsDouble() == 1.0])
        8

        Note that the template molecule should have no explicit hydrogens
        else the algorithm will fail.

        It also works if there are different formal charges (this was github issue 235):

        >>> template=AllChem.MolFromSmiles('CN(C)C(=O)Cc1ccc2c(c1)NC(=O)c3ccc(cc3N2)c4ccc(c(c4)OC)[N+](=O)[O-]')
        >>> mol = AllChem.MolFromMolFile(os.path.join(RDConfig.RDCodeDir, 'Chem', 'test_data', '4FTR_lig.mol'))
        >>> AllChem.MolToSmiles(mol)
        'COC1CC(C2CCC3C(O)NC4CC(CC(O)N(C)C)CCC4NC3C2)CCC1N(O)O'
        >>> newMol = AllChem.AssignBondOrdersFromTemplate(template, mol)
        >>> AllChem.MolToSmiles(newMol)
        'COc1cc(-c2ccc3c(c2)Nc2ccc(CC(=O)N(C)C)cc2NC3=O)ccc1[N+](=O)[O-]'

    """
    # pylint:disable=too-many-branches
    refmol = Chem.AddHs(refmol)
    refmol2 = Chem.Mol(refmol)
    refmol2 = Chem.AddHs(refmol2)
    mol2 = Chem.Mol(mol)
    for atom in mol.GetAtoms():
        atom.SetNoImplicit(True)
    for atom in mol.GetAtoms():
        atom.SetNoImplicit(True)
    for atom in refmol2.GetAtoms():
        atom.SetNoImplicit(True)
    for atom in refmol.GetAtoms():
        atom.SetNoImplicit(True)
    # do the molecules match already?
    matching = mol2.GetSubstructMatch(refmol2)
    if not matching:  # no, they don't match
        # check if bonds of mol are SINGLE Chem.rdchem.BondType.DOUBLE
        for b in mol2.GetBonds():
            if b.GetBondType() != Chem.rdchem.BondType.SINGLE:
                b.SetBondType(Chem.rdchem.BondType.SINGLE)
                b.SetIsAromatic(False)
        # set the bonds of mol to SINGLE
        for b in refmol2.GetBonds():
            b.SetBondType(Chem.rdchem.BondType.SINGLE)
            b.SetIsAromatic(False)
        # set atom charges to zero;
        for a in refmol2.GetAtoms():
            a.SetFormalCharge(0)
        for a in mol2.GetAtoms():
            a.SetFormalCharge(0)

        matching = mol2.GetSubstructMatches(refmol2, uniquify=False)
        # do the molecules match now?
        if matching:
            #   if len(matching) > 1:
            #     logger.warning(msg="More than one matching pattern found - picking one")
            matching = matching[0]
            # apply matching: set bond properties
            for b in refmol.GetBonds():
                atom1 = matching[b.GetBeginAtomIdx()]
                atom2 = matching[b.GetEndAtomIdx()]
                b2 = mol2.GetBondBetweenAtoms(atom1, atom2)
                b2.SetBondType(b.GetBondType())
                b2.SetIsAromatic(b.GetIsAromatic())
            # apply matching: set atom properties
            for a in refmol.GetAtoms():
                a2 = mol2.GetAtomWithIdx(matching[a.GetIdx()])
                a2.SetHybridization(a.GetHybridization())
                a2.SetIsAromatic(a.GetIsAromatic())
                a2.SetNumExplicitHs(a.GetNumExplicitHs())
                a2.SetFormalCharge(a.GetFormalCharge())
            try:
                Chem.SanitizeMol(mol2)
            except:  # pylint:disable=bare-except
                er_message = f"Could not sanitize {cml_file}\n"
                # smile = smiles_from_cml(cml_file)
                er_message += _at_num_er(refmol2, mol2)
                # smimol = Chem.MolFromSmiles()
                _write_error(er_message)
                return None
            if hasattr(mol2, "__sssAtoms"):  # pylint:disable=protected-access
                mol2.__sssAtoms = None  # pylint:disable=protected-access # we don't want all bonds highlighted
        else:
            er_message = (
                f"No match between template smiles and connected geom for {cml_file}\n"
            )
            er_message += _at_num_er(refmol2, mol2)
            _write_error(er_message)
            return None
    return mol2


def _at_num_er(refmol2, mol2):
    """check numbering of atoms"""
    er_message = ""
    smi_num = []
    smi_lab = []
    symb_list = []
    num_list = []
    for atom in mol2.GetAtoms():
        if atom.GetSymbol() not in symb_list:
            symb_list.append(atom.GetSymbol())
            num_list.append(1)
        else:
            num_list[symb_list.index(atom.GetSymbol())] += 1
    for atom in refmol2.GetAtoms():
        if atom.GetSymbol() not in smi_lab:
            smi_lab.append(atom.GetSymbol())
            smi_num.append(1)
        else:
            smi_num[smi_lab.index(atom.GetSymbol())] += 1
    for i, num in enumerate(num_list):
        j = smi_lab.index(symb_list[i])
        if smi_num[j] != num:
            er_message += f"Expected {smi_num[j]} {smi_lab[j]} from SMILES, observed {num_list[i]} in xyz\n"
    return er_message



[docs]
def data_from_cml(cml_file, bonds=False):
    """Gets symbols, xyz coords, bonds and charge of a mol from cml file

    Args:
        cml_file: .cml filename

    Returns:
        list with relevant data from cml file.
        Elements in order are: molecular geometry, atom types,
        list of bonds, list of elements, charge


    Note:
        This is designed to parse files specifically from the Retrievium database
        https://retrievium.ca

        the SMILEs extracted is labelled with tag retrievium:inputSMILES

        The geometry extracted is from the third atomArray block in the .cml file

    Example Usage:
        >>> utils.data_from_cml(cml_file)

    """
    # pylint:disable=too-many-locals
    # pylint:disable=too-many-branches
    # pylint:disable=too-many-statements
    num_atom_array = 0
    geom_list = []
    n_atl = 0
    n_bary = 0
    type_list = []
    idx_list = []
    bond_list = []
    el_list = []
    with open(cml_file, encoding="utf-8") as file:
        for line in file:
            if "formalCharge" in line:
                split_line = line.split(" ")
                for word in split_line:
                    if "formalCharge" in word:
                        charge = int(
                            word.replace("formalCharge=", "")
                            .replace(">\n", "")
                            .replace('"', "")
                        )
                        continue
            if "atomArray" in line:
                num_atom_array += 1
                if num_atom_array == 5:
                    continue
            if num_atom_array == 5:
                quote_split = line.split('"')
                # maybe only do el_list with bonds?
                el_list.append(quote_split[3])
                x_split = quote_split[5]
                y_split = quote_split[7]
                z_split = quote_split[9]
                geom_list.append(
                    [
                        float(ast.literal_eval(x_split)),
                        float(ast.literal_eval(y_split)),
                        ast.literal_eval(z_split),
                    ]
                )
            elif num_atom_array == 6:
                if bonds:
                    if "bondArray" in line:
                        n_bary += 1
                        if n_bary in [1, 2]:
                            continue
                    elif n_bary == 1:
                        split_line = line.split()
                        at_1 = int(split_line[1].split('"')[1].replace("a", ""))
                        at_2 = int(split_line[2].replace('"', "").replace("a", ""))
                        b_ord = split_line[4].split('"')[1]
                        bond_list.append((at_1, at_2, b_ord))
                if "atomTypeList" in line:
                    n_atl += 1
                    if n_atl == 1:
                        continue
                    if n_atl == 2:
                        break
                elif n_atl == 1:
                    split_line = line.split()
                    idx_list.append(
                        int(
                            split_line[1]
                            .split("=")[1]
                            .replace('"', "")
                            .replace("a", "")
                        )
                        - 1
                    )
                    at_label = split_line[2].split("=")[1].replace('"', "")
                    at_type = int(split_line[3].split("=")[1].replace('"', ""))
                    at_valence = int(
                        split_line[4].split("=")[1].split("/")[0].replace('"', "")
                    )
                    type_list.append((at_label, at_type, at_valence))
    temp_frame = pd.DataFrame(list(zip(idx_list, type_list)), columns=["idx", "type"])
    temp_frame.sort_values(by="idx", inplace=True)
    if bonds:
        return [geom_list, list(temp_frame["type"]), bond_list, el_list, charge]
    return [geom_list, list(temp_frame["type"])]




[docs]
def all_data_from_cml(data):
    """Gets symbols, xyz coords, bonds and charge of a mol from cml file

    Args:
        data: lines of a .cml file

    Returns:
        dictionary with relevant data from cml file.
        Keys included are 'geom', 'atom_types', 'bonds',
        'labels', 'charge', 'multiplicity', 'smiles'

    Note:
        Not used in :attr:`group_decomposition.fragfunctions.identify_connected_fragments`
        This is used in an AiiDA workflow employing this package

        This is designed to parse files specifically from the Retrievium database
        https://retrievium.ca

        the SMILEs extracted is labelled with tag retrievium:inputSMILES

        The geometry extracted is from the third atomArray block in the .cml file

    Example Usage:
        >>> utils.all_data_from_cml(cml_file)

    """
    # pylint:disable=too-many-locals
    # pylint:disable=too-many-branches
    # pylint:disable=too-many-statements
    num_atom_array = 0
    geom_list = []
    n_atl = 0
    n_bary = 0
    type_list = []
    idx_list = []
    bond_list = []
    el_list = []
    smi_flag = 0
    for line in data:
        if "formalCharge" in line:
            split_line = line.split(" ")
            for word in split_line:
                if "spinMultiplicity" in word:
                    multiplicity = int(
                        word.replace("spinMultiplicity=", "").replace('"', "")
                    )
                if "formalCharge" in word:
                    charge = int(
                        word.replace("formalCharge=", "")
                        .replace('"', "")
                        .replace(">", "")
                        .replace("\n", "")
                    )
                    continue
        if "retrievium:inputSMILES" in line:
            smi_flag = 1
        elif smi_flag == 1:
            smile = line.split(">")[1].split("<")[0]
            smi_flag = 2
        if "atomArray" in line:
            num_atom_array += 1
            if num_atom_array == 5:
                continue
        if num_atom_array == 5:
            quote_split = line.split('"')
            # maybe only do el_list with bonds?
            el_list.append(quote_split[3])
            x_split = quote_split[5]
            y_split = quote_split[7]
            z_split = quote_split[9]
            geom_list.append(
                [
                    float(ast.literal_eval(x_split)),
                    float(ast.literal_eval(y_split)),
                    float(ast.literal_eval(z_split)),
                ]
            )
        elif num_atom_array == 6:
            if "bondArray" in line:
                n_bary += 1
                if n_bary in [1, 2]:
                    continue
            elif n_bary == 1:
                split_line = line.split()
                at_1 = int(split_line[1].split('"')[1].replace("a", ""))
                at_2 = int(split_line[2].replace('"', "").replace("a", ""))
                b_ord = split_line[4].split('"')[1]
                bond_list.append((at_1, at_2, b_ord))
            if "atomTypeList" in line:
                n_atl += 1
                if n_atl == 1:
                    continue
                if n_atl == 2:
                    break
            elif n_atl == 1:
                split_line = line.split()
                idx_list.append(
                    int(split_line[1].split("=")[1].replace('"', "").replace("a", ""))
                    - 1
                )
                at_label = split_line[2].split("=")[1].replace('"', "")
                at_type = int(split_line[3].split("=")[1].replace('"', ""))
                at_valence = int(
                    split_line[4].split("=")[1].split("/")[0].replace('"', "")
                )
                type_list.append((at_label, at_type, at_valence))
    temp_frame = pd.DataFrame(list(zip(idx_list, type_list)), columns=["idx", "type"])
    temp_frame.sort_values(by="idx", inplace=True)

    return {
        "geom": geom_list,
        "atom_types": list(temp_frame["type"]),
        "bonds": bond_list,
        "labels": el_list,
        "charge": charge,
        "multiplicity": multiplicity,
        "smiles": smile,  # pylint:disable=used-before-assignment
    }




[docs]
def xyz_list_to_str(xyz_list):
    """Convert 2d array to string [[a,b,c],[d,e,f]] -> a, b, c\n d, e, f\n"""
    outstr = ""
    for row in xyz_list:
        for at in row:
            outstr += str(at)
            outstr += ", "
        outstr += "\n"
    return outstr




[docs]
def list_to_str(lst):
    """Convert a list to string. [a,b,c] -> a b c"""
    out_str = ""
    for it in lst:
        out_str += str(it) + " "
    return out_str




[docs]
def mol_from_molfile(mol_file):
    """takes mol_file and returns mol wth atom numbers the same

    Modified for mol file structure from retrievium
    from stackexchange
    https://mattermodeling.stackexchange.com/questions/7234/how-to-input-3d-coordinates-from-xyz-file-and-connectivity-from-smiles-in-rdkit

    Args:
        mol_file: name of .mol file

    Returns:
        mol with AtomMapNumbers labeled by :attr:`group_decomposition.utils.mol_with_atom_index`

    """  # pylint:disable=line-too-long
    m = Chem.MolFromMolFile(mol_file, removeHs=False)
    if not m:
        raise ValueError(f"""Problem creating molecule from {mol_file}""")
    # this assumes whatever program you use doesn't re-order atoms
    #  .. which is usually a safe assumption
    #  .. so we don't bother tracking atoms
    m = mol_with_atom_index(m)
    atomic_symbols = []
    xyz_coordinates = []
    ats_read = 0
    num_atoms = m.GetNumAtoms()
    with open(mol_file, encoding="utf-8") as file:
        for line_number, line in enumerate(file):
            if ats_read < num_atoms and line_number > 3:
                ats_read += 1
                x, y, z, atomic_symbol = line.split()[:4]
                atomic_symbols.append(atomic_symbol)
                xyz_coordinates.append([float(x), float(y), float(z)])
            elif ats_read == num_atoms:
                break
    return {"Molecule": m, "xyz_pos": xyz_coordinates, "atomic_symbols": atomic_symbols}




[docs]
def xyz_from_cml(cml_file):
    """Extract xyz coordinates from cml file

    Args:
        cml_file: cml file name

    Returns:
        list of length 3 lists containing a molecule's xyz coordinates

    Note:
        Must be used on .cml files from the Retrievium database https://retrievium.ca

    """
    num_atom_array = 0
    geom_list = []
    with open(cml_file, encoding="utf-8") as file:
        for line in file:
            if "atomArray" in line:
                num_atom_array += 1
                if num_atom_array == 5:
                    continue
            if num_atom_array == 5:
                space_split = line.split()
                x_split = space_split[3].split("=")
                y_split = space_split[4].split("=")
                z_split = space_split[5].split("=")
                geom_list.append(
                    [
                        float(ast.literal_eval(x_split[1])),
                        float(ast.literal_eval(y_split[1])),
                        float(ast.literal_eval(z_split[1])),
                    ]
                )
            elif num_atom_array == 6:
                break
    return geom_list




[docs]
def get_canonical_molecule(smile: str):
    """Ensures that molecule numbering is consistent with creating molecule from canonical
    SMILES for consistency."""
    mol = Chem.MolFromSmiles(smile)
    if mol:
        mol_smi = Chem.MolToSmiles(mol)  # molsmi is canonical SMILES
    else:
        raise ValueError(
            f"""{smile} is not a valid SMILES code or
                         rdkit cannot construct a molecule from it"""
        )
    # create canonical molecule numbering from canonical SMILES
    return Chem.MolFromSmiles(mol_smi)




[docs]
def link_molecules(mol_1: Chem.Mol, mol_2: Chem.Mol, dl_1: int, dl_2: int):
    """Given two mols, each with dummy atoms that have dummyAtomLabels, link the molecules between
    the dummy atoms specified by labels dl_1 and dl_2

    Modified from https://www.oloren.ai/blog/add_rgroup.html
    Written by David Huang, Oloren AI, modified by Kevin Lefrancois-Gagnon

    Args:
        mol_1: Chem.Mol object
        mol_2: Chem.Mol object
        dl_1: the isotope of the dummy atom in mol_1 which will be replaced by mol_2
        dl_2: the isotope of the dummy atom in mol_2 which will be replaced by mol_1

    Returns:
        Chem.Mol object with mol_1 and mol_2 linked where dl_1 and dl_2 were"""
    # pylint:disable=too-many-locals
    # Loop over atoms until there are no wildcard atoms
    # Find wildcard atom if available, otherwise exit
    # We use the isotope here are FragmentOnBonds labels the dummy atoms by changing their isotope
    a = None
    for a_ in mol_1.GetAtoms():
        if a_.GetAtomicNum() == 0 and a_.GetIsotope() == dl_1:
            a = a_
            break
    if not a:
        raise ValueError(
            f"""Input molecule mol_1 does not have atom with dummy label {dl_1}"""
        )
    b = None
    for b_ in mol_2.GetAtoms():
        if b_.GetAtomicNum() == 0 and b_.GetIsotope() == dl_2:
            b = b_
            break
    if not b:
        raise ValueError(
            f"""Input molecule mol_1 does not have atom with dummy label {dl_2}"""
        )
    # Set wildcard atoms to having AtomMapNum 1000 for tracking
    a.SetAtomMapNum(1000)
    b.SetAtomMapNum(1000)
    # Put group and base molecule together and make it editable
    m = Chem.CombineMols(mol_1, mol_2)
    m = Chem.RWMol(m)
    # Find using tracking number the atoms to merge in new molecule
    a1 = None
    a2 = None
    for at in m.GetAtoms():
        if at.GetAtomMapNum() == 1000:
            if a1 is None:
                a1 = at
            else:
                a2 = at
    # Find atoms to bind together based on atoms to merge
    b1 = a1.GetBonds()[0]
    start = (
        b1.GetBeginAtomIdx()
        if b1.GetEndAtomIdx() == a1.GetIdx()
        else b1.GetEndAtomIdx()
    )

    b2 = a2.GetBonds()[0]
    end = (
        b2.GetBeginAtomIdx()
        if b2.GetEndAtomIdx() == a2.GetIdx()
        else b2.GetEndAtomIdx()
    )

    # Add the connection and remove original wildcard atoms
    m.AddBond(start, end, order=Chem.rdchem.BondType.SINGLE)
    m.RemoveAtom(a1.GetIdx())
    m.RemoveAtom(a2.GetIdx())

    return m



# def copy_molecule(molecule):
#     """create a copy of molecule object in new object(not pointer)"""
#     # see link https://sourceforge.net/p/rdkit/mailman/message/33652439/
#     return Chem.Mol(molecule)


def _clean_smile(trim_smi):
    """remove leftover junk from smiles when atom deleted."""
    trim_smi = trim_smi.replace("[*H]", "*")
    trim_smi = trim_smi.replace("[*H3]", "*")
    trim_smi = trim_smi.replace("[*H2]", "*")
    trim_smi = trim_smi.replace("[*H+]", "*")
    trim_smi = trim_smi.replace("[*H3+]", "*")
    trim_smi = trim_smi.replace("[*H2+]", "*")
    trim_smi = trim_smi.replace("[*H-]", "*")
    trim_smi = trim_smi.replace("[*H3-]", "*")
    trim_smi = trim_smi.replace("[*H2-]", "*")
    return trim_smi