Source code for group_decomposition.fragfunctions

"""
fragfunctions module

code used to generate fragments of molecules from SMILES code and analyze them

Main functions to call are:
identify_connected_fragments - takes one molecule SMILES, returns fragments with connections
count_uniques - takes output from above, removes attachments and counts unique fragments
count_groups_in_set - takes list of SMILES and counts unique fragments on set
"""
# pylint:disable=too-many-lines
import math
import os
import re
from collections import Counter

import numpy as np  # for arrays in fragment identification
import pandas as pd  # lots of work with data frames
from rdkit import Chem  # pylint:disable=import-error
from rdkit.Chem import AllChem, PandasTools, rdqueries  # pylint:disable=import-error

from group_decomposition import utils

_num_bonds_broken = 1

_H_BOND_LENGTHS = {
    # from Gaussview default cleaned bond length
    "C": 1.07,
    "O": 0.96,
    "N": 1.00,
    "F": 0.88,
    "Cl": 1.29,
    "B": 1.18,
    "Al": 1.55,
    "Si": 1.47,
    "P": 1.35,
    "S": 1.31,
}


def _generate_fragment_frame(fragment_smiles):
    """Given list of SMILES Generate output frame with SMILES codes and molecules for unique
    fragments."""
    frag_frame = pd.DataFrame(set(fragment_smiles), columns=["Smiles"])
    PandasTools.AddMoleculeColumnToFrame(
        frag_frame, "Smiles", "Molecule", includeFingerprints=True
    )
    frag_frame.drop(frag_frame.index[frag_frame["Smiles"] == "*"].tolist())
    return frag_frame


def _add_xyz_coords(frag_frame):
    """Given frag_frame with molecules, add xyz coordinates form MM94 optimization to it."""
    xyz_block_list = []
    query = rdqueries.AtomNumEqualsQueryAtom(0)
    for mol in frag_frame["Molecule"]:
        h_mol_rw = Chem.RWMol(mol)  # Change type of molecule object
        h_mol_rw = Chem.AddHs(h_mol_rw)  # Add hydrogens
        zero_at = h_mol_rw.GetAtomsMatchingQuery(query)  # Replace placeholder * with At
        for atom in zero_at:
            h_mol_rw.GetAtomWithIdx(atom.GetIdx()).SetAtomicNum(85)
        AllChem.EmbedMolecule(h_mol_rw)
        AllChem.MMFFOptimizeMolecule(h_mol_rw)  # Optimize with MMFF94
        xyz_block_list.append(
            AllChem.rdmolfiles.MolToXYZBlock(h_mol_rw)  # pylint:disable=no-member
        )  # Store xyz coordinates
    frag_frame["xyz"] = xyz_block_list
    return frag_frame


def _add_number_attachements(frag_frame):
    """Add number of attachments column to frag_frame, counting number of *."""
    attach_list = []
    for molecule in frag_frame["Molecule"]:
        attach = 0
        atoms = molecule.GetAtoms()
        for atom in atoms:
            if atom.GetAtomicNum() == 0:
                attach += 1
        attach_list.append(attach)
    frag_frame["numAttachments"] = attach_list
    return frag_frame


def _fragment_molecule(
    mol_list, patt, exld_ring=False, drop_parent=True, recombine_mono=False
):
    """Break molecules into fragments based on fragmenting pattern"""
    global _num_bonds_broken  # pylint:disable=global-statement
    out_mols = []
    pat_mol = Chem.MolFromSmarts(patt)
    # print(mol_list)
    for mol in mol_list:
        if mol.HasSubstructMatch(pat_mol):
            if exld_ring and mol.GetRingInfo().NumRings() > 0:
                continue
            bonds_at_idx = mol.GetSubstructMatches(pat_mol)
            bonds_to_break = [
                mol.GetBondBetweenAtoms(x[0], x[1]).GetIdx() for x in bonds_at_idx
            ]
            labels = [
                [x, x]
                for x in range(_num_bonds_broken, _num_bonds_broken + len(bonds_at_idx))
            ]
            _num_bonds_broken += len(bonds_at_idx)
            frag_mols_comb = Chem.FragmentOnBonds(
                mol, bondIndices=bonds_to_break, dummyLabels=labels
            )
            frag_smis = Chem.MolToSmiles(frag_mols_comb).split(".")
            # print(frag_smis)
            frag_mols = [Chem.MolFromSmiles(x) for x in frag_smis]
            if recombine_mono:
                frag_mols = _recombine_monoatomic(frag_mols)

            if not drop_parent:
                out_mols = out_mols + [mol]
            out_mols = out_mols + frag_mols
        else:
            out_mols = out_mols + [mol]
    return out_mols


def _recombine_monoatomic(frag_mols):
    """Take fragments and recombine"""
    # pylint:disable=too-many-nested-blocks
    for m in frag_mols:
        link = None
        if m.GetNumHeavyAtoms() == 1:
            dummy_atoms = [x for x in m.GetAtoms() if x.GetAtomicNum() == 0]
            dummy_atom_iso = [x.GetIsotope() for x in dummy_atoms]
            dummy_match = []
            matched_iso = []
            for i, dum in enumerate(dummy_atom_iso):
                for mo in frag_mols:
                    if mo != m:
                        for at in mo.GetAtoms():
                            if at.GetAtomicNum() == 0 and at.GetIsotope() == dum:
                                dummy_match.append(mo)
                                matched_iso.append(dum)
            for i, iso in enumerate(matched_iso):
                if i == 0:
                    link = utils.link_molecules(m, dummy_match[i], iso, iso)
                else:
                    link = utils.link_molecules(link, dummy_match[i], iso, iso)
        if link:
            frag_mols.append(link)
            frag_mols.remove(m)
            for mo in dummy_match:
                frag_mols.remove(mo)
    return frag_mols


# def get_num_placeholders(mol):
#     """Find number of * in a molecule"""
#     n_p = 0
#     for atom in mol.GetAtoms():
#         if atom.GetAtomicNum() == 0:
#             n_p += 1
#     return n_p


[docs] def generate_molecule_fragments( mol, patt: str = "[$([C;X4;!R]):1]-[$([R,!$([C;X4]);!#0;!#9;!#17;!#35;!#1]):2]", drop_parent: bool = False, recombine_mono: bool = True, ): r"""Fragment a molecule into constituent groups The molecule is first broken along ring-nonring single bonds, then single bonds to atoms double bonded to ring (e.g. C-N={ring}), then breaking based on the pattern provided. Args: mol: the rdkit molecule object to be fragmented patt: SMARTs string that matches the bonds to be broken after ring-non-ring are separated Defaults to breaking alkyl-non-alkyl bonds. See notes drop_parent: if False, do include the parent structure in the third break. Defaults to breaking identifying alkyl chains. If True, do not include the parents recombine_mono: If True (default), will recombine separated one heavy atom groups with the chains they are broken from in the last step Returns: list[mol]: a list of rdkit molecules generated by fragmenting Note: Bonds broken are labelled by integers. Breaking FragA-FragB-FragC will result in FragA-1 \*:1-FragB-\*:2, \*:2-FragC In this way fragments can be rejoined by recombining matching integers. The integers are added to the molecule as Isotopes In the last step, halides are not separated from the alkyl groups by default, and one-heavy atom groups are rejoined to acyclic portions of the molecule """ global _num_bonds_broken # pylint:disable=global-statement _num_bonds_broken = 1 first_break = _fragment_molecule([mol], patt="[!#0;R:1]-!@[!#0;!#1:2]") second_break = _fragment_molecule( first_break, patt="[$([!#0;!R]=[!#0;R]):1]-[!#0;!#1;!R:2]" ) third_break = _fragment_molecule( second_break, patt, exld_ring=True, drop_parent=drop_parent, recombine_mono=recombine_mono, ) return third_break
[docs] def identify_connected_fragments( inp: str, keep_only_children: bool = True, bb_patt: str = "[$([C;X4;!R]):1]-[$([R,!$([C;X4]);!#0;!#9;!#17;!#35;!#1]):2]", input_type: str = "smile", cml_file: str = "", include_parent: bool = False, aiida: bool = False, ) -> pd.DataFrame: r""" Given Smiles string, identify fragments in the molecule: Break all ring-non-ring atom single bonds. For atoms double bonded to rings, break their single bonds. For non-ring fragments, separate those into alkyl chains and hetero/double bonded atoms. (similar to Ertl functional groups) Args: input: a string containing either a smiles, .xyz, .mol or .cml filename for a given molecule update input_type below to match provided input keep_only_children: boolean, if True, when a group is broken down into its components remove the parent group from output. If False, parent group is retained bb_patt: string of SMARTS pattern for bonds to be broken in side chains and linkers defaults to cleaving sp3 carbon-((ring OR not sp3 carbon) AND not-placeholder/halogen/H) input_type: 'smile' if SMILES code or 'molfile' if .mol file, or 'xyzfile' if .xyz file, or 'cmlfile' if .cml file Note: xyz file REQUIRES cml_file to be provided as well cml_file: defaults to none, can be the cml file corresponding to the input .mol file include_parent: If True, include column in output frame repeating parent molecule intended use for True when merging multiple molecule fragment frames but need to retain a parent molecule object aiida: if True, format output to be able to be used in aiida database. That is, no molecule objects Returns: DataFrame with columms 'Smiles', 'Molecule', 'numAttachments' and 'xyz' Containing, fragment smiles, fragment Chem.Molecule object, number of \* placeholders, and rough xyz coordinates for the fragment is \* were At Note: Each bond breaking, connectivity is maintained through dummy atom labels. e.g. C-N -> C-[1*] N-[1*] - reattaching via the matching labels would reassemble the molecule currently will break apart a functional group if contains a ring-non-ring single bond. e.g. ring N-nonring S=O -> ring N-[1*] nonring S=O-[1*] """ # pylint:disable=too-many-arguments # pylint:disable=too-many-branches if aiida: inp = inp.value keep_only_children = keep_only_children.value bb_patt = bb_patt.value input_type = input_type.value if cml_file: cml_file = cml_file.value include_parent = include_parent.value # ensure smiles is canonical so writing and reading the smiles will result in same number # ordering of atoms if input_type == "smile": mol = utils.get_canonical_molecule(inp) xyz_coords = [] elif input_type in ["cmlfile", "cmldict"]: mol, atomic_symb, xyz_coords, atom_types = utils.mol_from_cml( inp, input_type=input_type ) if mol is None: return None elif input_type == "molfile": # use coordinates in cml file provided if able, else use xyz from mol file mol_dict = utils.mol_from_molfile(inp) mol, atomic_symb = mol_dict["Molecule"], mol_dict["atomic_symbols"] if cml_file: xyz_coords, atom_types, _, _, _ = utils.data_from_cml(cml_file) # atom_types = utils.get_cml_atom_types(cml_file) else: xyz_coords = mol_dict["xyz_pos"] # elif input_type == "xyzfile": # if not cml_file: # raise ValueError("No cml file provided, expected one for xyz input type") # mol_dict = utils.mol_from_xyzfile(xyz_file=inp, cml_file=cml_file) # if mol_dict is None: # return None # mol, atomic_symb, xyz_coords = ( # mol_dict["Molecule"], # mol_dict["atomic_symbols"], # mol_dict["xyz_pos"], # ) # atom_types = utils.get_cml_atom_types(cml_file) else: raise ValueError( f"""{input_type} should either be molfile, xyzfile, cmlfile, or a smile string""" ) mol_frags = generate_molecule_fragments( mol, patt=bb_patt, drop_parent=keep_only_children ) # initialize the output data frame frag_frame = _generate_fragment_frame([Chem.MolToSmiles(x) for x in mol_frags]) # add hydrogens and xyz coordinates resulting from MMFF94 opt, changing placeholders to At # frag_frame = _add_xyz_coords(frag_frame) # count number of placeholders in each fragment - it is the number of places it is attached frag_frame = _add_number_attachements(frag_frame) if input_type in [ "molfile", "xyzfile", "cmlfile", "cmldict", ]: # clear map labels and add xyz coordinates that are available frag_frame = _add_frag_comp(frag_frame, mol) frag_frame["Smiles"] = frag_frame["Smiles"].map( _clear_map_number ) # (lambda x: _clear_map_number(x)) frag_frame["xyz"] = frag_frame["Atoms"].map( lambda x: _add_rtr_xyz(x, xyz_coords) ) frag_frame["Labels"] = frag_frame["Atoms"].map( lambda x: _add_rtr_label(x, atomic_symb) ) frag_frame["Molecule"] = frag_frame["Molecule"].map( lambda x: _clear_map_number(x, "mol") ) if cml_file or input_type == "cmlfile" or input_type == "cmldict": frag_frame["atom_types"] = frag_frame["Atoms"].map( lambda x: _add_rtr_type(x, atom_types) ) if include_parent and not aiida: mol = _clear_map_number(mol, "mol") frag_frame["Parent"] = [mol] * len(frag_frame.index) if aiida: frag_frame = frag_frame.drop("Molecule", axis=1) return frag_frame
def _add_rtr_label(at_num_list, atomic_symb): out_list = [] for atom in at_num_list: out_list.append(atomic_symb[atom - 1]) return out_list def _add_rtr_type(at_num_list, atom_types): out_list = [] for atom in at_num_list: out_list.append(atom_types[atom - 1]) return out_list def _add_rtr_xyz(at_num_list, xyz_coords): """Construct string of atomSymbol x y z format, for use to map to frag_frame""" out_list = [] for atom in at_num_list: out_list.append(xyz_coords[atom - 1]) return out_list def _clear_map_number(mol_input, ret_type="smi"): """Given str or Chem.Molecule input, remove atomMapnumbers from atoms and return smiles (ret_type='smi') or molecule object (ret_type = 'mol')""" if isinstance(mol_input, str): mol = Chem.MolFromSmiles(mol_input) else: mol = mol_input if not mol: raise ValueError( f"""Could not construct mol from {mol_input} in output frame""" ) for atom in mol.GetAtoms(): atom.ClearProp("molAtomMapNumber") if ret_type == "smi": return Chem.MolToSmiles(mol) if ret_type == "mol": return mol raise ValueError(f"""Invalid ret_type, expected smi or mol, got {ret_type}""") def _get_atlabels_in_frag(molecule): """For a given molecule, extract list of atom map number for all non-H atoms""" # H's not included because the molecule object typically won't have explicit H out_list = [] for atom in molecule.GetAtoms(): if atom.GetAtomicNum() != 0: out_list.append(int(atom.GetProp("molAtomMapNumber"))) return out_list def _add_frag_comp(frag_frame, mol): """Given frag_frame and mol, add Atoms col to frag_frame with indices of atoms starting at 1""" # create list of lists of indices of atoms in each fragment frag_atoms = [] for frag_mol in frag_frame["Molecule"]: frag_atoms.append(_get_atlabels_in_frag(frag_mol)) # iterate over atoms, adding in hydrogens to the fragment since the above won't include H for atom in mol.GetAtoms(): if atom.GetAtomicNum() == 1: neigh_idx = atom.GetNeighbors()[0].GetIdx() + 1 for i, frag in enumerate(frag_atoms): if neigh_idx in frag: frag_atoms[i].append(atom.GetIdx() + 1) break # only need to get here once - H has only one bond frag_frame["Atoms"] = frag_atoms return frag_frame
[docs] def count_uniques( frag_frame: pd.DataFrame, drop_attachments: bool = False, uni_smi_type: bool = False ) -> pd.DataFrame: r"""Identify unique fragments in a frame and count the number of times they occur Given frag_frame output from :attr:`group_decomposition.fragfunctions.identify_connected_fragments`, remove dummy atom labels(and placeholders entirely if drop_attachments=True), then count unique fragments using SMILES to identify unique fragments Should also work on frames from :attr:`group_decomposition.fragfunctions.merge_uniques` or :attr:`group_decomposition.fragfunctions.count_groups_in_set` as well Args: frag_frame: frame resulting from :attr:`group_decomposition.fragfunctions.identify_connected_fragments` typically, or any similar frame with a list of SMILES codes in column ['Smiles'] drop_attachments: boolean, if False, retains placeholder \* at points of attachment, if True, removes \* for fragments with more than one atom uni_smi_type: include atom types in determination of unique fragments. If false, only determine unique by SMILES Returns: pandas data frame with columns 'Smiles', 'count' and 'Molecule', containing the Smiles string, the number of times the Smiles was in frag_frame, and rdkit.Chem.Molecule object Note: if drop_attachments=False, similar fragments with different number/positions of attachments will not count as being the same. e.g. ortho-attached aromatics would not match with meta or para attached aromatics If you've ran this previously with uni_smi_type=True, running on the output frame (or other frame derived from such frame) with uni_smi_type=False will collapse the output uniques determined by SMILE only """ # Change column indices to dict # pylint:disable=too-many-locals # pylint:disable=too-many-branches # pylint:disable=too-many-statements col_names = list(frag_frame.columns) # if frag_frame already has xyz coordinates keep those and don't use others # typically this will be if a mol and/or cml file was provided in frag_frame construction if "xyz" in col_names: xyz_inc = True xyz_list = frag_frame["xyz"] else: xyz_inc = False smile_list = frag_frame["Smiles"] if "Atoms" in col_names: atom_list = frag_frame["Atoms"] atoms_inc = True else: atoms_inc = False if "Labels" in col_names: labels_list = frag_frame["Labels"] labels_inc = True else: labels_inc = False if "Parent" in col_names: parent_list = frag_frame["Parent"] parent_inc = True else: parent_inc = False if "atom_types" in col_names: type_list = frag_frame["atom_types"] type_inc = True else: type_inc = False if "count" in col_names: count_list = frag_frame["count"] count_inc = True else: count_inc = False # print(type_list) no_connect_smile = [] # Clean smiles - either removing placeholder entirely(drop_attachments True) # Or just removing the dummyAtomLabel (drop_attachments False) for smile in smile_list: if drop_attachments: no_connect_smile.append(_drop_smi_attach(smile)) else: t_mol = Chem.MolFromSmiles(re.sub(r"\[[0-9]+\*\]", "*", smile)) no_connect_smile.append(Chem.MolToSmiles(t_mol)) # identify unique smiles and count number of times they occur # initialize lists to be used making frame unique_smiles = [] unique_xyz = [] unique_atoms = [] unique_labels = [] unique_parents = [] unique_smiles_counts = [] unique_types = [] counter_list = [] # Identify unique smiles, counting every occurence and adding xyz if included if uni_smi_type: ty_tup = list(map(tuple, frag_frame["atom_types"])) smi_ty = list(zip(no_connect_smile, ty_tup)) for i, val in enumerate(smi_ty): smile = val[0] at_tys = val[1] if smile not in unique_smiles: unique_smiles.append(smile) if xyz_inc: unique_xyz.append(xyz_list[i]) if atoms_inc: unique_atoms.append(atom_list[i]) if labels_inc: unique_labels.append(labels_list[i]) if parent_inc: unique_parents.append(parent_list[i]) if type_inc: unique_types.append(type_list[i]) counter_list.append(Counter(at_tys)) if count_inc: unique_smiles_counts.append(count_list[i]) else: unique_smiles_counts.append(1) else: at_ty_count = Counter(at_tys) if at_ty_count in counter_list: smi_ix = counter_list.index(at_ty_count) if count_inc: unique_smiles_counts[smi_ix] += count_list[i] else: unique_smiles_counts[smi_ix] += 1 else: unique_smiles.append(smile) if xyz_inc: unique_xyz.append(xyz_list[i]) if atoms_inc: unique_atoms.append(atom_list[i]) if labels_inc: unique_labels.append(labels_list[i]) if parent_inc: unique_parents.append(parent_list[i]) # if type_inc: # unique_types.append(labels_list[i]) counter_list.append(Counter(at_tys)) unique_types.append(type_list[i]) if count_inc: unique_smiles_counts.append(count_list[i]) else: unique_smiles_counts.append(1) else: for i, smile in enumerate(no_connect_smile): if smile not in unique_smiles: unique_smiles.append(smile) if xyz_inc: unique_xyz.append(xyz_list[i]) if atoms_inc: unique_atoms.append(atom_list[i]) if labels_inc: unique_labels.append(labels_list[i]) if parent_inc: unique_parents.append(parent_list[i]) if type_inc: unique_types.append(type_list[i]) if count_inc: unique_smiles_counts.append(count_list[i]) else: unique_smiles_counts.append(1) else: smi_ix = unique_smiles.index(smile) if count_inc: unique_smiles_counts[smi_ix] += count_list[i] else: unique_smiles_counts[smi_ix] += 1 # create output frame un_frame = _construct_unique_frame( uni_smi=unique_smiles, uni_smi_count=unique_smiles_counts, xyz=unique_xyz, atoms=unique_atoms, parents=unique_parents, labels=unique_labels, at_types=unique_types, ) # if atoms_inc: # un_frame['Atoms'] = unique_atoms # if labels_inc: # un_frame['Labels'] = unique_labels # if parent_inc: # un_frame['Parent'] = unique_parents # if type_inc: # un_frame['atom_types'] = unique_types return un_frame
def _find_neigh_notin_frag(mol, at_list): # for atoms with one attachment point out_nbr = 0 for idx in at_list: atom = mol.GetAtomWithIdx(idx - 1) nbrs = atom.GetNeighbors() for nbr in nbrs: nbr_idx = nbr.GetIdx() + 1 if nbr_idx not in at_list: out_nbr = nbr_idx out_at = atom.GetIdx() + 1 break if out_nbr: break return [out_at, out_nbr] def _find_neigh_xyz(frag_frame, neigh_idx): """Given frag_frame and index of neighbor atom, find its xyz coordinates""" # column index atoms_idx = list(frag_frame.columns).index("Atoms") # determine which row contains the neighbor atom neigh_bool = list(frag_frame.apply(lambda row: neigh_idx in row[atoms_idx], axis=1)) # print(neigh_bool) neigh_row = neigh_bool.index(True) list_xyz = frag_frame["xyz"][neigh_row] # find where in list neighbor atom is, return xyz at that index neigh_list_idx = frag_frame["Atoms"][neigh_row].index(neigh_idx) neigh_xyz = list_xyz[neigh_list_idx] return neigh_xyz def _move_along_bond(at_xyz, neigh_xyz, at_symb): """Returns xyz coordinates of neigh_xyz moved along bond to H bond length""" at_np = np.array(at_xyz) nb_np = np.array(neigh_xyz) # vector for bond to move along bond = at_np - nb_np # bond length to find target_length = _H_BOND_LENGTHS[at_symb] steps = np.linspace(0.0, 1.0, 100) # step along bond starting at neigh_xyz in direction of at_xyz. # stop when bond length is target for s in steps: new_xyz = neigh_xyz + s * bond if abs(_get_dist(new_xyz, at_np) - target_length) < 0.01: end_xyz = new_xyz # return target point return end_xyz def _get_dist(point_a, point_b): """Return distance btw two points""" return math.sqrt( (point_a[0] - point_b[0]) ** 2 + (point_a[1] - point_b[1]) ** 2 + (point_a[2] - point_b[2]) ** 2 ) def _find_H_xyz(mol, at_list, xyz_list, frag_frame): """Finds xyz of hydrogen atom that would be connected to a fragment Args: mol: Chem.Mol object at_list: list[int] - list of atoms in the fragment Note: these start at 1, but those in molecule start at 0. To convert add/subtract 1 xyz_list: list of xyz_coordinates of atoms in fragment frag_frame: full parent frag_frame WITHOUT filtering by number of attachments Returns: list[float]: xyz coordinates where H should be placed """ # find attached atom index and neighbor index in molecule at_idx, neigh_idx = _find_neigh_notin_frag(mol, at_list) # get attached atom symbol symb = mol.GetAtomWithIdx(at_idx - 1).GetSymbol() # index of the attached atom in at_list, which is the same as xyz_list list_idx = at_list.index(at_idx) # attached atom xyz at_xyz = xyz_list[list_idx] # neighbor atom xyz neigh_xyz = _find_neigh_xyz(frag_frame, neigh_idx) # move neighbor atom xyz along bond to H bond length (Gaussview defaults) h_xyz = _move_along_bond(at_xyz, neigh_xyz, symb) return h_xyz def _clean_molecule_name(smile): """Removes symbols in smile code unfit for file name""" smile = smile.replace("-", "Neg") smile = smile.replace("[", "-") smile = smile.replace("]", "-") smile = smile.replace("(", "-") smile = smile.replace(")", "-") smile = smile.replace("#", "t") smile = smile.replace("=", "d") smile = smile.replace("+", "Pos") smile = smile.replace("*", "Att") smile = smile.replace("@", "") return smile # def generate_fragment_structures( # inp: str, # keep_only_children: bool = True, # bb_patt: str = "[$([C;X4;!R]):1]-[$([R,!$([C;X4]);!#0;!#9;!#17;!#35;!#1]):2]", # input_type="smile", # cml_file="", # include_parent=True, # ): # """generate fragments and return a dictionary of fragments""" # # pylint:disable=too-many-arguments # frag_frame = identify_connected_fragments( # inp=inp, # keep_only_children=keep_only_children, # bb_patt=bb_patt, # input_type=input_type, # cml_file=cml_file, # include_parent=include_parent, # ) # mol = frag_frame.at[0, "Parent"] # frag_dict = output_ifc_dict(mol, frag_frame) # return frag_dict # show # def output_cgis_dicts(cgis_frame): # """generate fragments and return a dictionary of fragments""" # # should have parent col # on_at_frame = pd.DataFrame(cgis_frame[cgis_frame["numAttachments"] == 1]) # col_names = list(on_at_frame.columns) # xyz_idx = col_names.index("xyz") # atoms_idx = col_names.index("Atoms") # parent_idx = col_names.index("Parent") # on_at_frame["H_xyz"] = on_at_frame.apply( # lambda row: _find_H_xyz( # row[parent_idx], row[atoms_idx], row[xyz_idx], cgis_frame # ), # axis=1, # ) # on_at_frame["at_idx"] = on_at_frame.apply( # lambda row: _find_at_idx(row[parent_idx], row[atoms_idx]), axis=1 # ) # out_dict = { # re.sub( # r"\[[0-9]+\*\]", "*", on_at_frame.at[i, "Smiles"] # ): _write_frag_structure( # frag_mol=on_at_frame.at[i, "Molecule"], # xyz_list=on_at_frame.at[i, "xyz"], # symb_list=on_at_frame.at[i, "Labels"], # h_xyz=on_at_frame.at[i, "H_xyz"], # at_idx=on_at_frame.at[i, "at_idx"], # atom_types=frag_frame.at[i, "atom_types"], # ) # for i in range(0, nrow) # } # return out_dict
[docs] def output_ifc_dict(mol, frag_frame: pd.DataFrame, done_smi: list[str]): """generate a dictionary containing identify_connected_fragment information Only new fragments are included. Previously parsed fragments are listed in done_smi. Args: mol: rdkit molecule object that was fragmented frag_frame: identify_connected_fragments frame generated for mol done_smi: list of fragments which have been identified already Returns: list containing dict for fragments and done_smi lengthened by the number of fragments done Note: mainly for use in generating information for unique fragments in an AiiDA workflow """ on_at_frame = pd.DataFrame(frag_frame[frag_frame["numAttachments"] == 1]) if on_at_frame.empty: return None, done_smi col_names = list(on_at_frame.columns) # Find indices of relevant columns xyz_idx = col_names.index("xyz") atoms_idx = col_names.index("Atoms") # Find H xyz position and index of atom bonded to H on_at_frame["H_xyz"] = on_at_frame.apply( lambda row: _find_H_xyz(mol, row[atoms_idx], row[xyz_idx], frag_frame), axis=1 ) on_at_frame["at_idx"] = on_at_frame.apply( lambda row: _find_at_idx(mol, row[atoms_idx]), axis=1 ) nrow = on_at_frame.shape[0] on_at_frame = on_at_frame.reset_index(drop=True) # print(on_at_frame) # print(_clean_molecule_name(on_at_frame.at[0,'Smiles'])) # print(on_at_frame.at[0, "Molecule"]) # print(on_at_frame.at[0, "Labels"]) # print(on_at_frame.at[0, "xyz"]) # print(on_at_frame.at[0, "H_xyz"]) # print(on_at_frame.at[0, "at_idx"]) # print(on_at_frame.at[0, "atom_types"]) out_dict = {} for i in range(0, nrow): smi = on_at_frame.at[i, "Smiles"] if smi not in done_smi: done_smi.append(smi) key = re.sub(r"\[[0-9]+\*\]", "*", smi) out_dict[key] = _write_frag_structure( frag_mol=on_at_frame.at[i, "Molecule"], xyz_list=on_at_frame.at[i, "xyz"], symb_list=on_at_frame.at[i, "Labels"], h_xyz=on_at_frame.at[i, "H_xyz"], at_idx=on_at_frame.at[i, "at_idx"], atom_types=frag_frame.at[i, "atom_types"], ) return [out_dict, done_smi]
def _write_frag_structure(frag_mol, xyz_list, symb_list, h_xyz, at_idx, atom_types): """get output structure""" # pylint:disable=too-many-arguments # print("in function") num_atoms = len(symb_list) # smile = re.sub('\[[0-9]+\*\]', '*', Chem.MolToSmiles(frag_mol,canonical=False)) charge = Chem.GetFormalCharge(frag_mol) out_xyz = [xyz_list[at_idx], h_xyz] for i in range(num_atoms): if i != at_idx: out_xyz.append(xyz_list[i]) # geom_frame = pd.DataFrame(out_xyz,columns=['x','y','z']) symb_list.insert(1, "H") return { "geom": out_xyz, "charge": charge, "atom_types": atom_types, "atom_symbols": symb_list, }
[docs] def output_ifc_gjf( mol, frag_frame, esm="wb97xd", basis_set="aug-cc-pvtz", wfx=True, n_procs=4, mem="3200MB", multiplicity=1, ): """Takes a fragmented molecule and outputs gjf files of the fragments with one attachment point. Hydrogen is added in place of the connection to the rest of the molecule for the fragment Args: mol: Chem.Mol object for which fragmentation was performed frag_frame: output from either count_uniques or identify_connected_fragments esm: str, electronic structure method to include in gjf basis_set: str, basis set to include in gjf wfx: Boolean, if True add output=wfx to gjf file n_procs: int, >=0. if >0, add number of processors to be used to gjf mem: str, format "nMB" or "nGB", memory to be used in gjf multiplicity: int, defaults to 1. Multiplicity of molecule Returns: Creates gjf files in working directory for each fragment in frag_frame Note: H position is set by taking the atom the fragment is bonded two, replacing it with H and moving that closer to the C until it reaches the default distance Default distances taken from Gaussview "clean" C-H, C-O, etc bond lengths """ on_at_frame = pd.DataFrame(frag_frame[frag_frame["numAttachments"] == 1]) col_names = list(on_at_frame.columns) # Find indices of relevant columns xyz_idx = col_names.index("xyz") atoms_idx = col_names.index("Atoms") labels_idx = col_names.index("Labels") mol_idx = col_names.index("Molecule") # Find H xyz position and index of atom bonded to H on_at_frame["H_xyz"] = on_at_frame.apply( lambda row: _find_H_xyz(mol, row[atoms_idx], row[xyz_idx], frag_frame), axis=1 ) on_at_frame["at_idx"] = on_at_frame.apply( lambda row: _find_at_idx(mol, row[atoms_idx]), axis=1 ) # hxyz = len(col_names) # atidx = len(col_names) + 1 # print(on_at_frame) # write gjfs on_at_frame.apply( lambda row: _write_frag_gjf( frag_mol=row[mol_idx], xyz_list=row[xyz_idx], symb_list=row[labels_idx], h_xyz=row[len(col_names)], at_idx=row[len(col_names) + 1], esm=esm, basis_set=basis_set, wfx=wfx, n_procs=n_procs, mem=mem, multiplicity=multiplicity, ), axis=1, )
def _clean_basis(basis_set): basis_set = basis_set.replace("(", "") basis_set = basis_set.replace(")", "") basis_set = basis_set.replace(",", "") basis_set = basis_set.replace("+", "p") return basis_set def _write_frag_gjf( frag_mol, xyz_list, symb_list, h_xyz, at_idx, esm="wb97xd", basis_set="aug-cc-pvtz", wfx=True, n_procs=4, mem="3200MB", multiplicity=1, ): """Write the gjf file for a fragment. Args: frag_mol: Chem.Mol object xyz_list: list of xyz_coords of list symb_list: list of atomic symbols of fragment h_xyz: xyz coords of hydrogen attached to fragment at_idx: index of attached atom in fragment esm: str, electronic structure method to include in gjf basis_set: str, basis set to include in gjf wfx: Boolean, if True add output=wfx to gjf file n_procs: int, >=0. if >0, add number of processors to be used to gjf mem: str, format "nMB" or "nGB", memory to be used in gjf multiplicity: int, defaults to 1. Multiplicity of molecule Default template: %chk={filename}.chk %nprocs=4 %mem=3200MB # esm/basis_set opt freq output=wfx smile charge mult {xyz} {filename}.wfx """ # pylint:disable=too-many-locals num_atoms = len(symb_list) smile = re.sub(r"\[[0-9]+\*\]", "*", Chem.MolToSmiles(frag_mol, canonical=False)) charge = Chem.GetFormalCharge(frag_mol) molecule_name = _clean_molecule_name(smile) # print(at_idx) # print(xyz_list[at_idx]) # print(h_xyz) # build xyz of molecule out_xyz = [xyz_list[at_idx], h_xyz] for i in range(num_atoms): if i != at_idx: out_xyz.append(xyz_list[i]) geom_frame = pd.DataFrame(out_xyz, columns=["x", "y", "z"]) symb_list.insert(1, "H") geom_frame["Atom"] = symb_list geom_frame = geom_frame[["Atom", "x", "y", "z"]] # create file name clean_basis = _clean_basis(basis_set) new_file_name = "SubH" + "_" + molecule_name + "_" + esm + "_" + clean_basis if os.path.exists(new_file_name + ".gjf"): # print('deleting') os.remove(new_file_name + ".gjf") # write file with open(new_file_name + ".gjf", "a", encoding="utf-8") as f: f.write(f"%chk={new_file_name}" + ".chk\n") if n_procs: f.write(f"%nprocs={n_procs}\n") if mem: f.write(f"%mem={mem}\n") if wfx: f.write(f"#p {esm}/{basis_set} opt freq output=wfx\n") else: f.write(f"#p {esm}/{basis_set} opt freq\n") f.write("\n") f.write(f"{smile}") f.write("\n\n") f.write(f"{charge} {multiplicity}\n") dfAsString = geom_frame.to_string(header=False, index=False) f.write(dfAsString) f.write("\n\n") if wfx: f.write(new_file_name + ".wfx\n\n\n") else: f.write("\n\n\n") def _find_at_idx(mol, at_list): """Returns index of atom connected to the remainder of the molecule in the list of fragment atom numbers Args: mol: Chem.Mol object at_list: list[int] of atoms in molecule belonging to fragment being studied Note: these indices start at 1, while those in the molecule start at 0 Returns: int of index of connected atom in at_list. See examples for worked explanation Example: a fragment is defined by atoms [1,3,5,6,8] Atom 3 is where the remainder of the molecule attaches to the fragment The function returns the index of 3 in the atom list. In this case, it would return 1 """ # find at_idx = _find_neigh_notin_frag(mol, at_list)[0] list_idx = at_list.index(at_idx) return list_idx def _construct_unique_frame( uni_smi: list[str], uni_smi_count: list[int], xyz=[], atoms=[], parents=[], labels=[], at_types=[], ) -> pd.DataFrame: """given smiles, counts and (optional) xyz coordinates, create frame""" # pylint:disable=dangerous-default-value uniquefrag_frame = pd.DataFrame(uni_smi, columns=["Smiles"]) if xyz: uniquefrag_frame["xyz"] = xyz if atoms: uniquefrag_frame["Atoms"] = atoms if parents: uniquefrag_frame["Parent"] = parents if labels: uniquefrag_frame["Labels"] = labels if at_types: uniquefrag_frame["atom_types"] = at_types PandasTools.AddMoleculeColumnToFrame( uniquefrag_frame, "Smiles", "Molecule", includeFingerprints=True ) uniquefrag_frame["count"] = uni_smi_count # if we don't have xyz already add them, from MMFF94 opt if not xyz: uniquefrag_frame = _add_xyz_coords(uniquefrag_frame) # count number placeholders uniquefrag_frame = _add_number_attachements(uniquefrag_frame) return uniquefrag_frame def _drop_smi_attach(smile: str): """completely remove placeholder if number of non-placeholder in smiles is > 1""" mol = Chem.MolFromSmiles(smile) non_zero_atoms = 0 for atom in mol.GetAtoms(): if atom.GetAtomicNum() != 0: non_zero_atoms += 1 if non_zero_atoms > 1: temp = re.sub(r"\[[0-9]+\*\]", "", smile) t_mol = Chem.MolFromSmiles(re.sub(r"\(\)", "", temp)) if t_mol is None: t_mol = Chem.MolFromSmiles(re.sub(r"\[[0-9]+\*\]", "*", smile)) out_smi = Chem.MolToSmiles(t_mol) # Warning('Could not construct {smile} without attachments'.format(smile=smile)) else: out_smi = Chem.MolToSmiles(t_mol) else: out_smi = Chem.MolToSmiles( Chem.MolFromSmiles(re.sub(r"\[[0-9]+\*\]", "*", smile)) ) return out_smi
[docs] def merge_uniques( frame1: pd.DataFrame, frame2: pd.DataFrame, uni_smi_ty=True ) -> pd.DataFrame: """Given two frames of unique fragments, identify shared unique fragments, merge count and frames together. Args: frame1: a frame output from count_uniques frame2: a distinct frame also from count_uniques uni_smi_ty: If True, include atom types in determination of unique fragments. If false, only determine unique by SMILES Returns: a frame resulting from the merge of frame1 and frame2. All rows that have Smiles that are in frame1 but not frame2(and vice versa) are included unmodified If a row's SMILES is in both frame1 and frame2, modify the row to update the count of that fragment as sum of frame1 and frame2, then include one row. Note: for best results, SMILES must be canonical so that they can be exactly compared. Smiles in frame should be resulting from Chem.MolToSmiles(Chem.MolFromSmiles(smile)) - this will create a molecule from the smile, and write the smile back, in canonical form Example usage: >>> frame1 Smiles count C 2 C1CCC1 1 >>> frame2 Smiles count C 3 C1CC1 2 >>> merge_uniques(frame1,frame2) Smiles count C 5 C1CCC1 1 C1CC1 2 """ if frame1.empty: merge_frame = frame2 elif frame2.empty: merge_frame = frame1 else: rows_to_drop = _find_rows_to_drop(frame1, frame2, uni_smi_ty) merge_frame = rows_to_drop["merge_frame"] drop_frame_1 = frame1.drop(rows_to_drop["drop_rows_1"]) drop_frame_2 = frame2.drop(rows_to_drop["drop_rows_2"]) merge_frame = pd.concat([drop_frame_1, drop_frame_2, merge_frame]) merge_frame.reset_index(inplace=True, drop=True) # _add_frame(drop_frame_1,merge_frame) # merge_frame = _add_frame(drop_frame_2,merge_frame) return merge_frame
def _find_rows_to_drop(frame_a: pd.DataFrame, frame_b: pd.DataFrame, uni_smi_ty=True): """for two frames, find rows with same smile, store what rows to drop in each""" rows_to_drop_one = [] rows_to_drop_two = [] col_names = list(frame_a.columns) merge_frame = pd.DataFrame(columns=col_names) frame_a_idx = list(frame_a.index) frame_b_idx = list(frame_b.index) for i, smi in enumerate(frame_a["Smiles"]): if smi in list(frame_b["Smiles"]): # print(f'i is {i}') if uni_smi_ty: smi_idx = [ x for x in range(0, frame_b.shape[0]) if smi == list(frame_b["Smiles"])[x] ] ctr = [list(map(Counter, frame_b["atom_types"]))[x] for x in smi_idx] if list(map(Counter, frame_a["atom_types"]))[i] not in ctr: continue j = smi_idx[ctr.index(list(map(Counter, frame_a["atom_types"]))[i])] else: j = list(frame_b["Smiles"]).index(smi) # print(f'j is {j}') # print(frame_a.shape) cum_count = ( frame_a.at[frame_a_idx[i], "count"] + frame_b.at[frame_b_idx[j], "count"] ) # print('cum_count is {cum_count}') merge_frame = pd.concat( [ merge_frame, pd.DataFrame( [list(frame_a.iloc[frame_a_idx[i]])], columns=col_names ), ] ) merge_frame.reset_index(inplace=True, drop=True) # print(f'merge frame before updating cumcount {merge_frame}') # print(merge_frame.shape[0]-1) merge_frame.at[merge_frame.shape[0] - 1, "count"] = cum_count # print(f'merge_frame after updating cumcount {merge_frame}') # if made_frame == 0: # merge_frame = pd.DataFrame.from_dict({'Smiles':[smi],'count':[cum_count]}) # made_frame=1 # else: # merge_frame.loc[len(merge_frame)] = [smi, cum_count] rows_to_drop_one.append(i) rows_to_drop_two.append(j) # print(merge_frame) return { "drop_rows_1": rows_to_drop_one, "drop_rows_2": rows_to_drop_two, "merge_frame": merge_frame, }
[docs] def count_groups_in_set( list_of_inputs: list[str], drop_attachments: bool = False, input_type: str = "smile", bb_patt: str = "[$([C;X4;!R]):1]-[$([R,!$([C;X4]);!#0;!#9;!#17;!#35;!#1]):2]", cml_list=None, uni_smi_ty: bool = True, aiida: bool = False, ) -> pd.DataFrame: """Identify unique fragments in molecules defined in the list_of_smiles, and count the number of occurences for duplicates. Args: list_of_smiles: A list, with each element being a SMILES string, e.g. ['CC','C1CCCC1'] drop_attachments: Boolean for whether or not to drop attachment points from fragments if True, will remove all placeholder atoms indicating connectivity if False, placeholder atoms will remain input_type: smile, xyzfile, cmlfile or molfile, based on elements of lists_of_inputs cml_list: defaults empty, but can be a list of cml files corresponding to the molfile inputs bb_patt: SMARTS pattern for bonds to break in linkers and side chains. Defaults to breaking bonds between nonring carbons with four bonds single bonded to ring atoms or carbons that don't have four bonds, and are not H, halide, or placeholder uni_smi_type: if True, include atom types in determination of unique fragments. If false, only determine unique by SMILES Returns: an output pd.DataFrame, with columns 'Smiles' for fragment Smiles, 'count' for number of times each fragment occurs in the list, and 'Molecule' holding a rdkit.Chem.Molecule object Example usage: >>> count_groups_in_set(['c1ccc(c(c1)c2ccc(o2)C(=O)N3C[C@H](C4(C3)CC[NH2+]CC4)C(=O)NCCOCCO)F', 'Cc1nc2ccc(cc2s1)NC(=O)c3cc(ccc3N4CCCC4)S(=O)(=O)N5CCOCC5'],drop_attachments=False) """ out_frame = pd.DataFrame() for i, inp in enumerate(list_of_inputs): # print(inp) if cml_list: frame = identify_connected_fragments( inp, bb_patt=bb_patt, input_type=input_type, cml_file=cml_list[i], include_parent=True, ) else: frame = identify_connected_fragments( inp, bb_patt=bb_patt, input_type=input_type, include_parent=True ) if frame is not None: unique_frame = count_uniques(frame, drop_attachments, uni_smi_type=True) if out_frame.empty: out_frame = unique_frame else: out_frame = merge_uniques(out_frame, unique_frame, uni_smi_ty) out_frame = out_frame.drop("Molecule", axis=1) if not aiida: PandasTools.AddMoleculeColumnToFrame( out_frame, "Smiles", "Molecule", includeFingerprints=True ) # out_frame = _add_xyz_coords(out_frame) # out_frame = _add_number_attachements(out_frame) return out_frame