"""
fragfunctions module
code used to generate fragments of molecules from SMILES code and analyze them
Main functions to call are:
identify_connected_fragments - takes one molecule SMILES, returns fragments with connections
count_uniques - takes output from above, removes attachments and counts unique fragments
count_groups_in_set - takes list of SMILES and counts unique fragments on set
"""
# pylint:disable=too-many-lines
import math
import os
import re
from collections import Counter
import numpy as np # for arrays in fragment identification
import pandas as pd # lots of work with data frames
from rdkit import Chem # pylint:disable=import-error
from rdkit.Chem import AllChem, PandasTools, rdqueries # pylint:disable=import-error
from group_decomposition import utils
_num_bonds_broken = 1
_H_BOND_LENGTHS = {
# from Gaussview default cleaned bond length
"C": 1.07,
"O": 0.96,
"N": 1.00,
"F": 0.88,
"Cl": 1.29,
"B": 1.18,
"Al": 1.55,
"Si": 1.47,
"P": 1.35,
"S": 1.31,
}
def _generate_fragment_frame(fragment_smiles):
"""Given list of SMILES Generate output frame with SMILES codes and molecules for unique
fragments."""
frag_frame = pd.DataFrame(set(fragment_smiles), columns=["Smiles"])
PandasTools.AddMoleculeColumnToFrame(
frag_frame, "Smiles", "Molecule", includeFingerprints=True
)
frag_frame.drop(frag_frame.index[frag_frame["Smiles"] == "*"].tolist())
return frag_frame
def _add_xyz_coords(frag_frame):
"""Given frag_frame with molecules, add xyz coordinates form MM94 optimization to it."""
xyz_block_list = []
query = rdqueries.AtomNumEqualsQueryAtom(0)
for mol in frag_frame["Molecule"]:
h_mol_rw = Chem.RWMol(mol) # Change type of molecule object
h_mol_rw = Chem.AddHs(h_mol_rw) # Add hydrogens
zero_at = h_mol_rw.GetAtomsMatchingQuery(query) # Replace placeholder * with At
for atom in zero_at:
h_mol_rw.GetAtomWithIdx(atom.GetIdx()).SetAtomicNum(85)
AllChem.EmbedMolecule(h_mol_rw)
AllChem.MMFFOptimizeMolecule(h_mol_rw) # Optimize with MMFF94
xyz_block_list.append(
AllChem.rdmolfiles.MolToXYZBlock(h_mol_rw) # pylint:disable=no-member
) # Store xyz coordinates
frag_frame["xyz"] = xyz_block_list
return frag_frame
def _add_number_attachements(frag_frame):
"""Add number of attachments column to frag_frame, counting number of *."""
attach_list = []
for molecule in frag_frame["Molecule"]:
attach = 0
atoms = molecule.GetAtoms()
for atom in atoms:
if atom.GetAtomicNum() == 0:
attach += 1
attach_list.append(attach)
frag_frame["numAttachments"] = attach_list
return frag_frame
def _fragment_molecule(
mol_list, patt, exld_ring=False, drop_parent=True, recombine_mono=False
):
"""Break molecules into fragments based on fragmenting pattern"""
global _num_bonds_broken # pylint:disable=global-statement
out_mols = []
pat_mol = Chem.MolFromSmarts(patt)
# print(mol_list)
for mol in mol_list:
if mol.HasSubstructMatch(pat_mol):
if exld_ring and mol.GetRingInfo().NumRings() > 0:
continue
bonds_at_idx = mol.GetSubstructMatches(pat_mol)
bonds_to_break = [
mol.GetBondBetweenAtoms(x[0], x[1]).GetIdx() for x in bonds_at_idx
]
labels = [
[x, x]
for x in range(_num_bonds_broken, _num_bonds_broken + len(bonds_at_idx))
]
_num_bonds_broken += len(bonds_at_idx)
frag_mols_comb = Chem.FragmentOnBonds(
mol, bondIndices=bonds_to_break, dummyLabels=labels
)
frag_smis = Chem.MolToSmiles(frag_mols_comb).split(".")
# print(frag_smis)
frag_mols = [Chem.MolFromSmiles(x) for x in frag_smis]
if recombine_mono:
frag_mols = _recombine_monoatomic(frag_mols)
if not drop_parent:
out_mols = out_mols + [mol]
out_mols = out_mols + frag_mols
else:
out_mols = out_mols + [mol]
return out_mols
def _recombine_monoatomic(frag_mols):
"""Take fragments and recombine"""
# pylint:disable=too-many-nested-blocks
for m in frag_mols:
link = None
if m.GetNumHeavyAtoms() == 1:
dummy_atoms = [x for x in m.GetAtoms() if x.GetAtomicNum() == 0]
dummy_atom_iso = [x.GetIsotope() for x in dummy_atoms]
dummy_match = []
matched_iso = []
for i, dum in enumerate(dummy_atom_iso):
for mo in frag_mols:
if mo != m:
for at in mo.GetAtoms():
if at.GetAtomicNum() == 0 and at.GetIsotope() == dum:
dummy_match.append(mo)
matched_iso.append(dum)
for i, iso in enumerate(matched_iso):
if i == 0:
link = utils.link_molecules(m, dummy_match[i], iso, iso)
else:
link = utils.link_molecules(link, dummy_match[i], iso, iso)
if link:
frag_mols.append(link)
frag_mols.remove(m)
for mo in dummy_match:
frag_mols.remove(mo)
return frag_mols
# def get_num_placeholders(mol):
# """Find number of * in a molecule"""
# n_p = 0
# for atom in mol.GetAtoms():
# if atom.GetAtomicNum() == 0:
# n_p += 1
# return n_p
[docs]
def generate_molecule_fragments(
mol,
patt: str = "[$([C;X4;!R]):1]-[$([R,!$([C;X4]);!#0;!#9;!#17;!#35;!#1]):2]",
drop_parent: bool = False,
recombine_mono: bool = True,
):
r"""Fragment a molecule into constituent groups
The molecule is first broken along ring-nonring single bonds,
then single bonds to atoms double bonded to ring (e.g. C-N={ring}),
then breaking based on the pattern provided.
Args:
mol: the rdkit molecule object to be fragmented
patt: SMARTs string that matches the bonds to be broken
after ring-non-ring are separated
Defaults to breaking alkyl-non-alkyl bonds. See notes
drop_parent: if False, do include the parent structure in
the third break. Defaults to breaking identifying alkyl chains.
If True, do not include the parents
recombine_mono: If True (default), will recombine separated one heavy
atom groups with the chains they are broken from in the last step
Returns:
list[mol]: a list of rdkit molecules generated by fragmenting
Note:
Bonds broken are labelled by integers. Breaking FragA-FragB-FragC will result in FragA-1 \*:1-FragB-\*:2, \*:2-FragC
In this way fragments can be rejoined by recombining matching integers.
The integers are added to the molecule as Isotopes
In the last step, halides are not separated from the alkyl groups by default,
and one-heavy atom groups are rejoined to acyclic portions of the molecule
"""
global _num_bonds_broken # pylint:disable=global-statement
_num_bonds_broken = 1
first_break = _fragment_molecule([mol], patt="[!#0;R:1]-!@[!#0;!#1:2]")
second_break = _fragment_molecule(
first_break, patt="[$([!#0;!R]=[!#0;R]):1]-[!#0;!#1;!R:2]"
)
third_break = _fragment_molecule(
second_break,
patt,
exld_ring=True,
drop_parent=drop_parent,
recombine_mono=recombine_mono,
)
return third_break
[docs]
def identify_connected_fragments(
inp: str,
keep_only_children: bool = True,
bb_patt: str = "[$([C;X4;!R]):1]-[$([R,!$([C;X4]);!#0;!#9;!#17;!#35;!#1]):2]",
input_type: str = "smile",
cml_file: str = "",
include_parent: bool = False,
aiida: bool = False,
) -> pd.DataFrame:
r"""
Given Smiles string, identify fragments in the molecule:
Break all ring-non-ring atom single bonds.
For atoms double bonded to rings, break their single bonds.
For non-ring fragments, separate those into alkyl chains and hetero/double bonded atoms.
(similar to Ertl functional groups)
Args:
input: a string containing either a smiles, .xyz, .mol or .cml filename for a given molecule
update input_type below to match provided input
keep_only_children: boolean, if True, when a group is broken down into its components
remove the parent group from output. If False, parent group is retained
bb_patt: string of SMARTS pattern for bonds to be broken in side chains and linkers
defaults to cleaving sp3 carbon-((ring OR not sp3 carbon) AND not-placeholder/halogen/H)
input_type: 'smile' if SMILES code or 'molfile' if .mol file, or 'xyzfile' if .xyz file, or 'cmlfile' if .cml file
Note: xyz file REQUIRES cml_file to be provided as well
cml_file: defaults to none, can be the cml file corresponding to the input .mol file
include_parent: If True, include column in output frame repeating parent molecule
intended use for True when merging multiple molecule fragment frames but need to retain a parent molecule object
aiida: if True, format output to be able to be used in aiida database.
That is, no molecule objects
Returns:
DataFrame with columms 'Smiles', 'Molecule', 'numAttachments' and 'xyz'
Containing, fragment smiles, fragment Chem.Molecule object, number of \* placeholders,
and rough xyz coordinates for the fragment is \* were At
Note:
Each bond breaking, connectivity is maintained through dummy atom labels.
e.g. C-N -> C-[1*] N-[1*] - reattaching via the matching labels would reassemble the molecule
currently will break apart a functional group if contains a ring-non-ring single bond.
e.g. ring N-nonring S=O -> ring N-[1*] nonring S=O-[1*]
"""
# pylint:disable=too-many-arguments
# pylint:disable=too-many-branches
if aiida:
inp = inp.value
keep_only_children = keep_only_children.value
bb_patt = bb_patt.value
input_type = input_type.value
if cml_file:
cml_file = cml_file.value
include_parent = include_parent.value
# ensure smiles is canonical so writing and reading the smiles will result in same number
# ordering of atoms
if input_type == "smile":
mol = utils.get_canonical_molecule(inp)
xyz_coords = []
elif input_type in ["cmlfile", "cmldict"]:
mol, atomic_symb, xyz_coords, atom_types = utils.mol_from_cml(
inp, input_type=input_type
)
if mol is None:
return None
elif input_type == "molfile":
# use coordinates in cml file provided if able, else use xyz from mol file
mol_dict = utils.mol_from_molfile(inp)
mol, atomic_symb = mol_dict["Molecule"], mol_dict["atomic_symbols"]
if cml_file:
xyz_coords, atom_types, _, _, _ = utils.data_from_cml(cml_file)
# atom_types = utils.get_cml_atom_types(cml_file)
else:
xyz_coords = mol_dict["xyz_pos"]
# elif input_type == "xyzfile":
# if not cml_file:
# raise ValueError("No cml file provided, expected one for xyz input type")
# mol_dict = utils.mol_from_xyzfile(xyz_file=inp, cml_file=cml_file)
# if mol_dict is None:
# return None
# mol, atomic_symb, xyz_coords = (
# mol_dict["Molecule"],
# mol_dict["atomic_symbols"],
# mol_dict["xyz_pos"],
# )
# atom_types = utils.get_cml_atom_types(cml_file)
else:
raise ValueError(
f"""{input_type} should either be molfile, xyzfile, cmlfile, or a smile string"""
)
mol_frags = generate_molecule_fragments(
mol, patt=bb_patt, drop_parent=keep_only_children
)
# initialize the output data frame
frag_frame = _generate_fragment_frame([Chem.MolToSmiles(x) for x in mol_frags])
# add hydrogens and xyz coordinates resulting from MMFF94 opt, changing placeholders to At
# frag_frame = _add_xyz_coords(frag_frame)
# count number of placeholders in each fragment - it is the number of places it is attached
frag_frame = _add_number_attachements(frag_frame)
if input_type in [
"molfile",
"xyzfile",
"cmlfile",
"cmldict",
]: # clear map labels and add xyz coordinates that are available
frag_frame = _add_frag_comp(frag_frame, mol)
frag_frame["Smiles"] = frag_frame["Smiles"].map(
_clear_map_number
) # (lambda x: _clear_map_number(x))
frag_frame["xyz"] = frag_frame["Atoms"].map(
lambda x: _add_rtr_xyz(x, xyz_coords)
)
frag_frame["Labels"] = frag_frame["Atoms"].map(
lambda x: _add_rtr_label(x, atomic_symb)
)
frag_frame["Molecule"] = frag_frame["Molecule"].map(
lambda x: _clear_map_number(x, "mol")
)
if cml_file or input_type == "cmlfile" or input_type == "cmldict":
frag_frame["atom_types"] = frag_frame["Atoms"].map(
lambda x: _add_rtr_type(x, atom_types)
)
if include_parent and not aiida:
mol = _clear_map_number(mol, "mol")
frag_frame["Parent"] = [mol] * len(frag_frame.index)
if aiida:
frag_frame = frag_frame.drop("Molecule", axis=1)
return frag_frame
def _add_rtr_label(at_num_list, atomic_symb):
out_list = []
for atom in at_num_list:
out_list.append(atomic_symb[atom - 1])
return out_list
def _add_rtr_type(at_num_list, atom_types):
out_list = []
for atom in at_num_list:
out_list.append(atom_types[atom - 1])
return out_list
def _add_rtr_xyz(at_num_list, xyz_coords):
"""Construct string of atomSymbol x y z format, for use to map to frag_frame"""
out_list = []
for atom in at_num_list:
out_list.append(xyz_coords[atom - 1])
return out_list
def _clear_map_number(mol_input, ret_type="smi"):
"""Given str or Chem.Molecule input, remove atomMapnumbers from atoms and return
smiles (ret_type='smi') or molecule object (ret_type = 'mol')"""
if isinstance(mol_input, str):
mol = Chem.MolFromSmiles(mol_input)
else:
mol = mol_input
if not mol:
raise ValueError(
f"""Could not construct mol from {mol_input} in output frame"""
)
for atom in mol.GetAtoms():
atom.ClearProp("molAtomMapNumber")
if ret_type == "smi":
return Chem.MolToSmiles(mol)
if ret_type == "mol":
return mol
raise ValueError(f"""Invalid ret_type, expected smi or mol, got {ret_type}""")
def _get_atlabels_in_frag(molecule):
"""For a given molecule, extract list of atom map number for all non-H atoms"""
# H's not included because the molecule object typically won't have explicit H
out_list = []
for atom in molecule.GetAtoms():
if atom.GetAtomicNum() != 0:
out_list.append(int(atom.GetProp("molAtomMapNumber")))
return out_list
def _add_frag_comp(frag_frame, mol):
"""Given frag_frame and mol, add Atoms col to frag_frame with indices of atoms starting at 1"""
# create list of lists of indices of atoms in each fragment
frag_atoms = []
for frag_mol in frag_frame["Molecule"]:
frag_atoms.append(_get_atlabels_in_frag(frag_mol))
# iterate over atoms, adding in hydrogens to the fragment since the above won't include H
for atom in mol.GetAtoms():
if atom.GetAtomicNum() == 1:
neigh_idx = atom.GetNeighbors()[0].GetIdx() + 1
for i, frag in enumerate(frag_atoms):
if neigh_idx in frag:
frag_atoms[i].append(atom.GetIdx() + 1)
break # only need to get here once - H has only one bond
frag_frame["Atoms"] = frag_atoms
return frag_frame
[docs]
def count_uniques(
frag_frame: pd.DataFrame, drop_attachments: bool = False, uni_smi_type: bool = False
) -> pd.DataFrame:
r"""Identify unique fragments in a frame and count the number of times they occur
Given frag_frame output from :attr:`group_decomposition.fragfunctions.identify_connected_fragments`,
remove dummy atom labels(and placeholders entirely if drop_attachments=True), then count unique fragments
using SMILES to identify unique fragments
Should also work on frames from :attr:`group_decomposition.fragfunctions.merge_uniques` or
:attr:`group_decomposition.fragfunctions.count_groups_in_set` as well
Args:
frag_frame: frame resulting from :attr:`group_decomposition.fragfunctions.identify_connected_fragments` typically,
or any similar frame with a list of SMILES codes in column ['Smiles']
drop_attachments: boolean, if False, retains placeholder \* at points of attachment,
if True, removes \* for fragments with more than one atom
uni_smi_type: include atom types in determination of unique
fragments. If false, only determine unique by SMILES
Returns:
pandas data frame with columns 'Smiles', 'count' and 'Molecule',
containing the Smiles string, the number of times the Smiles was in frag_frame,
and rdkit.Chem.Molecule object
Note:
if drop_attachments=False, similar fragments with different number/positions of
attachments will not count as being the same.
e.g. ortho-attached aromatics would not match with meta or para attached aromatics
If you've ran this previously with uni_smi_type=True, running on the output frame
(or other frame derived from such frame)
with uni_smi_type=False will collapse the output uniques determined by SMILE only
"""
# Change column indices to dict
# pylint:disable=too-many-locals
# pylint:disable=too-many-branches
# pylint:disable=too-many-statements
col_names = list(frag_frame.columns)
# if frag_frame already has xyz coordinates keep those and don't use others
# typically this will be if a mol and/or cml file was provided in frag_frame construction
if "xyz" in col_names:
xyz_inc = True
xyz_list = frag_frame["xyz"]
else:
xyz_inc = False
smile_list = frag_frame["Smiles"]
if "Atoms" in col_names:
atom_list = frag_frame["Atoms"]
atoms_inc = True
else:
atoms_inc = False
if "Labels" in col_names:
labels_list = frag_frame["Labels"]
labels_inc = True
else:
labels_inc = False
if "Parent" in col_names:
parent_list = frag_frame["Parent"]
parent_inc = True
else:
parent_inc = False
if "atom_types" in col_names:
type_list = frag_frame["atom_types"]
type_inc = True
else:
type_inc = False
if "count" in col_names:
count_list = frag_frame["count"]
count_inc = True
else:
count_inc = False
# print(type_list)
no_connect_smile = []
# Clean smiles - either removing placeholder entirely(drop_attachments True)
# Or just removing the dummyAtomLabel (drop_attachments False)
for smile in smile_list:
if drop_attachments:
no_connect_smile.append(_drop_smi_attach(smile))
else:
t_mol = Chem.MolFromSmiles(re.sub(r"\[[0-9]+\*\]", "*", smile))
no_connect_smile.append(Chem.MolToSmiles(t_mol))
# identify unique smiles and count number of times they occur
# initialize lists to be used making frame
unique_smiles = []
unique_xyz = []
unique_atoms = []
unique_labels = []
unique_parents = []
unique_smiles_counts = []
unique_types = []
counter_list = []
# Identify unique smiles, counting every occurence and adding xyz if included
if uni_smi_type:
ty_tup = list(map(tuple, frag_frame["atom_types"]))
smi_ty = list(zip(no_connect_smile, ty_tup))
for i, val in enumerate(smi_ty):
smile = val[0]
at_tys = val[1]
if smile not in unique_smiles:
unique_smiles.append(smile)
if xyz_inc:
unique_xyz.append(xyz_list[i])
if atoms_inc:
unique_atoms.append(atom_list[i])
if labels_inc:
unique_labels.append(labels_list[i])
if parent_inc:
unique_parents.append(parent_list[i])
if type_inc:
unique_types.append(type_list[i])
counter_list.append(Counter(at_tys))
if count_inc:
unique_smiles_counts.append(count_list[i])
else:
unique_smiles_counts.append(1)
else:
at_ty_count = Counter(at_tys)
if at_ty_count in counter_list:
smi_ix = counter_list.index(at_ty_count)
if count_inc:
unique_smiles_counts[smi_ix] += count_list[i]
else:
unique_smiles_counts[smi_ix] += 1
else:
unique_smiles.append(smile)
if xyz_inc:
unique_xyz.append(xyz_list[i])
if atoms_inc:
unique_atoms.append(atom_list[i])
if labels_inc:
unique_labels.append(labels_list[i])
if parent_inc:
unique_parents.append(parent_list[i])
# if type_inc:
# unique_types.append(labels_list[i])
counter_list.append(Counter(at_tys))
unique_types.append(type_list[i])
if count_inc:
unique_smiles_counts.append(count_list[i])
else:
unique_smiles_counts.append(1)
else:
for i, smile in enumerate(no_connect_smile):
if smile not in unique_smiles:
unique_smiles.append(smile)
if xyz_inc:
unique_xyz.append(xyz_list[i])
if atoms_inc:
unique_atoms.append(atom_list[i])
if labels_inc:
unique_labels.append(labels_list[i])
if parent_inc:
unique_parents.append(parent_list[i])
if type_inc:
unique_types.append(type_list[i])
if count_inc:
unique_smiles_counts.append(count_list[i])
else:
unique_smiles_counts.append(1)
else:
smi_ix = unique_smiles.index(smile)
if count_inc:
unique_smiles_counts[smi_ix] += count_list[i]
else:
unique_smiles_counts[smi_ix] += 1
# create output frame
un_frame = _construct_unique_frame(
uni_smi=unique_smiles,
uni_smi_count=unique_smiles_counts,
xyz=unique_xyz,
atoms=unique_atoms,
parents=unique_parents,
labels=unique_labels,
at_types=unique_types,
)
# if atoms_inc:
# un_frame['Atoms'] = unique_atoms
# if labels_inc:
# un_frame['Labels'] = unique_labels
# if parent_inc:
# un_frame['Parent'] = unique_parents
# if type_inc:
# un_frame['atom_types'] = unique_types
return un_frame
def _find_neigh_notin_frag(mol, at_list):
# for atoms with one attachment point
out_nbr = 0
for idx in at_list:
atom = mol.GetAtomWithIdx(idx - 1)
nbrs = atom.GetNeighbors()
for nbr in nbrs:
nbr_idx = nbr.GetIdx() + 1
if nbr_idx not in at_list:
out_nbr = nbr_idx
out_at = atom.GetIdx() + 1
break
if out_nbr:
break
return [out_at, out_nbr]
def _find_neigh_xyz(frag_frame, neigh_idx):
"""Given frag_frame and index of neighbor atom, find its xyz coordinates"""
# column index
atoms_idx = list(frag_frame.columns).index("Atoms")
# determine which row contains the neighbor atom
neigh_bool = list(frag_frame.apply(lambda row: neigh_idx in row[atoms_idx], axis=1))
# print(neigh_bool)
neigh_row = neigh_bool.index(True)
list_xyz = frag_frame["xyz"][neigh_row]
# find where in list neighbor atom is, return xyz at that index
neigh_list_idx = frag_frame["Atoms"][neigh_row].index(neigh_idx)
neigh_xyz = list_xyz[neigh_list_idx]
return neigh_xyz
def _move_along_bond(at_xyz, neigh_xyz, at_symb):
"""Returns xyz coordinates of neigh_xyz moved along bond to H bond length"""
at_np = np.array(at_xyz)
nb_np = np.array(neigh_xyz)
# vector for bond to move along
bond = at_np - nb_np
# bond length to find
target_length = _H_BOND_LENGTHS[at_symb]
steps = np.linspace(0.0, 1.0, 100)
# step along bond starting at neigh_xyz in direction of at_xyz.
# stop when bond length is target
for s in steps:
new_xyz = neigh_xyz + s * bond
if abs(_get_dist(new_xyz, at_np) - target_length) < 0.01:
end_xyz = new_xyz
# return target point
return end_xyz
def _get_dist(point_a, point_b):
"""Return distance btw two points"""
return math.sqrt(
(point_a[0] - point_b[0]) ** 2
+ (point_a[1] - point_b[1]) ** 2
+ (point_a[2] - point_b[2]) ** 2
)
def _find_H_xyz(mol, at_list, xyz_list, frag_frame):
"""Finds xyz of hydrogen atom that would be connected to a fragment
Args:
mol: Chem.Mol object
at_list: list[int] - list of atoms in the fragment
Note: these start at 1, but those in molecule start at 0.
To convert add/subtract 1
xyz_list: list of xyz_coordinates of atoms in fragment
frag_frame: full parent frag_frame WITHOUT filtering by number of attachments
Returns:
list[float]: xyz coordinates where H should be placed
"""
# find attached atom index and neighbor index in molecule
at_idx, neigh_idx = _find_neigh_notin_frag(mol, at_list)
# get attached atom symbol
symb = mol.GetAtomWithIdx(at_idx - 1).GetSymbol()
# index of the attached atom in at_list, which is the same as xyz_list
list_idx = at_list.index(at_idx)
# attached atom xyz
at_xyz = xyz_list[list_idx]
# neighbor atom xyz
neigh_xyz = _find_neigh_xyz(frag_frame, neigh_idx)
# move neighbor atom xyz along bond to H bond length (Gaussview defaults)
h_xyz = _move_along_bond(at_xyz, neigh_xyz, symb)
return h_xyz
def _clean_molecule_name(smile):
"""Removes symbols in smile code unfit for file name"""
smile = smile.replace("-", "Neg")
smile = smile.replace("[", "-")
smile = smile.replace("]", "-")
smile = smile.replace("(", "-")
smile = smile.replace(")", "-")
smile = smile.replace("#", "t")
smile = smile.replace("=", "d")
smile = smile.replace("+", "Pos")
smile = smile.replace("*", "Att")
smile = smile.replace("@", "")
return smile
# def generate_fragment_structures(
# inp: str,
# keep_only_children: bool = True,
# bb_patt: str = "[$([C;X4;!R]):1]-[$([R,!$([C;X4]);!#0;!#9;!#17;!#35;!#1]):2]",
# input_type="smile",
# cml_file="",
# include_parent=True,
# ):
# """generate fragments and return a dictionary of fragments"""
# # pylint:disable=too-many-arguments
# frag_frame = identify_connected_fragments(
# inp=inp,
# keep_only_children=keep_only_children,
# bb_patt=bb_patt,
# input_type=input_type,
# cml_file=cml_file,
# include_parent=include_parent,
# )
# mol = frag_frame.at[0, "Parent"]
# frag_dict = output_ifc_dict(mol, frag_frame)
# return frag_dict
# show
# def output_cgis_dicts(cgis_frame):
# """generate fragments and return a dictionary of fragments"""
# # should have parent col
# on_at_frame = pd.DataFrame(cgis_frame[cgis_frame["numAttachments"] == 1])
# col_names = list(on_at_frame.columns)
# xyz_idx = col_names.index("xyz")
# atoms_idx = col_names.index("Atoms")
# parent_idx = col_names.index("Parent")
# on_at_frame["H_xyz"] = on_at_frame.apply(
# lambda row: _find_H_xyz(
# row[parent_idx], row[atoms_idx], row[xyz_idx], cgis_frame
# ),
# axis=1,
# )
# on_at_frame["at_idx"] = on_at_frame.apply(
# lambda row: _find_at_idx(row[parent_idx], row[atoms_idx]), axis=1
# )
# out_dict = {
# re.sub(
# r"\[[0-9]+\*\]", "*", on_at_frame.at[i, "Smiles"]
# ): _write_frag_structure(
# frag_mol=on_at_frame.at[i, "Molecule"],
# xyz_list=on_at_frame.at[i, "xyz"],
# symb_list=on_at_frame.at[i, "Labels"],
# h_xyz=on_at_frame.at[i, "H_xyz"],
# at_idx=on_at_frame.at[i, "at_idx"],
# atom_types=frag_frame.at[i, "atom_types"],
# )
# for i in range(0, nrow)
# }
# return out_dict
[docs]
def output_ifc_dict(mol, frag_frame: pd.DataFrame, done_smi: list[str]):
"""generate a dictionary containing identify_connected_fragment information
Only new fragments are included. Previously parsed fragments are listed in done_smi.
Args:
mol: rdkit molecule object that was fragmented
frag_frame: identify_connected_fragments frame generated for mol
done_smi: list of fragments which have been identified already
Returns:
list containing dict for fragments and done_smi lengthened by the number of fragments done
Note:
mainly for use in generating information for unique fragments in
an AiiDA workflow
"""
on_at_frame = pd.DataFrame(frag_frame[frag_frame["numAttachments"] == 1])
if on_at_frame.empty:
return None, done_smi
col_names = list(on_at_frame.columns)
# Find indices of relevant columns
xyz_idx = col_names.index("xyz")
atoms_idx = col_names.index("Atoms")
# Find H xyz position and index of atom bonded to H
on_at_frame["H_xyz"] = on_at_frame.apply(
lambda row: _find_H_xyz(mol, row[atoms_idx], row[xyz_idx], frag_frame), axis=1
)
on_at_frame["at_idx"] = on_at_frame.apply(
lambda row: _find_at_idx(mol, row[atoms_idx]), axis=1
)
nrow = on_at_frame.shape[0]
on_at_frame = on_at_frame.reset_index(drop=True)
# print(on_at_frame)
# print(_clean_molecule_name(on_at_frame.at[0,'Smiles']))
# print(on_at_frame.at[0, "Molecule"])
# print(on_at_frame.at[0, "Labels"])
# print(on_at_frame.at[0, "xyz"])
# print(on_at_frame.at[0, "H_xyz"])
# print(on_at_frame.at[0, "at_idx"])
# print(on_at_frame.at[0, "atom_types"])
out_dict = {}
for i in range(0, nrow):
smi = on_at_frame.at[i, "Smiles"]
if smi not in done_smi:
done_smi.append(smi)
key = re.sub(r"\[[0-9]+\*\]", "*", smi)
out_dict[key] = _write_frag_structure(
frag_mol=on_at_frame.at[i, "Molecule"],
xyz_list=on_at_frame.at[i, "xyz"],
symb_list=on_at_frame.at[i, "Labels"],
h_xyz=on_at_frame.at[i, "H_xyz"],
at_idx=on_at_frame.at[i, "at_idx"],
atom_types=frag_frame.at[i, "atom_types"],
)
return [out_dict, done_smi]
def _write_frag_structure(frag_mol, xyz_list, symb_list, h_xyz, at_idx, atom_types):
"""get output structure"""
# pylint:disable=too-many-arguments
# print("in function")
num_atoms = len(symb_list)
# smile = re.sub('\[[0-9]+\*\]', '*', Chem.MolToSmiles(frag_mol,canonical=False))
charge = Chem.GetFormalCharge(frag_mol)
out_xyz = [xyz_list[at_idx], h_xyz]
for i in range(num_atoms):
if i != at_idx:
out_xyz.append(xyz_list[i])
# geom_frame = pd.DataFrame(out_xyz,columns=['x','y','z'])
symb_list.insert(1, "H")
return {
"geom": out_xyz,
"charge": charge,
"atom_types": atom_types,
"atom_symbols": symb_list,
}
[docs]
def output_ifc_gjf(
mol,
frag_frame,
esm="wb97xd",
basis_set="aug-cc-pvtz",
wfx=True,
n_procs=4,
mem="3200MB",
multiplicity=1,
):
"""Takes a fragmented molecule and outputs gjf files of the fragments with one
attachment point.
Hydrogen is added in place of the connection to the rest of
the molecule for the fragment
Args:
mol: Chem.Mol object for which fragmentation was performed
frag_frame: output from either count_uniques or identify_connected_fragments
esm: str, electronic structure method to include in gjf
basis_set: str, basis set to include in gjf
wfx: Boolean, if True add output=wfx to gjf file
n_procs: int, >=0. if >0, add number of processors to be used to gjf
mem: str, format "nMB" or "nGB", memory to be used in gjf
multiplicity: int, defaults to 1. Multiplicity of molecule
Returns:
Creates gjf files in working directory for each fragment in frag_frame
Note:
H position is set by taking the atom the fragment is bonded two, replacing it with H
and moving that closer to the C until it reaches the default distance
Default distances taken from Gaussview "clean" C-H, C-O, etc bond lengths
"""
on_at_frame = pd.DataFrame(frag_frame[frag_frame["numAttachments"] == 1])
col_names = list(on_at_frame.columns)
# Find indices of relevant columns
xyz_idx = col_names.index("xyz")
atoms_idx = col_names.index("Atoms")
labels_idx = col_names.index("Labels")
mol_idx = col_names.index("Molecule")
# Find H xyz position and index of atom bonded to H
on_at_frame["H_xyz"] = on_at_frame.apply(
lambda row: _find_H_xyz(mol, row[atoms_idx], row[xyz_idx], frag_frame), axis=1
)
on_at_frame["at_idx"] = on_at_frame.apply(
lambda row: _find_at_idx(mol, row[atoms_idx]), axis=1
)
# hxyz = len(col_names)
# atidx = len(col_names) + 1
# print(on_at_frame)
# write gjfs
on_at_frame.apply(
lambda row: _write_frag_gjf(
frag_mol=row[mol_idx],
xyz_list=row[xyz_idx],
symb_list=row[labels_idx],
h_xyz=row[len(col_names)],
at_idx=row[len(col_names) + 1],
esm=esm,
basis_set=basis_set,
wfx=wfx,
n_procs=n_procs,
mem=mem,
multiplicity=multiplicity,
),
axis=1,
)
def _clean_basis(basis_set):
basis_set = basis_set.replace("(", "")
basis_set = basis_set.replace(")", "")
basis_set = basis_set.replace(",", "")
basis_set = basis_set.replace("+", "p")
return basis_set
def _write_frag_gjf(
frag_mol,
xyz_list,
symb_list,
h_xyz,
at_idx,
esm="wb97xd",
basis_set="aug-cc-pvtz",
wfx=True,
n_procs=4,
mem="3200MB",
multiplicity=1,
):
"""Write the gjf file for a fragment.
Args:
frag_mol: Chem.Mol object
xyz_list: list of xyz_coords of list
symb_list: list of atomic symbols of fragment
h_xyz: xyz coords of hydrogen attached to fragment
at_idx: index of attached atom in fragment
esm: str, electronic structure method to include in gjf
basis_set: str, basis set to include in gjf
wfx: Boolean, if True add output=wfx to gjf file
n_procs: int, >=0. if >0, add number of processors to be used to gjf
mem: str, format "nMB" or "nGB", memory to be used in gjf
multiplicity: int, defaults to 1. Multiplicity of molecule
Default template:
%chk={filename}.chk
%nprocs=4
%mem=3200MB
# esm/basis_set opt freq output=wfx
smile
charge mult
{xyz}
{filename}.wfx
"""
# pylint:disable=too-many-locals
num_atoms = len(symb_list)
smile = re.sub(r"\[[0-9]+\*\]", "*", Chem.MolToSmiles(frag_mol, canonical=False))
charge = Chem.GetFormalCharge(frag_mol)
molecule_name = _clean_molecule_name(smile)
# print(at_idx)
# print(xyz_list[at_idx])
# print(h_xyz)
# build xyz of molecule
out_xyz = [xyz_list[at_idx], h_xyz]
for i in range(num_atoms):
if i != at_idx:
out_xyz.append(xyz_list[i])
geom_frame = pd.DataFrame(out_xyz, columns=["x", "y", "z"])
symb_list.insert(1, "H")
geom_frame["Atom"] = symb_list
geom_frame = geom_frame[["Atom", "x", "y", "z"]]
# create file name
clean_basis = _clean_basis(basis_set)
new_file_name = "SubH" + "_" + molecule_name + "_" + esm + "_" + clean_basis
if os.path.exists(new_file_name + ".gjf"):
# print('deleting')
os.remove(new_file_name + ".gjf")
# write file
with open(new_file_name + ".gjf", "a", encoding="utf-8") as f:
f.write(f"%chk={new_file_name}" + ".chk\n")
if n_procs:
f.write(f"%nprocs={n_procs}\n")
if mem:
f.write(f"%mem={mem}\n")
if wfx:
f.write(f"#p {esm}/{basis_set} opt freq output=wfx\n")
else:
f.write(f"#p {esm}/{basis_set} opt freq\n")
f.write("\n")
f.write(f"{smile}")
f.write("\n\n")
f.write(f"{charge} {multiplicity}\n")
dfAsString = geom_frame.to_string(header=False, index=False)
f.write(dfAsString)
f.write("\n\n")
if wfx:
f.write(new_file_name + ".wfx\n\n\n")
else:
f.write("\n\n\n")
def _find_at_idx(mol, at_list):
"""Returns index of atom connected to the remainder of the molecule in the list
of fragment atom numbers
Args:
mol: Chem.Mol object
at_list: list[int] of atoms in molecule belonging to fragment being studied
Note: these indices start at 1, while those in the molecule start at 0
Returns:
int of index of connected atom in at_list. See examples for worked explanation
Example: a fragment is defined by atoms [1,3,5,6,8]
Atom 3 is where the remainder of the molecule attaches to the fragment
The function returns the index of 3 in the atom list.
In this case, it would return 1
"""
# find
at_idx = _find_neigh_notin_frag(mol, at_list)[0]
list_idx = at_list.index(at_idx)
return list_idx
def _construct_unique_frame(
uni_smi: list[str],
uni_smi_count: list[int],
xyz=[],
atoms=[],
parents=[],
labels=[],
at_types=[],
) -> pd.DataFrame:
"""given smiles, counts and (optional) xyz coordinates, create frame"""
# pylint:disable=dangerous-default-value
uniquefrag_frame = pd.DataFrame(uni_smi, columns=["Smiles"])
if xyz:
uniquefrag_frame["xyz"] = xyz
if atoms:
uniquefrag_frame["Atoms"] = atoms
if parents:
uniquefrag_frame["Parent"] = parents
if labels:
uniquefrag_frame["Labels"] = labels
if at_types:
uniquefrag_frame["atom_types"] = at_types
PandasTools.AddMoleculeColumnToFrame(
uniquefrag_frame, "Smiles", "Molecule", includeFingerprints=True
)
uniquefrag_frame["count"] = uni_smi_count
# if we don't have xyz already add them, from MMFF94 opt
if not xyz:
uniquefrag_frame = _add_xyz_coords(uniquefrag_frame)
# count number placeholders
uniquefrag_frame = _add_number_attachements(uniquefrag_frame)
return uniquefrag_frame
def _drop_smi_attach(smile: str):
"""completely remove placeholder if number of non-placeholder in smiles is > 1"""
mol = Chem.MolFromSmiles(smile)
non_zero_atoms = 0
for atom in mol.GetAtoms():
if atom.GetAtomicNum() != 0:
non_zero_atoms += 1
if non_zero_atoms > 1:
temp = re.sub(r"\[[0-9]+\*\]", "", smile)
t_mol = Chem.MolFromSmiles(re.sub(r"\(\)", "", temp))
if t_mol is None:
t_mol = Chem.MolFromSmiles(re.sub(r"\[[0-9]+\*\]", "*", smile))
out_smi = Chem.MolToSmiles(t_mol)
# Warning('Could not construct {smile} without attachments'.format(smile=smile))
else:
out_smi = Chem.MolToSmiles(t_mol)
else:
out_smi = Chem.MolToSmiles(
Chem.MolFromSmiles(re.sub(r"\[[0-9]+\*\]", "*", smile))
)
return out_smi
[docs]
def merge_uniques(
frame1: pd.DataFrame, frame2: pd.DataFrame, uni_smi_ty=True
) -> pd.DataFrame:
"""Given two frames of unique fragments, identify shared unique fragments,
merge count and frames together.
Args:
frame1: a frame output from count_uniques
frame2: a distinct frame also from count_uniques
uni_smi_ty: If True, include atom types in determination of unique
fragments. If false, only determine unique by SMILES
Returns:
a frame resulting from the merge of frame1 and frame2.
All rows that have Smiles that are in frame1 but not frame2(and vice versa) are included
unmodified
If a row's SMILES is in both frame1 and frame2, modify the row to update the count of
that fragment as sum of frame1 and frame2, then include one row.
Note:
for best results, SMILES must be canonical so that they can be exactly compared.
Smiles in frame should be resulting from Chem.MolToSmiles(Chem.MolFromSmiles(smile)) -
this will create a molecule from the smile, and write the smile back, in canonical form
Example usage:
>>> frame1
Smiles count
C 2
C1CCC1 1
>>> frame2
Smiles count
C 3
C1CC1 2
>>> merge_uniques(frame1,frame2)
Smiles count
C 5
C1CCC1 1
C1CC1 2
"""
if frame1.empty:
merge_frame = frame2
elif frame2.empty:
merge_frame = frame1
else:
rows_to_drop = _find_rows_to_drop(frame1, frame2, uni_smi_ty)
merge_frame = rows_to_drop["merge_frame"]
drop_frame_1 = frame1.drop(rows_to_drop["drop_rows_1"])
drop_frame_2 = frame2.drop(rows_to_drop["drop_rows_2"])
merge_frame = pd.concat([drop_frame_1, drop_frame_2, merge_frame])
merge_frame.reset_index(inplace=True, drop=True)
# _add_frame(drop_frame_1,merge_frame)
# merge_frame = _add_frame(drop_frame_2,merge_frame)
return merge_frame
def _find_rows_to_drop(frame_a: pd.DataFrame, frame_b: pd.DataFrame, uni_smi_ty=True):
"""for two frames, find rows with same smile, store what rows to drop in each"""
rows_to_drop_one = []
rows_to_drop_two = []
col_names = list(frame_a.columns)
merge_frame = pd.DataFrame(columns=col_names)
frame_a_idx = list(frame_a.index)
frame_b_idx = list(frame_b.index)
for i, smi in enumerate(frame_a["Smiles"]):
if smi in list(frame_b["Smiles"]):
# print(f'i is {i}')
if uni_smi_ty:
smi_idx = [
x
for x in range(0, frame_b.shape[0])
if smi == list(frame_b["Smiles"])[x]
]
ctr = [list(map(Counter, frame_b["atom_types"]))[x] for x in smi_idx]
if list(map(Counter, frame_a["atom_types"]))[i] not in ctr:
continue
j = smi_idx[ctr.index(list(map(Counter, frame_a["atom_types"]))[i])]
else:
j = list(frame_b["Smiles"]).index(smi)
# print(f'j is {j}')
# print(frame_a.shape)
cum_count = (
frame_a.at[frame_a_idx[i], "count"]
+ frame_b.at[frame_b_idx[j], "count"]
)
# print('cum_count is {cum_count}')
merge_frame = pd.concat(
[
merge_frame,
pd.DataFrame(
[list(frame_a.iloc[frame_a_idx[i]])], columns=col_names
),
]
)
merge_frame.reset_index(inplace=True, drop=True)
# print(f'merge frame before updating cumcount {merge_frame}')
# print(merge_frame.shape[0]-1)
merge_frame.at[merge_frame.shape[0] - 1, "count"] = cum_count
# print(f'merge_frame after updating cumcount {merge_frame}')
# if made_frame == 0:
# merge_frame = pd.DataFrame.from_dict({'Smiles':[smi],'count':[cum_count]})
# made_frame=1
# else:
# merge_frame.loc[len(merge_frame)] = [smi, cum_count]
rows_to_drop_one.append(i)
rows_to_drop_two.append(j)
# print(merge_frame)
return {
"drop_rows_1": rows_to_drop_one,
"drop_rows_2": rows_to_drop_two,
"merge_frame": merge_frame,
}
[docs]
def count_groups_in_set(
list_of_inputs: list[str],
drop_attachments: bool = False,
input_type: str = "smile",
bb_patt: str = "[$([C;X4;!R]):1]-[$([R,!$([C;X4]);!#0;!#9;!#17;!#35;!#1]):2]",
cml_list=None,
uni_smi_ty: bool = True,
aiida: bool = False,
) -> pd.DataFrame:
"""Identify unique fragments in molecules defined in the list_of_smiles,
and count the number of occurences for duplicates.
Args:
list_of_smiles: A list, with each element being a SMILES string, e.g. ['CC','C1CCCC1']
drop_attachments: Boolean for whether or not to drop attachment points from fragments
if True, will remove all placeholder atoms indicating connectivity
if False, placeholder atoms will remain
input_type: smile, xyzfile, cmlfile or molfile, based on elements of lists_of_inputs
cml_list: defaults empty, but can be a list of cml files corresponding to the molfile inputs
bb_patt: SMARTS pattern for bonds to break in linkers and side chains. Defaults to breaking
bonds between nonring carbons with four bonds single bonded to ring atoms or carbons that
don't have four bonds, and are not H, halide, or placeholder
uni_smi_type: if True, include atom types in determination of unique
fragments. If false, only determine unique by SMILES
Returns:
an output pd.DataFrame, with columns 'Smiles' for fragment Smiles,
'count' for number of times each fragment occurs in the list, and
'Molecule' holding a rdkit.Chem.Molecule object
Example usage:
>>> count_groups_in_set(['c1ccc(c(c1)c2ccc(o2)C(=O)N3C[C@H](C4(C3)CC[NH2+]CC4)C(=O)NCCOCCO)F',
'Cc1nc2ccc(cc2s1)NC(=O)c3cc(ccc3N4CCCC4)S(=O)(=O)N5CCOCC5'],drop_attachments=False)
"""
out_frame = pd.DataFrame()
for i, inp in enumerate(list_of_inputs):
# print(inp)
if cml_list:
frame = identify_connected_fragments(
inp,
bb_patt=bb_patt,
input_type=input_type,
cml_file=cml_list[i],
include_parent=True,
)
else:
frame = identify_connected_fragments(
inp, bb_patt=bb_patt, input_type=input_type, include_parent=True
)
if frame is not None:
unique_frame = count_uniques(frame, drop_attachments, uni_smi_type=True)
if out_frame.empty:
out_frame = unique_frame
else:
out_frame = merge_uniques(out_frame, unique_frame, uni_smi_ty)
out_frame = out_frame.drop("Molecule", axis=1)
if not aiida:
PandasTools.AddMoleculeColumnToFrame(
out_frame, "Smiles", "Molecule", includeFingerprints=True
)
# out_frame = _add_xyz_coords(out_frame)
# out_frame = _add_number_attachements(out_frame)
return out_frame