Source code for group_decomposition.utils

"""
utils.py

Various utilities used in fragmenting molecules - retrieving information from molecules
Identifying small parts of molecules, and tools for minor manipulations of SMILES or molecules
"""
import ast
import os

import pandas as pd
from rdkit import Chem  # pylint:disable=import-error
from rdkit.Chem import (  # pylint:disable=import-error # search for rdScaffoldAttachment points * to remove; pylint:disable=import-error; rdDetermineBonds,
    AllChem,
    rdDetermineBonds,
    rdqueries,
)


[docs] def get_molecules_atomicnum(molecule): """Given molecule object, get list of atomic numbers.""" # a atom_num_list = [] for atom in molecule.GetAtoms(): atom_num_list.append(atom.GetAtomicNum()) return atom_num_list
[docs] def get_molecules_atomsinrings(molecule): """Given molecule object, get Boolean list of if atoms are in a ring.""" is_in_ring_list = [] for atom in molecule.GetAtoms(): is_in_ring_list.append(atom.IsInRing()) return is_in_ring_list
[docs] def trim_placeholders(rwmol): """Given Chem.RWmol, remove atoms with atomic number 0.""" qa = rdqueries.AtomNumEqualsQueryAtom(0) # define query for atomic number 0 if len(rwmol.GetAtomsMatchingQuery(qa)) > 0: # if there are matches query_match = rwmol.GetAtomsMatchingQuery(qa) rm_at_idx = [] for atom in query_match: # identify atoms to be removed rm_at_idx.append(atom.GetIdx()) # remove starting from highest number so upcoming indices not affected rm_at_idx_sort = sorted(rm_at_idx, reverse=True) # e.g. need to remove 3 and 5, if don't do this and you remove 3, # then the 5 you want to remove is now 4, and you'll remove wrong atom for idx in rm_at_idx_sort: # remove atoms rwmol.RemoveAtom(idx) return rwmol
[docs] def mol_with_atom_index(mol): """Label a molecule with map numbers for order atoms are in molecule. Useful for tracking atoms after fragmenting Args: mol: an rdkit molecule object Returns: rdkit molecule object with AtomMapNum set so that atom 0 has map number 1, atom 1 has map number 2...etc """ # from https://www.rdkit.org/docs/Cookbook.html for atom in mol.GetAtoms(): if atom.GetAtomicNum() != 0: atom.SetAtomMapNum(atom.GetIdx() + 1) return mol
def _get_charge_from_cml(cml_file): with open(cml_file, encoding="utf-8") as file: for line in file: if "formalCharge" in line: split_line = line.split(" ") for word in split_line: if "formalCharge" in word: charge = int( word.replace("formalCharge=", "") .replace(">\n", "") .replace('"', "") ) break return charge def mol_from_xyzfile(xyz_file: str, cml_file): """Attempt to create a molecule from an xyz file by automatically determining bond orders""" # raw_mol = Chem.MolFromXYZFile('DUDE_67368827_adrb2_decoys_C19H25N3O4_CIR.xyz') # mol = Chem.Mol(raw_mol) # rdDetermineBonds.DetermineConnectivity(mol) # rdDetermineBonds.DetermineBondOrders(mol) mol = Chem.Mol(Chem.MolFromXYZFile(xyz_file)) try: rdDetermineBonds.DetermineBonds(mol, charge=_get_charge_from_cml(cml_file)) except: # pylint:disable=bare-except if os.path.isfile("error_log.txt"): with open("error_log.txt", "a", encoding="utf-8") as er_file: er_file.write( f"Could not determine bond orders from xyz for {cml_file}\n" ) else: with open("error_log.txt", "w", encoding="utf-8") as er_file: er_file.write(f"Could not determine bond orders for {cml_file}\n") return None atomic_symbols = [] xyz_coordinates = [] ats_read = 0 num_atoms = mol.GetNumAtoms() with open(xyz_file, encoding="utf-8") as file: for line_number, line in enumerate(file): if ats_read < num_atoms and line_number > 1: ats_read += 1 atomic_symbol, x, y, z = line.split()[:4] atomic_symbols.append(atomic_symbol) xyz_coordinates.append([float(x), float(y), float(z)]) elif ats_read == num_atoms: break return { "Molecule": mol_with_atom_index(mol), "xyz_pos": xyz_coordinates, "atomic_symbols": atomic_symbols, } # from https://github.com/rdkit/rdkit/issues/2413 # conf = m.GetConformer() # in principal, you should check that the atoms match # for i in range(m.GetNumAtoms()): # print(i) # x,y,z = xyz_coordinates[i] # conf.SetAtomPosition(i,Point3D(x,y,z))
[docs] def get_cml_atom_types(cml_file): """Extract atom types from cml file Args: cml_file: cml file name Returns: Data Frame column whose elements are tuples of atom types as defined by Retrievium. (atom number, type, valence) """ n_atl = 0 type_list = [] idx_list = [] with open(cml_file, encoding="utf-8") as file: for line in file: if "atomTypeList" in line: n_atl += 1 if n_atl == 2: break elif n_atl == 1: split_line = line.split() idx_list.append( int(split_line[1].split("=")[1].replace('"', "").replace("a", "")) - 1 ) at_label = split_line[2].split("=")[1].replace('"', "") at_type = int(split_line[3].split("=")[1].replace('"', "")) at_valence = int( split_line[4].split("=")[1].split("/")[0].replace('"', "") ) type_list.append((at_label, at_type, at_valence)) temp_frame = pd.DataFrame(list(zip(idx_list, type_list)), columns=["idx", "type"]) temp_frame.sort_values(by="idx", inplace=True) return temp_frame["type"]
def add_cml_atoms_bonds(el_list, bond_list): """create a molecule from cml file, building it one atom at a time then adding in bond orders. Note: Bond orders from Retrievium cml are problematic at times. Recommended construction is to build atoms and connectivity from cml file Then assign bond orders based on template smiles also found in cml file""" flag = 1 for atom in el_list: if flag: mol = Chem.MolFromSmiles(atom) rwmol = Chem.RWMol(mol) rwmol.BeginBatchEdit() flag = 0 else: rwmol.AddAtom(Chem.Atom(atom)) # mw.AddBond(6,7,Chem.BondType.SINGLE) for bond in bond_list: if bond[2] == "S": rwmol.AddBond(bond[0] - 1, bond[1] - 1, Chem.BondType.SINGLE) elif bond[2] == "D": rwmol.AddBond(bond[0] - 1, bond[1] - 1, Chem.BondType.DOUBLE) elif bond[2] == "T": rwmol.AddBond(bond[0] - 1, bond[1] - 1, Chem.BondType.TRIPLE) elif bond[2] == "A": rwmol.AddBond(bond[0] - 1, bond[1] - 1, Chem.BondType.AROMATIC) rwmol.CommitBatchEdit() return rwmol def _add_cml_single_atoms_bonds(el_list, bond_list): """Build a mol from list of atoms and bonds, one atom at a time Bonds assigned are only single bonds. Adjust bonds after with _modAssignBondOrdersFromTemplate""" rwmol = Chem.RWMol(Chem.Mol()) for atom in el_list: rwmol.AddAtom(Chem.Atom(atom)) # mw.AddBond(6,7,Chem.BondType.SINGLE) for bond in bond_list: rwmol.AddBond(bond[0] - 1, bond[1] - 1, Chem.BondType.SINGLE) rwmol.CommitBatchEdit() return rwmol
[docs] def smiles_from_cml(cml_file, smile_tag="retrievium:inputSMILES"): """Finds the Retreivium SMILES in a cml file with a given label Args: cml_file: cml file name smile_tag: the label fo the SMILEs in the cml file. Defaults to input SMILEs Returns: string of the input SMILES code tagged in the file as retrievium:inputSMILES Note: Must be used on .cml files from the Retrievium database https://retrievium.ca """ flag = 0 with open(cml_file, encoding="utf-8") as file: for line in file: if smile_tag in line: flag = 1 elif flag == 1: smile = line.split(">")[1].split("<")[0] break return smile
[docs] def mol_from_cml(cml_file, input_type="cmlfile"): """Creates a molecule from a cml file and returns atoms, xyz and types Builds molecule one atom at a time connected by single bonds Then determines bond orders by mapping to a template smiles in the cml Finally, updates property cache, initializes ring info, and sanitizes mol If no match between SMILEs and connectivity, returns None and writes error to file Args: cml_file: cml file name or dictionary containing cml data input_type: cmlfile or cmldict. Use cmlfile if just raw cml file, use cmldict if dictionary Returns: list with elements: [Molecule, list of atom symbols in molecule, list of xyz coords of atoms in molecule, list of atom types of atoms in molecule] Note: list order matches mol numbering in cml """ if input_type == "cmlfile": xyz_coords, at_types, bond_list, el_list, _ = data_from_cml(cml_file, True) smile = smiles_from_cml(cml_file) elif input_type == "cmldict": xyz_coords = cml_file["geom"] at_types = cml_file["atom_types"] bond_list = cml_file["bonds"] el_list = cml_file["labels"] # charge = cml_file["charge"] smile = cml_file["smiles"] rwmol = _add_cml_single_atoms_bonds(el_list, bond_list) for atom in rwmol.GetAtoms(): atom.SetNoImplicit(True) rwmol2 = Chem.RemoveHs(rwmol, implicitOnly=True, updateExplicitCount=False) template = AllChem.MolFromSmiles(smile) bond_mol = _modAssignBondOrdersFromTemplate(template, rwmol2, cml_file) # need rings for aromaticity check # if os.path.isfile('error_log.txt'): # er_file = open('error_log.txt','a') # er_file.write(f'Could not sanitize {cml_file}\n') # er_file.close() # else: # er_file = open('error_log.txt','w') # er_file.write(f'Could not sanitize {cml_file}\n') # er_file.close() if bond_mol: bond_mol.UpdatePropertyCache() Chem.GetSymmSSSR(bond_mol) try: Chem.SanitizeMol(bond_mol) except: # pylint:disable=bare-except _write_error(f"Could not sanitize {cml_file}\n") return [None, None, None, None] return [mol_with_atom_index(bond_mol), el_list, xyz_coords, at_types] # if os.path.isfile('error_log.txt'): # er_file = open('error_log.txt','a') # er_file.write(f'No match between template smiles and connected geom for {cml_file}\n') # er_file.close() # else: # er_file = open('error_log.txt','w') # er_file.write(f'No match between template smiles and connected geom for {cml_file}\n') # er_file.close() return [None, None, None, None]
def _write_error(errormessage): """writes errormessage to errorlog.txt""" if os.path.isfile("error_log.txt"): with open("error_log.txt", "a", encoding="utf-8") as er_file: er_file.write(errormessage) else: with open("error_log.txt", "w", encoding="utf-8") as er_file: er_file.write(errormessage) def _modAssignBondOrdersFromTemplate(refmol, mol, cml_file): """This is originally from RDKit AllChem module. Modified here by Kevin Lefrancois-Gagnon(KLG) to disallow implicit hydrogens on all molcule objects used. This corresponds to the 4 for loops after creation of mol objects. Also Returns None for no match instead of value error, for use on large number of molecules, a single error won't stop the whole thing All other code is unmodified from the original. This was required by KLG for the following reason: using the original AssignBondOrdersFromTemplate on a molecule with only explicit hydrogens with only connectivity, no bond types other than single bonds. When a charged atom like nitrogen was present, extra H were being added For example when a part was expected as C-[NH2+]-C, the following was seen C-[NH2+]([H])([H])-C, essentially doubling the expected # hydrogens These could not be deleted by Chem.RemoveHs(mol,implicitOnly=True) as for some reason these were explicit. The reason for why these were added is unclear Beyond the obviously problematic extra hydrogens, it also produced an error case when the later part of the code would clean up bond orders, like for some ring ketones in a ring labeled aromatic by the SMILEs. Adjusting implicit Hs to be disallowed for all molecules and their copies at the start resulted in the normal expected outputs. Resume original documentation: assigns bond orders to a molecule based on the bond orders in a template molecule Arguments - refmol: the template molecule - mol: the molecule to assign bond orders to An example, start by generating a template from a SMILES and read in the PDB structure of the molecule >>> import os >>> from rdkit.Chem import AllChem >>> template = AllChem.MolFromSmiles("CN1C(=NC(C1=O)(c2ccccc2)c3ccccc3)N") >>> mol = AllChem.MolFromPDBFile(os.path.join(RDConfig.RDCodeDir, 'Chem', 'test_data', '4DJU_lig.pdb')) >>> len([1 for b in template.GetBonds() if b.GetBondTypeAsDouble() == 1.0]) 8 >>> len([1 for b in mol.GetBonds() if b.GetBondTypeAsDouble() == 1.0]) 22 Now assign the bond orders based on the template molecule >>> newMol = AllChem.AssignBondOrdersFromTemplate(template, mol) >>> len([1 for b in newMol.GetBonds() if b.GetBondTypeAsDouble() == 1.0]) 8 Note that the template molecule should have no explicit hydrogens else the algorithm will fail. It also works if there are different formal charges (this was github issue 235): >>> template=AllChem.MolFromSmiles('CN(C)C(=O)Cc1ccc2c(c1)NC(=O)c3ccc(cc3N2)c4ccc(c(c4)OC)[N+](=O)[O-]') >>> mol = AllChem.MolFromMolFile(os.path.join(RDConfig.RDCodeDir, 'Chem', 'test_data', '4FTR_lig.mol')) >>> AllChem.MolToSmiles(mol) 'COC1CC(C2CCC3C(O)NC4CC(CC(O)N(C)C)CCC4NC3C2)CCC1N(O)O' >>> newMol = AllChem.AssignBondOrdersFromTemplate(template, mol) >>> AllChem.MolToSmiles(newMol) 'COc1cc(-c2ccc3c(c2)Nc2ccc(CC(=O)N(C)C)cc2NC3=O)ccc1[N+](=O)[O-]' """ # pylint:disable=too-many-branches refmol = Chem.AddHs(refmol) refmol2 = Chem.Mol(refmol) refmol2 = Chem.AddHs(refmol2) mol2 = Chem.Mol(mol) for atom in mol.GetAtoms(): atom.SetNoImplicit(True) for atom in mol.GetAtoms(): atom.SetNoImplicit(True) for atom in refmol2.GetAtoms(): atom.SetNoImplicit(True) for atom in refmol.GetAtoms(): atom.SetNoImplicit(True) # do the molecules match already? matching = mol2.GetSubstructMatch(refmol2) if not matching: # no, they don't match # check if bonds of mol are SINGLE Chem.rdchem.BondType.DOUBLE for b in mol2.GetBonds(): if b.GetBondType() != Chem.rdchem.BondType.SINGLE: b.SetBondType(Chem.rdchem.BondType.SINGLE) b.SetIsAromatic(False) # set the bonds of mol to SINGLE for b in refmol2.GetBonds(): b.SetBondType(Chem.rdchem.BondType.SINGLE) b.SetIsAromatic(False) # set atom charges to zero; for a in refmol2.GetAtoms(): a.SetFormalCharge(0) for a in mol2.GetAtoms(): a.SetFormalCharge(0) matching = mol2.GetSubstructMatches(refmol2, uniquify=False) # do the molecules match now? if matching: # if len(matching) > 1: # logger.warning(msg="More than one matching pattern found - picking one") matching = matching[0] # apply matching: set bond properties for b in refmol.GetBonds(): atom1 = matching[b.GetBeginAtomIdx()] atom2 = matching[b.GetEndAtomIdx()] b2 = mol2.GetBondBetweenAtoms(atom1, atom2) b2.SetBondType(b.GetBondType()) b2.SetIsAromatic(b.GetIsAromatic()) # apply matching: set atom properties for a in refmol.GetAtoms(): a2 = mol2.GetAtomWithIdx(matching[a.GetIdx()]) a2.SetHybridization(a.GetHybridization()) a2.SetIsAromatic(a.GetIsAromatic()) a2.SetNumExplicitHs(a.GetNumExplicitHs()) a2.SetFormalCharge(a.GetFormalCharge()) try: Chem.SanitizeMol(mol2) except: # pylint:disable=bare-except er_message = f"Could not sanitize {cml_file}\n" # smile = smiles_from_cml(cml_file) er_message += _at_num_er(refmol2, mol2) # smimol = Chem.MolFromSmiles() _write_error(er_message) return None if hasattr(mol2, "__sssAtoms"): # pylint:disable=protected-access mol2.__sssAtoms = None # pylint:disable=protected-access # we don't want all bonds highlighted else: er_message = ( f"No match between template smiles and connected geom for {cml_file}\n" ) er_message += _at_num_er(refmol2, mol2) _write_error(er_message) return None return mol2 def _at_num_er(refmol2, mol2): """check numbering of atoms""" er_message = "" smi_num = [] smi_lab = [] symb_list = [] num_list = [] for atom in mol2.GetAtoms(): if atom.GetSymbol() not in symb_list: symb_list.append(atom.GetSymbol()) num_list.append(1) else: num_list[symb_list.index(atom.GetSymbol())] += 1 for atom in refmol2.GetAtoms(): if atom.GetSymbol() not in smi_lab: smi_lab.append(atom.GetSymbol()) smi_num.append(1) else: smi_num[smi_lab.index(atom.GetSymbol())] += 1 for i, num in enumerate(num_list): j = smi_lab.index(symb_list[i]) if smi_num[j] != num: er_message += f"Expected {smi_num[j]} {smi_lab[j]} from SMILES, observed {num_list[i]} in xyz\n" return er_message
[docs] def data_from_cml(cml_file, bonds=False): """Gets symbols, xyz coords, bonds and charge of a mol from cml file Args: cml_file: .cml filename Returns: list with relevant data from cml file. Elements in order are: molecular geometry, atom types, list of bonds, list of elements, charge Note: This is designed to parse files specifically from the Retrievium database https://retrievium.ca the SMILEs extracted is labelled with tag retrievium:inputSMILES The geometry extracted is from the third atomArray block in the .cml file Example Usage: >>> utils.data_from_cml(cml_file) """ # pylint:disable=too-many-locals # pylint:disable=too-many-branches # pylint:disable=too-many-statements num_atom_array = 0 geom_list = [] n_atl = 0 n_bary = 0 type_list = [] idx_list = [] bond_list = [] el_list = [] with open(cml_file, encoding="utf-8") as file: for line in file: if "formalCharge" in line: split_line = line.split(" ") for word in split_line: if "formalCharge" in word: charge = int( word.replace("formalCharge=", "") .replace(">\n", "") .replace('"', "") ) continue if "atomArray" in line: num_atom_array += 1 if num_atom_array == 5: continue if num_atom_array == 5: quote_split = line.split('"') # maybe only do el_list with bonds? el_list.append(quote_split[3]) x_split = quote_split[5] y_split = quote_split[7] z_split = quote_split[9] geom_list.append( [ float(ast.literal_eval(x_split)), float(ast.literal_eval(y_split)), ast.literal_eval(z_split), ] ) elif num_atom_array == 6: if bonds: if "bondArray" in line: n_bary += 1 if n_bary in [1, 2]: continue elif n_bary == 1: split_line = line.split() at_1 = int(split_line[1].split('"')[1].replace("a", "")) at_2 = int(split_line[2].replace('"', "").replace("a", "")) b_ord = split_line[4].split('"')[1] bond_list.append((at_1, at_2, b_ord)) if "atomTypeList" in line: n_atl += 1 if n_atl == 1: continue if n_atl == 2: break elif n_atl == 1: split_line = line.split() idx_list.append( int( split_line[1] .split("=")[1] .replace('"', "") .replace("a", "") ) - 1 ) at_label = split_line[2].split("=")[1].replace('"', "") at_type = int(split_line[3].split("=")[1].replace('"', "")) at_valence = int( split_line[4].split("=")[1].split("/")[0].replace('"', "") ) type_list.append((at_label, at_type, at_valence)) temp_frame = pd.DataFrame(list(zip(idx_list, type_list)), columns=["idx", "type"]) temp_frame.sort_values(by="idx", inplace=True) if bonds: return [geom_list, list(temp_frame["type"]), bond_list, el_list, charge] return [geom_list, list(temp_frame["type"])]
[docs] def all_data_from_cml(data): """Gets symbols, xyz coords, bonds and charge of a mol from cml file Args: data: lines of a .cml file Returns: dictionary with relevant data from cml file. Keys included are 'geom', 'atom_types', 'bonds', 'labels', 'charge', 'multiplicity', 'smiles' Note: Not used in :attr:`group_decomposition.fragfunctions.identify_connected_fragments` This is used in an AiiDA workflow employing this package This is designed to parse files specifically from the Retrievium database https://retrievium.ca the SMILEs extracted is labelled with tag retrievium:inputSMILES The geometry extracted is from the third atomArray block in the .cml file Example Usage: >>> utils.all_data_from_cml(cml_file) """ # pylint:disable=too-many-locals # pylint:disable=too-many-branches # pylint:disable=too-many-statements num_atom_array = 0 geom_list = [] n_atl = 0 n_bary = 0 type_list = [] idx_list = [] bond_list = [] el_list = [] smi_flag = 0 for line in data: if "formalCharge" in line: split_line = line.split(" ") for word in split_line: if "spinMultiplicity" in word: multiplicity = int( word.replace("spinMultiplicity=", "").replace('"', "") ) if "formalCharge" in word: charge = int( word.replace("formalCharge=", "") .replace('"', "") .replace(">", "") .replace("\n", "") ) continue if "retrievium:inputSMILES" in line: smi_flag = 1 elif smi_flag == 1: smile = line.split(">")[1].split("<")[0] smi_flag = 2 if "atomArray" in line: num_atom_array += 1 if num_atom_array == 5: continue if num_atom_array == 5: quote_split = line.split('"') # maybe only do el_list with bonds? el_list.append(quote_split[3]) x_split = quote_split[5] y_split = quote_split[7] z_split = quote_split[9] geom_list.append( [ float(ast.literal_eval(x_split)), float(ast.literal_eval(y_split)), float(ast.literal_eval(z_split)), ] ) elif num_atom_array == 6: if "bondArray" in line: n_bary += 1 if n_bary in [1, 2]: continue elif n_bary == 1: split_line = line.split() at_1 = int(split_line[1].split('"')[1].replace("a", "")) at_2 = int(split_line[2].replace('"', "").replace("a", "")) b_ord = split_line[4].split('"')[1] bond_list.append((at_1, at_2, b_ord)) if "atomTypeList" in line: n_atl += 1 if n_atl == 1: continue if n_atl == 2: break elif n_atl == 1: split_line = line.split() idx_list.append( int(split_line[1].split("=")[1].replace('"', "").replace("a", "")) - 1 ) at_label = split_line[2].split("=")[1].replace('"', "") at_type = int(split_line[3].split("=")[1].replace('"', "")) at_valence = int( split_line[4].split("=")[1].split("/")[0].replace('"', "") ) type_list.append((at_label, at_type, at_valence)) temp_frame = pd.DataFrame(list(zip(idx_list, type_list)), columns=["idx", "type"]) temp_frame.sort_values(by="idx", inplace=True) return { "geom": geom_list, "atom_types": list(temp_frame["type"]), "bonds": bond_list, "labels": el_list, "charge": charge, "multiplicity": multiplicity, "smiles": smile, # pylint:disable=used-before-assignment }
[docs] def xyz_list_to_str(xyz_list): """Convert 2d array to string [[a,b,c],[d,e,f]] -> a, b, c\n d, e, f\n""" outstr = "" for row in xyz_list: for at in row: outstr += str(at) outstr += ", " outstr += "\n" return outstr
[docs] def list_to_str(lst): """Convert a list to string. [a,b,c] -> a b c""" out_str = "" for it in lst: out_str += str(it) + " " return out_str
[docs] def mol_from_molfile(mol_file): """takes mol_file and returns mol wth atom numbers the same Modified for mol file structure from retrievium from stackexchange https://mattermodeling.stackexchange.com/questions/7234/how-to-input-3d-coordinates-from-xyz-file-and-connectivity-from-smiles-in-rdkit Args: mol_file: name of .mol file Returns: mol with AtomMapNumbers labeled by :attr:`group_decomposition.utils.mol_with_atom_index` """ # pylint:disable=line-too-long m = Chem.MolFromMolFile(mol_file, removeHs=False) if not m: raise ValueError(f"""Problem creating molecule from {mol_file}""") # this assumes whatever program you use doesn't re-order atoms # .. which is usually a safe assumption # .. so we don't bother tracking atoms m = mol_with_atom_index(m) atomic_symbols = [] xyz_coordinates = [] ats_read = 0 num_atoms = m.GetNumAtoms() with open(mol_file, encoding="utf-8") as file: for line_number, line in enumerate(file): if ats_read < num_atoms and line_number > 3: ats_read += 1 x, y, z, atomic_symbol = line.split()[:4] atomic_symbols.append(atomic_symbol) xyz_coordinates.append([float(x), float(y), float(z)]) elif ats_read == num_atoms: break return {"Molecule": m, "xyz_pos": xyz_coordinates, "atomic_symbols": atomic_symbols}
[docs] def xyz_from_cml(cml_file): """Extract xyz coordinates from cml file Args: cml_file: cml file name Returns: list of length 3 lists containing a molecule's xyz coordinates Note: Must be used on .cml files from the Retrievium database https://retrievium.ca """ num_atom_array = 0 geom_list = [] with open(cml_file, encoding="utf-8") as file: for line in file: if "atomArray" in line: num_atom_array += 1 if num_atom_array == 5: continue if num_atom_array == 5: space_split = line.split() x_split = space_split[3].split("=") y_split = space_split[4].split("=") z_split = space_split[5].split("=") geom_list.append( [ float(ast.literal_eval(x_split[1])), float(ast.literal_eval(y_split[1])), float(ast.literal_eval(z_split[1])), ] ) elif num_atom_array == 6: break return geom_list
[docs] def get_canonical_molecule(smile: str): """Ensures that molecule numbering is consistent with creating molecule from canonical SMILES for consistency.""" mol = Chem.MolFromSmiles(smile) if mol: mol_smi = Chem.MolToSmiles(mol) # molsmi is canonical SMILES else: raise ValueError( f"""{smile} is not a valid SMILES code or rdkit cannot construct a molecule from it""" ) # create canonical molecule numbering from canonical SMILES return Chem.MolFromSmiles(mol_smi)
# def copy_molecule(molecule): # """create a copy of molecule object in new object(not pointer)""" # # see link https://sourceforge.net/p/rdkit/mailman/message/33652439/ # return Chem.Mol(molecule) def _clean_smile(trim_smi): """remove leftover junk from smiles when atom deleted.""" trim_smi = trim_smi.replace("[*H]", "*") trim_smi = trim_smi.replace("[*H3]", "*") trim_smi = trim_smi.replace("[*H2]", "*") trim_smi = trim_smi.replace("[*H+]", "*") trim_smi = trim_smi.replace("[*H3+]", "*") trim_smi = trim_smi.replace("[*H2+]", "*") trim_smi = trim_smi.replace("[*H-]", "*") trim_smi = trim_smi.replace("[*H3-]", "*") trim_smi = trim_smi.replace("[*H2-]", "*") return trim_smi