"""
fg_query module
Functions and SMARTS strings for querying specific functional groups from the results table of
:attr:`group_decomposition.fragfunctions.identify_connected_fragments`,
:attr:`group_decomposition.fragfunctions.count_uniques`, or :attr:`group_decomposition.fragfunctions.count_groups_in_set`
"""
import pandas as pd # lots of work with data frames
from rdkit import Chem # pylint:disable=import-error
# Maps a human-readable functional group name to the SMARTS string used to
# match it. Values are passed as the ``patt`` argument of ``query_pattern``;
# the whole mapping can be passed as ``patt_dict`` to ``count_fgs``.
fg_dict = {
    "vinylic carbon": "[$([CX3]=[CX3])]",
    "allenic carbon": "[$([CX2](=C)=C)]",
    "alkyne": "[$([CX2]#C)]",
    "acyl halide": "[CX3](=[OX1])[F,Cl,Br,I]",
    "aldehyde": "[CX3H1](=O)[#6]",
    "anhydride": "[CX3](=[OX1])[OX2][CX3](=[OX1])",
    "amide": "[NX3][CX3](=[OX1])[#6]",
    "amidinium": "[NX3][CX3]=[NX3+]",
    "carbamic ester": "[NX3][CX3](=[OX1])[OX2H0]",
    "carbamic acid": "[NX3,NX4+][CX3](=[OX1])[OX2H,OX1-]",
    "carboxylate": "[CX3](=O)[O-]",
    "carbonic acid/acid-ester": "[CX3](=[OX1])([OX2])[OX2H,OX1H0-1]",
    "carbonic ester": "C[OX2][CX3](=[OX1])[OX2]C",
    "carboxylic acid": "[CX3](=O)[OX2H1]",
    "cyanamide": "[NX3][CX2]#[NX1]",
    "ester or anhydride": "[#6][CX3](=O)[OX2H0][#6]",
    "ketone": "[#6][CX3](=O)[#6]",
    "ether": "[OD2]([#6])[#6]",
    "primary amine": "[NX3;H2;!$(NC=[!#6]);!$(NC#[!#6])][#6]",
    "enamine": "[NX3][CX3]=[CX3]",
    "secondary amine": "[NX3;H1;!$(NC=O)]",
    "azide": "[$(*-[NX2-]-[NX2+]#[NX1]),$(*-[NX2]=[NX2+]=[NX1-])]",
    "hydrazine": "[NX3][NX3]",
    "hydrazone": "[NX3][NX2]=[*]",
    "substituted imine": "[CX3;$([C]([#6])[#6]),$([CH][#6])]=[NX2][#6]",
    "iminium": "[NX3+]=[CX3]",
    "unsubstituted dicarboximide": "[CX3](=[OX1])[NX3H][CX3](=[OX1])",
    "substituted dicarboximide": "[CX3](=[OX1])[NX3H0]([#6])[CX3](=[OX1])",
    # 'nitrate': '[$([NX3](=[OX1])(=[OX1])O),$([NX3+]([OX1-])(=[OX1])O)]',
    "nitrile": "[NX1]#[CX2]",
    "isonitrile": "[CX1-]#[NX2+]",
    "nitro": "[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]",
    "nitroso": "[NX2]=[OX1]",
    "alcohol hydroxyl": "[OX2H][CX4]",
    "enol": "[OX2H][#6X3]=[#6]",
    "phenol": "[OX2H][cX3]:[c]",
    "peroxide": "[OX2,OX1-][OX2,OX1-]",
    "thioester": "S([#6])[CX3](=O)[#6]",
    "thiol": "[#16X2H]",
    "thioamide": "[NX3][CX3]=[SX1]",
    "thioketone": "[#6][CX3](=S)[#6]",
    "thioaldehyde": "[CX3H1](=S)[#6]",
    "monosulfide": "[#16X2H0][!#16]",
    "disulfide": "[#16X2H0][#16X2H0]",
    # The oxidised-sulfur entries below each list two alternatives: the neutral
    # S=O form and the charge-separated S+/O- form of the same group.
    "sulfinic acid": "[$([#16X3](=[OX1])[OX2H,OX1H0-]),$([#16X3+]([OX1-])[OX2H,OX1H0-])]",
    "sulfonic acid": "[$([#16X4](=[OX1])(=[OX1])([#6])[OX2H,OX1H0-]),$([#16X4+2]([OX1-])([OX1-])([#6])[OX2H,OX1H0-])]",
    "sulfonamide": "[$([#16X4]([NX3])(=[OX1])(=[OX1])[#6]),$([#16X4+2]([NX3])([OX1-])([OX1-])[#6])]",
    "sulfone": "[$([#16X4](=[OX1])(=[OX1])([#6])[#6]),$([#16X4+2]([OX1-])([OX1-])([#6])[#6])]",
    "sulfoxide": "[$([#16X3](=[OX1])([#6])[#6]),$([#16X3+]([OX1-])([#6])[#6])]",
    "sulfate": "[$([#16X4](=[OX1])(=[OX1])([OX2H,OX1H0-])[OX2][#6]),$([#16X4+2]([OX1-])([OX1-])([OX2H,OX1H0-])[OX2][#6])]",
    "sulfuric acid diester": "[$([#16X4](=[OX1])(=[OX1])([OX2][#6])[OX2][#6]),$([#16X4+2]([OX1-])([OX1-])([OX2][#6])[OX2][#6])]",  # pylint:disable=line-too-long
    "chlorine": "Cl",
    "fluorine": "F",
    "bromine": "Br",
}
def query_pattern(frag_frame: pd.DataFrame, patt: str) -> int:
    """Count the matches of a SMARTS pattern over every fragment in a fragment table.

    Each molecule in the ``Molecule`` column is searched for ``patt``. If the frame
    also has a ``count`` column (the number of times each fragment occurs), the
    number of matches found in a fragment is multiplied by that row's count
    before the per-fragment totals are summed.

    Args:
        frag_frame: fragment frame from output of
            :attr:`group_decomposition.fragfunctions.identify_connected_fragments`,
            :attr:`group_decomposition.fragfunctions.count_uniques`,
            or :attr:`group_decomposition.fragfunctions.count_groups_in_set`.
            Must contain a ``Molecule`` column; the ``count`` column is optional.
        patt: SMARTS string to match functional group. See
            :attr:`group_decomposition.fg_query.fg_dict` for SMARTS of common groups

    Returns:
        Number of matches of the functional group queried in the table
        (0 for a frame with no rows).

    Raises:
        ValueError: if ``patt`` cannot be parsed as a SMARTS string.

    Example Usage:
        >>> frag_frame
        Smiles      Molecule        count
        *C=CC=C     <mol object>    3
        *C=C        <mol object>    1
        >>> query_pattern(frag_frame, '[CX3]=[CX3]')
        7

        ``*C=CC=C`` holds two C=C matches and occurs 3 times, ``*C=C`` holds one
        match and occurs once: 2 * 3 + 1 * 1 = 7.
    """
    patt_mol = Chem.MolFromSmarts(patt)
    if patt_mol is None:
        # RDKit reports an unparsable SMARTS by returning None; fail here with
        # a clear message instead of an opaque error from the match call below.
        raise ValueError(f"Could not parse SMARTS pattern: {patt!r}")

    molecules = frag_frame["Molecule"]
    if "count" in frag_frame.columns:
        # Weight each fragment's matches by how often that fragment occurs.
        return sum(
            count * len(mol.GetSubstructMatches(patt_mol))
            for mol, count in zip(molecules, frag_frame["count"])
        )
    return sum(len(mol.GetSubstructMatches(patt_mol)) for mol in molecules)
def count_fgs(frag_frame: pd.DataFrame, patt_dict: dict) -> pd.DataFrame:
    """Given functional group dictionary with patterns to match, return counts of each in frag_frame.

    Each SMARTS value in ``patt_dict`` is counted with ``query_pattern``, so when
    ``frag_frame`` has a ``count`` column the matches found in a fragment are
    multiplied by the number of times that fragment occurs.

    Args:
        frag_frame: fragment frame from output of
            :attr:`group_decomposition.fragfunctions.identify_connected_fragments`,
            :attr:`group_decomposition.fragfunctions.count_uniques`,
            or :attr:`group_decomposition.fragfunctions.count_groups_in_set`
        patt_dict: has keys of functional group names and values being a SMARTS
            string to match

    Returns:
        Frame with one row per entry of ``patt_dict`` (in the dictionary's
        iteration order): column ``group`` holds the functional group name and
        column ``count`` the number of matches. An empty ``patt_dict`` gives an
        empty frame with those two columns.

    Example Usage:
        >>> frag_frame
        Smiles      Molecule        count
        *C=CC=C     <mol object>    3
        *CC=O       <mol object>    1
        >>> patt_dict
        {
            'aldehyde': '[CX3H1](=O)[#6]',
            'alkene': '[CX3]=[CX3]'
        }
        >>> count_fgs(frag_frame, patt_dict)
        group       count
        aldehyde    1
        alkene      6
    """
    return pd.DataFrame(
        {
            # One name per row; must be a flat list so it lines up with "count".
            "group": list(patt_dict),
            "count": [
                query_pattern(frag_frame, smarts) for smarts in patt_dict.values()
            ],
        }
    )