Source code for group_decomposition.fg_query

"""
fg_query module

Functions and SMARTS strings for querying specific functional groups from the results table of
:attr:`group_decomposition.fragfunctions.identify_connected_fragments`,
:attr:`group_decomposition.fragfunctions.count_uniques`, or :attr:`group_decomposition.fragfunctions.count_groups_in_set`
"""
import pandas as pd  # lots of work with data frames
from rdkit import Chem  # pylint:disable=import-error

# Mapping of functional-group name -> SMARTS pattern used to query it.
# Insertion order is significant: count_fgs emits one row per entry in this order.
fg_dict = {
    # --- carbon-carbon unsaturation ---
    "vinylic carbon": "[$([CX3]=[CX3])]",
    "allenic carbon": "[$([CX2](=C)=C)]",
    "alkyne": "[$([CX2]#C)]",
    # --- carbonyl-containing groups ---
    "acyl halide": "[CX3](=[OX1])[F,Cl,Br,I]",
    "aldehyde": "[CX3H1](=O)[#6]",
    "anhydride": "[CX3](=[OX1])[OX2][CX3](=[OX1])",
    "amide": "[NX3][CX3](=[OX1])[#6]",
    "amidinium": "[NX3][CX3]=[NX3+]",
    "carbamic ester": "[NX3][CX3](=[OX1])[OX2H0]",
    "carbamic acid": "[NX3,NX4+][CX3](=[OX1])[OX2H,OX1-]",
    "carboxylate": "[CX3](=O)[O-]",
    "carbonic acid/acid-ester": "[CX3](=[OX1])([OX2])[OX2H,OX1H0-1]",
    "carbonic ester": "C[OX2][CX3](=[OX1])[OX2]C",
    "carboxylic acid": "[CX3](=O)[OX2H1]",
    "cyanamide": "[NX3][CX2]#[NX1]",
    "ester or anhydride": "[#6][CX3](=O)[OX2H0][#6]",
    "ketone": "[#6][CX3](=O)[#6]",
    "ether": "[OD2]([#6])[#6]",
    # --- nitrogen-containing groups ---
    "primary amine": "[NX3;H2;!$(NC=[!#6]);!$(NC#[!#6])][#6]",
    "enamine": "[NX3][CX3]=[CX3]",
    "secondary amine": "[NX3;H1;!$(NC=O)]",
    "azide": "[$(*-[NX2-]-[NX2+]#[NX1]),$(*-[NX2]=[NX2+]=[NX1-])]",
    "hydrazine": "[NX3][NX3]",
    "hydrazone": "[NX3][NX2]=[*]",
    "substituted imine": "[CX3;$([C]([#6])[#6]),$([CH][#6])]=[NX2][#6]",
    "iminium": "[NX3+]=[CX3]",
    "unsubstituted dicarboximide": "[CX3](=[OX1])[NX3H][CX3](=[OX1])",
    "substituted dicarboximide": "[CX3](=[OX1])[NX3H0]([#6])[CX3](=[OX1])",
    # 'nitrate': '[$([NX3](=[OX1])(=[OX1])O),$([NX3+]([OX1-])(=[OX1])O)]',
    "nitrile": "[NX1]#[CX2]",
    "isonitrile": "[CX1-]#[NX2+]",
    "nitro": "[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]",
    "nitroso": "[NX2]=[OX1]",
    # --- hydroxyl / peroxide ---
    "alcohol hydroxyl": "[OX2H][CX4]",
    "enol": "[OX2H][#6X3]=[#6]",
    "phenol": "[OX2H][cX3]:[c]",
    "peroxide": "[OX2,OX1-][OX2,OX1-]",
    # --- sulfur-containing groups ---
    # The oxidised-sulfur entries below each list two alternatives: the neutral
    # (S=O) representation and the charge-separated (S+ / O-) representation.
    "thioester": "S([#6])[CX3](=O)[#6]",
    "thiol": "[#16X2H]",
    "thioamide": "[NX3][CX3]=[SX1]",
    "thioketone": "[#6][CX3](=S)[#6]",
    "thioaldehyde": "[CX3H1](=S)[#6]",
    "monosulfide": "[#16X2H0][!#16]",
    "disulfide": "[#16X2H0][#16X2H0]",
    "sulfinic acid": "[$([#16X3](=[OX1])[OX2H,OX1H0-]),$([#16X3+]([OX1-])[OX2H,OX1H0-])]",
    "sulfonic acid": "[$([#16X4](=[OX1])(=[OX1])([#6])[OX2H,OX1H0-]),$([#16X4+2]([OX1-])([OX1-])([#6])[OX2H,OX1H0-])]",
    "sulfonamide": "[$([#16X4]([NX3])(=[OX1])(=[OX1])[#6]),$([#16X4+2]([NX3])([OX1-])([OX1-])[#6])]",
    "sulfone": "[$([#16X4](=[OX1])(=[OX1])([#6])[#6]),$([#16X4+2]([OX1-])([OX1-])([#6])[#6])]",
    "sulfoxide": "[$([#16X3](=[OX1])([#6])[#6]),$([#16X3+]([OX1-])([#6])[#6])]",
    "sulfate": "[$([#16X4](=[OX1])(=[OX1])([OX2H,OX1H0-])[OX2][#6]),$([#16X4+2]([OX1-])([OX1-])([OX2H,OX1H0-])[OX2][#6])]",
    # Second alternative is the charge-separated form; it previously duplicated
    # the neutral form, so charge-separated diesters were never matched.
    "sulfuric acid diester": "[$([#16X4](=[OX1])(=[OX1])([OX2][#6])[OX2][#6]),$([#16X4+2]([OX1-])([OX1-])([OX2][#6])[OX2][#6])]",  # pylint:disable=line-too-long
    # --- halogens ---
    "chlorine": "Cl",
    "fluorine": "F",
    "bromine": "Br",
}


def query_pattern(frag_frame: pd.DataFrame, patt: str) -> int:
    """Determine number of functional groups matching pattern in fragment table

    Args:
        frag_frame: fragment frame from output of
            :attr:`group_decomposition.fragfunctions.identify_connected_fragments`,
            :attr:`group_decomposition.fragfunctions.count_uniques`, or
            :attr:`group_decomposition.fragfunctions.count_groups_in_set`.
            Must contain a ``Molecule`` column of molecule objects that
            support ``GetSubstructMatches``; may contain a ``count`` column.
        patt: SMARTS string to match functional group. See
            :attr:`group_decomposition.fg_query.fg_dict` for SMARTS of common groups

    Returns:
        Number of matches of the functional group queried in the table

    Raises:
        ValueError: if ``patt`` cannot be parsed as a SMARTS string.

    Example Usage:
        >>> frag_frame
            Smiles    Molecule        count
            *C=CC=C   <mol object>    3
            *C=C      <mol object>    1
        >>> query_pattern(frag_frame, '[$([CX3]=[CX3])]')
        14

        The pattern is a single-atom query, so every vinylic carbon is one
        match: 4 in ``*C=CC=C`` and 2 in ``*C=C``, giving 4 * 3 + 2 * 1 = 14.

    Note:
        If the frame has a count of the number of times a fragment occurs
        (``count`` column), the number of matches in each fragment is
        multiplied by the number of times that fragment occurs.
    """
    patt_mol = Chem.MolFromSmarts(patt)
    if patt_mol is None:
        # MolFromSmarts signals a parse failure by returning None; fail here with
        # a clear message rather than deep inside GetSubstructMatches.
        raise ValueError(f"Could not parse SMARTS pattern: {patt!r}")

    matches_per_fragment = [
        len(mol.GetSubstructMatches(patt_mol)) for mol in frag_frame["Molecule"]
    ]
    if "count" in frag_frame.columns:
        # Weight each fragment's matches by how often the fragment occurs.
        return sum(
            occurrences * n_matches
            for occurrences, n_matches in zip(
                frag_frame["count"], matches_per_fragment
            )
        )
    return sum(matches_per_fragment)
def count_fgs(frag_frame: pd.DataFrame, patt_dict: dict) -> pd.DataFrame:
    """Given functional group dictionary with patterns to match, return counts of each in frag_frame

    Args:
        frag_frame: fragment frame from output of
            :attr:`group_decomposition.fragfunctions.identify_connected_fragments`,
            :attr:`group_decomposition.fragfunctions.count_uniques`, or
            :attr:`group_decomposition.fragfunctions.count_groups_in_set`
        patt_dict: has keys of functional group names and values being a
            SMARTS string to match

    Returns:
        Frame with one row per entry of ``patt_dict`` (in the dictionary's
        order): column ``group`` holds the functional group name and column
        ``count`` holds the result of ``query_pattern`` for its pattern.
        An empty ``patt_dict`` yields an empty frame with those two columns.

    Example Usage:
        >>> frag_frame
            Smiles    Molecule        count
            *C=CC=C   <mol object>    3
            *CC=O     <mol object>    1
        >>> patt_dict
            {
                'aldehyde': '[CX3H1](=O)[#6]',
                'vinylic carbon': '[$([CX3]=[CX3])]'
            }
        >>> count_fgs(frag_frame, patt_dict)
            group             count
            aldehyde          1
            vinylic carbon    12

    Note:
        If the frame has a count of the number of times a fragment occurs
        (``count`` column), the number of matches in each fragment is
        multiplied by the number of times that fragment occurs.
    """
    # Build both columns from the same key sequence so names and counts stay
    # aligned and have equal length (a nested list here would produce a
    # single-row "group" column and a length mismatch).
    group_names = list(patt_dict)
    group_counts = [query_pattern(frag_frame, patt_dict[name]) for name in group_names]
    return pd.DataFrame({"group": group_names, "count": group_counts})