# Source code for gdsctools.drugs

"""

Small utilities to retrieve ChEMBL/ChemSpider identifiers
based on a drug name

"""
from easydev import Progress
from gdsctools import DrugDecode
import pandas as pd


# Public names exported by a star-import of this module.
__all__ = ["ChemSpiderSearch"]


class ChemSpiderSearch(object):
    """Look up ChemSpider and ChEMBL identifiers for the drugs of a decoder.

    .. warning:: this is a draft version in dev mode

    ::

        c = ChemSpiderSearch(drug_decode)
        c.search_in_chemspider()
        c.search_from_smile_inchembl()
        df = c.find_chembl_ids()

    It happens that most of public names can be found
    and almost none of non-public are found. As expected...

    If chemspider, chembl and pubchem are empty, search for the drug name in
    chemspider.

        CHEMSPIDER search:
            if no identifier found, the search is DROPPED
            if 1 identifier found, we keep going using the SMILE identifier
            If more than 1 identifier found, this is AMBIGUOUS.


    If chembl and pubchem, check with unichem
    If chembl, check smiles
    If chembl and chemspider, check smiles ?

    SMILES are not unique

    """
    # Exceptions raised when a per-drug result is absent or does not have the
    # expected container layout; such entries are reported as empty instead of
    # aborting the whole run.
    _LOOKUP_ERRORS = (KeyError, IndexError, TypeError, AttributeError)

    def __init__(self, drug_decode):
        """Load the drug decoder and create the web service clients.

        :param drug_decode: passed unchanged to :class:`DrugDecode`, once for
            the reference table (``dd``) and once for the table that the
            search methods fill in (``dd_filled``).
        """
        print("ChemSpiderSearch is still in progress, please do not use")
        self.dd = DrugDecode(drug_decode)
        self.dd_filled = DrugDecode(drug_decode)

        # Imported here so that the services are only required when the
        # class is actually instantiated.
        from bioservices.chemspider import ChemSpider
        from bioservices import ChEMBL
        from bioservices import UniChem

        # PubChem is optional: the attribute stays None when it cannot be
        # imported or created.
        self.pubchem = None
        try:
            print('Loading PubChem')
            from bioservices.pubchem import PubChem
            self.pubchem = PubChem()
        except Exception:
            # Pubchem was introduced only in dec 2015
            pass
        # Historical (misspelt) attribute name kept for existing callers.
        self.puchem = self.pubchem

        print('Loading ChEMBL service')
        self.chembl = ChEMBL(cache=True)

        print('Loading ChemSpider service')
        self.chemspider = ChemSpider(cache=True)

        print('Loading UniChem service')
        # in unichem db number is 22 and chembl is 1
        self.unichem = UniChem()

        print('Settings some data aliases')
        self._cs_find = self.chemspider.find
        self._cs_get = self.chemspider.GetExtendedCompoundInfo

        self.drug_ids = sorted(self.dd.df.index.values)
        self.drug_names = sorted(self.dd.df.DRUG_NAME.values)

    def _set_chemspider(self, drug_id, value):
        """Store *value* in the CHEMSPIDER cell of *drug_id* in ``dd_filled``.

        The column is converted to object dtype first because *value* may be
        a list of identifiers. The write goes through ``.at`` so that it
        reaches the DataFrame itself and not a temporary row copy.
        """
        frame = self.dd_filled.df
        if frame['CHEMSPIDER'].dtype != object:
            frame['CHEMSPIDER'] = frame['CHEMSPIDER'].astype(object)
        frame.at[drug_id, 'CHEMSPIDER'] = value

    def filling_chembl_pubchem_using_unichem(self):
        """Fill the CHEMSPIDER column of ``dd_filled`` from the drug names.

        First pass: for each drug that has none of the CHEMSPIDER, CHEMBL
        and PUBCHEM identifiers in ``dd``, search ChemSpider by drug name.
        A single hit is stored as is, several hits are stored as the list
        returned by the search, and no hit leaves the cell untouched.

        Second pass: for each drug that has exactly one of the three
        identifiers, search ChemSpider by drug name.

        .. note:: draft code, the result of the second pass is not used yet.
        """
        id_columns = ['CHEMSPIDER', 'CHEMBL', 'PUBCHEM']
        pb = Progress(len(self.drug_ids))

        for i, this in enumerate(self.drug_ids):
            entry = self.dd.df.loc[this]
            # From the database, when chembl is provided, it is unique
            # same for chemspider and pubchem and CAS
            if entry[id_columns].count() == 0:
                # no information is provided, get it from chemspider
                results = self._cs_find(entry.DRUG_NAME)
                if len(results) == 1:
                    self._set_chemspider(this, results[0])
                elif len(results) > 1:
                    # non unique: keep all candidates
                    self._set_chemspider(this, results)
            pb.animate(i + 1)

        # Search in chemspider systematically
        for i, this in enumerate(self.drug_ids):
            entry = self.dd.df.loc[this]
            # The selection is recomputed for each drug; reusing the value
            # left over from the first loop would test the last drug only.
            if entry[id_columns].count() == 1:
                # TODO: the search result is not exploited yet
                self._cs_find(entry.DRUG_NAME)
            pb.animate(i + 1)

    def find_chembl_ids(self):
        """Summarise the identifiers and SMILES collected for each drug.

        Reads ``results`` (set by :meth:`search_in_chemspider`) and
        ``results_chembl`` / ``results_chemspider`` (set by
        :meth:`search_from_smile_inchembl`). A drug for which the
        identifiers cannot be read is reported on stdout and gets empty
        strings, so that every drug yields exactly one row.

        :return: a DataFrame with one row per entry of ``drug_ids`` and the
            columns DRUG_NAME (the entry of ``drug_ids``), CHEMBL_ID and
            CHEMSPIDER_ID (comma-separated identifiers), SMILE_CHEMBL
            (comma-separated SMILES of the ChEMBL hits) and SMILE_CHEMSPIDER
            (SMILES of the stored ChemSpider entry).
        """
        # don't know how to search for a chembl id given the drug name...
        # so we use chemspider; but chemspider returns molecular information
        # (not chembl id) so given the smile string, we look back in chembl
        # for valid entries. Both searches must have been run beforehand.
        columns = ['DRUG_NAME', 'CHEMBL_ID', 'CHEMSPIDER_ID', 'SMILE_CHEMBL',
                   'SMILE_CHEMSPIDER']
        drugs = []
        chembl_ids = []
        chemspider_ids = []
        smiles_c = []
        smiles_cs = []

        for drug in self.drug_ids:
            # Both values are computed before anything is appended so the
            # columns cannot get out of step when a lookup fails.
            try:
                chembl = ",".join([x['chemblId']
                                   for x in self.results_chembl[drug]])
                chemspider = ",".join([str(x) for x in self.results[drug]])
            except self._LOOKUP_ERRORS:
                # str(): drug identifiers are not necessarily strings
                print('skipping' + str(drug))
                chembl, chemspider = '', ''
            drugs.append(drug)
            chembl_ids.append(chembl)
            chemspider_ids.append(chemspider)

            try:
                smiles_c.append(",".join([x['smiles'] for x in
                                          self.results_chembl[drug]]))
            except self._LOOKUP_ERRORS:
                smiles_c.append('')
            try:
                smiles_cs.append(self.results_chemspider[drug]['smiles'])
            except self._LOOKUP_ERRORS:
                smiles_cs.append('')

        # One list per column: build them as rows, then transpose.
        df = pd.DataFrame([drugs, chembl_ids, chemspider_ids, smiles_c,
                           smiles_cs], index=columns)
        return df.T

    def get_chemspider_ids(self, drug_name):
        """Return the result of the ChemSpider search for *drug_name*.

        :param drug_name: name handed to the ChemSpider ``find`` service.
        :return: whatever the ``find`` service returns for that name.
        """
        return self._cs_find(drug_name)

    def search_in_chemspider(self):
        """Search ChemSpider for the name of every drug in ``dd``.

        Fills the ``results`` attribute, a dictionary whose keys are the
        drug identifiers (index of ``dd.df``) and whose values are the
        search results. A failing search is reported on stdout and recorded
        as an empty list. The results are also stored, in index order, in
        the CHEMSPIDER_SEARCHED column of ``dd_filled.df``.
        """
        # SB52334 --> SB-52334
        index = self.dd.df.index
        pb = Progress(len(index))
        self.results = {}
        results = []
        for i, drug in enumerate(index):
            drug_name = self.dd.df.loc[drug].DRUG_NAME
            try:
                res = self._cs_find(drug_name)
            except Exception:
                # best effort: one failing lookup must not stop the scan
                print("This drug index (%s) / drug name (%s) was not found" %
                        (drug, drug_name))
                res = []
            self.results[drug] = res
            results.append(res)
            pb.animate(i + 1)
        self.dd_filled.df['CHEMSPIDER_SEARCHED'] = results

    def search_from_smile_inchembl(self):
        """Search ChEMBL with the SMILES of the ChemSpider hits.

        Requires ``results`` from :meth:`search_in_chemspider`. For every
        drug, each ChemSpider identifier is expanded with the extended
        compound info service and its ``smiles`` field is searched in
        ChEMBL.

        Fills ``results_chembl`` (drug identifier -> list of the
        ``compounds`` returned by ChEMBL over all the ChemSpider hits) and
        ``results_chemspider`` (drug identifier -> ChemSpider entry of the
        last hit processed for that drug).
        """
        pb = Progress(len(self.drug_ids))
        self.results_chembl = {}
        self.results_chemspider = {}

        for i, drug in enumerate(self.drug_ids):
            compounds = []
            self.results_chembl[drug] = compounds

            for chemspider_id in self.results[drug] or []:
                chemspider_entry = self._cs_get(chemspider_id)
                self.results_chemspider[drug] = chemspider_entry
                smile = chemspider_entry['smiles']
                # now search in chembl
                res_chembl = self.chembl.get_compounds_by_SMILES(smile)
                try:
                    compounds.extend(res_chembl['compounds'])
                except self._LOOKUP_ERRORS:
                    # the answer has no usable 'compounds' field: ignore it
                    pass

            pb.animate(i + 1)