"""
Small utilities to retrieve ChEMBL/ChemSpider identifiers
based on a drug name
"""
from easydev import Progress
from gdsctools import DrugDecode
import pandas as pd
__all__ = ["ChemSpiderSearch"]
class ChemSpiderSearch(object):
    """Identify drugs by name using ChemSpider and ChEMBL.

    .. warning:: this is a draft version in dev mode

    ::

        c = ChemSpiderSearch(drug_decode)
        c.search_in_chemspider()
        c.search_from_smile_inchembl()
        df = c.find_chembl_ids()

    It happens that most of public names can be found
    and almost none of non-public are found. As expected...

    If chemspider, chembl and pubchem are empty, search for the drug name in
    chemspider.

    CHEMSPIDER search:

    - if no identifier found, the search is DROPPED
    - if 1 identifier found, we keep going using the SMILE identifier
    - if more than 1 identifier found, this is AMBIGUOUS.

    If chembl and pubchem, check with unichem
    If chembl, check smiles
    If chembl and chemspider, check smiles ?

    SMILES are not unique
    """

    # Identifier columns inspected to decide whether a drug still needs to
    # be searched by name.
    _ID_COLUMNS = ['CHEMSPIDER', 'CHEMBL', 'PUBCHEM']

    def __init__(self, drug_decode):
        """Load the drug table and instantiate the web services.

        :param drug_decode: passed unchanged to :class:`DrugDecode`, once
            for the reference table (``self.dd``) and once for the table
            that the search methods fill in (``self.dd_filled``).
        """
        print("ChemSpiderSearch is still in progress, please do not use")
        self.dd = DrugDecode(drug_decode)
        self.dd_filled = DrugDecode(drug_decode)

        from bioservices.chemspider import ChemSpider
        from bioservices import ChEMBL
        from bioservices import UniChem

        print('Loading PubChem')
        try:
            from bioservices.pubchem import PubChem
        except ImportError:
            # Pubchem was introduced only in dec 2015
            self.puchem = None
        else:
            self.puchem = PubChem()
        # 'puchem' is the historical (misspelled) attribute name; it is kept
        # for backward compatibility and exposed under the correct name too.
        self.pubchem = self.puchem

        print('Loading ChEMBL service')
        self.chembl = ChEMBL(cache=True)

        print('Loading ChemSpider service')
        self.chemspider = ChemSpider(cache=True)

        print('Loading UniChem service')
        # in unichem db number is 22 and chembl is 1
        self.unichem = UniChem()

        print('Settings some data aliases')
        self._cs_find = self.chemspider.find
        self._cs_get = self.chemspider.GetExtendedCompoundInfo

        self.drug_ids = sorted(self.dd.df.index.values)
        self.drug_names = sorted(self.dd.df.DRUG_NAME.values)

    def _store_chemspider(self, drug_id, value):
        """Write *value* in the CHEMSPIDER cell of *drug_id* in dd_filled."""
        df = self.dd_filled.df
        # A cell may receive a single identifier or a list of identifiers,
        # which requires an object column.
        if df['CHEMSPIDER'].dtype != object:
            df['CHEMSPIDER'] = df['CHEMSPIDER'].astype(object)
        # Single-step indexing: the former ``df.loc[i].loc[col] = value``
        # assigned to a temporary copy and never reached the dataframe.
        df.at[drug_id, 'CHEMSPIDER'] = value

    def filling_chembl_pubchem_using_unichem(self):
        """Fill missing CHEMSPIDER identifiers by searching on the drug name.

        For each drug with none of the CHEMSPIDER, CHEMBL and PUBCHEM
        identifiers set in ``self.dd``, ChemSpider is searched using the drug
        name. A unique hit is stored in ``self.dd_filled``; several hits
        (ambiguous) are stored as a list; nothing is stored when there is no
        hit.

        A second pass queries ChemSpider for the drugs that have exactly one
        of the three identifiers set.
        """
        pb = Progress(len(self.drug_ids))

        for i, this in enumerate(self.drug_ids):
            entry = self.dd.df.loc[this]
            # if no information is provided, we will need to get it
            # from chemspider

            # From the database, when chembl is provided, it is unique
            # same for chemspider and pubchem and CAS
            if entry[self._ID_COLUMNS].count() == 0:
                results = self._cs_find(entry.DRUG_NAME)
                if len(results) == 1:
                    self._store_chemspider(this, results[0])
                elif len(results) > 1:
                    # non unique: keep all candidates
                    self._store_chemspider(this, results)
            pb.animate(i + 1)

        # Search in chemspider systematically
        for i, this in enumerate(self.drug_ids):
            entry = self.dd.df.loc[this]
            # The selection is computed for the current drug (it used to be
            # a stale value left over from the previous loop).
            if entry[self._ID_COLUMNS].count() == 1:
                # NOTE(review): the result has never been used; the call is
                # kept as is (presumably it populates the service cache --
                # TODO confirm).
                self._cs_find(entry.DRUG_NAME)
            pb.animate(i + 1)

    @staticmethod
    def _join_field(entries, key):
        """Return the comma-joined *key* values of *entries* ('' on failure)."""
        try:
            return ",".join(entry[key] for entry in entries)
        except (KeyError, TypeError):
            return ""

    def find_chembl_ids(self):
        """Summarise the ChEMBL / ChemSpider search results in a dataframe.

        Uses what :meth:`search_in_chemspider` (``self.results``) and
        :meth:`search_from_smile_inchembl` (``self.results_chembl``,
        ``self.results_chemspider``) have stored. Attributes that are not set
        yet are treated as empty.

        :return: a dataframe with one row per entry of ``self.drug_ids`` and
            the columns DRUG_NAME (the drug identifier), CHEMBL_ID,
            CHEMSPIDER_ID, SMILE_CHEMBL and SMILE_CHEMSPIDER. Multiple values
            are comma-separated; missing values are empty strings.
        """
        # don't know how to search for a chembl id given the drug name...
        # so we use chemspider, which returns molecular information (not a
        # chembl id); given the smile string, we look back in chembl for
        # valid entries and finally report the chembl identifiers here.
        results_chembl = getattr(self, "results_chembl", {})
        results = getattr(self, "results", {})
        results_chemspider = getattr(self, "results_chemspider", {})

        columns = ['DRUG_NAME', 'CHEMBL_ID', 'CHEMSPIDER_ID', 'SMILE_CHEMBL',
                   'SMILE_CHEMSPIDER']
        rows = []
        for drug in self.drug_ids:
            if drug not in results_chembl or drug not in results:
                # %-formatting: drug identifiers may not be strings
                print("skipping %s" % drug)

            entries = results_chembl.get(drug) or []
            try:
                smile_cs = results_chemspider[drug]['smiles']
            except (KeyError, TypeError):
                smile_cs = ''

            # One row per drug, always, so that columns stay aligned.
            rows.append({
                'DRUG_NAME': drug,
                'CHEMBL_ID': self._join_field(entries, 'chemblId'),
                'CHEMSPIDER_ID': ",".join(
                    str(x) for x in (results.get(drug) or [])),
                'SMILE_CHEMBL': self._join_field(entries, 'smiles'),
                'SMILE_CHEMSPIDER': smile_cs,
            })
        return pd.DataFrame(rows, columns=columns)

    def get_chemspider_ids(self, drug_name):
        """Return what the ChemSpider search gives for *drug_name*."""
        return self._cs_find(drug_name)

    def search_in_chemspider(self):
        """Search every drug name in ChemSpider.

        Fills ``self.results``, a dictionary whose keys are the drug
        identifiers and whose values are the ChemSpider search results (an
        empty list when the search failed). The same results are stored in
        the CHEMSPIDER_SEARCHED column of ``self.dd_filled``.
        """
        # SB52334 --> SB-52334
        drug_index = self.dd.df.index
        # Size the progress bar on what is actually iterated.
        pb = Progress(len(drug_index))
        self.results = {}
        results = []
        for i, drug in enumerate(drug_index):
            drug_name = self.dd.df.loc[drug].DRUG_NAME
            try:
                res = self._cs_find(drug_name)
            except Exception:
                # Best effort: a failing query must not stop the whole scan.
                print("This drug index (%s) / drug name (%s) was not found" %
                      (drug, drug_name))
                res = []
            self.results[drug] = res
            results.append(res)
            pb.animate(i + 1)
        self.dd_filled.df['CHEMSPIDER_SEARCHED'] = results

    def search_from_smile_inchembl(self):
        """Search ChEMBL using the SMILES of the ChemSpider hits.

        Requires ``self.results`` (see :meth:`search_in_chemspider`). Fills:

        - ``self.results_chembl``: drug identifier -> list of the compounds
          returned by ChEMBL for all the ChemSpider hits of that drug;
        - ``self.results_chemspider``: drug identifier -> ChemSpider extended
          information of the *last* hit processed for that drug.
        """
        pb = Progress(len(self.drug_ids))
        self.results_chembl = {}
        self.results_chemspider = {}

        for i, drug in enumerate(self.drug_ids):
            matches = []
            self.results_chembl[drug] = matches

            for chemspider_id in self.results[drug] or []:
                chemspider_entry = self._cs_get(chemspider_id)
                self.results_chemspider[drug] = chemspider_entry
                smile = chemspider_entry['smiles']
                # now search in chembl
                res_chembl = self.chembl.get_compounds_by_SMILES(smile)
                try:
                    matches.extend(res_chembl['compounds'])
                except (KeyError, TypeError, IndexError):
                    # No usable 'compounds' field in the answer (presumably
                    # a failed query -- TODO confirm): ignore this hit.
                    pass
            pb.animate(i + 1)