# Source code for gdsctools.cosmictools


# Fixing compatibility between Python 2 and 3 related to the merging of urllib
# and urllib2 in Python 3
try:     #python 3
    from urllib.request import urlopen
except:
    from urllib2  import urlopen


import pandas as pd
import easydev


# Names exported by ``from gdsctools.cosmictools import *``.
__all__ = ['COSMICFetcher', 'COSMICInfo']


class COSMICFetcher(object):
    """Utility to download a flat file about cosmic identifiers and build a
    small dataframe with cosmic identifiers and their diseases

    The original flat file is downloaded from ftp.expasy.org/databases and
    contains records as follows::

        ID   Identifier (cell line name)     Once; starts an entry
        AC   Accession (CVCL_xxxx)           Once
        SY   Synonyms                        Optional; once
        DR   Cross-references                Optional; once or more
        RX   References identifiers          Optional: once or more
        WW   Web pages                       Optional; once or more
        CC   Comments                        Optional; once or more
        DI   Diseases                        Optional; once or more
        OX   Species of origin               Once or more
        HI   Hierarchy                       Optional; once or more
        OI   Originate from same individual  Optional; once or more
        SX   Sex (gender) of cell            Optional; once
        CA   Category                        Optional; once

    We keep only records with COSMIC cross references. From those records,
    we keep ID, AC, CA, DI (Disease) and the cosmic identifier.

    The resulting dataframe can then be accessed in the :attr:`df` attribute.

    ::

        >>> from gdsctools.cosmictools import COSMICFetcher
        >>> cf = COSMICFetcher() # this may take a while to download the file
        >>> cf.df.loc[0]
        ID                                       OS-A
        AC                                  CVCL_0C23
        CA                           Cancer cell line
        COSMIC_ID                             2239090
        Disease      C4917; Small cell lung carcinoma
        Name: 0, dtype: object

    """
    def __init__(self, filename=None):
        """.. rubric:: Constructor

        :param str filename: If not provided, download the file from
            expasy.org. Otherwise, read the given local file, whose format
            should be the same as the file downloaded from expasy. In both
            cases the text is held in ``_data`` and parsed immediately; the
            result is available in :attr:`df`.
        """
        if filename is not None:
            # Context manager guarantees the handle is closed even if the
            # read fails.
            with open(filename, 'r') as fh:
                self._data = fh.read()
        else:
            self.url = ('ftp://ftp.expasy.org/databases/cellosaurus/'
                        'cellosaurus.txt')
            print('Downloading data. This may take a while')
            print('Consider saving the *data* attribute in a file ' +
                  'for next time')
            response = urlopen(self.url)
            try:
                payload = response.read()
            finally:
                response.close()
            # On Python 3 the payload is bytes; the parser below works on
            # text, so decode before scanning.
            self._data = self._to_text(payload)
        self._scandata()

    @staticmethod
    def _to_text(data):
        """Return *data* as text, decoding it as UTF-8 if it is bytes.

        Undecodable bytes are replaced rather than raising, so that one odd
        character does not abort the parsing of the whole file.
        """
        if isinstance(data, bytes):
            return data.decode('utf-8', 'replace')
        return data

    def _scandata(self):
        """Split the raw text into records and keep those mentioning Cosmic.

        Replaces ``_data`` (text) by a list of record strings, then calls
        :meth:`_data2records`.
        """
        print('Parsing the data')
        # Everything before the first "\nID " is the file header: skip it.
        self._data = self._data.split("\nID ")[1:]
        print(len(self._data))
        self._data = [this for this in self._data if 'Cosmic' in this]
        print('Dropping records with no COSMIC cross references:')
        print('Kept %s records' % len(self._data))
        self._data2records()

    def _data2records(self):
        """Parse the records stored in ``_data`` and populate :attr:`df`.

        Sets ``_records`` (identifier -> record text), ``_records_list``
        (one row per COSMIC cross reference) and :attr:`df`.
        """
        print("Creating records")
        self._records = {}
        for this in self._data:
            # First line is the identifier, the remainder is the content.
            record = this.split("\n", 1)
            identifier = record[0].strip()
            content = record[1].strip()
            self._records[identifier] = content

        # we want to store, AC, ID, DR if Cosmic or GDSC, OX (organism)
        # DI disease
        print("Scanning records")
        records = []
        pb = easydev.Progress(len(self._records))
        for count, (ID, this) in enumerate(self._records.items(), 1):
            pb.animate(count)
            records.extend(self._parse_record(ID, this))

        self._records_list = records
        self.df = self._build_dataframe(records)

    def _parse_record(self, ID, this):
        """Return one row per COSMIC cross reference found in a record.

        :param str ID: cell line identifier of the record
        :param str this: text of the record (without its ID line)
        :return: list of ``[ID, AC, OX, CA, COSMIC_ID, Disease]`` lists
        :raises IndexError: if the record has no AC, OX or CA line
        :raises ValueError: if a Cosmic cross reference is not an integer
        """
        # those are to be found only once
        AC = self._scan_record_for(this, 'AC')[0]  # should have only one
        OX = self._scan_record_for(this, 'OX')[0]  # should have only one
        CA = self._scan_record_for(this, 'CA')[0]

        # Keep the text following the first "!" (the species name in the
        # class example); leave OX untouched when there is no "!".
        species = OX.split("!")
        if len(species) > 1:
            OX = species[1].strip()

        # get DI. Most of the time there is only one but could have 2
        # sometimes. It does not depend on the DR line, so compute it once.
        DI = "__".join(self._scan_record_for(this, 'DI'))
        DI = DI.replace("NCIt;", "")
        DI = DI.strip()

        rows = []
        for line in this.split("\n"):
            if not line.startswith('DR'):
                continue
            dummy, content = line.split(" ", 1)
            if 'Cosmic' in content:
                content = content.replace('Cosmic;', '').strip()
                content = content.replace('Cosmic-CLP;', '').strip()
                content = content.replace('CC;', '').strip()
                rows.append([ID, AC, OX, CA, int(content), DI])
        return rows

    @staticmethod
    def _build_dataframe(records):
        """Build the final dataframe from the rows of :meth:`_parse_record`.

        Keeps only the Homo sapiens rows, drops the OX column, removes
        duplicated rows and renumbers the index from 0.
        """
        df = pd.DataFrame(records, columns=['ID', 'AC', 'OX', 'CA',
                                            'COSMIC_ID', 'Disease'])
        # keep only homo sapiens (drop mus musculus)
        df = df[df.OX == 'Homo sapiens']
        df = df.drop('OX', axis=1)
        df = df.drop_duplicates()
        return df.reset_index(drop=True)

    def _scan_record_for(self, record, key):
        """Return the stripped content of every line starting with *key*.

        :param str record: text of a record
        :param str key: line prefix to look for (e.g. 'AC', 'DI')
        :return: list of strings, one per matching line (may be empty)
        """
        lines = [line for line in record.split("\n") if line.startswith(key)]
        content = [this.split(" ", 1)[1].strip() for this in lines]
        return content
class COSMICInfo(object):
    """Retrieve information about cell line included in GDSC1000

    This file reads a GDSCTools dataset
    :attr:`gdsctools.datasets.cosmic_info`. Its content is stored in
    :attr:`df`. It corresponds to Table S1E (List cell line samples with
    data availability and annotations across the different omics)

    The method :meth:`get` retrieves information contained in the dataframe
    :attr:`df`. Provide a known cosmic identifier as follows:

    .. doctest::

        >>> from gdsctools import COSMICInfo
        >>> c = COSMICInfo()
        >>> c.get(909907, 'SAMPLE_NAME')
        'ZR-75-30'

    or get all available field as follows::

        >>> c.get(909907)
        SAMPLE_NAME           ZR-75-30
        SEQ                          1
        CNA                          1
        EXP                          1
        MET                          1
        DRUG_SCR                     1
        GDSC_description_1      breast
        GDSC_description_2      breast
        Study_Abbreviation        BRCA
        MMR                      MSI-L
        SCREEN_MEDIUM                R
        GROWTH_PROPERTIES     Adherent
        Name: 909907, dtype: object

    .. note:: there are only 1000 cell lines in the :attr:`df`.
        Additional cell lines may be retrieved using :class:`COSMICFetcher`

    If a cosmic identifier is not found, the returned object has the same
    structure as above but with all fields set to None.

    .. seealso:: http://www.cancerrxgene.org/translation/CellLine
    """
    def __init__(self):
        """.. rubric:: constructor"""
        from gdsctools.datasets import cosmic_info
        #: dataframe with all information, indexed by COSMIC_ID
        self.df = pd.read_csv(cosmic_info.filename, sep=',')
        self.df.set_index('COSMIC_ID', inplace=True)

    def get(self, identifier, colname=None):
        """Return the information available for a cosmic identifier

        :param int identifier: a cosmic identifier (a string is converted
            to an integer). Possible values are stored in the
            :attr:`df.index` attribute
        :param colname: specific field.
        :return: if colname is not provided, returns a pandas Series for
            the **identifier** with all available fields. Otherwise,
            returns a specific field. For an unknown identifier, every
            field is None.
        :raises ValueError: if *identifier* is a string that is not an
            integer
        """
        if isinstance(identifier, str):
            identifier = int(identifier)

        if identifier not in self.df.index:
            # Placeholder sized on the actual columns so that it works
            # whatever the number of fields in the dataset.
            columns = self.df.columns
            ts = pd.Series([None] * len(columns), index=columns,
                           name=identifier)
        else:
            ts = self.df.loc[identifier]

        if colname is None:
            return ts.copy()  # to be safe since user may change it
        return ts[colname]

    def _get_url(self, cosmic_id):
        """Return the URL of the COSMIC overview page for *cosmic_id*"""
        url = 'http://cancer.sanger.ac.uk/cell_lines/sample/overview'
        url = url + "?id={0}#overview".format(cosmic_id)
        return url

    def on_web(self, identifier):
        """Open a tab related to the COSMIC identifier (in your browser)"""
        from easydev import onweb
        url = self._get_url(identifier)
        onweb(url)