Source code for gdsctools.cosmictools


# Python 2/3 compatibility: urllib and urllib2 were merged into
# urllib.request in Python 3
try:     # Python 3
    from urllib.request import urlopen
except ImportError:     # Python 2: urlopen is provided by urllib2
    from urllib2 import urlopen


import pandas as pd
import easydev


__all__ = ['COSMICFetcher', 'COSMICInfo']


class COSMICFetcher(object):
    """Utility to download a flat file about cosmic identifiers and build a
    small dataframe with cosmic identifiers and their diseases

    The original flat file is downloaded from ftp.expasy.org/databases and
    contains records as follows::

        ID         Identifier (cell line name)     Once; starts an entry
        AC         Accession (CVCL_xxxx)           Once
        SY         Synonyms                        Optional; once
        DR         Cross-references                Optional; once or more
        RX         References identifiers          Optional: once or more
        WW         Web pages                       Optional; once or more
        CC         Comments                        Optional; once or more
        DI         Diseases                        Optional; once or more
        OX         Species of origin               Once or more
        HI         Hierarchy                       Optional; once or more
        OI         Originate from same individual  Optional; once or more
        SX         Sex (gender) of cell            Optional; once
        CA         Category                        Optional; once

    We keep only records with COSMIC cross references.
    From those records, we keep ID, AC, CA, DI (Disease) and the cosmic
    identifier.

    The resulting dataframe can then be accessed in the :attr:`df` attribute.

    ::

        >>> from gdsctools.cosmictools import COSMICFetcher
        >>> cf = COSMICFetcher() # this may take a while to download the file
        >>> cf.df.loc[0]
        ID                                       OS-A
        AC                                  CVCL_0C23
        CA                           Cancer cell line
        COSMIC_ID                             2239090
        Disease      C4917; Small cell lung carcinoma
        Name: 0, dtype: object

    """
    def __init__(self, filename=None):
        """.. rubric:: Constructor

        :param str filename: If not provided, download the file
            from expasy.org. Otherwise, if filename is provided, reads a
            local file. Format should be the same as the file downloaded
            from expasy.

        The raw text is first held in ``_data``; parsing then replaces it
        with the list of kept records and populates :attr:`df`.

        """
        if filename is not None:
            # context manager guarantees the handle is closed even on error
            with open(filename, 'r') as fh:
                self._data = fh.read()
        else:
            url = 'ftp://ftp.expasy.org/databases/cellosaurus/cellosaurus.txt'
            self.url = url
            print('Downloading data. This may take a while')
            print('Consider saving the *data* attribute in a file ' +
                'for next time')
            response = urlopen(self.url)
            try:
                # read() returns bytes on Python 3, whereas the parser
                # works on text
                self._data = self._to_text(response.read())
            finally:
                response.close()
        self._scandata()

    @staticmethod
    def _to_text(raw):
        """Return *raw* as text, decoding it when it is a byte string.

        On Python 2 ``bytes`` is ``str``, so the input is returned untouched.
        """
        if isinstance(raw, bytes) and not isinstance(raw, str):
            # NOTE(review): assumes the flat file is UTF-8 -- confirm.
            # Undecodable bytes are replaced instead of aborting the parsing
            # after a long download.
            return raw.decode('utf-8', 'replace')
        return raw

    def _scandata(self):
        """Split the raw text into records and keep those mentioning Cosmic"""
        print('Parsing the data')
        self._data = self._data.split("\nID   ")[1:] # skip header
        print(len(self._data))
        self._data = [this for this in self._data if 'Cosmic' in this]
        print('Dropping records with no COSMIC cross references:')
        print('Kept %s records' % len(self._data))
        self._data2records()

    def _data2records(self):
        """Index the kept records by identifier and build :attr:`df`"""
        print("Creating records")
        self._records = {}
        for this in self._data:
            # first line is the identifier, the remainder is the content
            identifier, _, content = this.partition("\n")
            self._records[identifier.strip()] = content.strip()

        # we want to store, AC, ID, DR if Cosmic or GDSC, OX (organism)
        # DI disease

        print("Scanning records")
        records = []
        pb = easydev.Progress(len(self._records))
        for count, (ID, this) in enumerate(self._records.items(), 1):
            pb.animate(count)
            records.extend(self._parse_record(ID, this))

        self._records_list = records
        self.df = self._build_dataframe(records)

    def _parse_record(self, identifier, record):
        """Return one row per Cosmic cross-reference found in *record*

        :param str identifier: the ID of the record
        :param str record: content of the record (all lines after the ID)
        :return: list of ``[ID, AC, OX, CA, COSMIC_ID, Disease]`` rows
        """
        # those are to be found only once
        AC = self._scan_record_for(record, 'AC')[0] # should have only one
        OX = self._scan_record_for(record, 'OX')[0] # should have only one
        # CA is optional in the flat file: use None when it is absent
        categories = self._scan_record_for(record, 'CA')
        CA = categories[0] if categories else None

        # keep only the text following the "!" when there is one
        if "!" in OX:
            OX = OX.split("!")[1].strip()

        # get DI. Most of the time there is only one but could have 2
        # sometimes. Same for all rows of the record, so computed once.
        DI = "__".join(self._scan_record_for(record, 'DI'))
        DI = DI.replace("NCIt;", "").strip()

        rows = []
        for line in record.split("\n"):
            if not line.startswith('DR'):
                continue
            content = line.split(" ", 1)[1]
            if 'Cosmic' not in content:
                continue
            # strip the database tags to leave the numeric identifier
            content = content.replace('Cosmic;', '').strip()
            content = content.replace('Cosmic-CLP;', '').strip()
            content = content.replace('CC;', '').strip()
            rows.append([identifier, AC, OX, CA, int(content), DI])
        return rows

    def _build_dataframe(self, records):
        """Build the final dataframe from the rows of all records

        :param list records: rows as returned by :meth:`_parse_record`
        :return: dataframe with columns ID, AC, CA, COSMIC_ID and Disease
            restricted to Homo sapiens, without duplicated rows
        """
        df = pd.DataFrame(records, columns=['ID', 'AC', 'OX', 'CA',
                'COSMIC_ID', 'Disease'])

        # keep only homo sapiens (drop mus musculus); copy so that the
        # in-place operations below act on an independent dataframe
        df = df[df.OX == 'Homo sapiens'].copy()
        del df['OX']
        df.drop_duplicates(inplace=True)
        df.reset_index(drop=True, inplace=True)
        return df

    def _scan_record_for(self, record, key):
        """Return the stripped content of each line of *record* that starts
        with *key* (empty list if there is none)"""
        lines = [line for line in record.split("\n") if line.startswith(key)]
        # partition tolerates a line made of the key only (empty content)
        return [this.partition(" ")[2].strip() for this in lines]


class COSMICInfo(object):
    """Retrieve information about cell lines included in GDSC1000

    This class reads a GDSCTools dataset
    :attr:`gdsctools.datasets.cosmic_info`.
    Its content is stored in :attr:`df`.

    It corresponds to Table S1E (List cell line samples with data
    availability and annotations across the different omics).

    The method :meth:`get` retrieves information
    contained in the dataframe :attr:`df`. Provide a known cosmic identifier
    as follows:

    .. doctest::

        >>> from gdsctools import COSMICInfo
        >>> c = COSMICInfo()
        >>> c.get(909907, 'SAMPLE_NAME')
        'ZR-75-30'

    or get all available fields as follows::

        >>> c.get(909907)
        SAMPLE_NAME           ZR-75-30
        SEQ                          1
        CNA                          1
        EXP                          1
        MET                          1
        DRUG_SCR                     1
        GDSC_description_1      breast
        GDSC_description_2      breast
        Study_Abbreviation        BRCA
        MMR                      MSI-L
        SCREEN_MEDIUM                R
        GROWTH_PROPERTIES     Adherent
        Name: 909907, dtype: object

    .. note:: there are only 1000 cell lines in the :attr:`df`. Additional cell
        lines may be retrieved using :class:`COSMICFetcher`

    If a cosmic identifier is not found, the returned object has the same
    structure as above but with all fields set to None.

    .. seealso:: http://www.cancerrxgene.org/translation/CellLine
    """
    def __init__(self):
        """.. rubric:: constructor"""
        from gdsctools.datasets import cosmic_info
        #: dataframe with all information
        self.df = pd.read_csv(cosmic_info.filename, sep=',')
        self.df.set_index('COSMIC_ID', inplace=True)

    def get(self, identifier, colname=None):
        """Return the information available for a cosmic identifier

        :param int identifier: a cosmic identifier. Possible values are
            stored in the :attr:`df.index` attribute. A string is converted
            to an integer.
        :param colname: specific field.

        :return: if colname is not provided, returns a pandas Series for the
            **identifier** with all available fields. Otherwise, returns a
            specific field. If the identifier is unknown, all fields are
            set to None.
        """
        if isinstance(identifier, str):
            identifier = int(identifier)

        if identifier not in self.df.index:
            # one None per column, whatever the number of columns
            columns = self.df.columns
            ts = pd.Series([None] * len(columns), index=columns,
                           name=identifier)
        else:
            ts = self.df.loc[identifier]

        if colname is None:
            return ts.copy() # to be safe since user may change it
        else:
            return ts[colname]

    def _get_url(self, cosmic_id):
        """Return the URL of the COSMIC overview page of *cosmic_id*"""
        url = 'http://cancer.sanger.ac.uk/cell_lines/sample/overview'
        url = url + "?id={0}#overview".format(cosmic_id)
        return url

    def on_web(self, identifier):
        """Open a tab related to the COSMIC identifier (in your browser)"""
        from easydev import onweb
        url = self._get_url(identifier)
        onweb(url)