Source code for gdsctools.readers

# -*- python -*-
# -*- coding: utf-8 -*-

#  This file is part of GDSCTools software
#
#  Copyright (c) 2015 - Wellcome Trust Sanger Institute
#  All rights reserved
#
#  File author(s): Thomas Cokelaer <cokelaer@gmail.com>
#
#  Distributed under the BSD 3-Clause License.
#  See accompanying file LICENSE.txt distributed with this software
#
#  website: http://github.com/CancerRxGene/gdsctools
#
##############################################################################
"""IO functionalities


Provides readers for the following formats:

- Matrix of IC50 data sets with :class:`IC50`
- Matrix of genomic features with :class:`GenomicFeatures`
- Drug decode table with :class:`DrugDecode`

"""
import warnings

from gdsctools.errors import GDSCToolsDuplicatedDrugError

import pandas as pd
import pylab
import numpy as np
import easydev

import colorlog

# module-level logger used by the readers below
logger = colorlog.getLogger(__name__)

__all__ = ['IC50', 'GenomicFeatures', 'Reader', 'DrugDecode']


def drug_name_to_int(name):
    # We want to remove the prefix Drug_.
    # We also want to remove the suffix _IC50, but in v18 we have names
    # such as Drug_1_0.33_IC50 that also encode the concentration.
    # So, we should remove the string after the second _.
    # Finally, #154 also causes trouble: casting a string that is too long
    # (more than 20 digits) to an integer may fail in pandas. Values must be
    # less than 2**64-1. To guarantee that the cast works correctly, we
    # assume identifiers have fewer than 19 digits.
    def _str_to_int(x, maxdigits=19):
        if isinstance(x, (int, np.integer)):
            return x
        elif isinstance(x, str):
            if len(x) > maxdigits:
                print("Warnings gdsctools.readers.drug_name_to_int: " +
                      "%s identifier too long." % x +
                      "Please use values below 2**64 with less than 19 digits")
                x = int(x[0:maxdigits])
            else:
                x = int(x)
            return x
        else:
            print(type(x))
            raise NotImplementedError

    # remove characters (' and ")
    if isinstance(name, str):
        name = name.replace("'", "")
        name = name.replace('"', "")

    # replace the Drug_ and DRUG_
    try:
        res = name.replace("Drug_", "").replace("DRUG_", "")
        res = res.split("_")[0]
        res = _str_to_int(res)
        return res
    except Exception:
        return _str_to_int(name)
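
# Illustrative usage of drug_name_to_int (a hedged sketch; the identifiers
# follow the naming conventions described in the comments above):
#
#     >>> drug_name_to_int("Drug_999_IC50")
#     999
#     >>> drug_name_to_int("Drug_1_0.33_IC50")   # v18 style with concentration
#     1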



class Reader(object):
    """Convenience base class to read CSV or TSV files (using extension)"""
    def __init__(self, data=None):
        r""".. rubric:: Constructor

        This class takes only one input parameter, which may be a filename,
        a dataframe or an instance of :class:`Reader` itself. This means that
        children classes such as :class:`IC50` can also be used as input as
        long as a dataframe named :attr:`df` can be found.

        :param data: a filename in CSV or TSV format with format specified by
            child class (see e.g. :class:`IC50`), or a valid dataframe, or an
            instance of :class:`Reader`.

        The input can be a filename either in CSV (comma separated values) or
        TSV (tabular separated values). The extension will be used to
        interpret the content, so please be consistent in the naming of the
        file extensions. ::

            >>> from gdsctools import Reader, ic50_test
            >>> r = Reader(ic50_test.filename)  # this is a CSV file
            >>> len(r.df)   # number of rows
            988
            >>> len(r)      # number of elements
            11856

        Note that :class:`Reader` is a base class and more sophisticated
        readers are available. For example, :class:`IC50` would be better
        suited to read this IC50 data set.

        The data has been stored in a data frame in the :attr:`df` attribute.

        The dataframe of the object itself can be used as an input to create
        a new instance::

            >>> from gdsctools import Reader, ic50_test
            >>> r = Reader(ic50_test.filename)
            >>> r2 = Reader(r)  # here r.df is simply copied into r2
            >>> r == r2
            True

        It is sometimes convenient to create an empty Reader that will be
        populated later on::

            >>> r = Reader()
            >>> len(r)
            0

        More advanced readers (e.g. :class:`IC50`) can also be used as input
        as long as they have a :attr:`df` attribute::

            >>> from gdsctools import IC50, ic50_test
            >>> ic = IC50(ic50_test)
            >>> r = Reader(ic)

        """
        # input data
        if data is None:
            # create an empty dataframe
            self.df = pd.DataFrame()
            self._filename = None
        elif isinstance(data, str):
            # Read a filename in TSV or CSV format
            self.read_data(data)
            self._filename = data
        elif hasattr(data, 'filename'):
            # could be a data set from gdsctools.datasets.Data
            self.read_data(data.filename)
            self._filename = data.filename
        elif hasattr(data, 'df'):
            # an instance of a Reader (or child such as IC50, GenomicFeatures)
            self.df = data.df.copy()
            self._filename = data._filename
        elif isinstance(data, pd.DataFrame):
            # Or just a dataframe ?
            self.df = data.copy()
            self._filename = None
        else:
            raise TypeError("Input must be a filename, an IC50 instance, or "
                            "a dataframe.")

        #: if populated, can be used to check validity of a header
        # used by drug_decode only; may be removed
        self.header = []

        # sanity check on cleaning columns if not already done
        #try: self.df.columns = [x.strip() for x in self.df.columns]
        #except: pass  # fails for the IC50 where header is made of integers

    def read_data(self, filename):
        # remove possible white spaces in the header's names
        if ".csv" in filename:
            separator = ","
        elif ".tsv" in filename:
            separator = "\t"
        elif ".txt" in filename:
            separator = "\t"
            print("GDSCTools warning: files with .txt extension are "
                  "accepted (we assume a tab-separated file) but "
                  "should be renamed with .csv or .tsv extension")
        else:
            raise NotImplementedError("Only .csv or .tsv files are accepted ")

        try:
            # this is to cope with pandas 0.13 on ReadTheDoc
            # and newer versions
            na_values = ["NA", "NaN"]
            if filename.endswith(".gz"):
                compression = "gzip"
            elif filename.endswith(".bz2"):
                compression = "bz2"
            elif filename.endswith(".zip"):
                compression = "zip"
            elif filename.endswith(".xz"):
                compression = "xz"
            else:
                compression = None

            # Sometimes a column in a CSV file may hold several values
            # separated by commas; such a field should be surrounded by
            # double quotes ("). To account for that feature, the quotechar
            # argument must be provided. Besides, to avoid conflicts with
            # spaces, skipinitialspace must be set to True; this also helps
            # since spaces would otherwise be interpreted as strings. Using
            # skipinitialspace, a space-only field is converted to NA.
            rawdf = pd.read_csv(filename, sep=separator, comment="#",
                                na_values=na_values, skipinitialspace=True,
                                compression=compression, quotechar='"')
            #if sum([this.count('\t') for this in rawdf.columns]) > 2:
            #    print("Your input file does not seem to be comma"
            #          " separated. If tabulated, please rename with"
            #          " .tsv or .txt extension")
            # Sometimes, a user will provide a CSV, which is actually
            # tab-delimited. This is wrong and difficult to catch
        except Exception as err:
            msg = 'Could not read %s. See gdsctools.readers.Reader'
            print(msg % filename)
            raise(err)

        # Make sure the columns' names are stripped
        #rawdf.rename(columns=lambda x: x.strip(), inplace=True)

        # let us drop columns that are unnamed and print information
        columns = [x for x in rawdf.columns if x.startswith('Unnamed')]
        if len(columns) > 0:
            print('%s unnamed columns found and removed. ' % len(columns) +
                  'Please fix your input file.')
        self.df = rawdf.drop(columns, axis=1)

        # Some fields may be empty strings, which must be set as NA
        # (warnings is imported at module level)
        warnings.filterwarnings('ignore')
        self.df = self.df.replace(" ", "").replace("\t",
                                                   "").replace("", np.nan)
        warnings.filterwarnings("default")

        # Finally, check that names do not contain the unwanted character
        # / that was used in some old matrices.
        if len([True for x in self.df.columns if "/" in x]) > 0:
            print("Your input data contains unwanted / characters in "
                  "the header. Let's remove them.")
            self.df.columns = [x.replace("/", "_") for x in self.df.columns]
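
    # Illustrative sketch of the quoting behaviour handled above (a hedged
    # example; the file name and its content are hypothetical):
    #
    #     DRUG_ID,DRUG_NAME,DRUG_TARGET
    #     1039,SL 0101-1,"RSK, AURKB, PIM3"
    #
    # Because quotechar='"' is passed to pandas.read_csv, the quoted field is
    # kept as a single value instead of being split on its inner commas, and
    # skipinitialspace=True skips the spaces that follow each separator.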

    def _interpret(self):
        pass

    def _valid_header(self, df):
        for name in self.header:
            if name not in list(df.columns):
                return False
        return True

    def _read_matrix_from_r(self, name):
        """Requires biokit. Will be removed"""
        print("Reading matrix %s " % (name))
        self.session.run("rnames = rownames(%s)" % name)
        self.session.run("cnames = colnames(%s)" % name)
        self.session.run("data = %s" % name)
        cnames = self.session.cnames
        rnames = self.session.rnames
        data = self.session.data
        df = pd.DataFrame(data=data.copy())
        df.columns = [x.strip() for x in cnames]
        df.index = [x.strip() for x in rnames]
        return df

    def __str__(self):
        self.df.info()
        return ""

    def __len__(self):
        return self.df.shape[0] * self.df.shape[1]

    def to_csv(self, filename, sep=",", index=False, reset_index=True):
        """Save data into a CSV file without indices"""
        # Reset the index (e.g., COSMIC ID)
        if reset_index is True:
            df = self.df.reset_index()
        else:
            df = self.df
        df.to_csv(filename, sep=sep, index=index)

    def check(self):
        """Check the format of the matrix

        Currently, only checks that there are no duplicated column names
        """
        if len(self.df.columns.unique()) != len(self.df.columns):
            columns = list(self.df.columns)
            for this in columns:
                if columns.count(this) > 1:
                    raise GDSCToolsDuplicatedDrugError(this)

    def _check_uniqueness(self, data):
        if len(set(data)) != len(data):
            raise Exception("Error gdsctools in readers.IC50: data "
                            "identifiers not unique.")

    def __eq__(self, other):
        return all(self.df.fillna(0) == other.df.fillna(0))

class CosmicRows(object):
    """Parent class to IC50 and GenomicFeatures to handle cosmic identifiers"""

    def _get_cosmic(self):
        return list(self.df.index)

    def _set_cosmic(self, cosmics):
        for cosmic in cosmics:
            if cosmic not in self.cosmicIds:
                raise ValueError('Unknown cosmic identifier')
        self.df = self.df.loc[cosmics]

    cosmicIds = property(_get_cosmic, _set_cosmic,
                         doc="return list of cosmic ids (could have duplicates)")

    def drop_cosmic(self, cosmics):
        """drop a cosmic identifier or a list of cosmic identifiers"""
        cosmics = easydev.to_list(cosmics)
        tokeep = [x for x in self.cosmicIds if x not in cosmics]
        self.cosmicIds = tokeep
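
# Hedged usage sketch for the cosmicIds property and drop_cosmic defined
# above (111111 and 222222 are hypothetical COSMIC identifiers; use values
# present in your own matrix):
#
#     ic = IC50("ic50.csv")                # hypothetical input file
#     ic.cosmicIds                         # list of row identifiers
#     ic.cosmicIds = [111111, 222222]      # keep only these two cell lines
#     ic.drop_cosmic(111111)               # drop one of them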

class IC50(Reader, CosmicRows):
    """Reader of IC50 data set

    The input matrix must be a comma-separated value (CSV) or tab-separated
    value (TSV) file. The matrix must have a header and at least 2 columns.
    If the number of rows is not sufficient, the analysis may not be
    possible.

    The header must have a column called "COSMIC_ID" or "COSMIC ID".
    This column will be used as indices (row names). All other columns will
    be considered as input data.

    The column "COSMIC_ID" contains the cosmic identifiers (cell line). The
    other columns should be filled with the IC50s corresponding to a pair of
    COSMIC identifier and drug. Nothing prevents you from filling the file
    with data that have another meaning (e.g., AUC).

    If at least one column starts with ``Drug_``, all other columns will be
    ignored. This was implemented for backward compatibility.

    The order of the columns is not important.

    Here is a simple example of a valid TSV file::

        COSMIC_ID   Drug_1_IC50 Drug_20_IC50
        111111      0.5         0.8
        222222      1           2

    A test file is provided in the gdsctools package::

        from gdsctools import ic50_test

    You can read it using this class and plot information as follows:

    .. plot::
        :width: 80%
        :include-source:

        from gdsctools import IC50, ic50_test
        r = IC50(ic50_test)
        r.plot_ic50_count()

    You can get basic information using the print function::

        >>> from gdsctools import IC50, ic50_test
        >>> r = IC50(ic50_test)
        >>> print(r)
        Number of drugs: 11
        Number of cell lines: 988
        Percentage of NA 0.206569746043

    You can get the drug identifiers as follows::

        r.drugIds

    and set the drugs, which means others will be removed::

        r.drugIds = [1, 1000]

    .. versionchanged:: 0.9.10
        The column **COSMIC ID** should now be **COSMIC_ID**.
        Previous name is deprecated but still accepted.

    """
    cosmic_name = 'COSMIC_ID'

    def __init__(self, filename, v18=False):
        """.. rubric:: Constructor

        :param filename: input filename of IC50s. May also be an instance
            of :class:`IC50` or a valid dataframe. The data is stored as a
            dataframe in the attribute called :attr:`df`. Input file may be
            gzipped

        """
        super(IC50, self).__init__(filename)
        # interpret the raw data and check some of its contents
        self._v18 = v18
        if len(self.df) > 0:
            self._interpret()
            self.check()

    def _interpret(self):
        # if there is at least one column that starts with Drug or drug or
        # DRUG or variant then all other columns are dropped except
        # "COSMIC ID". For backward compatibility with data that mixes Drug
        # identifiers and genomic features:
        _cols = [str(x) for x in self.df.columns]
        drug_prefix = None
        for this in _cols:
            if this.startswith("Drug_"):
                drug_prefix = "Drug"

        _cols = [str(x) for x in self.df.columns]
        if "COSMIC ID" in _cols and self.cosmic_name not in _cols:
            colorlog.warning("'COSMIC ID' column name is deprecated since " +
                             "0.9.10. Please replace with 'COSMIC_ID'")
            self.df.columns = [x.replace("COSMIC ID", "COSMIC_ID")
                               for x in self.df.columns]
        if "CL" in _cols and "COSMIC_ID" not in self.df.columns:
            colorlog.warning("'CL' column name is deprecated since " +
                             "0.9.10. Please replace with 'COSMIC_ID'")
            self.df.columns = [x.replace("CL", "COSMIC_ID")
                               for x in self.df.columns]

        # If the data has not been interpreted, the COSMIC column should be
        # found in the columns and set as the index
        _cols = [str(x) for x in self.df.columns]
        if self.cosmic_name in self.df.columns:
            self.df.set_index(self.cosmic_name, inplace=True)
            _cols = [str(x) for x in self.df.columns]
            if drug_prefix:
                columns = [x for x in _cols if x.startswith(drug_prefix)]
                self.df = self.df[columns]

        # If already interpreted, the COSMIC name should be the index already
        # and should be made of integers, so let us cast to integer
        elif self.df.index.name == self.cosmic_name:
            _cols = [str(x) for x in self.df.columns]
            if drug_prefix:
                columns = [x for x in _cols if x.startswith(drug_prefix)]
                columns = self.df.columns
                assert len(columns) == len(set(columns))
                self.df = self.df[columns]
        # Otherwise, raise an error
        else:
            raise ValueError("{0} column could not be found in the "
                             "header".format(self.cosmic_name))

        # In v18, the drug ids may be duplicated
        if self._v18 is True:
            return

        self.df.columns = [drug_name_to_int(x) for x in self.df.columns]
        self.df.columns = self.df.columns.astype(int)

        self.df.index = [int(x) for x in self.df.index]
        self.df.index = self.df.index.astype(int)
        self.df.index.name = "COSMIC_ID"

        # Check uniqueness
        self._check_uniqueness(self.df.index)

    def drug_name_to_int(self, name):
        return drug_name_to_int(name)

    def _get_drugs(self):
        return list(self.df.columns)

    def _set_drugs(self, drugs):
        for drug in drugs:
            if drug not in self.drugIds:
                raise ValueError('Unknown drug name')
        self.df = self.df[drugs]

    drugIds = property(_get_drugs, _set_drugs,
                       doc='list the drug identifiers or select a subset')

    def drop_drugs(self, drugs):
        """drop a drug or a list of drugs"""
        drugs = easydev.to_list(drugs)
        tokeep = [x for x in self.drugIds if x not in drugs]
        self.drugIds = tokeep
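
    # Hedged usage sketch for the drugIds property and drop_drugs above
    # (identifiers 1 and 20 are hypothetical; use values present in your
    # own matrix):
    #
    #     ic.drugIds              # e.g. [1, 20, ...]
    #     ic.drugIds = [1, 20]    # keep only these two drugs
    #     ic.drop_drugs(20)       # remove one drug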

    def __contains__(self, item):
        if item in self.drugIds:
            return True
        else:
            return False

    def plot_ic50_count(self, **kargs):
        """Plots the fraction of valid/measured IC50 per drug

        :param kargs: any valid parameters accepted by pylab.plot function.
        :return: the fraction of valid/measured IC50 per drug

        """
        data = self.df.count() / len(self.df)
        pylab.clf()
        pylab.plot(data.values, **kargs)
        pylab.grid()
        pylab.xlim([0, len(self.drugIds) + 1])
        pylab.xlabel('Drug index')
        pylab.ylim([0, 1])
        pylab.ylabel('Percentage of valid IC50')
        return data

    def hist(self, bins=20, **kargs):
        """Histogram of the measured IC50

        :param bins: binning of the histogram
        :param kargs: any argument accepted by pylab.hist function.
        :return: all measured IC50

        .. plot::
            :include-source:
            :width: 80%

            from gdsctools import IC50, ic50_test
            r = IC50(ic50_test)
            r.hist()

        """
        pylab.clf()
        pylab.hist(self.get_ic50(), bins=bins, **kargs)
        pylab.grid()
        pylab.xlabel('log IC50')

    def get_ic50(self):
        """Return all IC50s as a list"""
        return [x for x in self.df.values.flatten() if not np.isnan(x)]

    def __str__(self):
        txt = "Number of drugs: %s\n" % len(self.drugIds)
        txt += "Number of cell lines: %s\n" % len(self.df)
        N = len(self.drugIds) * len(self.df)
        Nna = self.df.isnull().sum().sum()
        if N != 0:
            txt += "Percentage of NA {0}\n".format(Nna / float(N))
        return txt

    def __repr__(self):
        Nc = len(self.cosmicIds)
        Nd = len(self.drugIds)
        return "IC50 object <Nd={0}, Nc={1}>".format(Nd, Nc)

    """def __add__(self, other):
        print("Experimental. Combines IC50 via COSMIC IDs")
        df = pd.concat([self.df, other.df], ignore_index=True)
        df = df.drop_duplicates(cols=[self.cosmic_name])
        return df
    """

    def copy(self):
        new = IC50(self)
        return new
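
# Minimal end-to-end sketch: the IC50 constructor also accepts an in-memory
# dataframe (the values below are made up for illustration):
#
#     import pandas as pd
#     df = pd.DataFrame({"COSMIC_ID": [111111, 222222],
#                        "Drug_1_IC50": [0.5, 0.8],
#                        "Drug_20_IC50": [1.0, 2.0]})
#     ic = IC50(df)
#     ic.drugIds      # [1, 20] after drug_name_to_int conversion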

class GenomicFeatures(Reader, CosmicRows):
    """Read a matrix with genomic features

    These are the compulsory column names required (note the underscores):

    - 'COSMIC_ID'
    - 'TISSUE_FACTOR'
    - 'MSI_FACTOR'

    If one of the following columns is found, it is removed (deprecated):

    - 'SAMPLE_NAME'
    - 'Sample Name'
    - 'CELL_LINE'

    and features can also be encoded with the following conventions:

    - columns ending in "_mut" to encode a gene mutation (e.g., BRAF_mut)
    - columns starting with "gain_cna"
    - columns starting with "loss_cna"

    Those columns will be removed:

    - columns starting with ``Drug_``, which presumably come from the
      IC50 matrix

    ::

        >>> from gdsctools import GenomicFeatures
        >>> gf = GenomicFeatures()
        >>> print(gf)
        Genomic features distribution
        Number of unique tissues 27
        Number of unique features 677 with
        - Mutation: 270
        - CNA (gain): 116
        - CNA (loss): 291

    .. versionchanged:: 0.9.10
        The header's columns' names have changed to be more consistent.
        Previous names are deprecated but still accepted.

    .. versionchanged:: 0.9.15
        If a tissue is empty, it is replaced by UNDEFINED.
        We also strip the spaces to make sure that "THIS" and "THIS " are
        considered the same.

    """
    colnames = easydev.AttrDict()
    colnames.cosmic = 'COSMIC_ID'
    colnames.tissue = 'TISSUE_FACTOR'
    colnames.msi = 'MSI_FACTOR'
    colnames.media = 'MEDIA_FACTOR'

    def __init__(self, filename=None, empty_tissue_name="UNDEFINED"):
        """.. rubric:: Constructor

        If no file is provided, the default file provided in the package
        (1001 cell lines times 680 features) is used.

        :param str empty_tissue_name: if a tissue name is left empty, replace
            it with this string.

        """
        # first reset the filename to the shared data (if not provided)
        if filename is None:
            from gdsctools.datasets import genomic_features_test
            filename = genomic_features_test

        # used in the header so should be set before the call to super()
        super(GenomicFeatures, self).__init__(filename)

        # FIXME Remove columns related to Drug if any. Can be removed in
        # the future
        self.df = self.df[[x for x in self.df.columns
                           if x.startswith('Drug_') is False]]

        for this in ['Sample Name', 'SAMPLE_NAME', 'Sample_Name',
                     'CELL_LINE']:
            if this in self.df.columns:
                self.df.drop(this, axis=1, inplace=True)

        # Let us rename "COSMIC ID" into "COSMIC_ID" if needed
        for old, new in {
                'Tissue Factor Value': 'TISSUE_FACTOR',
                'MS-instability Factor Value': 'MSI_FACTOR',
                'COSMIC ID': 'COSMIC_ID'}.items():
            if old in self.df.columns:
                colorlog.warning("'%s' column name is deprecated " % old +
                                 "since 0.9.10. Please replace with "
                                 "'%s'" % new)
                self.df.columns = [x.replace(old, new)
                                   for x in self.df.columns]
        if "CL" in self.df.columns and "COSMIC_ID" not in self.df.columns:
            self.df.columns = [x.replace("CL", "COSMIC_ID")
                               for x in self.df.columns]

        # There are 3 special columns to hold the factors
        self._special_names = []

        # If the tissue factor is not provided, we create it and fill it
        # with dummies.
        # Otherwise, we would need to change a lot in the original ANOVA code.
        if self.colnames.tissue not in self.df.columns:
            colorlog.info("column named '%s' not found" %
                          self.colnames.tissue)
            self.df[self.colnames.tissue] = ['UNDEFINED'] * len(self.df)
            self._special_names.append(self.colnames.tissue)
        else:
            self._special_names.append(self.colnames.tissue)

        self.found_msi = self.colnames.msi in self.df.columns
        if self.found_msi is False:
            colorlog.info("column named '%s' not found" % self.colnames.msi)
        else:
            self._special_names.append(self.colnames.msi)

        self.found_media = self.colnames.media in self.df.columns
        if self.found_media is False:
            pass
        else:
            self._special_names.append(self.colnames.media)

        # order columns and index
        self._order()

        #
        self._interpret_cosmic()

        #
        self.check()

        self._fix_empty_tissues(empty_tissue_name)

    def _fix_empty_tissues(self, name="UNDEFINED"):
        # Sometimes, tissues may be empty so a NaN is present. This leads to
        # errors in ANOVA or Regression, so we replace them with the *name*
        # argument (UNDEFINED by default).
        N = self.df.TISSUE_FACTOR.isnull().sum()
        if N > 0:
            logger.warning("Some tissues were empty strings and renamed "
                           "as %s!" % name)
            self.df.TISSUE_FACTOR.fillna(name, inplace=True)

    def _get_shift(self):
        return len(self._special_names)
    shift = property(_get_shift)

    def _interpret_cosmic(self):
        if self.colnames.cosmic in self.df.columns:
            self.df.set_index(self.colnames.cosmic, inplace=True)
        elif self.colnames.cosmic == self.df.index.name:
            pass
        else:
            error_msg = "the features input file must contain a column " +\
                "named %s" % self.colnames.cosmic
            raise ValueError(error_msg)

        self.df.index = [int(x) for x in self.df.index]
        self.df.index = self.df.index.astype(int)
        self.df.index.name = "COSMIC_ID"
        self.df.sort_index(inplace=True)

    def fill_media_factor(self):
        """Given the COSMIC identifiers, fills the MEDIA_FACTOR column

        If already populated, the content is replaced by the new one.

        """
        from gdsctools import COSMICInfo
        c = COSMICInfo()
        self.df['MEDIA_FACTOR'] = [c.get(x).SCREEN_MEDIUM
                                   for x in self.df.index]
        self.found_media = True
        if self.colnames.media not in self._special_names:
            self._special_names.append(self.colnames.media)
        self._order()

    def _order(self):
        others = [x for x in self.df.columns if x not in self._special_names]
        self.df = self.df[self._special_names + others]

    def _get_features(self):
        return list(self.df.columns)

    def _set_features(self, features):
        for feature in features:
            if feature not in self.features:
                raise ValueError('Unknown feature name %s' % feature)
        features = [x for x in features if x.endswith('FACTOR') is False]
        features = self._special_names + features
        self.df = self.df[features]
        self._order()

    features = property(_get_features, _set_features,
                        doc="return list of features")

    def _get_tissues(self):
        return list(self.df[self.colnames.tissue])
    tissues = property(_get_tissues, doc='return list of tissues')

    def _get_unique_tissues(self):
        return list(self.df[self.colnames.tissue].unique())
    unique_tissues = property(_get_unique_tissues, doc='return set of tissues')

    def plot(self, shadow=True, explode=True, fontsize=12):
        """Histogram of the tissues found

        .. plot::
            :include-source:
            :width: 80%

            from gdsctools import GenomicFeatures
            gf = GenomicFeatures()  # use the default file
            gf.plot()

        """
        if self.colnames.tissue not in self.df.columns:
            return
        data = pd.get_dummies(self.df[self.colnames.tissue]).sum()
        data.index = [x.replace("_", " ") for x in data.index]
        pylab.figure(1)
        pylab.clf()
        labels = list(data.index)
        labels = ["%s (%s)" % (l, count) for l, count in zip(labels, data)]
        if explode:
            pylab.pie(data, labels=labels, shadow=shadow,
                      explode=[0.2] * len(labels))
        else:
            pylab.pie(data, labels=labels, shadow=shadow)

        # For the pie chart, we do not sort so that small slices are not all
        # together, which gives a better final pie chart; for the histogram,
        # we can sort it.
        try:
            data = data.sort_values(ascending=True)
        except:
            data = data.sort(ascending=True)

        # The bar plot
        pylab.figure(2)
        data.plot(kind='barh', width=0.8)
        pylab.grid(True)
        pylab.xlabel('Occurrences', fontsize=fontsize)
        # keep the try to prevent MacOS issue
        try:
            pylab.tight_layout()
        except:
            pass
        return data

    def __str__(self):
        txt = 'Genomic features distribution\n'
        try:
            tissues = list(self.df[self.colnames.tissue].unique())
            Ntissue = len(tissues)
            txt += 'Number of unique tissues {0}'.format(Ntissue)
            if Ntissue == 1:
                txt += ' ({0})\n'.format(tissues[0])
            elif Ntissue < 10:
                txt += '\nHere are the tissues: '
                txt += ",".join(tissues) + "\n"
            else:
                txt += '\nHere are the first 10 tissues: '
                txt += ", ".join(tissues[0:10]) + "\n"
        except:
            txt += 'No information about tissues\n'

        if self.found_msi:
            txt += "MSI column: yes\n"
        else:
            txt += "MSI column: no\n"

        if self.found_media:
            txt += "MEDIA column: yes\n"
        else:
            txt += "MEDIA column: no\n"

        # -3 since we have also the MSI, tissue, media columns
        # TODO should use shift attribute ?
        Nfeatures = len(self.features)
        txt += '\nThere are {0} unique features distributed as\n'.format(
            Nfeatures - self.shift)

        n_mutations = len([x for x in self.df.columns if x.endswith("_mut")])
        txt += "- Mutation: {}\n".format(n_mutations)

        n_gain = len([x for x in self.df.columns if x.startswith("gain_cna")])
        txt += "- CNA (gain): {}\n".format(n_gain)
        n_loss = len([x for x in self.df.columns if x.startswith("loss_cna")])
        txt += "- CNA (loss): {}".format(n_loss)
        return txt

    def drop_tissue_in(self, tissues):
        """Drop tissues from the list

        :param list tissues: a list of tissues to drop. If you have only
            one tissue, it can be provided as a string. Since rows are
            removed, some features (columns) may now be empty (all zeros).
            If so, those columns are dropped (except for the special columns,
            e.g., MSI).

        """
        tissues = easydev.to_list(tissues)
        mask = self.df[self.colnames.tissue].isin(tissues) == False
        self.df = self.df[mask]
        self._cleanup()

    def keep_tissue_in(self, tissues):
        """Drop tissues not in the list

        :param list tissues: a list of tissues to keep. If you have only
            one tissue, it can be provided as a string. Since rows are
            removed, some features (columns) may now be empty (all zeros).
            If so, those columns are dropped (except for the special columns,
            e.g., MSI).

        """
        tissues = easydev.to_list(tissues)
        mask = self.df[self.colnames.tissue].isin(tissues)
        self.df = self.df[mask]
        self._cleanup()
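
    # Hedged usage sketch for the two tissue filters above ("breast" and
    # "lung" are hypothetical tissue names; real values depend on the
    # TISSUE_FACTOR column of your file):
    #
    #     gf = GenomicFeatures()                  # default test file
    #     gf.keep_tissue_in(["breast", "lung"])   # keep two tissues only
    #     gf.drop_tissue_in("breast")             # a single tissue as string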

    def _cleanup(self, required_features=0):
        # FIXME: there is a view/copy warning here in pandas. It should be
        # fixed or may have side-effects
        to_ignore = self._special_names
        # create a view ignoring the informative columns
        view = self.df[[x for x in self.df.columns if x not in to_ignore]]

        todrop = list(view.columns[view.sum() <= required_features])

        self.df.drop(todrop, axis=1, inplace=True)

    def __repr__(self):
        Nc = len(self.cosmicIds)
        Nf = len(self.features) - self.shift
        try:
            Nt = len(set(self.tissues))
        except:
            Nt = '?'
        return "GenomicFeatures <Nc={0}, Nf={1}, Nt={2}>".format(Nc, Nf, Nt)

    def compress_identical_features(self):
        """Merge duplicated columns/features

        Duplicated columns are merged as follows. The first column is kept;
        the others are dropped, but to keep track of those dropped, the kept
        column is renamed by concatenating the columns' names. The separator
        is a double underscore.

        ::

            gf = GenomicFeatures()
            gf.compress_identical_features()
            # You can now access the column as follows (arbitrary example)
            gf.df['ARHGAP26_mut__G3BP2_mut']
        """
        # let us identify the duplicates as True/False
        datatr = self.df.transpose()
        duplicated_no_first = datatr[datatr.duplicated()]
        try:
            duplicated = datatr[datatr.duplicated(keep=False)]
        except:
            # pandas 0.16
            duplicated = datatr[datatr.duplicated(take_last=False)]

        tokeep = [x for x in duplicated.index
                  if x not in duplicated_no_first.index]

        # Let us create a groupby strategy
        groups = {}
        # Let us now add the corresponding duplicates
        for feature in tokeep:
            # Find all rows identical to this feature
            matches = (duplicated.loc[feature] == duplicated).all(axis=1)
            groups[feature] = "__".join(duplicated.index[matches])

        # This drops all duplicated columns (the first is kept, others are
        # dropped)
        self.df = self.df.transpose().drop_duplicates().transpose()
        self.df.rename(columns=groups, inplace=True)

        # We want to keep the column names informative, that is, if there
        # were duplicates, we rename the column kept with the concatenation
        # of all the corresponding duplicates
        print("compressed %s groups of duplicates" % len(groups))
        return groups

    def get_TCGA(self):
        from gdsctools.cosmictools import COSMICInfo
        c = COSMICInfo()
        tcga = c.df.loc[self.df.index].TCGA
        return tcga

class PANCAN(Reader):
    """Reads an RData file with all genomic features including methylation.

    Will be removed. Used to read the original data in R format but will
    provide the data as CSV or TSV.

    .. deprecated:: since v0.12
    """
    def __init__(self, filename=None):
        print('deprecated')
        """if filename is None:
            filename = easydev.get_share_file('gdsctools', 'data',
                            'PANCAN_simple_MOBEM.rdata')
        super(PANCAN, self).__init__(filename)
        # Remove R dependencies
        from biokit.rtools import RSession
        self.session = RSession()
        self.session.run('load("%s")' % self._filename)
        self.df = self._read_matrix_from_r('MoBEM')
        """


class Extra(Reader):
    def __init__(self, filename="djvIC50v17v002-nowWithRMSE.rdata"):
        super(Extra, self).__init__(filename)
        print("Deprecated since v0.12")
        # Remove R dependencies
        from biokit.rtools import RSession
        self.session = RSession()
        self.session.run('load("%s")' % self._filename)

        # 3 identical matrices containing AUC, IC50 and
        self.dfAUCv17 = self._read_matrix_from_r('dfAUCv17')
        self.dfIC50v17 = self._read_matrix_from_r('dfIC50v17')
        # Residual
        self.dfResv17 = self._read_matrix_from_r('dfResv17')

        # This df holds the xmid/scale parameters for each cell line
        # Can be visualised using the tools.Logistic class.
        self.dfCL = self._read_matrix_from_r('dfCL')

        # There is an extra matrix called MoBEM, which is the same as in the
        # file

    def hist_residuals(self, bins=100):
        """Plot residuals across all drugs and cell lines"""
        data = [x for x in self.dfResv17.fillna(0).values.flatten()
                if x != 0]
        pylab.clf()
        pylab.hist(data, bins=bins, normed=True)
        pylab.grid(True)
        pylab.xlabel('Residuals')
        pylab.ylabel(r'\#')

    def scatter(self):
        from biokit.viz import scatter
        s = scatter.ScatterHist(self.dfCL)
        s.plot(kargs_histx={'color': 'red', 'bins': 20},
               kargs_scatter={'alpha': 0.9, 's': 100, 'c': 'b'},
               kargs_histy={'color': 'red', 'bins': 20})

    def hist_ic50(self, bins=100):
        data = [x for x in self.dfIC50v17.fillna(0).values.flatten()
                if x != 0]
        pylab.clf()
        pylab.hist(data, bins=bins, normed=True)
        pylab.grid(True)
        pylab.xlabel('IC50')
        pylab.ylabel(r'\#')

    def hist_auc(self, bins=100):
        data = [x for x in self.dfAUCv17.fillna(0).values.flatten()
                if x != 0]
        pylab.clf()
        pylab.hist(data, bins=bins, normed=True)
        pylab.grid(True)
        pylab.xlabel('AUC')
        pylab.ylabel(r'\#')

class DrugDecode(Reader):
    """Reads a "drug decode" file

    The format must be a comma-separated file. There are 3 compulsory
    columns called DRUG_ID, DRUG_NAME and DRUG_TARGET. Here is an example::

        DRUG_ID     ,DRUG_NAME  ,DRUG_TARGET
        999         ,Erlotinib  ,EGFR
        1039        ,SL 0101-1  ,"RSK, AURKB, PIM3"

    TSV files may also work out of the box. If a column named
    'PUTATIVE_TARGET' is found, it is renamed 'DRUG_TARGET' to be compatible
    with earlier formats.

    In addition, 3 extra columns may be provided:

    - PUBCHEM_ID
    - WEBRELEASE
    - OWNED_BY

    The OWNED_BY and WEBRELEASE columns may be required to create packages
    for each company. If those columns are not provided, the internal
    dataframe is filled with None.

    Note that older versions of identifiers such as::

        Drug_950_IC50

    are transformed into proper IDs, that is (in this case) just the
    number::

        950

    Then, the data is accessible as a dataframe, the index being the
    DRUG_ID column::

        data = DrugDecode('DRUG_DECODE.csv')
        data.df.loc[999]

    .. note:: the DRUG_ID column must be made of integers
    """
    def __init__(self, filename=None):
        """.. rubric:: Constructor"""
        super(DrugDecode, self).__init__(filename)
        self.header = ['DRUG_ID', 'DRUG_NAME', 'DRUG_TARGET', 'OWNED_BY',
                       'WEBRELEASE']
        self.header_extra = ["PUBCHEM_ID", "CHEMBL_ID", "CHEMSPIDER_ID"]

        try:
            # if the input data is already a DrugDecode instance, this should
            # fail since the expected df will not have the DRUG_ID field,
            # which should be the index
            self._interpret()
        except:
            pass
        self.df = self.df[sorted(self.df.columns)]

    def _interpret(self, filename=None):
        N = len(self.df)
        if N == 0:
            return

        self.df.rename(columns={
            'PUTATIVE_TARGET': 'DRUG_TARGET',
            'THERAPEUTIC_TARGET': 'DRUG_TARGET'},
            inplace=True)

        for column in ["WEBRELEASE", "OWNED_BY"] + self.header_extra:
            if column not in self.df.columns:
                self.df[column] = [np.nan] * N

        #for this in self.header[1:]:
        for this in self.header:
            msg = "The column %s was not found and may be an issue later on."
            if this not in self.df.columns and this != self.df.index.name:
                logger.warning(msg % this)

        # Finally, set the drug ids as the index.
        try:
            self.df.set_index('DRUG_ID', inplace=True)
        except:
            # could be done already
            pass

        self.df.index = [drug_name_to_int(x) for x in self.df.index]
        self.df.index = self.df.index.astype(int)
        self.df.index.name = "DRUG_ID"

        # sort the columns
        try:
            self.df.sort_index(inplace=True)
        except:
            self.df = self.df.iloc[sorted(self.df.index)]

        self._check_uniqueness(self.df.index)

    def _get_names(self):
        return list(self.df.DRUG_NAME.values)
    drug_names = property(_get_names)

    def _get_target(self):
        return list(self.df.DRUG_TARGET.values)
    drug_targets = property(_get_target)

    def _get_drug_ids(self):
        return list(self.df.index)
    drugIds = property(_get_drug_ids,
                       doc="return list of drug identifiers")

    def _get_row(self, drug_id, colname):
        if drug_id in self.df.index:
            return self.df.loc[drug_id][colname]
        elif str(drug_id).startswith("Drug_"):
            try:
                drug_id = int(drug_id.split("_")[1])
            except:
                print("DRUG ID %s not recognised" % drug_id)
                return
            if drug_id in self.df.index:
                return self.df[colname].loc[drug_id]
        elif "_" in str(drug_id):
            try:
                drug_id = int(drug_id.split("_")[0])
            except:
                print("DRUG ID %s not recognised" % drug_id)
                return
            if drug_id in self.df.index:
                return self.df[colname].loc[drug_id]
        else:
            return

    def get_name(self, drug_id):
        return self._get_row(drug_id, 'DRUG_NAME')

    def get_target(self, drug_id):
        return self._get_row(drug_id, 'DRUG_TARGET')

    def is_public(self, drug_id):
        return self._get_row(drug_id, 'WEBRELEASE')
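
    # Hedged sketch of the lookup helpers above, reusing the example row from
    # the class docstring (a real decode file would normally be used):
    #
    #     import pandas as pd
    #     dd = DrugDecode(pd.DataFrame({"DRUG_ID": [999],
    #                                   "DRUG_NAME": ["Erlotinib"],
    #                                   "DRUG_TARGET": ["EGFR"]}))
    #     dd.get_name(999)              # 'Erlotinib'
    #     dd.get_name("Drug_999_IC50")  # old-style identifiers also resolve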

    def check(self):
        for x in self.drugIds:
            try:
                x += 1
            except TypeError as err:
                print("drug identifiers must be numeric values")
                raise err
        # it may happen that a drug has no target in the database ! so we
        # cannot check that for the moment:
        #if self.df.isnull().sum().sum() > 0:
        #    print(d.df.isnull().sum())
        #    raise ValueError("all values must be non-na. check tabulation")

    def get_info(self):
        # Note that there are 4 cases: Y, N, U (unknown?) and NaN
        dd = {'N': len(self),
              'N_public': sum(self.df.WEBRELEASE == 'Y'),
              'N_prop': sum(self.df.WEBRELEASE != 'Y')}
        return dd

    def __len__(self):
        return len(self.df)

    def __str__(self):
        txt = "Number of drugs: %s\n" % len(self.df)
        return txt

    def __repr__(self):
        txt = self.__str__()
        if len(self.companies):
            txt += "Contains %s companies" % len(self.companies)
        return txt

    def _get_companies(self):
        if 'OWNED_BY' in self.df.columns:
            companies = list(self.df.OWNED_BY.dropna().unique())
        else:
            companies = []
        return sorted(companies)
    companies = property(_get_companies)

    def drug_annotations(self, df):
        """Populate the DRUG_NAME and DRUG_TARGET fields if possible

        :param df: input dataframe as given by e.g., :meth:`anova_one_drug`
        :return df: same as input but with the DRUG_NAME and DRUG_TARGET
            columns populated
        """
        if len(self.df) == 0:
            return df
            # print("Nothing done. DrugDecode is empty.")

        # aliases
        if 'DRUG_ID' not in df.columns:
            raise ValueError('Expected column named DRUG_ID but not found')

        drug_names = [self.get_name(x) for x in df.DRUG_ID.values]
        drug_target = [self.get_target(x) for x in df.DRUG_ID.values]

        # this is not clean. It works but could be simpler surely.
        df['DRUG_NAME'] = drug_names
        df['DRUG_TARGET'] = drug_target
        return df
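
    # Hedged sketch: annotate a results table carrying a DRUG_ID column (the
    # dataframe below is made up; in practice it comes from an ANOVA result):
    #
    #     results = pd.DataFrame({"DRUG_ID": [999], "FEATURE": ["EGFR_mut"]})
    #     dd.drug_annotations(results)  # adds DRUG_NAME and DRUG_TARGET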

    def __add__(self, other):
        """
        Fill missing values but do not overwrite existing fields even
        though the field in the other DrugDecode instance is different.
        """
        # Probably not efficient but will do for now
        columns = list(self.df.columns)
        dd = DrugDecode()
        dd.df = self.df.copy()

        # add missing entries (label-based selection of the new drug ids)
        missing = [x for x in other.df.index if x not in self.df.index]
        dd.df = dd.df.append(other.df.loc[missing])

        # merge existing ones
        for index, ts in other.df.iterrows():
            # add the drug if not already present
            if index in self.df.index:
                # here it is found in the 2 instances but
                # they may contain either complementary data, which
                # could have been done with pandas.merge but we wish
                # to check for incompatible data
                for column in columns:
                    a = dd.df.loc[index][column]
                    b = ts[column]
                    if pd.isnull(b) is True:
                        # nothing to do if b is NULL
                        pass
                    elif pd.isnull(a) is True:
                        # we can merge the content of b into a
                        # that is the content of other into this instance
                        dd.df.loc[index, column] = b
                    else:
                        # a and b are not null
                        if a != b:
                            print('WARNING: different fields in drug %s '
                                  '(%s %s %s)' % (index, column, a, b))
        return dd

    def __eq__(self, other):
        try:
            return all(self.df.fillna(0) == other.df.fillna(0))
        except:
            return False

    def get_public_and_one_company(self, company):
        """Return drugs that belong to a specific company and public drugs"""
        drug_decode_company = self.df.query(
            "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
        # Transform into a proper DrugDecode class for safety
        return DrugDecode(drug_decode_company)
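
    # Hedged sketch ("company_X" is a hypothetical value; it must match an
    # entry of the OWNED_BY column):
    #
    #     subset = dd.get_public_and_one_company("company_X")
    #     subset.companies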