Source code for gdsctools.anova_results

# -*- python -*-
# -*- coding utf-8 -*-
#
#  This file is part of GDSCTools software
#
#  Copyright (c) 2015 - Wellcome Trust Sanger Institute
#  All rights reserved
#
#  File author(s): Thomas Cokelaer <cokelaer@gmail.comWE HERE>
#
#  Distributed under the BSD 3-Clause License.
#  See accompanying file LICENSE.txt distributed with this software
#
#  website: http://github.com/CancerRxGene/gdsctools
#
##############################################################################
"""ANOVAResults data structure to store the output of the ANOVA analysis"""
from collections import OrderedDict

import pandas as pd
import numpy as np
import pylab

from colormap import cmap_builder

from gdsctools import readers
from gdsctools.volcano import VolcanoANOVA
from gdsctools.report import HTMLTable

__all__ = ['ANOVAResults']


[docs]class ANOVAResults(object):
    """Class to handle results of the ANOVA analysis

    The :class:`ANOVA` class and in particular its method
    :meth:`~gdsctools.anova.ANOVA.anova_all` returns the results of
    the ANOVA analysis for each drug and genomic feature. The results
    are stored in a data structure defined in this class, which is 
    just a dataframe stored in :attr:`df` attribute with the following
    header:

    =========================== ===============================================
         Column name                                  Description
    =========================== ===============================================
    ASSOC_ID                    Alphanumeric identifier of the interaction
    FEATURE                     The CFE involved in the interaction, it
                                can be a mutated cancer driver gene
                                (CG) [suffix _mut], an abberrantly
                                fused protein [suffix fusion], a copy
                                number altered chromosomal region (RACS)
                                [prefix gain for amplifications or loss
                                for deletions];
    DRUG_ID                     Numerical id of the drug involved in the
                                interaction;
    DRUG_TARGET                 Putative target of the drug involved in the
                                interaction;
    N_FEATURE_pos               Number of cell lines harbouring the CFE
                                indicated in column E and that have been
                                screened with the drug indicated in columns
                                F and G, therefore have been included in
                                the test;
    N_FEATURE_neg               Number of cell lines not harbouring the
                                CFE indicated in column E and that have been
                                screened with the drug indicated in columns
                                F and G, therefore have been included in the
                                test;
    FEATURE_pos_logIC50_MEAN    Average log IC50 of the population of cell
                                lines accounted in colum i;
    FEATURE_neg_logIC50_MEAN    Average log IC50 of the population of cell
                                lines accounted in colum j;
    FEATURE_delta_MEAN_IC50     Difference between the two average natural
                                log IC50 values in the previous two columns
                                (j - i). A negative value indicates an
                                interaction for sensitivity, whereas a
                                positive value indicates an interaction
                                for resistance;
    FEATURE_pos_IC50_sd         Log IC50 Standard deviation for the
                                population of cell lines accounted in column
                                i;
    FEATURE_neg_IC50_sd         Log IC50 Standard deviation for the
                                population of cell lines accounted in column
                                j;
    FEATURE_IC50_effect_size    Cohen's d, quantifying the effect size of
                                the interaction. A value >=0.5 indicates
                                a moderate effect size. A value >=1
                                indicates a large effect size (i.e. difference
                                in mean log IC50 values greater than their
                                pooled standard deviations). A value >= 2
                                indicates a very large effect size (i.e.
                                difference in mean log IC50 is at least two
                                times their pooled standard deviation);
    FEATURE_pos_Glass_delta     Glass delta, quantifying the effect size of
                                the interaction as the ratio between the
                                difference of the mean log IC50 values and
                                the standard deviation of the log IC50 values
                                of the population of cell lines accounted in
                                column i;
    FEATURE_neg_Glass_delta     Glass delta Same as above for the negative set.
    ANOVA_FEATURE_pval          ANOVA test p-value quantyfing the interaction
                                significance;
    ANOVA_TISSUE_pval           ANOVA test p-value quantifying the
                                significance of the interaction between drug
                                response and the tissue of origin of the
                                cell lines; for the cancer-specific
                                interactions this value is NA;
    ANOVA_MEDIA_pval            ANOVA test p-value quantifying the
                                significance of the interaction between drug
                                response and the screening medium of the cell
                                lines; for the cancer-specific interactions
                                this value is NA;
    ANOVA_MSI_pval              ANOVA test p-value quantifying the
                                significance of the interaction between drug
                                response and the micro-satellite instability
                                status of the cell lines; for the cancer type
                                with no micro-satellite instable cell line
                                samples this value is NA;
    ANOVA_FEATURE_FDR           False discovery rate obtained by correcting
                                the p-values in column u, on an individual
                                analysis basis, for multiple hypothesis
                                testing with the q-value correction method
                                (Storey & TIbshirani, 2003)
    =========================== ===============================================

    Note that those column names are renamed internally (and if the data is
    saved in a new file):

    ======================= ================================
    ======================= ================================
    assoc_id                ASSOC_ID
    Drug id                 DRUG_ID
    Owned_by                OWNED_BY
    FEATUREpos_IC50_sd      FEATURE_pos_IC50_sd
    FEATUREneg_IC50_sd      FEATURE_neg_IC50_sd
    FEATUREpos_Glass_delta  FEATURE_pos_Glass_delta
    FEATUREneg_Glass_delta  FEATURE_neg_Glass_delta
    FEATUREpos_logIC50_MEAN FEATURE_pos_logIC50_MEAN
    FEATUREneg_logIC50_MEAN FEATURE_neg_logIC50_MEAN
    Drug Target             DRUG_TARGET
    FEATURE_deltaMEAN_IC50  FEATURE_delta_MEAN_IC50
    FEATURE_ANOVA_pval      ANOVA_FEATURE_pval
    ANOVA FEATURE FDR %     ANOVA_FEATURE_FDR
    MSI_ANOVA_pval          ANOVA_MSI_pval
    Tissue_ANOVA_pval       ANOVA_TISSUE_pval
    MEDIA_ANOVA_pval        ANOVA_MEDIA_pval
    TISSUE_ANOVA_pval       ANOVA_TISSUE_pval
    Drug name               DRUG_NAME
    ======================= ================================



    """
    _colname_drug_id = 'DRUG_ID'

    def __init__(self, filename=None, settings=None):
        """.. rubric:: Constructor

        :param str filename: Another ANOVAResults instance or a 
            compatible CSV file with the correct header.
            The filename may also be set to None (default) and 
            populated later.

        """
        if filename is not None and isinstance(filename, str):
            self.read_csv(filename)
        elif filename is None:
            self._df = pd.DataFrame()
        else:
            try:
                self._df = filename.df.copy()
            except:
                self._df = filename.copy()
            assert isinstance(self._df, pd.core.frame.DataFrame), \
                "excepts a dataframe or filename"

        #: dictionary with the relevant column names and their expected types
        self.mapping = OrderedDict()
        self.mapping['ASSOC_ID'] = np.dtype('int64')
        self.mapping['FEATURE'] = np.dtype('O')
        self.mapping['DRUG_ID'] = np.dtype('int64')
        self.mapping['DRUG_NAME'] = np.dtype('O')
        self.mapping['DRUG_TARGET'] = np.dtype('O')
        self.mapping['N_FEATURE_neg'] = np.dtype('int64')
        self.mapping['N_FEATURE_pos'] = np.dtype('int64')
        self.mapping['FEATURE_pos_logIC50_MEAN'] = np.dtype('float64')
        self.mapping['FEATURE_neg_logIC50_MEAN'] = np.dtype('float64')
        self.mapping['FEATURE_delta_MEAN_IC50'] = np.dtype('float64')
        self.mapping['FEATURE_IC50_effect_size'] = np.dtype('float64')
        self.mapping['FEATURE_neg_Glass_delta'] = np.dtype('float64')
        self.mapping['FEATURE_pos_Glass_delta'] = np.dtype('float64')
        self.mapping['FEATURE_neg_IC50_sd'] = np.dtype('float64')
        self.mapping['FEATURE_pos_IC50_sd'] = np.dtype('float64')
        self.mapping['FEATURE_IC50_T_pval'] = np.dtype('float64')
        self.mapping['ANOVA_FEATURE_pval'] = np.dtype('float64')
        self.mapping['ANOVA_TISSUE_pval'] = np.dtype('float64')
        self.mapping['ANOVA_MSI_pval'] = np.dtype('float64')
        self.mapping['ANOVA_MEDIA_pval'] = np.dtype('float64')
        self.mapping['ANOVA_FEATURE_FDR'] = np.dtype('float64')


        # If the dataframe is empty, we still fill the columns so that reports 
        # and other code will find the column names.
        if len(self.df) == 0:
            self.df = pd.DataFrame(columns=self.mapping.keys())
            self.df = self.astype(self.df)


        # before gdsctools, columns names were a bit different.
        # We need to rename some column names
        self.df.rename(columns={
            'assoc_id': 'ASSOC_ID',
            'Drug id': 'DRUG_ID',
            'Owned_by': 'OWNED_BY',
            'FEATUREpos_IC50_sd': 'FEATURE_pos_IC50_sd',
            'FEATUREneg_IC50_sd': 'FEATURE_neg_IC50_sd',
            'FEATUREpos_Glass_delta': 'FEATURE_pos_Glass_delta',
            'FEATUREneg_Glass_delta': 'FEATURE_neg_Glass_delta',
            'FEATUREpos_logIC50_MEAN': 'FEATURE_pos_logIC50_MEAN',
            'FEATUREneg_logIC50_MEAN': 'FEATURE_neg_logIC50_MEAN',
            'Drug Target': 'DRUG_TARGET',
            'FEATURE_deltaMEAN_IC50': 'FEATURE_delta_MEAN_IC50',
            'FEATURE_ANOVA_pval': 'ANOVA_FEATURE_pval',
            'ANOVA FEATURE FDR %': 'ANOVA_FEATURE_FDR',
            'MSI_ANOVA_pval': 'ANOVA_MSI_pval',
            'Tissue_ANOVA_pval': 'ANOVA_TISSUE_pval',
            'MEDIA_ANOVA_pval': 'ANOVA_MEDIA_pval',
            'TISSUE_ANOVA_pval': 'ANOVA_TISSUE_pval',
            'Drug name': 'DRUG_NAME', 'A': 'B'}, inplace=True)

        self.colnames_subset = [
            'ASSOC_ID', 'FEATURE',
            'DRUG_ID', 'DRUG_NAME', 'DRUG_TARGET',
            'N_FEATURE_neg', 'N_FEATURE_pos',
            'FEATURE_pos_logIC50_MEAN', 'FEATURE_neg_logIC50_MEAN',
            'FEATURE_delta_MEAN_IC50',
            'FEATURE_IC50_effect_size',
            'FEATURE_neg_Glass_delta', 'FEATURE_pos_Glass_delta',
            'ANOVA_FEATURE_pval',
            'ANOVA_TISSUE_pval',
            'ANOVA_MSI_pval',
            'ANOVA_MEDIA_pval',
            'ANOVA_FEATURE_FDR']

        self._df.reset_index(drop=True)
        self.settings = settings


[docs]    def astype(self, df):
        try:
            # does not work in python3.3 on travis but should work
            # we newer pandas version.
            df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))
        except:
            for col in df.columns:
                if col in self.mapping.keys():
                    df[col] = df[col].astype(self.mapping[col])
        return df

    def _get_df(self):
        return self._df
    def _set_df(self, df):
        # TODO check that all columns are found and with correct type.
        self._df = df
    df = property(_get_df, _set_df, doc="dataframe with all results")

[docs]    def to_csv(self, filename):
        """Save the ANOVAResults dataframe into a CSV file"""
        assert filename.endswith('.csv'), "filename should end in .csv"
        self.df.to_csv(filename, sep=',', index=False)

[docs]    def read_csv(self, filename):
        """Read an ANOVAResults file from a CSV file

        .. todo:: check validity of the header
        """
        self.reader = readers.Reader(filename)
        self._df = self.reader.df

    def __len__(self):
        return len(self.df)

    def _get_drugIds(self):
        if len(self) == 0:
            return []
        else:
            return self.df[self._colname_drug_id].unique()
    drugIds = property(_get_drugIds,
            doc="Returns the list of drug identifiers")

[docs]    def volcano(self, settings=None):
        """Calls :class:`VolcanoANOVA` on the results

        x-value is sign(FEATURE_delta_MEAN_IC50) times FEATURE_IC50_effect_size
        y-value is the FDR correction

        See the online documentation for details on gdsctools.readthedocs.io.

        """
        if settings is None:
            settings = self.settings
        if len(self.df) == 0:
            print("No data to plot")
            return
        self.handle_volcano = VolcanoANOVA(self.df, settings=settings)
        self.handle_volcano.volcano_plot_all()

    def __str__(self):
        txt = 'Total number of ANOVA tests performed: %s ' % len(self.df)
        return txt

    def __repr__(self):
        txt = 'ANOVAResults (%s tests): ' % len(self.df)
        return txt

[docs]    def copy(self):
        """Returns a copy """
        return ANOVAResults(self.df.copy())

[docs]    def get_html_table(self, collapse_table=False, clip_threshold=2,
            index=False, header=True, escape=False, add_href=True):
        """Return an HTML table for the reports


        :param add_href: add href to the FEATURE, DRUG ID and ASSOC ID

        """
        cmap_clip = cmap_builder('#ffffff', '#0070FF')
        cmap_absmax = cmap_builder('green', 'white', 'red')

        # The copy is used because we'll change it afterwards
        df = self.df[self.colnames_subset].copy()

        colname = 'ANOVA_FEATURE_FDR'

        df.loc[df[colname] < 0.01, colname] = '<0.01'
        # In the assoc column, we remove the first "a" letter so that
        # the column is properly sorted by Id but the link should be with the
        # "a" as prefix
        df.ASSOC_ID = df.ASSOC_ID.apply(lambda x: int(str(x).replace("a", "")))

        html = HTMLTable(df, 'notused')
        # Those columns should be links
        if add_href:
            html.add_href("FEATURE")
            html.add_href("ASSOC_ID", url="a", suffix=".html") # here url works like a prefix
            html.add_href("DRUG_ID", url="drug_", suffix=".html") # here url works like a prefix

        for this in ['FEATURE_IC50_effect_size', 'FEATURE_neg_Glass_delta',
                'FEATURE_pos_Glass_delta']:
            html.add_bgcolor(this, cmap_clip, mode='clip',
                    threshold=clip_threshold)

        # normalise data and annotate with color
        html.add_bgcolor('FEATURE_delta_MEAN_IC50', cmap_absmax,
            mode='absmax')

        html.df.columns = [x.replace("_", " ") for x in html.df.columns]
        return html.to_html(escape=escape, header=header, index=index,
                collapse_table=collapse_table, justify='center')

[docs]    def barplot_effect_size(self):
        """Dev not for production"""
        # barplot of the IC50 effect size
        if len(self.df) == 0:
            print("No data to plot")
        data = np.sign(self.df.FEATURE_delta_MEAN_IC50) * self.df.FEATURE_IC50_effect_size
        try:
            data = data.sort_values()
        except:
            data.sort(inplace=True)

        n_green = len(data[data<0])
        n_red = len(data[data>=0])

        print(n_green, n_red)
        data.plot(kind='barh', width=1, alpha=0.5,
            color=['green']*n_green + ['red'] * n_red)

        pylab.xlabel("Effect size")
        pylab.ylabel("Drug name")