Source code for gdsctools.anova

# -*- python -*-
# -*- coding utf-8 -*-
#
#  This file is part of GDSCTools software
#
#  Copyright (c) 2015 - Wellcome Trust Sanger Institute
#  All rights reserved
#
#  File author(s): Thomas Cokelaer <cokelaer@gmail.comWE HERE>
#
#  Distributed under the BSD 3-Clause License.
#  See accompanying file LICENSE.txt distributed with this software
#
#  website: http://github.com/CancerRxGene/gdsctools
#
##############################################################################
"""Code related to the ANOVA analysis to find associations between drug IC50s
and genomic features"""
import pandas as pd
import scipy
import pylab
import numpy as np

from statsmodels.formula.api import OLS

from easydev import Progress, AttrDict, do_profile

from gdsctools.models import BaseModels
from gdsctools.boxplots import BoxPlots
from gdsctools.settings import ANOVASettings
from gdsctools.anova_results import ANOVAResults

from easydev import MultiProcessing


__all__ = ['ANOVA']


class DummyDF(object):
    values = None


# Not that Logging is not used: it is not pickable and prevent
# multicore analysis.
[docs]class ANOVA(BaseModels): #Logging):
    """ANOVA analysis of the :term:`IC50` vs Feature matrices

    This class is the core of the analysis. It can be used to
    compute

    #. One association between a drug and a feature
    #. The association**S** between a drug and a set of features
    #. All assocations between a set of drugs and a set of features.

    For instance here below, we read an IC50 matrix and compute the
    association for a given drug with a specific feature.

    Note that genomic features are not provided as input but a default
    file is provided with this package that contains 49 genomic
    features for 988 cell lines. If your IC50 contains unknown cell lines,
    you can provide your own file or use the :mod:`gdsctools.datasets` v17.

    .. plot::
        :include-source:
        :width: 80%

        from gdsctools import IC50, ANOVA, ic50_test
        ic = IC50(ic50_test)
        an = ANOVA(ic)
        # This is to select a specific tissue
        an.set_cancer_type('breast')
        df = an.anova_one_drug_one_feature(1047, 'TP53_mut', show=True)

    :Details about the anova analysis: In the example above, we perform a
        regression/anova test based on OLS regression. This is done for
        one feature one drug across all cell lines (tissue) in the method
        :meth:`anova_one_drug`. The regression
        takes into account the following factors: tissue, MSI, MEDIA
        and features. If there is only one tissue, this factor is
        dropped. If the number of MSI values is less than a pre-defined
        parameter (see :class:`~gdsctools.settings.ANOVASettings`), it is
        dropped. MEDIA, MSI columns are optional. The other
        methods :meth:`anova_one_drug` and :meth:`anova_all` are wrappers
        around :meth:`anova_one_drug_one_feature` to loop over all drugs, and
        loop over all drugs and all features, respectively.

        Please see the online documentation (ANOVA sections) for more help
        on gdsctools.readthedocs.io

    Specific notes about the parameters. Default settings are used except for
    those releases:

    V17 ::

        gdsc.volcano_FDR_interpolation = False
        gdsc.settings.pvalue_correction_method = 'qvalue'

    V18 ::

        gdsc.settings.FDR_threshold = 35

    """
    def __init__(self, ic50, genomic_features=None,
            drug_decode=None, verbose=True,
            set_media_factor=False):
        """.. rubric:: Constructor

        :param DataFrame IC50: a dataframe with the IC50. Rows should be
            the COSMIC identifiers and columns should be the Drug names
            (or identifiers)
        :param features: another dataframe with rows as in the IC50 matrix
            and columns as features.  The first 3 columns must be named
            specifically to hold tissues, MSI (see format).
        :param drug_decode: a 3 column CSV file with drug's name and targets
            see :mod:`readers` for more information.
        :param verbose: verbosity in "WARNING", "ERROR", "DEBUG", "INFO"

        Please see :mod:`~gdsctools.readers` module for details about the
        input formats.

        The :attr:`settings` attribute contains specific settings related
        to the ANOVA analysis or visualisation.
        """
        super(ANOVA, self).__init__(ic50, genomic_features,
            drug_decode=drug_decode, verbose=verbose,
            set_media_factor=set_media_factor)

        self.sampling = 0
        self.pvalues_features = {}

    def _get_one_drug_one_feature_data(self, drug_id, feature_name,
            diagnostic_only=False):
        """
        return: a dictionary with relevant information. There is also
            a test to see if the data can be analysis or not. This is
            stored ad a boolean value with key called *status*.
        """
        # dictionary  struture to hold results (can set values as attributes)
        dd = AttrDict()

        # select IC50 of a given drug
        # a fast way to select non-NA values from 1 column:
        # dropna is actually faster than a method using a mask.
        #dd.Y = self.ic50.df[drug_id].dropna()
        #indices = dd.Y.index
        #dd.masked_features = self.features.df[feature_name][indices]
        #dd.masked_tissue = self.tissue_factor[indices]
        #dd.masked_msi = self.msi_factor[indices]
        #dd.positive_feature = dd.masked_features.values.sum()
        #dd.negative_feature = len(dd.masked_features) - dd.positive_feature
        #dd.positive_msi = dd.masked_msi.values.sum()
        #dd.negative_msi = len(dd.masked_msi) - dd.positive_msi
        # using a mask instead of indices is 30% slower
        #mask = self.ic50.df[drug_id].isnull()==False
        #dd.masked_features = self.features.df[feature_name][mask]
        #dd.masked_tissue = self.tissue_factor[mask]
        #dd.masked_msi = self.msi_factor[mask]

        # Amother version using a dictionary instead of dataframer is actually
        # 2-3 times faster. It requires to transform the dataframe into a
        # dictionary once for all and dropping the NA as well.
        # Now, the next line takes no time
        dd.Y = self.ic50_dict[drug_id]['Y']

        # an alias to the indices
        indices = self.ic50_dict[drug_id]['indices']
        dd.indices = indices
        # select only relevant tissues/msi/features

        # This a is 5-6 times slower to use loc than the 2 lines of
        # code that follows, the creation of this masked_features was
        # taking 99% of the time in this function and now takes about 50%
        #dd.masked_features = self.features.df.loc[indices, feature_name].values
        real_indices = self.ic50_dict[drug_id]['real_indices']
        dd.masked_features = self.features.df[feature_name].values[real_indices]

        dd.masked_tissue = self.tissue_dict[drug_id]
        if self.features.found_msi:
            dd.masked_msi = self.msi_dict[drug_id]
            dd.positive_msi = dd.masked_msi.values.sum()
            dd.negative_msi = len(dd.masked_msi) - dd.positive_msi

        if self.settings.include_media_factor:
            dd.masked_media = self.media_dict[drug_id]

        # compute length of pos/neg features and MSI
        dd.positive_feature = dd.masked_features.sum()
        dd.negative_feature = len(dd.masked_features) - dd.positive_feature

        # Some validity tests to run the analysis or not
        feature_threshold = self.settings.feature_factor_threshold
        msi_threshold = self.settings.MSI_factor_threshold

        A = self.settings.include_MSI_factor and\
            dd.positive_feature >= feature_threshold and\
            dd.negative_feature >= feature_threshold and\
            dd.negative_msi >= msi_threshold and \
            dd.positive_msi >= msi_threshold

        B = (not self.settings.include_MSI_factor) and\
            dd.positive_feature >= feature_threshold and\
            dd.negative_feature >= feature_threshold
        # We could of course use the mean() and std() functions from pandas or
        # numpy. We could also use the glass and cohens functions from the
        # stats module but the following code is much faster because it
        # factorises the computations of mean and variance
        dd.positives = dd.Y[dd.masked_features == 1]
        dd.negatives = dd.Y[dd.masked_features == 0]
        dd.Npos = len(dd.positives)
        dd.Nneg = len(dd.negatives)

        # additional information
        dd.feature_name = feature_name
        dd.drug_id = drug_id
        dd.drug_target = self.drug_decode.get_target(drug_id)
        dd.drug_name = self.drug_decode.get_name(drug_id)

        # FIXME is False does not give the same results as == False
        # in the test test_anova.py !!
        if (A == False) and (B == False):
            dd.status = False
            return dd
        else:
            dd.status = True

        if diagnostic_only is True:
            return dd

        # compute mean and std of pos and neg sets;using mean() takes 15us and
        # using the already computed sum and N takes 5us
        pos_sum = dd.positives.sum()
        neg_sum = dd.negatives.sum()
        dd.pos_IC50_mean = pos_sum / dd.Npos
        dd.neg_IC50_mean = neg_sum / dd.Nneg
        dd.delta_mean_IC50 = dd.pos_IC50_mean - dd.neg_IC50_mean

        # note the ddof to agree with R convention.
        dd.pos_IC50_std = dd.positives.std(ddof=1)
        dd.neg_IC50_std = dd.negatives.std(ddof=1)

        # Nov 2016. Den may be close to zero but slightly negative
        den = (dd.positives**2).sum() - pos_sum**2/dd.Npos
        dd.pos_IC50_std = np.sqrt( max(0,den) / (dd.Npos-1.))

        den = (dd.negatives**2).sum() -  neg_sum**2/dd.Nneg
        dd.neg_IC50_std = np.sqrt( max(0,den) / (dd.Nneg-1.))

        # Compute Cohens and Glass effect size. Since underlying code
        # has lots in common, we do not use the modules but add
        # the code here below
        md = np.abs(dd.pos_IC50_mean - dd.neg_IC50_mean)

        dd.pos_glass = md / dd.pos_IC50_std
        dd.neg_glass = md / dd.neg_IC50_std


        csd = (dd.Npos - 1.) * dd.pos_IC50_std**2 + \
                (dd.Nneg - 1.) * dd.neg_IC50_std**2
        csd /= dd.Npos + dd.Nneg - 2.  # make sure this is float
        if (csd > 0):
            dd.effectsize_ic50 = md / np.sqrt(csd)
        else:
            print("Unexpected negative effect size for %s %s. Set to zero. " % 
                (drug_id, feature_name))
            dd.effectsize_ic50 = 0

        # Note that equal_var is a user parameter and affects
        # results. The ANOVA_results.txt obtained from SFTP
        # have different values meaning that the equal.var param
        # was set to False. Note that pvalue is stored at index 1
        dd.ttest = self._get_ttest(dd.negatives, dd.positives)
        return dd

    def _get_ttest(self, sample1, sample2):
        # this computes the ttest.
        import scipy
        return scipy.stats.ttest_ind(sample1, sample2,
                equal_var=self.settings.equal_var_ttest)[1]

[docs]    def anova_one_drug_one_feature(self, drug_id,
            feature_name, show=False,
            production=False, directory='.', fontsize=18):
        """Compute ABOVA one drug and one feature level

        :param drug_id: a valid drug identifier
        :param feature_name: a valid feature name
        :param bool show: show boxplots with the different factor used
        :param str directory: where to save the figure.
        :param bool production: if False, returns a dataframe otherwise
            a dictionary. This is to speed up analysis when scanning
            the drug across all features.

        .. note:: **for developer** this is the core of the analysis
            and should be kept as fast as possible. 95% of the time is spent
            here.

        .. note:: **for developer** Data used in this function comes from
            _get_one_drug_one_feature_data method, which should also be kept
            as fast as possible.
        """
        if drug_id not in self.drugIds:
            raise ValueError('Unknown drug name %s. Use e.g., %s'
                    % (drug_id, self.drugIds[0]))

        if feature_name not in self.feature_names:
            # we start index at 3 to skip tissue/name/msi
            raise ValueError('Unknown feature name %s. Use e.g. one of %s'
                    % (feature_name, self.feature_names[0:3]))

        # This extract the relevant data and some simple metrics
        # This is now pretty fast accounting for 45 seconds
        # for 265 drugs and 988 features
        odof = self._get_one_drug_one_feature_data(drug_id, feature_name)

        # if the status is False, it means the number of data points
        # in a category (e.g., positive feature) is too low.
        # If so, nothing to do, we return an 'empty' dictionary
        if odof.status is False:
            results = self._odof_dict.copy()
            results['FEATURE'] = feature_name
            results['DRUG_ID'] = odof.drug_id
            results['DRUG_NAME'] = odof.drug_name
            results['DRUG_TARGET'] = odof.drug_target
            results['N_FEATURE_pos'] = odof.Npos
            results['N_FEATURE_neg'] = odof.Nneg
            if production is True:
                # return a dict
                return results
            else:
                # with newer version of pandas (v0.19), None are not accepted
                # anymore
                for k in results.keys():
                    if results[k] is None:
                        results[k] = np.nan
                df = pd.DataFrame(results, index=[1])
                return df

        # IMPORTANT: the order of the factors in the formula
        # is important. It does not change the total sum of square errors
        # but may change individual effects of the categorical components.

        # If a formula is provided, use statsmodels. Since it is slowish,
        # we implemented several cases as described in the doc for the 4
        # following cases:
        # - TISSUE + MSI +MEDIA + FEATURE
        # - TISSUE + MSI + FEATURE
        # - MSI + FEATURE
        # - FEATURE
        if self.settings.regression_formula not in ["auto", None, ""]:
            # This populates the anova_pvalues attribute itself
            _ = self.anova_one_drug_one_feature_custom(drug_id, feature_name,
                formula= self.settings.regression_formula,
                odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)
        elif self.settings.analysis_type == 'PANCAN':
            # IMPORTANT: tissues are sorted alphabetically in R aov
            # function. Same in statsmodels but capitalised names
            # are sorted differently. In R, a<b<B<c but in Python,
            # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
            # not in python. Since in a linear regression
            # models, the order of the factor matters and the first
            # factor is used as a reference, we decided to use same
            # convention as in R.
            # see http://statsmodels.sourceforge.net/devel/contrasts.html
            # for a good explanation

            # We could use pd.get_dummies but pretty slow
            # instead we create the full matrix in init() method.
            # One issue is that some columns end up with sum == 0
            # and needs to be dropped.
            df = self._tissue_dummies.loc[odof.masked_tissue.index]
            todrop = df.columns[df.values.sum(axis=0) == 0]

            if len(todrop) > 0: # use if since drop() is slow
                df = df.drop(todrop, axis=1)
            tissues = [x for x in df.columns if x.startswith('C(tissue')]
            df.drop(tissues[0], axis=1, inplace=True)
            # Here we set other variables with dataframe columns' names as
            # expected by OLS.
            if self.settings.include_media_factor == False:
                # make sure the media factor is not included
                todrop = [x for x in df.columns if
                        x.startswith('C(media)')]
                df = df.drop(todrop, axis=1)
            else:
                # drop the first one for the regression
                medias = [x for x in df.columns if x.startswith('C(media')]
                if len(medias):
                    df.drop(medias[0], axis=1, inplace=True)
            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features

            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA
            self.anova_pvalues = self._get_anova_summary(self.data_lm, odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)
        elif self.settings.include_MSI_factor is True:
            df = DummyDF()
            df.values = np.ones((3, odof.Npos + odof.Nneg))
            df.values[1] = odof.masked_msi.values
            df.values[2] = odof.masked_features
            df.values = df.values.T
            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA itself
            self.anova_pvalues = self._get_anova_summary(self.data_lm, odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)
        else:
            df = DummyDF()
            df.values = np.ones((2, odof.Npos + odof.Nneg))
            df.values[1] = odof.masked_features
            df.values = df.values.T
            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA itself
            self.anova_pvalues = self._get_anova_summary(self.data_lm, odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)

        key = str(drug_id) + "__" + feature_name
        if self.sampling and key not in self.pvalues_features.keys():
            # This can be computed for a drug once for all
            # no need to redo it for each feature ?
            # If the length of Y is too small (e.g., < 20) the results may not be
            # great. This can be check zith the errors
            self.samples1 = []
            self.samples2 = []
            self.samples3 = []
            Y = odof.Y.copy()
            N = self.sampling
            pb = Progress(N, 20)
            for i in range(0, N):
                # To get the random distribution, shuffle Y
                # and noise not required
                # To get the noise effects, do not shuffle and set noise to
                # something different from 0
                noise = 0.0
                pylab.shuffle(Y)
                #data_lm = OLS(Y, df.values).fit()
                data_lm = OLS(Y+noise*pylab.randn(len(Y)), df.values).fit()
                anova_pvalues = self._get_anova_summary(data_lm,
                    output='dict', odof=odof)
                try:self.samples1.append(anova_pvalues['msi'])
                except:pass
                self.samples2.append(anova_pvalues['feature'])
                try:self.samples3.append(anova_pvalues['tissue'])
                except:pass
                #pb.animate(i+1)
            import fitter
            ff = fitter.Fitter(-pylab.log10(self.samples2))
            dist = "genexpon"
            ff.distributions = [dist]
            ff.fit()
            self.pvalues_features[key] = {
                'error': ff.df_errors.loc[dist].values[0],
                'params': ff.fitted_param[dist],
                'feature': feature_name,
                'N':len(Y)}

        if show is True:
            boxplot = BoxPlots(odof, savefig=self.settings.savefig,
                    directory=directory, fontsize=fontsize)
            boxplot.boxplot_association(fignum=1)

            # a boxplot to show cell lines effects. This requires
            # the settings.analyse_type to be PANCAN
            if self.settings.analysis_type == 'PANCAN':
                boxplot.boxplot_pancan(fignum=2, mode='tissue')
            if self.settings.include_MSI_factor:
                boxplot.boxplot_pancan(fignum=3, mode='msi')
            if self.settings.include_media_factor:
                boxplot.boxplot_pancan(fignum=3, mode='media')

        # about 30% of the time spent in creating the DataFrame...
        if production is True:
            return results
        else:
            # with newer version of pandas (v0.19), None are not accepted
            # anymore
            for k in results.keys():
                if results[k] is None:
                    results[k] = np.nan
            df = pd.DataFrame(results, index=[1])
            return df

    def _set_odof_results(self, pvalues, odof):
        # Store the pvalues. Note that some may be missing so we use try
        # except, which is faster than if/else
        try:pvalues = pvalues["PR(>F)"]
        except:pass
        try: tissue_PVAL = pvalues['tissue']
        except: tissue_PVAL = None
        try: MSI_PVAL = pvalues['msi']
        except: MSI_PVAL = None
        try: FEATURE_PVAL = pvalues['feature']
        except: FEATURE_PVAL = None
        try: MEDIA_PVAL = pvalues['media']
        except: MEDIA_PVAL = None

        results = {'FEATURE': odof.feature_name, 
                'DRUG_ID': odof.drug_id,
                'DRUG_NAME': odof.drug_name,
                'DRUG_TARGET': odof.drug_target,
                'N_FEATURE_pos': odof.Npos,
                'N_FEATURE_neg': odof.Nneg,
                'FEATURE_pos_logIC50_MEAN': odof.pos_IC50_mean,
                'FEATURE_neg_logIC50_MEAN': odof.neg_IC50_mean,
                'FEATURE_delta_MEAN_IC50': odof.delta_mean_IC50,
                'FEATURE_pos_IC50_sd': odof.pos_IC50_std,
                'FEATURE_neg_IC50_sd': odof.neg_IC50_std,
                'FEATURE_IC50_effect_size': odof.effectsize_ic50,
                'FEATURE_pos_Glass_delta': odof.pos_glass,
                'FEATURE_neg_Glass_delta': odof.neg_glass,
                'ANOVA_FEATURE_pval': FEATURE_PVAL,
                'ANOVA_TISSUE_pval': tissue_PVAL,
                'ANOVA_MSI_pval': MSI_PVAL,
                'ANOVA_MEDIA_pval': MEDIA_PVAL,
                'FEATURE_IC50_T_pval': odof.ttest # pvalues is in index 1
                }
        return results

[docs]    def anova_one_drug_one_feature_custom(self, drug_id, feature_name, formula,
        odof=None):
        """Same as :meth:`anova_one_drug_one_feature` but allows any formula

        :return: full ANOVA table but also populate interal attribute
            anova_pvalues that is a dictionary with pvalues for
            feature, media, msi and tissue

        Formula must be set in the settings attribute as follows::

            an = ANOVA(...)
            an.settings['regression_formula'] = "Y ~  C(tissue) + feature"

        .. note:: This function is convenient but 3-4 times slower than
            :meth:`anova_one_drug_one_feature`. So if your formula are one of::

                "Y ~  C(tissue) + C(media) + C(msi) + feature"
                "Y ~  C(tissue) + C(msi) + feature"
                "Y ~  C(msi) + feature"
                "Y ~  feature"

            you should rather use :meth:`anova_one_drug_one_feature` instead
            (keeping regression_formula set to 'auto').

        By default, in categories, the first treatment (e.g tissue) is used as a
        reference and is not shown in the results. You may set the reference as
        follows::

            "Y ~ C(tissue, Treatment(reference='breast'))"

        ANOVA pvalues returned are of type I

        .. versionadded:: 0.15.0

        """
        import statsmodels.formula.api as smf
        from statsmodels.stats.api import anova_lm

        if odof is None:
            odof = self._get_one_drug_one_feature_data(drug_id, feature_name)
        df = pd.DataFrame({ 'Y':odof.Y,
                            'feature': odof.masked_features})
        # Add other categorical explanatory variables if available
        try: df['tissue'] = odof.masked_tissue.values
        except: pass
        try: df['msi'] = odof.masked_msi.values
        except: pass
        try: df['media'] = odof.masked_media.values
        except:pass


        # "Y ~  C(tissue) + C(msi) + C(media) + feature"
        assert "Y" in formula, "Y must be the LHS of the formula"
        # This returns a Model instance
        model = smf.ols(formula, data=df)

        self._debug_custom_df = df
        self._debug_custom_model = model

        anova = anova_lm(model.fit(), typ=1)
        anova_pvalues = {}
        for k,v in anova['PR(>F)'].iteritems():
            if k == 'C(tissue)':
                anova_pvalues['tissue'] = v
            elif k == 'C(msi)':
                anova_pvalues['msi'] = v
            elif k == 'C(media)':
                anova_pvalues['media'] = v
            elif k == 'feature':
                anova_pvalues['feature'] = v
        self.anova_pvalues = anova_pvalues
        return anova

    # no need to optimise anymore
    def _get_anova_summary(self, data_lm, output='dict', odof=None):
        """

        an = ANOVA(...)
        an.anova_one_drug_one_feature(1047, "ABCB1_mut")
        an._get_anova_summary(an.data_lm, output="dataframe", odof=an.odof)

        should be identical to 

        an.anova_one_drug_one_feature_custom(1047, "ABCB1_mut", formula="Y ~ C(tissue) + C(msi) + feature")

        """
        q, r = np.linalg.qr(data_lm.model.data.exog)
        effects = np.dot(q.T, data_lm.model.data.endog)

        # In the regression, the first tissue is dropped hence -1
        # Similarly for the MEDIA factor
        modes = self._get_analysis_mode()

        # create the W matrix using tissue and MSI if requested
        # default is that the 3 features are used
        if 'tissue' in modes and 'msi' in modes and 'media' in modes:
            Ntissue = len(odof.masked_tissue.unique()) - 1
            Nmedia = len(odof.masked_media.unique()) -1
            dof = [Ntissue, Nmedia, 1, 1]
            self._debug_dof = dof
            Ncolumns = sum(dof) + 1         # intercept added
            indices = ['tissue', 'media', 'msi', 'feature', 'Residuals']
            # 4 stands for intercept + tissue + msi +feature
            arr = np.zeros((5, Ncolumns))
            arr[1, slice(1, Ntissue+1)] = 1
            arr[2, slice(Ntissue+1, Ntissue+Nmedia+1)] = 1
            arr[3, Ntissue + Nmedia + 1] = 1
            arr[4, Ntissue + Nmedia + 2] = 1
        elif 'tissue' in modes and 'msi' in modes:
            Ntissue = len(odof.masked_tissue.unique()) - 1
            dof = [Ntissue, 1, 1]
            indices = ['tissue', 'msi', 'feature', 'Residuals']
            # 4 stands for intercept + tissue + msi +feature
            Ncolumns = sum(dof) + 1         # intercept added
            arr = np.zeros((4, Ncolumns))
            arr[1, slice(1, Ntissue+1)] = 1
            arr[2, Ntissue + 1] = 1
            arr[3, Ntissue + 2] = 1
        elif 'tissue' not in modes and 'msi' in modes:
            dof = [1, 1]
            Ncolumns = sum(dof) + 1         # intercept added
            indices = ['msi', 'feature', 'Residuals']
            # 3 stands for intercept + msi +feature
            arr = np.zeros((3, Ncolumns))
            arr[1, 1] = 1
            arr[2, 2] = 1
        elif 'tissue' not in modes and 'msi' not in modes:
            dof = [1]
            Ncolumns = sum(dof) + 1         # intercept added
            indices = ['feature', 'Residuals']
            # 3 stands for intercept + msi +feature
            arr = np.zeros((2, Ncolumns))
            arr[1, 1] = 1
        else:
            raise NotImplementedError("""
This combo %s is not implemented in the "auto" mode. See
http://gdsctools.readthedocs.io/en/master/anova_parttwo.html for available
combos of variables. In short, MSI must be included. Note, however that
if you wish to use your own formula, you can set it in
settings.regression_formula ; this is simply be slower as compared to the 
standard regression. Here is an example of a formula: 

Y ~ C(tissue) + C(media) + feature

""" % modes)
        arr[0, 0] = 1                   # intercept

        self._debug_arr = arr
        self._debug_effects = effects
        sum_sq = np.dot(arr, effects**2)
        sum_sq = sum_sq[1:] # drop the intercep
        mean_sq = sum_sq / np.array(dof)
        Fvalues = mean_sq / (data_lm.ssr / data_lm.df_resid)
        F_pvalues = scipy.stats.f.sf(Fvalues, dof, data_lm.df_resid)

        sum_sq = np.append(sum_sq, data_lm.ssr)
        mean_sq = np.append(mean_sq, data_lm.mse_resid)
        F_pvalues = np.append(F_pvalues, None)
        Fvalues = np.append(Fvalues, None)
        dof.append(data_lm.model.df_resid)
        #indices.append('Residuals')
        # dataframe is slow, return just the dict of pvalues by default
        if output == 'dataframe':
            anova = pd.DataFrame({
                                'Sum Sq': sum_sq,
                                'Mean Sq': mean_sq,
                                'Df': dof,
                                'F value': Fvalues,
                                'PR(>F)': F_pvalues},
                    index=indices,
                    columns=['Df', 'Sum Sq', 'Mean Sq', 'F value', 'PR(>F)'])
            return anova
        elif self.settings.analysis_type == 'PANCAN':
            if self.settings.include_media_factor:
                dd = {'tissue': F_pvalues[0],
                      'media': F_pvalues[1],
                      'msi': F_pvalues[2],
                      'feature': F_pvalues[3]}
            else:
                dd = {'tissue': F_pvalues[0],
                      'msi':F_pvalues[1],
                      'feature':F_pvalues[2]}
            return dd
        elif self.settings.include_MSI_factor is True:
            return {'msi': F_pvalues[0], 'feature': F_pvalues[1]}
        else:
            return {'feature': F_pvalues[0]}

[docs]    def anova_one_drug(self, drug_id, animate=True, output='object'):
        """Computes ANOVA for a given drug across all features

        :param str drug_id: a valid drug identifier.
        :param animate: shows the progress bar
        :return: a dataframe

        Calls :meth:`anova_one_drug_one_feature` for each feature.
        """
        # drop first and second columns that are made of strings
        # works under python2 but not python 3. Assume that the 2 first
        #columns are the sample name and tissue feature
        # Then, we keep only cases with at least 3 features.
        # MSI could be used but is not like in original R code.
        features = self.features.df.copy()
        # need to skip the FACTOR to keep only features
        shift = self.features.shift

        features = features[features.columns[shift:]]
        # FIXME what about features with less than 3 zeros ?
        mask = features.sum(axis=0) >= 3

        # TODO: MSI, tissues, name must always be kept
        #
        selected_features = features[features.columns[mask]]

        # scan all features for a given drug
        assert drug_id in self.ic50.df.columns
        N = len(selected_features.columns)
        pb = Progress(N, 10)
        res = {}
        #
        for i, feature in enumerate(selected_features.columns):
            # production True, means we do not want to create a DataFrame
            # for each call to the anova_one_drug_one_feature function
            # Instead, we require dictionaries
            this = self.anova_one_drug_one_feature(drug_id, feature,
                    production=True)
            if this['ANOVA_FEATURE_pval'] is not None:
                res[feature] = this
            if animate is True:
                pb.animate(i+1)

        # if production is False:
        # df = pid.concat(res, ignore_index=True)
        df = pd.DataFrame.from_records(res)
        df = df.T

        df = ANOVAResults().astype(df)
        if len(df) == 0:
            return df

        # append DRUG_NAME/DRUG_TARGET columns
        df = self.drug_decode.drug_annotations(df)

        # TODO: drop rows where ANOVA_FEATURE_PVAL is None
        if output != 'object':
            df = self.add_pvalues_correction(df)
            return df
        else:
            df = self.add_pvalues_correction(df)
            res = ANOVAResults(df, self.settings)
            res.settings = ANOVASettings(**self.settings)
            return res

[docs]    def anova_all(self, animate=True, drugs=None, multicore=None):
        """Run all ANOVA tests for all drugs and all features.

        :param drugs: you may select a subset of drugs
        :param animate: shows the progress bar
        :return: an :class:`~gdsctools.anova_results.ANOVAResults`
            instance with the dataframe
            stored in an attribute called **df**

        Calls :meth:`anova_one_drug` for each drug and concatenate all
        results together. Note that once all data are gathered,
        :meth:`add_pvalues_correction` is called to fill a new column
        with FDR corrections.

        An extra column  named "ASSOC_ID" is also added with
        a unique identifer sorted by ascending FDR.

        .. note:: A thorough comparison with version v17 gives the same FDR
            results (difference ~1e-6); Note however that the qvalue results
            differ by about 0.3% due to different smoothing in R and Python.
        """
        if self.verbose and len(self.individual_anova):
            print("Reusing some results from the buffer. "
            "To reset the buffer, call reset_buffer() method")
        # drop DRUG where number of IC50 (non-null) is below 5
        # axis=0 is default but we emphasize that sum is over
        # column (i.e. drug
        vv = (self.ic50.df.isnull() == False).sum(axis=0)
        # FIXME: should be in one_drug_one_feature ??
        drug_names = vv.index[vv >= self.settings.minimum_nonna_ic50]

        # if user provided a list of drugs, use them:
        if drugs is not None:
            # todo: check valifity of the drug names
            drug_names = drugs[:]

        pb = Progress(len(drug_names), 1)
        drug_names = list(drug_names)
        #pylab.shuffle(drug_names) # ? why

        if animate is True:
            pb.animate(0)

        if multicore:
            # Note that here, we do not use the buffer
            multicore_analysis(self, drug_names, multicore)
        else:

            for i, drug_name in enumerate(drug_names):
                if drug_name in self.individual_anova.keys():
                    pass
                else:
                    res = self.anova_one_drug(drug_name, animate=False,
                                          output='dataframe')
                    self.individual_anova[drug_name] = res
                if animate is True:
                    pb.animate(i+1)
        print("\n")
        if len(self.individual_anova) == 0:
            return ANOVAResults()

        df = pd.concat(self.individual_anova, ignore_index=True)

        if len(df) == 0:
            return df
        # sort all data by ANOVA p-values
        try:
            df.sort_values('ANOVA_FEATURE_pval', inplace=True)
        except:
            df.sort('ANOVA_FEATURE_pval', inplace=True)

        # all ANOVA have been computed individually for each drug and each
        # feature. Now, we need to compute the multiple testing corrections
        if self.settings.pvalue_correction_level is True:
            df = self.add_pvalues_correction(df)
        else:
            pass

        # insert a unique identifier as first column
        df.insert(0, 'ASSOC_ID', range(1, len(df) + 1))

        self.df = df
        # order the column names as defined in the __init__ method
        df = df[self.column_names]
        df.reset_index(inplace=True, drop=True)

        return ANOVAResults(df, self.settings)

[docs]    def add_pvalues_correction(self, df, colname='ANOVA_FEATURE_pval'):
        """Compute and add corrected pvalues in column ANOVA_FEATURE_FDR

        :param df: a dataframe with a column named after colname (defaults to
            ANOVA_FEATURE_pval). The output of :meth:`anova_all` contains such
            a dataframe.
        :param str colname: name of the column that contains the pvalues to be
            corrected.

        The default multiple testing correction (FDR correction) is stored in
        :attr:`settings.pvalue_correction_method` and can be changed to other
        methods (e.g., **qvalue**).

        The results in stored in a column named **ANOVA_FEATURE_FDR** inside
        the input dataframe **df**.

        Values are in the range 0 to 1.

        .. seealso:: :meth:`anova_all`,
            :class:`~gdsctools.stats.MultipleTesting`
        """
        if len(df) == 0:
            return

        # extract pvalues
        data = df[colname].values

        # set the method and compute new pvalues
        self.multiple_testing.method = self.settings.pvalue_correction_method
        new_pvalues = self.multiple_testing.get_corrected_pvalues(data)
        new_pvalues *= 100
        # insert new columns.
        colname = 'ANOVA_FEATURE_FDR'

        try:
            df.insert(len(df.columns), colname, new_pvalues)
        except:
            # replaces it otherwise
            df[colname] = new_pvalues
        return df

[docs]    def reset_buffer(self):
        """Reset the buffer used to store the results

        When calling :meth:`anova_all`, the method :meth:`anova_one_drug`
        is called for each drug and the results saved in the
        :attr:`individual_anova` attribute. If called again, the results are
        simply returned from the buffer and not recomputed.

        If you change a settings that affects the analysis and therefore the
        results, then you should call this method.
        """
        self.individual_anova = {}


def analyse_one_drug(master, drug):
    """Function used by :func:`multicore_analysis`

    :param master: an instance of :class:`ANOVA`
    :param drug: a valid drug name


    """
    if drug in master.individual_anova.keys():
        res = master.individual_anova[drug]
    else:
        res = master.anova_one_drug(drug_id=drug, animate=False,
            output="dataframe")
    return (drug, res)


def multicore_analysis(anova, drugs, maxcpu=2):
    """Function used by :class:`ANOVA` to perform multiprocess analysis

    :param anova: an instance of :class:`ANOVA`
    :param list drugs: list of drugs to analyse
    :param int maxcpu: number of CPU to use

    :return: the instance itself with the individual_anova attribute filled
        with all results
    """
    t = MultiProcessing(maxcpu=maxcpu)
    for i, drug in enumerate(drugs):
        if drug not in anova.individual_anova.keys():
            t.add_job(analyse_one_drug, anova, drug)
    t.run()

    # populate the ANOVA instance with the results
    for this in t.results:
        drug = this[0]
        result = this[1]
        anova.individual_anova[drug] = result
    return anova