Source code for gdsctools.anova

# -*- python -*-
# -*- coding utf-8 -*-
#
#  This file is part of GDSCTools software
#
#  Copyright (c) 2015 - Wellcome Trust Sanger Institute
#  All rights reserved
#
#  File author(s): Thomas Cokelaer <cokelaer@gmail.comWE HERE>
#
#  Distributed under the BSD 3-Clause License.
#  See accompanying file LICENSE.txt distributed with this software
#
#  website: http://github.com/CancerRxGene/gdsctools
#
##############################################################################
"""Code related to the ANOVA analysis to find associations between drug IC50s
and genomic features"""
import pandas as pd
import scipy
import pylab
import numpy as np

from statsmodels.formula.api import OLS

from easydev import Progress, AttrDict, do_profile

from gdsctools.models import BaseModels
from gdsctools.boxplots import BoxPlots
from gdsctools.settings import ANOVASettings
from gdsctools.anova_results import ANOVAResults

from easydev import MultiProcessing


__all__ = ['ANOVA']


class DummyDF(object):
    values = None


# Not that Logging is not used: it is not pickable and prevent
# multicore analysis.
[docs]class ANOVA(BaseModels): #Logging): """ANOVA analysis of the :term:`IC50` vs Feature matrices This class is the core of the analysis. It can be used to compute #. One association between a drug and a feature #. The association**S** between a drug and a set of features #. All assocations between a set of drugs and a set of features. For instance here below, we read an IC50 matrix and compute the association for a given drug with a specific feature. Note that genomic features are not provided as input but a default file is provided with this package that contains 49 genomic features for 988 cell lines. If your IC50 contains unknown cell lines, you can provide your own file or use the :mod:`gdsctools.datasets` v17. .. plot:: :include-source: :width: 80% from gdsctools import IC50, ANOVA, ic50_test ic = IC50(ic50_test) an = ANOVA(ic) # This is to select a specific tissue an.set_cancer_type('breast') df = an.anova_one_drug_one_feature(1047, 'TP53_mut', show=True) :Details about the anova analysis: In the example above, we perform a regression/anova test based on OLS regression. This is done for one feature one drug across all cell lines (tissue) in the method :meth:`anova_one_drug`. The regression takes into account the following factors: tissue, MSI, MEDIA and features. If there is only one tissue, this factor is dropped. If the number of MSI values is less than a pre-defined parameter (see :class:`~gdsctools.settings.ANOVASettings`), it is dropped. MEDIA, MSI columns are optional. The other methods :meth:`anova_one_drug` and :meth:`anova_all` are wrappers around :meth:`anova_one_drug_one_feature` to loop over all drugs, and loop over all drugs and all features, respectively. Please see the online documentation (ANOVA sections) for more help on gdsctools.readthedocs.io Specific notes about the parameters. Default settings are used except for those releases: V17 :: gdsc.volcano_FDR_interpolation = False gdsc.settings.pvalue_correction_method = 'qvalue' V18 :: gdsc.settings.FDR_threshold = 35 """ def __init__(self, ic50, genomic_features=None, drug_decode=None, verbose=True, set_media_factor=False): """.. rubric:: Constructor :param DataFrame IC50: a dataframe with the IC50. Rows should be the COSMIC identifiers and columns should be the Drug names (or identifiers) :param features: another dataframe with rows as in the IC50 matrix and columns as features. The first 3 columns must be named specifically to hold tissues, MSI (see format). :param drug_decode: a 3 column CSV file with drug's name and targets see :mod:`readers` for more information. :param verbose: verbosity in "WARNING", "ERROR", "DEBUG", "INFO" Please see :mod:`~gdsctools.readers` module for details about the input formats. The :attr:`settings` attribute contains specific settings related to the ANOVA analysis or visualisation. """ super(ANOVA, self).__init__(ic50, genomic_features, drug_decode=drug_decode, verbose=verbose, set_media_factor=set_media_factor) self.sampling = 0 self.pvalues_features = {} def _get_one_drug_one_feature_data(self, drug_id, feature_name, diagnostic_only=False): """ return: a dictionary with relevant information. There is also a test to see if the data can be analysis or not. This is stored ad a boolean value with key called *status*. """ # dictionary struture to hold results (can set values as attributes) dd = AttrDict() # select IC50 of a given drug # a fast way to select non-NA values from 1 column: # dropna is actually faster than a method using a mask. #dd.Y = self.ic50.df[drug_id].dropna() #indices = dd.Y.index #dd.masked_features = self.features.df[feature_name][indices] #dd.masked_tissue = self.tissue_factor[indices] #dd.masked_msi = self.msi_factor[indices] #dd.positive_feature = dd.masked_features.values.sum() #dd.negative_feature = len(dd.masked_features) - dd.positive_feature #dd.positive_msi = dd.masked_msi.values.sum() #dd.negative_msi = len(dd.masked_msi) - dd.positive_msi # using a mask instead of indices is 30% slower #mask = self.ic50.df[drug_id].isnull()==False #dd.masked_features = self.features.df[feature_name][mask] #dd.masked_tissue = self.tissue_factor[mask] #dd.masked_msi = self.msi_factor[mask] # Amother version using a dictionary instead of dataframer is actually # 2-3 times faster. It requires to transform the dataframe into a # dictionary once for all and dropping the NA as well. # Now, the next line takes no time dd.Y = self.ic50_dict[drug_id]['Y'] # an alias to the indices indices = self.ic50_dict[drug_id]['indices'] dd.indices = indices # select only relevant tissues/msi/features # This a is 5-6 times slower to use loc than the 2 lines of # code that follows, the creation of this masked_features was # taking 99% of the time in this function and now takes about 50% #dd.masked_features = self.features.df.loc[indices, feature_name].values real_indices = self.ic50_dict[drug_id]['real_indices'] dd.masked_features = self.features.df[feature_name].values[real_indices] dd.masked_tissue = self.tissue_dict[drug_id] if self.features.found_msi: dd.masked_msi = self.msi_dict[drug_id] dd.positive_msi = dd.masked_msi.values.sum() dd.negative_msi = len(dd.masked_msi) - dd.positive_msi if self.settings.include_media_factor: dd.masked_media = self.media_dict[drug_id] # compute length of pos/neg features and MSI dd.positive_feature = dd.masked_features.sum() dd.negative_feature = len(dd.masked_features) - dd.positive_feature # Some validity tests to run the analysis or not feature_threshold = self.settings.feature_factor_threshold msi_threshold = self.settings.MSI_factor_threshold A = self.settings.include_MSI_factor and\ dd.positive_feature >= feature_threshold and\ dd.negative_feature >= feature_threshold and\ dd.negative_msi >= msi_threshold and \ dd.positive_msi >= msi_threshold B = (not self.settings.include_MSI_factor) and\ dd.positive_feature >= feature_threshold and\ dd.negative_feature >= feature_threshold # We could of course use the mean() and std() functions from pandas or # numpy. We could also use the glass and cohens functions from the # stats module but the following code is much faster because it # factorises the computations of mean and variance dd.positives = dd.Y[dd.masked_features == 1] dd.negatives = dd.Y[dd.masked_features == 0] dd.Npos = len(dd.positives) dd.Nneg = len(dd.negatives) # additional information dd.feature_name = feature_name dd.drug_id = drug_id dd.drug_target = self.drug_decode.get_target(drug_id) dd.drug_name = self.drug_decode.get_name(drug_id) # FIXME is False does not give the same results as == False # in the test test_anova.py !! if (A == False) and (B == False): dd.status = False return dd else: dd.status = True if diagnostic_only is True: return dd # compute mean and std of pos and neg sets;using mean() takes 15us and # using the already computed sum and N takes 5us pos_sum = dd.positives.sum() neg_sum = dd.negatives.sum() dd.pos_IC50_mean = pos_sum / dd.Npos dd.neg_IC50_mean = neg_sum / dd.Nneg dd.delta_mean_IC50 = dd.pos_IC50_mean - dd.neg_IC50_mean # note the ddof to agree with R convention. dd.pos_IC50_std = dd.positives.std(ddof=1) dd.neg_IC50_std = dd.negatives.std(ddof=1) # Nov 2016. Den may be close to zero but slightly negative den = (dd.positives**2).sum() - pos_sum**2/dd.Npos dd.pos_IC50_std = np.sqrt( max(0,den) / (dd.Npos-1.)) den = (dd.negatives**2).sum() - neg_sum**2/dd.Nneg dd.neg_IC50_std = np.sqrt( max(0,den) / (dd.Nneg-1.)) # Compute Cohens and Glass effect size. Since underlying code # has lots in common, we do not use the modules but add # the code here below md = np.abs(dd.pos_IC50_mean - dd.neg_IC50_mean) dd.pos_glass = md / dd.pos_IC50_std dd.neg_glass = md / dd.neg_IC50_std csd = (dd.Npos - 1.) * dd.pos_IC50_std**2 + \ (dd.Nneg - 1.) * dd.neg_IC50_std**2 csd /= dd.Npos + dd.Nneg - 2. # make sure this is float if (csd > 0): dd.effectsize_ic50 = md / np.sqrt(csd) else: print("Unexpected negative effect size for %s %s. Set to zero. " % (drug_id, feature_name)) dd.effectsize_ic50 = 0 # Note that equal_var is a user parameter and affects # results. The ANOVA_results.txt obtained from SFTP # have different values meaning that the equal.var param # was set to False. Note that pvalue is stored at index 1 dd.ttest = self._get_ttest(dd.negatives, dd.positives) return dd def _get_ttest(self, sample1, sample2): # this computes the ttest. import scipy return scipy.stats.ttest_ind(sample1, sample2, equal_var=self.settings.equal_var_ttest)[1]
[docs] def anova_one_drug_one_feature(self, drug_id, feature_name, show=False, production=False, directory='.', fontsize=18): """Compute ABOVA one drug and one feature level :param drug_id: a valid drug identifier :param feature_name: a valid feature name :param bool show: show boxplots with the different factor used :param str directory: where to save the figure. :param bool production: if False, returns a dataframe otherwise a dictionary. This is to speed up analysis when scanning the drug across all features. .. note:: **for developer** this is the core of the analysis and should be kept as fast as possible. 95% of the time is spent here. .. note:: **for developer** Data used in this function comes from _get_one_drug_one_feature_data method, which should also be kept as fast as possible. """ if drug_id not in self.drugIds: raise ValueError('Unknown drug name %s. Use e.g., %s' % (drug_id, self.drugIds[0])) if feature_name not in self.feature_names: # we start index at 3 to skip tissue/name/msi raise ValueError('Unknown feature name %s. Use e.g. one of %s' % (feature_name, self.feature_names[0:3])) # This extract the relevant data and some simple metrics # This is now pretty fast accounting for 45 seconds # for 265 drugs and 988 features odof = self._get_one_drug_one_feature_data(drug_id, feature_name) # if the status is False, it means the number of data points # in a category (e.g., positive feature) is too low. # If so, nothing to do, we return an 'empty' dictionary if odof.status is False: results = self._odof_dict.copy() results['FEATURE'] = feature_name results['DRUG_ID'] = odof.drug_id results['DRUG_NAME'] = odof.drug_name results['DRUG_TARGET'] = odof.drug_target results['N_FEATURE_pos'] = odof.Npos results['N_FEATURE_neg'] = odof.Nneg if production is True: # return a dict return results else: # with newer version of pandas (v0.19), None are not accepted # anymore for k in results.keys(): if results[k] is None: results[k] = np.nan df = pd.DataFrame(results, index=[1]) return df # IMPORTANT: the order of the factors in the formula # is important. It does not change the total sum of square errors # but may change individual effects of the categorical components. # If a formula is provided, use statsmodels. Since it is slowish, # we implemented several cases as described in the doc for the 4 # following cases: # - TISSUE + MSI +MEDIA + FEATURE # - TISSUE + MSI + FEATURE # - MSI + FEATURE # - FEATURE if self.settings.regression_formula not in ["auto", None, ""]: # This populates the anova_pvalues attribute itself _ = self.anova_one_drug_one_feature_custom(drug_id, feature_name, formula= self.settings.regression_formula, odof=odof) results = self._set_odof_results(self.anova_pvalues, odof) elif self.settings.analysis_type == 'PANCAN': # IMPORTANT: tissues are sorted alphabetically in R aov # function. Same in statsmodels but capitalised names # are sorted differently. In R, a<b<B<c but in Python, # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R, # not in python. Since in a linear regression # models, the order of the factor matters and the first # factor is used as a reference, we decided to use same # convention as in R. # see http://statsmodels.sourceforge.net/devel/contrasts.html # for a good explanation # We could use pd.get_dummies but pretty slow # instead we create the full matrix in init() method. # One issue is that some columns end up with sum == 0 # and needs to be dropped. df = self._tissue_dummies.loc[odof.masked_tissue.index] todrop = df.columns[df.values.sum(axis=0) == 0] if len(todrop) > 0: # use if since drop() is slow df = df.drop(todrop, axis=1) tissues = [x for x in df.columns if x.startswith('C(tissue')] df.drop(tissues[0], axis=1, inplace=True) # Here we set other variables with dataframe columns' names as # expected by OLS. if self.settings.include_media_factor == False: # make sure the media factor is not included todrop = [x for x in df.columns if x.startswith('C(media)')] df = df.drop(todrop, axis=1) else: # drop the first one for the regression medias = [x for x in df.columns if x.startswith('C(media')] if len(medias): df.drop(medias[0], axis=1, inplace=True) df['C(msi)[T.1]'] = odof.masked_msi.values df['feature'] = odof.masked_features # The regression itself self.data_lm = OLS(odof.Y, df.values).fit() # The ANOVA self.anova_pvalues = self._get_anova_summary(self.data_lm, odof=odof) results = self._set_odof_results(self.anova_pvalues, odof) elif self.settings.include_MSI_factor is True: df = DummyDF() df.values = np.ones((3, odof.Npos + odof.Nneg)) df.values[1] = odof.masked_msi.values df.values[2] = odof.masked_features df.values = df.values.T # The regression itself self.data_lm = OLS(odof.Y, df.values).fit() # The ANOVA itself self.anova_pvalues = self._get_anova_summary(self.data_lm, odof=odof) results = self._set_odof_results(self.anova_pvalues, odof) else: df = DummyDF() df.values = np.ones((2, odof.Npos + odof.Nneg)) df.values[1] = odof.masked_features df.values = df.values.T # The regression itself self.data_lm = OLS(odof.Y, df.values).fit() # The ANOVA itself self.anova_pvalues = self._get_anova_summary(self.data_lm, odof=odof) results = self._set_odof_results(self.anova_pvalues, odof) key = str(drug_id) + "__" + feature_name if self.sampling and key not in self.pvalues_features.keys(): # This can be computed for a drug once for all # no need to redo it for each feature ? # If the length of Y is too small (e.g., < 20) the results may not be # great. This can be check zith the errors self.samples1 = [] self.samples2 = [] self.samples3 = [] Y = odof.Y.copy() N = self.sampling pb = Progress(N, 20) for i in range(0, N): # To get the random distribution, shuffle Y # and noise not required # To get the noise effects, do not shuffle and set noise to # something different from 0 noise = 0.0 pylab.shuffle(Y) #data_lm = OLS(Y, df.values).fit() data_lm = OLS(Y+noise*pylab.randn(len(Y)), df.values).fit() anova_pvalues = self._get_anova_summary(data_lm, output='dict', odof=odof) try:self.samples1.append(anova_pvalues['msi']) except:pass self.samples2.append(anova_pvalues['feature']) try:self.samples3.append(anova_pvalues['tissue']) except:pass #pb.animate(i+1) import fitter ff = fitter.Fitter(-pylab.log10(self.samples2)) dist = "genexpon" ff.distributions = [dist] ff.fit() self.pvalues_features[key] = { 'error': ff.df_errors.loc[dist].values[0], 'params': ff.fitted_param[dist], 'feature': feature_name, 'N':len(Y)} if show is True: boxplot = BoxPlots(odof, savefig=self.settings.savefig, directory=directory, fontsize=fontsize) boxplot.boxplot_association(fignum=1) # a boxplot to show cell lines effects. This requires # the settings.analyse_type to be PANCAN if self.settings.analysis_type == 'PANCAN': boxplot.boxplot_pancan(fignum=2, mode='tissue') if self.settings.include_MSI_factor: boxplot.boxplot_pancan(fignum=3, mode='msi') if self.settings.include_media_factor: boxplot.boxplot_pancan(fignum=3, mode='media') # about 30% of the time spent in creating the DataFrame... if production is True: return results else: # with newer version of pandas (v0.19), None are not accepted # anymore for k in results.keys(): if results[k] is None: results[k] = np.nan df = pd.DataFrame(results, index=[1]) return df
def _set_odof_results(self, pvalues, odof): # Store the pvalues. Note that some may be missing so we use try # except, which is faster than if/else try:pvalues = pvalues["PR(>F)"] except:pass try: tissue_PVAL = pvalues['tissue'] except: tissue_PVAL = None try: MSI_PVAL = pvalues['msi'] except: MSI_PVAL = None try: FEATURE_PVAL = pvalues['feature'] except: FEATURE_PVAL = None try: MEDIA_PVAL = pvalues['media'] except: MEDIA_PVAL = None results = {'FEATURE': odof.feature_name, 'DRUG_ID': odof.drug_id, 'DRUG_NAME': odof.drug_name, 'DRUG_TARGET': odof.drug_target, 'N_FEATURE_pos': odof.Npos, 'N_FEATURE_neg': odof.Nneg, 'FEATURE_pos_logIC50_MEAN': odof.pos_IC50_mean, 'FEATURE_neg_logIC50_MEAN': odof.neg_IC50_mean, 'FEATURE_delta_MEAN_IC50': odof.delta_mean_IC50, 'FEATURE_pos_IC50_sd': odof.pos_IC50_std, 'FEATURE_neg_IC50_sd': odof.neg_IC50_std, 'FEATURE_IC50_effect_size': odof.effectsize_ic50, 'FEATURE_pos_Glass_delta': odof.pos_glass, 'FEATURE_neg_Glass_delta': odof.neg_glass, 'ANOVA_FEATURE_pval': FEATURE_PVAL, 'ANOVA_TISSUE_pval': tissue_PVAL, 'ANOVA_MSI_pval': MSI_PVAL, 'ANOVA_MEDIA_pval': MEDIA_PVAL, 'FEATURE_IC50_T_pval': odof.ttest # pvalues is in index 1 } return results
[docs] def anova_one_drug_one_feature_custom(self, drug_id, feature_name, formula, odof=None): """Same as :meth:`anova_one_drug_one_feature` but allows any formula :return: full ANOVA table but also populate interal attribute anova_pvalues that is a dictionary with pvalues for feature, media, msi and tissue Formula must be set in the settings attribute as follows:: an = ANOVA(...) an.settings['regression_formula'] = "Y ~ C(tissue) + feature" .. note:: This function is convenient but 3-4 times slower than :meth:`anova_one_drug_one_feature`. So if your formula are one of:: "Y ~ C(tissue) + C(media) + C(msi) + feature" "Y ~ C(tissue) + C(msi) + feature" "Y ~ C(msi) + feature" "Y ~ feature" you should rather use :meth:`anova_one_drug_one_feature` instead (keeping regression_formula set to 'auto'). By default, in categories, the first treatment (e.g tissue) is used as a reference and is not shown in the results. You may set the reference as follows:: "Y ~ C(tissue, Treatment(reference='breast'))" ANOVA pvalues returned are of type I .. versionadded:: 0.15.0 """ import statsmodels.formula.api as smf from statsmodels.stats.api import anova_lm if odof is None: odof = self._get_one_drug_one_feature_data(drug_id, feature_name) df = pd.DataFrame({ 'Y':odof.Y, 'feature': odof.masked_features}) # Add other categorical explanatory variables if available try: df['tissue'] = odof.masked_tissue.values except: pass try: df['msi'] = odof.masked_msi.values except: pass try: df['media'] = odof.masked_media.values except:pass # "Y ~ C(tissue) + C(msi) + C(media) + feature" assert "Y" in formula, "Y must be the LHS of the formula" # This returns a Model instance model = smf.ols(formula, data=df) self._debug_custom_df = df self._debug_custom_model = model anova = anova_lm(model.fit(), typ=1) anova_pvalues = {} for k,v in anova['PR(>F)'].iteritems(): if k == 'C(tissue)': anova_pvalues['tissue'] = v elif k == 'C(msi)': anova_pvalues['msi'] = v elif k == 'C(media)': anova_pvalues['media'] = v elif k == 'feature': anova_pvalues['feature'] = v self.anova_pvalues = anova_pvalues return anova
# no need to optimise anymore def _get_anova_summary(self, data_lm, output='dict', odof=None): """ an = ANOVA(...) an.anova_one_drug_one_feature(1047, "ABCB1_mut") an._get_anova_summary(an.data_lm, output="dataframe", odof=an.odof) should be identical to an.anova_one_drug_one_feature_custom(1047, "ABCB1_mut", formula="Y ~ C(tissue) + C(msi) + feature") """ q, r = np.linalg.qr(data_lm.model.data.exog) effects = np.dot(q.T, data_lm.model.data.endog) # In the regression, the first tissue is dropped hence -1 # Similarly for the MEDIA factor modes = self._get_analysis_mode() # create the W matrix using tissue and MSI if requested # default is that the 3 features are used if 'tissue' in modes and 'msi' in modes and 'media' in modes: Ntissue = len(odof.masked_tissue.unique()) - 1 Nmedia = len(odof.masked_media.unique()) -1 dof = [Ntissue, Nmedia, 1, 1] self._debug_dof = dof Ncolumns = sum(dof) + 1 # intercept added indices = ['tissue', 'media', 'msi', 'feature', 'Residuals'] # 4 stands for intercept + tissue + msi +feature arr = np.zeros((5, Ncolumns)) arr[1, slice(1, Ntissue+1)] = 1 arr[2, slice(Ntissue+1, Ntissue+Nmedia+1)] = 1 arr[3, Ntissue + Nmedia + 1] = 1 arr[4, Ntissue + Nmedia + 2] = 1 elif 'tissue' in modes and 'msi' in modes: Ntissue = len(odof.masked_tissue.unique()) - 1 dof = [Ntissue, 1, 1] indices = ['tissue', 'msi', 'feature', 'Residuals'] # 4 stands for intercept + tissue + msi +feature Ncolumns = sum(dof) + 1 # intercept added arr = np.zeros((4, Ncolumns)) arr[1, slice(1, Ntissue+1)] = 1 arr[2, Ntissue + 1] = 1 arr[3, Ntissue + 2] = 1 elif 'tissue' not in modes and 'msi' in modes: dof = [1, 1] Ncolumns = sum(dof) + 1 # intercept added indices = ['msi', 'feature', 'Residuals'] # 3 stands for intercept + msi +feature arr = np.zeros((3, Ncolumns)) arr[1, 1] = 1 arr[2, 2] = 1 elif 'tissue' not in modes and 'msi' not in modes: dof = [1] Ncolumns = sum(dof) + 1 # intercept added indices = ['feature', 'Residuals'] # 3 stands for intercept + msi +feature arr = np.zeros((2, Ncolumns)) arr[1, 1] = 1 else: raise NotImplementedError(""" This combo %s is not implemented in the "auto" mode. See http://gdsctools.readthedocs.io/en/master/anova_parttwo.html for available combos of variables. In short, MSI must be included. Note, however that if you wish to use your own formula, you can set it in settings.regression_formula ; this is simply be slower as compared to the standard regression. Here is an example of a formula: Y ~ C(tissue) + C(media) + feature """ % modes) arr[0, 0] = 1 # intercept self._debug_arr = arr self._debug_effects = effects sum_sq = np.dot(arr, effects**2) sum_sq = sum_sq[1:] # drop the intercep mean_sq = sum_sq / np.array(dof) Fvalues = mean_sq / (data_lm.ssr / data_lm.df_resid) F_pvalues = scipy.stats.f.sf(Fvalues, dof, data_lm.df_resid) sum_sq = np.append(sum_sq, data_lm.ssr) mean_sq = np.append(mean_sq, data_lm.mse_resid) F_pvalues = np.append(F_pvalues, None) Fvalues = np.append(Fvalues, None) dof.append(data_lm.model.df_resid) #indices.append('Residuals') # dataframe is slow, return just the dict of pvalues by default if output == 'dataframe': anova = pd.DataFrame({ 'Sum Sq': sum_sq, 'Mean Sq': mean_sq, 'Df': dof, 'F value': Fvalues, 'PR(>F)': F_pvalues}, index=indices, columns=['Df', 'Sum Sq', 'Mean Sq', 'F value', 'PR(>F)']) return anova elif self.settings.analysis_type == 'PANCAN': if self.settings.include_media_factor: dd = {'tissue': F_pvalues[0], 'media': F_pvalues[1], 'msi': F_pvalues[2], 'feature': F_pvalues[3]} else: dd = {'tissue': F_pvalues[0], 'msi':F_pvalues[1], 'feature':F_pvalues[2]} return dd elif self.settings.include_MSI_factor is True: return {'msi': F_pvalues[0], 'feature': F_pvalues[1]} else: return {'feature': F_pvalues[0]}
[docs] def anova_one_drug(self, drug_id, animate=True, output='object'): """Computes ANOVA for a given drug across all features :param str drug_id: a valid drug identifier. :param animate: shows the progress bar :return: a dataframe Calls :meth:`anova_one_drug_one_feature` for each feature. """ # drop first and second columns that are made of strings # works under python2 but not python 3. Assume that the 2 first #columns are the sample name and tissue feature # Then, we keep only cases with at least 3 features. # MSI could be used but is not like in original R code. features = self.features.df.copy() # need to skip the FACTOR to keep only features shift = self.features.shift features = features[features.columns[shift:]] # FIXME what about features with less than 3 zeros ? mask = features.sum(axis=0) >= 3 # TODO: MSI, tissues, name must always be kept # selected_features = features[features.columns[mask]] # scan all features for a given drug assert drug_id in self.ic50.df.columns N = len(selected_features.columns) pb = Progress(N, 10) res = {} # for i, feature in enumerate(selected_features.columns): # production True, means we do not want to create a DataFrame # for each call to the anova_one_drug_one_feature function # Instead, we require dictionaries this = self.anova_one_drug_one_feature(drug_id, feature, production=True) if this['ANOVA_FEATURE_pval'] is not None: res[feature] = this if animate is True: pb.animate(i+1) # if production is False: # df = pid.concat(res, ignore_index=True) df = pd.DataFrame.from_records(res) df = df.T df = ANOVAResults().astype(df) if len(df) == 0: return df # append DRUG_NAME/DRUG_TARGET columns df = self.drug_decode.drug_annotations(df) # TODO: drop rows where ANOVA_FEATURE_PVAL is None if output != 'object': df = self.add_pvalues_correction(df) return df else: df = self.add_pvalues_correction(df) res = ANOVAResults(df, self.settings) res.settings = ANOVASettings(**self.settings) return res
[docs] def anova_all(self, animate=True, drugs=None, multicore=None): """Run all ANOVA tests for all drugs and all features. :param drugs: you may select a subset of drugs :param animate: shows the progress bar :return: an :class:`~gdsctools.anova_results.ANOVAResults` instance with the dataframe stored in an attribute called **df** Calls :meth:`anova_one_drug` for each drug and concatenate all results together. Note that once all data are gathered, :meth:`add_pvalues_correction` is called to fill a new column with FDR corrections. An extra column named "ASSOC_ID" is also added with a unique identifer sorted by ascending FDR. .. note:: A thorough comparison with version v17 gives the same FDR results (difference ~1e-6); Note however that the qvalue results differ by about 0.3% due to different smoothing in R and Python. """ if self.verbose and len(self.individual_anova): print("Reusing some results from the buffer. " "To reset the buffer, call reset_buffer() method") # drop DRUG where number of IC50 (non-null) is below 5 # axis=0 is default but we emphasize that sum is over # column (i.e. drug vv = (self.ic50.df.isnull() == False).sum(axis=0) # FIXME: should be in one_drug_one_feature ?? drug_names = vv.index[vv >= self.settings.minimum_nonna_ic50] # if user provided a list of drugs, use them: if drugs is not None: # todo: check valifity of the drug names drug_names = drugs[:] pb = Progress(len(drug_names), 1) drug_names = list(drug_names) #pylab.shuffle(drug_names) # ? why if animate is True: pb.animate(0) if multicore: # Note that here, we do not use the buffer multicore_analysis(self, drug_names, multicore) else: for i, drug_name in enumerate(drug_names): if drug_name in self.individual_anova.keys(): pass else: res = self.anova_one_drug(drug_name, animate=False, output='dataframe') self.individual_anova[drug_name] = res if animate is True: pb.animate(i+1) print("\n") if len(self.individual_anova) == 0: return ANOVAResults() df = pd.concat(self.individual_anova, ignore_index=True) if len(df) == 0: return df # sort all data by ANOVA p-values try: df.sort_values('ANOVA_FEATURE_pval', inplace=True) except: df.sort('ANOVA_FEATURE_pval', inplace=True) # all ANOVA have been computed individually for each drug and each # feature. Now, we need to compute the multiple testing corrections if self.settings.pvalue_correction_level is True: df = self.add_pvalues_correction(df) else: pass # insert a unique identifier as first column df.insert(0, 'ASSOC_ID', range(1, len(df) + 1)) self.df = df # order the column names as defined in the __init__ method df = df[self.column_names] df.reset_index(inplace=True, drop=True) return ANOVAResults(df, self.settings)
[docs] def add_pvalues_correction(self, df, colname='ANOVA_FEATURE_pval'): """Compute and add corrected pvalues in column ANOVA_FEATURE_FDR :param df: a dataframe with a column named after colname (defaults to ANOVA_FEATURE_pval). The output of :meth:`anova_all` contains such a dataframe. :param str colname: name of the column that contains the pvalues to be corrected. The default multiple testing correction (FDR correction) is stored in :attr:`settings.pvalue_correction_method` and can be changed to other methods (e.g., **qvalue**). The results in stored in a column named **ANOVA_FEATURE_FDR** inside the input dataframe **df**. Values are in the range 0 to 1. .. seealso:: :meth:`anova_all`, :class:`~gdsctools.stats.MultipleTesting` """ if len(df) == 0: return # extract pvalues data = df[colname].values # set the method and compute new pvalues self.multiple_testing.method = self.settings.pvalue_correction_method new_pvalues = self.multiple_testing.get_corrected_pvalues(data) new_pvalues *= 100 # insert new columns. colname = 'ANOVA_FEATURE_FDR' try: df.insert(len(df.columns), colname, new_pvalues) except: # replaces it otherwise df[colname] = new_pvalues return df
[docs] def reset_buffer(self): """Reset the buffer used to store the results When calling :meth:`anova_all`, the method :meth:`anova_one_drug` is called for each drug and the results saved in the :attr:`individual_anova` attribute. If called again, the results are simply returned from the buffer and not recomputed. If you change a settings that affects the analysis and therefore the results, then you should call this method. """ self.individual_anova = {}
def analyse_one_drug(master, drug): """Function used by :func:`multicore_analysis` :param master: an instance of :class:`ANOVA` :param drug: a valid drug name """ if drug in master.individual_anova.keys(): res = master.individual_anova[drug] else: res = master.anova_one_drug(drug_id=drug, animate=False, output="dataframe") return (drug, res) def multicore_analysis(anova, drugs, maxcpu=2): """Function used by :class:`ANOVA` to perform multiprocess analysis :param anova: an instance of :class:`ANOVA` :param list drugs: list of drugs to analyse :param int maxcpu: number of CPU to use :return: the instance itself with the individual_anova attribute filled with all results """ t = MultiProcessing(maxcpu=maxcpu) for i, drug in enumerate(drugs): if drug not in anova.individual_anova.keys(): t.add_job(analyse_one_drug, anova, drug) t.run() # populate the ANOVA instance with the results for this in t.results: drug = this[0] result = this[1] anova.individual_anova[drug] = result return anova