# coding=utf-8
# -*- python -*-
#
# This file is part of GDSCTools software
#
# Copyright (c) 2015 - Wellcome Trust Sanger Institute
# All rights reserved
#
# File author(s): Thomas Cokelaer <cokelaer@gmail.com>
#
# Distributed under the BSD 3-Clause License.
# See accompanying file LICENSE.txt distributed with this software
#
# website: http://github.com/CancerRxGene/gdsctools
#
##############################################################################
"""Code related to the ANOVA analysis to find associations between drug IC50s
and genomic features"""
from statsmodels.stats import multitest
import easydev
import numpy as np
from gdsctools.qvalue import QValue
__all__ = ['MultipleTesting', 'cohens', "signed_effects"]
def multiple_correction(pvalues, method='fdr'):
mt = MultipleTesting(method=method)
values = mt.get_corrected_pvalues(pvalues, method=None)
return values
[docs]class MultipleTesting(object):
"""This class eases the computation of multiple testing corrections
The method implemented so far are based on statsmodels or a local
implementation of **qvalue** method.
================ =============================================
method name Description
================ =============================================
bonferroni one-step correction
sidak one-step correction
holm-sidak step down method using Sidak adjustments
holm step down method using Bonferroni adjustments
simes-hochberg step up method (independent)
hommel close method based on Simes tests (non
negative)
fdr_bh FDR Benjamini-Hochberg (non-negative)
fdr_by FDR Benjamini-Yekutieli (negative)
fdr_tsbky FDR 2-stage Benjamini-Krieger-Yekutieli
non negative
frd_tsbh FDR 2-stage Benjamini-Hochberg'
non-negative
fdr same as fdr_bh
qvalue see :class:`~gdsctools.qvalue.QValue` class
================ =============================================
.. seealso:: :mod:`gdsctools.qvalue`.
.. seealso:: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2907892/
"""
def __init__(self, method=None):
""".. rubric:: Constructor
:param method: default to **fdr** that is the FDR Benjamini-Hochberg
correction.
"""
#: set of valid methods
self.valid_methods = ['bonferroni', 'sidak', 'fdr_by',
'holm-sidak', 'simes-hochberg', 'hommel', 'fdr_bh',
'fdr_tsbh', 'fdr_tsbky', 'fdr', 'qvalue']
self._method = 'fdr'
if method is not None:
self.method = method
# parameter of the multiple test (e.g. used if method is bonferroni
self.alpha = 0.1
def _get_method(self):
return self._method
def _set_method(self, method):
easydev.check_param_in_list(method, self.valid_methods)
if method == 'fdr':
method = 'fdr_bh'
self._method = method
method = property(_get_method, _set_method, doc="get/set method")
[docs] def get_corrected_pvalues(self, pvalues, method=None):
"""Return corrected pvalues
:param list pvalues: list or array of pvalues to correct.
:param method: use the one defined in the constructor by default
but can be overwritten here
"""
if method is not None:
self.method = method
pvalues = np.array(pvalues)
if self.method == 'qvalue':
qv = QValue(pvalues)
corrections = qv.qvalue()
return corrections
else:
corrections = multitest.multipletests(pvalues,
alpha=self.alpha, method=self.method)[1]
return corrections
[docs] def plot_comparison(self, pvalues, methods=None):
"""Simple plot to compare the pvalues correction methods
.. plot::
:include-source:
:width: 80%
from gdsctools.stats import MultipleTesting
mt = MultipleTesting()
pvalues = [1e-10, 9.5e-2, 2.2e-1, 3.6e-1, 5e-1, 6e-1,8e-1,9.6e-1]
mt.plot_comparison(pvalues,
methods=['fdr_bh', 'qvalue', 'bonferroni', 'fdr_tsbh'])
.. note:: in that example, the qvalue and FDR are identical, but
this is not true in general.
"""
if methods is None:
methods = self.valid_methods
import pylab
pylab.clf()
for method in methods:
pv = self.get_corrected_pvalues(pvalues, method=method)
pylab.plot(pvalues, pv, 'o-', label=method.replace("_","\_"))
pylab.legend(loc='best')
pylab.ylabel('corrected pvalues')
pylab.grid()
pylab.ylim([0, 1.05])
[docs]def cohens(x, y):
r"""Effect size metric through Cohen's *d* metric
:param x: first vector
:param y: second vector
:return: absolute effect size value
The Cohen's effect size *d* is defined as the difference
between two means divided by a standard deviation of the data.
.. math::
d = \frac{\bar{x}_1 - \bar{x}_2}{s}
For two independent samples, the *pooled standard deviation* is used
instead, which is defined as:
.. math::
s = \sqrt{ \frac{(n_1-1)s_1^2 + (n_2-1)s_2^2}{n_1+n_2-2} }
A Cohen's *d* is frequently used in estimating sample sizes for
statistical testing: a lower *d* value indicates the necessity of
larger sample sizes, and vice versa.
.. note:: we return the absolute value
:references: https://en.wikipedia.org/wiki/Effect_size
"""
x = np.array(x)
y = np.array(y)
Nx = len(x) - 1. # note the dot to cast to float
Ny = len(y) - 1.
# mean difference:
md = np.abs(x.mean() - y.mean())
# here, we want same as in R that is unbiased variance
# so we use ddof = 1
xv = x.var(ddof=1)
yv = y.var(ddof=1)
csd = Nx * xv + Ny * yv
csd /= Nx + Ny # make sure this is float
csd = np.sqrt(csd)
return md / csd
def glass(x, y):
r"""Return Effect size through Glass :math:`\Delta` estimator
:param x: first sample
:param y: second sample
:return: 2 values (one or each sample)
The Glass effect size is computed as
.. math::
\Delta = \frac{\bar{x}_1-\bar{x}_2}{\sigma_i}
.. note:: the standard deviation is the unbiased one (divided by N-1)
where :math:`\sigma` is the standard deviation of either group
"""
x = np.array(x)
y = np.array(y)
# mean difference:
md = np.abs(x.mean() - y.mean())
# here, we want same as in R that is unbiased variance
# so we use ddof = 1
g1 = md / x.std(ddof=1)
g2 = md / y.std(ddof=1)
return g1, g2
[docs]def signed_effects(df):
import numpy as np
_colname_deltas = 'FEATURE_delta_MEAN_IC50'
_colname_effect_size = 'FEATURE_IC50_effect_size'
deltas = df[_colname_deltas]
effects = df[_colname_effect_size]
signed_effects = list(np.sign(deltas) * effects)
return signed_effects