Source code for gdsctools.stats

# coding=utf-8
# -*- python -*-
#
#  This file is part of GDSCTools software
#
#  Copyright (c) 2015 - Wellcome Trust Sanger Institute
#  All rights reserved
#
#  File author(s): Thomas Cokelaer <cokelaer@gmail.com>
#
#  Distributed under the BSD 3-Clause License.
#  See accompanying file LICENSE.txt distributed with this software
#
#  website: http://github.com/CancerRxGene/gdsctools
#
##############################################################################
"""Code related to the ANOVA analysis to find associations between drug IC50s
and genomic features"""
from statsmodels.stats import multitest
import easydev
import numpy as np
from gdsctools.qvalue import QValue


__all__ = ['MultipleTesting', 'cohens', "signed_effects"]


def multiple_correction(pvalues, method='fdr'):
    mt = MultipleTesting(method=method)
    values = mt.get_corrected_pvalues(pvalues, method=None)
    return values


[docs]class MultipleTesting(object):
    """This class eases the computation of multiple testing corrections

    The method implemented so far are based on statsmodels or a local
    implementation of **qvalue** method.

    ================    =============================================
    method name         Description
    ================    =============================================
    bonferroni          one-step correction
    sidak               one-step correction
    holm-sidak          step down method using Sidak adjustments
    holm                step down method using Bonferroni adjustments
    simes-hochberg      step up method (independent)
    hommel              close method based on Simes tests (non
                        negative)
    fdr_bh              FDR Benjamini-Hochberg (non-negative)
    fdr_by              FDR Benjamini-Yekutieli (negative)
    fdr_tsbky           FDR 2-stage Benjamini-Krieger-Yekutieli
                        non negative
    frd_tsbh            FDR 2-stage Benjamini-Hochberg'
                        non-negative
    fdr                 same as fdr_bh
    qvalue              see :class:`~gdsctools.qvalue.QValue` class
    ================    =============================================


    .. seealso:: :mod:`gdsctools.qvalue`.

    .. seealso:: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2907892/

    """
    def __init__(self, method=None):
        """.. rubric:: Constructor

        :param method: default to **fdr** that is the FDR Benjamini-Hochberg
            correction.
        """

        #: set of valid methods
        self.valid_methods = ['bonferroni', 'sidak', 'fdr_by',
            'holm-sidak', 'simes-hochberg', 'hommel', 'fdr_bh',
            'fdr_tsbh', 'fdr_tsbky', 'fdr', 'qvalue']

        self._method = 'fdr'
        if method is not None:
            self.method = method
        # parameter of the multiple test (e.g. used if method is bonferroni
        self.alpha = 0.1

    def _get_method(self):
        return self._method
    def _set_method(self, method):
        easydev.check_param_in_list(method, self.valid_methods)
        if method == 'fdr':
            method = 'fdr_bh'
        self._method = method
    method = property(_get_method, _set_method, doc="get/set method")

[docs]    def get_corrected_pvalues(self, pvalues, method=None):
        """Return corrected pvalues

        :param list pvalues: list or array of pvalues to correct.
        :param method: use the one defined in the constructor by default
            but can be overwritten here
        """
        if method is not None:
            self.method = method

        pvalues = np.array(pvalues)

        if self.method == 'qvalue':
            qv = QValue(pvalues)
            corrections = qv.qvalue()
            return corrections
        else:
            corrections = multitest.multipletests(pvalues,
               alpha=self.alpha, method=self.method)[1]
            return corrections

[docs]    def plot_comparison(self, pvalues, methods=None):
        """Simple plot to compare the pvalues correction methods

        .. plot::
            :include-source:
            :width: 80%

            from gdsctools.stats import MultipleTesting
            mt = MultipleTesting()
            pvalues = [1e-10, 9.5e-2, 2.2e-1, 3.6e-1, 5e-1, 6e-1,8e-1,9.6e-1]
            mt.plot_comparison(pvalues,
                methods=['fdr_bh', 'qvalue', 'bonferroni', 'fdr_tsbh'])

        .. note:: in that example, the qvalue and FDR are identical, but
            this is not true in general.

        """
        if methods is None:
            methods = self.valid_methods

        import pylab
        pylab.clf()
        for method in methods:
            pv = self.get_corrected_pvalues(pvalues, method=method)
            pylab.plot(pvalues, pv, 'o-', label=method.replace("_","\_"))
        pylab.legend(loc='best')
        pylab.ylabel('corrected pvalues')
        pylab.grid()
        pylab.ylim([0, 1.05])


[docs]def cohens(x, y):
    r"""Effect size metric through Cohen's *d* metric

    :param x: first vector
    :param y: second vector
    :return: absolute effect size value

    The Cohen's effect size *d* is defined as the difference
    between two means divided by a standard deviation of the data.

    .. math::

        d = \frac{\bar{x}_1 - \bar{x}_2}{s}

    For two independent samples, the *pooled standard deviation* is used
    instead, which is defined as:

    .. math::

        s = \sqrt{  \frac{(n_1-1)s_1^2 + (n_2-1)s_2^2}{n_1+n_2-2} }


    A Cohen's *d* is frequently used in estimating sample sizes for
    statistical testing: a lower *d* value indicates the necessity of
    larger sample sizes, and vice versa.

    .. note:: we return the absolute value

    :references: https://en.wikipedia.org/wiki/Effect_size

    """
    x = np.array(x)
    y = np.array(y)

    Nx = len(x) - 1.  # note the dot to cast to float
    Ny = len(y) - 1.
    # mean difference:
    md = np.abs(x.mean() - y.mean())
    # here, we want same as in R that is unbiased variance
    # so we use ddof = 1
    xv = x.var(ddof=1)
    yv = y.var(ddof=1)
    csd = Nx * xv + Ny * yv
    csd /= Nx + Ny  # make sure this is float
    csd = np.sqrt(csd)

    return md / csd


def glass(x, y):
    r"""Return Effect size through Glass :math:`\Delta` estimator

    :param x: first sample
    :param y: second sample
    :return: 2 values (one or each sample)

    The Glass effect size is computed as

    .. math::


        \Delta = \frac{\bar{x}_1-\bar{x}_2}{\sigma_i}

    .. note:: the standard deviation is the unbiased one (divided by N-1)

    where :math:`\sigma` is the standard deviation of either group

    """
    x = np.array(x)
    y = np.array(y)

    # mean difference:
    md = np.abs(x.mean() - y.mean())

    # here, we want same as in R that is unbiased variance
    # so we use ddof = 1
    g1 = md / x.std(ddof=1)
    g2 = md / y.std(ddof=1)

    return g1, g2


[docs]def signed_effects(df):
    import numpy as np
    _colname_deltas = 'FEATURE_delta_MEAN_IC50'
    _colname_effect_size = 'FEATURE_IC50_effect_size'
    deltas = df[_colname_deltas]
    effects = df[_colname_effect_size]
    signed_effects = list(np.sign(deltas) * effects)
    return signed_effects