Source code for gdsctools.qvalue

# -*- python -*-
# -*- coding utf-8 -*-

#  This file is part of GDSCTools software
#
#  Copyright (c) 2015 - Wellcome Trust Sanger Institute
#  All rights reserved
#
#  File author(s): Thomas Cokelaer <cokelaer@gmail.comWE HERE>
#
#  Distributed under the BSD 3-Clause License.
#  See accompanying file LICENSE.txt distributed with this software
#
#  website: http://github.com/CancerRxGene/gdsctools
#
##############################################################################
"""Implementation of qvalue estimate

Author: Thomas Cokelaer
"""
# inspiration
# source: https://github.com/nfusi/qvalue/blob/master/qvalue/qvalue.py
# and qvalue (R package)
import numpy as np
import scipy.interpolate


[docs]class QValue(object): """Compute Q-value for a given set of P-values """ def __init__(self, pv, lambdas=None, pi0=None, df=3, method='smoother', smooth_log_pi0=False, verbose=True): """.. rubric:: Constructor The q-value of a test measures the proportion of false positives incurred (called the false discovery rate or FDR) when that particular test is called significant. :param pv: A vector of p-values (only necessary input) :param lambdas: The value of the tuning parameter to estimate pi_0. Must be in [0,1). Can be a single value or a range of values. If none, the default is a range from 0 to 0.9 with a step size of 0.05 (inluding 0 and 0.9) :param method: Either "smoother" or "bootstrap"; the method for automatically choosing tuning parameter in the estimation of pi_0, the proportion of true null hypotheses. Only smoother implemented for now. :param df: Number of degrees-of-freedom to use when estimating pi_0 with a smoother (default to 3 i.e., cubic interpolation.) :param float pi0: if None, it's estimated as suggested in Storey and Tibshirani, 2003. May be provided, which is convenient for testing. :param smooth_log_pi0: If True and 'pi0_method' = "smoother", pi_0 will be estimated by applying a smoother to a scatterplot of log pi_0 rather than just pi_0 .. note:: Estimation of pi0 differs slightly from the one given in R (about 0.3%) due to smoothing.spline function differences between R and SciPy. If no options are selected, then the method used to estimate pi_0 is the smoother method described in Storey and Tibshirani (2003). The bootstrap method is described in Storey, Taylor & Siegmund (2004) but not implemented yet. .. seealso:: :class:`gdsctools.stats.MultipleTesting` """ try: self.pv = np.array(pv) except: self.pv = pv.copy() assert(self.pv.min() >= 0 and self.pv.max() <= 1), \ "p-values should be between 0 and 1" if lambdas is None: epsilon = 1e-8 lambdas = scipy.arange(0,0.9+1e-8,0.05) if len(lambdas)>1 and len(lambdas)<4: raise ValueError("""if length of lambda greater than 1, you need at least 4 values""") if len(lambdas) >= 1 and (min(lambdas)<0 or max(lambdas)>=1): raise ValueError("lambdas must be in the range[0, 1[") self.m = float(len(self.pv)) self.df = df self.lambdas = lambdas self.method = method self.verbose = verbose self.smooth_log_pi0 = smooth_log_pi0 self.pi0 = self.estimate_pi0(pi0)
[docs] def estimate_pi0(self, pi0): """Estimate pi0 based on the pvalues""" pv = self.pv.ravel() # flatten array if pi0 is not None: pass elif len(self.lambdas) == 1: pi0 = np.mean(pv >= self.lambdas[0])/(1-self.lambdas[0]) pi0 = min(pi0, 1) else: # evaluate pi0 for different lambdas pi0 = [np.mean(pv>=this)/(1-this) for this in self.lambdas] # in R # lambda = seq(0,0.09, 0.1) # pi0 = c(1.0000000, 0.9759067, 0.9674164, 0.9622673, 0.9573241, # 0.9573241 0.9558824, 0.9573241, 0.9544406, 0.9457901) # spi0 = smooth.spline(lambda, pi0, df=3, all.knots=F, spar=0) # predict(spi0, x=max(lambda))$y --> 0.9457946 # spi0 = smooth.spline(lambda, pi0, df=3, all.knots=F) # predict(spi0, x=max(lambda))$y --> 0.9485383 # In this function, using pi0 and lambdas, we get 0.9457946 # this is not too bad, the difference on the v17 data set # is about 0.3 % if self.method == 'smoother': if (self.smooth_log_pi0): pi0 = np.log(pi0) # In R, the interpolation is done with smooth.spline # within qvalue. However this is done with default # parameters, and this is different from the Python # code. Note, however, that smooth.spline has a parameter # called spar. If set to 0, then we would get the same # as in scipy. It looks like scipy has no equivalent of # the smooth.spline function in R if spar is not 0 tck = scipy.interpolate.splrep(self.lambdas, pi0, k = self.df) pi0 = scipy.interpolate.splev(self.lambdas[-1], tck) if (self.smooth_log_pi0): pi0 = np.exp(pi0) pi0 = min(pi0, 1.) elif self.method == 'bootstrap': raise NotImplementedError """minpi0 = min(pi0) mse = rep(0, len(lambdas)) pi0.boot = rep(0, len(lambdas)) for i in range(1,100): p.boot = sample(p, size = m, replace = TRUE) for i in range(0,len(lambdas)): pi0.boot[i] <- mean(p.boot > lambdas[i])/(1 - lambdas[i]) mse = mse + (pi0.boot - minpi0)^2 pi0 = min(pi0[mse == min(mse)]) pi0 = min(pi0, 1)""" if pi0 > 1: if self.verbose: print("got pi0 > 1 (%.3f) while estimating qvalues, setting it to 1" % pi0) pi0 = 1.0 assert(pi0 >= 0 and pi0 <= 1), "pi0 is not between 0 and 1: %f" % pi0 return pi0
[docs] def qvalue(self): """Return the qvalues using pvalues stored in :attr:`pv` attribute""" pv = self.pv.ravel() p_ordered = scipy.argsort(pv) pv = pv[p_ordered] qv = self.pi0 * self.m/len(pv) * pv qv[-1] = min(qv[-1],1.0) for i in range(len(pv)-2, -1, -1): qv[i] = min(self.pi0*self.m*pv[i]/(i+1.0), qv[i+1]) # reorder qvalues qv_temp = qv.copy() qv = scipy.zeros_like(qv) qv[p_ordered] = qv_temp # reshape qvalues original_shape = self.pv.shape qv = qv.reshape(original_shape) return qv