# -*- python -*-
# -*- coding utf-8 -*-
# This file is part of GDSCTools software
#
# Copyright (c) 2015 - Wellcome Trust Sanger Institute
# All rights reserved
#
# File author(s): Thomas Cokelaer <cokelaer@gmail.comWE HERE>
#
# Distributed under the BSD 3-Clause License.
# See accompanying file LICENSE.txt distributed with this software
#
# website: http://github.com/CancerRxGene/gdsctools
#
##############################################################################
"""Implementation of qvalue estimate
Author: Thomas Cokelaer
"""
# inspiration
# source: https://github.com/nfusi/qvalue/blob/master/qvalue/qvalue.py
# and qvalue (R package)
import numpy as np
import scipy.interpolate
[docs]class QValue(object):
"""Compute Q-value for a given set of P-values
"""
def __init__(self, pv, lambdas=None,
pi0=None, df=3, method='smoother',
smooth_log_pi0=False, verbose=True):
""".. rubric:: Constructor
The q-value of a test measures the proportion of false
positives incurred (called the false discovery rate or FDR)
when that particular test is called significant.
:param pv: A vector of p-values (only necessary input)
:param lambdas: The value of the tuning parameter to estimate pi_0.
Must be in [0,1). Can be a single value or a range of values.
If none, the default is a range from 0 to 0.9 with a step size of
0.05 (inluding 0 and 0.9)
:param method: Either "smoother" or "bootstrap"; the method for
automatically choosing tuning parameter in the estimation of
pi_0, the proportion of true null hypotheses. Only smoother
implemented for now.
:param df: Number of degrees-of-freedom to use when
estimating pi_0 with a smoother (default to 3 i.e., cubic
interpolation.)
:param float pi0: if None, it's estimated as suggested in Storey and
Tibshirani, 2003. May be provided, which is convenient for testing.
:param smooth_log_pi0: If True and 'pi0_method' = "smoother",
pi_0 will be estimated by applying a smoother to a
scatterplot of log pi_0 rather than just pi_0
.. note:: Estimation of pi0 differs slightly from the one given in R
(about 0.3%) due to smoothing.spline function differences between
R and SciPy.
If no options are selected, then the method used to estimate pi_0
is the smoother method described in Storey and Tibshirani (2003).
The bootstrap method is described in Storey, Taylor & Siegmund
(2004) but not implemented yet.
.. seealso:: :class:`gdsctools.stats.MultipleTesting`
"""
try:
self.pv = np.array(pv)
except:
self.pv = pv.copy()
assert(self.pv.min() >= 0 and self.pv.max() <= 1), \
"p-values should be between 0 and 1"
if lambdas is None:
epsilon = 1e-8
lambdas = scipy.arange(0,0.9+1e-8,0.05)
if len(lambdas)>1 and len(lambdas)<4:
raise ValueError("""if length of lambda greater than 1, you need at least 4 values""")
if len(lambdas) >= 1 and (min(lambdas)<0 or max(lambdas)>=1):
raise ValueError("lambdas must be in the range[0, 1[")
self.m = float(len(self.pv))
self.df = df
self.lambdas = lambdas
self.method = method
self.verbose = verbose
self.smooth_log_pi0 = smooth_log_pi0
self.pi0 = self.estimate_pi0(pi0)
[docs] def estimate_pi0(self, pi0):
"""Estimate pi0 based on the pvalues"""
pv = self.pv.ravel() # flatten array
if pi0 is not None:
pass
elif len(self.lambdas) == 1:
pi0 = np.mean(pv >= self.lambdas[0])/(1-self.lambdas[0])
pi0 = min(pi0, 1)
else:
# evaluate pi0 for different lambdas
pi0 = [np.mean(pv>=this)/(1-this) for this in self.lambdas]
# in R
# lambda = seq(0,0.09, 0.1)
# pi0 = c(1.0000000, 0.9759067, 0.9674164, 0.9622673, 0.9573241,
# 0.9573241 0.9558824, 0.9573241, 0.9544406, 0.9457901)
# spi0 = smooth.spline(lambda, pi0, df=3, all.knots=F, spar=0)
# predict(spi0, x=max(lambda))$y --> 0.9457946
# spi0 = smooth.spline(lambda, pi0, df=3, all.knots=F)
# predict(spi0, x=max(lambda))$y --> 0.9485383
# In this function, using pi0 and lambdas, we get 0.9457946
# this is not too bad, the difference on the v17 data set
# is about 0.3 %
if self.method == 'smoother':
if (self.smooth_log_pi0):
pi0 = np.log(pi0)
# In R, the interpolation is done with smooth.spline
# within qvalue. However this is done with default
# parameters, and this is different from the Python
# code. Note, however, that smooth.spline has a parameter
# called spar. If set to 0, then we would get the same
# as in scipy. It looks like scipy has no equivalent of
# the smooth.spline function in R if spar is not 0
tck = scipy.interpolate.splrep(self.lambdas, pi0,
k = self.df)
pi0 = scipy.interpolate.splev(self.lambdas[-1], tck)
if (self.smooth_log_pi0):
pi0 = np.exp(pi0)
pi0 = min(pi0, 1.)
elif self.method == 'bootstrap':
raise NotImplementedError
"""minpi0 = min(pi0)
mse = rep(0, len(lambdas))
pi0.boot = rep(0, len(lambdas))
for i in range(1,100):
p.boot = sample(p, size = m, replace = TRUE)
for i in range(0,len(lambdas)):
pi0.boot[i] <- mean(p.boot > lambdas[i])/(1 - lambdas[i])
mse = mse + (pi0.boot - minpi0)^2
pi0 = min(pi0[mse == min(mse)])
pi0 = min(pi0, 1)"""
if pi0 > 1:
if self.verbose:
print("got pi0 > 1 (%.3f) while estimating qvalues, setting it to 1" % pi0)
pi0 = 1.0
assert(pi0 >= 0 and pi0 <= 1), "pi0 is not between 0 and 1: %f" % pi0
return pi0
[docs] def qvalue(self):
"""Return the qvalues using pvalues stored in :attr:`pv` attribute"""
pv = self.pv.ravel()
p_ordered = scipy.argsort(pv)
pv = pv[p_ordered]
qv = self.pi0 * self.m/len(pv) * pv
qv[-1] = min(qv[-1],1.0)
for i in range(len(pv)-2, -1, -1):
qv[i] = min(self.pi0*self.m*pv[i]/(i+1.0), qv[i+1])
# reorder qvalues
qv_temp = qv.copy()
qv = scipy.zeros_like(qv)
qv[p_ordered] = qv_temp
# reshape qvalues
original_shape = self.pv.shape
qv = qv.reshape(original_shape)
return qv