# coding=utf-8
# -*- python -*-
#
# This file is part of GDSCTools software
#
# Copyright (c) 2015 - Wellcome Trust Sanger Institute
# All rights reserved
#
# File author(s): Thomas Cokelaer <cokelaer@gmail.com>
#
# Distributed under the BSD 3-Clause License.
# See accompanying file LICENSE.txt distributed with this software
#
# website: http://github.com/CancerRxGene/gdsctools
#
##############################################################################
"""Volcano plot utilities
The :class:`VolcanoANOVA` is used in the creation of the report but
may be used by a usr in a Python shell.
"""
import os
import pandas as pd
import pylab
import numpy as np
import easydev
from numpy import log10
from easydev import Progress, AttrDict
from gdsctools.tools import Savefig
import jinja2
from jinja2 import Environment, PackageLoader
__all__ = ['VolcanoANOVA', "VolcanoANOVAJS"]
[docs]class VolcanoANOVA(object):
"""Utilities related to volcano plots
This class is used in :mod:`gdsctools.anova` but can also
be used independently as in the example below.
.. plot::
:include-source:
:width: 80%
from gdsctools import ANOVA, ic50_test, VolcanoANOVA
an = ANOVA(ic50_test)
# retrict analysis to a tissue to speed up computation
an.set_cancer_type('lung_NSCLC')
# Perform the entire analysis
results = an.anova_all()
# Plot volcano plot of pvalues versus signed effect size
v = VolcanoANOVA(results)
v.volcano_plot_all()
.. note:: Within an IPython shell, you should be able to click
on a circle and the title will be updated
with the name of the drug/feature and FDR value.
:Legend and color conventions: The green circles indicate significant hits
that are resistant while reds show sensitive hits. Circles are colored
if there are below the FDR_threshold AND below the pvalue_threshold AND
if the signed effect size is above the effect_threshold.
"""
_colname_pvalue = 'ANOVA_FEATURE_pval'
_colname_qvalue = 'ANOVA_FEATURE_FDR'
_colname_drugid = 'DRUG_ID'
_colname_feature = 'FEATURE'
_colname_deltas = 'FEATURE_delta_MEAN_IC50'
_colname_effect_size = 'FEATURE_IC50_effect_size'
_colname_N_feature_pos = 'N_FEATURE_pos'
def __init__(self, data, sep="\t", settings=None):
""".. rubric:: Constructor
:param data: an :class:`~gdsctools.anova.ANOVAResults` instance
or a dataframe with the proper columns names (see below)
:param settings: an instance of
:class:`~gdsctools.settings.ANOVASettings`
Expected column names to be found if a filename is provided::
ANOVA_FEATURE_pval
ANOVA_FEATURE_FDR
FEATURE_delta_MEAN_IC50
FEATURE_IC50_effect_size
N_FEATURE_pos
N_FEATURE_pos
FEATURE
DRUG_ID
If the plotting is too slow, you can use the :meth:`selector` to prune
the results (most of the data are noise and overlap on the middle
bottom area of the plot with little information.
"""
# a copy since we do may change the data
try:
# an ANOVAResults contains a df attribute
self.df = data.df.copy()
except:
# probably a dataframe
self.df = data.copy()
# this is redundant could reuse the input ??
if settings is None:
from gdsctools.settings import ANOVASettings
self.settings = ANOVASettings()
else:
self.settings = AttrDict(**settings)
self.figtools = Savefig()
self.figtools.directory = self.settings.directory
self.drugs = set(self.df[self._colname_drugid])
self.features = set(self.df[self._colname_feature])
# intensive calls made once for all
self.groups_by_drugs = self.df.groupby(self._colname_drugid).groups
self.groups_by_features = self.df.groupby(self._colname_feature).groups
[docs] def selector(self, df, Nbest=1000, Nrandom=1000, inplace=False):
"""Select only the first N best rows and N random ones
Sometimes, there are tens of thousands of associations and future
analysis will include more features and drugs. Plotting volcano plots
should therefore be fast and scalable. Here, we provide a naive
way of speeding up the plotting by selecting only a subset of the data
made of Nbest+Nrandom associations.
:param df: the input dataframe with ANOVAResults
:param int Nbest: how many of the most significant association
should be kept
:param int Nrandom: on top of the Nbest significant association,
set how many other randomly chosen associations are to be kept.
:return: pruned dataframe
"""
if len(df) < Nbest:
return df
Nmax = Nbest + Nrandom
N = len(df)
if N > Nbest:
x = range(Nbest, N)
pylab.shuffle(x)
n2pick = min(N, Nmax) - Nbest
indices = range(0, Nbest) + x[0:n2pick]
else:
indices = range(0, Nbest)
# indices in the index may not be order
indices = [df.index[xx] for xx in indices]
df = df.iloc[indices]
if inplace is True:
self.df = df
else:
return df
[docs] def volcano_plot_all_drugs(self):
"""Create a volcano plot for each drug and save in PNG files
Each filename is set to **volcano_<drug identifier>.png**
"""
drugs = list(self.df[self._colname_drugid].unique())
pb = Progress(len(drugs), 1)
for i, drug in enumerate(drugs):
self.volcano_plot_one_drug(drug)
self.savefig("volcano_%s.png" % drug, size_inches=(10, 10))
pb.animate(i+1)
[docs] def volcano_plot_all_features(self):
"""Create a volcano plot for each feature and save in PNG files
Each filename is set to **volcano_<feature name>.png**
"""
features = list(self.df[self._colname_feature].unique())
print('Creating image for each feature (using all drugs)')
pb = Progress(len(features), 1)
for i, feature in enumerate(features):
self.volcano_plot_one_feature(feature)
self.savefig("volcano_%s.png" % feature,
size_inches=(10, 10))
pb.animate(i+1)
[docs] def volcano_plot_all(self):
"""Create an overall volcano plot for all associations
This method saves the picture in a PNG file named **volcano_all.png**.
"""
# no annotations for all features.
# this is slow, we can drop non relevant data
data = self._get_volcano_sub_data('ALL')
data['annotation'] = ['' for x in range(len(data))]
self._volcano_plot(data, title='all drugs all features')
def _get_fdr_from_pvalue_interp(self, pvalue):
"""Here, FDR are computed using an interpolation"""
pvalue += 1e-15
qvals = self.df[self._colname_qvalue]
pvals = self.df[self._colname_pvalue]
ya = qvals[pvals < pvalue].max()
yb = qvals[pvals > pvalue].min()
xa = pvals[pvals < pvalue].max()
xb = pvals[pvals > pvalue].min()
dx = xb - xa
dy = yb - ya
yc = ya + dy * (pvalue - xa) / dx
return yc
def _get_pvalue_from_fdr(self, fdr):
"""Get pvalue for a given FDR threshold
This is equivalent to v17 of the R version but is not very precise
we should use _get_pvalue_from_fdr_interp instead.
"""
qvals = self.df[self._colname_qvalue]
pvals = self.df[self._colname_pvalue]
if isinstance(fdr, list):
pvalues = [pvals[qvals < this].max() for this in fdr]
return pvalues
else:
return pvals[qvals < fdr].max()
def _get_pvalue_from_fdr_interp(self, fdr):
# same as get_pvalue_from_fdr but with a linear interpolation
fdr += 1e-15
qvals = self.df[self._colname_qvalue]
pvals = self.df[self._colname_pvalue]
ya = pvals[qvals <= fdr].max()
yb = pvals[qvals > fdr].min()
xa = qvals[qvals <= fdr].max()
xb = qvals[qvals > fdr].min()
dx = xb - xa
dy = yb - ya
xc = fdr
yc = ya + dy * (xc - xa) / dx
return yc
def _get_volcano_global_data(self):
# using all data
colname = self._colname_N_feature_pos
minN = self.df[colname].min()
maxN = self.df[colname].max()
pvalues = self._get_pvalue_from_fdr(self.settings.FDR_threshold)
return {'minN': minN, 'maxN': maxN,
'pvalues': (self.settings.FDR_threshold, pvalues)}
def _get_volcano_sub_data(self, mode, target=None):
# Return data needed for each plot
# TODO could be simplified but works for now
# groups created in the constructor once for all
if mode == self._colname_drugid:
subdf = self.df.loc[self.groups_by_drugs[target]]
texts = subdf[self._colname_feature]
elif mode == 'FEATURE':
subdf = self.df.loc[self.groups_by_features[target]]
texts = subdf[self._colname_drugid]
elif mode == 'ALL':
# nothing to do, get all data
subdf = self.df
texts = subdf[self._colname_feature] # TODO + drug
else:
raise ValueError("mode parameter must be in [FEATURE, %s, ALL]" %
(self._colname_drugid))
# replaced by groups created in the constructor
#subdf = self.df[self.df[mode] == target]
deltas = subdf[self._colname_deltas]
effects = subdf[self._colname_effect_size]
signed_effects = list(np.sign(deltas) * effects)
qvals = list(subdf[self._colname_qvalue])
pvals = list(subdf[self._colname_pvalue])
#assocs = list(subdf['ASSOC_ID'])
colors = []
annotations = []
data = pd.DataFrame(index=range(len(qvals)))
data['pvalue'] = pvals
data['signed_effect'] = signed_effects
data['Feature'] = list(subdf[self._colname_feature])
data['Drug'] = list(subdf[self._colname_drugid])
data['text'] = texts.values
#data['Assoc'] = assocs
## !! here, we need to use .values since the pandas dataframe
# index goes from 1 to N but the origignal indices in subdf
# may not be from 1 to N but random between 1 and M>>N
data['FDR'] = subdf[self._colname_qvalue].values
annotations = []
# just an alias
# FIXME: why do we have a switch here for PANCAN ?
FDR_threshold = self.settings.FDR_threshold
if self.settings.analysis_type == 'PANCAN':
for sign, qval, pval in zip(signed_effects, qvals, pvals):
if sign <= -self.settings.effect_threshold and \
qval <= FDR_threshold and \
pval <= self.settings.pvalue_threshold:
colors.append('green')
annotations.append(True)
elif sign >= self.settings.effect_threshold and \
qval <= FDR_threshold and \
pval <= self.settings.pvalue_threshold:
colors.append('red')
annotations.append(True)
else:
colors.append('black')
annotations.append(False)
else:
for delta, qval, pval in zip(deltas, qvals, pvals):
if pval <= self.settings.pvalue_threshold and \
qval <= FDR_threshold and delta < 0:
colors.append('green')
annotations.append(True)
elif pval <= self.settings.pvalue_threshold and \
qval <= FDR_threshold and delta > 0:
colors.append('red')
annotations.append(True)
else:
colors.append('black')
annotations.append(False)
# here we normalise wrt the drug. In R code, normalised
# my max across all data (minN, maxN)
colname = self._colname_N_feature_pos
markersize = subdf[colname] / subdf[colname].max()
markersize = list(markersize * 800)
markersize = [x if x > 80 else 80 for x in markersize]
data['color'] = colors
data['annotation'] = annotations
data['markersize'] = markersize
return data
[docs] def volcano_plot_one_feature(self, feature):
"""Volcano plot for one feature (all drugs)
:param feature: a valid feature name to be found in the results
"""
assert feature in self.features, 'unknown feature name'
# FEATURE is the mode's name, not a column's name
data = self._get_volcano_sub_data('FEATURE', feature)
self._volcano_plot(data, title=feature)
[docs] def volcano_plot_one_drug(self, drug_id):
"""Volcano plot for one drug (all genomic features)
:param drug_id: a valid drug identifier to be found in the results
"""
assert drug_id in self.drugs, 'unknown drug name'
data = self._get_volcano_sub_data(self._colname_drugid, drug_id)
self._volcano_plot(data, title=drug_id)
def _volcano_plot(self, data, title=''):
"""Main volcano plot function called by other methods
such as volcano_plot_all"""
# This functio is a bit complicated because it does create a few tricky
# plots
# It creates a volcano plot, which is the easy part
# Then, it creates tooltips for the user interface in an IPython
# shell using a callback to 'onpick' function coded here below
# !! There seem to bes a memory leak in this function due to matplotlib
# This is not easy to track down and should have no impact now that
# ANOVAReport using JS instead of matplotlib
data = data.replace(np.inf, 0)
data = data.replace(-np.inf, 0)
colors = list(data['color'].values)
pvalues = data['pvalue'].values
signed_effects = data['signed_effect'].values
markersize = data['markersize'].values
Y = -np.log10(list(pvalues)) # should be cast to list ?
num = 1
#pylab.close(num)
fig = pylab.figure(num=1)
fig.clf()
ax = fig.add_subplot(111)
try:
ax.set_facecolor('#EEEEEE') #matplotlib 2.0
except:
ax.set_axis_bgcolor('#EEEEEE')
ax.cla()
# TODO signed effects may be inf why ?
X = [easydev.precision(x, digit=2) for x in signed_effects]
Y = [easydev.precision(y, digit=2) for y in Y]
# Using scatter() is slow as compared to plot()
# However, plot cannot take different sizes/colors
scatter = ax.scatter(X, Y, s=markersize,
alpha=0.3, c=colors, linewidth=1, picker=True)
scatter.set_zorder(11)
m = abs(signed_effects.min())
M = abs(signed_effects.max())
pylab.xlabel("Signed effect size", fontsize=self.settings.fontsize)
pylab.ylabel('-log10(pvalues)', fontsize=self.settings.fontsize)
l = max([m, M]) * 1.1
pylab.xlim([-l, l])
ax.grid(color='white', linestyle='solid')
# some aliases
fdr = self.settings.FDR_threshold
if fdr < self.df[self._colname_qvalue].min():
fdr = self.df[self._colname_qvalue].min()
fdrs = sorted(self.settings.volcano_additional_FDR_lines)
fdrs = fdrs[::-1] # reverse sorting
styles = ['--', ':', '-.']
if self.settings.volcano_FDR_interpolation is True:
get_pvalue_from_fdr = self._get_pvalue_from_fdr_interp
else:
get_pvalue_from_fdr = self._get_pvalue_from_fdr
pvalue = get_pvalue_from_fdr(fdr)
ax.axhline(-np.log10(pvalue), linestyle='--', lw=2,
color='red', alpha=1, label="FDR {:.2f} ".format(fdr) + " %")
for i, this in enumerate(fdrs):
if this < self.df[self._colname_qvalue].min() or\
this > self.df[self._colname_qvalue].max():
continue
pvalue = get_pvalue_from_fdr(this)
ax.axhline(-np.log10(pvalue), linestyle=styles[i],
color='red', alpha=1, label="FDR {:.2f} ".format(this) +" %")
pylab.ylim([0, pylab.ylim()[1]*1.2]) # times 1.2 to put the legend
ax.axvline(0, color='gray', alpha=0.8, lw=2)
axl = pylab.legend(loc='best')
axl.set_zorder(10) # in case there is a circle behind the legend.
#self.ax = ax
#self.axx = ax.twinx()
#self.common_ticks = ax.get_yticks()
#self.common_ylim = ax.get_ylim()
#pvals = self.df[self._colname_pvalue]
#y1 = pvals.min()
#y2 = pvals.max()
#fdr1 = self._get_fdr_from_pvalue_interp(y1)
#fdr2 = self._get_fdr_from_pvalue_interp(y2-2e-15) # make sure it exists
#self.axx.set_ylim([fdr2, fdr1])
#self.axx.set_ylabel('FDR \%', fontsize=self.settings.fontsize)
# For the static version
title_handler = pylab.title("%s" % str(title).replace("_"," "),
fontsize=self.settings.fontsize/1.2)
labels = []
# This code allows the ipython user to click on the matplotlib figure
# to get information about the drug and feature of a given circles.
def onpick(event):
ind = event.ind[0]
try:
title = str(str(data.iloc[ind]['Drug'])) + " / " + str(data.iloc[ind].Feature)
title += "\nFDR=" + "%.4e" % data.iloc[ind]['FDR']
title_handler.set_text(title.replace("_"," "))
except:
print('Failed to create new title on click')
print(data.iloc[ind].T)
fig.canvas.draw()
# keep track on the id for further memory release
# For more info search for "matplotlib memory leak mpl_connect"
self.cid = fig.canvas.mpl_connect('pick_event', onpick)
# for the JS version
# TODO: for the first 1 to 2000 entries ?
labels = []
self.data = data
for i, row in data[['Drug', 'Feature', 'FDR']].iterrows():
template = """
<table border="1" class="dataframe">
<tbody>
<tr>
<th>Drug</th>
<td>%(Drug)s</td>
</tr>
<tr>
<th>Feature</th>
<td>%(Feature)s</td>
</tr>
<tr>
<th>FDR</th>
<td>%(FDR)s</td>
</tr>
</tbody>
</table>""" % row.to_dict()
labels.append(template)
# this is more elegant but slower
#label = row.to_frame()
#label.columns = ['Row {0}'.format(i)]
#labels.append(str(label.to_html(header=False)))
self.scatter = scatter
self.current_fig = fig
# not sure is this is required. could be a memory leak here
import gc
gc.collect()
[docs] def savefig(self, filename, size_inches=(10, 10)):
# Save the PNG first. The savefig automatically set the size
# to a defined set and back to original figsize.
self.figtools.savefig(filename + '.png', size_inches=size_inches)
[docs]class VolcanoANOVAJS(VolcanoANOVA):
def __init__(self, data, sep="\t", settings=None):
super(VolcanoANOVAJS, self).__init__(data, sep, settings)
[docs] def render_drug(self, name):
self.data = self._get_volcano_sub_data("DRUG_ID", name)
return self._render_data(name)
[docs] def render_feature(self, name):
self.data = self._get_volcano_sub_data("FEATURE", name)
return self._render_data(name)
[docs] def render_all(self):
self.data = self._get_volcano_sub_data("ALL")
return self._render_data()
def _render_data(self, name="all associations"):
self.data['log10pvalue'] = -log10(self.data['pvalue'])
self.data['color'] = self.data['color'].apply(lambda x:
x.replace("black", "not_significant"))
self.data['color'] = self.data['color'].apply(lambda x:
x.replace("green", "sensitive"))
self.data['color'] = self.data['color'].apply(lambda x:
x.replace("red", "resistant"))
# We have 3 colors but sometimes you may have only one or 2.
# This may be an issue with canvasXpress. It seems essential
# to sort the color column so that names are sorted alphabetically
# and to include colors that are present in the sale order
try:
self.data.sort_values(by='color', inplace=True)
except:
self.data.sort("color", inplace=True)
colors = []
if "not_significant" in self.data.color.values:
colors.append("rgba(0,0,0,0.5)") # black
if "resistant" in self.data.color.values:
colors.append("rgba(205,0,0,0.5)") # red
if "sensitive" in self.data.color.values:
colors.append("rgba(0,205,0,0.5)") # green
env = Environment()
from easydev import get_package_location
env.loader = jinja2.FileSystemLoader(
get_package_location("gdsctools")
+ "/gdsctools/data/templates/")
template = env.get_template("volcano.html")
jinja = {}
jinja["colors"] = colors
jinja["Group"] = list(self.data['color'].values)
jinja['xlabel'] = "Signed Effect Size"
jinja['ylabel'] = "-log10(pvalue)"
text = []
for x,y,z in zip(self.data['Drug'].values,
self.data['Feature'].values,
self.data['FDR'].values):
text.append("<b>Drug:</b>%s <br><b>Feature:</b>%s <br><b>FDR:</b>%s" % (x,y,z))
jinja['vars'] = text
"""
# does not work in the JS somehow some points do not appear
# disabled for now
markersize = self.data['markersize']
markersize -= markersize.min()
markersize /= markersize.max()
markersize = (3*markersize).round()
#markersize[markersize == 0] = 1
FC = list(markersize.astype(int).astype(str))
jinja['FC'] = FC
"""
self.data.markersize /= (self.data.markersize.max()/3.)
#First value is Y, second is X, following will be used in the
try: # introduced in pandas > 0.16.2
jinja['data'] = self.data[["signed_effect", "log10pvalue",
"markersize"]].round(3).values.tolist()
except: #for py3.3 on travis
jinja['data'] = np.around(self.data[["signed_effect", "log10pvalue",
"markersize"]]).values.tolist()
jinja['title'] = '"%s"' % name
fdrs = self.get_fdr_ypos()
fdrs = [0 if np.isnan(x) else x for x in fdrs]
jinja['additional_fdrs'] = ""
for i,this in enumerate(fdrs):
line = '\n{"color": "red", "width": 1, "type": "dashed", "y": %s}'
if i == len(fdrs)-1:
pass
else:
line += ","
jinja['additional_fdrs'] += line % this
m = abs(self.data.signed_effect.min())
M = abs(self.data.signed_effect.max())
jinja['minX'] = -max([m, M]) * 1.1
jinja['maxX'] = max([m, M]) * 1.1
jinja['maxY'] = self.data["log10pvalue"].max() * 1.2
if max(fdrs) > jinja['maxY']:
jinja['maxY'] = max(fdrs) * 1.2
self.html = template.render(jinja)
return self.html
[docs] def get_fdr_ypos(self):
fdr = self.settings.FDR_threshold
fdrs = sorted(self.settings.volcano_additional_FDR_lines)
fdrs = fdrs[::-1] # reverse sorting
if self.settings.volcano_FDR_interpolation is True:
get_pvalue_from_fdr = self._get_pvalue_from_fdr_interp
else:
get_pvalue_from_fdr = self._get_pvalue_from_fdr
# If provided FDR is below minimum one, use minimum FDR as the
# threshold.
pvalue = get_pvalue_from_fdr(fdr)
if np.isnan(pvalue):
pvalue = self.df["ANOVA_FEATURE_pval"].min()
#ax.axhline(-np.log10(pvalue), linestyle='--', lw=2,
# color='red', alpha=1, label="FDR %s " % fdr + " \%")
pvalues = [-log10(pvalue)]
# If there is a NAN, plot is not shown, so we must be sure all FDRs are
# real values. Set 0 for the NAN so that we do not see the lines
#pvalues = [0 if np.isnan(x) else x for x in pvalues]
for i, this in enumerate(fdrs):
if this < self.df[self._colname_qvalue].min() or\
this > self.df[self._colname_qvalue].max():
pvalues.append(0)
continue
pvalue = get_pvalue_from_fdr(this)
pvalues.append(-np.log10(pvalue))
pvalues = [0 if np.isnan(x) else x for x in pvalues]
# we must have 3 values. If not, just repeat the last values
return pvalues
# window.CanvasXPress.references
# Working stuff:
# window.CanvasXpress.references[0].setHeight(400)
# window.CanvasXpress.references[0].data.d.line[0].color = 'green'
# window.CanvasXpress.references[0].redraw()
# window.CanvasXpress.references[0].redraw()
class ScatterJS(object):
"""
df =
js = ScatterJS(df, "x_name", "y_name", "color_name", "size_name")
html = js.get_html()
"""
def __init__(self, df, x, y, color, size):
self.data = df.copy()
self._colname_color = color
self._colname_x = x
self._colname_y = y
self._colname_size = size
self.minX = 0
self.minY = 0
self.maxX = 1
self.maxY = self.data[self._colname_y].max() * 1.1
def get_html(self):
# We have 3 colors but sometimes you may have only one or 2.
# This may be an issue with canvasXpress. It seems essential
# to sort the color column so that names are sorted alphabetically
# and to include colors that are present in the sale order
"""try:
self.data.sort_values(by='color', inplace=True)
except:
self.data.sort("color", inplace=True)
"""
# Jinja related
from easydev import get_package_location
env = Environment()
env.loader = jinja2.FileSystemLoader(
get_package_location("gdsctools")
+ "/gdsctools/data/templates/")
template = env.get_template("scatter.html")
# We need to cireate 20 different colors
from colormap import Colormap
c = Colormap()
cmap = c.cmap_linear("red", "blue", "yellow", N=20)
colors = self.data[self._colname_color]
colors = ["red" for x in self.data[self._colname_color]]
jinja = {}
jinja["colors"] = ["rgba(205,0,0,0.5)"]
jinja["Group"] = colors
jinja['xlabel'] = '"%s"' % self.xlabel
jinja['ylabel'] = '"%s"' % self.ylabel
selection = [self._colname_x, self._colname_y]
text = []
for index in zip(self.data.index):
text.append("<pre>%s</pre>" % self.data.iloc[index][selection].to_string())
jinja['vars'] = text
#self.data.markersize /= (self.data.markersize.max()/3.)
self.data['markersize'] = 20
selection = [self._colname_x, self._colname_y, self._colname_size]
#First value is Y, second is X, following will be used in the
try: # introduced in pandas > 0.16.2
jinja['data'] = self.data[selection].round(3).values.tolist()
except: #for py3.3 on travis
jinja['data'] = np.around(self.data[selection]).values.tolist()
jinja['title'] = '"Regression coefficient vs Bayes factor for all drugs"'
jinja['minX'] = self.minX
jinja['minY'] = self.minY
jinja['maxX'] = self.maxX
jinja['maxY'] = self.maxY
self.html = template.render(jinja)
return self.html