# -*- python -*-
# -*- coding utf-8 -*-
# This file is part of GDSCTools software
#
# Copyright (c) 2015 - Wellcome Trust Sanger Institute
# All rights reserved
#
# File author(s): Thomas Cokelaer <cokelaer@gmail.comWE HERE>
#
# Distributed under the BSD 3-Clause License.
# See accompanying file LICENSE.txt distributed with this software
#
# website: http://github.com/CancerRxGene/gdsctools
#
##############################################################################
"""Code related to the ANOVA analysis to find associations between drug IC50s
and genomic features"""
import os
import pandas as pd
import scipy
import pylab
from gdsctools import boxswarm
import jinja2
from jinja2 import Environment, PackageLoader
from easydev import get_package_location
import colorlog as logger
__all__ = ['BoxPlots', 'BoxPlotsJS']
[docs]class BoxPlots(object):
"""Box plot for a given association of drug versus genomic feature
.. plot::
:include-source:
:width: 80%
from gdsctools import ANOVA, ic50_test
from gdsctools.boxplots import BoxPlots
gdsc = ANOVA(ic50_test)
# Perform the entire analysis
odof = gdsc._get_one_drug_one_feature_data(1047, 'TP53_mut')
# Plot volcano plot of pvalues versus signed effect size
bx = BoxPlots(odof)
bx.boxplot_association()
If the **gdsc** analysis has the MSI factor and tissue factor on, then
additional plots can be created using :meth:`boxplot_pancan`.
Note that :attr:`odof` in the example above is a dictionary
with the following keys:
- drug_name
- feature_name
- masked_tissue: a dataframe with cosmic ids as index and 1 column of
tissues names.
- Y: list with the IC50s
- masked_features: a dataframe with cosmic ids as index and 1 column of
masked feature (1/0)
- masked_msi: same as masked_features
- negatives: subset of the IC50s corresponding to positive feature
- positives: subst of the IC50s corresponding to negative feature
.. seealso:: :class:`gdsctools.boxswarm.BoxSwarm`
"""
def __init__(self, odof, fontsize=20, savefig=False, directory='.'):
""".. rubric:: Constructor
"""
#: dictionary as returned by ANOVA._get_one_drug_one_feature_data
self.odof = odof
#: fontsize for the plots
self.fontsize = fontsize
#: boolean to save figure
self.savefig = savefig
#: directory where to save the figure.
self.directory = directory
#: linewidth used in the plots
self.lw = 3
self.drug = self.odof.drug_id
self.feature = self.odof.feature_name.replace("_", " ")
[docs] def boxplot_pancan(self, mode, fignum=1, title_prefix=''):
"""Create boxplot related to the MSI factor or Tissue factor
:param mode: either set to **msi** or **tissue**
"""
assert mode in ['tissue', 'msi', "media"]
results = self._get_boxplot_data(mode)
if results is None:
logger.warning("No tissue with at least 2 pos and 2 neg found (no image created).")
return
fig = pylab.figure(fignum)
oldsize = fig.get_size_inches()
pylab.clf() # or close ?
data, names, significance = results
N = len(names)
if N <= 2: # msi or 2 tissues
fontsize = self.fontsize
elif N<=14:
fontsize = max(4, int(self.fontsize - (N-2.)/(self.fontsize-4.)))
else:
fontsize = max(4, int(self.fontsize/1.4))
bb = boxswarm.BoxSwarm(data, names, fontsize=fontsize)
bb.xlabel = r'%s log(IC50)' % self.drug
if mode == 'tissue':
bb.title = 'FEATURE/Cancer-type interactions'
elif mode == 'msi':
bb.title = 'FEATURE/MS-instability interactions'
elif mode == "media":
bb.title = 'FEATURE/Media interactions'
ax = bb.plot(vert=False)
# get info from left axis
common_ylim = ax.get_ylim()
common_ticks = ax.get_yticks()
self.ax = ax.twinx()
self.ax.set_ylim(common_ylim)
self.ax.set_yticks(common_ticks)
self.ax.set_yticklabels([str(len(this))+" " for this in data],
fontsize=fontsize/1.4)
try:
pylab.tight_layout()
except:
pass
if self.savefig is True:
filename = self.directory + os.sep
filename += 'ODOF_{}_DRUG_{}____{}'.format(mode,
self.drug, self.feature)
fig.set_size_inches(14, 16)
pylab.savefig(filename + '.png', bbox_inches='tight')
fig.set_size_inches(oldsize)
fig.canvas.draw()
#pylab.savefig(filename + '.svg', bbox_inches='tight')
[docs] def boxplot_association(self, fignum=1):
"""Boxplot of the association (negative versus positive)
:param fignum: number of the figure
"""
pylab.figure(fignum)
pylab.clf()
# the plot itself
boxswarm.boxswarm(
{'pos': self.odof.positives, 'neg': self.odof.negatives},
lw=self.lw, fontsize=self.fontsize)
if self.odof.drug_name:
drug_name = "%s (%s)" % (self.drug, self.odof.drug_name)
else:
drug_name = self.drug
pylab.title('Individual association\n {0} versus {1}'.format(drug_name,
self.feature), fontsize=self.fontsize)
pylab.ylabel("{0} logIC50".format(self.drug),
fontsize=self.fontsize)
try:pylab.tight_layout()
except: pass
if self.savefig is True:
filename = self.directory + os.sep
filename += 'ODOF_all_DRUG_{}____{}'.format(self.drug,
self.feature)
pylab.savefig(filename + '.png', bbox_inches='tight')
def _get_boxplot_data(self, mode='tissue'):
# should be called by anova_one_drug_one_feature
# since masked_tissue, masked_ic50 attributes must
# be populated.
assert mode in ['tissue', 'msi', "media"]
# Let us use Pandas, this will be easier
try:
df = pd.DataFrame({
'ic50': self.odof.Y,
'media': self.odof.masked_media,
'feature': self.odof.masked_features,
'msi': self.odof.masked_msi.values,
'tissue': self.odof.masked_tissue.values})
except:
df = pd.DataFrame({
'ic50': self.odof.Y,
'tissue': self.odof.masked_tissue,
'feature': self.odof.masked_features,
'msi': self.odof.masked_msi.values})
if mode == 'tissue':
if 'msi' in df.columns:
df.drop('msi', inplace=True, axis=1)
if "media" in df.columns:
df.drop('media', inplace=True, axis=1)
elif mode == 'msi':
if 'tissue' in df.columns:
df.drop('tissue', inplace=True, axis=1)
if "media" in df.columns:
df.drop('media', inplace=True, axis=1)
elif mode == 'media':
if 'tissue' in df.columns:
df.drop('tissue', inplace=True, axis=1)
if 'msi' in df.columns:
df.drop('msi', inplace=True, axis=1)
groups = df.groupby(['feature', mode])
# counts items in each category and fill with NA
counts = groups.count().unstack().fillna(0)
# if positive or negative for a combo, is not>=2, drop it
# pandas 0.16.2
cc = (counts >= 2).all()
# create a groups structure
categories = list(cc.unstack().columns[cc])
"""
# Seems to be fixed (May 2016)
try:
# pandas 0.16.2
cc = (counts >= 2).all()
# create a groups structure
categories = list(cc.unstack().columns[cc])
except:
# pandas 0.13 for the doc only
categories = list(df.tissue.unique())[0:10]
"""
groups = df.query(mode + ' in @categories',
engine='python').groupby([mode, 'feature'])
# TODO; move all this if block into a method
# figure out the delta between pos and neg
means = groups.mean().unstack(mode)
if len(means): # need 2 values
delta = means.iloc[0] - means.iloc[1]
try:
# new pandas v0.17
delta.sort_values(inplace=True)
except Exception as err:
# conda for py3.3 is 0.16.2 where sort_values sdoes not exists
# Useful also for readthedocs
delta.sort(inplace=True)
significance = {}
data = []
names = []
for category in delta.loc['ic50'].index:
prefix_query = mode+"==@category"
neg = df.query(prefix_query+' and feature==0',
engine='python')['ic50']
pos = df.query(prefix_query+' and feature==1',
engine='python')['ic50']
# HERE in the original code, equal_var is False. why ?
res = scipy.stats.ttest_ind(neg, pos, equal_var=False)
#print res[1], self.odof.ttest
# this should be computed outside ??
significance[category] = res[1] # p-values
data.append(neg.values)
data.append(pos.values)
if mode == 'tissue':
name = category
elif mode == 'msi':
if category == 0:
name = 'MSI-stable'
elif category == 1:
name = 'MSI-unstable'
elif mode == "media":
name = category
for this in [0.05, 0.01, 0.001]:
if significance[category] < this:
name = '*' + name
names.append(name + ' neg')
names.append(name + ' pos')
return (data, names, significance)
else:
return None
[docs]class BoxPlotsJS(BoxPlots):
def __init__(self, odof, fontsize=20, savefig=False, directory='.'):
super(BoxPlotsJS, self).__init__(odof, fontsize=fontsize,
savefig=savefig, directory=directory)
[docs] def get_html_association(self):
env = Environment()
env.loader = jinja2.FileSystemLoader(
get_package_location("gdsctools")
+ "/gdsctools/data/templates/")
template = env.get_template("boxplot_association.html")
jinja = {}
N = len(self.odof.negatives) + len(self.odof.positives)
jinja["title"] = "Individual Association"
if self.odof.drug_name:
drug_name = "%s (%s)" % (self.drug, self.odof.drug_name)
jinja["subtitle"] = "%s versus %s" % (drug_name, self.feature)
else:
jinja["subtitle"] = "%s versus %s" % (self.drug, self.feature)
jinja['factor'] = ["neg"] * len(self.odof.negatives) + ["pos"] * len(self.odof.positives)
jinja["data"] = self.odof.negatives.tolist() + self.odof.positives.tolist()
jinja["smps"] = [str(this) for this in self.odof.indices]
jinja["subject"] = [str(this) for this in self.odof.indices]
jinja['ylabel'] = '"logIC50"'
html = template.render(jinja)
return html
[docs] def get_html_msi(self):
env = Environment()
env.loader = jinja2.FileSystemLoader(
get_package_location("gdsctools")
+ "/gdsctools/data/templates/")
template = env.get_template("boxplot_msi.html")
data = self._get_boxplot_data("msi")
if data is None:
return ""
# Show from bottom to top
labels = data[1][::-1]
data = data[0][::-1]
jinja = {}
N = len(self.odof.negatives) + len(self.odof.positives)
jinja["title"] = "FEATURE/MS-instability interactions"
jinja["subtitle"] = "%s versus %s" % (self.drug, self.feature)
factor = []
for i, thisdata in enumerate(data):
factor.extend( [labels[i]] * len(thisdata))
jinja['sign'] = [x.split()[1] for x in factor]
jinja['status'] = [x.split()[0] for x in factor]
jinja["data"] = list(pylab.flatten([list(this) for this in data]))
jinja['xlabel'] = '"logIC50"'
html = template.render(jinja)
return html
[docs] def get_html_tissue(self):
env = Environment()
env.loader = jinja2.FileSystemLoader(
get_package_location("gdsctools")
+ "/gdsctools/data/templates/")
template = env.get_template("boxplot_tissue.html")
data = self._get_boxplot_data("tissue")
if data is None:
return ""
# Show from bottom to top
labels = data[1][::-1]
data = data[0][::-1]
jinja = {}
N = len(self.odof.negatives) + len(self.odof.positives)
jinja["title"] = "FEATURE/MS-instability interactions"
jinja["subtitle"] = "%s versus %s" % (self.drug, self.feature)
factor = []
for i, thisdata in enumerate(data):
factor.extend( [labels[i]] * len(thisdata))
jinja['sign'] = [x.split()[1] for x in factor]
jinja['status'] = [x.split()[0] for x in factor]
jinja["data"] = list(pylab.flatten([list(this) for this in data]))
jinja['xlabel'] = '"logIC50"'
if len(labels)/2 >= 10:
jinja["minTextSize"] = 10
html = template.render(jinja)
return html