Source code for gdsctools.boxplots

# -*- python -*-
# -*- coding utf-8 -*-

#  This file is part of GDSCTools software
#
#  Copyright (c) 2015 - Wellcome Trust Sanger Institute
#  All rights reserved
#
#  File author(s): Thomas Cokelaer <cokelaer@gmail.comWE HERE>
#
#  Distributed under the BSD 3-Clause License.
#  See accompanying file LICENSE.txt distributed with this software
#
#  website: http://github.com/CancerRxGene/gdsctools
#
##############################################################################
"""Code related to the ANOVA analysis to find associations between drug IC50s
and genomic features"""
import os
import pandas as pd
import scipy
import pylab

from gdsctools import boxswarm

import jinja2
from jinja2 import Environment, PackageLoader
from easydev import get_package_location

import colorlog as logger

__all__ = ['BoxPlots', 'BoxPlotsJS']


[docs]class BoxPlots(object): """Box plot for a given association of drug versus genomic feature .. plot:: :include-source: :width: 80% from gdsctools import ANOVA, ic50_test from gdsctools.boxplots import BoxPlots gdsc = ANOVA(ic50_test) # Perform the entire analysis odof = gdsc._get_one_drug_one_feature_data(1047, 'TP53_mut') # Plot volcano plot of pvalues versus signed effect size bx = BoxPlots(odof) bx.boxplot_association() If the **gdsc** analysis has the MSI factor and tissue factor on, then additional plots can be created using :meth:`boxplot_pancan`. Note that :attr:`odof` in the example above is a dictionary with the following keys: - drug_name - feature_name - masked_tissue: a dataframe with cosmic ids as index and 1 column of tissues names. - Y: list with the IC50s - masked_features: a dataframe with cosmic ids as index and 1 column of masked feature (1/0) - masked_msi: same as masked_features - negatives: subset of the IC50s corresponding to positive feature - positives: subst of the IC50s corresponding to negative feature .. seealso:: :class:`gdsctools.boxswarm.BoxSwarm` """ def __init__(self, odof, fontsize=20, savefig=False, directory='.'): """.. rubric:: Constructor """ #: dictionary as returned by ANOVA._get_one_drug_one_feature_data self.odof = odof #: fontsize for the plots self.fontsize = fontsize #: boolean to save figure self.savefig = savefig #: directory where to save the figure. self.directory = directory #: linewidth used in the plots self.lw = 3 self.drug = self.odof.drug_id self.feature = self.odof.feature_name.replace("_", " ")
[docs] def boxplot_pancan(self, mode, fignum=1, title_prefix=''): """Create boxplot related to the MSI factor or Tissue factor :param mode: either set to **msi** or **tissue** """ assert mode in ['tissue', 'msi', "media"] results = self._get_boxplot_data(mode) if results is None: logger.warning("No tissue with at least 2 pos and 2 neg found (no image created).") return fig = pylab.figure(fignum) oldsize = fig.get_size_inches() pylab.clf() # or close ? data, names, significance = results N = len(names) if N <= 2: # msi or 2 tissues fontsize = self.fontsize elif N<=14: fontsize = max(4, int(self.fontsize - (N-2.)/(self.fontsize-4.))) else: fontsize = max(4, int(self.fontsize/1.4)) bb = boxswarm.BoxSwarm(data, names, fontsize=fontsize) bb.xlabel = r'%s log(IC50)' % self.drug if mode == 'tissue': bb.title = 'FEATURE/Cancer-type interactions' elif mode == 'msi': bb.title = 'FEATURE/MS-instability interactions' elif mode == "media": bb.title = 'FEATURE/Media interactions' ax = bb.plot(vert=False) # get info from left axis common_ylim = ax.get_ylim() common_ticks = ax.get_yticks() self.ax = ax.twinx() self.ax.set_ylim(common_ylim) self.ax.set_yticks(common_ticks) self.ax.set_yticklabels([str(len(this))+" " for this in data], fontsize=fontsize/1.4) try: pylab.tight_layout() except: pass if self.savefig is True: filename = self.directory + os.sep filename += 'ODOF_{}_DRUG_{}____{}'.format(mode, self.drug, self.feature) fig.set_size_inches(14, 16) pylab.savefig(filename + '.png', bbox_inches='tight') fig.set_size_inches(oldsize) fig.canvas.draw()
#pylab.savefig(filename + '.svg', bbox_inches='tight')
[docs] def boxplot_association(self, fignum=1): """Boxplot of the association (negative versus positive) :param fignum: number of the figure """ pylab.figure(fignum) pylab.clf() # the plot itself boxswarm.boxswarm( {'pos': self.odof.positives, 'neg': self.odof.negatives}, lw=self.lw, fontsize=self.fontsize) if self.odof.drug_name: drug_name = "%s (%s)" % (self.drug, self.odof.drug_name) else: drug_name = self.drug pylab.title('Individual association\n {0} versus {1}'.format(drug_name, self.feature), fontsize=self.fontsize) pylab.ylabel("{0} logIC50".format(self.drug), fontsize=self.fontsize) try:pylab.tight_layout() except: pass if self.savefig is True: filename = self.directory + os.sep filename += 'ODOF_all_DRUG_{}____{}'.format(self.drug, self.feature) pylab.savefig(filename + '.png', bbox_inches='tight')
def _get_boxplot_data(self, mode='tissue'): # should be called by anova_one_drug_one_feature # since masked_tissue, masked_ic50 attributes must # be populated. assert mode in ['tissue', 'msi', "media"] # Let us use Pandas, this will be easier try: df = pd.DataFrame({ 'ic50': self.odof.Y, 'media': self.odof.masked_media, 'feature': self.odof.masked_features, 'msi': self.odof.masked_msi.values, 'tissue': self.odof.masked_tissue.values}) except: df = pd.DataFrame({ 'ic50': self.odof.Y, 'tissue': self.odof.masked_tissue, 'feature': self.odof.masked_features, 'msi': self.odof.masked_msi.values}) if mode == 'tissue': if 'msi' in df.columns: df.drop('msi', inplace=True, axis=1) if "media" in df.columns: df.drop('media', inplace=True, axis=1) elif mode == 'msi': if 'tissue' in df.columns: df.drop('tissue', inplace=True, axis=1) if "media" in df.columns: df.drop('media', inplace=True, axis=1) elif mode == 'media': if 'tissue' in df.columns: df.drop('tissue', inplace=True, axis=1) if 'msi' in df.columns: df.drop('msi', inplace=True, axis=1) groups = df.groupby(['feature', mode]) # counts items in each category and fill with NA counts = groups.count().unstack().fillna(0) # if positive or negative for a combo, is not>=2, drop it # pandas 0.16.2 cc = (counts >= 2).all() # create a groups structure categories = list(cc.unstack().columns[cc]) """ # Seems to be fixed (May 2016) try: # pandas 0.16.2 cc = (counts >= 2).all() # create a groups structure categories = list(cc.unstack().columns[cc]) except: # pandas 0.13 for the doc only categories = list(df.tissue.unique())[0:10] """ groups = df.query(mode + ' in @categories', engine='python').groupby([mode, 'feature']) # TODO; move all this if block into a method # figure out the delta between pos and neg means = groups.mean().unstack(mode) if len(means): # need 2 values delta = means.iloc[0] - means.iloc[1] try: # new pandas v0.17 delta.sort_values(inplace=True) except Exception as err: # conda for py3.3 is 0.16.2 where sort_values sdoes not exists # Useful also for readthedocs delta.sort(inplace=True) significance = {} data = [] names = [] for category in delta.loc['ic50'].index: prefix_query = mode+"==@category" neg = df.query(prefix_query+' and feature==0', engine='python')['ic50'] pos = df.query(prefix_query+' and feature==1', engine='python')['ic50'] # HERE in the original code, equal_var is False. why ? res = scipy.stats.ttest_ind(neg, pos, equal_var=False) #print res[1], self.odof.ttest # this should be computed outside ?? significance[category] = res[1] # p-values data.append(neg.values) data.append(pos.values) if mode == 'tissue': name = category elif mode == 'msi': if category == 0: name = 'MSI-stable' elif category == 1: name = 'MSI-unstable' elif mode == "media": name = category for this in [0.05, 0.01, 0.001]: if significance[category] < this: name = '*' + name names.append(name + ' neg') names.append(name + ' pos') return (data, names, significance) else: return None
[docs]class BoxPlotsJS(BoxPlots): def __init__(self, odof, fontsize=20, savefig=False, directory='.'): super(BoxPlotsJS, self).__init__(odof, fontsize=fontsize, savefig=savefig, directory=directory)
[docs] def get_html_association(self): env = Environment() env.loader = jinja2.FileSystemLoader( get_package_location("gdsctools") + "/gdsctools/data/templates/") template = env.get_template("boxplot_association.html") jinja = {} N = len(self.odof.negatives) + len(self.odof.positives) jinja["title"] = "Individual Association" if self.odof.drug_name: drug_name = "%s (%s)" % (self.drug, self.odof.drug_name) jinja["subtitle"] = "%s versus %s" % (drug_name, self.feature) else: jinja["subtitle"] = "%s versus %s" % (self.drug, self.feature) jinja['factor'] = ["neg"] * len(self.odof.negatives) + ["pos"] * len(self.odof.positives) jinja["data"] = self.odof.negatives.tolist() + self.odof.positives.tolist() jinja["smps"] = [str(this) for this in self.odof.indices] jinja["subject"] = [str(this) for this in self.odof.indices] jinja['ylabel'] = '"logIC50"' html = template.render(jinja) return html
[docs] def get_html_media(self): env = Environment() env.loader = jinja2.FileSystemLoader( get_package_location("gdsctools") + "/gdsctools/data/templates/") template = env.get_template("boxplot_media.html") data = self._get_boxplot_data("media") if data is None: return "" # Show from bottom to top labels = data[1][::-1] data = data[0][::-1] jinja = {} N = len(self.odof.negatives) + len(self.odof.positives) jinja["title"] = "FEATURE/Media interactions" jinja["subtitle"] = "%s versus %s" % (self.drug, self.feature) factor = [] for i, thisdata in enumerate(data): factor.extend( [labels[i]] * len(thisdata)) jinja['sign'] = [x.split()[1] for x in factor] jinja['status'] = [x.split()[0] for x in factor] jinja["data"] = list(pylab.flatten([list(this) for this in data])) jinja['xlabel'] = '"logIC50"' html = template.render(jinja) return html
[docs] def get_html_msi(self): env = Environment() env.loader = jinja2.FileSystemLoader( get_package_location("gdsctools") + "/gdsctools/data/templates/") template = env.get_template("boxplot_msi.html") data = self._get_boxplot_data("msi") if data is None: return "" # Show from bottom to top labels = data[1][::-1] data = data[0][::-1] jinja = {} N = len(self.odof.negatives) + len(self.odof.positives) jinja["title"] = "FEATURE/MS-instability interactions" jinja["subtitle"] = "%s versus %s" % (self.drug, self.feature) factor = [] for i, thisdata in enumerate(data): factor.extend( [labels[i]] * len(thisdata)) jinja['sign'] = [x.split()[1] for x in factor] jinja['status'] = [x.split()[0] for x in factor] jinja["data"] = list(pylab.flatten([list(this) for this in data])) jinja['xlabel'] = '"logIC50"' html = template.render(jinja) return html
[docs] def get_html_tissue(self): env = Environment() env.loader = jinja2.FileSystemLoader( get_package_location("gdsctools") + "/gdsctools/data/templates/") template = env.get_template("boxplot_tissue.html") data = self._get_boxplot_data("tissue") if data is None: return "" # Show from bottom to top labels = data[1][::-1] data = data[0][::-1] jinja = {} N = len(self.odof.negatives) + len(self.odof.positives) jinja["title"] = "FEATURE/MS-instability interactions" jinja["subtitle"] = "%s versus %s" % (self.drug, self.feature) factor = [] for i, thisdata in enumerate(data): factor.extend( [labels[i]] * len(thisdata)) jinja['sign'] = [x.split()[1] for x in factor] jinja['status'] = [x.split()[0] for x in factor] jinja["data"] = list(pylab.flatten([list(this) for this in data])) jinja['xlabel'] = '"logIC50"' if len(labels)/2 >= 10: jinja["minTextSize"] = 10 html = template.render(jinja) return html