Source code for gdsctools.anova_report

# -*- python -*-
# -*- coding utf-8 -*-
#
#  This file is part of GDSCTools software
#
#  Copyright (c) 2015 - Wellcome Trust Sanger Institute
#  All rights reserved
#
#  File author(s): Thomas Cokelaer <cokelaer@gmail.comWE HERE>
#
#  Distributed under the BSD 3-Clause License.
#  See accompanying file LICENSE.txt distributed with this software
#
#  website: http://github.com/CancerRxGene/gdsctools
#
##############################################################################
"""Code related to the ANOVA analysis to find associations between drug IC50s
and genomic features"""
import os
import warnings

from gdsctools.report import HTMLTable, ReportMain
from gdsctools.tools import Savefig
from gdsctools.volcano import VolcanoANOVAJS
from gdsctools.boxplots import BoxPlotsJS
from gdsctools.settings import ANOVASettings
from gdsctools.anova_results import ANOVAResults
from gdsctools.readers import DrugDecode

import pandas as pd
import pylab
import numpy as np

from easydev import Progress
import easydev
from colormap import cmap_builder


__all__ = ['ANOVAReport']


[docs]class ANOVAReport(object): """Class used to interpret the results and create final HTML report Results is a data structure returned by :meth:`ANOVA.anova_all`. :: from gdsctools import * # Perform the analysis itself to get a set of results (dataframe) an = ANOVA(ic50_test) results = an.anova_all() # now, we can create the report. r = ANOVAReport(gdsc=an, results=results) # we can tune some settings r.settings.pvalue_threshold = 0.001 r.settings.FDR_threshold = 28 r.settings.directory = 'testing' r.create_html_pages() .. rubric:: Significant association An association is significant if - The field *ANOVA_FEATURE_FDR* must be < FDR_threshold - The field *ANOVA_FEATURE_pval* must be < pvalue_threshold It is then labelled **sensible** if *FEATURE_delta_MEAN_IC50* is below 0, otherwise it is **resistant**. """ def __init__(self, gdsc, results=None, sep="\t", drug_decode=None, verbose=True): """.. rubric:: Constructor :param gdsc: the instance with which you created the results to report :param results: the results returned by :meth:`ANOVA.anova_all`. If not provided, the ANOVA is run on the fly. """ self.verbose = verbose self.figtools = Savefig(verbose=False) self.gdsc = gdsc if results is None: results = gdsc.anova_all() self.df = ANOVAResults(results).df # this does a copy and sanity check # Make sure the DRUG are integers self.df.DRUG_ID = self.df.DRUG_ID.astype(int) self.settings = ANOVASettings() for k, v in gdsc.settings.items(): self.settings[k] = v self._colname_drug_id = 'DRUG_ID' self.varname_pval = 'ANOVA_FEATURE_pval' self.varname_qval = 'ANOVA_FEATURE_FDR' # maybe there was not drug_decode in the gdsc parameter, # so a user may have provide a file, in which case, we need # to update the content of the dur_decoder. if len(gdsc.drug_decode) == 0 and drug_decode is None: warnings.warn("No drug name or target will be populated." "You may want to provide a DRUG_DECODE file.") self.drug_decode = DrugDecode() elif drug_decode is not None: # Read a file self.drug_decode = DrugDecode(drug_decode) else: # Copy from gdsc instance self.drug_decode = DrugDecode(gdsc.drug_decode) self.df = self.drug_decode.drug_annotations(self.df) #if sum(self.df == np.inf).sum()>0: # print("WARNING: infinite values were found in your results... Set to zero") try: self.df = self.df.replace({np.inf:0, -np.inf:0}) except: pass # create some data self._set_sensible_df() self.company = None # just to create the directory # ReportMain(directory=self.settings.directory, verbose=self.verbose) def _get_ndrugs(self): return len(self.df[self._colname_drug_id].unique()) n_drugs = property(_get_ndrugs, doc="return number of drugs") def _get_ntests(self): return len(self.df.index) n_tests = property(_get_ntests) def _get_ncelllines(self): return len(self.gdsc.features.df.index) n_celllines = property(_get_ncelllines, doc="return number of cell lines") def _df_append(self, df, data): count = len(df) df.loc[count] = data return df
[docs] def diagnostics(self): """Return summary of the analysis (dataframe)""" self._set_sensible_df() df = pd.DataFrame({'text': [], 'value': []}) n_features = len(self.gdsc.features.df.columns) n_features -= self.gdsc.features.shift n_drugs = len(self.df[self._colname_drug_id].unique()) N = float(n_drugs * n_features) if N == 0: ratio = 0 else: ratio = float(self.n_tests)/(N) * 100 try: ratio = easydev.precision(ratio, digit=2) except: # Fixme: this is a hack for the infinite values but should not # happen... ratio = 0 msg = "Type of analysis" df = self._df_append(df, [msg, self.settings.analysis_type]) msg = "Total number of possible drug/feature associations" df = self._df_append(df, [msg, int(N)]) msg = "Total number of ANOVA tests performed" df = self._df_append(df, [msg, self.n_tests]) msg = "Percentage of tests performed" df = self._df_append(df, [msg, ratio]) # trick to have an empty line df = self._df_append(df, ["", ""]) msg = "Total number of tested drugs" df = self._df_append(df, [msg, n_drugs]) msg = "Total number of genomic features used" df = self._df_append(df, [msg, n_features]) msg = "Total number of screened cell lines" df = self._df_append(df, [msg, self.n_celllines]) msg = "MicroSatellite instability included as factor" msi = self.settings.include_MSI_factor df = self._df_append(df, [msg, msi]) # trick to have an empty line df = self._df_append(df, ["", ""]) nsens = len(self.sensible_df) nres = len(self.resistant_df) msg = "Total number of significant associations" df = self._df_append(df, [msg, nsens+nres]) msg = " - sensitive" df = self._df_append(df, [msg, nsens]) msg = " - resistant" df = self._df_append(df, [msg, nres]) msg = "p-value significance threshold" df = self._df_append(df, [msg, self.settings.pvalue_threshold]) msg = "FDR significance threshold" df = self._df_append(df, [msg, self.settings.FDR_threshold]) p1, p2 = self._get_pval_range() msg = 'Range of significant p-values' value = "[{:.4}, {:.4}]".format(p1, p2) df = self._df_append(df, [msg, value]) f1, f2 = self._get_fdr_range() msg = "Range of significant % FDRs" value = '[{:.4} {:.4}]'.format(f1, f2) df = self._df_append(df, [msg, value]) return df
def _get_pval_range(self): """Get pvalues range of the significant hits""" nsens = len(self.sensible_df) nres = len(self.resistant_df) N = nsens + nres if N == 0: return 0., 0. name = self.varname_pval data = self.df[name].iloc[0:N] m, M = data.min(), data.max() return m, M def _get_fdr_range(self): """Get FDR range of the significant hits""" name = self.varname_qval data = self.df[name][(self.df[name] < self.settings.FDR_threshold)] if len(data) == 0: return 0., 0. m, M = data.min(), data.max() return m, M def _set_sensible_df(self): # just an alias logand = np.logical_and # select sensible data set mask1 = self.df['ANOVA_FEATURE_FDR'] < self.settings.FDR_threshold mask2 = self.df['ANOVA_FEATURE_pval'] < self.settings.pvalue_threshold mask3 = self.df['FEATURE_delta_MEAN_IC50'] < 0 self.sensible_df = self.df[logand(logand(mask1, mask2), mask3)] # select resistant data set mask3 = self.df['FEATURE_delta_MEAN_IC50'] >= 0 self.resistant_df = self.df[logand(logand(mask1, mask2), mask3)]
[docs] def get_significant_set(self): """Return significant hits (resistant and sensible)""" # a property that is long to compute # and may change if FDR changes. self._set_sensible_df() df = pd.concat([self.sensible_df, self.resistant_df]) try: df.sort_values('ASSOC_ID', inplace=True) except: df.sort('ASSOC_ID', inplace=True) return df
def _get_data(self, df_count_sensible, df_count_resistant): # we can drop all columns except one, which is renamed as count df1 = df_count_sensible['ASSOC_ID'] df1.name = 'sens assoc' df2 = df_count_resistant['ASSOC_ID'] df2.name = 'res assoc' # Now, we join the two TimeSeries (note that above, we selected only # one column so the dataframe was downcast to time series) df_count = pd.DataFrame(df1).join(pd.DataFrame(df2), how='outer') # and set NA to zero df_count.fillna(0, inplace=True) # let us add a new column with the total df_count['total'] = df_count['sens assoc'] + df_count['res assoc'] # we want to sort by 'total' column and is equality by the name, # which is the index. So let us add the index temporarily as # a column, sort, and remove 'name' column afterwards df_count['name'] = df_count.index try: df_count.sort_values(by=['total', 'name'], ascending=[False, True], inplace=True) except: df_count.sort(columns=['total', 'name'], ascending=[False, True], inplace=True) df_count.drop('name', axis=1, inplace=True) return df_count
[docs] def get_drug_summary_data(self): """Return dataframe with drug summary""" # get sensible and resistant sub dataframes self._set_sensible_df() # group by drug colname = self._colname_drug_id df_count_sensible = self.sensible_df.groupby(colname).count() df_count_resistant = self.resistant_df.groupby(colname).count() df_count = self._get_data(df_count_sensible, df_count_resistant) return df_count
[docs] def drug_summary(self, top=50, fontsize=15, filename=None): """Return dataframe with significant drugs and plot figure :param fontsize: :param top: max number of significant associations to show :param filename: if provided, save the file in the directory """ df_count = self.get_drug_summary_data() if len(df_count): self._plot(df_count, 'drug', top) fig = pylab.gcf() self.figtools.directory = self.settings.directory self.figtools.savefig(filename, size_inches=(12, 14), bbox_inches='tight') return df_count
[docs] def get_feature_summary_data(self): """Return dataframe with feature summary""" # get sensible and resistant sub dataframes self._set_sensible_df() df_count_sensible = self.sensible_df.groupby('FEATURE').count() df_count_resistant = self.resistant_df.groupby('FEATURE').count() df_count = self._get_data(df_count_sensible, df_count_resistant) return df_count
[docs] def feature_summary(self, filename=None, top=50, fontsize=15): """Return dataframe with significant features and plot figure :param fontsize: :param top: max number of significant associations to show :param filename: if provided, save the file in the directory """ df_count = self.get_feature_summary_data() if len(df_count) > 0: self._plot(df_count, 'feature', top) #fig = pylab.gcf() self.figtools.directory = self.settings.directory self.figtools.savefig(filename, set_inches=(12, 14), bbox_inches='tight') return df_count
def _plot(self, df_count, title_tag, top): """Used by drug_summary and feature_summary to plot the bar plot""" if top > len(df_count): top = len(df_count) df = df_count.iloc[0:top][[u'sens assoc', u'res assoc']] labels = list(df.index) # add drug name if len(self.drug_decode) > 0: for i, label in enumerate(labels): if title_tag == 'drug': name = self.drug_decode.get_name(label) if name is not None: labels[i] = "{}-{}".format(labels[i], name) else: pass labels = [str(x).replace('_', ' ') for x in labels] # restrict size to first 30 characters labels = [x[0:30] for x in labels] ind = range(0, len(labels)) # reverse does not exist with python3 try: ind.reverse() except: ind = list(ind) ind.reverse() data1 = df['sens assoc'].values data2 = df['res assoc'].values pylab.figure(1) pylab.clf() p1 = pylab.barh(ind, data1, height=0.8, color='purple', label='sensitivity') p2 = pylab.barh(ind, data2, height=0.8, color='orange', left=data1, label='resistance') ax = pylab.gca() self.labels = labels ax.set_yticks([x + 0.5 for x in ind]) ax.set_yticklabels(labels, fontsize=12) xticks = ax.get_xticks() ax.set_xticklabels( [int(x) if divmod(x,1)[1] == 0 else "" for x in xticks]) pylab.grid() pylab.title(r"Top %s %s(s) most frequently " % (top, title_tag) + \ "\nassociated with drug response", fontsize=self.settings.fontsize/1.2) pylab.xlabel(r'Number of significant associations (FDR %s %s %s) ' % ("$>$", self.settings.FDR_threshold, "$\%$"), fontsize=16) M = max(data1+data2) #ax.set_xticks() #ax.set_xticklabels(labels, fontsize=fontsize) ax.set_xlim([0, M+1]) pylab.legend(loc='lower right') try:pylab.tight_layout() except:pass
[docs] def get_significant_hits(self, show=True): """Return a summary of significant hits :param show: show a plot with the distribution of significant hits .. todo:: to finalise """ fdrs = range(5, 50+1, 5) significants = [] significant_meaningful = [] strong_hits = [] full_strong_hits = [] MC1 = 1 MC2 = 2 mask2 = self.df['FEATURE_pos_logIC50_MEAN'] < MC1 mask3 = self.df['FEATURE_pos_logIC50_MEAN'] < MC2 mask4 = self.df['FEATURE_neg_logIC50_MEAN'] < MC1 mask5 = self.df['FEATURE_neg_logIC50_MEAN'] < MC2 maskMC = mask2 + mask3 + mask4 + mask5 for fdr in fdrs: # significant hits res = self.df['ANOVA_FEATURE_FDR'] < fdr significants.append(res.sum()) # meaningful hits indices = np.logical_and(self.df['ANOVA_FEATURE_FDR'] < fdr, maskMC) significant_meaningful.append(indices.sum()) # meaningful strong hits mask1 = self.df.loc[indices]['FEATURE_pos_Glass_delta'] >= 1 mask2 = self.df.loc[indices]['FEATURE_neg_Glass_delta'] >= 1 strong_hits.append(np.logical_or(mask1, mask2).sum()) # meaningful full strong hits mask1 = self.df.loc[indices]['FEATURE_pos_Glass_delta'] >= 1 mask2 = self.df.loc[indices]['FEATURE_neg_Glass_delta'] >= 1 full_strong_hits.append(np.logical_and(mask1, mask2).sum()) data = {'significants': significants, 'full_strong_hits': full_strong_hits, 'strong_hits': strong_hits, 'significant_meaningful': significant_meaningful} df = pd.DataFrame(data, columns = ['significants', 'significant_meaningful', 'strong_hits', 'full_strong_hits'], index=fdrs) df.columns = ['1) significant', '2) 1 + meaningful', '3) 2 + strong', '4) 2+ very strong'] if show is True: pylab.clf() ax = pylab.gca() df.plot(kind='bar', width=.8, color=['r', 'gray', 'orange', 'black'], rot=0, ax=ax) pylab.grid() # original is 'aquamarine4','cyan2','cornflowerblue ','aquamarine'), return df
def __str__(self): self.df.info() return ""
[docs] def create_html_associations(self): """Create an HTML page for each significant association The name of the output HTML file is **<association id>.html** where association id is stored in :attr:`df`. """ if self.verbose: print("Creating individual HTML pages for each significant association") df = self.get_significant_set() drugs = df['DRUG_ID'].values features = df['FEATURE'].values assocs = df['ASSOC_ID'].values fdrs = df['ANOVA_FEATURE_FDR'].values N = len(df) pb = Progress(N) html = Association(self, drug='dummy', feature='dummy', fdr='dummy', company=self.company) for i in range(N): html.drug = drugs[i] html.feature = features[i] if str(assocs[i]).startswith("a"): html._filename = str(assocs[i]) + '.html' else: html._filename = "a" + str(assocs[i]) + '.html' html.fdr = fdrs[i] html.assoc_id = assocs[i] #html._init_report() # since we have one shared instance html.create_report(onweb=False) if self.settings.animate: pb.animate(i+1) if self.settings.animate: print("\n")
[docs] def create_html_features(self): """Create an HTML page for each significant feature""" df = self.get_significant_set() groups = df.groupby('FEATURE') if self.verbose: print("Creating individual HTML pages for each feature") N = len(groups.indices.keys()) pb = Progress(N) for i, feature in enumerate(groups.indices.keys()): # get the indices and therefore subgroup subdf = groups.get_group(feature) html = HTMLOneFeature(self, self.df, subdf, feature) html.create_report(onweb=False) if self.settings.animate: pb.animate(i+1) if self.settings.animate: print("\n")
[docs] def create_html_drugs(self): """Create an HTML page for each drug""" # group by drugs all_drugs = list(self.df['DRUG_ID'].unique()) df = self.get_significant_set() groups = df.groupby('DRUG_ID') if self.verbose: print("Creating individual HTML pages for each drug") N = len(groups.indices.keys()) N = len(all_drugs) pb = Progress(N) for i, drug in enumerate(all_drugs): # enumerate(groups.indices.keys()): # get the indices and therefore subgroup if drug in groups.groups.keys(): subdf = groups.get_group(drug) else: subdf = {} html = HTMLOneDrug(self, self.df, subdf, drug) html.create_report(onweb=False) if self.settings.animate: pb.animate(i+1) if self.settings.animate: print("\n")
[docs] def create_html_main(self, onweb=False): """Create HTML main document (summary)""" self._set_sensible_df() if self.verbose: print("Creating main HTML page in directory %s" % (self.settings.directory)) ReportMain(directory=self.settings.directory, verbose=self.verbose) buffer_ = self.settings.savefig self.settings.savefig = True html = HTMLPageMain(self, 'index.html') html._init_report() # created the directory html.create_report(onweb=onweb) self.settings.savefig = buffer_
[docs] def create_html_manova(self, onweb=True): """Create summary table with all significant hits :param onweb: open browser with the created HTML page. """ df = self.get_significant_set() page = HTMLPageMANOVA(self, df, self.company) page.create_report(onweb)
[docs] def create_html_pages(self, onweb=True): """Create all HTML pages""" self.create_html_main(onweb=onweb) self.create_html_manova(onweb=False) self.create_html_drugs() self.create_html_features() self.create_html_associations()
[docs] def onweb(self): from easydev import onweb onweb(self.settings.directory + os.sep + 'index.html')
class HTMLPageMANOVA(ReportMain): """Creates an HTML page dedicated to significant hits :: # analyse the data across all drugs and features results = gdsc.anova_all() # Create a table for the first 10 significant hits (table is sorted # by ascending FDR) h = HTMLPageMANOVA(gdsc, results.df[0:10]) # Create the HTML page (pops up by default) h.report() """ def __init__(self, report, df, company): """.. rubric:: constructor :param : an ANOVA instance. :param directory: where to save the file The HTML filename is stored in the :attr:`filename`, which can be changes (default is manova.html) """ super(HTMLPageMANOVA, self).__init__(filename='manova.html', directory=report.settings.directory+os.sep+"associations", template_filename='manova.html', init_report=False) html = ANOVAResults(df).get_html_table(collapse_table=False) self.jinja['manova'] = html self.jinja['analysis_domain'] = report.settings.analysis_type self.jinja['resource_path'] = ".." self.jinja["collaborator"] = company def _create_report(self): pass # used to avoid warning ############################################################################## # # # # # HTML REPORTS RELATED # # # # # ############################################################################## class Association(ReportMain): def __init__(self, report, drug=None, feature=None, fdr=-1, assoc_id=-1, company=None): try: # here report is expected to be ANOVAReport # gdsctools interface from ipython self.factory = report.gdsc except: # here report is actually ANOVA itself # pipeline/standalone version self.factory = report # Does that changes the main settings ?? YES self.factory.settings.savefig = True self.assoc_id = assoc_id # This can be used by gdsctools_anova to remove the href self.add_href = True self.drug = drug self.feature = feature self.fdr = fdr filename = "{0}____{1}.html".format(self.drug, self.feature.replace(" ", "_")) super(Association, self).__init__( directory=report.settings.directory + os.sep + "associations", filename=filename, template_filename='association.html', init_report=False) self.jinja['analysis_domain'] = report.settings.analysis_type self.jinja['resource_path'] = ".." self.jinja["collaborator"] = company def run(self): # to keep . Used in the standalone version df = self.factory.anova_one_drug_one_feature(self.drug, self.feature, show=False, directory=self.directory) df['ASSOC_ID'] = self.assoc_id df['ANOVA_FEATURE_FDR'] = self.fdr return df def _create_report(self, onweb=True): # generated pictures and results df = self.run() odof = self.factory._get_one_drug_one_feature_data(self.drug, self.feature) # Create the table and add it sign = ANOVAResults(df) html_table = sign.get_html_table(escape=False, header=True, index=False, add_href=self.add_href) self.jinja['association_table'] = html_table # Javascript version bx = BoxPlotsJS(odof) self.jinja["boxplot_all_jsdata"] = bx.get_html_association() section = "" if self.factory.settings.include_MSI_factor: self.jinja["boxplot_msi_jsdata"] = bx.get_html_msi() if self.factory.settings.include_media_factor: self.jinja["boxplot_media_jsdata"] = bx.get_html_media() if self.factory.settings.analysis_type == 'PANCAN': self.jinja["boxplot_tissue_jsdata"] = bx.get_html_tissue() self.jinja['boxplots'] = section class HTMLOneFeature(ReportMain): def __init__(self, report, data, subdata, feature): self.df = data self.subdf = subdata self.feature = feature self.settings = report.settings self.n_hits = 0 if len(report.resistant_df): self.n_hits += sum([True for x in report.resistant_df.FEATURE if x == feature]) if len(report.sensible_df): self.n_hits += sum([True for x in report.sensible_df.FEATURE if x == feature]) filename = "{0}.html".format(self.feature) super(HTMLOneFeature, self).__init__( directory=report.settings.directory + os.sep + "associations", filename=filename, template_filename='feature.html', init_report=False) self.title = 'Single Feature analysis (%s)' % self.feature self.jinja['n_cell_lines'] = report.gdsc.features.df[feature].sum() self.jinja['feature_name'] = feature self.jinja['analysis_domaiin'] = report.settings.analysis_type self.jinja['n_hits'] = self.n_hits self.jinja['resource_path'] = ".." self.jinja["collaborator"] = report.company def create_pictures(self): v = VolcanoANOVAJS(self.df, settings=self.settings) html = v.render_feature(self.feature) return html def _create_report(self, onweb=True): self.jinja['N_hits'] = len(self.subdf) if len(self.subdf) > 0: sign = ANOVAResults(self.subdf) html = sign.get_html_table(escape=False, header=True, index=False) self.jinja['association_table'] = html self.jinja['volcano_jsdata'] = self.create_pictures() class HTMLOneDrug(ReportMain): def __init__(self, report, data, subdata, drug): """ data is a dataframe with all results from anova_all subdata is a dataframe with significant associations. Can be empty metadata is a dict with those keys: n_cell_lines min_conc max_conc drug """ self.df = data self.subdf = subdata self.drug = int(drug) self.settings = report.settings filename = "drug_{0}.html".format(self.drug) super(HTMLOneDrug, self).__init__( directory=report.settings.directory + os.sep + "associations", filename=filename, template_filename='drug.html', init_report=False) self.title = 'Single Drug analysis (%s)' % self.drug self.nhits = 0 if len(report.resistant_df): self.nhits += sum([True for x in report.resistant_df.DRUG_ID if x == drug]) if len(report.sensible_df): self.nhits += sum([True for x in report.sensible_df.DRUG_ID if x == drug]) self.jinja['n_cell_lines'] = len(report.gdsc.ic50.df[self.drug].dropna()) self.jinja['drug_id'] = self.drug self.jinja['drug_name'] = report.drug_decode.get_name(self.drug) self.jinja['drug_target'] = report.drug_decode.get_target(self.drug) #self.jinja['drug_synonyms'] = report.drug_decode._get_row(self.drug, 'SYNONYMS') self.jinja['drug_owner'] = report.drug_decode._get_row(self.drug, 'OWNED_BY') self.jinja['analysis_domain'] = report.settings.analysis_type self.jinja['n_hits'] = self.nhits self.jinja['resource_path'] = ".." self.jinja["collaborator"] = report.company def create_pictures(self): v = VolcanoANOVAJS(self.df, settings=self.settings) html = v.render_drug(self.drug) return html def _create_report(self, onweb=True): #self.create_pictures() # add the table self.jinja['synonyms'] = '' self.jinja['brand_name'] = '' self.jinja['conc_min'] = '?' self.jinja['conc_max'] = '?' # Table section self.jinja['N_hits'] = len(self.subdf) if len(self.subdf) > 0: sign = ANOVAResults(self.subdf) html = sign.get_html_table(escape=False, header=True, index=False) self.jinja['association_table'] = html # image section self.jinja['volcano_jsdata'] = self.create_pictures() class HTMLPageMain(ReportMain): def __init__(self, report, filename='index.html'): super(HTMLPageMain, self).__init__( directory=report.settings.directory, filename=filename) self.report = report self.settings = report.settings self.jinja['analysis_domain'] = report.settings.analysis_type self.jinja['settings_table'] = self.settings.to_html() self.jinja["collaborator"] = report.company def _create_report(self, onweb=True): self.add_summary() if len(self.report.df): self.add_volcano() self.add_manova() self.add_features() else: self.jinja["__section_skip_volcano"] = True self.jinja["__section_skip_manova"] = True self.jinja["__section_skip_features"] = True self.jinja["__section_skip_feature_summary"] = True self.jinja["__section_skip_drug_summary"] = True self.jinja["__section_skip_drug_associations"] = True self.jinja["__section_skip_feature_associations"] = True self.add_cosmic() self.add_settings() def add_summary(self): # A summary table diag = self.report.diagnostics() table = HTMLTable(diag, 'summary') txt = '' for index, row in diag.iterrows(): if len(row.text) == 0 and len(row.value) == 0: txt += '----<br/>' else: txt += row.text + ": " + str(row.value) + "<br/>" self.jinja['summary'] = txt def add_volcano(self): """ As of version 0.13, we will use Javascript directly. # this can be pretty slow. so keep only 1000 most relevant # values and 1000 random ones to get an idea of the distribution v = VolcanoANOVA(self.report.df, settings=self.settings) #v.selector(v.df, 1500, 1500, inplace=True) v.volcano_plot_all() v.savefig("volcano_all_js") self.jinja['volcano'] = ''' <h3></h3> <a href="volcano_all_js.html"> <img alt="volcano plot for all associations" src="volcano_all_js.png"> </a> <br/> <p> A javascript version is available <a href="volcano_all_js.html">here</a> ( or click on the image).</p> ''' """ v = VolcanoANOVAJS(self.report.df, settings=self.settings) self.jinja['volcano_jsdata'] = v.render_all() def add_manova(self): # MANOVA link N = len(self.report.get_significant_set()) self.jinja['manova'] = str(N) def add_features(self): # feature summary df_features = self.report.feature_summary("feature_summary.png") filename = 'OUTPUT' + os.sep + 'features_summary.csv' df_features.to_csv(self.directory + os.sep + filename, sep=',') not_tested = "" self.jinja['drug_not_tested'] = not_tested df_drugs = self.report.drug_summary(filename="drug_summary.png") get_name = self.report.drug_decode.get_name if len(self.report.drug_decode.df) > 0: df_drugs.index = ["{}-{}".format(x, get_name(x)) for x in df_drugs.index] filename = 'OUTPUT' + os.sep + 'drugs_summary.csv' df_drugs.to_csv(self.directory + os.sep + filename, sep=',') if len(self.report.df) == 0: return # --------------------------- Create table with links to all drugs groups = self.report.df.groupby('DRUG_ID') try: df = groups.mean()['ANOVA_FEATURE_FDR'].sort_values() except: # note double brackets for pythonn3.3 df = groups.mean()[['ANOVA_FEATURE_FDR']].sort() df = df.reset_index() # get back the Drug id in the dframe columns # let us add also the drug name df = self.report.drug_decode.drug_annotations(df) # let us also add number of associations computed counts = [len(groups.groups[k]) for k in df.DRUG_ID] df['Number of associations computed'] = counts groups = self.report.get_significant_set().groupby('DRUG_ID').groups count = [] for drug in df['DRUG_ID'].values: if drug in groups.keys(): count.append(len(groups[drug])) else: count.append(0) df['hits'] = count # add another set of drug_id but sorted in alpha numerical order table = HTMLTable(df, 'drugs') table.add_href('DRUG_ID', url="associations/drug_", suffix=".html") table.df.columns = [x.replace('ANOVA_FEATURE_FDR', 'mean FEATURE ANOVA FDR') for x in table.df.columns] table.add_bgcolor('hits', mode='max', cmap=cmap_builder('white', 'orange', 'red')) self.jinja['drug_table'] = table.to_html(escape=False, header=True, index=False) # ---------------------- Create full table with links to all features df = pd.DataFrame({'FEATURE': self.report.df['FEATURE'].unique()}) try: df.sort_values(by='FEATURE', inplace=True) except: df.sort('FEATURE', inplace=True) groups = self.report.get_significant_set().groupby('FEATURE').groups count = [] for feature in df['FEATURE'].values: if feature in groups.keys(): count.append(len(groups[feature])) else: count.append(0) df['hits'] = count table = HTMLTable(df, 'features') table.sort('hits', ascending=False) table.add_href('FEATURE', url="associations/", suffix=".html") table.add_bgcolor('hits', mode='max', cmap=cmap_builder('white', 'orange', 'red')) self.jinja['feature_table'] = table.to_html(escape=False, header=True, index=False) def add_cosmic(self): # -------------------------------------- COSMIC table for completeness colnames = self.report.gdsc.features._special_names df = self.report.gdsc.features.df[colnames] # TODO # add other columns if possible e.g., GDSC1, GDSC2, TCGA df = df.reset_index() table = HTMLTable(df) url = "http://cancer.sanger.ac.uk/cell_lines/sample/overview?id=" table.add_href('COSMIC_ID', url=url, newtab=True) self.jinja['cosmic_table'] = table.to_html() def add_settings(self): # -------------------------------------- settings and INPUT files input_dir = self.directory + os.sep + 'INPUT' filename = 'ANOVA_input.csv' filename = os.sep.join([input_dir, filename]) self.report.gdsc.ic50.to_csv(filename) filename = os.sep.join(['INPUT', 'ANOVA_input.csv']) self.jinja['ic50_file'] = filename # the genomic features, which may be the default version # one provided by the user. It may have been changed gf_filename = os.sep.join([input_dir, 'genomic_features.csv']) self.report.gdsc.features.to_csv(gf_filename) html = """Saved <a href="INPUT/genomic_features.csv">Genomic Features</a> file<br/> (possibly the default version).""" self.jinja['gf_file'] = html # Always save DRUG_DECODE file even if empty # It may be be interpreted in other pipeline or for reproducibility output_filename = input_dir + os.sep + 'DRUG_DECODE.csv' self.report.drug_decode.to_csv(output_filename) html = 'Get <a href="INPUT/DRUG_DECODE.csv">Drug DECODE file</a>' if len(self.report.drug_decode) == 0: html += 'Note that DRUG_DECODE file was not provided (empty?).' self.jinja['drug_decode'] = html # Save settings as json file filename = os.sep.join([input_dir, 'settings.json']) self.settings.to_json(filename) filename = os.path.basename(filename) self.jinja['settings'] = \ """Get the settings as a <a href="INPUT/%s"> json file</a>.""" % filename # Save all Results dataframe filename = os.sep.join([self.settings.directory, 'OUTPUT', 'results.csv']) ANOVAResults(self.report.df).to_csv(filename) code = """from gdsctools import * import os def getfile(filename, where='../INPUT'): return os.sep.join([where, filename]) # reback the IC50 and genomic features matrices gdsc = ANOVA(getfile('%(ic50)s'), getfile('%(gf_filename)s'), getfile('DRUG_DECODE.csv')) gdsc.settings.from_json(getfile('settings.json')) gdsc.init() # Analyse the data results = gdsc.anova_all() # Create the HTML report r = ANOVAReport(gdsc, results) r.create_html_pages(onweb=False)""" code = code % { 'ic50': 'ANOVA_input.csv', 'gf_filename': 'genomic_features.csv'} filename = os.sep.join([self.settings.directory, 'code','rerun.py']) fh = open(filename, 'w') fh.write(code) fh.close()