Source code for gdsctools.volcano

# coding=utf-8
# -*- python -*-
#
#  This file is part of GDSCTools software
#
#  Copyright (c) 2015 - Wellcome Trust Sanger Institute
#  All rights reserved
#
#  File author(s): Thomas Cokelaer <cokelaer@gmail.com>
#
#  Distributed under the BSD 3-Clause License.
#  See accompanying file LICENSE.txt distributed with this software
#
#  website: http://github.com/CancerRxGene/gdsctools
#
##############################################################################
"""Volcano plot utilities

The :class:`VolcanoANOVA` is used in the creation of the report but
may be used by a usr in a Python shell.

"""
import os

import pandas as pd
import pylab
import numpy as np
import easydev
from numpy import log10

from easydev import Progress, AttrDict
from gdsctools.tools import Savefig

import jinja2
from jinja2 import Environment, PackageLoader


__all__ = ['VolcanoANOVA', "VolcanoANOVAJS"]


[docs]class VolcanoANOVA(object):
    """Utilities related to volcano plots

    This class is used in :mod:`gdsctools.anova` but can also
    be used independently as in the example below.

    .. plot::
        :include-source:
        :width: 80%

        from gdsctools import ANOVA, ic50_test, VolcanoANOVA
        an = ANOVA(ic50_test)

        # retrict analysis to a tissue to speed up computation
        an.set_cancer_type('lung_NSCLC')

        # Perform the entire analysis
        results = an.anova_all()

        # Plot volcano plot of pvalues versus signed effect size
        v = VolcanoANOVA(results)
        v.volcano_plot_all()

    .. note:: Within an IPython shell, you should be able to click
        on a circle and the title will be updated
        with the name of the drug/feature and FDR value.

    :Legend and color conventions: The green circles indicate significant hits
        that are resistant while reds show sensitive hits. Circles are colored
        if there are below the FDR_threshold AND below the pvalue_threshold AND
        if the signed effect size is above the effect_threshold.



    """
    _colname_pvalue = 'ANOVA_FEATURE_pval'
    _colname_qvalue = 'ANOVA_FEATURE_FDR'
    _colname_drugid = 'DRUG_ID'
    _colname_feature = 'FEATURE'
    _colname_deltas = 'FEATURE_delta_MEAN_IC50'
    _colname_effect_size = 'FEATURE_IC50_effect_size'
    _colname_N_feature_pos = 'N_FEATURE_pos'

    def __init__(self, data, sep="\t", settings=None):
        """.. rubric:: Constructor

        :param data: an :class:`~gdsctools.anova.ANOVAResults` instance
            or a dataframe with the proper columns names (see below)
        :param settings: an instance of
            :class:`~gdsctools.settings.ANOVASettings`

        Expected column names to be found if a filename is provided::

            ANOVA_FEATURE_pval
            ANOVA_FEATURE_FDR
            FEATURE_delta_MEAN_IC50
            FEATURE_IC50_effect_size
            N_FEATURE_pos
            N_FEATURE_pos
            FEATURE
            DRUG_ID

        If the plotting is too slow, you can use the :meth:`selector` to prune
        the results (most of the data are noise and overlap on the middle
        bottom  area of the plot with little information.

        """
        # a copy since we do may change the data
        try:
            # an ANOVAResults contains a df attribute
            self.df = data.df.copy()
        except:
            # probably a dataframe
            self.df = data.copy()

        # this is redundant could reuse the input ??
        if settings is None:
            from gdsctools.settings import ANOVASettings
            self.settings = ANOVASettings()
        else:
            self.settings = AttrDict(**settings)

        self.figtools = Savefig()
        self.figtools.directory = self.settings.directory

        self.drugs = set(self.df[self._colname_drugid])
        self.features = set(self.df[self._colname_feature])

        # intensive calls made once for all
        self.groups_by_drugs = self.df.groupby(self._colname_drugid).groups
        self.groups_by_features = self.df.groupby(self._colname_feature).groups

[docs]    def selector(self, df, Nbest=1000, Nrandom=1000, inplace=False):
        """Select only the first N best rows and N random ones

        Sometimes, there are tens of thousands of associations and future
        analysis will include more features and drugs. Plotting volcano plots
        should therefore be fast and scalable. Here, we provide a naive
        way of speeding up the plotting by selecting only a subset of the data
        made of Nbest+Nrandom associations.

        :param df: the input dataframe with ANOVAResults
        :param int Nbest: how many of the most significant association
            should be kept
        :param int Nrandom: on top of the Nbest significant association,
            set how many other randomly chosen associations are to be kept.
        :return: pruned dataframe

        """
        if len(df) < Nbest:
            return df
        Nmax = Nbest + Nrandom
        N = len(df)
        if N > Nbest:
            x = range(Nbest, N)
            pylab.shuffle(x)
            n2pick = min(N, Nmax) - Nbest
            indices = range(0, Nbest) + x[0:n2pick]
        else:
            indices = range(0, Nbest)
        # indices in the index may not be order
        indices = [df.index[xx] for xx in indices]
        df = df.iloc[indices]
        if inplace is True:
            self.df = df
        else:
            return df

[docs]    def volcano_plot_all_drugs(self):
        """Create a volcano plot for each drug and save in PNG files

        Each filename is set to **volcano_<drug identifier>.png**
        """
        drugs = list(self.df[self._colname_drugid].unique())
        pb = Progress(len(drugs), 1)
        for i, drug in enumerate(drugs):
            self.volcano_plot_one_drug(drug)
            self.savefig("volcano_%s.png" % drug, size_inches=(10, 10))
            pb.animate(i+1)

[docs]    def volcano_plot_all_features(self):
        """Create a volcano plot for each feature and save in PNG files

        Each filename is set to **volcano_<feature name>.png**
        """
        features = list(self.df[self._colname_feature].unique())
        print('Creating image for each feature (using all drugs)')
        pb = Progress(len(features), 1)
        for i, feature in enumerate(features):
            self.volcano_plot_one_feature(feature)
            self.savefig("volcano_%s.png" % feature,
                    size_inches=(10, 10))
            pb.animate(i+1)

[docs]    def volcano_plot_all(self):
        """Create an overall volcano plot for all associations

        This method saves the picture in a PNG file named **volcano_all.png**.
        """
        # no annotations for all features.
        # this is slow, we can drop non relevant data
        data = self._get_volcano_sub_data('ALL')
        data['annotation'] = ['' for x in range(len(data))]

        self._volcano_plot(data, title='all drugs all features')

    def _get_fdr_from_pvalue_interp(self, pvalue):
        """Here, FDR are computed using an interpolation"""
        pvalue += 1e-15
        qvals = self.df[self._colname_qvalue]
        pvals = self.df[self._colname_pvalue]
        ya = qvals[pvals < pvalue].max()
        yb = qvals[pvals > pvalue].min()
        xa = pvals[pvals < pvalue].max()
        xb = pvals[pvals > pvalue].min()
        dx = xb - xa
        dy = yb - ya
        yc = ya + dy * (pvalue - xa) / dx
        return yc

    def _get_pvalue_from_fdr(self, fdr):
        """Get pvalue for a given FDR threshold

        This is equivalent to v17 of the R version but is not very precise
        we should use _get_pvalue_from_fdr_interp instead.

        """
        qvals = self.df[self._colname_qvalue]
        pvals = self.df[self._colname_pvalue]
        if isinstance(fdr, list):
            pvalues = [pvals[qvals < this].max() for this in fdr]
            return pvalues
        else:
            return pvals[qvals < fdr].max()

    def _get_pvalue_from_fdr_interp(self, fdr):
        # same as get_pvalue_from_fdr but with a linear interpolation
        fdr += 1e-15

        qvals = self.df[self._colname_qvalue]
        pvals = self.df[self._colname_pvalue]
        ya = pvals[qvals <= fdr].max()
        yb = pvals[qvals > fdr].min()
        xa = qvals[qvals <= fdr].max()
        xb = qvals[qvals > fdr].min()

        dx = xb - xa
        dy = yb - ya
        xc = fdr
        yc = ya + dy * (xc - xa) / dx
        return yc

    def _get_volcano_global_data(self):
        # using all data
        colname = self._colname_N_feature_pos
        minN = self.df[colname].min()
        maxN = self.df[colname].max()
        pvalues = self._get_pvalue_from_fdr(self.settings.FDR_threshold)
        return {'minN': minN, 'maxN': maxN,
                'pvalues': (self.settings.FDR_threshold, pvalues)}

    def _get_volcano_sub_data(self, mode, target=None):
        # Return data needed for each plot
        # TODO could be simplified but works for now

        # groups created in the constructor once for all
        if mode == self._colname_drugid:
            subdf = self.df.loc[self.groups_by_drugs[target]]
            texts = subdf[self._colname_feature]
        elif mode == 'FEATURE':
            subdf = self.df.loc[self.groups_by_features[target]]
            texts = subdf[self._colname_drugid]
        elif mode == 'ALL':
            # nothing to do, get all data
            subdf = self.df
            texts = subdf[self._colname_feature] # TODO + drug
        else:
            raise ValueError("mode parameter must be in [FEATURE, %s, ALL]" %
                    (self._colname_drugid))

        # replaced by groups created in the constructor
        #subdf = self.df[self.df[mode] == target]
        deltas = subdf[self._colname_deltas]
        effects = subdf[self._colname_effect_size]
        signed_effects = list(np.sign(deltas) * effects)
        qvals = list(subdf[self._colname_qvalue])
        pvals = list(subdf[self._colname_pvalue])
        #assocs = list(subdf['ASSOC_ID'])

        colors = []
        annotations = []

        data = pd.DataFrame(index=range(len(qvals)))
        data['pvalue'] = pvals
        data['signed_effect'] = signed_effects
        data['Feature'] = list(subdf[self._colname_feature])
        data['Drug'] = list(subdf[self._colname_drugid])
        data['text'] = texts.values
        #data['Assoc'] = assocs
        ## !! here, we need to use .values since the pandas dataframe
        # index goes from 1 to N but the origignal indices in subdf
        # may not be from 1 to N but random between 1 and M>>N
        data['FDR'] = subdf[self._colname_qvalue].values
        annotations = []

        # just an alias
        # FIXME: why do we have a switch here for PANCAN ?
        FDR_threshold = self.settings.FDR_threshold
        if self.settings.analysis_type == 'PANCAN':
            for sign, qval, pval in zip(signed_effects, qvals, pvals):
                if sign <= -self.settings.effect_threshold and \
                        qval <= FDR_threshold and \
                        pval <= self.settings.pvalue_threshold:
                    colors.append('green')
                    annotations.append(True)
                elif sign >= self.settings.effect_threshold and \
                        qval <= FDR_threshold and \
                        pval <= self.settings.pvalue_threshold:
                    colors.append('red')
                    annotations.append(True)
                else:
                    colors.append('black')
                    annotations.append(False)
        else:
            for delta, qval, pval in zip(deltas, qvals, pvals):
                if pval <= self.settings.pvalue_threshold and \
                        qval <= FDR_threshold and delta < 0: 
                    colors.append('green')
                    annotations.append(True)
                elif pval <= self.settings.pvalue_threshold and \
                        qval <= FDR_threshold and delta > 0:
                    colors.append('red')
                    annotations.append(True)
                else:
                    colors.append('black')
                    annotations.append(False)

        # here we normalise wrt the drug. In R code, normalised
        # my max across all data (minN, maxN)
        colname = self._colname_N_feature_pos
        markersize = subdf[colname] / subdf[colname].max()
        markersize = list(markersize * 800)
        markersize = [x if x > 80 else 80 for x in markersize]

        data['color'] = colors
        data['annotation'] = annotations
        data['markersize'] = markersize
        return data

[docs]    def volcano_plot_one_feature(self, feature):
        """Volcano plot for one feature (all drugs)

        :param feature: a valid feature name to be found in the results
        """
        assert feature in self.features, 'unknown feature name'
        # FEATURE is the mode's name, not a column's name
        data = self._get_volcano_sub_data('FEATURE', feature)
        self._volcano_plot(data, title=feature)

[docs]    def volcano_plot_one_drug(self, drug_id):
        """Volcano plot for one drug (all genomic features)

        :param drug_id: a valid drug identifier to be found in the results
        """
        assert drug_id in self.drugs, 'unknown drug name'
        data = self._get_volcano_sub_data(self._colname_drugid, drug_id)
        self._volcano_plot(data, title=drug_id)

    def _volcano_plot(self, data, title=''):
        """Main volcano plot function called by other methods
        such as volcano_plot_all"""
        # This functio is a bit complicated because it does create a few tricky
        # plots

        # It creates a volcano plot, which is the easy part
        # Then, it creates tooltips for the user interface in an IPython
        # shell using a callback to 'onpick' function coded here below
        # !! There seem to bes a memory leak in this function due to matplotlib
        # This is not easy to track down and should have no impact now that
        # ANOVAReport using JS instead of matplotlib 

        data = data.replace(np.inf, 0)
        data = data.replace(-np.inf, 0)

        colors = list(data['color'].values)
        pvalues = data['pvalue'].values
        signed_effects = data['signed_effect'].values
        markersize = data['markersize'].values

        Y = -np.log10(list(pvalues)) # should be cast to list ?

        num = 1
        #pylab.close(num)
        fig = pylab.figure(num=1)
        fig.clf()
        ax = fig.add_subplot(111)
        try:
            ax.set_facecolor('#EEEEEE') #matplotlib 2.0
        except:
            ax.set_axis_bgcolor('#EEEEEE')
        ax.cla()

        # TODO signed effects may be inf why ?


        X = [easydev.precision(x, digit=2) for x in signed_effects]
        Y = [easydev.precision(y, digit=2) for y in Y]

        # Using scatter() is slow as compared to plot()
        # However, plot cannot take different sizes/colors
        scatter = ax.scatter(X, Y, s=markersize,
                alpha=0.3, c=colors, linewidth=1, picker=True)
        scatter.set_zorder(11)

        m = abs(signed_effects.min())
        M = abs(signed_effects.max())
        pylab.xlabel("Signed effect size", fontsize=self.settings.fontsize)
        pylab.ylabel('-log10(pvalues)', fontsize=self.settings.fontsize)
        l = max([m, M]) * 1.1
        pylab.xlim([-l, l])
        ax.grid(color='white', linestyle='solid')

        # some aliases
        fdr = self.settings.FDR_threshold
        if fdr < self.df[self._colname_qvalue].min():
            fdr = self.df[self._colname_qvalue].min()


        fdrs = sorted(self.settings.volcano_additional_FDR_lines)
        fdrs = fdrs[::-1] # reverse sorting

        styles = ['--', ':', '-.']
        if self.settings.volcano_FDR_interpolation is True:
            get_pvalue_from_fdr = self._get_pvalue_from_fdr_interp
        else:
            get_pvalue_from_fdr = self._get_pvalue_from_fdr

        pvalue = get_pvalue_from_fdr(fdr)
        ax.axhline(-np.log10(pvalue), linestyle='--', lw=2,
            color='red', alpha=1, label="FDR {:.2f} ".format(fdr) + " %")


        for i, this in enumerate(fdrs):
            if this < self.df[self._colname_qvalue].min() or\
                this > self.df[self._colname_qvalue].max():
                    continue
            pvalue = get_pvalue_from_fdr(this)
            ax.axhline(-np.log10(pvalue), linestyle=styles[i],
                color='red', alpha=1, label="FDR {:.2f} ".format(this) +" %")

        pylab.ylim([0, pylab.ylim()[1]*1.2]) # times 1.2 to put the legend

        ax.axvline(0, color='gray', alpha=0.8, lw=2)
        axl = pylab.legend(loc='best')
        axl.set_zorder(10) # in case there is a circle behind the legend.

        #self.ax = ax
        #self.axx = ax.twinx()
        #self.common_ticks = ax.get_yticks()
        #self.common_ylim = ax.get_ylim()
        #pvals = self.df[self._colname_pvalue]
        #y1 = pvals.min()
        #y2 = pvals.max()
        #fdr1 = self._get_fdr_from_pvalue_interp(y1)
        #fdr2 = self._get_fdr_from_pvalue_interp(y2-2e-15) # make sure it exists
        #self.axx.set_ylim([fdr2, fdr1])
        #self.axx.set_ylabel('FDR \%', fontsize=self.settings.fontsize)

        # For the static version
        title_handler = pylab.title("%s" % str(title).replace("_","  "),
                fontsize=self.settings.fontsize/1.2)
        labels = []

        # This code allows the ipython user to click on the matplotlib figure
        # to get information about the drug and feature of a given circles.
        def onpick(event):
            ind = event.ind[0]
            try:
                title = str(str(data.iloc[ind]['Drug'])) + " / " + str(data.iloc[ind].Feature)
                title += "\nFDR=" + "%.4e" % data.iloc[ind]['FDR']
                title_handler.set_text(title.replace("_","  "))
            except:
                print('Failed to create new title on click')
            print(data.iloc[ind].T)
            fig.canvas.draw()

        # keep track on the id for further memory release
        # For more info search for "matplotlib memory leak mpl_connect"
        self.cid = fig.canvas.mpl_connect('pick_event', onpick)

        # for the JS version
        # TODO: for the first 1 to 2000 entries ?
        labels = []
        self.data = data
        for i, row in data[['Drug', 'Feature', 'FDR']].iterrows():

            template = """
<table border="1" class="dataframe">
  <tbody>
    <tr>
      <th>Drug</th>
      <td>%(Drug)s</td>
    </tr>
    <tr>
      <th>Feature</th>
      <td>%(Feature)s</td>
    </tr>
    <tr>
      <th>FDR</th>
      <td>%(FDR)s</td>
    </tr>
  </tbody>
</table>""" % row.to_dict()
            labels.append(template)

            # this is more elegant but slower
            #label = row.to_frame()
            #label.columns = ['Row {0}'.format(i)]
            #labels.append(str(label.to_html(header=False)))
        self.scatter = scatter
        self.current_fig = fig
        # not sure is this is required. could be a memory leak here
        import gc
        gc.collect()

[docs]    def savefig(self, filename, size_inches=(10, 10)):
        # Save the PNG first. The savefig automatically set the size
        # to a defined set and back to original figsize.
        self.figtools.savefig(filename + '.png', size_inches=size_inches)


[docs]class VolcanoANOVAJS(VolcanoANOVA):
    def __init__(self, data, sep="\t", settings=None):
        super(VolcanoANOVAJS, self).__init__(data, sep, settings)

[docs]    def render_drug(self, name):
        self.data = self._get_volcano_sub_data("DRUG_ID", name)
        return self._render_data(name)

[docs]    def render_feature(self, name):
        self.data = self._get_volcano_sub_data("FEATURE", name)
        return self._render_data(name)

[docs]    def render_all(self):
        self.data = self._get_volcano_sub_data("ALL")
        return self._render_data()

    def _render_data(self, name="all associations"):

        self.data['log10pvalue'] = -log10(self.data['pvalue'])

        self.data['color'] = self.data['color'].apply(lambda x:
                x.replace("black", "not_significant"))
        self.data['color'] = self.data['color'].apply(lambda x:
                x.replace("green", "sensitive"))
        self.data['color'] = self.data['color'].apply(lambda x:
                x.replace("red", "resistant"))

        # We have 3 colors but sometimes you may have only one or 2.
        # This may be an issue with canvasXpress. It seems essential
        # to sort the color column so that names are sorted alphabetically
        # and to include colors that are present in the sale order
        try:
            self.data.sort_values(by='color', inplace=True)
        except:
            self.data.sort("color", inplace=True)
        colors = []
        if "not_significant" in self.data.color.values:
            colors.append("rgba(0,0,0,0.5)")  # black
        if "resistant" in self.data.color.values:
            colors.append("rgba(205,0,0,0.5)")  # red
        if "sensitive" in self.data.color.values:
            colors.append("rgba(0,205,0,0.5)")  # green

        env = Environment()

        from easydev import get_package_location

        env.loader = jinja2.FileSystemLoader(
                        get_package_location("gdsctools")
                        + "/gdsctools/data/templates/")
        template = env.get_template("volcano.html")

        jinja = {}

        jinja["colors"] = colors
        jinja["Group"] = list(self.data['color'].values)
        jinja['xlabel'] = "Signed Effect Size"
        jinja['ylabel'] = "-log10(pvalue)"

        text = []
        for x,y,z in zip(self.data['Drug'].values,
            self.data['Feature'].values,
            self.data['FDR'].values):

            text.append("<b>Drug:</b>%s <br><b>Feature:</b>%s <br><b>FDR:</b>%s" % (x,y,z))
        jinja['vars'] = text

        """
        # does not work in the JS somehow some points do not appear
        # disabled for now
        markersize = self.data['markersize']
        markersize -= markersize.min()
        markersize /= markersize.max()
        markersize = (3*markersize).round()
        #markersize[markersize == 0] = 1

        FC = list(markersize.astype(int).astype(str))
        jinja['FC'] = FC
        """
        self.data.markersize /= (self.data.markersize.max()/3.)

        #First value is Y, second is X, following will be used in the
        try: # introduced in pandas > 0.16.2
            jinja['data'] = self.data[["signed_effect", "log10pvalue",
                "markersize"]].round(3).values.tolist()
        except: #for py3.3 on travis
            jinja['data'] = np.around(self.data[["signed_effect", "log10pvalue",
                "markersize"]]).values.tolist()
        jinja['title'] = '"%s"' % name

        fdrs = self.get_fdr_ypos()
        fdrs = [0 if np.isnan(x) else x for x in fdrs]
        jinja['additional_fdrs'] = ""
        for i,this in enumerate(fdrs):
            line = '\n{"color": "red", "width": 1, "type": "dashed", "y":  %s}'
            if i == len(fdrs)-1:
                pass
            else:
                line += ","
            jinja['additional_fdrs'] += line % this

        m = abs(self.data.signed_effect.min())
        M = abs(self.data.signed_effect.max())
        jinja['minX'] = -max([m, M]) * 1.1
        jinja['maxX'] =  max([m, M]) * 1.1
        jinja['maxY'] = self.data["log10pvalue"].max() * 1.2
        if max(fdrs) > jinja['maxY']:
            jinja['maxY'] = max(fdrs) * 1.2

        self.html = template.render(jinja)
        return self.html

[docs]    def get_fdr_ypos(self):
        fdr = self.settings.FDR_threshold
        fdrs = sorted(self.settings.volcano_additional_FDR_lines)
        fdrs = fdrs[::-1] # reverse sorting
        if self.settings.volcano_FDR_interpolation is True:
            get_pvalue_from_fdr = self._get_pvalue_from_fdr_interp
        else:
            get_pvalue_from_fdr = self._get_pvalue_from_fdr

        # If provided FDR is below minimum one, use minimum FDR as the
        # threshold.
        pvalue = get_pvalue_from_fdr(fdr)
        if np.isnan(pvalue):
            pvalue = self.df["ANOVA_FEATURE_pval"].min()
        #ax.axhline(-np.log10(pvalue), linestyle='--', lw=2,
        #    color='red', alpha=1, label="FDR %s " %  fdr + " \%")

        pvalues = [-log10(pvalue)]
        # If there is a NAN, plot is not shown, so we must be sure all FDRs are
        # real values. Set 0 for the NAN so that we do not see the lines
        #pvalues = [0 if np.isnan(x) else x for x in pvalues]
        for i, this in enumerate(fdrs):
            if this < self.df[self._colname_qvalue].min() or\
                this > self.df[self._colname_qvalue].max():
                    pvalues.append(0)
                    continue
            pvalue = get_pvalue_from_fdr(this)
            pvalues.append(-np.log10(pvalue))

        pvalues = [0 if np.isnan(x) else x for x in pvalues]
        # we must have 3 values. If not, just repeat the last values
        return pvalues


    # window.CanvasXPress.references
    # Working stuff:
    # window.CanvasXpress.references[0].setHeight(400)
    # window.CanvasXpress.references[0].data.d.line[0].color = 'green'
    # window.CanvasXpress.references[0].redraw()
    # window.CanvasXpress.references[0].redraw()


class ScatterJS(object):
    """

        df = 
        js = ScatterJS(df, "x_name", "y_name", "color_name", "size_name")
        html = js.get_html()

    """
    def __init__(self, df, x, y, color, size):
        self.data = df.copy()
        self._colname_color = color
        self._colname_x = x
        self._colname_y = y
        self._colname_size = size

        self.minX = 0
        self.minY = 0
        self.maxX = 1
        self.maxY = self.data[self._colname_y].max() * 1.1

    def get_html(self):

        # We have 3 colors but sometimes you may have only one or 2.
        # This may be an issue with canvasXpress. It seems essential
        # to sort the color column so that names are sorted alphabetically
        # and to include colors that are present in the sale order
        """try:
            self.data.sort_values(by='color', inplace=True)
        except:
            self.data.sort("color", inplace=True)
        """
        # Jinja related
        from easydev import get_package_location
        env = Environment()
        env.loader = jinja2.FileSystemLoader(
                        get_package_location("gdsctools")
                        + "/gdsctools/data/templates/")
        template = env.get_template("scatter.html")

        # We need to cireate 20 different colors
        from colormap import Colormap
        c = Colormap()
        cmap = c.cmap_linear("red", "blue", "yellow", N=20)

        colors = self.data[self._colname_color]
        colors = ["red" for x in self.data[self._colname_color]]

        jinja = {}
        jinja["colors"] = ["rgba(205,0,0,0.5)"]
        jinja["Group"] = colors
        jinja['xlabel'] = '"%s"' % self.xlabel
        jinja['ylabel'] = '"%s"' % self.ylabel

        selection = [self._colname_x, self._colname_y]

        text = []
        for index in zip(self.data.index):
            text.append("<pre>%s</pre>" % self.data.iloc[index][selection].to_string())
        jinja['vars'] = text

        #self.data.markersize /= (self.data.markersize.max()/3.)
        self.data['markersize'] = 20
        selection = [self._colname_x, self._colname_y, self._colname_size]
        #First value is Y, second is X, following will be used in the
        try: # introduced in pandas > 0.16.2
            jinja['data'] = self.data[selection].round(3).values.tolist()
        except: #for py3.3 on travis
            jinja['data'] = np.around(self.data[selection]).values.tolist()

        jinja['title'] = '"Regression coefficient vs Bayes factor for all drugs"' 

        jinja['minX'] = self.minX
        jinja['minY'] = self.minY
        jinja['maxX'] = self.maxX
        jinja['maxY'] = self.maxY

        self.html = template.render(jinja)
        return self.html