Source code for gdsctools.regression

# -*- python -*-
# -*- coding: utf-8 -*-
#
#  This file is part of GDSCTools software
#
#  Copyright (c) 2015 - Wellcome Trust Sanger Institute
#  All rights reserved
#  Copyright (c) 2016 - Institut Pasteur
#  All rights reserved
#
#  File author(s): Thomas Cokelaer <cokelaer@gmail.com>
#  File author(s): Thomas Cokelaer <thomas.cokelaer@pasteur.fr>
#
#  Distributed under the BSD 3-Clause License.
#  See accompanying file LICENSE.txt distributed with this software
#
#  website: http://github.com/CancerRxGene/gdsctools
#
##############################################################################
"""Look for IC50 vs and genomic features associations using Regression methods"""
import itertools
import warnings

import pandas as pd
import pylab
import numpy as np

from easydev import Progress

from gdsctools.models import BaseModels
from gdsctools.boxswarm import BoxSwarm

from sklearn.linear_model import enet_path
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import linear_model  # use the module rather than individual
                                  # classes so that the various models
                                  # (Ridge, Lasso, ElasticNet, ...) can be
                                  # looked up on the module


__all__ = ["Regression", 'GDSCRidge', "GDSCLasso", "GDSCElasticNet", 
           "RegressionCVResults"]


"""book keeping

from statsmodels.formula.api import OLS

if self.settings.regression_method == 'ElasticNet':
    self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                       alpha=self.settings.regression_alpha,
                       L1_wt=self.settings.regression_L1_wt)
elif self.settings.regression_method == 'OLS':
    self.data_lm = OLS(odof.Y, df.values).fit()
elif self.settings.regression_method == 'Ridge':
    self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                       alpha=self.settings.regression_alpha, L1_wt=0)
elif self.settings.regression_method == 'Lasso':
    self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                       alpha=self.settings.regression_alpha, L1_wt=1)

"""


class RegressionCVResults(object):
    """Simple data structure to hold results of the regression analysis

    - :attr:`model`
    - :attr:`kfold`: number of folds used
    - :attr:`Rp`: Pearson correlation of the fit
    - :attr:`alpha`: best alpha parameter
    - :attr:`ln_alpha`: best alpha parameter (natural log scale)
    """
    def __init__(self, model, Rp, kfold=None):
        self.model = model
        self.Rp = Rp
        self.kfold = kfold

    def _get_alpha(self):
        return self.model.alpha_
    alpha = property(_get_alpha)

    def _get_ln_alpha(self):
        return pylab.log(self.alpha)
    ln_alpha = property(_get_ln_alpha)

    def _get_coefficients(self):
        return self.model.coef_
    coefficients = property(_get_coefficients)

    def __str__(self):
        txt = "Best alpha on %s folds: %s (%.2f in log scale); Rp=%s" % \
              (self.kfold, self.alpha, self.ln_alpha, self.Rp)
        return txt


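# A minimal sketch of how RegressionCVResults is populated (synthetic data;
# in practice, runCV() below builds this object for you):
#
#     import numpy as np
#     from sklearn.linear_model import LassoCV
#     X = np.random.randn(50, 10)
#     y = X[:, 0] + 0.1 * np.random.randn(50)
#     model = LassoCV(cv=5).fit(X, y)
#     res = RegressionCVResults(model, Rp=0.9, kfold=5)
#     print(res)  # reports the best alpha and the Pearson correlation
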
class Regression(BaseModels):
    """Base class for all Regression analyses

    In the :class:`gdsctools.anova.ANOVA` case, the regression is based on
    the OLS method and is computed for a given drug and a given feature
    (:term:`ODOF`). The analysis is then repeated across all features for a
    given drug (:term:`ODAF`) and finally extended to all drugs
    (:term:`ADAF`), so there is one test for each combination of drug and
    feature.

    Here, all features for a given drug are taken together to perform a
    regression analysis (:term:`ODAF`). The regression algorithms
    implemented so far are:

    - Ridge
    - Lasso
    - ElasticNet
    - LassoLars

    They are based on tools from the scikit-learn library.
    """
    def __init__(self, ic50, genomic_features=None, verbose=False):
        """.. rubric:: Constructor

        :param ic50: an IC50 file
        :param genomic_features: a genomic feature file

        See :ref:`data` for help on the input data formats.
        """
        super(Regression, self).__init__(ic50, genomic_features,
            verbose=verbose, set_media_factor=False)
        self.scale = False

    def _get_one_drug_data(self, name, randomize_Y=False):
        """Return X and Y for a given drug, dropping NAs

        :param name: drug name
        :param randomize_Y: shuffle the Y values (used to build NULL models)

        - drops NA
        - drops TISSUE_FACTOR
        - drops MSI_FACTOR
        """
        Y = self.ic50.df[name]
        Y.dropna(inplace=True)
        X = self.features.df.loc[Y.index].copy()
        try:
            X = X.drop('TISSUE_FACTOR', axis=1)
        except Exception:
            pass
        try:
            X = X.drop('MSI_FACTOR', axis=1)
        except Exception:
            pass

        if self.scale is True:
            columns = X.columns
            # the cast to float is essential; otherwise a ValueError is raised
            X = preprocessing.scale(X.astype(float))
            X = pd.DataFrame(X, columns=columns)

        if randomize_Y:
            Y = Y.copy()
            pylab.shuffle(Y.values)
        return X, Y

    def _fit_model(self, drug_name, model):
        """Call the fit method of a model for a given drug name

        The model is fitted on the X, Y data of that drug and returned.
        """
        X, Y = self._get_one_drug_data(drug_name)
        model.fit(X, Y)
        return model

    def plot_importance(self, drug_name, model=None, fontsize=11,
            max_label_length=35, orientation="vertical"):
        """Plot the absolute weights found by a fitted model.

        :param str drug_name:
        :param model: a model (if None, the best model is used)
        :param int fontsize: defaults to 11
        :param max_label_length: 35 by default
        :param orientation: orientation of the plot (vertical or horizontal)
        :return: the dataframe with the weights (may be empty)

        .. note:: if no weights differ from zero, no plot is created.
        """
        X, Y = self._get_one_drug_data(drug_name)
        if model is None:
            model = self.get_best_model(drug_name)
        model.fit(X, Y)

        df = pd.DataFrame({'name': X.columns, 'weight': model.coef_})
        df = df.set_index("name")
        df = df[df['weight'] != 0]

        if len(df):
            barplot(df, "weight", orientation=orientation,
                max_label_length=max_label_length, fontsize=fontsize)
            # sometimes there are only a few weights, all close to zero.
            # Set the ylim to 1 if it is below 0.1.
            if pylab.ylim()[1] < 0.1:
                pylab.ylim([0, 1])
            if len(df) < 5:
                pylab.xlim(-5, 5)
        return df

    def _print(self, txt):
        if self.verbose:
            print(txt)

    def get_best_model(self, drug_name, kfolds=10, alphas=None,
                       l1_ratio=0.5):
        """Return the best model fitted using a cross validation

        :param drug_name:
        :param kfolds:
        :param alphas:
        :param l1_ratio:
        """
        self._print("Running CV to estimate the best alpha.")
        results = self.runCV(drug_name, kfolds=kfolds, alphas=alphas,
                             l1_ratio=l1_ratio)
        best_alpha = results.alpha
        model = self.get_model(alpha=best_alpha)
        self._print("Using alpha=%s." % model.alpha)
        return model

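    # Sketch of the typical flow (assuming `ic` and `gf` are the IC50 and
    # GenomicFeatures instances shown in the GDSCElasticNet docstring, and
    # drug 1047 from the gdsctools test data):
    #
    #     lasso = GDSCLasso(ic, gf)
    #     model = lasso.get_best_model(1047, kfolds=10)
    #     weights = lasso.plot_importance(1047, model=model)
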
    def plot_weight(self, drug_name, model=None, fontsize=12,
            figsize=(10, 7), max_label_length=20, Nmax=40):
        """Plot the elastic net weights

        :param drug_name: the drug identifier
        :param model: a fitted model (if None, the best model is used)
        :param max_label_length: truncate feature names to this length
        :param Nmax: plot at most Nmax weights

        Large alpha values have a more stringent effect on the weights and
        select only some of them (or maybe none). Conversely, setting alpha
        to zero keeps all the weights.

        .. plot::
            :include-source:

            from gdsctools import *
            ic = IC50(gdsctools_data("IC50_v5.csv.gz"))
            gf = GenomicFeatures(gdsctools_data("genomic_features_v5.csv.gz"))
            en = GDSCElasticNet(ic, gf)
            model = en.get_model(alpha=0.01)
            en.plot_weight(1047, model=model)
        """
        X, Y = self._get_one_drug_data(drug_name)
        if model is None:
            model = self.get_best_model(drug_name)
        model.fit(X, Y)

        df = pd.DataFrame({'name': X.columns, 'weight': model.coef_})
        df = df.set_index("name").sort_values("weight")
        df = df[df != 0].dropna()

        # split the data, keeping only the Nmax strongest weights at most
        if len(df) > Nmax:
            # figure out the threshold in absolute value so that we keep
            # the Nmax strongest weights irrespective of their sign
            threshold = df.abs().sort_values(by="weight").values[-Nmax:][0, 0]
            df1 = df.query("weight<=0 and abs(weight)>=@threshold").copy()
            df2 = df.query("weight>=0 and abs(weight)>=@threshold").copy()
        else:
            df1 = df[df.weight < 0].copy()
            df2 = df[df.weight >= 0].copy()

        df1.index = [this[0:max_label_length] for this in df1.index]
        df2.index = [this[0:max_label_length] for this in df2.index]

        # We also want some symmetry (as many red as blue bars) so that the
        # spans of positive and negative weights are equivalent
        N = len(df2) - len(df1)
        if N > 0:  # more red (RHS) than blue (LHS): pad the blue side
            for i in range(1, N + 1):
                label = "_dummy%s" % i
                df1.loc[label, "weight"] = 0
            df1.index = [x if not x.startswith("_dummy") else ""
                         for x in df1.index]
        elif N < 0:  # more blue (LHS) than red (RHS): pad the red side
            for i in range(1, abs(N) + 1):
                label = "_dummy%s" % i
                df2.loc[label, "weight"] = 0
            df2.index = [x if not x.startswith("_dummy") else ""
                         for x in df2.index]
        df2.sort_values(by="weight", ascending=True, inplace=True)

        f, (ax, ax2) = pylab.subplots(1, 2, sharey=True, figsize=figsize)
        ff = pylab.gcf()
        ff.set_facecolor('white')
        self.df1 = df1
        self.df2 = df2
        if len(df1):
            df1.plot(y="weight", kind="bar", width=1, lw=1, ax=ax,
                     color="b", legend=False, fontsize=fontsize,
                     figsize=figsize)
        if len(df2):
            df2.plot(y="weight", kind="bar", width=1, lw=1, ax=ax2,
                     color="r", sharey=True, legend=False,
                     fontsize=fontsize, figsize=figsize)
        if len(df1) == 0 and len(df2) == 0:
            pylab.xlim([-5, 5])
            pylab.ylim([-1, 1])
            pylab.grid(True)
            return df

        # hide the spines between ax and ax2
        ax.spines['right'].set_visible(False)
        ax2.spines['left'].set_visible(False)
        ax.yaxis.tick_left()
        ax2.yaxis.tick_right()
        ax2.tick_params(labelleft='off')

        d = 0.02  # length of the diagonal "cut" marks
        kwargs = dict(transform=ax.transAxes, color='k', clip_on=False)
        ax.plot((1 - d, 1 + d), (-d, +d), **kwargs)
        ax.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs)
        kwargs.update(transform=ax2.transAxes)  # switch to the second axes
        ax2.plot((-d, d), (1 - d, 1 + d), **kwargs)
        ax2.plot((-d, d), (-d, d), **kwargs)

        ax.grid()
        ax2.grid()
        # positions are [x0, y0, width_x, width_y]
        ax.set_position([0.06, 0.3, 0.425, 0.6])
        ax2.set_position([0.50, 0.3, 0.425, 0.6])
        return df

    def _get_rpearson(self, Y_pred, Y_test):
        self.a = Y_pred
        self.b = Y_test
        if Y_pred.std() == 0:
            Rp = 0
        else:
            Rp = np.corrcoef(Y_pred, Y_test)[0, 1]
        if abs(Rp) < 1e-10:
            Rp = 0
        return Rp

    def fit(self, drug_name, alpha=1, l1_ratio=0.5, kfolds=10,
            show=True, tol=1e-3, normalize=False, shuffle=False,
            perturbation=0.01, randomize_Y=False):
        """Run the regression with a cross validation for one value of alpha

        :param drug_name: the drug to analyse
        :param float alpha: note that this alpha parameter corresponds to
            the lambda parameter in the glmnet R package.
        :param float l1_ratio: the lasso penalty parameter. Note that in
            scikit-learn, l1_ratio corresponds to the alpha parameter in
            the glmnet R package. An l1_ratio of 0.5 gives equivalent
            weights to the Lasso and Ridge penalties.
        :param int kfolds: defaults to 10
        :param shuffle: shuffle the indices in the KFold
        :return: the kfold scores, one per fold. The score is the Pearson
            correlation between predictions and test values.

        .. note:: l1_ratio < 0.01 is not reliable unless a sequence of
            alphas is provided.

        .. note:: alpha = 0 corresponds to an OLS analysis
        """
        assert kfolds > 1, "kfolds must be larger than 1"

        # Get the data for the requested drug
        X, Y = self._get_one_drug_data(drug_name, randomize_Y=randomize_Y)

        # Get a model
        en = self.get_model(alpha=alpha, l1_ratio=l1_ratio,
                            normalize=normalize, tol=tol)

        # Create a cross validation set of indices for training and testing
        kfold = model_selection.KFold(kfolds, shuffle=shuffle)

        # Store the results
        scores = []
        count = 1
        if show is True:
            pylab.clf()
        for train_index, test_index in kfold.split(Y):
            # Get the X and Y training and testing data sets
            X_train = X.iloc[train_index]
            X_test = X.iloc[test_index]
            Y_train = Y.iloc[train_index]
            Y_test = Y.iloc[test_index]

            # Fit the model on the training set
            en.fit(X_train, Y_train)

            # Now compare the prediction with Y_test using the Pearson
            # correlation
            Y_pred = en.predict(X_test)
            scores.append(self._get_rpearson(Y_pred, Y_test))
            if show is True:
                # a small perturbation is added so that overlapping points
                # remain visible
                pylab.plot(Y_test, Y_pred + np.append(Y_pred[1:], Y_pred[0])
                           * perturbation, "ob", alpha=0.5)

            if kfolds == 1 and count == 1:
                break
            else:
                count += 1

        if show:
            pylab.title("Prediction on test set (Pearson correlation=%.2f)"
                        % np.mean(scores))
            pylab.xlabel("observed drug response")
            pylab.ylabel("predicted drug response")
            pylab.grid(True)

        self.en = en
        self.X = X
        self.Y = Y

        return scores

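    # Example (sketch): a single ElasticNet fit with a 10-fold CV, using
    # the `en` instance built in the GDSCElasticNet docstring below:
    #
    #     scores = en.fit(1047, alpha=0.01, l1_ratio=0.5, kfolds=10,
    #                     show=False)
    #     print(np.mean(scores))  # mean Pearson correlation across folds
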
    def runCV(self, drug_name, l1_ratio=0.5, alphas=None, kfolds=10,
              verbose=True, shuffle=True, randomize_Y=False, **kargs):
        """Perform a cross validation to get the best alpha parameter

        :return: an instance of :class:`RegressionCVResults` that contains
            the best alpha parameter and the Pearson correlation value.
        """
        # Get the data for the requested drug
        X, Y = self._get_one_drug_data(drug_name, randomize_Y=randomize_Y)

        if kfolds > len(X):
            kfolds = len(X)

        # Create a model
        kfold = model_selection.KFold(kfolds, shuffle=shuffle)
        en = self._get_cv_model(l1_ratio=l1_ratio, alphas=alphas,
                                kfold=kfold, **kargs)

        # Fit the model
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            model = en.fit(X, Y)

        prediction = model.predict(X)
        Rp = self._get_rpearson(prediction, Y)

        res = RegressionCVResults(model, Rp, kfolds)
        if verbose:
            print(res)
        return res

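    # Example (sketch): estimate the best alpha by CV, then build the
    # corresponding model:
    #
    #     res = en.runCV(1047, kfolds=8, verbose=False)
    #     best_model = en.get_model(alpha=res.alpha)
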
    def tune_alpha(self, drug_name, alphas=None, N=80, l1_ratio=0.5,
                   kfolds=10, show=True, shuffle=False,
                   alpha_range=[-2.8, 0.1], randomize_Y=False):
        """Interactive tuning of the model (alpha).

        This is much faster than :meth:`plot_cindex` but much slower than
        :meth:`runCV`.

        .. plot::
            :include-source:

            from gdsctools import *
            ic = IC50(gdsctools_data("IC50_v5.csv.gz"))
            gf = GenomicFeatures(gdsctools_data("genomic_features_v5.csv.gz"))
            en = GDSCElasticNet(ic, gf)
            en.tune_alpha(1047, N=40, l1_ratio=0.1)
        """
        if alphas is None:
            # logspace returns a vector with uniform spacing on a log
            # scale. The default alpha_range of [-2.8, 0.1] means alphas
            # from about 1.6e-3 to 1.26, that is ln(alpha) from about
            # -6.45 to 0.23.
            a1, a2 = alpha_range
            alphas = pylab.logspace(a1, a2, N)

        # Perform a CV across the different alphas
        all_scores = []
        for alpha in alphas:
            scores = self.fit(drug_name, alpha, l1_ratio=l1_ratio,
                              kfolds=kfolds, show=False, shuffle=shuffle,
                              randomize_Y=randomize_Y)
            all_scores.append(scores)

        # We can now plot the mean scores with an error envelope
        df = pd.DataFrame(all_scores)

        # We also identify the best correlation and corresponding alpha
        maximum = df.mean(axis=1).max()
        alpha_best = alphas[df.mean(axis=1).argmax()]

        if show is True:
            mu = df.mean(axis=1)
            sigma = df.var(axis=1)
            pylab.clf()
            pylab.errorbar(pylab.log(alphas), mu, yerr=sigma, color="gray")
            pylab.plot(pylab.log(alphas), mu, 'or')
            pylab.axvline(pylab.log(alpha_best), lw=4, alpha=0.5, color='g')
            pylab.title("Mean scores (Pearson) across alphas for kfold=%s"
                        % kfolds)
            pylab.xlabel("ln(alpha)")
            pylab.ylabel("mean score (Pearson)")
            pylab.grid()

        results = {"alpha_best": alpha_best,
                   "ln_alpha": pylab.log(alpha_best),
                   "maximum_Rp": maximum}
        return results

    def check_randomness(self, drug_name, kfolds=10, N=10,
                         progress=False, nbins=40, show=True, **kargs):
        """Compute the Bayes factor between a NULL model and the best model
        fitted N times

        :param drug_name:
        :param kfolds:
        :param int N: optimise NULL models and real models N times
        :param progress:
        :param nbins:
        :param show:

        The Bayes factor is computed as::

            S = sum([s > r for s, r in zip(scores, random_scores)])
            proba = S / len(scores)
            bayes_factor = 1. / (1 - proba)

        Interpretation of the Bayes factor values, according to Kass and
        Raftery (1995):

        ============================ ==================
        Interpretation               B(1,2)
        ============================ ==================
        Very strong support for 1    < 0.0067
        Strong support for 1         0.0067 to 0.05
        Positive support for 1       0.05 to 0.33
        Weak support for 1           0.33 to 1
        No support for either model  1
        Weak support for 2           1 to 3
        Positive support for 2       3 to 20
        Strong support for 2         20 to 150
        Very strong support for 2    > 150
        ============================ ==================

        References:
        http://www.socsci.uci.edu/~mdlee/LodewyckxEtAl2009.pdf
        http://www.aarondefazio.com/adefazio-bayesfactor-guide.pdf
        """
        scores = []
        pb = Progress(N)
        for i in range(N):
            # Fit a model using CV
            inter_results = self.runCV(drug_name, kfolds=kfolds,
                                       verbose=False, **kargs)
            scores.append(inter_results.Rp)
            if progress:
                pb.animate(i + 1)

        random_scores = []
        pb = Progress(N)
        for i in range(N):
            # Fit a NULL model (shuffled Y) using CV
            inter_results = self.runCV(drug_name, kfolds=kfolds,
                                       randomize_Y=True, verbose=False,
                                       **kargs)
            random_scores.append(inter_results.Rp)
            if progress:
                pb.animate(i + 1)

        from scipy.stats import ttest_ind
        ttest_res = ttest_ind(scores, random_scores)
        results = {"scores": scores,
                   "random_scores": random_scores,
                   "ttest_pval": ttest_res.pvalue}

        # Compute the Bayes factor, avoiding an infinite value when all
        # scores beat the random ones (as communicated by M. Menden).
        S = sum([s > r for s, r in zip(scores, random_scores)])
        proba = S / len(scores)
        if proba == 1:
            # use the maximum resolvable value instead of infinity
            bayes_factor = 1. / (1. / len(scores))
        else:
            bayes_factor = 1. / (1 - proba)
        results['bayes_factor'] = bayes_factor

        M = max(max(scores), max(random_scores)) * 1.2
        m = min(min(scores), min(random_scores)) * 1.2
        if show:
            bins = pylab.linspace(m, M, nbins)
            pylab.clf()
            pylab.hist(scores, bins=bins, color="b", alpha=0.5)
            pylab.hist(random_scores, color="r", alpha=0.5, bins=bins)
            pylab.title("Bayes factor=%(bayes_factor).2f" % results)
            pylab.grid(True)
            pylab.xlabel("Coefficient of correlation Rp")
            pylab.ylabel("#")

        return results

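    # The Bayes factor above reduces to this computation (a sketch with
    # made-up score lists):
    #
    #     scores = [0.5, 0.6, 0.55]
    #     random_scores = [0.1, 0.2, 0.15]
    #     S = sum(s > r for s, r in zip(scores, random_scores))
    #     proba = S / len(scores)  # here: 1.0
    #     bayes_factor = len(scores) if proba == 1 else 1. / (1 - proba)
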
    def dendogram_coefficients(self, stacked=False, show=True,
                               cmap="terrain"):
        """Show the coefficients of the optimised models, one per drug

        This is meant for demonstration and small data sets.
        """
        drugids = self.drugIds
        from easydev import Progress
        pb = Progress(len(drugids))
        d = {}

        for i, drug_name in enumerate(drugids):
            X, Y = self._get_one_drug_data(drug_name, randomize_Y=False)
            results = self.runCV(drug_name, verbose=False)
            df = pd.DataFrame({'name': X.columns,
                               'weight': results.coefficients})
            df = df.set_index("name").sort_values("weight")
            d[drug_name] = df.copy()
            pb.animate(i + 1)

        # use drugids to keep the same order as in the data
        dfall = pd.concat([d[i] for i in drugids], axis=1)
        dfall.columns = drugids

        if show:
            from biokit import heatmap
            h = heatmap.Heatmap(dfall, cmap=cmap)
            h.plot(num=1, colorbar_position="top left")

        if stacked is True:
            dfall = dfall.stack().reset_index()
            dfall.columns = ["feature", "drug", "weight"]
        return dfall

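    # Example (sketch): stacked weights across all drugs of a small data
    # set, using the `lasso` instance from the earlier sketch:
    #
    #     dfall = lasso.dendogram_coefficients(stacked=True, show=False)
    #     dfall.sort_values(by="weight").tail()
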
    def boxplot(self, drug_name, model, n=5, minimum_match_per_combo=10,
                bx_vert=True, bx_alpha=0.5, verbose=False,
                max_label_length=40):
        """Boxplot of the most important features.

        :param int n: maximum number of most important features to show
            (in terms of the coefficient/weight used in the model)
        :param int minimum_match_per_combo: minimum number of alterations
            required for a feature combination to be shown

        ::

            # assuming there is a drug with identifier 29
            r = GDSCLasso()
            model = r.get_best_model(29)
            r.boxplot(29, model)

        In addition, the wild type case (where the total number of
        mutations is zero) is also shown.
        """
        # The data and the provided model fitted with it
        X, Y = self._get_one_drug_data(drug_name)
        fitted_model = self._fit_model(drug_name, model)
        df = pd.DataFrame({'name': X.columns,
                           'weight': fitted_model.coef_})
        df = df.set_index("name")

        weights = df.abs().sort_values("weight")[-n:]
        feature_names = weights.index
        features = self.features.df[feature_names].copy()  # don't change the data

        sorted_names = features.sum(axis=0).sort_values(ascending=True).index
        features = features[sorted_names].transpose()

        # mutually exclusive sorting (disabled for now)
        """for index in range(n):
            new_indices = features.ix[n-index-1].sort_values(
                ascending=False).index
            features = features.ix[:, new_indices]
        # ignore rows where all features are off
        features = features.ix[:, features.sum(axis=0) > 0]
        self._features = features
        self.weights = weights
        pylab.imshow(features.values[::-1], interpolation="None",
                     aspect="auto", cmap="gray_r")
        """

        # barplot of the strongest weights
        barplot(weights, "weight")

        indices = {}
        features = features.transpose()
        features['total'] = features.sum(axis=1)

        # The column names must be valid identifiers to be usable with the
        # pandas query() calls below
        original_columns = features.columns[:]
        columns = [x.replace("-", "_") for x in features.columns]
        columns = [x.replace("(", "_") for x in columns]
        columns = [x.replace(")", "_") for x in columns]
        columns = [x.replace(",", "_") for x in columns]
        columns = [x.replace(".", "_") for x in columns]
        features.columns = columns
        self._features = features

        # loop over all possible feature combinations
        for n_combo in range(1, len(columns) + 1):
            for combo in itertools.combinations(columns, n_combo):
                if "total" in combo:
                    continue
                # using query from pandas, we get the indices for which the
                # n_combo features (and only those) are on (==1)
                query = " and ".join(["%s==1"] * n_combo)
                query += " and total==%s" % n_combo
                inner_indices = features.query(query % combo).index
                if len(inner_indices) >= minimum_match_per_combo:
                    name = ",".join(combo)
                    indices[name] = inner_indices
                    if verbose:
                        print("Found %s with %s events" %
                              (name, len(inner_indices)))

        # Wild type
        inner_indices = features.query("total==0").index
        indices["Wild Type"] = inner_indices

        _X, Y = self._get_one_drug_data(drug_name)
        names = list(indices.keys())
        means = [Y.loc[indices[name]].dropna().mean() for name in names]
        df = pd.DataFrame({"names": names, "mean": means})
        sorted_names_by_mean = df.sort_values(by="mean")['names']

        data = [Y.loc[indices[name]].dropna()
                for name in sorted_names_by_mean]

        # the dropna may leave some sub-data/names empty; filter them out
        sorted_names_by_mean = [name for this, name in
                                zip(data, sorted_names_by_mean) if len(this)]
        data = [this for this in data if len(this)]

        for subdata in data:
            subdata.index.name = subdata.index.name[0:max_label_length]

        if len(data):
            bx = BoxSwarm(data, sorted_names_by_mean)
            if bx_vert is False:
                bx.xlabel = "Drug response"
            else:
                bx.ylabel = "Drug response"
            bx.plot(vert=bx_vert, alpha=bx_alpha, widths=0.5)

        return {'weights': weights, "data": data}


class GDSCRidge(Regression):
    """Same as :class:`Regression`"""
    def __init__(self, ic50, genomic_features=None, verbose=False):
        super(GDSCRidge, self).__init__(ic50, genomic_features,
            verbose=verbose)

    def get_model(self, alpha=1, l1_ratio=None, **kargs):
        return linear_model.Ridge(alpha)

    def _get_cv_model(self, alphas=None, kfold=None, l1_ratio=None, **kargs):
        if alphas is None:
            alphas = (0.1, 1.0, 10.0)
        return linear_model.RidgeCV(alphas=alphas, cv=kfold, **kargs)

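# A minimal usage sketch, assuming the GDSC v5 test files shipped with
# gdsctools and drug identifier 1047:
#
#     from gdsctools import IC50, GenomicFeatures, gdsctools_data
#     ic = IC50(gdsctools_data("IC50_v5.csv.gz"))
#     gf = GenomicFeatures(gdsctools_data("genomic_features_v5.csv.gz"))
#     ridge = GDSCRidge(ic, gf)
#     model = ridge.get_best_model(1047)
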
class GDSCLasso(Regression):
    """See :class:`Regression`"""
    def __init__(self, ic50, genomic_features=None, verbose=False):
        super(GDSCLasso, self).__init__(ic50, genomic_features,
            verbose=verbose)

    def get_model(self, alpha=1, l1_ratio=None, **kargs):
        return linear_model.Lasso(alpha)

    def _get_cv_model(self, alphas=None, kfold=None, l1_ratio=None, **kargs):
        return linear_model.LassoCV(alphas=alphas, cv=kfold, **kargs)

class GDSCLassoLars(Regression):
    """See :class:`Regression`"""
    def __init__(self, ic50, genomic_features=None, verbose=False):
        super(GDSCLassoLars, self).__init__(ic50, genomic_features,
            verbose=verbose)

    def get_model(self, alpha=1, l1_ratio=None, **kargs):
        return linear_model.LassoLars(alpha=alpha)

    def _get_cv_model(self, alphas=None, kfold=None, l1_ratio=None, **kargs):
        return linear_model.LassoLarsCV(cv=kfold, **kargs)

class GDSCElasticNet(Regression):
    """Variant of :class:`ANOVA` that handles associations at the drug
    level using an ElasticNet analysis of the IC50 vs Feature matrices.

    As compared to :class:`GDSCRidge` and :class:`GDSCLasso`, the
    ElasticNet model combines the L1 (Lasso) and L2 (Ridge) penalties.

    Here is an example of how to perform the analysis, which is similar to
    the ANOVA API:

    .. plot::
        :include-source:
        :width: 80%

        from gdsctools import GDSCElasticNet, gdsctools_data, IC50, GenomicFeatures

        ic50 = IC50(gdsctools_data("IC50_v5.csv.gz"))
        gf = GenomicFeatures(gdsctools_data("genomic_features_v5.csv.gz"))

        en = GDSCElasticNet(ic50, gf)
        en.enetpath_vs_enet(1047)

    For more information about the input data sets, please see
    :class:`~gdsctools.anova.ANOVA` and :mod:`~gdsctools.readers`.
    """
    def __init__(self, ic50, genomic_features=None, verbose=False,
                 set_media_factor=False):
        """.. rubric:: Constructor

        :param DataFrame IC50: a dataframe with the IC50s. Rows should be
            the COSMIC identifiers and columns should be the drug names
            (or identifiers)
        :param features: another dataframe with rows as in the IC50 matrix
            and columns as features. The first three columns must be named
            specifically to hold tissues and MSI (see the data format
            documentation).
        :param verbose: verbosity in "WARNING", "ERROR", "DEBUG", "INFO"
        """
        super(GDSCElasticNet, self).__init__(ic50, genomic_features,
                                             verbose=verbose)

    def get_model(self, alpha=1, l1_ratio=0.5, tol=1e-3, normalize=False,
                  **kargs):
        return linear_model.ElasticNet(alpha=alpha, l1_ratio=l1_ratio,
                                       normalize=normalize, tol=tol)

    def _get_cv_model(self, l1_ratio=0.5, alphas=None, kfold=None, **kargs):
        return linear_model.ElasticNetCV(l1_ratio=l1_ratio, alphas=alphas,
                                         cv=kfold, **kargs)

    def plot_cindex(self, drug_name, alphas, l1_ratio=0.5, kfolds=10,
                    hold=False):
        """Tune the alpha parameter using the concordance index

        This is rather long and performs the following task. For a set of
        alphas (a list), run the elastic net analysis for the given
        **l1_ratio** with **kfolds** folds. For each alpha, compute the
        CIndex and find the alpha for which the errors are minimal.

        .. warning:: this is a bit long (about 300 seconds for 10 folds
            and 80 alphas) on the GDSCv5 data set.
        """
        from dreamtools.core.cindex import cindex

        CI_train = {}
        CI_test = {}
        for c in range(kfolds):
            CI_train[c] = []
            CI_test[c] = []

        pb = Progress(len(alphas))

        for i, alpha in enumerate(alphas):
            self.fit(drug_name, alpha=alpha, l1_ratio=l1_ratio,
                     kfolds=kfolds)

            # Look at the results and store the CIndex values
            for kf in range(kfolds):
                x_train = self.kfold_data['x_train'][kf].values
                y_train = self.kfold_data['y_train'][kf].values
                x_test = self.kfold_data['x_test'][kf].values
                y_test = self.kfold_data['y_test'][kf].values

                x_train_pred = self.en.predict(x_train)
                x_test_pred = self.en.predict(x_test)

                CI_test[kf].append(1 - cindex(x_test_pred, y_test,
                                              [True] * len(y_test)))
                CI_train[kf].append(1 - cindex(x_train_pred, y_train,
                                               [True] * len(y_train)))
            pb.animate(i)

        mu_train = pd.DataFrame(CI_train).transpose().mean()
        sigma_train = pd.DataFrame(CI_train).transpose().std()

        mu_test = pd.DataFrame(CI_test).transpose().mean()
        sigma_test = pd.DataFrame(CI_test).transpose().std()

        best_alpha = alphas[pd.DataFrame(CI_test).mean(axis=1).argmax()]

        pylab.clf()
        pylab.errorbar(pylab.log(alphas), mu_train, yerr=sigma_train,
                       label="train")
        pylab.errorbar(pylab.log(alphas) + .1, mu_test, yerr=sigma_test,
                       label="test")
        pylab.plot(pylab.log(alphas), mu_train, 'ob')
        pylab.plot(pylab.log(alphas) + .1, mu_test, 'or')
        pylab.legend()
        pylab.axvline(pylab.log(best_alpha), lw=2, color="purple")

        return best_alpha

    def enetpath_vs_enet(self, drug_name, alphas=None, l1_ratio=0.5,
                         nfeat=5, max_iter=1000, tol=1e-4,
                         selection="cyclic", fit_intercept=False):
        """Compare the enet_path and ElasticNet approaches

        .. note:: if X is not scaled, enet_path and ElasticNet give
            slightly different results. X may be scaled using::

                from sklearn import preprocessing
                xscaled = preprocessing.scale(X)
                xscaled = pd.DataFrame(xscaled, columns=X.columns)
        """
        # 1. use enet_path to loop over the alphas and then plot the
        # coefficients along alpha for each feature

        # Get the data for the requested drug
        X, Y = self._get_one_drug_data(drug_name)

        # Run the elastic net for a set of alphas to get the coefficients
        alphas, coeffs, _ = enet_path(X, Y, l1_ratio=l1_ratio, alphas=alphas)

        # Estimate the best alpha for later
        best_alpha = self.runCV(drug_name, verbose=False).alpha

        # coeffs is the final data, sorted by the coefficients obtained
        # with the smallest alpha
        coeffs = pd.DataFrame(coeffs, index=list(X.columns))
        N = len(alphas)
        coeffs.sort_values(by=N - 1, inplace=True)

        results = {"alphas": alphas, "coeffs": coeffs,
                   "best_alpha": best_alpha}

        # the viewer
        self._plot_enet(results)

        return results

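    # Example (sketch): compare the two approaches and retrieve the best
    # alpha found by the internal CV:
    #
    #     results = en.enetpath_vs_enet(1047)
    #     print(results["best_alpha"])
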
    def _plot_enet(self, data, numfig=1, labels_to_show=5, fontsize=10):
        """Plot the enet_path data

        :param data: a dictionary with the "coeffs" dataframe (rows are
            the features, columns the alphas, values the coefficients),
            the "alphas" vector (must match the columns of the coeffs
            dataframe) and the "best_alpha" value
        :param labels_to_show: number of labels to add on the figure
        """
        pylab.figure(numfig)
        pylab.clf()
        for this in data['coeffs'].iterrows():
            pylab.plot(pylab.log(data['alphas']), this[1], color="gray")
        pylab.axvline(pylab.log(data['best_alpha']), color="r", lw=2)
        pylab.xlabel("ln(alpha)")
        pylab.ylabel("Coefficients")
        pylab.grid(True)

        # The data is sorted with the last rows having the largest
        # coefficients and the last column the smallest alpha
        if len(data['coeffs']) < labels_to_show:
            labels_to_show = len(data['coeffs'])

        try:
            # recent python3/pandas versions
            for text, v in data['coeffs'].iloc[-labels_to_show:, -1].items():
                pylab.text(pylab.log(min(data["alphas"])), v, text,
                           fontsize=fontsize)
        except Exception:
            # older python2/pandas versions
            for text, v in data['coeffs'].iloc[-labels_to_show:, -1].iteritems():
                pylab.text(pylab.log(min(data["alphas"])), v, text,
                           fontsize=fontsize)


def barplot(df, colname, color="sign", colors=("b", "r"),
            orientation="vertical", Nmax=50, max_label_length=20,
            fontsize=12):
    assert orientation in ["vertical", "horizontal"]
    if orientation == "vertical":
        kind = "bar"
        figsize = (10, 7)
    else:
        kind = "barh"
        figsize = (8, 10)
    df = df.copy()
    pylab.figure(figsize=figsize)

    df.loc[:, 'sign'] = df[colname] > 0
    df[colname] = abs(df[colname])
    df.index = [this[0:max_label_length] for this in df.index]
    df = df.sort_values(colname, ascending=True)
    df.loc[df['sign'] == True, 'sign'] = 'r'
    df.loc[df['sign'] == False, 'sign'] = 'b'
    colors = "".join(df['sign'])

    if len(df) < Nmax:
        df.plot(y=colname, kind=kind, color=colors, width=1, lw=1,
                title='Importance plot', ax=pylab.gca(),
                fontsize=fontsize, figsize=figsize)
    else:
        df.iloc[-Nmax:].plot(y=colname, kind=kind, color=colors[-Nmax:],
                             width=1, lw=1, title='Importance plot',
                             ax=pylab.gca(), fontsize=fontsize,
                             figsize=figsize)

    import matplotlib.patches as mpatches
    red_patch = mpatches.Patch(color='red', label='positive weights')
    blue_patch = mpatches.Patch(color='blue', label="negative weights")

    ax = pylab.gca()
    if orientation == "horizontal":
        ax.set_position([0.3, 0.1, 0.6, 0.8])
        pylab.legend(handles=[red_patch, blue_patch], loc='lower right')
    elif orientation == "vertical":
        ax.set_position([0.1, 0.3, 0.8, 0.6])
        pylab.legend(handles=[red_patch, blue_patch], loc='upper left')
    pylab.grid()
    pylab.ylabel("Absolute weights", fontsize=fontsize)
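
# A minimal sketch of the barplot helper above (synthetic weights; the
# feature names are made up):
#
#     import pandas as pd
#     df = pd.DataFrame({"weight": [0.5, -0.2, 0.1]},
#                       index=["BRAF_mut", "TP53_mut", "KRAS_mut"])
#     barplot(df, "weight", orientation="horizontal")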