Source code for gdsctools.regression

# -*- python -*-
# -*- coding: utf-8 -*-
#
#  This file is part of GDSCTools software
#
#  Copyright (c) 2015 - Wellcome Trust Sanger Institute
#  All rights reserved
#  Copyright (c) 2016 - Institut Pasteur
#  All rights reserved
#
#  File author(s): Thomas Cokelaer <cokelaer@gmail.com>
#  File author(s): Thomas Cokelaer <thomas.cokelaer@pasteur.fr>
#
#  Distributed under the BSD 3-Clause License.
#  See accompanying file LICENSE.txt distributed with this software
#
#  website: http://github.com/CancerRxGene/gdsctools
#
##############################################################################
"""Look for IC50 vs and genomic features associations using Regression methods"""
import itertools
import warnings

import pandas as pd
import pylab
import numpy as np

from easydev import Progress

from gdsctools.models import BaseModels
from gdsctools.boxswarm import BoxSwarm

from sklearn.linear_model import enet_path
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import linear_model  # use the module rather than individual
                                  # classes so that the various models
                                  # (Ridge, Lasso, ElasticNet, ...) can be
                                  # looked up on the module


__all__ = ["Regression", 'GDSCRidge', "GDSCLasso", "GDSCElasticNet", 
           "RegressionCVResults"]


"""book keeping

from statsmodels.formula.api import OLS

if self.settings.regression_method == 'ElasticNet':
    self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                       alpha=self.settings.regression_alpha,
                       L1_wt=self.settings.regression_L1_wt)
elif self.settings.regression_method == 'OLS':
    self.data_lm = OLS(odof.Y, df.values).fit()
elif self.settings.regression_method == 'Ridge':
    self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                       alpha=self.settings.regression_alpha, L1_wt=0)
elif self.settings.regression_method == 'Lasso':
    self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                       alpha=self.settings.regression_alpha, L1_wt=1)

"""


class RegressionCVResults(object):
    """Simple data structure to hold results of the regression analysis

    - :attr:`model`
    - :attr:`kfold`: number of folds used
    - :attr:`Rp`: Pearson correlation of the fit
    - :attr:`alpha`: best alpha parameter
    - :attr:`ln_alpha`: best alpha parameter (natural log scale)
    """
    def __init__(self, model, Rp, kfold=None):
        self.model = model
        self.Rp = Rp
        self.kfold = kfold

    def _get_alpha(self):
        return self.model.alpha_
    alpha = property(_get_alpha)

    def _get_ln_alpha(self):
        return pylab.log(self.alpha)
    ln_alpha = property(_get_ln_alpha)

    def _get_coefficients(self):
        return self.model.coef_
    coefficients = property(_get_coefficients)

    def __str__(self):
        txt = "Best alpha on %s folds: %s (%.2f in log scale); Rp=%s" % \
              (self.kfold, self.alpha, self.ln_alpha, self.Rp)
        return txt


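# A minimal sketch of how RegressionCVResults is populated (synthetic data;
# in practice, runCV() below builds this object for you):
#
#     import numpy as np
#     from sklearn.linear_model import LassoCV
#     X = np.random.randn(50, 10)
#     y = X[:, 0] + 0.1 * np.random.randn(50)
#     model = LassoCV(cv=5).fit(X, y)
#     res = RegressionCVResults(model, Rp=0.9, kfold=5)
#     print(res)  # reports the best alpha and the Pearson correlation
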
class Regression(BaseModels):
    """Base class for all Regression analyses

    In the :class:`gdsctools.anova.ANOVA` case, the regression is based on
    the OLS method and is computed for a given drug and a given feature
    (:term:`ODOF`). The analysis is then repeated across all features for a
    given drug (:term:`ODAF`) and finally extended to all drugs
    (:term:`ADAF`), so there is one test for each combination of drug and
    feature.

    Here, all features for a given drug are taken together to perform a
    regression analysis (:term:`ODAF`). The regression algorithms
    implemented so far are:

    - Ridge
    - Lasso
    - ElasticNet
    - LassoLars

    They are based on tools from the scikit-learn library.
    """
    def __init__(self, ic50, genomic_features=None, verbose=False):
        """.. rubric:: Constructor

        :param ic50: an IC50 file
        :param genomic_features: a genomic feature file

        See :ref:`data` for help on the input data formats.
        """
        super(Regression, self).__init__(ic50, genomic_features,
            verbose=verbose, set_media_factor=False)
        self.scale = False

    def _get_one_drug_data(self, name, randomize_Y=False):
        """Return X and Y for a given drug, dropping NAs

        :param name: drug name
        :param randomize_Y: shuffle the Y values (used to build NULL models)

        - drops NA
        - drops TISSUE_FACTOR
        - drops MSI_FACTOR
        """
        Y = self.ic50.df[name]
        Y.dropna(inplace=True)
        X = self.features.df.loc[Y.index].copy()
        try:
            X = X.drop('TISSUE_FACTOR', axis=1)
        except Exception:
            pass
        try:
            X = X.drop('MSI_FACTOR', axis=1)
        except Exception:
            pass

        if self.scale is True:
            columns = X.columns
            # the cast to float is essential; otherwise a ValueError is raised
            X = preprocessing.scale(X.astype(float))
            X = pd.DataFrame(X, columns=columns)

        if randomize_Y:
            Y = Y.copy()
            pylab.shuffle(Y.values)
        return X, Y

    def _fit_model(self, drug_name, model):
        """Call the fit method of a model for a given drug name

        The model is fitted on the X, Y data of that drug and returned.
        """
        X, Y = self._get_one_drug_data(drug_name)
        model.fit(X, Y)
        return model

    def plot_importance(self, drug_name, model=None, fontsize=11,
            max_label_length=35, orientation="vertical"):
        """Plot the absolute weights found by a fitted model.

        :param str drug_name:
        :param model: a model (if None, the best model is used)
        :param int fontsize: defaults to 11
        :param max_label_length: 35 by default
        :param orientation: orientation of the plot (vertical or horizontal)
        :return: the dataframe with the weights (may be empty)

        .. note:: if no weights differ from zero, no plot is created.
        """
        X, Y = self._get_one_drug_data(drug_name)
        if model is None:
            model = self.get_best_model(drug_name)
        model.fit(X, Y)

        df = pd.DataFrame({'name': X.columns, 'weight': model.coef_})
        df = df.set_index("name")
        df = df[df['weight'] != 0]

        if len(df):
            barplot(df, "weight", orientation=orientation,
                max_label_length=max_label_length, fontsize=fontsize)
            # sometimes there are only a few weights, all close to zero.
            # Set the ylim to 1 if it is below 0.1.
            if pylab.ylim()[1] < 0.1:
                pylab.ylim([0, 1])
            if len(df) < 5:
                pylab.xlim(-5, 5)
        return df

    def _print(self, txt):
        if self.verbose:
            print(txt)

    def get_best_model(self, drug_name, kfolds=10, alphas=None,
                       l1_ratio=0.5):
        """Return the best model fitted using a cross validation

        :param drug_name:
        :param kfolds:
        :param alphas:
        :param l1_ratio:
        """
        self._print("Running CV to estimate the best alpha.")
        results = self.runCV(drug_name, kfolds=kfolds, alphas=alphas,
                             l1_ratio=l1_ratio)
        best_alpha = results.alpha
        model = self.get_model(alpha=best_alpha)
        self._print("Using alpha=%s." % model.alpha)
        return model

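    # Sketch of the typical flow (assuming `ic` and `gf` are the IC50 and
    # GenomicFeatures instances shown in the GDSCElasticNet docstring, and
    # drug 1047 from the gdsctools test data):
    #
    #     lasso = GDSCLasso(ic, gf)
    #     model = lasso.get_best_model(1047, kfolds=10)
    #     weights = lasso.plot_importance(1047, model=model)
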
    def plot_weight(self, drug_name, model=None, fontsize=12,
            figsize=(10, 7), max_label_length=20, Nmax=40):
        """Plot the elastic net weights

        :param drug_name: the drug identifier
        :param model: a fitted model (if None, the best model is used)
        :param max_label_length: truncate feature names to this length
        :param Nmax: plot at most Nmax weights

        Large alpha values have a more stringent effect on the weights and
        select only some of them (or maybe none). Conversely, setting alpha
        to zero keeps all the weights.

        .. plot::
            :include-source:

            from gdsctools import *
            ic = IC50(gdsctools_data("IC50_v5.csv.gz"))
            gf = GenomicFeatures(gdsctools_data("genomic_features_v5.csv.gz"))
            en = GDSCElasticNet(ic, gf)
            model = en.get_model(alpha=0.01)
            en.plot_weight(1047, model=model)
        """
        X, Y = self._get_one_drug_data(drug_name)
        if model is None:
            model = self.get_best_model(drug_name)
        model.fit(X, Y)

        df = pd.DataFrame({'name': X.columns, 'weight': model.coef_})
        df = df.set_index("name").sort_values("weight")
        df = df[df != 0].dropna()

        # split the data, keeping only the Nmax strongest weights at most
        if len(df) > Nmax:
            # figure out the threshold in absolute value so that we keep
            # the Nmax strongest weights irrespective of their sign
            threshold = df.abs().sort_values(by="weight").values[-Nmax:][0, 0]
            df1 = df.query("weight<=0 and abs(weight)>=@threshold").copy()
            df2 = df.query("weight>=0 and abs(weight)>=@threshold").copy()
        else:
            df1 = df[df.weight < 0].copy()
            df2 = df[df.weight >= 0].copy()

        df1.index = [this[0:max_label_length] for this in df1.index]
        df2.index = [this[0:max_label_length] for this in df2.index]

        # We also want some symmetry (as many red as blue bars) so that the
        # spans of positive and negative weights are equivalent
        N = len(df2) - len(df1)
        if N > 0:  # more red (RHS) than blue (LHS): pad the blue side
            for i in range(1, N + 1):
                label = "_dummy%s" % i
                df1.loc[label, "weight"] = 0
            df1.index = [x if not x.startswith("_dummy") else ""
                         for x in df1.index]
        elif N < 0:  # more blue (LHS) than red (RHS): pad the red side
            for i in range(1, abs(N) + 1):
                label = "_dummy%s" % i
                df2.loc[label, "weight"] = 0
            df2.index = [x if not x.startswith("_dummy") else ""
                         for x in df2.index]
        df2.sort_values(by="weight", ascending=True, inplace=True)

        f, (ax, ax2) = pylab.subplots(1, 2, sharey=True, figsize=figsize)
        ff = pylab.gcf()
        ff.set_facecolor('white')
        self.df1 = df1
        self.df2 = df2
        if len(df1):
            df1.plot(y="weight", kind="bar", width=1, lw=1, ax=ax,
                     color="b", legend=False, fontsize=fontsize,
                     figsize=figsize)
        if len(df2):
            df2.plot(y="weight", kind="bar", width=1, lw=1, ax=ax2,
                     color="r", sharey=True, legend=False,
                     fontsize=fontsize, figsize=figsize)
        if len(df1) == 0 and len(df2) == 0:
            pylab.xlim([-5, 5])
            pylab.ylim([-1, 1])
            pylab.grid(True)
            return df

        # hide the spines between ax and ax2
        ax.spines['right'].set_visible(False)
        ax2.spines['left'].set_visible(False)
        ax.yaxis.tick_left()
        ax2.yaxis.tick_right()
        ax2.tick_params(labelleft='off')

        d = 0.02  # length of the diagonal "cut" marks
        kwargs = dict(transform=ax.transAxes, color='k', clip_on=False)
        ax.plot((1 - d, 1 + d), (-d, +d), **kwargs)
        ax.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs)
        kwargs.update(transform=ax2.transAxes)  # switch to the second axes
        ax2.plot((-d, d), (1 - d, 1 + d), **kwargs)
        ax2.plot((-d, d), (-d, d), **kwargs)

        ax.grid()
        ax2.grid()
        # positions are [x0, y0, width_x, width_y]
        ax.set_position([0.06, 0.3, 0.425, 0.6])
        ax2.set_position([0.50, 0.3, 0.425, 0.6])
        return df

    def _get_rpearson(self, Y_pred, Y_test):
        self.a = Y_pred
        self.b = Y_test
        if Y_pred.std() == 0:
            Rp = 0
        else:
            Rp = np.corrcoef(Y_pred, Y_test)[0, 1]
        if abs(Rp) < 1e-10:
            Rp = 0
        return Rp

    def fit(self, drug_name, alpha=1, l1_ratio=0.5, kfolds=10,
            show=True, tol=1e-3, normalize=False, shuffle=False,
            perturbation=0.01, randomize_Y=False):
        """Run the regression with a cross validation for one value of alpha

        :param drug_name: the drug to analyse
        :param float alpha: note that this alpha parameter corresponds to
            the lambda parameter in the glmnet R package.
        :param float l1_ratio: the lasso penalty parameter. Note that in
            scikit-learn, l1_ratio corresponds to the alpha parameter in
            the glmnet R package. An l1_ratio of 0.5 gives equivalent
            weights to the Lasso and Ridge penalties.
        :param int kfolds: defaults to 10
        :param shuffle: shuffle the indices in the KFold
        :return: the kfold scores, one per fold. The score is the Pearson
            correlation between predictions and test values.

        .. note:: l1_ratio < 0.01 is not reliable unless a sequence of
            alphas is provided.

        .. note:: alpha = 0 corresponds to an OLS analysis
        """
        assert kfolds > 1, "kfolds must be larger than 1"

        # Get the data for the requested drug
        X, Y = self._get_one_drug_data(drug_name, randomize_Y=randomize_Y)

        # Get a model
        en = self.get_model(alpha=alpha, l1_ratio=l1_ratio,
                            normalize=normalize, tol=tol)

        # Create a cross validation set of indices for training and testing
        kfold = model_selection.KFold(kfolds, shuffle=shuffle)

        # Store the results
        scores = []
        count = 1
        if show is True:
            pylab.clf()
        for train_index, test_index in kfold.split(Y):
            # Get the X and Y training and testing data sets
            X_train = X.iloc[train_index]
            X_test = X.iloc[test_index]
            Y_train = Y.iloc[train_index]
            Y_test = Y.iloc[test_index]

            # Fit the model on the training set
            en.fit(X_train, Y_train)

            # Now compare the prediction with Y_test using the Pearson
            # correlation
            Y_pred = en.predict(X_test)
            scores.append(self._get_rpearson(Y_pred, Y_test))
            if show is True:
                # a small perturbation is added so that overlapping points
                # remain visible
                pylab.plot(Y_test, Y_pred + np.append(Y_pred[1:], Y_pred[0])
                           * perturbation, "ob", alpha=0.5)

            if kfolds == 1 and count == 1:
                break
            else:
                count += 1

        if show:
            pylab.title("Prediction on test set (Pearson correlation=%.2f)"
                        % np.mean(scores))
            pylab.xlabel("observed drug response")
            pylab.ylabel("predicted drug response")
            pylab.grid(True)

        self.en = en
        self.X = X
        self.Y = Y

        return scores

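    # Example (sketch): a single ElasticNet fit with a 10-fold CV, using
    # the `en` instance built in the GDSCElasticNet docstring below:
    #
    #     scores = en.fit(1047, alpha=0.01, l1_ratio=0.5, kfolds=10,
    #                     show=False)
    #     print(np.mean(scores))  # mean Pearson correlation across folds
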
    def runCV(self, drug_name, l1_ratio=0.5, alphas=None, kfolds=10,
              verbose=True, shuffle=True, randomize_Y=False, **kargs):
        """Perform a cross validation to get the best alpha parameter

        :return: an instance of :class:`RegressionCVResults` that contains
            the best alpha parameter and the Pearson correlation value.
        """
        # Get the data for the requested drug
        X, Y = self._get_one_drug_data(drug_name, randomize_Y=randomize_Y)

        if kfolds > len(X):
            kfolds = len(X)

        # Create a model
        kfold = model_selection.KFold(kfolds, shuffle=shuffle)
        en = self._get_cv_model(l1_ratio=l1_ratio, alphas=alphas,
                                kfold=kfold, **kargs)

        # Fit the model
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            model = en.fit(X, Y)

        prediction = model.predict(X)
        Rp = self._get_rpearson(prediction, Y)

        res = RegressionCVResults(model, Rp, kfolds)
        if verbose:
            print(res)
        return res

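    # Example (sketch): estimate the best alpha by CV, then build the
    # corresponding model:
    #
    #     res = en.runCV(1047, kfolds=8, verbose=False)
    #     best_model = en.get_model(alpha=res.alpha)
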
    def tune_alpha(self, drug_name, alphas=None, N=80, l1_ratio=0.5,
                   kfolds=10, show=True, shuffle=False,
                   alpha_range=[-2.8, 0.1], randomize_Y=False):
        """Interactive tuning of the model (alpha).

        This is much faster than :meth:`plot_cindex` but much slower than
        :meth:`runCV`.

        .. plot::
            :include-source:

            from gdsctools import *
            ic = IC50(gdsctools_data("IC50_v5.csv.gz"))
            gf = GenomicFeatures(gdsctools_data("genomic_features_v5.csv.gz"))
            en = GDSCElasticNet(ic, gf)
            en.tune_alpha(1047, N=40, l1_ratio=0.1)
        """
        if alphas is None:
            # logspace returns a vector with uniform spacing on a log
            # scale. The default alpha_range of [-2.8, 0.1] means alphas
            # from about 1.6e-3 to 1.26, that is ln(alpha) from about
            # -6.45 to 0.23.
            a1, a2 = alpha_range
            alphas = pylab.logspace(a1, a2, N)

        # Perform a CV across the different alphas
        all_scores = []
        for alpha in alphas:
            scores = self.fit(drug_name, alpha, l1_ratio=l1_ratio,
                              kfolds=kfolds, show=False, shuffle=shuffle,
                              randomize_Y=randomize_Y)
            all_scores.append(scores)

        # We can now plot the mean scores with an error envelope
        df = pd.DataFrame(all_scores)

        # We also identify the best correlation and corresponding alpha
        maximum = df.mean(axis=1).max()
        alpha_best = alphas[df.mean(axis=1).argmax()]

        if show is True:
            mu = df.mean(axis=1)
            sigma = df.var(axis=1)
            pylab.clf()
            pylab.errorbar(pylab.log(alphas), mu, yerr=sigma, color="gray")
            pylab.plot(pylab.log(alphas), mu, 'or')
            pylab.axvline(pylab.log(alpha_best), lw=4, alpha=0.5, color='g')
            pylab.title("Mean scores (Pearson) across alphas for kfold=%s"
                        % kfolds)
            pylab.xlabel("ln(alpha)")
            pylab.ylabel("mean score (Pearson)")
            pylab.grid()

        results = {"alpha_best": alpha_best,
                   "ln_alpha": pylab.log(alpha_best),
                   "maximum_Rp": maximum}
        return results

    def check_randomness(self, drug_name, kfolds=10, N=10,
                         progress=False, nbins=40, show=True, **kargs):
        """Compute the Bayes factor between a NULL model and the best model
        fitted N times

        :param drug_name:
        :param kfolds:
        :param int N: optimise NULL models and real models N times
        :param progress:
        :param nbins:
        :param show:

        The Bayes factor is computed as::

            S = sum([s > r for s, r in zip(scores, random_scores)])
            proba = S / len(scores)
            bayes_factor = 1. / (1 - proba)

        Interpretation of the Bayes factor values, according to Kass and
        Raftery (1995):

        ============================ ==================
        Interpretation               B(1,2)
        ============================ ==================
        Very strong support for 1    < 0.0067
        Strong support for 1         0.0067 to 0.05
        Positive support for 1       0.05 to 0.33
        Weak support for 1           0.33 to 1
        No support for either model  1
        Weak support for 2           1 to 3
        Positive support for 2       3 to 20
        Strong support for 2         20 to 150
        Very strong support for 2    > 150
        ============================ ==================

        References:
        http://www.socsci.uci.edu/~mdlee/LodewyckxEtAl2009.pdf
        http://www.aarondefazio.com/adefazio-bayesfactor-guide.pdf
        """
        scores = []
        pb = Progress(N)
        for i in range(N):
            # Fit a model using CV
            inter_results = self.runCV(drug_name, kfolds=kfolds,
                                       verbose=False, **kargs)
            scores.append(inter_results.Rp)
            if progress:
                pb.animate(i + 1)

        random_scores = []
        pb = Progress(N)
        for i in range(N):
            # Fit a NULL model (shuffled Y) using CV
            inter_results = self.runCV(drug_name, kfolds=kfolds,
                                       randomize_Y=True, verbose=False,
                                       **kargs)
            random_scores.append(inter_results.Rp)
            if progress:
                pb.animate(i + 1)

        from scipy.stats import ttest_ind
        ttest_res = ttest_ind(scores, random_scores)
        results = {"scores": scores,
                   "random_scores": random_scores,
                   "ttest_pval": ttest_res.pvalue}

        # Compute the Bayes factor, avoiding an infinite value when all
        # scores beat the random ones (as communicated by M. Menden).
        S = sum([s > r for s, r in zip(scores, random_scores)])
        proba = S / len(scores)
        if proba == 1:
            # use the maximum resolvable value instead of infinity
            bayes_factor = 1. / (1. / len(scores))
        else:
            bayes_factor = 1. / (1 - proba)
        results['bayes_factor'] = bayes_factor

        M = max(max(scores), max(random_scores)) * 1.2
        m = min(min(scores), min(random_scores)) * 1.2
        if show:
            bins = pylab.linspace(m, M, nbins)
            pylab.clf()
            pylab.hist(scores, bins=bins, color="b", alpha=0.5)
            pylab.hist(random_scores, color="r", alpha=0.5, bins=bins)
            pylab.title("Bayes factor=%(bayes_factor).2f" % results)
            pylab.grid(True)
            pylab.xlabel("Coefficient of correlation Rp")
            pylab.ylabel("#")

        return results

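    # The Bayes factor above reduces to this computation (a sketch with
    # made-up score lists):
    #
    #     scores = [0.5, 0.6, 0.55]
    #     random_scores = [0.1, 0.2, 0.15]
    #     S = sum(s > r for s, r in zip(scores, random_scores))
    #     proba = S / len(scores)  # here: 1.0
    #     bayes_factor = len(scores) if proba == 1 else 1. / (1 - proba)
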
    def dendogram_coefficients(self, stacked=False, show=True,
                               cmap="terrain"):
        """Show the coefficients of the optimised models, one per drug

        This is meant for demonstration and small data sets.
        """
        drugids = self.drugIds
        from easydev import Progress
        pb = Progress(len(drugids))
        d = {}

        for i, drug_name in enumerate(drugids):
            X, Y = self._get_one_drug_data(drug_name, randomize_Y=False)
            results = self.runCV(drug_name, verbose=False)
            df = pd.DataFrame({'name': X.columns,
                               'weight': results.coefficients})
            df = df.set_index("name").sort_values("weight")
            d[drug_name] = df.copy()
            pb.animate(i + 1)

        # use drugids to keep the same order as in the data
        dfall = pd.concat([d[i] for i in drugids], axis=1)
        dfall.columns = drugids

        if show:
            from biokit import heatmap
            h = heatmap.Heatmap(dfall, cmap=cmap)
            h.plot(num=1, colorbar_position="top left")

        if stacked is True:
            dfall = dfall.stack().reset_index()
            dfall.columns = ["feature", "drug", "weight"]
        return dfall

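    # Example (sketch): stacked weights across all drugs of a small data
    # set, using the `lasso` instance from the earlier sketch:
    #
    #     dfall = lasso.dendogram_coefficients(stacked=True, show=False)
    #     dfall.sort_values(by="weight").tail()
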
    def boxplot(self, drug_name, model, n=5, minimum_match_per_combo=10,
                bx_vert=True, bx_alpha=0.5, verbose=False,
                max_label_length=40):
        """Boxplot of the most important features.

        :param int n: maximum number of most important features to show
            (in terms of the coefficient/weight used in the model)
        :param int minimum_match_per_combo: minimum number of alterations
            required for a feature combination to be shown

        ::

            # assuming there is a drug with identifier 29
            r = GDSCLasso()
            model = r.get_best_model(29)
            r.boxplot(29, model)

        In addition, the wild type case (where the total number of
        mutations is zero) is also shown.
        """
        # The data and the provided model fitted with it
        X, Y = self._get_one_drug_data(drug_name)
        fitted_model = self._fit_model(drug_name, model)
        df = pd.DataFrame({'name': X.columns,
                           'weight': fitted_model.coef_})
        df = df.set_index("name")

        weights = df.abs().sort_values("weight")[-n:]
        feature_names = weights.index
        features = self.features.df[feature_names].copy()  # don't change the data

        sorted_names = features.sum(axis=0).sort_values(ascending=True).index
        features = features[sorted_names].transpose()

        # mutually exclusive sorting (disabled for now)
        """for index in range(n):
            new_indices = features.ix[n-index-1].sort_values(
                ascending=False).index
            features = features.ix[:, new_indices]
        # ignore rows where all features are off
        features = features.ix[:, features.sum(axis=0) > 0]
        self._features = features
        self.weights = weights
        pylab.imshow(features.values[::-1], interpolation="None",
                     aspect="auto", cmap="gray_r")
        """

        # barplot of the strongest weights
        barplot(weights, "weight")

        indices = {}
        features = features.transpose()
        features['total'] = features.sum(axis=1)

        # The column names must be valid identifiers to be usable with the
        # pandas query() calls below
        original_columns = features.columns[:]
        columns = [x.replace("-", "_") for x in features.columns]
        columns = [x.replace("(", "_") for x in columns]
        columns = [x.replace(")", "_") for x in columns]
        columns = [x.replace(",", "_") for x in columns]
        columns = [x.replace(".", "_") for x in columns]
        features.columns = columns
        self._features = features

        # loop over all possible feature combinations
        for n_combo in range(1, len(columns) + 1):
            for combo in itertools.combinations(columns, n_combo):
                if "total" in combo:
                    continue
                # using query from pandas, we get the indices for which the
                # n_combo features (and only those) are on (==1)
                query = " and ".join(["%s==1"] * n_combo)
                query += " and total==%s" % n_combo
                inner_indices = features.query(query % combo).index
                if len(inner_indices) >= minimum_match_per_combo:
                    name = ",".join(combo)
                    indices[name] = inner_indices
                    if verbose:
                        print("Found %s with %s events" %
                              (name, len(inner_indices)))

        # Wild type
        inner_indices = features.query("total==0").index
        indices["Wild Type"] = inner_indices

        _X, Y = self._get_one_drug_data(drug_name)
        names = list(indices.keys())
        means = [Y.loc[indices[name]].dropna().mean() for name in names]
        df = pd.DataFrame({"names": names, "mean": means})
        sorted_names_by_mean = df.sort_values(by="mean")['names']

        data = [Y.loc[indices[name]].dropna()
                for name in sorted_names_by_mean]

        # the dropna may leave some sub-data/names empty; filter them out
        sorted_names_by_mean = [name for this, name in
                                zip(data, sorted_names_by_mean) if len(this)]
        data = [this for this in data if len(this)]

        for subdata in data:
            subdata.index.name = subdata.index.name[0:max_label_length]

        if len(data):
            bx = BoxSwarm(data, sorted_names_by_mean)
            if bx_vert is False:
                bx.xlabel = "Drug response"
            else:
                bx.ylabel = "Drug response"
            bx.plot(vert=bx_vert, alpha=bx_alpha, widths=0.5)

        return {'weights': weights, "data": data}


class GDSCRidge(Regression):
    """Same as :class:`Regression`"""
    def __init__(self, ic50, genomic_features=None, verbose=False):
        super(GDSCRidge, self).__init__(ic50, genomic_features,
            verbose=verbose)

    def get_model(self, alpha=1, l1_ratio=None, **kargs):
        return linear_model.Ridge(alpha)

    def _get_cv_model(self, alphas=None, kfold=None, l1_ratio=None, **kargs):
        if alphas is None:
            alphas = (0.1, 1.0, 10.0)
        return linear_model.RidgeCV(alphas=alphas, cv=kfold, **kargs)

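# A minimal usage sketch, assuming the GDSC v5 test files shipped with
# gdsctools and drug identifier 1047:
#
#     from gdsctools import IC50, GenomicFeatures, gdsctools_data
#     ic = IC50(gdsctools_data("IC50_v5.csv.gz"))
#     gf = GenomicFeatures(gdsctools_data("genomic_features_v5.csv.gz"))
#     ridge = GDSCRidge(ic, gf)
#     model = ridge.get_best_model(1047)
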
class GDSCLasso(Regression):
    """See :class:`Regression`"""
    def __init__(self, ic50, genomic_features=None, verbose=False):
        super(GDSCLasso, self).__init__(ic50, genomic_features,
            verbose=verbose)

    def get_model(self, alpha=1, l1_ratio=None, **kargs):
        return linear_model.Lasso(alpha)

    def _get_cv_model(self, alphas=None, kfold=None, l1_ratio=None, **kargs):
        return linear_model.LassoCV(alphas=alphas, cv=kfold, **kargs)

class GDSCLassoLars(Regression):
    """See :class:`Regression`"""
    def __init__(self, ic50, genomic_features=None, verbose=False):
        super(GDSCLassoLars, self).__init__(ic50, genomic_features,
            verbose=verbose)

    def get_model(self, alpha=1, l1_ratio=None, **kargs):
        return linear_model.LassoLars(alpha=alpha)

    def _get_cv_model(self, alphas=None, kfold=None, l1_ratio=None, **kargs):
        return linear_model.LassoLarsCV(cv=kfold, **kargs)

class GDSCElasticNet(Regression):
    """Variant of :class:`ANOVA` that handles associations at the drug
    level using an ElasticNet analysis of the IC50 vs Feature matrices.

    As compared to :class:`GDSCRidge` and :class:`GDSCLasso`, the
    ElasticNet model combines the L1 (Lasso) and L2 (Ridge) penalties.

    Here is an example of how to perform the analysis, which is similar to
    the ANOVA API:

    .. plot::
        :include-source:
        :width: 80%

        from gdsctools import GDSCElasticNet, gdsctools_data, IC50, GenomicFeatures

        ic50 = IC50(gdsctools_data("IC50_v5.csv.gz"))
        gf = GenomicFeatures(gdsctools_data("genomic_features_v5.csv.gz"))

        en = GDSCElasticNet(ic50, gf)
        en.enetpath_vs_enet(1047)

    For more information about the input data sets, please see
    :class:`~gdsctools.anova.ANOVA` and :mod:`~gdsctools.readers`.
    """
    def __init__(self, ic50, genomic_features=None, verbose=False,
                 set_media_factor=False):
        """.. rubric:: Constructor

        :param DataFrame IC50: a dataframe with the IC50s. Rows should be
            the COSMIC identifiers and columns should be the drug names
            (or identifiers)
        :param features: another dataframe with rows as in the IC50 matrix
            and columns as features. The first three columns must be named
            specifically to hold tissues and MSI (see the data format
            documentation).
        :param verbose: verbosity in "WARNING", "ERROR", "DEBUG", "INFO"
        """
        super(GDSCElasticNet, self).__init__(ic50, genomic_features,
                                             verbose=verbose)

    def get_model(self, alpha=1, l1_ratio=0.5, tol=1e-3, normalize=False,
                  **kargs):
        return linear_model.ElasticNet(alpha=alpha, l1_ratio=l1_ratio,
                                       normalize=normalize, tol=tol)

    def _get_cv_model(self, l1_ratio=0.5, alphas=None, kfold=None, **kargs):
        return linear_model.ElasticNetCV(l1_ratio=l1_ratio, alphas=alphas,
                                         cv=kfold, **kargs)

    def plot_cindex(self, drug_name, alphas, l1_ratio=0.5, kfolds=10,
                    hold=False):
        """Tune the alpha parameter using the concordance index

        This is rather long and performs the following task. For a set of
        alphas (a list), run the elastic net analysis for the given
        **l1_ratio** with **kfolds** folds. For each alpha, compute the
        CIndex and find the alpha for which the errors are minimal.

        .. warning:: this is a bit long (about 300 seconds for 10 folds
            and 80 alphas) on the GDSCv5 data set.
        """
        from dreamtools.core.cindex import cindex

        CI_train = {}
        CI_test = {}
        for c in range(kfolds):
            CI_train[c] = []
            CI_test[c] = []

        pb = Progress(len(alphas))

        for i, alpha in enumerate(alphas):
            self.fit(drug_name, alpha=alpha, l1_ratio=l1_ratio,
                     kfolds=kfolds)

            # Look at the results and store the CIndex values
            for kf in range(kfolds):
                x_train = self.kfold_data['x_train'][kf].values
                y_train = self.kfold_data['y_train'][kf].values
                x_test = self.kfold_data['x_test'][kf].values
                y_test = self.kfold_data['y_test'][kf].values

                x_train_pred = self.en.predict(x_train)
                x_test_pred = self.en.predict(x_test)

                CI_test[kf].append(1 - cindex(x_test_pred, y_test,
                                              [True] * len(y_test)))
                CI_train[kf].append(1 - cindex(x_train_pred, y_train,
                                               [True] * len(y_train)))
            pb.animate(i)

        mu_train = pd.DataFrame(CI_train).transpose().mean()
        sigma_train = pd.DataFrame(CI_train).transpose().std()

        mu_test = pd.DataFrame(CI_test).transpose().mean()
        sigma_test = pd.DataFrame(CI_test).transpose().std()

        best_alpha = alphas[pd.DataFrame(CI_test).mean(axis=1).argmax()]

        pylab.clf()
        pylab.errorbar(pylab.log(alphas), mu_train, yerr=sigma_train,
                       label="train")
        pylab.errorbar(pylab.log(alphas) + .1, mu_test, yerr=sigma_test,
                       label="test")
        pylab.plot(pylab.log(alphas), mu_train, 'ob')
        pylab.plot(pylab.log(alphas) + .1, mu_test, 'or')
        pylab.legend()
        pylab.axvline(pylab.log(best_alpha), lw=2, color="purple")

        return best_alpha

    def enetpath_vs_enet(self, drug_name, alphas=None, l1_ratio=0.5,
                         nfeat=5, max_iter=1000, tol=1e-4,
                         selection="cyclic", fit_intercept=False):
        """Compare the enet_path and ElasticNet approaches

        .. note:: if X is not scaled, enet_path and ElasticNet give
            slightly different results. X may be scaled using::

                from sklearn import preprocessing
                xscaled = preprocessing.scale(X)
                xscaled = pd.DataFrame(xscaled, columns=X.columns)
        """
        # 1. use enet_path to loop over the alphas and then plot the
        # coefficients along alpha for each feature

        # Get the data for the requested drug
        X, Y = self._get_one_drug_data(drug_name)

        # Run the elastic net for a set of alphas to get the coefficients
        alphas, coeffs, _ = enet_path(X, Y, l1_ratio=l1_ratio, alphas=alphas)

        # Estimate the best alpha for later
        best_alpha = self.runCV(drug_name, verbose=False).alpha

        # coeffs is the final data, sorted by the coefficients obtained
        # with the smallest alpha
        coeffs = pd.DataFrame(coeffs, index=list(X.columns))
        N = len(alphas)
        coeffs.sort_values(by=N - 1, inplace=True)

        results = {"alphas": alphas, "coeffs": coeffs,
                   "best_alpha": best_alpha}

        # the viewer
        self._plot_enet(results)

        return results

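    # Example (sketch): compare the two approaches and retrieve the best
    # alpha found by the internal CV:
    #
    #     results = en.enetpath_vs_enet(1047)
    #     print(results["best_alpha"])
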
    def _plot_enet(self, data, numfig=1, labels_to_show=5, fontsize=10):
        """Plot the enet_path data

        :param data: a dictionary with the "coeffs" dataframe (rows are
            the features, columns the alphas, values the coefficients),
            the "alphas" vector (must match the columns of the coeffs
            dataframe) and the "best_alpha" value
        :param labels_to_show: number of labels to add on the figure
        """
        pylab.figure(numfig)
        pylab.clf()
        for this in data['coeffs'].iterrows():
            pylab.plot(pylab.log(data['alphas']), this[1], color="gray")
        pylab.axvline(pylab.log(data['best_alpha']), color="r", lw=2)
        pylab.xlabel("ln(alpha)")
        pylab.ylabel("Coefficients")
        pylab.grid(True)

        # The data is sorted with the last rows having the largest
        # coefficients and the last column the smallest alpha
        if len(data['coeffs']) < labels_to_show:
            labels_to_show = len(data['coeffs'])

        try:
            # recent python3/pandas versions
            for text, v in data['coeffs'].iloc[-labels_to_show:, -1].items():
                pylab.text(pylab.log(min(data["alphas"])), v, text,
                           fontsize=fontsize)
        except Exception:
            # older python2/pandas versions
            for text, v in data['coeffs'].iloc[-labels_to_show:, -1].iteritems():
                pylab.text(pylab.log(min(data["alphas"])), v, text,
                           fontsize=fontsize)


def barplot(df, colname, color="sign", colors=("b", "r"),
            orientation="vertical", Nmax=50, max_label_length=20,
            fontsize=12):
    assert orientation in ["vertical", "horizontal"]
    if orientation == "vertical":
        kind = "bar"
        figsize = (10, 7)
    else:
        kind = "barh"
        figsize = (8, 10)
    df = df.copy()
    pylab.figure(figsize=figsize)

    df.loc[:, 'sign'] = df[colname] > 0
    df[colname] = abs(df[colname])
    df.index = [this[0:max_label_length] for this in df.index]
    df = df.sort_values(colname, ascending=True)
    df.loc[df['sign'] == True, 'sign'] = 'r'
    df.loc[df['sign'] == False, 'sign'] = 'b'
    colors = "".join(df['sign'])

    if len(df) < Nmax:
        df.plot(y=colname, kind=kind, color=colors, width=1, lw=1,
                title='Importance plot', ax=pylab.gca(),
                fontsize=fontsize, figsize=figsize)
    else:
        df.iloc[-Nmax:].plot(y=colname, kind=kind, color=colors[-Nmax:],
                             width=1, lw=1, title='Importance plot',
                             ax=pylab.gca(), fontsize=fontsize,
                             figsize=figsize)

    import matplotlib.patches as mpatches
    red_patch = mpatches.Patch(color='red', label='positive weights')
    blue_patch = mpatches.Patch(color='blue', label="negative weights")

    ax = pylab.gca()
    if orientation == "horizontal":
        ax.set_position([0.3, 0.1, 0.6, 0.8])
        pylab.legend(handles=[red_patch, blue_patch], loc='lower right')
    elif orientation == "vertical":
        ax.set_position([0.1, 0.3, 0.8, 0.6])
        pylab.legend(handles=[red_patch, blue_patch], loc='upper left')
    pylab.grid()
    pylab.ylabel("Absolute weights", fontsize=fontsize)
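
# A minimal sketch of the barplot helper above (synthetic weights; the
# feature names are made up):
#
#     import pandas as pd
#     df = pd.DataFrame({"weight": [0.5, -0.2, 0.1]},
#                       index=["BRAF_mut", "TP53_mut", "KRAS_mut"])
#     barplot(df, "weight", orientation="horizontal")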