Source code for gdsctools.omnibem

# -*- python -*-
# -*- coding utf-8 -*-
#
#  This file is part of GDSCTools software
#
#  Copyright (c) 2016 - Wellcome Trust Sanger Institute
#  All rights reserved
#
#  File author(s): GDSCTools authors
#
#  Distributed under the BSD 3-Clause License.
#  See accompanying file LICENSE.txt distributed with this software
#
#  website: http://github.com/CancerRxGene/gdsctools
#
##############################################################################
"""OmniBEM functions"""
from gdsctools import Reader, GenomicFeatures

import pandas as pd
import pylab


__all__ = ["OmniBEMBuilder"]


[docs]class OmniBEMBuilder(object): """Utility to create :class:`~gdsctools.readers.GenomicFeatures` instance from BEM data Starting from an example provided in GDSCTools (test_omnibem_genomic_alteration.csv.gz), here is the code to get a data structure compatible with the :class:`GenomicFeature`, which can then be used as input to the :class:`~gdsctools.anova.ANOVA` class. See the constructor for the header format. .. plot:: :include-source: from gdsctools import gdsctools_data, OmniBEMBuilder, GenomicFeatures input_data = gdsctools_data("test_omnibem_genomic_alterations.csv.gz") bem = OmniBEMBuilder(input_data) # You may filter the data for instance to keep only a set of genes. # Here, we keep everything gene_list = bem.df.GENE.unique() bem.filter_by_gene_list(gene_list) # Finally, create a MoBEM dataframe mobem = bem.get_mobem() # You may filter with other functions(e.g., to keep only Methylation # features bem.filter_by_type_list(["Methylation"]) mobem = bem.get_mobem() # Then, let us create a dataframe that is compatible with # GenomicFeature. We just need to make sure the columns are correct mobem[[x for x in mobem.columns if x!="SAMPLE"]] gf = GenomicFeatures(mobem[[x for x in mobem.columns if x!="SAMPLE"]]) # Now, we can perform an ANOVA analysis: from gdsctools import ANOVA, ic50_test an = ANOVA(ic50_test, gf) results = an.anova_all() results.volcano() .. note:: The underlying data is stored in the attribute :attr:`df`. """ def __init__(self, genomic_alteration): """.. rubric:: Constructor :param str genomic_alteration: a filename in CSV format (gzipped or not). The format is explained here below. The input must be a 5-columns CSV file with the following columns:: COSMIC_ID: an integer TISSUE_TYPE: e.g. Methylation, SAMPLE: this should correspond to the COSMID_ID TYPE: Methylation, GENE: gene name IDENTIFIER: required for now but may be removed (rows can be used as identifier indeed .. warning:: If GENE is set to NA, we drop it. In the resources shown in the example here above, this corresponds to 380 rows (out of 60703). Similarly, if TISSUE is NA, we also drop these rows that is about 3000 rows. """ self.df = Reader(genomic_alteration).df # We drop genes and tissues that are set to NA self.df.dropna(inplace=True) # Some column names are changed for convenience self.df.rename(columns={"COSMIC.ID": "COSMIC_ID", "TISSUE.TYPE": "TISSUE_TYPE"}, inplace=True) self._update_unified() def __len__(self): return len(self.df) def _update_unified(self): """# Sort genes by number of times they have an alteration bem.unified.sort_values(by="IDENTIFIER") # top genes represented multiple times i bem.unified.sort_values(by="IDENTIFIER", ascending=False).head(20) """ # In R this is an aggregate function. In Pandas a groupby + aggregate # http://pandas.pydata.org/pandas-docs/stable/comparison_with_r.html groups = self.df.groupby(by=["COSMIC_ID", "GENE", "SAMPLE", "TISSUE_TYPE"], as_index=False) self.unified = groups[['IDENTIFIER']].aggregate(len) # Building a summary on cell lines unique = self.unified[self.unified['COSMIC_ID'].duplicated() == False] self.frequency = unique.groupby("TISSUE_TYPE")["TISSUE_TYPE"].count() self.total = self.unified.groupby("TISSUE_TYPE")["TISSUE_TYPE"].count() df = pd.concat([self.total, self.frequency, self.total/self.frequency], axis=1) df.columns = ['total', 'grouped', 'fraction'] self.summary = df
[docs] def get_mobem(self): """Return a dataframe compatible with ANOVA analysis The returned object can be read by :class:`~gdsctools.readers.GenomicFeatures`. """ # Select gene that appear at least a minimum number of times #agg = self.unified.groupby("GENE")["GENE"].count() #self.selection = agg[agg>=minimum_gene] # keep only gene in the selection #df = self.unified.query("GENE in @self.selection.index") df = self.unified this = pd.crosstab(df['GENE'], columns=[ df["COSMIC_ID"], df['TISSUE_TYPE'], df["SAMPLE"]]) this = this.T this = this.reset_index() if "TISSUE_TYPE" in this.columns: this.rename(columns={"TISSUE_TYPE":"TISSUE_FACTOR"}, inplace=True) else: print("Expected TISSUE_TYPE column. Not found.") return this
[docs] def get_significant_genes(self, N=20): """Return most present genes :param int N: the maximum number of genes to return :return: list of genes with the number of occurences The genes returned by be used to filter the data:: genes = bem.get_significant_genes(N=20) bem.filter_by_gene_list(genes.index) """ df = self.unified.groupby("GENE")["GENE"] return df.count().sort_values(ascending=False).head(N)
[docs] def filter_by_gene_list(self, genes, minimum_gene=3): """Keeps only required genes :param genes: a list of genes or a filename (a CSV file with a column named GENE). :param minimum_gene: genes with not enough entries are removed (defaults to 3) """ if isinstance(genes, str): df = pd.read_csv(genes) genes = df.GENE.values.tolist() self.df = self.df.query("GENE in @genes") # Update unified matrix agg = self.unified.groupby("GENE")["GENE"].count() self.selection = agg[agg>=minimum_gene] # keep only gene in the selection self.unified = self.unified.query("GENE in @self.selection.index") self._update_unified()
[docs] def filter_by_tissue_list(self, tissue_list): """Filter the data by tissue :param list tissue: the tissues to be kept. The data is update in place. """ self.df = self.df.query("TISSUE_TYPE in @tissue_list") self._update_unified()
[docs] def filter_by_type_list(self, type_list): """Filter the data by type :param list tissue: the types to be kept. The data is update in place. Here are some examples of types: Point.mutation, Amplification, Deletion, Methylation. """ self.df = self.df.query("TYPE in @type_list") self._update_unified()
[docs] def filter_by_sample_list(self, sample_list): """Filter the data by sample name :param list tissue: the samples to be kept. The data is updated in place. """ self.df = self.df.query("SAMPLE in @sample_list") self._update_unified()
[docs] def filter_by_cosmic_list(self, cosmic_list): """Filter the data by cosmic identifers :param list cosmic: the cosmic identifiers to be kept. The data is updated in place. """ self.df = self.df.query("COSMIC_ID in @cosmic_list") self._update_unified()
[docs] def plot_number_alteration_by_tissue(self, fontsize=10, width=0.9): """Plot number of alterations .. plot:: :width: 100% :include-source: from gdsctools import * data = gdsctools_data("test_omnibem_genomic_alterations.csv.gz") bem = OmniBEMBuilder(data) bem.filter_by_gene_list(gdsctools_data("test_omnibem_genes.txt")) bem.plot_number_alteration_by_tissue() """ count = self.unified.groupby(['TISSUE_TYPE'])['GENE'].count() try: count.sort_values(inplace=True, ascending=False) except: count.sort(inplace=True, ascending=False) count.plot(kind="bar", width=width) pylab.grid() pylab.xlabel("Tissue Type", fontsize=fontsize) pylab.ylabel("Total number of alterations in cell lines", fontsize=fontsize) try:pylab.tight_layout() except:pass
[docs] def plot_alterations_per_cellline(self, fontsize=10, width=0.9): """Plot number of alterations .. plot:: :width: 100% :include-source: from gdsctools import * data = gdsctools_data("test_omnibem_genomic_alterations.csv.gz") bem = OmniBEMBuilder(data) bem.filter_by_gene_list(gdsctools_data("test_omnibem_genes.txt")) bem.plot_alterations_per_cellline() """ try: df = self.summary.sort_values("fraction", ascending=False) except: df = self.summary.sort("fraction", ascending=False) df.plot(y="fraction", legend=False, kind='bar', fontsize=fontsize, width=width) pylab.ylabel("Alterations per cell line", fontsize=fontsize) pylab.grid() try:pylab.tight_layout() except:pass
[docs] def get_genomic_features(self): mobem = self.get_mobem() gf = GenomicFeatures(mobem[[x for x in mobem.columns if x!="SAMPLE"]]) return gf