Source code for gdsctools.omnibem

# -*- python -*-
# -*- coding utf-8 -*-
#
#  This file is part of GDSCTools software
#
#  Copyright (c) 2016 - Wellcome Trust Sanger Institute
#  All rights reserved
#
#  File author(s): GDSCTools authors
#
#  Distributed under the BSD 3-Clause License.
#  See accompanying file LICENSE.txt distributed with this software
#
#  website: http://github.com/CancerRxGene/gdsctools
#
##############################################################################
"""OmniBEM functions"""
from gdsctools import Reader, GenomicFeatures

import pandas as pd
import pylab


__all__ = ["OmniBEMBuilder"]


[docs]class OmniBEMBuilder(object):
    """Utility to create :class:`~gdsctools.readers.GenomicFeatures` instance from BEM data

    Starting from an example provided in GDSCTools
    (test_omnibem_genomic_alteration.csv.gz), here is the code to get a data
    structure compatible with the :class:`GenomicFeature`, which can then be
    used as input to the :class:`~gdsctools.anova.ANOVA` class.

    See the constructor for the header format.

    .. plot::
        :include-source:

        from gdsctools import gdsctools_data, OmniBEMBuilder, GenomicFeatures

        input_data = gdsctools_data("test_omnibem_genomic_alterations.csv.gz")
        bem = OmniBEMBuilder(input_data)

        # You may filter the data for instance to keep only a set of genes.
        # Here, we keep everything
        gene_list = bem.df.GENE.unique()
        bem.filter_by_gene_list(gene_list)

        # Finally, create a MoBEM dataframe
        mobem = bem.get_mobem()

        # You may filter with other functions(e.g., to keep only Methylation
        # features
        bem.filter_by_type_list(["Methylation"])
        mobem = bem.get_mobem()

        # Then, let us create a dataframe that is compatible with
        # GenomicFeature. We just need to make sure the columns are correct
        mobem[[x for x in mobem.columns if x!="SAMPLE"]]
        gf = GenomicFeatures(mobem[[x for x in mobem.columns if x!="SAMPLE"]])


        # Now, we can perform an ANOVA analysis:
        from gdsctools import ANOVA, ic50_test
        an = ANOVA(ic50_test, gf)
        results = an.anova_all()
        results.volcano()

    .. note:: The underlying data is stored in the attribute :attr:`df`.

    """
    def __init__(self, genomic_alteration):
        """.. rubric:: Constructor

        :param str genomic_alteration: a filename in CSV format (gzipped or
            not). The format is explained here below.

        The input must be a 5-columns CSV file with the following columns::

            COSMIC_ID: an integer
            TISSUE_TYPE: e.g. Methylation,
            SAMPLE: this should correspond to the COSMID_ID
            TYPE: Methylation,
            GENE: gene name
            IDENTIFIER: required for now but may be removed (rows can be
                used as identifier indeed

        .. warning:: If GENE is set to NA, we drop it. In the resources shown in
            the example here above, this corresponds to 380 rows (out of 60703).
            Similarly, if TISSUE is NA, we also drop these rows that is about
            3000 rows.
        """
        self.df = Reader(genomic_alteration).df

        # We drop genes and tissues that are set to NA
        self.df.dropna(inplace=True)

        # Some column names are changed for convenience
        self.df.rename(columns={"COSMIC.ID": "COSMIC_ID",
            "TISSUE.TYPE": "TISSUE_TYPE"}, inplace=True)
        self._update_unified()

    def __len__(self):
        return len(self.df)

    def _update_unified(self):
        """# Sort genes by number of times they have an alteration
        bem.unified.sort_values(by="IDENTIFIER")

        # top genes represented multiple times i
        bem.unified.sort_values(by="IDENTIFIER", ascending=False).head(20)
        """
        # In R this is an aggregate function. In Pandas a groupby + aggregate
        # http://pandas.pydata.org/pandas-docs/stable/comparison_with_r.html
        groups = self.df.groupby(by=["COSMIC_ID", "GENE", "SAMPLE",
            "TISSUE_TYPE"], as_index=False)
        self.unified = groups[['IDENTIFIER']].aggregate(len)

        # Building a summary on cell lines
        unique = self.unified[self.unified['COSMIC_ID'].duplicated() == False]
        self.frequency = unique.groupby("TISSUE_TYPE")["TISSUE_TYPE"].count()
        self.total = self.unified.groupby("TISSUE_TYPE")["TISSUE_TYPE"].count()

        df = pd.concat([self.total, self.frequency, self.total/self.frequency], axis=1)
        df.columns = ['total', 'grouped', 'fraction']
        self.summary = df

[docs]    def get_mobem(self):
        """Return a dataframe compatible with ANOVA analysis

        The returned object can be read
        by :class:`~gdsctools.readers.GenomicFeatures`.

        """
        # Select gene that appear at least a minimum number of times
        #agg = self.unified.groupby("GENE")["GENE"].count()
        #self.selection = agg[agg>=minimum_gene]

        # keep only gene in the selection
        #df = self.unified.query("GENE in @self.selection.index")
        df = self.unified
        this = pd.crosstab(df['GENE'], columns=[
            df["COSMIC_ID"], df['TISSUE_TYPE'], df["SAMPLE"]])
        this = this.T
        this = this.reset_index()

        if "TISSUE_TYPE" in this.columns:
            this.rename(columns={"TISSUE_TYPE":"TISSUE_FACTOR"}, inplace=True)
        else:
            print("Expected TISSUE_TYPE column. Not found.")

        return this

[docs]    def get_significant_genes(self, N=20):
        """Return most present genes

        :param int N: the maximum number of genes to return
        :return: list of genes with the number of occurences

        The genes returned by be used to filter the data::

            genes = bem.get_significant_genes(N=20)
            bem.filter_by_gene_list(genes.index)

        """
        df = self.unified.groupby("GENE")["GENE"]
        return df.count().sort_values(ascending=False).head(N)

[docs]    def filter_by_gene_list(self, genes, minimum_gene=3):
        """Keeps only required genes

        :param genes: a list of genes or a filename (a CSV file with a column
            named GENE).
        :param minimum_gene: genes with not enough entries are removed
            (defaults to 3)

        """
        if isinstance(genes, str):
            df = pd.read_csv(genes)
            genes = df.GENE.values.tolist()
        self.df = self.df.query("GENE in @genes")
        # Update unified matrix
        agg = self.unified.groupby("GENE")["GENE"].count()
        self.selection = agg[agg>=minimum_gene]

        # keep only gene in the selection
        self.unified = self.unified.query("GENE in @self.selection.index")
        self._update_unified()

[docs]    def filter_by_tissue_list(self, tissue_list):
        """Filter the data by tissue

        :param list tissue: the tissues to be kept. The data is update
            in place.

        """
        self.df = self.df.query("TISSUE_TYPE in @tissue_list")
        self._update_unified()

[docs]    def filter_by_type_list(self, type_list):
        """Filter the data by type

        :param list tissue: the types to be kept. The data is update
            in place. Here are some examples of types: Point.mutation,
            Amplification, Deletion, Methylation.

        """
        self.df = self.df.query("TYPE in @type_list")
        self._update_unified()

[docs]    def filter_by_sample_list(self, sample_list):
        """Filter the data by sample name

        :param list tissue: the samples to be kept. The data is updated
            in place.

        """
        self.df = self.df.query("SAMPLE in @sample_list")
        self._update_unified()

[docs]    def filter_by_cosmic_list(self, cosmic_list):
        """Filter the data by cosmic identifers

        :param list cosmic: the cosmic identifiers to be kept. The data is
            updated in place.

        """
        self.df = self.df.query("COSMIC_ID in @cosmic_list")
        self._update_unified()

[docs]    def plot_number_alteration_by_tissue(self, fontsize=10, width=0.9):
        """Plot number of alterations

        .. plot::
            :width: 100%
            :include-source:

            from gdsctools import *
            data = gdsctools_data("test_omnibem_genomic_alterations.csv.gz")
            bem = OmniBEMBuilder(data)
            bem.filter_by_gene_list(gdsctools_data("test_omnibem_genes.txt"))
            bem.plot_number_alteration_by_tissue()

        """
        count = self.unified.groupby(['TISSUE_TYPE'])['GENE'].count()
        try:
            count.sort_values(inplace=True, ascending=False)
        except:
            count.sort(inplace=True, ascending=False)
        count.plot(kind="bar", width=width)
        pylab.grid()
        pylab.xlabel("Tissue Type", fontsize=fontsize)
        pylab.ylabel("Total number of alterations in cell lines",
                     fontsize=fontsize)
        try:pylab.tight_layout()
        except:pass

[docs]    def plot_alterations_per_cellline(self, fontsize=10, width=0.9):
        """Plot number of alterations

        .. plot::
            :width: 100%
            :include-source:

            from gdsctools import *
            data = gdsctools_data("test_omnibem_genomic_alterations.csv.gz")
            bem = OmniBEMBuilder(data)
            bem.filter_by_gene_list(gdsctools_data("test_omnibem_genes.txt"))
            bem.plot_alterations_per_cellline()
        """
        try:
            df = self.summary.sort_values("fraction", ascending=False)
        except:
            df = self.summary.sort("fraction", ascending=False)
        df.plot(y="fraction", legend=False, kind='bar', fontsize=fontsize, 
            width=width)
        pylab.ylabel("Alterations per cell line",
                     fontsize=fontsize)
        pylab.grid()
        try:pylab.tight_layout()
        except:pass

[docs]    def get_genomic_features(self):
        mobem = self.get_mobem()
        gf = GenomicFeatures(mobem[[x for x in mobem.columns if x!="SAMPLE"]])
        return gf