Source code for gdsctools.gdsc1000

"""This module provides an  interface to download and filter GDSC1000 data, 
as published by Iorio et al (2016).

Author: Elisabeth D. Chen
Date: 2016-06-17

"""
import os
import csv

try:     #python 3
    from urllib.request import urlretrieve
except:
    from urllib import urlretrieve

from gdsctools.default_sets import DefaultDictionaries

import colorlog as logger
import pandas as pd


[docs]class GDSC1000(object ):
    """Help data retrieval from GDSC website (version v17)

    ::

        from gdsctools import GDSC1000
        data = GDSC1000()
        data.download_data()

    Next time, starting GDSCTools in the same directory, just type::

        data = GDSCTools()
        data.load_data()

    CNA (e.g., :attr:`cna_df`), Methylation and gene variants are downloaded. 
    IC50 as well and a set of annotations. There are combined in
    :attr:`genomic_df`.

    The :attr:`genomic_df` contains the :attr:`cna_df`, :attr:`methylation_df`
    and :attr:`variant_df` data. The three latter contains in common the
    CELL_LINE, ALTERATION_TYPE, IDENTIFIER  and TISSUE_FACTOR. 

    ALTERATION_TYPE can be GENETIC_VARIATION, METHYLATION, DELETION / AMPLIFICATION 
    variant_df also has the COSMIC_ID. The :attr:`annotation_df` gives mapping
    between identifiers and gene names.

    .. autosummary::

        download_data
        load_data
        make_matrix


    With this class, you may (1) download the data, (2) annotate or filter the
    data and (3) create a genomic matrix.

    To load the data, either download it from the website using
    :meth:`download_data` (downloads data from cancerrxgene.org loading methylation, cna, 
    variant and cell lines datasets). It extract relevant columns, re-names column names 
    and saves as csv files in the :attr:`data_folder_name` directory.

    If you have already downloaded the data, you may just load it using
    :meth:`load_data`. This function also annotates the data with gene
    information.

    Then, you filter the data with one of the filter methods::

        filter_by_cell_line([ "AsPC-1", "U-2-OS", "MDA-MB-231"... ] )
        filter_by_cosmic_id([ 948121, 2839818, ...] )
        filter_by_gene(["ATM", "TP53", ...])
        filter_by_recurrence(min_recurrence = 3 )
        filter_by_tissue_type([ "BRCA", "COAD/READ", ... ] )
        filter_by_alteration_type(["METHYLATION", "AMPLIFICATION",
                                        "DELETION", "GENE_VARIANT" ] )

    Finally, you can create the genomic matrix using :meth:`make_matrix`. 

    You can also look at the unique regions for agiven data set. For
    instance, methylation dataframe has about 2,000 regions but only 338 are
    unique::

        len(data.methylation_df.groupby('IDENTIFIER').groups)


    """
    def __init__(self, annotate=False, data_folder_name="./data/gdsc_1000_data/" ):
        """.. rubric:: constructor

        :param bool annotate:
        :param str data_folder_name:

        """
        self._save_csv = True
        self.annotate = annotate
        self.recurrence = 3
        self._data_folder_name = data_folder_name
        self._url_base = "http://www.cancerrxgene.org/gdsc1000/GDSC1000_WebResources/Data/suppData/"

        self._mapper = {"methylation": ["TableS2J.xlsx", "methylation.xlsx"],
                        "cna": ["TableS2G.xlsx", "cna.xlsx"],
                        "variant": ["TableS2C.xlsx", "variant.xlsx"],
                        "cell_line": ["TableS1E.xlsx", "cell_line_dict.xlsx"],
                        "methylation_annotation": ["TableS2H.xlsx", "methylation_annotation.xlsx"],
                        "cna_annotation": ["TableS2D.xlsx", "cna_annotation.xlsx"],
                        "ic50": ["TableS4A.xlsx", "ic50.xlsx"],
                        }

        self.defaults = DefaultDictionaries()
        self.core_genes = self.defaults.gene_dict['Core Genes']

    def __str__(self):
        txt = "Input:"
        try:
            txt = "  Cell lines: {}\n".format(len(self.cell_line_df))
        except:
            return "You must call download_data() or load_data()"
        txt += "  CNA features: {}\n".format(len(self.cna_df))
        txt += "  Methylation features: {}\n".format(len(self.methylation_df))
        txt += "  Variant: {}\n".format(len(self.variant_df))
        total = len(self.cna_df) + len(self.methylation_df) + len(self.variant_df)
        txt += "  Total: {}\n\n".format(total)
        txt += "Genomic Features :"
        import collections
        txt += str(collections.Counter(self.genomic_df.ALTERATION_TYPE))
        return txt

    def _get_path(self, key):
        return self._data_folder_name + self._mapper[key][1]

[docs]    def download_data(self):
        """Download and load data in memory"""
        self._download_raw()
        self._convert_raw_gdsc1000_data()

    def _convert_raw_gdsc1000_data(self):
        """Converts raw Excel document into CSV"""
        self._read_cell_line_data()
        self._read_cna_data()
        self._read_methylation_data()
        self._read_variant_data()
        self._read_ic50_data()
        self._read_annotation()
        self._combine_genomic_annotation()

    def _to_csv(self, df, filename, quoting=None):
        if self._save_csv:
            logger.info("Saving %s" % filename)
            df.to_csv(self._data_folder_name + filename, index=False)

    def _read_csv(self, filename):
        return pd.read_csv(self._data_folder_name + filename)

[docs]    def load_data(self, annotation=True):
        """If CSV files are already downloaded, just load them"""
        self.variant_df = self._read_csv("variant.csv")
        self.methylation_df = self._read_csv("methylation.csv")
        self.cna_df = self._read_csv("cna.csv")
        self.cell_line_df = self._read_csv("cell_line_dict.csv")
        self.ic50_df = self._read_csv("ic50_full_matrix.csv")

        self.drug_decoder_df = self._read_csv("drug_decoder.csv")
        self.genomic_df = self._read_csv("genomic.csv")
        self.annotation_df = self._read_csv("annotation.csv")

        self.backup_genomic_df = self.genomic_df
        self.backup_ic50_df = self.ic50_df
        self.backup_drug_decoder_df = self.drug_decoder_df

        if annotation:
            self.annotate_all()

    def _urlretrieve(self, filename, target):
        urlretrieve(
            self._url_base + filename,
            self._data_folder_name + target)

    def _download_raw(self):

        if os.path.exists(self._data_folder_name):
            pass
        else:
            os.makedirs(self._data_folder_name)

        # download all data sets
        for key, value in self._mapper.items():
            logger.info('Downloading %s ' % key)
            self._urlretrieve(*value)

    def _read_cna_data(self):
        logger.info("Processing CNA data.")
        # This block reads in and formats CNA data
        xls = pd.ExcelFile(self._get_path('cna'))
        df = xls.parse(header=2)

        # in version from feb 2017, there are 4 extra blank columns to be
        # drop
        dummies = ["dum1", "dum2", "dum3", "dum4"]
        df.columns = ["BLANK", "CELL_LINE", "TISSUE_ID1", "TISSUE_ID2",
                      "TISSUE_FACTOR", "ALTERATION_TYPE", "IDENTIFIER" ] + dummies

        df = df.drop(['BLANK', 'TISSUE_ID1', 'TISSUE_ID2'] + dummies, axis = 1)
        df.IDENTIFIER = df.IDENTIFIER.str.split(' ').str[0]
        df.ALTERATION_TYPE = df.ALTERATION_TYPE.str.upper()

        self._to_csv(df, 'cna.csv')
        self.cna_df = df

    def _read_methylation_data(self):
        logger.info("Processing methylation data.")
        # This block deals with the methylation data
        xls = pd.ExcelFile(self._get_path('methylation'))
        df = xls.parse(header=2)
        df.columns = ["BLANK", "CELL_LINE", "TISSUE_ID1", "TISSUE_ID2",
                      "TISSUE_FACTOR", "IDENTIFIER"]
        df = df.drop(["BLANK", "TISSUE_ID1", "TISSUE_ID2"], axis=1)
        df['ALTERATION_TYPE'] = "METHYLATION"
        self._to_csv(df, 'methylation.csv')
        self.methylation_df = df

    def _read_variant_data(self):
        logger.info("Processing variant data.")
        # This block deals with variant data
        xls = pd.ExcelFile(self._get_path('variant'))
        df = xls.parse(skiprows=20 )
        df = df[ ['SAMPLE', 'COSMIC_ID', 'Cancer Type', 'Gene', 'Recurrence Filter' ] ]
        df = df.loc[df['Recurrence Filter'] == 'Yes' ]
        df.columns = ['CELL_LINE', 'COSMIC_ID', 'TISSUE_FACTOR', 'IDENTIFIER',
                      'RECURRENCE' ]
        df = df.drop('RECURRENCE', axis=1)
        df['ALTERATION_TYPE'] = "GENETIC_VARIATION"
        self._to_csv(df, 'variant.csv')
        self.variant_df = df

    def _read_cell_line_data(self):
        logger.info("Processing cell line dictionary.")
        # This block imports the cell line dictionary
        xls = pd.ExcelFile(self._get_path("cell_line"))
        df = xls.parse(header = 2 )
        df.columns = ["CELL_LINE", "COSMIC_ID", "WES", "CNA", "GEX",
                      "METH", "DRUGDATA", "T_ID1", "T_ID2", "TISSUE_FACTOR",
                      "MSI_FACTOR", "MEDIA_FACTOR", "GROWTH_FACTOR" ]
        df.index = range(len(df.index ) )
        df = df.drop(0)
        df = df[['CELL_LINE', 'COSMIC_ID', 'TISSUE_FACTOR', 'MSI_FACTOR',
                 'MEDIA_FACTOR', 'GROWTH_FACTOR']]
        self._to_csv(df, 'cell_line_dict.csv')
        self.cell_line_df = df

    def _read_ic50_data(self):
        logger.info("Processing IC50 data." )
        xls = pd.ExcelFile(self._get_path('ic50') )
        df = xls.parse(header = 4 )
        df.columns.values[ 0 ] = 'COSMIC_ID'
        df = df.drop(0)
        df = df.drop('Sample Names', axis = 1 )
        self.ic50_df = df

        drug_dict = xls.parse(header=4)[0:1]
        drug_dict.drop(drug_dict.columns[[0, 1]], axis=1, inplace=True)
        drug_dict = drug_dict.T.reset_index()
        drug_dict.columns = ['DRUG_ID', 'DRUG_NAME']
        drug_dict['DRUG_TARGET'] = 'Unknown'
        self.drug_decoder_df = drug_dict

        self._to_csv(df, "ic50_full_matrix.csv")
        self._to_csv(drug_dict, "drug_decoder.csv")

        self.backup_ic50_df = self.ic50_df
        self.backup_drug_decoder_df = self.drug_decoder_df

    def _combine_genomic_annotation(self):
        logger.info("Combining data frames." )
        self.genomic_df = pd.concat([self.variant_df, self.cna_df,
                                     self.methylation_df] ,
                                     axis=0, join='inner')
        self.genomic_df = self.genomic_df.merge(self.cell_line_df,
                                    how='left', on=['CELL_LINE', 'TISSUE_FACTOR'])
        self._to_csv(self.genomic_df, 'genomic.csv')
        self.backup_genomic_df = self.genomic_df

[docs]    def reset_genomic_data(self ):
        self.genomic_df = self.backup_genomic_df
        self.annotate = False
        self.ic50_df = self.backup_ic50_df
        self.drug_decoder_df = self.backup_drug_decoder_df

[docs]    def annotate_all(self ):
        """Annotates dataframes after read_annotation"""
        logger.info("Annotating data" )
        self.genomic_df = self.genomic_df.merge(self.annotation_df,
            how='left', on=['IDENTIFIER'])
        self.genomic_df = self._string_split(self.genomic_df, 'GENE', ',' )
        self.annotate = True

    def _string_split(self, to_be_split, col_name, separator):
        """This function splits multiple genes contained within the
        same genomic region across multiple rows, duplicating all other values.
        """
        s = to_be_split[col_name].str.split(separator ).apply(pd.Series, 1).stack()
        # Making index
        s.index = s.index.droplevel(-1) # Flattens levels to retrieve indices from original frame
        # Defining name
        s.name = col_name # Join requires Series name
        #  Delete column in original dataframe
        del to_be_split[col_name]
        # Join data frames.
        split_df = to_be_split.join(s)
        # Return split data frame
        return split_df

    def _read_annotation(self):
        # Read and merge annotations
        xls = pd.ExcelFile(self._get_path("methylation_annotation"))

        methylation_annotation = xls.parse(header = 16)
        methylation_annotation = methylation_annotation[ ['Genomic Coordinates', 'GN' ] ]
        methylation_annotation.columns = [ 'IDENTIFIER', 'GENE' ]
        methylation_annotation.GENE = methylation_annotation.GENE.replace(
            to_replace = "; ", value=",", regex=True)
        methylation_annotation = methylation_annotation[
            methylation_annotation.IDENTIFIER.duplicated() == False ]

        xls = pd.ExcelFile(self._get_path("cna_annotation"))
        cna_annotation = xls.parse(header = 19)
        cna_annotation = cna_annotation[ [ 'Identifier', 'Contained genes' ] ]
        cna_annotation.columns = ['IDENTIFIER', 'GENE']

        variant_annotation = pd.DataFrame({
            'IDENTIFIER': self.variant_df.IDENTIFIER.unique(),
            'GENE': self.variant_df.IDENTIFIER.unique()})
        # CHECK FOR UNIQUENESS

        self.annotation_df = pd.concat([ methylation_annotation, 
            cna_annotation, variant_annotation ], axis = 0, join = "inner" )

        self._to_csv(self.annotation_df, "annotation.csv")

    def _check_filter(self, values, valid_values):
        if values is None:
            raise TypeError("Please enter list of types to keep. Valid types: " + 
                ", ".join(valid_values))
        for this in values:
            if this not in valid_values:
                raise ValueError("%s not valid. Use one of " % this + 
                    " ".join(valid_values))

[docs]    def filter_by_alteration_type(self, type_list=None):
        valid_types = self.genomic_df.ALTERATION_TYPE.unique()
        self._check_filter(type_list, valid_types)
        type_list = [ x.upper() for x in type_list ]
        self.genomic_df = self.genomic_df.query("ALTERATION_TYPE in @type_list" )

[docs]    def filter_by_gene(self, gene_list=None):
        if gene_list is None:
            print("Please enter a valid gene list or set to 'Core Genes' " + 
                  "to use default GDSC1000 core genes" )
            return
        elif isinstance(gene_list, str ):
            gene_list = self.defaults.gene_dict[gene_list]
        #gene_list = [x.upper() for x in gene_list]
        if self.annotate == True:
            self.genomic_df = self.genomic_df.query("GENE in @gene_list" )
        else:
            self.genomic_df = self.genomic_df.query("IDENTIFIER in @gene_list" )

[docs]    def filter_by_tissue_type(self, tissue_list=None):
        if tissue_list == None:
            print("Please enter list of tissues. To use all tissues, add argument tissue_list = 'Complete'" )
            print("Possible dictionary keys include: ", self.defaults.tissue_dict.keys() )
            return
        elif isinstance(tissue_list, str ):
            tissue_list = self.defaults.tissue_dict[ tissue_list ]
        tissue_list = [ x.upper() for x in tissue_list ]
        self.genomic_df = self.genomic_df.query("TISSUE_FACTOR in @tissue_list" )

[docs]    def filter_by_cosmic_id(self, cosmic_list=None):
        if cosmic_list == None:
            print("Please enter list of COSMIC IDs. To use an example set of COSMIC IDs, add argument cosmic_list = 'Dummy'" )
            print("Possible dictionary keys include: ", self.defaults.cosmic_dict.keys() )
            return
        elif isinstance(cosmic_list, str ):
            cosmic_list = self.defaults.cosmic_dict[ cosmic_list ]
        self.genomic_df = self.genomic_df.query("COSMIC_ID in @cosmic_list" )

[docs]    def filter_by_cell_line(self, cell_line_list = None ):
        if cell_line_list == None:
            print("Please enter list of cell line names. To use an example set of names, add argument cell_line_list = 'Dummy'" )
            print("Possible dictionary keys include: ", self.defaults.cell_line_dict.keys() )
            return
        elif isinstance(cell_line_list, str ):
            cell_line_list = self.defaults.cell_line_dict[cell_line_list]

        self.genomic_df = self.genomic_df.query("CELL_LINE in @cell_line_list")

[docs]    def filter_by_recurrence(self, min_recurrence=None):
        if min_recurrence == None:
            min_recurrence = self.recurrence

        if self.annotate == True:
            feature = "GENE"
        else:
            feature = "IDENTIFIER"

        flatten_cell_lines = self.genomic_df.groupby([ feature, "COSMIC_ID", "TISSUE_FACTOR" ], 
            as_index = False )[[ "ALTERATION_TYPE" ]].count()
        feature_count = flatten_cell_lines.groupby([feature], as_index = False )[[ "COSMIC_ID" ]].count()
        self.feature_count = feature_count[ feature_count["COSMIC_ID"] >= min_recurrence ]

        self.genomic_df = self.genomic_df[self.genomic_df[feature].isin(self.feature_count[feature])]

    # Make into matrix
[docs]    def make_matrix(self, min_recurrence=None):
        """Return a dataframe compatible with ANOVA analysis"""
        msi_dictionary = {"MSI-H": 1, "MSS/MSI-L": 0}

        if min_recurrence == None:
            min_recurrence = self.recurrence
        self.filter_by_recurrence(min_recurrence=min_recurrence)

        if self.annotate == True:
            feature = "GENE"
        else:
            feature = "IDENTIFIER"

        genomic_matrix = pd.crosstab(self.genomic_df[ feature ], 
            columns=[ self.genomic_df["COSMIC_ID"], self.genomic_df['TISSUE_FACTOR'], 
                      self.genomic_df["CELL_LINE"], self.genomic_df["MSI_FACTOR"],
                      self.genomic_df["MEDIA_FACTOR"]])
        genomic_matrix[ genomic_matrix > 1 ] = 1
        self.genomic_matrix = genomic_matrix.T.reset_index()
        self.genomic_matrix.MSI_FACTOR = self.genomic_matrix.MSI_FACTOR.map(msi_dictionary)

        self._to_csv(self.genomic_matrix, "genomic_features_make_matrix.csv", quoting=csv.QUOTE_NONNUMERIC)


        ic50_filtered_matrix = self.ic50_df.query("COSMIC_ID in @self.genomic_matrix.COSMIC_ID.tolist()" )
        self._to_csv(ic50_filtered_matrix, "ic50_make_matrix.csv")


    def _get_info(self, annotations):
        if self.annotate == True:
            feature = "GENE"
            summary = annotations.GENE.apply(lambda x: x.split(","))
        else:
            feature = "IDENTIFIER"
            summary = annotations.IDENTIFIER.apply(lambda x: x.split(","))
        summary = summary.to_frame()
        summary['COUNT'] = summary[feature].apply(lambda x: len(x))
        unique = len(set([y for x in summary[feature].values for y in x]))
        describe = summary.describe()
        describe.loc['unique genes'] = unique
        return describe

[docs]    def get_methylation_info(self):
        """Return dataframe with number of genes per unique methylation identifier"""
        # Get the unique methylated regions
        ident = self.methylation_df.IDENTIFIER.unique()

        # From the annotation, extract the corresponding data
        annotations = self.annotation_df.loc[
            self.annotation_df.IDENTIFIER.apply(lambda x: x in ident)]

        # Now, from the subset of annotations, get the GENE column and count
        # number of genes that may not be unique but separated by commas
        return self._get_info(annotations)

[docs]    def get_cna_info(self):
        """Return dataframe with number of genes per unique CNA identifier"""
        # Get the unique methylated regions
        ident = self.cna_df.IDENTIFIER.unique()

        # From the annotation, extract the corresponding data
        annotations = self.annotation_df.loc[
            self.annotation_df.IDENTIFIER.apply(lambda x: x in ident)]

        # Now, from the subset of annotations, get the GENE column and count
        # number of genes that may not be unique but separated by commas
        return self._get_info(annotations)

[docs]    def get_genomic_info(self):
        """Return information about the genomic dataframe

        The returned dataframes contains two columns: the number of unique 
        cosmic identifiers and features for each type of alterations.
        """
        cosmic = []
        features = []
        alterations =  ['METHYLATION', 'DELETION', 'GENETIC_VARIATION',
                     'AMPLIFICATION']
        for alteration in alterations:
            this = self.genomic_df.loc[self.genomic_df.ALTERATION_TYPE == alteration]
            N = len(this.COSMIC_ID.unique())
            cosmic.append(N)

            this = self.genomic_df.loc[self.genomic_df.ALTERATION_TYPE == alteration]
            features.append(len(this))

        df = pd.DataFrame({"features": features, "cosmic": cosmic})
        df.index = alterations
        try:
            print("Number of unique genes: {}".format(
                len(self.genomic_df.GENE.unique())))
        except:
            print("Number of unique genes: {}".format(
                len(self.genomic_df.IDENTIFIER.unique())))
        print("Number of unique COSMIC ID: {}".format(
            len(self.genomic_df.COSMIC_ID.unique())))
        return df