Source code for gdsctools.gdsc

# -*- coding: utf-8 -*-
#
#  This file is part of GDSCTools software
#
#  Copyright (c) 201 - GDSCTools Development Team
#
#  File author(s): Thomas Cokelaer <thomas.cokelaer@pasteur.fr>
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  documentation: http://gdsctools.readthedocs.io
#
##############################################################################
import glob
import os

from gdsctools.anova import ANOVA
from gdsctools.readers import IC50
from gdsctools.readers import DrugDecode
from gdsctools.anova_results import ANOVAResults
from gdsctools.anova_report import ANOVAReport
from gdsctools.settings import ANOVASettings
from gdsctools.anova_report import ReportMain

import pandas as pd

from easydev.console import purple, brown, red
from easydev import Progress

from reports import HTMLTable

__all__ = ["IC50Cluster", "GDSC"]


[docs]class IC50Cluster(IC50):
    """Utility to cluster columns that correspond to the same drug ID

    From GDSC v18 data sets onwards, DRUG identifiers may be duplicated
    to account for different drug concentration. This is not recommended
    since we'd rather use unique identifier for different experiments but to
    account for this feature, the IC50Cluster will rename the columns and
    transform the data as follows.

    Consider the case of the DRUG 1211. It appears 3 times in the original
    data::

        Drug_1211_0.15625_IC50
        Drug_1211_1_IC50
        Drug_1211_10_IC50

    Actually, there are about 15 such cases even though in general there are
    only 2 duplicates::


            DRUG_ID   1   2    3  total  icommon       ratio
        21     1782  47  47  NaN     47      47  100.000000
        20     1510  45  45  NaN     45      45  100.000000
        19     1211  48  47  4.0     50      46   92.000000
        18     1208  48  47  NaN     50      45   90.000000
        16     1032  43  47  NaN     50      40   80.000000
        17     1207  38  47  NaN     50      35   70.000000
        13      231   2  39  NaN     39       2    5.128205
        14      232  45   2  NaN     45       2    4.444444
        10      226   2  45  NaN     45       2    4.444444
        12      230   2  46  NaN     46       2    4.347826
        15      238   2  46  NaN     46       2    4.347826
        0       206   2  46  NaN     46       2    4.347826
        1       211  46   2  NaN     46       2    4.347826
        9       224   2  46  NaN     46       2    4.347826
        8       223   2  46  NaN     46       2    4.347826
        7       221   2  46  NaN     46       2    4.347826
        6       217   2  46  NaN     46       2    4.347826
        5       216   2  46  NaN     46       2    4.347826
        4       215   2  46  NaN     46       2    4.347826
        3       214   2  46  NaN     46       2    4.347826
        2       213   2  46  NaN     46       2    4.347826
        11      229   2  46  NaN     46       2    4.347826

    The clustering works as follows. If the ratio of drugs in common between
    several concentrations is large, then they are studied independently.
    Otherwise they are merged.

    In the final dataframe, the columns names are transformed into unique
    identifiers like in the IC50 class by removing the ``Drug_`` prefix and
    ````_conc_IC50`` suffix.

    The :attr:`mapping` contains the mapping between new and old identifiers.

    .. seealso:: :meth:`cleanup` method.
    """
    def __init__(self, ic50, ratio_threshold=10, verbose=True, cluster=True):
        """.. rubric:: constructor

        :param ic50:
        :param int ratio_threshold:
        :param bool verbose:
        :param bool cluster: may be useful to not cluster the data for
            testing or debugging

        """
        super(IC50Cluster, self).__init__(ic50, v18=True)
        self.verbose = verbose
        self.ratio_threshold = ratio_threshold

        if self.verbose:
            print(self)
        if cluster:
            self.cluster()
            self.cleanup()
            if self.verbose:
                print(self)

    def _get_to_cluster(self):
        info = self._info()
        if len(info) > 0:
            to_cluster = info[info.ratio < self.ratio_threshold].DRUG_ID.values
            return list(to_cluster)
        else:
            return []
    to_cluster = property(_get_to_cluster)

    def _get_mapping(self):
        from collections import defaultdict
        mapping = defaultdict(list)

        drug_ids = [self.drug_name_to_int(x) for x in self.df.columns]
        for drug_id, colname in zip(drug_ids, self.df.columns):
            mapping[drug_id].append(colname)
        return mapping

    def _get_duplicated(self):
        mapping = self._get_mapping()
        duplicated = [key for key in mapping.keys() if len(mapping[key]) > 1]
        return duplicated
    duplicated = property(_get_duplicated)

    def _info(self):
        mapping = self._get_mapping()
        duplicated = self.duplicated

        max_ids = max([len(x) for x in mapping.values()])

        results = []
        for drug_id in duplicated:
            df = self.df[mapping[drug_id]]

            total = df.mean(axis=1).count()
            individuals = list(df.count().values)

            # the number of individuals may be different. In v18 one drug had 3
            # entries (all other duplicated had only 2). so, we need to add
            # dummies (NA) when required:
            individuals += [None] * (max_ids - len(individuals))

            common = sum(df.count(axis=1) >= 2)
            result = [drug_id] + individuals + [total, common,
                    100 * common/float(total)]
            results.append(result)

        df = pd.DataFrame(results)
        if len(df):
            df.columns = ['DRUG_ID'] + [str(x) for x in range(1, max_ids+1)] +\
                ['total', 'common', 'ratio']
            try:
                df.sort_values(by='ratio', ascending=False, inplace=True)
            except:
                df.sort('ratio', ascending=False, inplace=True)
        return df

[docs]    def cluster(self):
        # get list of drug identifiers to cluster

        to_cluster = self.to_cluster
        self.clustered = to_cluster[:]
        mapping = self._get_mapping()
        self.mapped = {}

        if self.verbose:
            print('Found  %s non unique drug identifiers ' %
                    len(self.duplicated))
            print('Clustering %s of them.\n' % len(self.to_cluster))
        if len(self.to_cluster) == 0:
            return

        for identifier in to_cluster:
            drug_names = mapping[identifier]

            # Let us keep only the first concentration for now
            new_drug_name = drug_names[0]
            if len(drug_names) > 1:
                todrop = drug_names[1:]
            # add new column with new name and mean of the columns with same
            # drug id
            self.df[new_drug_name] = self.df[drug_names].mean(axis=1)
            # Remove the individual columns
            self.df.drop(todrop, axis=1, inplace=True)

[docs]    def cleanup(self, offset=10000):
        """Rename the columns into unique identifiers

        :param int offset: if duplicated, add the offset

        The :attr:`mapping` contains the mapping, which should be used
        to update the decoder file.
        """
        # Need to transform column names in proper identifiers (integer)
        # and makes sure identifiers are unique. If not, we add +10000
        # Also, for later we keep track of the original mame in a dictionary
        self.extra_mapping = {}
        new_columns = []
        for col in self.df.columns:
            identifier = self.drug_name_to_int(col)
            while identifier in new_columns:
                identifier += offset # not robust but would do for now
                # We use a while since ids may occur 3 times
            self.extra_mapping[identifier] = col
            new_columns.append(identifier)
        self.df.columns = new_columns


class GDSCBase(object):
    def __init__(self, genomic_feature_pattern="GF_*csv", verbose=True):
        self.verbose = True
        self.gf_filenames = glob.glob(genomic_feature_pattern)
        if len(self.gf_filenames) == 0:
            msg = "NO Genomic feature input files found. We expect files " +\
                    "with this pattern: GF_<TCGA>.csv e.g., (GF_COREAD.csv)"
            raise ValueError(msg)
        pass

    def mkdir(self, name):
        try:
            os.mkdir(name)
        except:
            if self.verbose:
                print("Note that directory %s already exists" % name)


[docs]class GDSC(GDSCBase):
    """Wrapper of the :class:`~gdcstools.anova.ANOVA` class and reports to
    analyse all TCGA Tissues and companies automatically while creating summary
    HTML pages.

    First, one need to provide an unique IC50 file. Second, the DrugDecode
    file (see :class:`~gdsctools.readers.DrugDecode`) must be provided
    with the DRUG identifiers and their corresponding names. Third,
    a set of genomic feature files must be provided for each :term:`TCGA`
    tissue.


    You then create a GDSC instance::

        from gdsctools import GDSC
        gg = GDSC('IC50_v18.csv', 'DRUG_DECODE.txt',
            genomic_feature_pattern='GF*csv')

    At that stage you may want to change the settings, e.g::

        gg.settings.FDR_threshold = 20

    Then run the analysis::

        gg.analysis()

    This will launch an ANOVA analysis for each TCGA tissue + PANCAN case
    if provided. This will also create a data package for each tissue.
    The data packages are stored in ./tissue_packages directory.

    Since all private and public drugs are stored together, the next step is
    to create data packages for each company::

        gg.create_data_packages_for_companies()

    you may select a specific one if you wish::

        gg.create_data_packages_for_companies(['AZ'])

    Finally, create some summary pages::

        gg.create_summary_pages()

    You entry point is an HTML file called **index.html**
    """
    def __init__(self, ic50, drug_decode,
            genomic_feature_pattern="GF_*csv",
            main_directory="tissue_packages", verbose=True):
        """.. rubric:: Constructor

        :param ic50: an :class:`~gdsctools.readers.IC50` file.
        :param drug_decode: an :class:`~gdsctools.readers.DrugDecode` file.
        :param genomic_feature_pattern: a glob to a set of
            :class:`~gdsctools.readers.GenomicFeature` files.

        """
        super(GDSC, self).__init__(genomic_feature_pattern, verbose=verbose)
        assert isinstance(ic50, str)
        self.ic50_filename = ic50
        self.dd_filename = drug_decode
        self.main_directory = main_directory

        self.settings = ANOVASettings()
        self.settings.animate = False
        self.drug_decode = DrugDecode(drug_decode)

        print("Those settings will be used (check FDR_threshold)")
        print(self.settings)

        # figure out the cancer types:
        self.results = {}

        self.company_directory = "company_packages"

        # quick test on 15 features
        self.test = False

[docs]    def analyse(self, multicore=None):
        """Launch ANOVA analysis and creating data package for each tissue.

        :param bool onweb: By default, reports are created
            but HTML pages not shown. Set to True if you wish to open
            the HTML pages.
        :param multicore: number of cpu to use (1 by default)

        """
        self.mkdir(self.main_directory)
        # First analyse all TCGA cases + PANCAN once for all and
        # store all the results in a dictionary.
        self._analyse_all(multicore=multicore)

    def _analyse_all(self, multicore=None):
        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print(purple('======================== Analysing %s data' % tcga))

            self.mkdir(self.main_directory + os.sep + tcga)
            # Computes the ANOVA
            try:
                self.ic50 = IC50(self.ic50_filename)
            except:
                print("Clustering IC50 (v18 released data ?)")
                self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)
            an = ANOVA(self.ic50, gf_filename, self.drug_decode,
                verbose=False)

            if self.test is True:
                an.features.df = an.features.df[an.features.df.columns[0:15]]

            self.an = an
            an.settings = ANOVASettings(**self.settings)
            an.settings.analysis_type = tcga
            an.init() # This reset the directory

            results = an.anova_all(multicore=multicore)
            an.settings.directory = self.main_directory + os.sep + tcga
            # Store the results
            self.results[tcga] = results

            print('Analysing %s data and creating images' % tcga)
            self.report = ANOVAReport(an)
            self.report.settings.savefig = True

            self.report.create_html_pages(onweb=False)

[docs]    def create_data_packages_for_companies(self, companies=None):
        """Creates a data package for each company found in the DrugDecode file
        """
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################

        # companies must be just one name (one string) or a list of strings
        # By default, takes all companies found in DrugDecode
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        if len(companies) == 0:
            raise ValueError("Could not find any companies in the DrugDecode file")

        # The main directory
        self.mkdir(self.company_directory)

        # Loop over all companies, retrieving information built
        # in analyse() method, selecting for each TCGA all information
        # for that company only (and public drugs)
        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print(purple("\n=========== Analysing company %s out of %s (%s)" %
                    (ii+1, Ncomp, company)))
            self.mkdir(self.company_directory + os.sep + company)

            # Handle each TCGA case separately
            for gf_filename in sorted(self.gf_filenames):
                tcga = gf_filename.split("_")[1].split('.')[0]
                print(brown("  ------- building TCGA %s sub directory" % tcga))

                # Read the results previously computed either
                try:
                    results_df = self.results[tcga].df.copy()
                except:
                    results_path = "%s/%s/OUTPUT/results.csv" % (self.main_directory, tcga)
                    results_df = ANOVAResults(results_path)


                # MAke sure the results are formatted correctly
                results = ANOVAResults(results_df)

                # Get the DrugDecode information for that company only
                drug_decode_company = self.drug_decode.df.query(
                        "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)

                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)

                # Filter the results to keep only public drugs and that
                # company. Make sure this is integers
                results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)

                mask = [True if x in drug_decode_company.df.index else False
                        for x in results.df.DRUG_ID]

                results.df = results.df.loc[mask]

                # We read the IC50 again
                try:
                    self.ic50 = IC50(self.ic50_filename)
                except:
                    self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

                # And create an ANOVA instance. This is not to do the analyse
                # again but to hold various information
                an = ANOVA(self.ic50, gf_filename, drug_decode_company,
                    verbose=False)

                def drug_to_keep(drug):
                    to_keep = drug in drug_decode_company.df.index
                    return to_keep
                an.ic50.df = an.ic50.df.select(drug_to_keep, axis=1)

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = self.company_directory + os.sep + company + os.sep + tcga
                an.settings.analysis_type = tcga

                # Now we create the report
                self.report = ANOVAReport(an, results,
                        drug_decode=drug_decode_company,
                        verbose=self.verbose)
                self.report.company = company
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)
                self.report.create_html_features()
                self.report.create_html_drugs()
                self.report.create_html_associations()

    def _get_tcga(self):
        return [x.split("_")[1].split(".")[0] for x in self.gf_filenames]
    tcga = property(_get_tcga)

    def _get_companies(self):
        return [x for x in self.drug_decode.companies if x != 'Commercial']
    companies = property(_get_companies)

[docs]    def create_summary_pages(self):
        """Create summary pages

        Once the main analyis is done (:meth:`analyse`), and the company
        packages have been created (:meth:`create_data_packages_for_companies`),
        you can run this method that will creade a summary HTML page
        (index.html) for the tissue, and a similar summary HTML page for the
        tissues of each company. Finally, an HTML summary page for the
        companies is also created.

        The final tree direcorty looks like::


            |-- index.html
            |-- company_packages
            |   |-- index.html
            |   |-- Company1
            |   |   |-- Tissue1
            |   |   |-- Tissue2
            |   |   |-- index.html
            |   |-- Company2
            |   |   |-- Tissue1
            |   |   |-- Tissue2
            |   |   |-- index.html
            |-- tissue_packages
            |   |-- index.html
            |   |-- Tissue1
            |   |-- Tissue2


        """
        # First for the main directory (tissue_packages):
        print(purple("Creating summary index.html for the tissues"))
        self._create_summary_pages(self.main_directory, verbose=False)

        # Then for each companies:
        print(purple("Creating summary index.html for each company"))
        pb = Progress(len(self.companies))
        for i, company in enumerate(self.companies):
            try:
                self._create_summary_pages(self.company_directory + os.sep +
                    company, verbose=False, company=company)
            except Exception as err:
                print(red("Issue with %s. Continue with other companies" % company))
                print(err)
            pb.animate(i+1)

        # Finally, an index towards each company
        self._create_main_index()

    def _create_main_index(self):
        # We could also add a column with number of association ?
        companies = self.companies[:]
        df = pd.DataFrame({"Company": companies})
        html_page = ReportMain(directory=".",
                filename='index.html',
                template_filename='main_summary.html',
                mode="summary")
        html_table = HTMLTable(df)
        html_table.add_href('Company', newtab=True, url="company_packages/",
                suffix="/index.html")
        html_page.jinja['data_table'] =  html_table.to_html(collapse_table=False)
        html_page.jinja['analysis_domain'] =  "All companies / All "
        html_page.jinja['tissue_directory'] = self.main_directory
        html_page.write()

    def _create_summary_pages(self, main_directory, verbose=True,
            company=None):
        # Read all directories in tissue_packages

        directories = glob.glob(main_directory + os.sep + '*')

        summary = []
        for directory in sorted(directories):
            tcga = directory.split(os.sep)[-1]
            if tcga not in self.tcga:
                continue
            if verbose:
                print(directory, tcga)
            # number of hits
            path = directory + os.sep + 'OUTPUT' + os.sep
            try:
                hits = pd.read_csv(path + 'drugs_summary.csv', sep=',')
            except:
                summary.append([tcga] + [None] * 5)
                continue
            total_hits = hits.total.sum()

            drug_involved = hits['Unnamed: 0'].unique()

            drug_involved = [int(str(x).split("-")[0]) for x in drug_involved]

            results = ANOVAResults(path + 'results.csv')
            if len(results) > 0:
                drug_ids = results.df.DRUG_ID.unique()
            else:
                drug_ids = []

            # where to find the DRUG DECODE file. Should
            # have been copied
            path = directory + os.sep + 'INPUT' + os.sep
            drug_decode = DrugDecode(path + 'DRUG_DECODE.csv')
            #drug_decode.df.set_index("DRUG_ID", inplace=True)
            info = drug_decode.get_info()

            webrelease = drug_decode.df.loc[drug_involved].WEBRELEASE
            drug_inv_public = sum(webrelease == 'Y')
            drug_inv_prop = sum(webrelease != 'Y')

            summary.append([tcga, total_hits,
                drug_inv_prop, info['N_prop'],
                drug_inv_public, info['N_public']])
        df = pd.DataFrame(summary)

        df.columns = ['Analysis name', 'Number of hits',
            'Number of involved proprietary compounds', 'out of',
            'Number of involved public', 'out of']

        try:
            df.sort_values(by="Number of hits", ascending=False, inplace=True)
        except:
            df.sort("Number of hits", ascending=False, inplace=True)

        output_dir = main_directory + os.sep + '..' + os.sep
        output_file = output_dir + os.sep + 'index.html'
        self.html_page = ReportMain(directory=main_directory,
                filename='index.html',
                template_filename='datapack_summary.html',
                mode="summary")

        # Let us use our HTMLTable to add the HTML references
        self.html_table = HTMLTable(df)
        self.html_table.add_href('Analysis name', newtab=True, url=None,
                suffix='/index.html')
        self.html_table.add_bgcolor('Number of hits')

        self.html_page.jinja['data_table'] =  self.html_table.to_html(
                collapse_table=False)
        if company:
            self.html_page.jinja["collaborator"] = company

        self.html_page.write()

        return df