# Source code for gdsctools.datasets

# coding=utf-8
# -*- python -*-
#
#  This file is part of GDSCTools software
#
#  Copyright (c) 2015 - Wellcome Trust Sanger Institute
#  All rights reserved
#
#  File author(s): Thomas Cokelaer <cokelaer@gmail.com>
#
#  Distributed under the BSD 3-Clause License.
#  See accompanying file LICENSE.txt distributed with this software
#
#  website: http://github.com/CancerRxGene/gdsctools
#
##############################################################################
"""Data sets provided with GDSCTools

The datasets may be for testing purposes:

- :attr:`ic50_test`
- :attr:`drug_test`
- :attr:`cosmic_builder_test`

or informative:

- :attr:`cancer_cell_lines`
- :attr:`cosmic_info`

or used in analysis:

- :attr:`genomic_features_test`
- :attr:`ic50_v17`: IC50s for 1001 cell lines
- :attr:`gf_v17`: dataset with genomic features for 1001 cell lines and 680
  features (mutation, CNA)
- :attr:`ic50_v5`
- :attr:`gf_v5`
"""
# use underscore to hide from API
import easydev


# Names exported by a star-import of this module. This is only the initial
# list: further dataset names are appended to it next to their definitions
# later in the file.
__all__ = [
    "Data",
    "ic50_test",
    "genomic_features_test",
    "cosmic_info",
    "cosmic_builder_test",
    "cancer_cell_lines",
]

def _gsf(filename):
    """Look up *filename* through :func:`gdsctools.gdsctools_data`.

    The returned value is what the dataset definitions in this module store
    as :attr:`Data.filename`.

    :param filename: name of a data file shipped with GDSCTools
        (e.g. ``"IC50_10drugs.tsv"``)
    :return: whatever :func:`gdsctools.gdsctools_data` returns for
        *filename* (presumably the full path of the file -- confirm against
        that function)
    """
    # The import is resolved at call time rather than at module import time.
    from gdsctools import gdsctools_data as _locate
    return _locate(filename)


class Data(object):
    """A convenience class to hold information about a dataset

    Each :class:`Data` instance contains information about:

    #. the file location (:attr:`filename`)
    #. the data description (:attr:`description`)
    #. the authors (:attr:`authors`)

    But the data is not stored and users must read the data set using
    their own tools.

    :param filename: where the data set is located (full path); defaults
        to None
    :param description: a short description (string)
    :param authors: the authors of the data set (string)
    """
    def __init__(self, filename=None, description="No description",
                 authors="GDSC consortium"):
        #: where is located the data set (full path)
        self.filename = filename
        #: a short description (string)
        self.description = description
        #: list of authors (string)
        self.authors = authors

    def __str__(self):
        """Return a three-line summary: location, description and authors."""
        txt = 'location: %s\n' % self.filename
        txt += 'description: %s\n' % self.description
        txt += 'authors: %s\n' % self.authors
        return txt

    def _get_location(self):
        """Getter behind :attr:`location`; returns :attr:`filename`."""
        return self.filename
    # Read-only alias of :attr:`filename` (no setter is defined).
    location = property(_get_location)

    def __repr__(self):
        """Same text as :meth:`__str__`."""
        return self.__str__()
# ============== DATA SETS DEFINITION
#
# Each name below is a Data record: it holds the location of a file (resolved
# with _gsf) and a description, not the data itself.

# Dataset with IC50s for 10 drugs (for testing)
ic50_test = Data(
    filename=_gsf("IC50_10drugs.tsv"),
    description='IC50s for 10 public drugs across cell lines')

# Dataset with genomic features for 1001 cell lines and 680 features
genomic_features_test = Data(
    filename=_gsf('genomic_features.tsv.gz'),
    description='Set of genomic features / tissue / msi')

# Dataset with cancer cell lines name / cosmic id/ tissue type and sub type
cancer_cell_lines = Data(
    filename=_gsf('cancer_cell_lines.csv'),
    description=("List of cosmic identifiers with "
                 "the corresponding name, tissue and sub tissue types"))

# Example of flat file to be read by COSMICFetcher
cosmic_builder_test = Data(
    filename=_gsf('cosmic_builder_test.txt'),
    description="An example of flat file to be read by COSMICFetcher")

# Dataframe with COSMIC ID and their information
cosmic_info = Data(
    filename=_gsf('cosmic_info.csv.gz'),
    description="Information about 1001 cell lines including COSMIC ID")

# IC50 from v17
ic50_v17 = Data(_gsf("IC50_v17.csv.gz"))
__all__.append("ic50_v17")

# Genomic Feature from v17
gf_v17 = Data(
    _gsf("genomic_features_v17.csv.gz"),
    description="PANCAN genomic features from v17 GDSC release")
__all__.append("gf_v17")

# IC50 from v5
ic50_v5 = Data(_gsf("IC50_v5.csv.gz"))
__all__.append("ic50_v5")

# Genomic Feature from v5
gf_v5 = Data(_gsf("genomic_features_v5.csv.gz"))
__all__.append("gf_v5")


# Build a dedicated data set for testing purposes
def _build_testing():
    """Return an :class:`easydev.AttrDict` of :class:`Data` test records.

    Every attribute of the returned container is a :class:`Data` instance
    whose ``filename`` is resolved with :func:`_gsf` and whose ``authors``
    keeps the :class:`Data` default.
    """
    testing = easydev.AttrDict()

    testing.drug_test_tsv = Data(
        filename=_gsf('test_drug_decode.tsv'),
        description='drug_decode in TSV format')

    testing.drug_test_csv = Data(
        filename=_gsf('test_drug_decode.csv'),
        description='drug_decode in CSV format')

    testing.ic50_test_csv = Data(
        filename=_gsf('test_ic50_11_50.csv'),
        description='A 10drug/50 cell lines IC50 test file in CSV format')

    testing.genomic_features_csv = Data(
        filename=_gsf('test_genomic_features.csv'),
        description=
            'A 50 cell lines by 20 features GenomicFeature in CSV format')

    testing.ic50_test = Data(
        filename=_gsf('test_IC50.csv'),
        description='A 10drug/1000 cell lines IC50 test file in CSV format')

    testing.ic50_test_header_no_drug_prefix = Data(
        filename=_gsf('test_IC50_header2.csv'),
        description='An IC50 test (header with column without Drug_ prefix)')

    testing.ic50_test_header_drug_prefix_only = Data(
        filename=_gsf('test_IC50_header1.csv'),
        description=
            'An IC50 test (header with column with Drug_ prefix only)')

    testing.ic50_test_header_mixed_drug_prefix = Data(
        filename=_gsf('test_IC50_header3.csv'),
        description=
            'An IC50 test (header with mixed prefixes i.e. Drug_ or not)')

    testing.genomic_features_bare_csv = Data(
        filename=_gsf('test_genomic_features_bare.csv'),
        description="A 50 cell lines by 17 features without MSI/tissue/sample")

    return testing


# Built once at import time; not listed in __all__, so it is reached as an
# attribute of this module rather than through a star-import.
testing = _build_testing()