"""This module provides an interface to download and filter GDSC1000 data,
as published by Iorio et al (2016).
Author: Elisabeth D. Chen
Date: 2016-06-17
"""
import os
import csv
try: #python 3
from urllib.request import urlretrieve
except:
from urllib import urlretrieve
from gdsctools.default_sets import DefaultDictionaries
import colorlog as logger
import pandas as pd
[docs]class GDSC1000(object ):
"""Help data retrieval from GDSC website (version v17)
::
from gdsctools import GDSC1000
data = GDSC1000()
data.download_data()
Next time, starting GDSCTools in the same directory, just type::
data = GDSCTools()
data.load_data()
CNA (e.g., :attr:`cna_df`), Methylation and gene variants are downloaded.
IC50 as well and a set of annotations. There are combined in
:attr:`genomic_df`.
The :attr:`genomic_df` contains the :attr:`cna_df`, :attr:`methylation_df`
and :attr:`variant_df` data. The three latter contains in common the
CELL_LINE, ALTERATION_TYPE, IDENTIFIER and TISSUE_FACTOR.
ALTERATION_TYPE can be GENETIC_VARIATION, METHYLATION, DELETION / AMPLIFICATION
variant_df also has the COSMIC_ID. The :attr:`annotation_df` gives mapping
between identifiers and gene names.
.. autosummary::
download_data
load_data
make_matrix
With this class, you may (1) download the data, (2) annotate or filter the
data and (3) create a genomic matrix.
To load the data, either download it from the website using
:meth:`download_data` (downloads data from cancerrxgene.org loading methylation, cna,
variant and cell lines datasets). It extract relevant columns, re-names column names
and saves as csv files in the :attr:`data_folder_name` directory.
If you have already downloaded the data, you may just load it using
:meth:`load_data`. This function also annotates the data with gene
information.
Then, you filter the data with one of the filter methods::
filter_by_cell_line([ "AsPC-1", "U-2-OS", "MDA-MB-231"... ] )
filter_by_cosmic_id([ 948121, 2839818, ...] )
filter_by_gene(["ATM", "TP53", ...])
filter_by_recurrence(min_recurrence = 3 )
filter_by_tissue_type([ "BRCA", "COAD/READ", ... ] )
filter_by_alteration_type(["METHYLATION", "AMPLIFICATION",
"DELETION", "GENE_VARIANT" ] )
Finally, you can create the genomic matrix using :meth:`make_matrix`.
You can also look at the unique regions for agiven data set. For
instance, methylation dataframe has about 2,000 regions but only 338 are
unique::
len(data.methylation_df.groupby('IDENTIFIER').groups)
"""
def __init__(self, annotate=False, data_folder_name="./data/gdsc_1000_data/" ):
""".. rubric:: constructor
:param bool annotate:
:param str data_folder_name:
"""
self._save_csv = True
self.annotate = annotate
self.recurrence = 3
self._data_folder_name = data_folder_name
self._url_base = "http://www.cancerrxgene.org/gdsc1000/GDSC1000_WebResources/Data/suppData/"
self._mapper = {"methylation": ["TableS2J.xlsx", "methylation.xlsx"],
"cna": ["TableS2G.xlsx", "cna.xlsx"],
"variant": ["TableS2C.xlsx", "variant.xlsx"],
"cell_line": ["TableS1E.xlsx", "cell_line_dict.xlsx"],
"methylation_annotation": ["TableS2H.xlsx", "methylation_annotation.xlsx"],
"cna_annotation": ["TableS2D.xlsx", "cna_annotation.xlsx"],
"ic50": ["TableS4A.xlsx", "ic50.xlsx"],
}
self.defaults = DefaultDictionaries()
self.core_genes = self.defaults.gene_dict['Core Genes']
def __str__(self):
txt = "Input:"
try:
txt = " Cell lines: {}\n".format(len(self.cell_line_df))
except:
return "You must call download_data() or load_data()"
txt += " CNA features: {}\n".format(len(self.cna_df))
txt += " Methylation features: {}\n".format(len(self.methylation_df))
txt += " Variant: {}\n".format(len(self.variant_df))
total = len(self.cna_df) + len(self.methylation_df) + len(self.variant_df)
txt += " Total: {}\n\n".format(total)
txt += "Genomic Features :"
import collections
txt += str(collections.Counter(self.genomic_df.ALTERATION_TYPE))
return txt
def _get_path(self, key):
return self._data_folder_name + self._mapper[key][1]
[docs] def download_data(self):
"""Download and load data in memory"""
self._download_raw()
self._convert_raw_gdsc1000_data()
def _convert_raw_gdsc1000_data(self):
"""Converts raw Excel document into CSV"""
self._read_cell_line_data()
self._read_cna_data()
self._read_methylation_data()
self._read_variant_data()
self._read_ic50_data()
self._read_annotation()
self._combine_genomic_annotation()
def _to_csv(self, df, filename, quoting=None):
if self._save_csv:
logger.info("Saving %s" % filename)
df.to_csv(self._data_folder_name + filename, index=False)
def _read_csv(self, filename):
return pd.read_csv(self._data_folder_name + filename)
[docs] def load_data(self, annotation=True):
"""If CSV files are already downloaded, just load them"""
self.variant_df = self._read_csv("variant.csv")
self.methylation_df = self._read_csv("methylation.csv")
self.cna_df = self._read_csv("cna.csv")
self.cell_line_df = self._read_csv("cell_line_dict.csv")
self.ic50_df = self._read_csv("ic50_full_matrix.csv")
self.drug_decoder_df = self._read_csv("drug_decoder.csv")
self.genomic_df = self._read_csv("genomic.csv")
self.annotation_df = self._read_csv("annotation.csv")
self.backup_genomic_df = self.genomic_df
self.backup_ic50_df = self.ic50_df
self.backup_drug_decoder_df = self.drug_decoder_df
if annotation:
self.annotate_all()
def _urlretrieve(self, filename, target):
urlretrieve(
self._url_base + filename,
self._data_folder_name + target)
def _download_raw(self):
if os.path.exists(self._data_folder_name):
pass
else:
os.makedirs(self._data_folder_name)
# download all data sets
for key, value in self._mapper.items():
logger.info('Downloading %s ' % key)
self._urlretrieve(*value)
def _read_cna_data(self):
logger.info("Processing CNA data.")
# This block reads in and formats CNA data
xls = pd.ExcelFile(self._get_path('cna'))
df = xls.parse(header=2)
# in version from feb 2017, there are 4 extra blank columns to be
# drop
dummies = ["dum1", "dum2", "dum3", "dum4"]
df.columns = ["BLANK", "CELL_LINE", "TISSUE_ID1", "TISSUE_ID2",
"TISSUE_FACTOR", "ALTERATION_TYPE", "IDENTIFIER" ] + dummies
df = df.drop(['BLANK', 'TISSUE_ID1', 'TISSUE_ID2'] + dummies, axis = 1)
df.IDENTIFIER = df.IDENTIFIER.str.split(' ').str[0]
df.ALTERATION_TYPE = df.ALTERATION_TYPE.str.upper()
self._to_csv(df, 'cna.csv')
self.cna_df = df
def _read_methylation_data(self):
logger.info("Processing methylation data.")
# This block deals with the methylation data
xls = pd.ExcelFile(self._get_path('methylation'))
df = xls.parse(header=2)
df.columns = ["BLANK", "CELL_LINE", "TISSUE_ID1", "TISSUE_ID2",
"TISSUE_FACTOR", "IDENTIFIER"]
df = df.drop(["BLANK", "TISSUE_ID1", "TISSUE_ID2"], axis=1)
df['ALTERATION_TYPE'] = "METHYLATION"
self._to_csv(df, 'methylation.csv')
self.methylation_df = df
def _read_variant_data(self):
logger.info("Processing variant data.")
# This block deals with variant data
xls = pd.ExcelFile(self._get_path('variant'))
df = xls.parse(skiprows=20 )
df = df[ ['SAMPLE', 'COSMIC_ID', 'Cancer Type', 'Gene', 'Recurrence Filter' ] ]
df = df.loc[df['Recurrence Filter'] == 'Yes' ]
df.columns = ['CELL_LINE', 'COSMIC_ID', 'TISSUE_FACTOR', 'IDENTIFIER',
'RECURRENCE' ]
df = df.drop('RECURRENCE', axis=1)
df['ALTERATION_TYPE'] = "GENETIC_VARIATION"
self._to_csv(df, 'variant.csv')
self.variant_df = df
def _read_cell_line_data(self):
logger.info("Processing cell line dictionary.")
# This block imports the cell line dictionary
xls = pd.ExcelFile(self._get_path("cell_line"))
df = xls.parse(header = 2 )
df.columns = ["CELL_LINE", "COSMIC_ID", "WES", "CNA", "GEX",
"METH", "DRUGDATA", "T_ID1", "T_ID2", "TISSUE_FACTOR",
"MSI_FACTOR", "MEDIA_FACTOR", "GROWTH_FACTOR" ]
df.index = range(len(df.index ) )
df = df.drop(0)
df = df[['CELL_LINE', 'COSMIC_ID', 'TISSUE_FACTOR', 'MSI_FACTOR',
'MEDIA_FACTOR', 'GROWTH_FACTOR']]
self._to_csv(df, 'cell_line_dict.csv')
self.cell_line_df = df
def _read_ic50_data(self):
logger.info("Processing IC50 data." )
xls = pd.ExcelFile(self._get_path('ic50') )
df = xls.parse(header = 4 )
df.columns.values[ 0 ] = 'COSMIC_ID'
df = df.drop(0)
df = df.drop('Sample Names', axis = 1 )
self.ic50_df = df
drug_dict = xls.parse(header=4)[0:1]
drug_dict.drop(drug_dict.columns[[0, 1]], axis=1, inplace=True)
drug_dict = drug_dict.T.reset_index()
drug_dict.columns = ['DRUG_ID', 'DRUG_NAME']
drug_dict['DRUG_TARGET'] = 'Unknown'
self.drug_decoder_df = drug_dict
self._to_csv(df, "ic50_full_matrix.csv")
self._to_csv(drug_dict, "drug_decoder.csv")
self.backup_ic50_df = self.ic50_df
self.backup_drug_decoder_df = self.drug_decoder_df
def _combine_genomic_annotation(self):
logger.info("Combining data frames." )
self.genomic_df = pd.concat([self.variant_df, self.cna_df,
self.methylation_df] ,
axis=0, join='inner')
self.genomic_df = self.genomic_df.merge(self.cell_line_df,
how='left', on=['CELL_LINE', 'TISSUE_FACTOR'])
self._to_csv(self.genomic_df, 'genomic.csv')
self.backup_genomic_df = self.genomic_df
[docs] def reset_genomic_data(self ):
self.genomic_df = self.backup_genomic_df
self.annotate = False
self.ic50_df = self.backup_ic50_df
self.drug_decoder_df = self.backup_drug_decoder_df
[docs] def annotate_all(self ):
"""Annotates dataframes after read_annotation"""
logger.info("Annotating data" )
self.genomic_df = self.genomic_df.merge(self.annotation_df,
how='left', on=['IDENTIFIER'])
self.genomic_df = self._string_split(self.genomic_df, 'GENE', ',' )
self.annotate = True
def _string_split(self, to_be_split, col_name, separator):
"""This function splits multiple genes contained within the
same genomic region across multiple rows, duplicating all other values.
"""
s = to_be_split[col_name].str.split(separator ).apply(pd.Series, 1).stack()
# Making index
s.index = s.index.droplevel(-1) # Flattens levels to retrieve indices from original frame
# Defining name
s.name = col_name # Join requires Series name
# Delete column in original dataframe
del to_be_split[col_name]
# Join data frames.
split_df = to_be_split.join(s)
# Return split data frame
return split_df
def _read_annotation(self):
# Read and merge annotations
xls = pd.ExcelFile(self._get_path("methylation_annotation"))
methylation_annotation = xls.parse(header = 16)
methylation_annotation = methylation_annotation[ ['Genomic Coordinates', 'GN' ] ]
methylation_annotation.columns = [ 'IDENTIFIER', 'GENE' ]
methylation_annotation.GENE = methylation_annotation.GENE.replace(
to_replace = "; ", value=",", regex=True)
methylation_annotation = methylation_annotation[
methylation_annotation.IDENTIFIER.duplicated() == False ]
xls = pd.ExcelFile(self._get_path("cna_annotation"))
cna_annotation = xls.parse(header = 19)
cna_annotation = cna_annotation[ [ 'Identifier', 'Contained genes' ] ]
cna_annotation.columns = ['IDENTIFIER', 'GENE']
variant_annotation = pd.DataFrame({
'IDENTIFIER': self.variant_df.IDENTIFIER.unique(),
'GENE': self.variant_df.IDENTIFIER.unique()})
# CHECK FOR UNIQUENESS
self.annotation_df = pd.concat([ methylation_annotation,
cna_annotation, variant_annotation ], axis = 0, join = "inner" )
self._to_csv(self.annotation_df, "annotation.csv")
def _check_filter(self, values, valid_values):
if values is None:
raise TypeError("Please enter list of types to keep. Valid types: " +
", ".join(valid_values))
for this in values:
if this not in valid_values:
raise ValueError("%s not valid. Use one of " % this +
" ".join(valid_values))
[docs] def filter_by_alteration_type(self, type_list=None):
valid_types = self.genomic_df.ALTERATION_TYPE.unique()
self._check_filter(type_list, valid_types)
type_list = [ x.upper() for x in type_list ]
self.genomic_df = self.genomic_df.query("ALTERATION_TYPE in @type_list" )
[docs] def filter_by_gene(self, gene_list=None):
if gene_list is None:
print("Please enter a valid gene list or set to 'Core Genes' " +
"to use default GDSC1000 core genes" )
return
elif isinstance(gene_list, str ):
gene_list = self.defaults.gene_dict[gene_list]
#gene_list = [x.upper() for x in gene_list]
if self.annotate == True:
self.genomic_df = self.genomic_df.query("GENE in @gene_list" )
else:
self.genomic_df = self.genomic_df.query("IDENTIFIER in @gene_list" )
[docs] def filter_by_tissue_type(self, tissue_list=None):
if tissue_list == None:
print("Please enter list of tissues. To use all tissues, add argument tissue_list = 'Complete'" )
print("Possible dictionary keys include: ", self.defaults.tissue_dict.keys() )
return
elif isinstance(tissue_list, str ):
tissue_list = self.defaults.tissue_dict[ tissue_list ]
tissue_list = [ x.upper() for x in tissue_list ]
self.genomic_df = self.genomic_df.query("TISSUE_FACTOR in @tissue_list" )
[docs] def filter_by_cosmic_id(self, cosmic_list=None):
if cosmic_list == None:
print("Please enter list of COSMIC IDs. To use an example set of COSMIC IDs, add argument cosmic_list = 'Dummy'" )
print("Possible dictionary keys include: ", self.defaults.cosmic_dict.keys() )
return
elif isinstance(cosmic_list, str ):
cosmic_list = self.defaults.cosmic_dict[ cosmic_list ]
self.genomic_df = self.genomic_df.query("COSMIC_ID in @cosmic_list" )
[docs] def filter_by_cell_line(self, cell_line_list = None ):
if cell_line_list == None:
print("Please enter list of cell line names. To use an example set of names, add argument cell_line_list = 'Dummy'" )
print("Possible dictionary keys include: ", self.defaults.cell_line_dict.keys() )
return
elif isinstance(cell_line_list, str ):
cell_line_list = self.defaults.cell_line_dict[cell_line_list]
self.genomic_df = self.genomic_df.query("CELL_LINE in @cell_line_list")
[docs] def filter_by_recurrence(self, min_recurrence=None):
if min_recurrence == None:
min_recurrence = self.recurrence
if self.annotate == True:
feature = "GENE"
else:
feature = "IDENTIFIER"
flatten_cell_lines = self.genomic_df.groupby([ feature, "COSMIC_ID", "TISSUE_FACTOR" ],
as_index = False )[[ "ALTERATION_TYPE" ]].count()
feature_count = flatten_cell_lines.groupby([feature], as_index = False )[[ "COSMIC_ID" ]].count()
self.feature_count = feature_count[ feature_count["COSMIC_ID"] >= min_recurrence ]
self.genomic_df = self.genomic_df[self.genomic_df[feature].isin(self.feature_count[feature])]
# Make into matrix
[docs] def make_matrix(self, min_recurrence=None):
"""Return a dataframe compatible with ANOVA analysis"""
msi_dictionary = {"MSI-H": 1, "MSS/MSI-L": 0}
if min_recurrence == None:
min_recurrence = self.recurrence
self.filter_by_recurrence(min_recurrence=min_recurrence)
if self.annotate == True:
feature = "GENE"
else:
feature = "IDENTIFIER"
genomic_matrix = pd.crosstab(self.genomic_df[ feature ],
columns=[ self.genomic_df["COSMIC_ID"], self.genomic_df['TISSUE_FACTOR'],
self.genomic_df["CELL_LINE"], self.genomic_df["MSI_FACTOR"],
self.genomic_df["MEDIA_FACTOR"]])
genomic_matrix[ genomic_matrix > 1 ] = 1
self.genomic_matrix = genomic_matrix.T.reset_index()
self.genomic_matrix.MSI_FACTOR = self.genomic_matrix.MSI_FACTOR.map(msi_dictionary)
self._to_csv(self.genomic_matrix, "genomic_features_make_matrix.csv", quoting=csv.QUOTE_NONNUMERIC)
ic50_filtered_matrix = self.ic50_df.query("COSMIC_ID in @self.genomic_matrix.COSMIC_ID.tolist()" )
self._to_csv(ic50_filtered_matrix, "ic50_make_matrix.csv")
def _get_info(self, annotations):
if self.annotate == True:
feature = "GENE"
summary = annotations.GENE.apply(lambda x: x.split(","))
else:
feature = "IDENTIFIER"
summary = annotations.IDENTIFIER.apply(lambda x: x.split(","))
summary = summary.to_frame()
summary['COUNT'] = summary[feature].apply(lambda x: len(x))
unique = len(set([y for x in summary[feature].values for y in x]))
describe = summary.describe()
describe.loc['unique genes'] = unique
return describe
[docs] def get_methylation_info(self):
"""Return dataframe with number of genes per unique methylation identifier"""
# Get the unique methylated regions
ident = self.methylation_df.IDENTIFIER.unique()
# From the annotation, extract the corresponding data
annotations = self.annotation_df.loc[
self.annotation_df.IDENTIFIER.apply(lambda x: x in ident)]
# Now, from the subset of annotations, get the GENE column and count
# number of genes that may not be unique but separated by commas
return self._get_info(annotations)
[docs] def get_cna_info(self):
"""Return dataframe with number of genes per unique CNA identifier"""
# Get the unique methylated regions
ident = self.cna_df.IDENTIFIER.unique()
# From the annotation, extract the corresponding data
annotations = self.annotation_df.loc[
self.annotation_df.IDENTIFIER.apply(lambda x: x in ident)]
# Now, from the subset of annotations, get the GENE column and count
# number of genes that may not be unique but separated by commas
return self._get_info(annotations)
[docs] def get_genomic_info(self):
"""Return information about the genomic dataframe
The returned dataframes contains two columns: the number of unique
cosmic identifiers and features for each type of alterations.
"""
cosmic = []
features = []
alterations = ['METHYLATION', 'DELETION', 'GENETIC_VARIATION',
'AMPLIFICATION']
for alteration in alterations:
this = self.genomic_df.loc[self.genomic_df.ALTERATION_TYPE == alteration]
N = len(this.COSMIC_ID.unique())
cosmic.append(N)
this = self.genomic_df.loc[self.genomic_df.ALTERATION_TYPE == alteration]
features.append(len(this))
df = pd.DataFrame({"features": features, "cosmic": cosmic})
df.index = alterations
try:
print("Number of unique genes: {}".format(
len(self.genomic_df.GENE.unique())))
except:
print("Number of unique genes: {}".format(
len(self.genomic_df.IDENTIFIER.unique())))
print("Number of unique COSMIC ID: {}".format(
len(self.genomic_df.COSMIC_ID.unique())))
return df