# -*- python -*-
# -*- coding: utf-8 -*-
# This file is part of GDSCTools software
#
# Copyright (c) 2015 - Wellcome Trust Sanger Institute
# All rights reserved
#
# File author(s): Thomas Cokelaer <cokelaer@gmail.com>
#
# Distributed under the BSD 3-Clause License.
# See accompanying file LICENSE.txt distributed with this software
#
# website: http://github.com/CancerRxGene/gdsctools
#
##############################################################################
"""IO functionalities
Provides readers to read the following formats
- Matrix of IC50 data set :class:`IC50`
- Matrix of Genomic features with :class:`GenomicFeatures`
- Drug Decoder table with :class:`DrugDecode`
"""
import warnings
from gdsctools.errors import GDSCToolsDuplicatedDrugError
import pandas as pd
import pylab
import numpy as np
import easydev
import colorlog
__all__ = ['IC50', 'GenomicFeatures', 'Reader', 'DrugDecode']
def drug_name_to_int(name):
    """Convert a drug identifier to its integer form.

    Identifiers such as ``Drug_950_IC50`` (or ``DRUG_950_IC50``) are reduced
    to the integer 950. In v18, names such as ``Drug_1_0.33_IC50`` carry the
    concentration after a second underscore, so only the token before the
    second underscore is kept. Surrounding quotes (' and ") are stripped and
    plain integers are returned unchanged.

    Related to issue #154: casting a string of more than 19 digits to an
    integer may exceed the 2**64-1 limit handled by pandas, so longer
    strings are truncated to their first 19 digits (with a warning).

    :param name: a string identifier or an integer
    :return: the integer identifier
    :raises NotImplementedError: if *name* is neither a string nor an integer
    :raises ValueError: if the relevant token cannot be cast to an integer
    """
    def _str_to_int(x, maxdigits=19):
        # Accept integers (including numpy integers) as-is; cast strings,
        # truncating over-long ones so the result fits below 2**64-1 (#154).
        if isinstance(x, (int, np.integer)):
            return x
        elif isinstance(x, str):
            if len(x) > maxdigits:
                print("Warnings gdsctools.readers.drug_name_to_int: " +
                    "%s identifier too long." % x +
                    "Please use values below 2**64 with less than 19 digits")
                x = int(x[0:maxdigits])
            else:
                x = int(x)
            return x
        else:
            print(type(x))
            raise NotImplementedError

    # remove characters (' and ")
    if isinstance(name, str):
        name = name.replace("'", "")
        name = name.replace('"', "")

    # Replace the Drug_ and DRUG_ prefixes, keep the token before the second
    # underscore and cast it to an integer.
    try:
        res = name.replace("Drug_", "").replace("DRUG_", "")
        res = res.split("_")[0]
        res = _str_to_int(res)
        return res
    except (AttributeError, ValueError):
        # AttributeError: *name* is not a string (e.g. an integer already);
        # ValueError: the token was not numeric. In both cases fall back to
        # casting the original value directly.
        return _str_to_int(name)
class Reader(object):
    """Convenience base class to read CSV or TSV files (using extension)"""

    def __init__(self, data=None):
        r""".. rubric:: Constructor

        This class takes only one input parameter, however, it may be a
        filename, or a dataframe or an instance of :class:`Reader` itself. This
        means than children classes such as :class:`IC50` can also be used
        as input as long as a dataframe named :attr:`df` can be found.

        :param data: a filename in CSV or TSV format with format specified by
            child class (see e.g. :class:`IC50`), or a valid dataframe, or an
            instance of :class:`Reader`.

        The input can be a filename either in CSV (comma separated values) or
        TSV (tabular separated values). The extension will be used to interpret
        the content, so please be consistent in the naming of the file
        extensions.

        ::

            >>> from gdsctools import Reader, ic50_test
            >>> r = Reader(ic50_test.filename) # this is a CSV file
            >>> len(r.df)   # number of rows
            988
            >>> len(r)      # number of elements
            11856

        Note that :class:`Reader` is a base class and more sophisticated
        readers are available. for example, the :class:`IC50` would be
        better to read this IC50 data set.

        The data has been stored in a data frame in the :attr:`df` attribute.

        The dataframe of the object itself can be used as an input to create
        an new instance::

            >>> from gdsctools import Reader, ic50_test
            >>> r = Reader(ic50_test.filename, sep="\t")
            >>> r2 = Reader(r) # here r.df is simply copied into r2
            >>> r == r2
            True

        It is sometimes convenient to create an empty Reader that will be
        populated later on::

            >>> r = Reader()
            >>> len(r)
            0

        More advanced readers (e.g. :class:`IC50`) can also be used as input
        as long as they have a :attr:`df` attribute::

            >>> from gdsctools import Reader, ic50_test
            >>> ic = IC50(ic50_test)
            >>> r = Reader(ic)
        """
        # input data
        if data is None:
            # create an empty dataframe
            self.df = pd.DataFrame()
            self._filename = None
        elif isinstance(data, str):
            # Read a filename in TSV or CSV format
            self.read_data(data)
            self._filename = data
        elif hasattr(data, 'filename'):
            # could be a data sets from gdsctools.datasets.Data
            self.read_data(data.filename)
            self._filename = data.filename
        elif hasattr(data, 'df'):
            # an instance of a Reader (or child such as IC50, GenomicFeatures)
            self.df = data.df.copy()
            self._filename = data._filename
        elif isinstance(data, pd.DataFrame):
            # Or just a dataframe ?
            self.df = data.copy()
            self._filename = None
        else:
            raise TypeError("Input must be a filename, a IC50 instance, or " +
                            "a dataframe.")

        #: if populated, can be used to check validity of a header
        # used by drug_decode only may be removed
        self.header = []

    def read_data(self, filename):
        """Read a CSV/TSV file (possibly compressed) into :attr:`df`.

        The separator is inferred from the extension (.csv: comma,
        .tsv/.txt: tab) and the compression from a trailing
        .gz/.bz2/.zip/.xz suffix.
        """
        # remove possible white spaces in the header's names
        if ".csv" in filename:
            separator = ","
        elif ".tsv" in filename:
            separator = "\t"
        elif ".txt" in filename:
            separator = "\t"
            print("GDSCTools warning: files with .txt extension are "
                  "accepted (we assume a tab-separated file) but "
                  "should be renamed with .csv or .tsv extension")
        else:
            raise NotImplementedError("Only .csv or .tsv files are accepted ")

        try:
            # this is to cope with pandas 0.13 on ReadTheDoc
            # and newer versions
            na_values = ["NA", "NaN"]
            if filename.endswith(".gz"):
                compression = "gzip"
            elif filename.endswith(".bz2"):
                compression = "bz2"
            elif filename.endswith(".zip"):
                compression = "zip"
            elif filename.endswith(".xz"):
                compression = "xz"
            else:
                compression = None

            # Sometimes a column in CSV file may have several values
            # separated by comma. This should be surrounded by quotes ".
            # To account for that feature, quotechar argument must be provided
            # Besides, to avoid conflicts with spaces, skipinitialspace must
            # be set to True. This also helps since spaces would be
            # interpreted as a string. Using skipinitialspace, the spaces
            # are converted to NA
            rawdf = pd.read_csv(filename, sep=separator, comment="#",
                                na_values=na_values, skipinitialspace=True,
                                compression=compression, quotechar='"')
            # Sometimes, a user will provide a CSV, which is actually
            # tab-delimited. This is wrong and difficult to catch
        except Exception:
            msg = 'Could not read %s. See gdsctools.readers.Reader'
            print(msg % filename)
            raise

        # let us drop columns that are unnamed and print information
        columns = [x for x in rawdf.columns if x.startswith('Unnamed')]
        if len(columns) > 0:
            print('%s unnamed columns found and removed. ' % len(columns) +
                  'Please fix your input file.')
        self.df = rawdf.drop(columns, axis=1)

        # Some fields may be empty strings, which must be set as NA.
        # warnings is imported at module level; the previous local import
        # was redundant.
        warnings.filterwarnings('ignore')
        self.df = self.df.replace(" ", "").replace("\t", "").replace("",
                                                                     np.nan)
        warnings.filterwarnings("default")

        # Finally, check that names do not contain the unwanted character
        # / that was used in some old matrices.
        if len([True for x in self.df.columns if "/" in x]) > 0:
            print("Your input data contains unwanted / characters in " +
                  " the header. Let's remove them.")
            self.df.columns = [x.replace("/", "_") for x in self.df.columns]

    def _interpret(self):
        # hook overridden by children classes (e.g. IC50, DrugDecode)
        pass

    def _valid_header(self, df):
        # True if every expected header name is present in *df*
        for name in self.header:
            if name not in list(df.columns):
                return False
        return True

    def _read_matrix_from_r(self, name):
        """Required biokit. Will be removed"""
        # NOTE(review): relies on a self.session attribute set by children
        # classes (see Extra) -- not defined on Reader itself.
        print("Reading matrix %s " % (name))
        self.session.run("rnames = rownames(%s)" % name)
        self.session.run("cnames = colnames(%s)" % name)
        self.session.run("data = %s" % name)
        cnames = self.session.cnames
        rnames = self.session.rnames
        data = self.session.data
        df = pd.DataFrame(data=data.copy())
        df.columns = [x.strip() for x in cnames]
        df.index = [x.strip() for x in rnames]
        return df

    def __str__(self):
        self.df.info()
        return ""

    def __len__(self):
        # total number of elements (rows times columns)
        return self.df.shape[0] * self.df.shape[1]

    def to_csv(self, filename, sep=",", index=False, reset_index=True):
        """Save data into a CSV file without indices"""
        # Reset the index (e.g., COSMIC ID)
        if reset_index is True:
            df = self.df.reset_index()
        else:
            df = self.df
        df.to_csv(filename, sep=sep, index=index)

    def check(self):
        """Checking the format of the matrix

        Currently, only checks that there is no duplicated column names
        """
        if len(self.df.columns.unique()) != len(self.df.columns):
            columns = list(self.df.columns)
            for this in columns:
                if columns.count(this) > 1:
                    raise GDSCToolsDuplicatedDrugError(this)

    def _check_uniqueness(self, data):
        if len(set(data)) != len(data):
            raise Exception("Error gdsctools in readers.IC50: data " +
                            " identifiers not unique.")

    def __eq__(self, other):
        # Element-wise comparison of the dataframes, NAs treated as zeros.
        # The previous implementation used all(df1 == df2), which iterates
        # over column *labels* (not values) and therefore reported equality
        # for dataframes with different contents.
        df1 = self.df.fillna(0)
        df2 = other.df.fillna(0)
        if df1.shape != df2.shape:
            return False
        return bool((df1 == df2).all().all())
class CosmicRows(object):
    """Mixin shared by :class:`IC50` and :class:`GenomicFeatures`.

    Exposes the dataframe index as a list of COSMIC identifiers through
    the :attr:`cosmicIds` property and provides :meth:`drop_cosmic`.
    """

    def _get_cosmic(self):
        return list(self.df.index)

    def _set_cosmic(self, cosmics):
        # Refuse any identifier that is not already in the index, then
        # restrict (and reorder) the dataframe to the requested ones.
        for identifier in cosmics:
            if identifier not in self.cosmicIds:
                raise ValueError('Unknown cosmic identifier')
        self.df = self.df.loc[cosmics]

    cosmicIds = property(_get_cosmic, _set_cosmic,
        doc="return list of cosmic ids (could have duplicates)")

    def drop_cosmic(self, cosmics):
        """drop a drug or a list of cosmic ids"""
        todrop = easydev.to_list(cosmics)
        self.cosmicIds = [c for c in self.cosmicIds if c not in todrop]
class IC50(Reader, CosmicRows):
    """Reader of IC50 data set

    This input matrix must be a comma-separated value (CSV) or
    tab-separated value file (TSV).

    The matrix must have a header and at least 2 columns. If the number of
    rows is not sufficient, analysis may not be possible.

    The header must have a column called "COSMIC_ID" or "COSMIC ID".
    This column will be used as indices (row names). All other columns will
    be considered as input data.

    The column "COSMIC_ID" contains the cosmic identifiers (cell line). The
    other columns should be filled with the IC50s corresponding to a pair
    of COSMIC identifiers and Drug. Nothing prevents you to fill the file
    with data that have other meaning (e.g. AUC).

    If at least one column starts with ``Drug_``, all other columns will be
    ignored. This was implemented for back compatibility.

    The order of the columns is not important.

    Here is a simple example of a valid TSV file::

        COSMIC_ID   Drug_1_IC50 Drug_20_IC50
        111111      0.5         0.8
        222222      1           2

    A test file is provided in the gdsctools package::

        from gdsctools import ic50_test

    You can read it using this class and plot information as follows:

    .. plot::
        :width: 80%
        :include-source:

        from gdsctools import IC50, ic50_test
        r = IC50(ic50_test)
        r.plot_ic50_count()

    You can get basic information using the print function::

        >>> from gdsctools import IC50, ic50_test
        >>> r = IC50(ic50_test)
        >>> print(r)
        Number of drugs: 11
        Number of cell lines: 988
        Percentage of NA 0.206569746043

    You can get the drug identifiers as follows::

        r.drugIds

    and set the drugs, which means other will be removed::

        r.drugIds = [1, 1000]

    .. versionchanged:: 0.9.10
        The column **COSMIC ID** should now be **COSMIC_ID**.
        Previous name is deprecated but still accepted.
    """
    # name of the column holding the cell line identifiers
    cosmic_name = 'COSMIC_ID'

    def __init__(self, filename, v18=False):
        """.. rubric:: Constructor

        :param filename: input filename of IC50s. May also be an instance
            of :class:`IC50` or a valid dataframe. The data is stored as a
            dataframe in the attribute called :attr:`df`. Input file may be
            gzipped
        :param bool v18: in v18, drug identifiers may be duplicated, so the
            conversion of column names to integers is skipped.
        """
        super(IC50, self).__init__(filename)
        # interpret the raw data and check some of its contents
        self._v18 = v18
        if len(self.df) > 0:
            self._interpret()
            self.check()

    def _interpret(self):
        # if there is at least one column that starts with Drug or drug or
        # DRUG or variant then all other columns are dropped except "COSMIC ID"
        # For back compatibility with data that mixes Drug identifiers and
        # genomic features:
        _cols = [str(x) for x in self.df.columns]
        drug_prefix = None
        for this in _cols:
            if this.startswith("Drug_"):
                drug_prefix = "Drug"

        _cols = [str(x) for x in self.df.columns]
        if "COSMIC ID" in _cols and self.cosmic_name not in _cols:
            colorlog.warning("'COSMIC ID' column name is deprecated since " +
                             "0.9.10. Please replace with 'COSMIC_ID'")
            self.df.columns = [x.replace("COSMIC ID", "COSMIC_ID")
                               for x in self.df.columns]

        # The guard previously tested the misspelled "COSMID_ID", which made
        # it ineffective when a COSMIC_ID column was already present.
        if "CL" in _cols and "COSMIC_ID" not in self.df.columns:
            colorlog.warning("'CL column name is deprecated since " +
                             "0.9.10. Please replace with 'COSMIC_ID'")
            self.df.columns = [x.replace("CL", "COSMIC_ID")
                               for x in self.df.columns]

        # If the data has not been interpreted, COSMIC column should be
        # found in the column and set as the index
        _cols = [str(x) for x in self.df.columns]
        if self.cosmic_name in self.df.columns:
            self.df.set_index(self.cosmic_name, inplace=True)
            _cols = [str(x) for x in self.df.columns]
            if drug_prefix:
                columns = [x for x in _cols if x.startswith(drug_prefix)]
                self.df = self.df[columns]
        # If already interpreted, COSMIC name should be the index already.
        # and should be integers, so let us cast to integer
        elif self.df.index.name == self.cosmic_name:
            _cols = [str(x) for x in self.df.columns]
            if drug_prefix:
                columns = [x for x in _cols if x.startswith(drug_prefix)]
            # NOTE(review): the line below overwrites the drug_prefix filter
            # above, making it dead code in this branch. Kept as-is for
            # backward compatibility -- confirm the intended behavior.
            columns = self.df.columns
            assert len(columns) == len(set(columns))
            self.df = self.df[columns]
        # Otherwise, raise an error
        else:
            raise ValueError("{0} column could not be found in the header".format(
                self.cosmic_name))

        # In v18, the drug ids may be duplicated
        if self._v18 is True:
            return

        # Normalise drug identifiers (columns) and cell line ids (index)
        # to plain integers.
        self.df.columns = [drug_name_to_int(x) for x in self.df.columns]
        self.df.columns = self.df.columns.astype(int)

        self.df.index = [int(x) for x in self.df.index]
        self.df.index = self.df.index.astype(int)
        self.df.index.name = "COSMIC_ID"

        # Check uniqueness
        self._check_uniqueness(self.df.index)

    def drug_name_to_int(self, name):
        """Convert a drug identifier (e.g. ``Drug_950_IC50``) to an integer."""
        return drug_name_to_int(name)

    def _get_drugs(self):
        return list(self.df.columns)

    def _set_drugs(self, drugs):
        # Refuse unknown drugs, then restrict the dataframe accordingly.
        for drug in drugs:
            if drug not in self.drugIds:
                raise ValueError('Unknown drug name')
        self.df = self.df[drugs]

    drugIds = property(_get_drugs, _set_drugs,
                       doc='list the drug identifier name or select sub set')

    def drop_drugs(self, drugs):
        """drop a drug or a list of drugs"""
        drugs = easydev.to_list(drugs)
        tokeep = [x for x in self.drugIds if x not in drugs]
        self.drugIds = tokeep

    def __contains__(self, item):
        if item in self.drugIds:
            return True
        else:
            return False

    def plot_ic50_count(self, **kargs):
        """Plots the fraction of valid/measured IC50 per drug

        :param kargs: any valid parameters accepted by pylab.plot function.
        :return: the fraction of valid/measured IC50 per drug
        """
        data = self.df.count() / len(self.df)
        pylab.clf()
        pylab.plot(data.values, **kargs)
        pylab.grid()
        pylab.xlim([0, len(self.drugIds) + 1])
        pylab.xlabel('Drug index')
        pylab.ylim([0, 1])
        pylab.ylabel('Percentage of valid IC50')
        return data

    def hist(self, bins=20, **kargs):
        """Histogram of the measured IC50

        :param bins: binning of the histogram
        :param kargs: any argument accepted by pylab.hist function.

        .. plot::
            :include-source:
            :width: 80%

            from gdsctools import IC50, ic50_test
            r = IC50(ic50_test)
            r.hist()
        """
        pylab.clf()
        pylab.hist(self.get_ic50(), bins=bins, **kargs)
        pylab.grid()
        pylab.xlabel('log IC50')

    def get_ic50(self):
        """Return all ic50 as a list"""
        return [x for x in self.df.values.flatten() if not np.isnan(x)]

    def __str__(self):
        txt = "Number of drugs: %s\n" % len(self.drugIds)
        txt += "Number of cell lines: %s\n" % len(self.df)
        N = len(self.drugIds) * len(self.df)
        Nna = self.df.isnull().sum().sum()
        if N != 0:
            txt += "Percentage of NA {0}\n".format(Nna / float(N))
        return txt

    def __repr__(self):
        Nc = len(self.cosmicIds)
        Nd = len(self.drugIds)
        return "IC50 object <Nd={0}, Nc={1}>".format(Nd, Nc)

    def copy(self):
        """Return a new :class:`IC50` instance built from this one."""
        new = IC50(self)
        return new
class GenomicFeatures(Reader, CosmicRows):
    """Read Matrix with Genomic Features

    These are the compulsory column names required (note the spaces):

    - 'COSMIC_ID'
    - 'TISSUE_FACTOR'
    - 'MSI_FACTOR'

    If one of the following column is found, it is removed (deprecated)::

    - 'SAMPLE_NAME'
    - 'Sample Name'
    - 'CELL_LINE'

    and features can be also encoded with the following convention:

    - columns ending in "_mut" to encode a gene mutation (e.g., BRAF_mut)
    - columns starting with "gain_cna"
    - columns starting with "loss_cna"

    Those columns will be removed:

    - starting with `Drug_`, which are supposedly from the IC50 matrix

    ::

        >>> from gdsctools import GenomicFeatures
        >>> gf = GenomicFeatures()
        >>> print(gf)
        Genomic features distribution
        Number of unique tissues 27
        Number of unique features 677 with
        - Mutation: 270
        - CNA (gain): 116
        - CNA (loss): 291

    .. versionchanged:: 0.9.10
        The header's columns' names have changed to be more consistent.
        Previous names are deprecated but still accepted.

    .. versionchanged:: 0.9.15
        If a tissue is empty, it is replaced by UNDEFINED.
        We also strip the spaces to make sure there is "THIS" and "THIS " are
        the same.
    """
    # canonical names of the special (factor) columns
    colnames = easydev.AttrDict()
    colnames.cosmic = 'COSMIC_ID'
    colnames.tissue = 'TISSUE_FACTOR'
    colnames.msi = 'MSI_FACTOR'
    colnames.media = 'MEDIA_FACTOR'

    def __init__(self, filename=None, empty_tissue_name="UNDEFINED"):
        """.. rubric:: Constructor

        If no file is provided, using the default file provided in the
        package that is made of 1001 cell lines times 680 features.

        :param str empty_tissue_name: if a tissue name is let empty, replace
            it with this string.
        """
        # first reset the filename to the shared data (if not provided)
        if filename is None:
            from gdsctools.datasets import genomic_features_test
            filename = genomic_features_test

        # used in the header so should be set before call to super()
        super(GenomicFeatures, self).__init__(filename)

        # FIXME Remove columns related to Drug if any. Can be removed in
        # the future
        self.df = self.df[[x for x in self.df.columns
                           if x.startswith('Drug_') is False]]

        # Deprecated sample-name columns are dropped.
        for this in ['Sample Name', 'SAMPLE_NAME', 'Sample_Name', 'CELL_LINE']:
            if this in self.df.columns:
                self.df.drop(this, axis=1, inplace=True)

        # Let us rename "COSMIC ID" into "COSMIC_ID" if needed
        for old, new in {
                'Tissue Factor Value': 'TISSUE_FACTOR',
                'MS-instability Factor Value': 'MSI_FACTOR',
                'COSMIC ID': 'COSMIC_ID'}.items():
            if old in self.df.columns:
                colorlog.warning("'%s' column name is deprecated " % old +
                                 " since 0.9.10. Please replace with '%s'" % new)
                self.df.columns = [x.replace(old, new)
                                   for x in self.df.columns]

        # The guard previously tested the misspelled "COSMID_ID", which made
        # it ineffective when a COSMIC_ID column was already present.
        # NOTE(review): the replace() below affects any column name that
        # contains "CL" as a substring -- confirm this is intended.
        if "CL" in self.df.columns and "COSMIC_ID" not in self.df.columns:
            self.df.columns = [x.replace("CL", "COSMIC_ID")
                               for x in self.df.columns]

        # There are 3 special columns to hold the factors
        self._special_names = []

        # If tissue factor is not provided, we create and fill it with dummies.
        # OTherwise, we need to change a lot in the original code in ANOVA
        if self.colnames.tissue not in self.df.columns:
            colorlog.info("column named '%s' not found" % self.colnames.tissue)
            self.df[self.colnames.tissue] = ['UNDEFINED'] * len(self.df)
            self._special_names.append(self.colnames.tissue)
        else:
            self._special_names.append(self.colnames.tissue)

        self.found_msi = self.colnames.msi in self.df.columns
        if self.found_msi is False:
            colorlog.info("column named '%s' not found" % self.colnames.msi)
        else:
            self._special_names.append(self.colnames.msi)

        self.found_media = self.colnames.media in self.df.columns
        if self.found_media is True:
            self._special_names.append(self.colnames.media)

        # order columns and index
        self._order()
        self._interpret_cosmic()
        self.check()
        self._fix_empty_tissues(empty_tissue_name)

    def _fix_empty_tissues(self, name="UNDEFINED"):
        # Sometimes, tissues may be empty so a nan is present. This leads to
        # errors in ANOVA or Regression so we replace them with *name*.
        # Fixes: the *name* argument used to be ignored (a literal
        # 'UNDEFINED' was always used) and the warning called the undefined
        # global ``logger`` (NameError); colorlog is what this module uses.
        N = self.df.TISSUE_FACTOR.isnull().sum()
        if N > 0:
            colorlog.warning("Some tissues were empty strings and renamed as UNDEFINED!")
            self.df.TISSUE_FACTOR.fillna(name, inplace=True)

    def _get_shift(self):
        return len(self._special_names)
    shift = property(_get_shift, doc="number of special (factor) columns")

    def _interpret_cosmic(self):
        # The COSMIC_ID column becomes the (integer, sorted) index.
        if self.colnames.cosmic in self.df.columns:
            self.df.set_index(self.colnames.cosmic, inplace=True)
        elif self.colnames.cosmic == self.df.index.name:
            pass
        else:
            error_msg = "the features input file must contains a column " +\
                " named %s" % self.colnames.cosmic
            raise ValueError(error_msg)
        self.df.index = [int(x) for x in self.df.index]
        self.df.index = self.df.index.astype(int)
        self.df.index.name = "COSMIC_ID"
        self.df.sort_index(inplace=True)

    def _order(self):
        # special (factor) columns first, then the features
        others = [x for x in self.df.columns if x not in self._special_names]
        self.df = self.df[self._special_names + others]

    def _get_features(self):
        return list(self.df.columns)

    def _set_features(self, features):
        for feature in features:
            if feature not in self.features:
                raise ValueError('Unknown feature name %s' % feature)
        # keep the special factor columns whatever the user asked for
        features = [x for x in features if x.endswith('FACTOR') is False]
        features = self._special_names + features
        self.df = self.df[features]
        self._order()

    features = property(_get_features, _set_features,
                        doc="return list of features")

    def _get_tissues(self):
        return list(self.df[self.colnames.tissue])
    tissues = property(_get_tissues, doc='return list of tissues')

    def _get_unique_tissues(self):
        return list(self.df[self.colnames.tissue].unique())
    unique_tissues = property(_get_unique_tissues, doc='return set of tissues')

    def plot(self, shadow=True, explode=True, fontsize=12):
        """Histogram of the tissues found

        .. plot::
            :include-source:
            :width: 80%

            from gdsctools import GenomicFeatures
            gf = GenomicFeatures() # use the default file
            gf.plot()
        """
        if self.colnames.tissue not in self.df.columns:
            return
        data = pd.get_dummies(self.df[self.colnames.tissue]).sum()
        data.index = [x.replace("_", " ") for x in data.index]
        pylab.figure(1)
        pylab.clf()
        labels = list(data.index)
        labels = ["%s (%s)" % (l, count) for l, count in zip(labels, data)]
        if explode:
            pylab.pie(data, labels=labels, shadow=shadow, explode=[0.2] * len(labels))
        else:
            pylab.pie(data, labels=labels, shadow=shadow)

        # For the pie chart, we do not sort so that small slices are not all
        # together and this gives a better final pie chart but for the
        # histogram, we can sort it.
        try:
            data = data.sort_values(ascending=True)
        except:
            # older pandas versions used sort()
            data = data.sort(ascending=True)

        # The bar plot
        pylab.figure(2)
        data.plot(kind='barh', width=0.8)
        pylab.grid(True)
        pylab.xlabel('Occurences', fontsize=fontsize)
        # keep the try to prevent MacOS issue
        try:
            pylab.tight_layout()
        except:
            pass
        return data

    def __str__(self):
        txt = 'Genomic features distribution\n'
        try:
            tissues = list(self.df[self.colnames.tissue].unique())
            Ntissue = len(tissues)
            txt += 'Number of unique tissues {0}'.format(Ntissue)
            if Ntissue == 1:
                txt += ' ({0})\n'.format(tissues[0])
            elif Ntissue < 10:
                txt += '\nHere are the tissues: '
                txt += ",".join(tissues) + "\n"
            else:
                txt += '\nHere are the first 10 tissues: '
                txt += ", ".join(tissues[0:10]) + "\n"
        except:
            txt += 'No information about tissues\n'

        if self.found_msi:
            txt += "MSI column: yes\n"
        else:
            txt += "MSI column: no\n"

        if self.found_media:
            txt += "MEDIA column: yes\n"
        else:
            txt += "MEDIA column: no\n"

        # subtract the shift since MSI, tissue, media are special columns
        Nfeatures = len(self.features)
        txt += '\nThere are {0} unique features distributed as\n'.format(Nfeatures - self.shift)

        n_mutations = len([x for x in self.df.columns if x.endswith("_mut")])
        txt += "- Mutation: {}\n".format(n_mutations)

        n_gain = len([x for x in self.df.columns if x.startswith("gain_cna")])
        txt += "- CNA (gain): {}\n".format(n_gain)
        n_loss = len([x for x in self.df.columns if x.startswith("loss_cna")])
        txt += "- CNA (loss): {}".format(n_loss)
        return txt

    def drop_tissue_in(self, tissues):
        """Drop tissues from the list

        :param list tissues: a list of tissues to drop. If you have only
            one tissue, can be provided as a string. Since rows are removed
            some features (columns) may now be empty (all zeros). If so, those
            columns are dropped (except for the special columns (e.g, MSI).
        """
        tissues = easydev.to_list(tissues)
        mask = self.df[self.colnames.tissue].isin(tissues) == False
        self.df = self.df[mask]
        self._cleanup()

    def keep_tissue_in(self, tissues):
        """Drop tissues not in the list

        :param list tissues: a list of tissues to keep. If you have only
            one tissue, can be provided as a string. Since rows are removed
            some features (columns) may now be empty (all zeros). If so, those
            columns are dropped (except for the special columns (e.g, MSI).
        """
        tissues = easydev.to_list(tissues)
        mask = self.df[self.colnames.tissue].isin(tissues)
        self.df = self.df[mask]
        self._cleanup()

    def _cleanup(self, required_features=0):
        # FIXME: there is view/copy warning here in pandas. it should be fixed
        # or may have side-effects
        to_ignore = self._special_names
        # create a view ignoring the informative columns
        view = self.df[[x for x in self.df.columns if x not in to_ignore]]

        todrop = list(view.columns[view.sum() <= required_features])
        self.df.drop(todrop, axis=1, inplace=True)

    def __repr__(self):
        Nc = len(self.cosmicIds)
        Nf = len(self.features) - self.shift
        try:
            Nt = len(set(self.tissues))
        except:
            Nt = '?'
        return "GenomicFeatures <Nc={0}, Nf={1}, Nt={2}>".format(Nc, Nf, Nt)

    def compress_identical_features(self):
        """Merge duplicated columns/features

        Columns duplicated are merged as follows. The first column is kept,
        others are dropped but to keep track of those dropped, the column name
        is renamed by concatenating the columns's names. The separator is a
        double underscore.

        ::

            gf = GenomicFeatures()
            gf.compress_identical_features()
            # You can now access to the column as follows (arbitrary example)
            gf.df['ARHGAP26_mut__G3BP2_mut']
        """
        # let us identify the duplicates as True/False
        datatr = self.df.transpose()
        duplicated_no_first = datatr[datatr.duplicated()]
        try:
            duplicated = datatr[datatr.duplicated(keep=False)]
        except:
            # pandas 0.16
            duplicated = datatr[datatr.duplicated(take_last=False)]

        tokeep = [x for x in duplicated.index if x not in duplicated_no_first.index]

        # Let us create a groupby strategy
        groups = {}
        # Let us now add the corresponding duplicates
        for feature in tokeep:
            # Find all rows identical to this feature
            matches = (duplicated.loc[feature] == duplicated).all(axis=1)
            groups[feature] = "__".join(duplicated.index[matches])

        # This drops all duplicated columns (the first is kept, others are
        # dropped)
        self.df = self.df.transpose().drop_duplicates().transpose()
        self.df.rename(columns=groups, inplace=True)

        # We want to keep the column names informative that is if there were
        # duplicates, we rename the column kept with the concatenation of all
        # the corresponding duplicates
        print("compressed %s groups of duplicates" % len(groups))
        return groups

    def get_TCGA(self):
        """Return the TCGA annotation for the stored COSMIC identifiers."""
        from gdsctools.cosmictools import COSMICInfo
        c = COSMICInfo()
        tcga = c.df.loc[self.df.index].TCGA
        return tcga
class PANCAN(Reader):
    """Reads RData file with all genomic features including methylation.

    Will be removed. Used to read the original data in R format; the data
    will be provided as CSV or TSV instead.

    .. deprecated:: since v0.12
    """
    def __init__(self, filename=None):
        # The former R-based loading (a biokit RSession reading the
        # PANCAN_simple_MOBEM.rdata file into MoBEM) has been disabled;
        # this class is kept only as a deprecated placeholder. Note that
        # super().__init__ is intentionally not called.
        print('deprecated')
class Extra(Reader):
    """Reader for the legacy ``djvIC50v17v002-nowWithRMSE`` RData file.

    Loads AUC, IC50 and residual matrices (plus per cell line xmid/scale
    parameters) through the biokit R bridge.

    .. deprecated:: since v0.12
    """
    def __init__(self, filename="djvIC50v17v002-nowWithRMSE.rdata"):
        """.. rubric:: Constructor

        :param filename: RData file holding the dfAUCv17, dfIC50v17,
            dfResv17 and dfCL matrices.
        """
        super(Extra, self).__init__(filename)
        print("Deprecated since v0.12")
        # Remove R dependencies
        from biokit.rtools import RSession
        self.session = RSession()
        self.session.run('load("%s")' % self._filename)

        # 3 identical matrices containing AUC, IC50 and
        self.dfAUCv17 = self._read_matrix_from_r('dfAUCv17')
        self.dfIC50v17 = self._read_matrix_from_r('dfIC50v17')
        # Residual
        self.dfResv17 = self._read_matrix_from_r('dfResv17')

        # This df holds the xmid/scale parameters for each cell line
        # Can be visualised using the tools.Logistic class.
        self.dfCL = self._read_matrix_from_r('dfCL')

        # There is an extra matrix called MoBEM, which is the same as in the
        # file

    def hist_residuals(self, bins=100):
        """Plot residuals across all drugs and cell lines"""
        # Keep only non-zero (measured) residuals.
        data = [x for x in self.dfResv17.fillna(0).values.flatten() if x != 0]
        pylab.clf()
        # "normed" was removed in matplotlib >= 3.1; density is the
        # equivalent keyword.
        pylab.hist(data, bins=bins, density=True)
        pylab.grid(True)
        pylab.xlabel('Residuals')
        pylab.ylabel(r'\#')

    def scatter(self):
        """Scatter plot (with histograms) of the xmid/scale parameters."""
        from biokit.viz import scatter
        s = scatter.ScatterHist(self.dfCL)
        s.plot(kargs_histx={'color': 'red', 'bins': 20},
               kargs_scatter={'alpha': 0.9, 's': 100, 'c': 'b'},
               kargs_histy={'color': 'red', 'bins': 20})

    def hist_ic50(self, bins=100):
        """Histogram of the measured (non-zero) IC50s."""
        data = [x for x in self.dfIC50v17.fillna(0).values.flatten() if x != 0]
        pylab.clf()
        pylab.hist(data, bins=bins, density=True)
        pylab.grid(True)
        pylab.xlabel('IC50')
        pylab.ylabel(r'\#')

    def hist_auc(self, bins=100):
        """Histogram of the measured (non-zero) AUCs."""
        data = [x for x in self.dfAUCv17.fillna(0).values.flatten() if x != 0]
        pylab.clf()
        pylab.hist(data, bins=bins, density=True)
        pylab.grid(True)
        pylab.xlabel('AUC')
        pylab.ylabel(r'\#')
[docs]class DrugDecode(Reader):
"""Reads a "drug decode" file
The format must be comma-separated file. There are 3 compulsary columns
called DRUG_ID, DRUG_NAME and DRUG_TARGET. Here is an example::
DRUG_ID ,DRUG_NAME ,DRUG_TARGET
999 ,Erlotinib ,EGFR
1039 ,SL 0101-1 ,"RSK, AURKB, PIM3"
TSV file may also work out of the box. If a column name called
'PUTATIVE_TARGET' is found, it is renamed 'DRUG_TARGET' to be compatible with
earlier formats.
In addition, 3 extra columns may be provided::
- PUBCHEM_ID
- WEBRELEASE
- OWNED_BY
The OWNED_BY and WEBRELEASE may be required to create packages for each
company. If those columns are not provided, the internal dataframe is
filled with None.
Note that older version of identifiers such as::
Drug_950_IC50
are transformed as proper ID that is (in this case), just the number::
950
Then, the data is accessible as a dataframe, the index being the
DRUG_ID column::
data = DrugDecode('DRUG_DECODE.csv')
data.df.iloc[999]
.. note:: the DRUG_ID column must be made of integer
"""
def __init__(self, filename=None):
""".. rubric:: Constructor"""
super(DrugDecode, self).__init__(filename)
self.header = ['DRUG_ID', 'DRUG_NAME', 'DRUG_TARGET', 'OWNED_BY',
'WEBRELEASE']
self.header_extra = ["PUBCHEM_ID", "CHEMBL_ID", "CHEMSPIDER_ID"]
try:
# if the input data is already a DrugDecode instance, this should
# fail since the expected df will not have the DRUG_ID field, that
# should be the index
self._interpret()
except:
pass
self.df = self.df[sorted(self.df.columns)]
def _interpret(self, filename=None):
N = len(self.df)
if N == 0:
return
self.df.rename(columns={
'PUTATIVE_TARGET': 'DRUG_TARGET',
'THERAPEUTIC_TARGET': 'DRUG_TARGET'},
inplace=True)
for column in ["WEBRELEASE", "OWNED_BY"] + self.header_extra:
if column not in self.df.columns:
self.df[column] = [np.nan] * N
#for this in self.header[1:]:
for this in self.header:
msg = " The column %s was not found and may be an issue later on."
if this not in self.df.columns and this != self.df.index.name:
logger.warning(msg % this )
# Finally, set the drug ids as the index.
try:
self.df.set_index('DRUG_ID', inplace=True)
except:
# could be done already
pass
self.df.index = [drug_name_to_int(x) for x in self.df.index]
self.df.index = self.df.index.astype(int)
self.df.index.name = "DRUG_ID"
# sort the columns
try:
self.df.sort_index(inplace=True)
except:
self.df = self.df.iloc[sorted(self.df.index)]
self._check_uniqueness(self.df.index)
def _get_names(self):
return list(self.df.DRUG_NAME.values)
drug_names = property(_get_names)
def _get_target(self):
return list(self.df.DRUG_TARGET.values)
drug_targets = property(_get_target)
def _get_drug_ids(self):
    """Return the index (DRUG_ID values) as a plain list."""
    return [identifier for identifier in self.df.index]
drugIds = property(_get_drug_ids,
        doc="return list of drug identifiers")
def _get_row(self, drug_id, colname):
    """Look up *colname* for a drug identifier.

    Accepts plain identifiers as well as legacy names such as
    Drug_950_IC50 (number after the first underscore) or 950_0.33
    (number before the first underscore). Returns None when the
    identifier cannot be resolved.
    """
    if drug_id in self.df.index:
        return self.df.loc[drug_id][colname]

    as_text = str(drug_id)
    if as_text.startswith("Drug_"):
        candidate = as_text.split("_")[1]
    elif "_" in as_text:
        candidate = as_text.split("_")[0]
    else:
        return None

    try:
        resolved = int(candidate)
    except (ValueError, TypeError):
        print("DRUG ID %s not recognised" % drug_id)
        return None

    if resolved in self.df.index:
        return self.df[colname].loc[resolved]
    # unresolved identifiers fall through to an implicit None,
    # matching the original behaviour
def get_name(self, drug_id):
    """Return the DRUG_NAME of a given drug identifier (or None)."""
    # note: removed stray "[docs]" Sphinx-HTML artifact that made the
    # original line syntactically invalid
    return self._get_row(drug_id, 'DRUG_NAME')
def get_target(self, drug_id):
    """Return the DRUG_TARGET of a given drug identifier (or None)."""
    # note: removed stray "[docs]" Sphinx-HTML artifact that made the
    # original line syntactically invalid
    return self._get_row(drug_id, 'DRUG_TARGET')
def is_public(self, drug_id):
    """Return the WEBRELEASE flag of a given drug identifier (or None)."""
    # note: removed stray "[docs]" Sphinx-HTML artifact that made the
    # original line syntactically invalid
    return self._get_row(drug_id, 'WEBRELEASE')
def check(self):
    """Check that all drug identifiers are numeric.

    :raises TypeError: if any drug identifier is not a numeric value.

    Note: removed stray "[docs]" Sphinx-HTML artifact that made the
    original def line syntactically invalid.
    """
    for x in self.drugIds:
        try:
            # adding 1 fails with TypeError for non-numeric identifiers
            x += 1
        except TypeError as err:
            print("drug identifiers must be numeric values")
            raise err
    # it may happen that a drug has no target in the database ! so we
    # cannot check that for the moment:
    #if self.df.isnull().sum().sum()>0:
    #    print(d.df.isnull().sum())
    #    raise ValueError("all values must be non-na. check tabulation")
def get_info(self):
    """Return a summary dict: total drugs, public and proprietary counts.

    Note that WEBRELEASE holds 4 cases: Y, N, U (unknown?) and NaN;
    everything different from 'Y' is counted as proprietary.
    (Removed stray "[docs]" Sphinx-HTML artifact from the def line.)
    """
    dd = {'N': len(self),
          'N_public': sum(self.df.WEBRELEASE == 'Y'),
          'N_prop': sum(self.df.WEBRELEASE != 'Y')}
    return dd
def __len__(self):
    # the number of drugs equals the number of rows in the dataframe
    return self.df.shape[0]
def __str__(self):
    """Short human-readable summary (drug count)."""
    return "Number of drugs: %s\n" % len(self.df)
def __repr__(self):
    """Summary plus the number of companies, when any are known."""
    text = str(self)
    n_companies = len(self.companies)
    if n_companies:
        text += "Contains %s companies" % n_companies
    return text
def _get_companies(self):
    """Return the sorted unique, non-null OWNED_BY entries."""
    if 'OWNED_BY' not in self.df.columns:
        return []
    return sorted(self.df.OWNED_BY.dropna().unique())
companies = property(_get_companies)
def drug_annotations(self, df):
    """Populate the DRUG_NAME and DRUG_TARGET columns if possible

    :param df: input dataframe as given by e.g., :meth:`anova_one_drug`;
        must contain a DRUG_ID column
    :return df: same as input but with DRUG_NAME and DRUG_TARGET populated
    :raises ValueError: if the DRUG_ID column is missing

    (The original docstring wrongly said the FDR column is populated;
    removed stray "[docs]" Sphinx-HTML artifact from the def line.)
    """
    if len(self.df) == 0:
        # Nothing done. DrugDecode is empty.
        return df

    if 'DRUG_ID' not in df.columns:
        raise ValueError('Expected column named DRUG_ID but not found')

    df['DRUG_NAME'] = [self.get_name(x) for x in df.DRUG_ID.values]
    df['DRUG_TARGET'] = [self.get_target(x) for x in df.DRUG_ID.values]
    return df
def __add__(self, other):
    """
    Fill missing values but do not overwrite existing fields even though
    the field in the other DrugDecode instance is different.

    :param other: another :class:`DrugDecode` instance
    :return: a new :class:`DrugDecode` merging both instances; on a
        conflict the value from *self* is kept and a warning is printed.
    """
    # Probably not efficient but will do for now
    columns = list(self.df.columns)
    dd = DrugDecode()
    dd.df = self.df.copy()

    # add missing entries. `missing` holds index *labels*, so the
    # label-based .loc is required (the original .iloc was a bug:
    # it selected by position). pd.concat replaces DataFrame.append,
    # which was removed in pandas 2.x.
    missing = [x for x in other.df.index if x not in self.df.index]
    dd.df = pd.concat([dd.df, other.df.loc[missing]])

    # merge existing ones
    for index, ts in other.df.iterrows():
        if index not in self.df.index:
            continue
        # found in both instances: they may contain complementary data,
        # which could have been done with pandas.merge, but we wish to
        # check for incompatible data explicitly.
        for column in columns:
            a = dd.df.loc[index, column]
            b = ts[column]
            if pd.isnull(b):
                # nothing to do if b is NULL
                pass
            elif pd.isnull(a):
                # we can merge the content of b into a, that is the
                # content of other into this instance
                dd.df.loc[index, column] = b
            elif a != b:
                # a and b are both set and disagree: keep a, warn
                print('WARNING: different fields in drug %s (%s %s %s)' % (index, column, a, b))
    return dd
def __eq__(self, other):
    """Return True if both instances hold the same data (NaNs compare equal)."""
    try:
        # The original `all(df1 == df2)` iterated the boolean frame's
        # *column labels* (non-empty strings, always truthy), so any
        # equal-shaped frames compared equal regardless of their values.
        # .all().all() reduces over the actual boolean values.
        return bool((self.df.fillna(0) == other.df.fillna(0)).all().all())
    except Exception:
        # mismatching shapes/columns or missing .df attribute
        return False
def get_public_and_one_company(self, company):
    """Return drugs that belong to a specific company and public drugs

    :param company: company name to match against the OWNED_BY column
    :return: a new :class:`DrugDecode` instance

    (Removed stray "[docs]" Sphinx-HTML artifact from the def line.)
    """
    # @company references the local variable inside query(); the
    # original %-interpolation broke on names containing quotes
    drug_decode_company = self.df.query(
        "WEBRELEASE=='Y' or OWNED_BY==@company")
    # Transform into a proper DrugDecode class for safety
    return DrugDecode(drug_decode_company)