The data were downloaded from the NCCT website into the directory DAT_DIR.
%load_ext autoreload
%autoreload 2
%load_ext sql
import os
import time
import numpy as np
import matplotlib.text as text
import scipy.interpolate as interp
import pandas as pd
#from mp.txpepa import *
#from bio.data.toxplorer import *
#import bio.hts.apredica as apr
#from bio.hts.htsdb import *
#from bio.data.toxplorer import *
#import viz.clust as cv
#from chem.clust import *
from sklearn import (manifold, datasets, decomposition, ensemble, lda,
random_projection)
from sklearn.metrics.pairwise import euclidean_distances,manhattan_distances
import statsmodels.api as sm
import numpy.linalg as LA
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector
stats = importr('stats')
from sklearn.neighbors import KNeighborsClassifier
from genra.readacross import *
import mongoengine as mng  # assumed: `mng` refers to mongoengine, used to register the MongoDB connections below
mng.register_connection("hts-db","htsdb",username="ishah",
                        password="xxx",host='localhost')
mng.register_connection("txp-db","toxplorerdb",username="ishah",
                        password="xxx",host='localhost')
#%sql postgresql://ishah:xxx@localhost/chemicals
#CD = ChemDrawing()
DAT_DIR = '/share/home/ishah/projects/Chem/data/tables/'
PKL_DIR = '/share/home/ishah/projects/Chem/data/pickle/'
import pickle
tmstmp = time.strftime("%m-%d-%Y",time.localtime())
import zipfile
DAT_DIR = '/share/home/ishah/projects/ToxCast/data/TX14/'
FD = {}
# Map each filename inside the downloaded zip archives to the archive that contains it
for F in os.listdir(DAT_DIR):
    print '\n>',F,'\n'
    ZF1 = zipfile.ZipFile(DAT_DIR+F,'r')
    print "\n\t".join([i.filename for i in ZF1.filelist])
    for i in ZF1.filelist:
        fn = i.filename
        #if fn.find('/')>-1:
        #    fn = fn.split('/')[-1]
        #if not fn: continue
        FD[fn]=F
def loadTXdata(df,fd=FD,read_fn=pd.read_csv,dat_dir=DAT_DIR):
    # Look up which archive holds file `df`, extract it, and parse it with `read_fn`
    zf = fd[df]
    ZF1 = zipfile.ZipFile(dat_dir+zf,'r')
    d = ZF1.extract(df)
    return read_fn(d)
#Assay
A0 = loadTXdata( 'Assay_Summary_141121.csv')
A0.set_index(['aenm','aeid','acid','assay_source_name'],inplace=True)
#Chemicals
C0 = loadTXdata('TOX21S_v4b_8599_23Oct2014.xlsx',read_fn=pd.read_excel)
C0['TS_CASRN']=C0.TS_CASRN.apply(lambda x: x.replace("'",""))
C0['ID'] = C0.TS_CASRN.apply(lambda x: 'C'+x.replace('-',''))
C1 = C0[['ID','DSSTox_GSID','TS_CASRN','TS_ChemName']]
C1 = C1.rename(columns={'TS_CASRN':'chemical_casrn','TS_ChemName':'chemical_name'})
C0.set_index('ID',inplace=True)
#Bioactivity
B0= loadTXdata('AllResults_modl_ga_Matrix_141121.csv')
B0.rename(columns=({'Unnamed: 0':'ID'}),inplace=True)
B0.set_index('ID',inplace=True)
B1= loadTXdata('AllResults_modl_la_Matrix_141121.csv')
B1.rename(columns=({'Unnamed: 0':'ID'}),inplace=True)
B1.set_index('ID',inplace=True)
# What was tested
Bt = loadTXdata('AllResults_tested_Matrix_141121.csv')
Bt.rename(columns=({'Unnamed: 0':'ID'}),inplace=True)
Bt.set_index('ID',inplace=True)
# Everything that is null is inactive - replace nulls with a very high concentration
B0[B0.isnull()]=6
B1[B1.isnull()]=6
# Everything that was not tested is set back to null
B0[Bt==0]=None
B1[Bt==0]=None
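As a sanity check, here is a minimal sketch of the fill-then-mask step above on toy data; b stands in for a potency matrix and t for the tested-indicator matrix (toy values only).
import numpy as np
import pandas as pd

b = pd.DataFrame([[4.5, np.nan],
                  [np.nan, 5.0]], columns=['assay1', 'assay2'])   # toy potencies
t = pd.DataFrame([[1, 1],
                  [0, 1]], columns=['assay1', 'assay2'])          # 1 = tested, 0 = not tested

b[b.isnull()] = 6     # tested but no hit -> inactive ceiling of 6
b[t == 0] = None      # never tested -> back to missing
# row 0: [4.5, 6.0]; row 1: [NaN, 5.0]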
B0 = pd.merge(C1,B0.reset_index(),left_on='ID',right_on='ID')
B0.set_index(['ID','DSSTox_GSID','chemical_casrn','chemical_name'],inplace=True)
# Tox21 data
T21= loadTXdata('ToxCast_Tox21_Level5&6_20141022.csv')
T21.rename(columns=dict(spid='sample_id',casn='casrn',chnm='chemical_name',code='ID',aenm='assay_name'),inplace=True)
T21b = pd.pivot_table(T21,index=['sample_id','ID','casrn','chemical_name'],
columns='assay_name',values='hitc')
T21p = pd.pivot_table(T21,index=['sample_id','ID','casrn','chemical_name'],
columns='assay_name',values='hill_ga')
pickle.dump([A0,C0,C1,B1],file(PKL_DIR+'chm-bio-'+tmstmp+'.pkl','w'))
# Toxicity
T1 = loadTXdata('toxrefdb/toxrefdb_study_tg_effect_endpoint_AUG2014_FOR_PUBLIC_RELEASE.csv')
T1.drop('Unnamed: 0',axis=1,inplace=True)
# Create ID
T1['ID']=T1.chemical_casrn.apply(lambda x: 'C'+x.replace('/','-').replace('-',''))
# DSSTox_GSID
T1['DSSTox_GSID'] = T1.chemical_id.apply(lambda x: x.split('_')[-1] if 'CAS' not in x else None)  # inline equivalent of the project-specific ifthen() helper
T1.shape
T11 = T1.set_index('ID')   # indexed view of T1 used for the per-chemical pivot further below
Ph1 = T1[(T1['data_source']=='opp_der')].ID.unique()
len(Ph1)
T1.columns
T1[['chemical_id','chemical_name','study_id','source_study_numeric_id', u'citation']].ix[:50]
# Effect -> lesion type: map each effect name to its lesion category, read from tags of the form 'les_cat:<category>'
# (Entity is presumably a MongoDB document class from the project modules commented out in the imports above)
Les_Cat = dict(( (E.name.lower(),[t.replace('les_cat:','').lower() for t in E.tags][0])
                 for E in Entity.objects(tags__istartswith='les_cat:',name__exists=1) ))
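For illustration, a minimal sketch of the tag convention assumed by the dictionary above, using hypothetical records in place of the Entity documents.
# Hypothetical Entity-like records: each has a name and a 'les_cat:<category>' tag.
toy_entities = [
    {'name': 'Hepatocellular carcinoma', 'tags': ['les_cat:neoplasia']},
    {'name': 'Hypertrophy',              'tags': ['les_cat:non-neoplastic']},
]
les_cat_toy = dict((e['name'].lower(),
                    [t.replace('les_cat:', '').lower() for t in e['tags']][0])
                   for e in toy_entities)
# les_cat_toy == {'hepatocellular carcinoma': 'neoplasia', 'hypertrophy': 'non-neoplastic'}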
#len(T1.effect_desc.unique())
#Les_Cat.items()[:20]
# Add to dict
Les_Cat['abnormal lobation']='other'
Les_Cat['leukemia lymphocytic']='neoplasia'
Les_Cat['carcinoma nos'] = 'neoplasia'
Les_Cat['mixed tumor malignant'] = 'neoplasia'
T1.effect_desc[pd.isnull(T1.effect_desc)]=''
T1['les_cat']=T1.effect_desc.apply(lambda x: Les_Cat.get(x.lower()))
[i.lower() for i in T1.study_type.unique()]
CAS_rn = Chemical.objects(tags='css_rn').distinct('casrn')   # Chemical is presumably a MongoDB document class from the project modules commented out above
len(CAS_rn)
CAS_rn[:10]
len(set(T1.chemical_casrn.unique()).intersection(CAS_rn))
I1 = T1.chemical_casrn.apply(lambda i: i in CAS_rn)
I2 = T1.study_type.apply(lambda i: i.lower() in ['chr', 'sub', 'acu'] )
I3 = T1.species.apply(lambda i: i.lower()=='rat')
I4 = np.logical_and(np.logical_and(I1,I2), I3)
sum(I4)
TS1= T1.ix[T1.index[I4],
['DSSTox_GSID','chemical_id', u'chemical_casrn', u'chemical_name',
'study_type', u'species', u'strain','admin_method', u'admin_route',
'dose', u'toxrefdb_tg_dose_unit', u'duration', u'duration_unit','loael']
].drop_duplicates()
TS1.shape
I5 = np.logical_and(I4,T1.chemical_casrn=='4151-50-2')
T1.ix[T1.index[I5],['DSSTox_GSID','chemical_id', u'chemical_casrn', u'chemical_name',
'study_type', u'species', u'strain','admin_method', u'admin_route',
'dose', u'toxrefdb_tg_dose_unit', u'duration', u'duration_unit']]
I5 = T1.chemical_casrn=='4151-50-2'
T1.ix[T1.index[I5],['DSSTox_GSID','chemical_id', u'chemical_casrn', u'chemical_name',
'study_type', u'species', u'strain','admin_method', u'admin_route',
'dose', u'toxrefdb_tg_dose_unit', u'duration', u'duration_unit']]
TS11 = pd.pivot_table(TS1,
index=['DSSTox_GSID','chemical_id', u'chemical_casrn', u'chemical_name',
'study_type','dose','duration','duration_unit'],
columns='toxrefdb_tg_dose_unit',
values='admin_route',
aggfunc=len)
TS11.shape
xl = pd.ExcelWriter('/share/home/ishah/tmp/toxref-rat-css-treatment-doses-v2.xlsx')
TS11.to_excel(xl,sheet_name='view')
TS11.reset_index().to_excel(xl,sheet_name='data')
xl.close()
T1.loael.unique()
# Figure out the maximum treatment concentration for each study type - this is the treatment
# concentration up to which no effect was observed
TF = [['study_type']
]
# If a chemical has an effect in a study then all other specific effects that are NA will be set to zero
from ml.mlearn import concat_df
def mk_str(x):
    if type(x)==tuple:
        return '_'.join([i.lower().replace('-','_').replace(' ','_') for i in x])
    elif type(x)==str:
        return x.lower().replace('-','_').replace(' ','_')
    else:
        return x
T_mt = pd.DataFrame()
for c_i in TF:
    T_i = pd.pivot_table(T1,index=['ID','DSSTox_GSID','chemical_name'],
                         columns=c_i,
                         values='dose',
                         aggfunc=np.min)
    #T_i.columns = mk_str([mk_str(jj) for jj in T_i.columns])
    if T_mt.shape[0]>0:
        T_mt = pd.merge(T_mt,T_i,how='outer',left_index=True,right_index=True)
    else:
        T_mt = T_i
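To make the pivot concrete, a toy sketch of what aggfunc=np.min produces here: the smallest value of the dose column per chemical and study type (hypothetical doses).
import numpy as np
import pandas as pd

# Hypothetical rows in the toxrefdb-style long table: one row per treatment-group effect.
t_toy = pd.DataFrame({'ID':         ['C1', 'C1', 'C1', 'C2'],
                      'study_type': ['chr', 'chr', 'sub', 'chr'],
                      'dose':       [10.0, 100.0, 30.0, 5.0]})
t_min = pd.pivot_table(t_toy, index='ID', columns='study_type',
                       values='dose', aggfunc=np.min)
# study_type   chr   sub
# C1          10.0  30.0
# C2           5.0   NaN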
#I = [i for i in T2.columns if re.search('chr.+liver',i,re.I)]
#I
# Toxicity -> factors
TF = [#['study_type'],
#['species','study_type','effect_target'],
# ['study_type','effect_target'],
['study_type','species','effect_target','les_cat'],
#['study_type','species','effect_target'],
#['study_type','species','effect_target','les_cat']
]
# If a chemical has an effect in a study then all other specific effects that are NA will be set to zero
T2 = pd.DataFrame()
for c_i in TF:
    T_i = pd.pivot_table(T1,index=['ID','DSSTox_GSID','chemical_name'],
                         columns=c_i,
                         values='dose',
                         aggfunc=np.min)
    #T_i.columns = mk_str([mk_str(jj) for jj in T_i.columns])
    if T2.shape[0]>0:
        T2 = pd.merge(T2,T_i,how='outer',left_index=True,right_index=True)
    else:
        T2 = T_i
#I = [i for i in T2.columns if re.search('chr.+liver',i,re.I)]
#I
Toxic effects that are not significant are not reported, which produces a great deal of missing data, so we need an approach that differentiates unknown effects from effects that were tested but not significant. We assume that if a particular guideline study was conducted for a chemical but an effect was not reported, then that chemical is negative for that effect in that study type. However, this ignores the influence of species and sex.
for study in set([i[0] for i in T2.columns]):
    Yij = T2[(study)]
    I = Yij.isnull()
    # chemicals with at least one reported effect for this study type
    I1 = I.sum(axis=1) < Yij.shape[1]
    # for those chemicals, unreported effects are assumed negative
    I2 = I.apply(lambda y: y & I1)
    T2[(study)][I2] = 0
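A toy illustration of this assumption: a chemical with any reported effect for a study type gets zeros for its other, unreported effects, while chemicals with no reported effects at all stay missing (hypothetical effect columns).
import numpy as np
import pandas as pd

# Rows = chemicals, columns = effects within one study type (toy values).
y_toy = pd.DataFrame({'liver':  [10.0, np.nan, np.nan],
                      'kidney': [np.nan, 25.0, np.nan]},
                     index=['chemA', 'chemB', 'chemC'])
tested = y_toy.isnull().sum(axis=1) < y_toy.shape[1]   # chemA, chemB True; chemC False
y_toy[y_toy.isnull().apply(lambda col: col & tested)] = 0
# chemA/chemB: missing effects -> 0; chemC: still all NaN (never studied)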
T2.shape
I = [i for i in T2.columns if i[1]=='rat']
Rat_liver=T2[I]
pickle.dump(Rat_liver,file(PKL_DIR+'rat-liver-effects-'+tmstmp+'.pkl','w'))
pickle.dump([T2,T_mt],file(PKL_DIR+'tox-'+tmstmp+'.pkl','w'))
PKL_DIR
CID=[u'C335762',
u'C307244',
u'C375951',
u'C1763231',
u'C335671',
u'C4151502',
u'C375859',
u'C2795393',
u'C29420493',
u'C3825261',
u'C3871996',
u'C754916',
u'C2058948']
#C0.STRUCTURE_MW[CID]
C0.ix[CID].T
Y1 = T2.ix[CID]
#Y1[Y1.notnull()]
I=Y1.notnull().sum()>1
Y1 = T2.ix[CID,I]
#Y1.apply(lambda x: Y1.columns[x==x.min()][0],axis=0)
Y1.T
I = np.logical_not(np.all(np.logical_or(Y1.isnull(),Y1==0),axis=0))
Y1.ix[:,I].T
for study in set([i[0] for i in Y1.columns]):
    Yij = Y1[(study)]
    I = Yij.isnull()
    I1 = I.sum(axis=1) < Yij.shape[1]
    I2 = I.apply(lambda y: y & I1)
    Y1[(study)][I2] = 0
# Scale doses by molecular weight (dose/(1000*MW)) and take log10; CID selects the MW entries for these chemicals
X1 = np.log10(Y1.apply(lambda x: x/(1000*C0.STRUCTURE_MW[CID]),axis=0)).T
X1[np.isinf(X1)]=0
X1
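For reference, a sketch of the unit conversion implied by dose/(1000*MW), assuming doses in mg/kg and STRUCTURE_MW in g/mol; the units are not stated explicitly in the source.
import numpy as np

dose_mg_per_kg = 50.0      # hypothetical administered dose
mw_g_per_mol = 250.0       # hypothetical molecular weight
# mg/kg divided by g/mol gives mmol/kg; the extra factor of 1000 gives mol/kg
dose_mol_per_kg = dose_mg_per_kg / (1000.0 * mw_g_per_mol)
log_dose = np.log10(dose_mol_per_kg)   # ~ -3.7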
Liv = [i for i in T2.columns if i[2].startswith('Liver')]   # effect_target is the third element of the column tuple
T2.ix[:20,Liv]
pd.pivot_table(T11.ix['C101200480'],columns=['study_type','species'],index='effect_target',values='dose')
pickle.dump([T2,T_mt],file(PKL_DIR+'tox-'+tmstmp+'.pkl','w'))
# Chemical fingerprints
from rdkit import Chem
from rdkit.DataStructs import *
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem import AllChem
from rdkit.Chem.SaltRemover import SaltRemover
df = 'TOX21S_v4b_CID_structures.sdf'
zf = FD[df]
ZF1 = zipfile.ZipFile(DAT_DIR+zf,'r')
d = ZF1.extract(df)
suppl = Chem.SDMolSupplier(d)
# Map DSSTox_CID to ID
CID2ID = dict(zip(C0.DSSTox_CID,C0.index))
MOLS = {}
# Keep only molecules whose DSSTox_CID maps to a chemical ID in C0
for m in suppl:
    if not m: continue
    if 'DSSTox_CID' not in m.GetPropNames():
        continue
    k = CID2ID.get(int(m.GetProp('DSSTox_CID')))
    if not k: continue
    MOLS[k] = m
from rdkit.Chem import MACCSkeys
FP1 = pd.DataFrame([np.array(AllChem.GetMorganFingerprintAsBitVect(i,3,1024)) for i in MOLS.values()])
FP1.index=MOLS.keys()
FP1.columns = ['mrgn_%d'%i for i in FP1.columns]
FP2 = pd.DataFrame([np.array(MACCSkeys.FingerprintMol(i)) for i in MOLS.values()])
FP2.index=MOLS.keys()
FP2.columns = ['mccs_%d'%i for i in FP2.columns]
FP3 = pd.DataFrame([np.array(AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(i)) for i in MOLS.values()])
FP3.index=MOLS.keys()
FP3.columns = ['tptr_%d'%i for i in FP3.columns]
FP0 = pd.merge(FP1,FP2,left_index=True,right_index=True)
FP0 = pd.merge(FP0,FP3,left_index=True,right_index=True)
FP0.index.names=['ID']
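Since these fingerprints feed the read-across workflow (see the genra.readacross import above), a minimal sketch of how pairwise chemical similarity could be computed from the Morgan bits; the Jaccard/Tanimoto choice here is an assumption, not prescribed by the source.
from scipy.spatial.distance import pdist, squareform
import pandas as pd

# Tanimoto (Jaccard) similarity between chemicals from the binary Morgan bits;
# FP1 rows are 0/1 vectors indexed by chemical ID.
D = squareform(pdist(FP1.values.astype(bool), metric='jaccard'))   # Jaccard distance
S = pd.DataFrame(1.0 - D, index=FP1.index, columns=FP1.index)      # similarity
# S.ix['C4151502'].order(ascending=False)[:10] would list the nearest analogues (old pandas API)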
#pickle.dump([C0,FP0,FP1,FP2,FP3],file(PKL_DIR+'chm-'+tmstmp+'.pkl','w'))
pickle.dump(MOLS,file(PKL_DIR+'mols-'+tmstmp+'.pkl','w'))
DAT_DIR = '/share/home/ishah/projects/Chem/data/tables/'
if False:
    T2.to_csv(DAT_DIR+'tox-v1.csv')
    FP0.to_csv(DAT_DIR+'chmfp-v1.csv')
    B1.to_csv(DAT_DIR+'bio-v1.csv')
    T_mt.to_csv(DAT_DIR+'tox-max-trt-v1.csv')
    W = pd.ExcelWriter(DAT_DIR+'chm-v1.xlsx')
    C1.to_excel(W,sheet_name='All')
    W.save()
[i for i in T2.ix[:10,:10].columns]
T3 = T2.copy()
T3.columns = [i[0].lower() +'_'+i[1].lower().replace(' ','_') for i in T3.columns]
Tox = T3.columns   # flattened toxicity column names (strings) used for the BCTb lookups below
Bio=B0.columns
Chm=FP0.columns
X0 = B0.copy()
X0[X0<6]=1
X0[X0==6]=0
BCb = pd.merge(X0.reset_index(),FP0.reset_index(),how='outer',left_on='ID',right_on='ID')
BCTb= pd.merge(BCb,T3.reset_index().drop(['DSSTox_GSID','chemical_name'],axis=1),how='outer',
left_on='ID',right_on='ID')
BCTb= BCTb.set_index(['ID']).drop(['chemical_name','chemical_casrn','DSSTox_GSID'],axis=1)
BCb = BCb.set_index(['ID']).drop(['chemical_name','chemical_casrn','DSSTox_GSID'],axis=1)
print 'All',BCTb.shape
print 'Bio & Chm',BCb.shape
BCTb.ix[CID,Tox[-10:]]
X0 = B0.copy()
X0[X0<6]=1
X0[X0==6]=0
BCc = pd.merge(X0.reset_index(),FP0.reset_index(),how='inner',left_on='ID',right_on='ID')
BCTc= pd.merge(BCc,T3.reset_index().drop(['DSSTox_GSID','chemical_name'],axis=1),how='inner',
left_on='ID',right_on='ID')
BCc = BCc.set_index('ID').drop(['DSSTox_GSID','chemical_casrn','chemical_name'],axis=1)
BCTc= BCTc.set_index('ID').drop(['DSSTox_GSID','chemical_casrn','chemical_name'],axis=1)
BCc.shape,BCTc.shape
tmstmp
pickle.dump([BCc,BCTc,BCTb,Bio,Chm,Tox],file(PKL_DIR+'tx-tr-ch-'+tmstmp+'.pkl','w'))
pickle.dump(BCTb,file(PKL_DIR+'BCTb-'+tmstmp+'.pkl','w'))
BCTb.to_csv(DAT_DIR+'BCTb-'+tmstmp+'.csv')
os.listdir(DAT_DIR)
print "\n".join(os.listdir(PKL_DIR))
[BCc,BCTc,Bio,Chm,Tox] = pickle.load(file(PKL_DIR+'tx-tr-ch-02-12-2015.pkl','r'))
[A0,C0,C1,B1] = pickle.load(file(PKL_DIR+'chm-bio-02-12-2015.pkl','r'))
[T2,T_mt] = pickle.load(file(PKL_DIR+'tox-02-12-2015.pkl','r'))