%load_ext autoreload
%autoreload 2
%load_ext sql
%pylab inline
%matplotlib inline
import matplotlib.text as text
from IPython.html.widgets import FloatProgress
from IPython.display import display
import scipy.interpolate as interp
import pandas as pd
#from mp.txpepa import *
from bio.data.toxplorer import *
import bio.hts.apredica as apr
from bio.hts.htsdb import *
from bio.data.toxplorer import *
import viz.clust as cv
from chem.clust import *
from sklearn import (manifold, datasets, decomposition, ensemble, lda,
random_projection)
from sklearn.metrics.pairwise import euclidean_distances,manhattan_distances
import statsmodels.api as sm
import numpy.linalg as LA
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector
stats = importr('stats')
from sklearn.neighbors import KNeighborsClassifier
from genra.readacross import *
mng.register_connection("hts-db","htsdb",username="ishah",
password="xxx",host='localhost')
mng.register_connection("txp-db","toxplorerdb",username="ishah",
password="xxx",host='localhost')
%sql postgresql://ishah:xxx@localhost/chemicals
DAT_DIR = '/share/home/ishah/projects/Chem/data/tables/'
PKL_DIR = '/share/home/ishah/projects/Chem/data/pickle/'
RES_DIR='/share/home/ishah/projects/Chem/data/results/'
FIG_DIR='/share/home/ishah/projects/Chem/figs/readacross/'
import pickle
tmstmp = time.strftime("%m-%d-%Y",time.localtime())
# Start the parallel machine
from ml.mlearn import *
from ml.readacross import *
import IPython.parallel as PP
%reload_ext autoreload
%autoreload 2
lb_view=None
d_view =None
def initParallel(Data=None):
RC = PP.Client(profile='galaxy_parallel')
global lb_view
global d_view
d_view = RC[:]
d_view.block = True
lb_view = RC.load_balanced_view()
lb_view.block = True
d_view.execute("""
%load_ext autoreload
%autoreload 2
import ml.mlearn as ml
from ml.mlearn import *
from ml.readacross import *
from chem.clust import *
""")
if Data:
d_view.push(Data)
print "\n".join([i for i in os.listdir(PKL_DIR) if i.startswith('tx-tr')])
[BCc,BCTc,BCTb,Bio,Chm,Tox] = pd.read_pickle(PKL_DIR+'tx-tr-ch-02-08-2016.pkl')
#print "\n".join(os.listdir(PKL_DIR))
[A0,C0,C1,B1] = pd.read_pickle(PKL_DIR+'chm-bio-02-12-2015.pkl')
MOLS = pd.read_pickle(PKL_DIR+'mols-02-12-2015.pkl')
[C2M,M2C,Cl_st,C2] = pd.read_pickle(PKL_DIR+'clust-02-12-2015.pkl')
C2 = C1.set_index('ID')
# Chemical fingerprints
from rdkit import Chem
from rdkit.DataStructs import *
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem import AllChem
FP1 = pd.DataFrame([np.array(AllChem.GetMorganFingerprintAsBitVect(i,3,1024)) for i in MOLS.values()])
FP1.index=MOLS.keys()
FP1.columns = ['mrgn_%d'%i for i in FP1.columns]
FP2 = pd.DataFrame([np.array(MACCSkeys.FingerprintMol(i)) for i in MOLS.values()])
FP2.index=MOLS.keys()
FP2.columns = ['mccs_%d'%i for i in FP2.columns]
FP3 = pd.DataFrame([np.array(AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(i)) for i in MOLS.values()])
FP3.index=MOLS.keys()
FP3.columns = ['tptr_%d'%i for i in FP3.columns]
FP0 = pd.merge(FP1,FP2,left_index=True,right_index=True)
FP0 = pd.merge(FP0,FP3,left_index=True,right_index=True)
FP0.index.names=['ID']
#Distance calculations
# Distance calculations
# make a binary rep
X = BCTb[Bio]
CID = X.index
X = X.fillna(0)
X[X!=0]=1
D_bio = pd.DataFrame(squareform(pdist(X,'jaccard')),
columns=CID,index=CID)
S_bio = 1-D_bio
S_bio.shape
X = BCTb[Chm]
CID = X.index
X = X.fillna(0)
X[X!=0]=1
D_chm = pd.DataFrame(squareform(pdist(X,'jaccard')),
columns=CID,index=CID)
S_chm = 1-D_chm
X = BCTb[Bio+Chm]
CID = X.index
X = X.fillna(0)
X[X!=0]=1
D_bc = pd.DataFrame(squareform(pdist(X,'jaccard')),
columns=CID,index=CID)
S_bc = 1-D_bc
S_bc.shape
ChmNm1 = C1[['ID','chemical_name']]
ChmNm1.set_index('ID',inplace=True)
STUDIES = ['mgr', 'chr', 'sac', 'sub', 'dev', 'rep', 'oth', 'acu', 'neu', 'dnt']
#pd.merge(C2.ix[CID],BCTb.ix[CID,STUDIES],left_index=True,right_index=True)
R= C1.ix[C1.chemical_name.apply(lambda i: i.lower().find("propiconazole")>-1)]
print R
C2M['C94361065']
len(Tox)
#BCTb0 = BCTb.copy()
BCTb[BCTb>0]=1
CD = ChemDrawing()
cl = 5
CID = M2C[cl]
T0 = BCTb.ix[CID,Tox]
Y = T0.apply(lambda y: np.sum(y==0)>1 and np.sum(y>0)>0,axis=0)
Tox1 = Y[Y].index
print "> CL ",cl,'n=',len(CID),'Tox=',len(Tox1),time.strftime("%H:%M",time.localtime())
sys.stdout.flush()
sys.stderr.flush()
X_cl=BCTb.ix[CID,Tox1]
S_chm_cl=S_chm.ix[CID,:]
S_bio_cl=S_bio.ix[CID,:]
S_bc_cl=S_bc.ix[CID,:]
PERF = []
for t0 in Tox1:
print ' ', t0
#P = ClActivityFromNN(t0,X_cl,S_chm_cl,sim_type='chm',ret='perf',
# k0=3,s0=0.2,perm=0,wt=True)
P = GenRAPerf(t0,X_cl,S_bio_cl,sim_type='bio',ret='perf',
k0=3,s0=0.2,perm=0,wt=True)
PERF.append(P)
PERF_df = pd.DataFrame(PERF)
PERF_df.t0_max.hist(bins=20)
d_view.execute("""
%load_ext autoreload
%autoreload 2
import ml.mlearn as ml
from ml.mlearn import *
from chem.ra.readacross import *
import chem.ra.readacross as genra
from chem.clust import *
""")
from chem.ra.readacross import *
import chem.ra.readacross as genra
from chem.ra.readacross import *
P_ALL = []
print "Starting GenRA Analysis to assign activity " + time.strftime("%d/%m/%Y %H:%M",time.localtime())
initParallel()
from IPython.html.widgets import FloatProgress
from IPython.display import display
N = len(M2C)
pc = int(N/100)
pbar = FloatProgress(min=0, max=100,description='GenRA Perf')
display(pbar)
i_cl =0
for cl,CID in M2C.iteritems():
i_cl+=1
if (i_cl % pc) == 0:
pbar.value += 1
if len(CID)<2: continue
#Tox1=STUDIES1[np.where(BCTb.ix[CID,STUDIES1].notnull().sum()>0)]
T0 = BCTb.ix[CID,Tox]
Y = T0.apply(lambda y: np.sum(y==0)>0 and np.sum(y>0)>0,axis=0)
Tox1 = Y[Y].index
if len(Tox1)<1: continue
print "> CL ",cl,'n=',len(CID),'Tox=',len(Tox1),time.strftime("%H:%M",time.localtime())
sys.stdout.flush()
sys.stderr.flush()
X_cl=BCTb.ix[CID,Tox1]
S_chm_cl=S_chm.ix[CID,:]
S_bio_cl=S_bio.ix[CID,:]
S_bc_cl=S_bc.ix[CID,:]
#k0 = len(CID)
print " Broadcasting data .."
d_view.push(dict(X_cl=X_cl,S_bio_cl=S_bio_cl,S_chm_cl=S_chm_cl,S_bc_cl=S_bc_cl,Tox1=Tox1))
SK0 = []
L = ifthen(len(CID)<10, len(CID), 10)
for ba in Tox1:
for s0 in np.linspace(1,0,num=11)[1:]:
for k0 in range(1,L):
SK0.append((ba,s0,k0))
print " Chm %d" % len(SK0)
P = lb_view.map(lambda (ba_i,s0_i,k0_i): GenRAPerf(ba_i,X_cl,S_chm_cl,sim_type='chm',ret='perf',
k0=k0_i,s0=s0_i,perm=50,wt=True),
SK0)
P_df = pd.DataFrame([i for i in P if len(i)])
P_df['cl']=cl
P_ALL = concat_df(P_ALL,P_df)
print " Bio"
P = lb_view.map(lambda (ba_i,s0_i,k0_i): GenRAPerf(ba_i,X_cl,S_bio_cl,sim_type='bio',ret='perf',
k0=k0_i,s0=s0_i,perm=50,wt=True),
SK0)
P_df = pd.DataFrame([i for i in P if len(i)])
P_df['cl']=cl
P_ALL = concat_df(P_ALL,P_df)
print " BC"
P = lb_view.map(lambda (ba_i,s0_i,k0_i): GenRAPerf(ba_i,X_cl,S_bc_cl,sim_type='bc',ret='perf',
k0=k0_i,s0=s0_i,perm=50,wt=True),
SK0)
P_df = pd.DataFrame([i for i in P if len(i)])
P_df['cl']=cl
P_ALL = concat_df(P_ALL,P_df)
P_ALL.to_csv(RES_DIR+ time.strftime("genra-pred-tox-cl-nn-%Y-%m-%d-%H%M.csv",time.localtime()),index=False)
pbar.value = 100
#send_email(txt="Done",subj="ActivityFromNN bio")
P_ALL = pd.read_csv(RES_DIR+'genra-pred-tox-cl-nn-2016-03-02-1309.csv')
P_ALL.t0_max.hist(bins=20)
N = dict(((i,len(v)) for i,v in M2C.iteritems()))
P_ALL['cl_n'] = P_ALL.cl.apply(lambda i: N[i])
P_ALL['n_chem']=P_ALL.n_sim_pos+P_ALL.n_sim_neg
P_ALL[(P_ALL['n_chem']>1)].shape
P_ALL1 = P_ALL.set_index(['cl','sim_type','effect','s0','k0','n_chem'])
# How many cluster analyzed ?
len(set([i[0] for i in P_ALL1.index]))
def mksig(p):
y = ''
if isfinite(p):
if p<=0.1:
y='*'
elif p<=0.01:
y='**'
return y
G=P_ALL1.reset_index().groupby(['cl','cl_n','sim_type','effect'])
PS0 = G.aggregate(dict(auc=np.max,auc_pval=np.min,ba_max=np.max,t0_max=np.median,n_neg=np.max,n_pos=np.max))
#P = Summary.select(lambda i:i[0]==2 and i[1]=='chm')
#PS0['res1'] = PS0.apply(lambda i: "%3.2f/%3.2f" % (i[2],i[4]),axis=1)
#PS0['res2'] = PS0.apply(lambda i: "%3.2f/%3.2f (%d+,%d-)" % (i[2],i[4],i[5],i[0]),axis=1)
PS0['res1'] = PS0.apply(lambda i: "%3.2f %s (%d+,%d-)" % (i[2],mksig(i[4]),i[5],i[0]),axis=1)
PS0.res1[PS0.index[pd.isnull(PS0.auc)]]=''
#Summary = Summary[(Summary['auc_pval']>0)]
PS0 = PS0.reset_index()
PS0['tox']=PS0.effect.apply(lambda i: i.split('_')[0])
PS0['effect'] = PS0.effect.apply(lambda i: i.replace('[not_in_list]','other'))
PS0.ix[:10]
list(P_ALL1.index)[:10]
P_ALL['tox'] = P_ALL.effect.apply(lambda i: i.split('_')[0])
P_ALL['effect'] = P_ALL.effect.apply(lambda i: i.replace('[not_in_list]','other'))
from scipy.interpolate import *
from IPython.html.widgets import FloatProgress
from IPython.display import display
def smoothData(xi,yi,zi,out='ij'):
x1,y1=np.mgrid[xi.min():xi.max():50j,yi.min():yi.max():50j]
f = bisplrep(xi,yi,zi,s=0.5)
z1= bisplev(x1[:,0],y1[0,:],f)
if out =='ij':
return np.concatenate(x1),np.concatenate(y1),np.concatenate(z1)
else:
return x1,y1,z1
X_vol = []
Work = list(P_ALL1.reset_index()[['cl','sim_type','effect']].drop_duplicates().to_records(index=False))
N = len(Work)
pc = int(N/100)
pbar = FloatProgress(min=0, max=100,description='GenRA Perf Vol')
display(pbar)
i_cl=0
for cl,sim_type,tox in Work:
if (i_cl % pc) == 0:
pbar.value += 1
i_cl+=1
for ft in ['chm','bio','bc']:
X = P_ALL1.xs((cl,ft,tox)).reset_index()
if (X.auc>0).sum()==0: continue
try:
x2,y2,z2 = smoothData(X.n_chem, X.s0, X.auc,out='xy')
z2[z2>1]=1
v1 = z2.sum().sum()
v2 = z2[z2>0.5].sum().sum()
(x,y) = z2.shape
vf1 = v1/(x*y)
vf2 = v2/(x*y)
except:
pass
else:
X_vol.append(dict(cl=cl,tox=tox,sim_type=ft,auc_vol1=v1,auc_vol2=v2,auc_volf1=vf1,auc_volf2=vf2))
if len(X_vol)>0:
PERF_vol=pd.pivot_table(pd.DataFrame(X_vol),
index=['cl','tox'],columns='sim_type',values='auc_volf1')
PERF_vol[PERF_vol>1]=1
pbar.value=100
X=P_ALL1.xs((cl,ft)).reset_index()
X.effect.unique()
#PERF_vol=pd.pivot_table(pd.DataFrame(X_vol),
# index=['cl','tox'],columns='sim_type',values='auc_volf1')
#PERF_vol[PERF_vol>1]=1
V0 = pd.DataFrame(X_vol)
V0.rename(columns=dict(tox='effect'),inplace=True)
V0['tox']=V0.effect.apply(lambda i: i.split('_')[0])
V0['effect'] = V0.effect.apply(lambda i: i.replace('[not_in_list]','other'))
PERF_vol=pd.pivot_table(V0,index=['cl','tox','effect'],columns='sim_type',values='auc_volf1')
#PERF_vol[['auc_volf1']].ix[:10]
"""
PERF0 = pd.merge(PS0,V0,left_on=['cl','tox','effect','sim_type'],right_on=['cl','tox','effect','sim_type'],how='outer')
PERF0.auc_volf1.fillna(0,inplace=True)
# concatentation: VUS_volf1, ROC AUC, pval, n+,n-
#PERF0['res2'] = PERF0.apply(lambda i: "%3.2f %3.2f %s(%d+,%d-)" % (i[2],mksig(i[4]),i[5],i[0]),axis=1)
PERF0['res2'] = PERF0.apply(lambda i: "%3.2f %3.2f %s(%d+,%d-,%d)" % (i['auc_volf1'],i['auc'],mksig(i['auc_pval']),i['n_pos'],i['n_neg'],i['cl_n']),axis=1)
PERF0['n_perf'] = PERF0.n_pos+PERF0.n_neg
#PERF0.res2[PERF0.index[pd.isnull(PERF0.VUS_vol)]]=''
PERF1 = pd.pivot_table(PERF0,index=['cl','tox','effect','cl_n','n_perf'],columns='sim_type',values=['auc_volf1','auc','auc_pval','res2'],
aggfunc=lambda i: np.unique(i)[0])
"""
x=[0,13,22,6,29,12]
P1_vol = PERF_vol.select(lambda x: x[0] in x)
P1_vol.ix[np.where((P1_vol>0.75).sum(axis=1)>0)]
X1=PERF_vol.xs(80)
#tox1=list(X1.ix[X1.index[(X1>0.9).any(axis=1)]].index)
#X1.ix[X1.index[(X1>0.7).any(axis=1)]]
X1.ix[X1.index[(X1>0.75).any(axis=1)]]
tox1=list(X1.ix[X1.index[(X1>0.75).any(axis=1)]].index)
tox1=list(X1.index[(X1>0.75).any(axis=1)])
def mkcol(x):
y = x[0]
if len(x)==2 and x[1]!='':
y += '_'+x[1]
for i in range(1,len(x)-1):
if x[i] or x[i]!='':
y += x[i]
return y
G=P_ALL.groupby(['cl','tox','effect','sim_type','n_pos','n_neg','cl_n'])
P0=G.aggregate(dict(auc=dict(mn=np.mean,sd=np.std),t0_max=dict(mn=np.mean,sd=np.std),
sn_max=dict(mn=np.mean,sd=np.std),sp_max=dict(mn=np.mean,sd=np.std)))
P0['AUC']=P0['auc'].ix[:10].apply(lambda i: "%3.2f(%1.1f)" % (i[0],i[1]),axis=1)
P0['Spec']=P0['sp_max'].ix[:10].apply(lambda i: "%3.2f(%1.1f)" % (i[0],i[1]),axis=1)
P0['Sens']=P0['sn_max'].ix[:10].apply(lambda i: "%3.2f(%1.1f)" % (i[0],i[1]),axis=1)
P0['GenRA0']=P0['t0_max'].ix[:10].apply(lambda i: "%3.2f(%1.1f)" % (i[0],i[1]),axis=1)
#P0[['AUC','Sens','Spec','y0']].ix[:10]
P0[('t0_max')].mn.hist()
P_ALL.t0_max.hist()
#pl.scatter(P0[('AUC','mn')])
P0.columns
VUS=PERF_vol.copy()
VUS[VUS>1]=1
#VUS_sig=VUS.ix[(VUS>0.4).any(axis=1)]
#VUS_sig.ix[:10]
VUS.bc.hist(bins=20)
B = VUS.copy()
B[B<0.5]=0
B[B>=0.5]=1
VUS_summary= B.copy()
VUS_summary['Any bio|chm|bc'] = (B>0).any(axis=1)
VUS_summary['Only chm'] = B.apply(lambda x: x[1]==0 and x[0]==0 and x[2]==1,axis=1)
VUS_summary['Only bio'] = B.apply(lambda x: x[1]==1 and x[0]==0 and x[2]==0,axis=1)
VUS_summary['Only bc'] = B.apply(lambda x: x[1]==0 and x[0]==1 and x[2]==0,axis=1)
VUS_summary['chm>bio|bc'] = VUS.apply(lambda x: x[2]>=0.5 and x[2]>x[1] and x[2]>x[0],axis=1)
VUS_summary['bio>chm|bc'] = VUS.apply(lambda x: x[1]>=0.5 and x[1]>x[2] and x[1]>x[0],axis=1)
VUS_summary['bc>chm|bio'] = VUS.apply(lambda x: x[0]>=0.5 and x[0]>x[2] and x[0]>x[1],axis=1)
#VUS_summary=VUS_summary.ix[:,3:]
#Just to keep count
VUS_summary['n']=1
VUS_summary=VUS_summary[['n', u'chm',u'bio',u'bc', u'Any bio|chm|bc', u'Only chm', u'Only bio',
u'Only bc', u'chm>bio|bc', u'bio>chm|bc', u'bc>chm|bio']]
VUS_summary.sum(axis=0)
VUS_summary.ix[:2]
#len(set([i[0] for i in VUS_summary.index]))
VUS_summary.groupby(level=[1]).aggregate(np.sum)
P2.ix[:10]
PERF_auc=pd.pivot_table(P2.reset_index(),index=['cl','tox','effect'],columns='sim_type',values='auc_mn')
PERF_auc.ix[:3]
B = PERF_auc.copy()
auc0=0.75
B[B<auc0]=0
B[B>=auc0]=1
AUC_summary= B.copy()
#AUC_summary['Only chm'] = B.apply(lambda x: x[1]==0 and x[0]==0 and x[2]==1,axis=1)
#AUC_summary['Only bio'] = B.apply(lambda x: x[1]==1 and x[0]==0 and x[2]==0,axis=1)
#AUC_summary['Only bc'] = B.apply(lambda x: x[1]==0 and x[0]==1 and x[2]==0,axis=1)
AUC_summary['chm>bio|bc'] = PERF_auc.apply(lambda x: x[2]>=auc0 and x[2]>x[1] and x[2]>x[0],axis=1)
AUC_summary['bio>chm|bc'] = PERF_auc.apply(lambda x: x[1]>=auc0 and x[1]>x[2] and x[1]>x[0],axis=1)
AUC_summary['bc>chm|bio'] = PERF_auc.apply(lambda x: x[0]>=auc0 and x[0]>x[2] and x[0]>x[1],axis=1)
AUC_summary['Any'] = (B>0).any(axis=1)
#VUS_summary=VUS_summary.ix[:,3:]
#Just to keep count
AUC_summary['n']=1
AUC_summary=AUC_summary[['n', u'chm',u'bio',u'bc',
#u'Only chm', u'Only bio',u'Only bc',
u'chm>bio|bc', u'bio>chm|bc', u'bc>chm|bio',
'Any']]
AUC_RES = AUC_summary.groupby(level=[1]).aggregate(np.sum)
AUC_RES.ix['All'] = AUC_summary.sum(axis=0)
AUC_RES
#AUC_RES['P%']=100.0*AUC_RES.Any/AUC_RES.n
#AUC_RES.ix['P%']=100.0*AUC_RES.ix['All'] /AUC_RES.n.ix['All']
AUC_RES
AUC_RES_P=AUC_RES.apply(lambda x: 100.0*x/AUC_RES.n)
AUC_RES_P=np.round(AUC_RES_P,decimals=0)
AUC_RES_P.sort('Any',ascending=False)
X = np.round(AUC_RES,decimals=0).copy()
for c in X.columns: X[c] = X[c].astype(np.uint)
for c in X.columns: X[c] = X[c].astype(np.str)
Y = np.round(AUC_RES_P,decimals=0).copy()
for c in Y.columns: Y[c] = Y[c].astype(np.uint)
for c in Y.columns: Y[c] = Y[c].astype(np.str)
Z = X+ ' / ' + Y + '%'
Z['n']=X.n
Z
CL_vus10={}
for (cl,tox),X in VUS_sig.groupby(level=[0,1]):
#print cl,tox,X.shape
#X.sort(['chm','bio','bc'],ascending=False,inplace=True)
CL_vus10[cl]=[i[2] for i in X.index]
#VUS_sig=VUS.ix[(VUS>0.50).any(axis=1)]
X1=pd.melt(VUS.reset_index(),id_vars=['cl','tox','effect'],value_vars=['chm','bio','bc'],var_name='sim_type',value_name='VUS')
X1.set_index(['cl','tox','effect','sim_type'],inplace=True)
X2=P0[['auc','AUC','Sens','Spec','GenRA0']].reset_index().set_index(['cl','tox','effect','sim_type'])
X2.columns=map(mkcol,X2.columns)
P2 = pd.merge(X2,X1,left_index=True,right_index=True)
P2['tested_n']=P2.n_pos+P2.n_neg
P2['untested_n']=P2.cl_n-P2.tested_n
P21 = P2[['cl_n','untested_n','tested_n','n_pos','n_neg','VUS','AUC','Sens','Spec']]
P21.ix[:10]
#P0.ix[:10]
#P0[['auc','AUC','Sens','Spec','GenRA0']].reset_index().set_index(['cl','tox','effect','sim_type']).ix[:10]
P2.ix[:10]
#P.T.to_excel('/share/home/ishah/projects/Chem/data/results/genra-summary-%s.xls' % tmstmp)
# Output clusters to worksheet
W = pd.ExcelWriter(RES_DIR+'/genra-perf-summary-full-%s.xls' % tmstmp,engine='openpyxl')
for cl,P in P2.groupby(level=[0]):
#X = P.reset_index()
#I2= (X1<0.1).sum(axis=1)>0
#X.columns = [i.replace('max','') for i in map(mkcol,X.columns)]
P.to_excel(W,sheet_name="Cluster-%d" %cl,na_rep='')
W.save()
C2.ix[C2.index[C2.chemical_name.apply(lambda x: x.lower().find('benox')>-1)]]
C2M['C98730042']
# Cluster 0
X1=PERF_vol.xs(cc_id)
#len([i[1] for i in X11.index])
#X1.tox.unique(),len(X1.tox.unique()),len(X1.effect.unique())
X1
VUS_sig=VUS.ix[(VUS>0.5).any(axis=1)]
tox1=[i[1] for i in VUS_sig.xs(cc_id).index]
#tox1
tox1=[i for i in tox1 if re.match('chr_|dev_',i)]
tox1
#CL 23
tox1=['chr_brain',
'chr_heart',
'chr_kidney',
'chr_liver',
'chr_lung',
'chr_ovary',
'chr_spleen',
'chr_testes',
'chr_thymus',
'dev_bone',
'dev_kidney',
'dev_liver',
'dev_reproductive_performance']
#cl 80
tox1="""chr_brain
chr_kidney
mgr_body_weight
mgr_bone
mgr_intestine_large
mgr_urinary_bladder
sub_intestine_small
sub_prostate
sub_spleen
sub_urinalysis
sub_water_consumption""".split("\n")
tox1
X=(BCTb.ix[CID,tox1]>0).sum(axis=1)
X.sort(ascending=False)
X[:10]
CID.index(X.index[1])
tox1=['dev_food_consumption',
'dev_sexual_developmental_landmark',
'dev_sperm_morphology',
'mgr_brain',
'mgr_motor_activity',
'sub_lung',
'sub_sperm_morphology',
'sub_testes',
'sub_thymus'
]
# Cl = 5
tox1 = """dev_clinical_chemistry
dev_mortality
dev_reproductive_performance
dev_spleen""".split('\n')
tox1
', '.join(tox1)
FIG_DIR
"""
dev_food_consumption 0.319472
dev_sexual_developmental_landmark 0.319472
dev_sperm_morphology 0.319472
mgr_brain 0.805017
mgr_motor_activity 0.194983
sub_lung 0.295062
sub_sperm_morphology 0.295062
sub_testes 0.295062
sub_thymus 0.295062
"""
CID.index('C98730042')
from chem.ra.viz import *
from chem.ra.readacross import *
SM0 = getNNSimMatrices(CID,BCTb,C=Chm,B=Bio)
CH0 = getNNChmNm(CID,ChmNm1)
fig = pl.figure(figsize=(15,15))
ax = pl.subplot(1,1,1)
X,E1 = vizChmNN(CID[19],BCTb,SM0,dsc='Bio',ax=ax,c_s0=0,chem_sz=(90,60),c_knn=10,
rs=1.0,r_min=150,
th0=0.5*math.pi,th_tot=1.3*math.pi,
lab_len=15,ch_fs=12,
Effects=BCTb.ix[CID,tox1],CN=CH0,
pred=True,t0=0.25,
CD=CD,Mols=MOLS,
save=True,fig_dir=FIG_DIR)
E1
from chem.ra.viz import *
from chem.ra.readacross import *
SM0 = getNNSimMatrices(CID,BCTb,C=Chm,B=Bio)
CH0 = getNNChmNm(CID,ChmNm1)
fig = pl.figure(figsize=(15,15))
ax = pl.subplot(1,1,1)
X,E1 = vizChmNN(CID[19],BCTb,SM0,dsc='Chm',ax=ax,c_s0=0,chem_sz=(90,60),c_knn=10,
rs=2,r_min=150,
th0=0.5*math.pi,th_tot=1.3*math.pi,
lab_len=15,ch_fs=12,
Effects=BCTb.ix[CID,tox1],CN=CH0,
pred=True,t0=None,
CD=CD,Mols=MOLS,
save=True,fig_dir=FIG_DIR)
E1
S0 = SM0['Chm'].ix[CID[1]]
S0.sort()
S0
X = GenRAPred('mgr_body_weight', BCTb.ix[CID], SM0['Chm'],sim_type='chm',k0=8,s0=0.5,t0=None)
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.tri as mtri
import warnings
warnings.filterwarnings('ignore')
show_t0 = True
def smoothData(xi,yi,zi,out='ij'):
x1,y1=np.mgrid[xi.min():xi.max():30j,yi.min():yi.max():30j]
f = bisplrep(xi,yi,zi,s=1)
z1= bisplev(x1[:,0],y1[0,:],f)
if out =='ij':
return np.concatenate(x1),np.concatenate(y1),np.concatenate(z1)
else:
return x1,y1,z1
#for cl in sorted([0,1,5]):
cl=cc_id
print "> CL",cl
j=0
#if cl > 5: break
VUS_sig=VUS.ix[(VUS>0.6).any(axis=1)]
#tox1=[i[1] for i in VUS_sig.xs().index]
fig = pl.figure(figsize=(15,4*len(tox1)))
for y in tox1:
for ft in ['chm','bio','bc']:
X = P_ALL1.xs((cl,ft,y)).reset_index()
X['n_nn'] = X.n_sim_neg+X.n_sim_pos
X.t0_max[X.t0_max>1]=1
j+=1
try:
ax = fig.add_subplot(len(tox1)+1,3, j)
x2,y2,z2 = smoothData(X.n_nn, X.s0, X.auc,out='xy')
z2[z2<0]=0
z2[z2>1]=X.auc.max()
cf = ax.contourf(x2, y2, z2, 30,cmap=cm.coolwarm,vmin=0,vmax=1,alpha=0.9)
x_min=X.n_nn.min()
x_max=X.n_nn.max()
ax.set_xlim(x_min,x_max)
ax.set_ylim(0,X.s0.max())
ax.set_xlabel('k',fontdict=dict(size=12))
ax.set_ylabel('s',fontdict=dict(size=12))
pl.setp(ax.get_xticklabels(), fontsize=8)
pl.setp(ax.get_yticklabels(), fontsize=8)
ax.set_title("CL-%d %s %s" % (cl,ft, y),fontdict=dict(size=14))
if show_t0:
x3,y3,z3 = smoothData(X.n_nn, X.s0, X.t0_max,out='xy')
levels = np.linspace(X.t0_max.min(),X.t0_max.max(),8)
cf2 = ax.contour(x3, y3, z3, levels,linewidths=3,cmap=cm.hot,alpha=0.8)
pl.clabel(cf2, fmt='%2.1f', inline=1, fontsize=13)
except:
#print ">CL",cl,y,ft,"failed"
continue
else:
#print ">CL",cl,y,ft,"passed"
pass
ax = fig.add_subplot(len(tox1)+1,3, j+1)
ax.set_axis_off()
cb = pl.colorbar(cf,orientation='horizontal',ticks=np.linspace(0,1,6),shrink=1.5)
cb.ax.set_xlabel('ROC AUC',fontdict=dict(size=12,weight='bold'))
pl.tight_layout()
pl.subplots_adjust(wspace=0.3,hspace=0.3)
#fig.savefig(FIG_DIR+'genra-pred-mean-auc-k0-s0-cl-%d-nn.svg' % cl)
fig.savefig(FIG_DIR+'genra-pred-mean-auc-k0-s0-cl-%d-nn.png' % cl,dpi=500)
PERF1['res2'].select(lambda x: x[0]==cc_id and x[2] in tox1)
VUS.select(lambda x: x[0]==0 and x[2] in tox1)[['chm','bio','bc']]
SUPDAT_DIR='/share/home/ishah/Dropbox/Projects/Chem/docs/RTP-RA1/SupplData/'
# Data S1
# Data S2
tsv=SUPDAT_DIR+'supplemental-data-DS2-toxicity.tsv'
#W = pd.ExcelWriter(xl)
BCTc[Tox].to_csv(tsv,sep='\t')
#P.T.to_excel('/share/home/ishah/projects/Chem/data/results/genra-summary-%s.xls' % tmstmp)
# Output clusters to worksheet
W = pd.ExcelWriter('/share/home/ishah/projects/Chem/data/results/genra-vus-%s-all.xls' % tmstmp,engine='openpyxl')
for cl in sorted(set([i[0] for i in PERF_vol.index])):
X = PERF_vol.xs(cl)['auc_volf1']
X.to_excel(W,sheet_name="Cluster-%d" %cl)
W.save()
P21.ix[30:32]
#P.T.to_excel('/share/home/ishah/projects/Chem/data/results/genra-summary-%s.xls' % tmstmp)
# Output clusters to worksheet
W = pd.ExcelWriter('/share/home/ishah/projects/Chem/data/results/genra-perf-all-%s.xls' % tmstmp,engine='openpyxl')
for cl in sorted(set([i[0] for i in PERF1.index])):
X = PERF1.xs(cl)['res2']
X.to_excel(W,sheet_name="Cluster-%d" %cl)
W.save()
W = pd.ExcelWriter('/share/home/ishah/projects/Chem/data/results/genra-perf-sig-%s.xls' % tmstmp,engine='openpyxl')
X0 = PERF1['auc'][['chm','bio','bc']]
X = (X0>0.8).groupby(level=[0,1]).sum()
#X['Any bio|chm|bc'] = (X0>0).any(axis=1)
X['Only chm'] = X0.apply(lambda x: x[0]>=0.8 and x[1]<0.8 and x[2]<0.8,axis=1).groupby(level=[0,1]).sum()
X['Only bio'] = X0.apply(lambda x: x[0]< 0.8 and x[1]>=0.8 and x[2]<0.8,axis=1).groupby(level=[0,1]).sum()
X['Only bc'] = X0.apply(lambda x: x[0]< 0.8 and x[1]<0.8 and x[2]>=0.8,axis=1).groupby(level=[0,1]).sum()
X['chm>bio|bc'] = X0.apply(lambda x: x[0]>=0.8 and x[0]>x[1] and x[0]>x[2],axis=1).groupby(level=[0,1]).sum()
X['bio>chm|bc'] = X0.apply(lambda x: x[1]>=0.8 and x[1]>x[0] and x[1]>x[2],axis=1).groupby(level=[0,1]).sum()
X['bc>chm|bio'] = X0.apply(lambda x: x[2]>=0.8 and x[2]>x[0] and x[2]>x[1],axis=1).groupby(level=[0,1]).sum()
X.to_excel(W,sheet_name="Summary")
for cl in sorted(set([i[0] for i in PERF1.index])):
X=PERF1.xs(cl)
I = np.where((X['auc']>0.8).sum(axis=1)>0)
if I and len(I[0])>0:
X.res2[['chm','bio','bc']].ix[I].to_excel(W,sheet_name="Cluster-%d" %cl)
W.save()