%load_ext autoreload
%autoreload 2
%load_ext sql
%pylab inline
%matplotlib inline
import matplotlib.text as text
import scipy.interpolate as interp
import pandas as pd
#from mp.txpepa import *
#from bio.data.toxplorer import *
#import bio.hts.apredica as apr
#from bio.hts.htsdb import *
#from bio.data.toxplorer import *
import viz.clust as cv
from chem.clust import *
from sklearn import (manifold, datasets, decomposition, ensemble, lda,
random_projection)
from sklearn.metrics.pairwise import euclidean_distances,manhattan_distances
import statsmodels.api as sm
import numpy.linalg as LA
import bio.hts.traj as traj
CD = ChemDrawing()
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector
stats = importr('stats')
from sklearn.neighbors import KNeighborsClassifier
from ml.readacross import *
mng.register_connection("hts-db","htsdb",username="ishah",
password="xxx",host='localhost')
mng.register_connection("txp-db","toxplorerdb",username="ishah",
password="xxx",host='localhost')
%sql postgresql://ishah:xxxx@localhost/chemicals
DAT_DIR = '/share/home/ishah/projects/Chem/data/tables/'
PKL_DIR = '/share/home/ishah/projects/Chem/data/pickle/'
RES_DIR='/share/home/ishah/projects/Chem/data/results/'
FIG_DIR='/share/home/ishah/projects/Chem/figs/readacross/'
import pickle
tmstmp = time.strftime("%m-%d-%Y",time.localtime())
# Start the parallel machine
from ml.mlearn import *
from ml.readacross import *
import IPython.parallel as PP
%reload_ext autoreload
%autoreload 2
lb_view=None
d_view =None
def initParallel(Data=None):
RC = PP.Client(profile='galaxy_parallel')
global lb_view
global d_view
d_view = RC[:]
d_view.block = True
lb_view = RC.load_balanced_view()
lb_view.block = True
d_view.execute("""
%load_ext autoreload
%autoreload 2
import ml.mlearn as ml
from ml.mlearn import *
from ml.readacross import *
from chem.clust import *
""")
if Data:
d_view.push(Data)
print "\n".join([i for i in os.listdir(PKL_DIR) if i.startswith('tx-tr')])
[BCc,BCTc,BCTb,Bio,Chm,Tox] = pd.read_pickle(PKL_DIR+'tx-tr-ch-11-16-2015.pkl')
#print "\n".join(os.listdir(PKL_DIR))
[A0,C0,C1,B1] = pd.read_pickle(PKL_DIR+'chm-bio-02-12-2015.pkl')
[T2,T_mt] = pd.read_pickle(PKL_DIR+'tox-02-12-2015.pkl')
MOLS = pd.read_pickle(PKL_DIR+'mols-02-12-2015.pkl')
[C2M,M2C,Cl_st,C2] = pd.read_pickle(PKL_DIR+'clust-02-12-2015.pkl')
C2 = C1.set_index('ID')
# Chemical fingerprints
from rdkit import Chem
from rdkit.DataStructs import *
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem import AllChem
FP1 = pd.DataFrame([np.array(AllChem.GetMorganFingerprintAsBitVect(i,3,1024)) for i in MOLS.values()])
FP1.index=MOLS.keys()
FP1.columns = ['mrgn_%d'%i for i in FP1.columns]
FP2 = pd.DataFrame([np.array(MACCSkeys.FingerprintMol(i)) for i in MOLS.values()])
FP2.index=MOLS.keys()
FP2.columns = ['mccs_%d'%i for i in FP2.columns]
FP3 = pd.DataFrame([np.array(AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(i)) for i in MOLS.values()])
FP3.index=MOLS.keys()
FP3.columns = ['tptr_%d'%i for i in FP3.columns]
FP0 = pd.merge(FP1,FP2,left_index=True,right_index=True)
FP0 = pd.merge(FP0,FP3,left_index=True,right_index=True)
FP0.index.names=['ID']
#Distance calculations
# Distance calculations
# make a binary rep
X = BCTb[Bio]
CID = X.index
X = X.fillna(0)
X[X!=0]=1
D_bio = pd.DataFrame(squareform(pdist(X,'jaccard')),
columns=CID,index=CID)
S_bio = 1-D_bio
S_bio.shape
X = BCTb[Chm]
CID = X.index
X = X.fillna(0)
X[X!=0]=1
D_chm = pd.DataFrame(squareform(pdist(X,'jaccard')),
columns=CID,index=CID)
S_chm = 1-D_chm
X = BCTb[Bio+Chm]
CID = X.index
X = X.fillna(0)
X[X!=0]=1
D_bc = pd.DataFrame(squareform(pdist(X,'jaccard')),
columns=CID,index=CID)
S_bc = 1-D_bc
S_bc.shape
initParallel(Data=dict(BCc=BCc,MOLS=MOLS))
d_view.execute("""
from chem.clust import *
CID = list(BCc.index)
M = [MOLS[i] for i in CID]
CC=ChemCluster()
CC.set_chem_ids(CID,M)
CC.clust_chems(fp='torsion')
""")
print "Starting Cluster stability analysis of chemicals " + time.strftime("%d/%m/%Y %H:%M",time.localtime())
send_email(txt="Start",subj="CC.clust_stability k=10:150:5")
P_ALL = []
for sf in [0.5]:
d_view.push(dict(sf=sf))
P = lb_view.map(lambda K: CC.clust_stability(k_cent=K,km_ninit=5,km_iter=30,
sample_frac=sf,sample_iter=10,
dbg=False),
range(10,150,5))
#Res_df=pd.DataFrame(P)
print "> sf=%3.2f " % sf + time.strftime("%H:%M",time.localtime())
P_ALL += P
CL_stability_df = pd.DataFrame(P_ALL)
CL_stability_df.to_csv(RES_DIR+ time.strftime("chm-cluster-stability-%d-%m-%Y.csv",time.localtime()),index=False)
send_email(txt="Done",subj="CC.clust_stability k=10:150:5")
# In case loading from previous run:-
#print os.listdir(RES_DIR)
#CL_stability_df = pd.read_csv(RES_DIR+'chm-cluster-stab-torsion-12-02-2015.csv')
CL_stability_df = pd.read_csv(RES_DIR+'chm-cluster-stability-13-04-2015.csv')
import statsmodels.api as sm
import scipy.interpolate as interp
#Res_df=pd.DataFrame()
#Res_df.sort(['sf','k'],inplace=True)
#Res_df.set_index(['sf'],inplace=True)
def myHyp(Z): return np.sqrt(np.sum(Z**2))
SF = CL_stability_df.sf.unique()
Res_df=CL_stability_df.groupby(by=['sf','k','km_ninit']).aggregate(dict(mn=np.mean,
sd=myHyp))
Res_df=Res_df.reset_index().set_index('sf')
fig = pl.figure(figsize=(8,8))
#for sf in set(Res_df.index):
for sf in SF:
R_sf = Res_df.xs(sf)
Y = sm.nonparametric.lowess(R_sf.mn,R_sf.k,return_sorted=False,it=30,delta=0.5)
#pl.ylim(0.1,0.6)
Yb = sm.nonparametric.lowess(R_sf.mn-R_sf.sd,R_sf.k,return_sorted=False,it=30,delta=0.5)
Yt = sm.nonparametric.lowess(R_sf.mn+R_sf.sd,R_sf.k,return_sorted=False,it=30,delta=0.5)
pl.plot(R_sf.k,Y,label=sf,lw=2)
pl.fill_between(R_sf.k,Yb,Yt,
alpha=0.2,edgecolor='#888888',facecolor='#ababdf',
linewidth=1,linestyle='dashdot',antialiased=True)
pl.legend(loc=2,fontsize=8)
pl.title("Cluster stability analysis")
pl.ylabel("Cluster stability score")
pl.xlabel("K")
# older appraoch
CID = list(BCc.index)
M = [MOLS[i] for i in CID]
CC=ChemCluster()
CC.set_chem_ids(CID,M)
CC.clust_chems(fp='torsion')
CC.find_centroids(k_cent=100,km_ninit=10,km_iter=50)
G1 = CC.clust_centroids(metric='euclidean',link='complete')
# New approach to visualize clusters
Cent = pd.DataFrame([BCTb.ix[I,Chm].median() for I in M2C.values()],index=M2C.keys())
import viz.clust as cv
Cent_Dst= pd.DataFrame(squareform(pdist(Cent,'euclidean')))
#Cent_Dst= 1-Cent_Sim
Cent_Lnk= linkage(Cent_Dst.as_matrix(),method='weighted')
Cent_Dnd= dendrogram(Cent_Lnk,no_plot=True)
Cent_G0 = vc.makeRadialGraph(Cent_Dst.as_matrix(),Cent_Lnk,Cent_Dnd,ret='graph')
#Cent_Lnk= linkage()
# Set the size of each leaf based on the cluster size
for i,v in Cent_G0.nodes(data=True):
if v.has_key('lv'):
v['cl']=v['lv']
v['n']=len(M2C[v['cl']])
# the root as no inputs
r0 = [k for k,n in Cent_G0.in_degree_iter() if n==0][0]
# Update the node size basde on depth first search from root successors
V0 = Cent_G0.node
for p,C in nx.depth_first_search.dfs_successors(Cent_G0,source=r0).iteritems():
#print p,C
V0[p]['n'] = sum([V0[i]['n'] for i in C])
#for cl,CID in M2C.iteritems(): Cent_G0.node[cl]['n']=len(CID)
#V0=dict(Cent_G0.nodes(data=True))
#[v for i,v in Cent_G0.nodes(data=True) if v.has_key('lv') and v.get('lv')==0][:5]
# Extend the reach of the
for k,vi in V0.iteritems():
vi['tx']=1.05*v['r']*np.cos(v['th'])
vi['ty']=1.05*v['r']*np.sin(v['th'])
Cent_G0.node[100]
# Distribution of chemicals in each cluster
X = [len(v) for v in M2C.values()]
pl.hist(X,30,log='y')
[k for k,v in M2C.iteritems() if len(v)>100]
CL_NAMEs = {85:'Acetanilide',
0 :'PFAA',
3 :'Pyrrlidone',
4 :'Cyclodiene',
5 :'Nitobenzene',
16:'Benzoylurea',
53:'Organophosphate',
33:'Aryloxyphenoxypropionic',
56:'Pyrethroid',
34:'Pyrethroid'}
import viz.clust as vc
def myLab1(k,x):
if x.has_key('n') and x['n']<25:
y=str(x['n'])
return y
def mkProp(f=log):
def myfun(x):
y = 0
if x.has_key('n'):
y = f(x['n'])
return y
return myfun
def mkNodeRule(n_min=20):
def myfun(x):
if x.has_key('n'):
if x['n']>=n_min:
return True
else:
return False
return myfun
def mkNodeTxt(n_min=20,cl=False,txt_rot=0,col='red',sz=16,dr=1.07,cl_info=CL_NAMEs):
def myfun(v):
y=''
if v.has_key('n') and v['n']>=n_min:
#y=str(v['n'])
if cl and v.has_key('cl'):
y = 'CL%d' % v['cl']
if cl_info.has_key(v['cl']):
y += ' '+ cl_info.get(v['cl'])
#y='%5.1f' % v['dg']
#dth = ifthen(v['dg']>90 and v['dg']<270,-0.1,0.1)
tx=dr*v['r']*np.cos(v['th'])
ty=dr*v['r']*np.sin(v['th'])
txt_rot = ifthen(v['dg']>90 and v['dg']<270,180,0)
hz_aln = ifthen(v['dg']>90 and v['dg']<270,'right','left')
va_aln = ifthen(v['dg']>0 and v['dg']<180,'bottom','top')
txt = text.Text(tx,ty,
y,rotation=v['dg']+txt_rot,
color=col,zorder=10, ha = hz_aln, va = va_aln,
fontproperties=fm.FontProperties(size=sz))
return txt
return myfun
#def mkNodeText()
vc.drawRadialPrunedDend(Cent_G0,figsz=[30,30],show_labs=True,show_nodes=True,lc='grey',
vsc=0.01,node_col='orange',lab_size=14,
node_prop=mkProp(f=lambda i: 0.5*log(i)),
node_rule=mkNodeRule(n_min=2),
#node_lab=mkNodeLab(n_min=30,cl=True),
node_lab=mkNodeTxt(cl=True,dr=1.08,n_min=1),
txt_rot=0,
dbg=False,
save=FIG_DIR+'clus-dend-n.svg')
ChmNm1 = C1[['ID','chemical_name']]
ChmNm1.set_index('ID',inplace=True)
ChmNm1.ix['C60355'].chemical_name
from scipy.spatial.distance import cdist
CID = M2C[0]
nd_min= 3
S_i=S_chm.ix[CID,CID]
X_i=BCTb.ix[CID,Chm]
# Which features are present in this cluster?
N_d = X_i.sum()
C_i = np.where(N_d>nd_min)
X_i = X_i.ix[:,X_i.columns[C_i]]
nr = X_i.shape[0]
X_i.loc['centroid']=X_i.median()
D_i = pd.DataFrame(squareform(pdist(X_i,'jaccard')),index=X_i.index,columns=X_i.index)
d=D_i.ix['centroid']
d.sort()
cid=d.index[1]
C0.ix[cid]
d.index
def simGraph(S,r_i=None,n0=4,s0=0.5,levels=3,dbg=False,rect=(0,0,1000,1000)):
"""
Input root node
Returns shell layout
"""
def neighbours(ri,ignore=[]):
X1 = S.ix[ri,:]
X1 = X1[X1>s0]
ni = ifthen(n0<len(X1),n0,len(X1))
X1.drop([ri]+[i for i in ignore if i in X1.index],inplace=True)
X1.sort(ascending=False)
X1 = X1[:ni]
return X1
def children(ri,l=0,L=2,V=[]):
l += 1
X1 = neighbours(ri,ignore=V)
E1 = [(r_j,ri,{'weight':1-s_ij}) for r_j,s_ij in X1.iteritems()]
V1 = V+list(X1.index)
if dbg: print '>'*(l+1),ri,l,len(X1)
if l>L: return V1,E1
for v in X1.index:
V2,E2 = children(v,l,L,V1)
E1 += E2
V1 += V2
return list(set(V1)),E1
V3,E3 = children(r_i,l=0,L=levels,V=[])
_NV = NetViz(id='ChemClust',directed=True,
node_list=V3,
edge_list=E3)
_NV.layout(alg='neato',xmin=rect[0],ymin=rect[1],xmax=rect[2],ymax=rect[3])
return _NV,r_i
def clustGraph(CID,X_chm,S,n0=4,nd_min=3,**args):
"""
Input cluster with distances
Returns shell layout
"""
nd_min= 3
S_i=S.ix[CID,CID]
X_i=X_chm.ix[CID]
# Which features are present in this cluster?
N_d = X_i.sum()
#C_i = np.where(N_d>nd_min)
#X_i = X_i.ix[:,X_i.columns[C_i]]
X_i.loc['centroid']=X_i.median()
# Now find the vector closest to the centroid
D_i = pd.DataFrame(squareform(pdist(X_i,'jaccard')),index=X_i.index,columns=X_i.index)
d=D_i.ix['centroid']
d.sort()
r_i=d.index[1]
return simGraph(S,r_i,**args)
ChmNm1 = C1[['ID','chemical_name']]
ChmNm1.set_index('ID',inplace=True)
ChmNm1.ix['C60355'].chemical_name
rc0='C83794'
rc0='C80057'
rc0='C335762'
rc0='C103902'
rc0='C50351'
rc0='C1763231'
rc0='C23184669'
#NV0=simGraph(CC,rc0,s0=0.35,n0=5,levels=2)
#CH0=dict(( (i,ChmNm1.ix[i].chemical_name) for i in NV0.G.nodes(data=False)))
#P=NV0.layout(alg='neato',xmin=0,xmax=1000,ymin=0,ymax=1000)
#CH0[rc0]
from matplotlib.patches import Ellipse, Circle, FancyArrowPatch,FancyBboxPatch
from textwrap import wrap
from chem.draw import *
STUDIES = ['mgr', 'chr', 'sac', 'sub', 'dev', 'rep', 'oth', 'acu', 'neu', 'dnt']
CH0 = None
def drawEffects0(ax,E0,x0,y0,w,h,z0=10,lab=False,fs=10):
xi = x0
yi = y0
for e,v in E0.iteritems():
fc = 'white'
ec = 'grey'
if v>0:
fc = 'red'
ec = 'white'
elif v==0:
fc = 'green'
ec = 'white'
else:
fc = 'white'
ec = 'grey'
ax.add_patch(FancyBboxPatch((xi,yi),w,h,
boxstyle="round,pad=0.2",
ec=ec, fc=fc,fill=True, zorder=z0))
if lab:
ax.add_artist(text.Text(xi+w*0.5,yi+h+2,e,color=(0.2,0.2,0.2),
ha='left',va='bottom',zorder=z0,rotation=90.0,
fontproperties=fm.FontProperties(size=fs)))
xi += w
def vizChemClust(ax,r0,NV0,layout='neato',cl_id=None,AO=STUDIES,Mols=MOLS,ChNm=CH0,
ch_w=60,ch_h=60,lw=2,
save=False,fmt='svg',
xmin=0,ymin=0,W=1500,H=1500,lab_fs=12):
ax.set_xlim(xmin,xmin+W*1.1)
ax.set_ylim(ymin,ymin+H*1.1)
if layout:
NV0.layout(alg=layout,xmin=0,xmax=W*0.9,ymin=0,ymax=W*0.9)
L0 = NV0.P
# Store chemical bbox patches
RECT={}
# Get Tox data
T0=BCTb.ix[L0.keys(),AO]
w,h=ch_w,ch_h
for vi in NV0.G.nodes(data=False):
xi,yi = L0[vi]
xi,yi = xi-w*0.5,yi-w*0.5
J0 = CD.mol2json(Mols[vi],size=(w,h))
bg_rect = np.array([[xi,yi],[xi,yi+h],[xi+w,yi+h],[xi+w,yi]])
ch_name = "\n".join(wrap(ChNm[vi],20))
bb = mpatches.Polygon(bg_rect,edgecolor=ifthen(vi==r0,'red',(0.1,0.1,0.3,0.5)),
facecolor='white',
fill=True,zorder=5)
RECT[vi]=bb
ax.add_patch(bb)
CD.drawChemBS(J0,ax,x0=xi,y0=yi,atom_r=2,z0=6,lw=0.8)
#tx_pos = xi+w*0.5,yi+w
tx_pos = xi+w*0.5,yi-2
ax.add_artist(text.Text(tx_pos[0],tx_pos[1],ch_name,color=(0.2,0.2,0.2),
ha='center',va='top',zorder=6,
fontproperties=fm.FontProperties(size=lab_fs)))
# Show tox information
E1 = T0.ix[vi]
dw = w/len(E1)
drawEffects0(ax,E1,xi,yi+h+4,dw,dw*1)
for v1,v2,EA in NV0.G.edges_iter(data=True):
x1,y1 = L0[v1]
x2,y2 = L0[v2]
shrink=0.1*(w+h)
ax.add_patch(FancyArrowPatch(posA=(x1,y1),posB=(x2,y2),
patchA=RECT[v1],
patchB=RECT[v2],
color='grey',arrowstyle='<|-|>',
mutation_scale=10.0,
shrinkA=shrink,shrinkB=shrink,
linewidth=lw,
connectionstyle='angle3,angleA=95,angleB=0'
))
E1[:] = 0
drawEffects0(ax,E1,xmin+W-150,ymin+H-100,20,20,lab=True,fs=12)
if save:
chmnm = ChNm[r0].lower().replace(' ','-')
if cl_id:
chmnm = 'cl-%d-' % cl_id + chmnm
fig.savefig(FIG_DIR+"nn-ra-%s-2.%s" % (chmnm,fmt),dpi=600)
#C1.ix[C1.chemical_name.apply(lambda i: i.find("PFOS")>-1)]
C1.ix[C1.chemical_name.apply(lambda i: i.find("Diuron")>-1)]
C2M['C330541']
CID = M2C[16]
CID
pd.merge(C2.ix[CID],BCTb.ix[CID,STUDIES],left_index=True,right_index=True)
CID = C1.ix[C1.chemical_name.apply(lambda i: i.find("phenol")>-1),'ID']
X_cl=BCTb.ix[CID,Chm]
S_cl=S_chm.ix[CID,CID]
NV0,rc0=clustGraph(CID,X_cl,S_cl,s0=0.1,n0=5,levels=3)
CH0=dict(( (i,ChmNm1.ix[i].chemical_name) for i in NV0.G.nodes(data=False)))
fig = pl.figure(figsize=(10,10))
ax = pl.subplot(1,1,1)
ax.set_axis_off()
P=NV0.layout(alg='neato',xmin=0,xmax=1800,ymin=0,ymax=1800)
NV0.draw()
CH0[rc0]
#list(CID)
R= C1.ix[C1.chemical_name.apply(lambda i: i.lower().find("propiconazole")>-1)]
print R
C2M['C94361065']
cc_id=0
CID=M2C[cc_id]
X_cl=BCTb.ix[CID,Chm]
S_cl=S_chm.ix[CID,CID]
NV0,rc0=clustGraph(CID,X_cl,S_cl,s0=0.1,n0=5,levels=3)
CH0=dict(( (i,ChmNm1.ix[i].chemical_name) for i in NV0.G.nodes(data=False)))
fig = pl.figure(figsize=(10,10))
ax = pl.subplot(1,1,1)
ax.set_axis_off()
P=NV0.layout(alg='neato',xmin=0,xmax=1200,ymin=0,ymax=1200)
NV0.draw()
CH0[rc0]
#CD = ChemDrawing()
fig = pl.figure(figsize=(15,15))
ax = pl.subplot(1,1,1)
ax.set_axis_off()
vizChemClust(ax,rc0,NV0,cl_id=cc_id,layout='neato',ChNm=CH0,save=True,fmt='png',
ch_w=150,ch_h=90,
W=2000,H=2000,lw=1,
lab_fs=12,xmin=100,ymin=0)
CD = ChemDrawing()
from matplotlib.patches import Ellipse, Circle, FancyArrowPatch,FancyBboxPatch
from textwrap import wrap
from chem.draw import *
STUDIES = ['mgr', 'chr', 'sac', 'sub', 'dev', 'rep', 'oth', 'acu', 'neu', 'dnt']
def drawEffects(ax,E0,x0,y0,w,h,z0=10,lab=False,fs=10,
act='T',
col_act=(0.8,0.1,0.1),
col_act_p=(1.0,0.5,0.1),
col_inact=(0.1,0.8,0.1),
col_inact_p=(0.1,0.1,0.8),
):
xi = x0
yi = y0
for e,v in E0.iteritems():
fc = 'white'
ec = 'grey'
if v>0:
fc = ifthen(act=='T',col_act,col_act_p)
ec = 'white'
elif v==0:
fc = ifthen(act=='T',col_inact,col_inact_p)
ec = 'white'
else:
fc = 'grey'
ec = 'white'
ax.add_patch(FancyBboxPatch((xi,yi),w,h,
boxstyle="round,pad=0.2",
ec=ec, fc=fc,fill=True, zorder=z0))
if lab:
ax.add_artist(text.Text(xi+w*0.5,yi+h+2,e,color=(0.2,0.2,0.2),
ha='left',va='bottom',zorder=z0,rotation=90.0,
fontproperties=fm.FontProperties(size=fs)))
xi += w
def visChmNN(cid,Data,Sc,ax=None,c_knn=5,c_s0=0.3,Mols=MOLS,
chem_sz=(40,60),CN=CH0,
rs=10,r_min=100,
atmr=5,ch_fs=12,
th0=0.5*math.pi,th_tot=math.pi,
lab_len=20,
BBox=(-500,-500,500,500),Origin=[0,0],
Effects=pd.DataFrame(),
pred = False,
save = False
):
NN_0 = getKNN(cid,Sc,k0=c_knn,s0=c_s0,sim=True)
NN_c = pd.DataFrame(NN_0.reset_index())
NN_c.columns=['cid_i','s']
NN_c['d']=1-NN_c.s
NN_c.sort('d',inplace=True)
dth = th_tot/(NN_c.shape[0]-1)
xo,yo = Origin
xmin,ymin,xmax,ymax=BBox
r_max = abs(xmax-xo)-r_min
#NN_c['r'] = r_max*(NN_c.s**2)*rs
NN_c['r'] = r_min+r_max*NN_c.d*rs
NN_c['th'] = th0+(np.arange(0,NN_c.shape[0]) * dth)
NN_c['x'] = NN_c.r*np.cos(NN_c.th)
NN_c['y'] = NN_c.r*np.sin(NN_c.th)
NN_c['lx']= 0.5*(NN_c.x+xo)
NN_c['ly']= 0.5*(NN_c.y+yo)
NN_c['chem'] = NN_c.cid_i.apply(lambda i: CN.get(i))
#return NN_c
ax.set_xlim(xmin,xmax)
ax.set_ylim(ymin,ymax)
ax.set_axis_off()
ch_w,ch_h = chem_sz
ch_hw,ch_hh= 0.5*ch_w,0.5*ch_h
# Center
bg_rect = np.array([[xo-ch_hw,yo-ch_hh],[xo-ch_hw,yo+ch_hh],[xo+ch_hw,yo+ch_hh],[xo+ch_hw,yo-ch_hh]])
ch_name = "\n".join(wrap(CN[cid],lab_len))
tx_pos = xo,yo-ch_hh-2
#ax.add_patch(FancyBboxPatch((xi,yi),w,w,
# boxstyle="round,pad=0.3",mutation_scale=2,
# ec=(0.1,0.1,0.3,0.5), fc='white',fill=True, zorder=5))
ax.add_patch(mpatches.Polygon(bg_rect,edgecolor=(0.1,0.1,0.3,0.5),facecolor='white',fill=True,zorder=5))
J0 = CD.mol2json(Mols[cid],size=chem_sz)
CD.drawChemBS(J0,ax,x0=xo-ch_w*0.5,y0=yo-ch_h*0.5,atom_r=4,z0=6,lw=1)
ax.add_artist(text.Text(tx_pos[0],tx_pos[1],ch_name,color=(0,0,0),
ha='center',va='top',zorder=6,
fontproperties=fm.FontProperties(size=ch_fs)))
if Effects.shape[0]>0:
E_i = Effects.ix[cid]
dw = ch_w/len(E_i)
drawEffects(ax,E_i,xo-ch_hw,yo+ch_hh+4,dw,dw*1)
E_p=None
if pred and Effects.shape[0]>0:
E_p = E_i.copy()
E_p[:] = None
for e in Effects.columns:
A_i = Effects.ix[NN_0.index,e].dropna()
if len(A_i)==0: continue
S_i = NN_0[A_i.index]
E_p[e] = np.sum(A_i*S_i)/S_i.sum()
drawEffects(ax,E_p,xo-ch_hw,yo+ch_hh+2*dw,dw,dw*1,act='P')
for jj in range(NN_c.shape[0]):
C = NN_c.irow(jj)
bg_rect = np.array([[C.x-ch_hw,C.y-ch_hh],[C.x-ch_hw,C.y+ch_hh],[C.x+ch_hw,C.y+ch_hh],[C.x+ch_hw,C.y-ch_hh]])
ch_name = "\n".join(wrap(C.chem,lab_len))
tx_pos = C.x,C.y-ch_hh-2
#ax.add_patch(FancyBboxPatch((xi,yi),w,w,
# boxstyle="round,pad=0.3",mutation_scale=2,
# ec=(0.1,0.1,0.3,0.5), fc='white',fill=True, zorder=5))
ax.add_patch(mpatches.Polygon(bg_rect,edgecolor=(0.1,0.1,0.3,0.5),facecolor='white',fill=True,zorder=5))
ax.add_patch(FancyArrowPatch(posA=(C.x,C.y),posB=(xo,yo),
#patchB=p2,
color='grey',arrowstyle='->',
mutation_scale=15.0,
shrinkA=60,shrinkB=60,
linewidth=2,
#connectionstyle='angle3,angleA=95,angleB=0'
))
ax.add_artist(text.Text(C.lx,C.ly,'%3.2fc' % C.s,color=(1.0,0.1,0.1),
ha='center',va='top',zorder=10,
backgroundcolor='white',
fontproperties=fm.FontProperties(size=12,weight='bold')))
J0 = CD.mol2json(Mols[C.cid_i],size=chem_sz)
CD.drawChemBS(J0,ax,x0=C.x-ch_w*0.5,y0=C.y-ch_h*0.5,atom_r=atmr,z0=6,lw=1,xyoff=[2,-4])
ax.add_artist(text.Text(tx_pos[0],tx_pos[1],ch_name,color=(0,0,0),
ha='center',va='top',zorder=6,
fontproperties=fm.FontProperties(size=ch_fs)))
if Effects.shape[0]>0:
E_i = Effects.ix[C.cid_i]
dw = ch_w/len(E_i)
drawEffects(ax,E_i,C.x-ch_hw,C.y+ch_hh+4,dw,dw*1)
E_i[:] = None
w=15
drawEffects(ax,E_i,xmax-1.1*w*len(E_i),ymax-3*w,w,w,lab=True,fs=w*0.75)
E_i = pd.Series([2,0,None],index=['Act (+)','Act (-)','Act (NT)'])
drawEffects(ax,E_i,xmax-3*w*len(E_i),ymax-8*w,w,w,lab=True,fs=w*0.75,act='T')
E_i = pd.Series([2,0],index=['Pred (+)','Pred (-)'])
drawEffects(ax,E_i,xmax-1.5*w*len(E_i),ymax-8*w,w,w,lab=True,fs=w*0.75,act='P')
if save:
x = CN[cid].lower().replace(' ','-')
fig.savefig(FIG_DIR+'cbai-%s-nn%d-1.svg' % (x,c_knn))
fig.savefig(FIG_DIR+'cbai-%s-nn%d-1.png' % (x,c_knn),dpi=600)
return NN_c,E_p
def visBioNN(cid,Data,Sb,b_knn=5,b_s0=0.3):
NN_b = getKNN(cid,Sb,k0=b_knn,s0=b_s0,sim=True)
def visChmBioNN(cid,Data,Sc,Sb,c_knn=5,b_knn=5,c_s0=0.3,b_s0=0.3):
NN_b = getKNN(cid,Sb,k0=b_knn,s0=b_s0,sim=True)
NN_c = getKNN(cid,Sc,k0=c_knn,s0=c_s0,sim=True)
list(C1.ix[C1.chemical_name.apply(lambda i: i.lower().find("carbamate")>-1)].ID)
len(M2C)
CID1 = C1.ix[C1.chemical_name.apply(lambda i: i.lower().find("carbamate")>-1),'ID']
M1= set([C2M.get(i) for i in CID1])
[(i,len(M2C.get(i))) for i in M1 if i]
#M2C[25]
cc_id = 16
CID = M2C[cc_id]
# Chem
X = BCTb.ix[CID,Chm]
X = X.fillna(0)
X[X!=0]=1
Sc1 = 1-pd.DataFrame(squareform(pdist(X,'jaccard')),columns=CID,index=CID)
# Bio
X = BCTb.ix[CID,Bio]
X = X.fillna(0)
X[X!=0]=1
Sb1 = 1-pd.DataFrame(squareform(pdist(X,'jaccard')),columns=CID,index=CID)
# BC
X = BCTb.ix[CID,Bio+Chm]
X = X.fillna(0)
X[X!=0]=1
Sbc1 = 1-pd.DataFrame(squareform(pdist(X,'jaccard')),columns=CID,index=CID)
CH0=dict(( (i,ChmNm1.ix[i].chemical_name) for i in CID ))
len(CID)
CH0
fig = pl.figure(figsize=(15,15))
ax = pl.subplot(1,1,1)
X,E1 = visChmNN('C264618442',BCTb,Sc1,ax=ax,c_s0=0,chem_sz=(90,60),c_knn=9,
rs=1.0,r_min=180,
th0=0.5*math.pi,th_tot=1.3*math.pi,
lab_len=15,ch_fs=12,
Effects=BCTb.ix[CID,STUDIES],CN=CH0,
pred=True,
save=True)
fig = pl.figure(figsize=(15,15))
ax = pl.subplot(1,1,1)
X,E1 = visChmNN('C330541',BCTb,Sc1,ax=ax,c_s0=0,chem_sz=(90,60),c_knn=9,
rs=1.0,r_min=180,
th0=0.5*math.pi,th_tot=1.3*math.pi,
lab_len=15,ch_fs=12,
Effects=BCTb.ix[CID,STUDIES],CN=CH0,
pred=True,
save=True)
getCh
fig = pl.figure(figsize=(15,15))
ax = pl.subplot(1,1,1)
X,E1 = visChmNN('C330541',BCTb,Sb1,ax=ax,c_s0=0,chem_sz=(90,60),c_knn=9,
rs=0.8,r_min=180,
th0=0.5*math.pi,th_tot=1.3*math.pi,
lab_len=15,ch_fs=12,
Effects=BCTb.ix[CID,STUDIES],CN=CH0,
pred=True,
save=False)
fig = pl.figure(figsize=(15,15))
ax = pl.subplot(1,1,1)
X,E1 = visChmNN('C872504',BCTb,Sbc1,ax=ax,c_s0=0,chem_sz=(90,60),c_knn=9,
rs=0.8,r_min=180,
th0=0.5*math.pi,th_tot=1.3*math.pi,
lab_len=15,ch_fs=12,
Effects=BCTb.ix[CID,STUDIES],CN=CH0,
pred=True,
save=False)
S_df = CC._S_df
#np.triu_indices(10)
X_st = []
for cid in list(S_df.index):
X=S_df.ix[cid].drop(cid)
c_st=dict(ID=cid,n1=0,n2=0,n3=0,n4=0)
if not np.any(X>0):
X_st.append(c_st)
continue
c_st.update(dict(s_min=X[X>0].min(),s_max=X.max(),s_mn=X[X>0].mean(),
s_25p=np.percentile(X[X>0],25),s_5p=np.percentile(X[X>0],25),
s_75p=np.percentile(X[X>0],75),s_50p=np.percentile(X[X>0],50)))
# How many chemicals in all that have similarity > minimum value ?
c_st['n1']=np.sum(X>0.5)
# How many chemicals in all that have similarity > minimum value ?
c_st['n2']=np.sum(X>c_st['s_5p'])
# How many chemicals in all that have similarity > 25percentil ?
c_st['n3']=np.sum(X>c_st['s_50p'])
# How many chemicals in all that have similarity > 25percentil ?
c_st['n4']=np.sum(X>c_st['s_75p'])
X_st.append(c_st)
CHN_st = pd.DataFrame(X_st)
#CHN_st.s_min.isnull().sum()
CHN_st.s_max.hist(bins=20)
CHN_st.sort(columns=['n1','n2'],ascending=[0,0],inplace=True)
CHN_st.set_index('ID',inplace=True)
from chem.clust import *
CID = list(BCc.index)
M = [MOLS[i] for i in CID]
CC=ChemCluster()
CC.set_chem_ids(CID,M)
G1=CC.clust_chems(fp='torsion')
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina
CC.fp2dist1()
dists = CC._FP_D
#nfps = len(CC._FP)
#fps = CC._FP
#for i in range(1,nfps):
# sims = DataStructs.BulkTanimotoSimilarity(fps[i],fps[:i])
# dists.extend([1-x for x in sims])
cs = Butina.ClusterData(CC._FP_D,len(CC._FP),0.6,isDistData=True)
L = np.array([len(i) for i in cs])
pl.hist(L,20)
pl.ylim(0,100)
np.sum(L==1)
Res = []
for d0 in np.linspace(0,1,20):
cs = Butina.ClusterData(CC._FP_D,len(CC._FP),d0,isDistData=True)
L = np.array([len(i) for i in cs])
Res.append(dict(d0=d0,n1=np.sum(L==1),nc=len(L),n_mn=np.mean(L),n_sd=np.std(L)))
Res_df = pd.DataFrame(Res)
pl.plot(Res_df.d0,Res_df.n1,label='n1')
pl.plot(Res_df.d0,Res_df.nc,label='nc')
pl.legend()
pl.plot(Res_df.d0,Res_df.n_mn,label='n_mn')
pl.plot(Res_df.d0,Res_df.n_sd,label='n_sd')
pl.legend()
pl.ylim(0,100)
OECD_tb0 = pd.read_excel('/share/home/ishah/projects/Chem/data/categories/200115_chm-v1.xlsx',sheetname='Toolbox cat output')
OECD_tb0['ID']= OECD_tb0.CAS.apply(lambda i: 'C%s' % i.replace('-',''))
OECD_tb0.columns
Z = pd.merge(OECD_tb0,C2M_df,left_on='ID',right_on='ID')
EPA2CL = pd.pivot_table(Z,rows='US-EPA New Chemical Categories',cols='CL',values='ID',aggfunc=len,fill_value=0)
fig = pl.figure(figsize=(15,13))
ax = pl.subplot(1,1,1)
X = EPA2CL.copy()
X[X==0]=-10
ax.pcolor(X,cmap=cm.jet,vmin=-10,vmax=50,edgecolor='grey',lw=1)
ax.xaxis.tick_top()
ax.set_xticks(np.arange(X.shape[1])+0.5, minor=False)
ax.set_xticklabels(['%d' % i for i in X.columns],rotation=90)
for tick in ax.get_xticklines(): tick.set_visible(True)
for tick in ax.get_xticklabels(): tick.set_fontsize(9)
ax.yaxis.tick_left()
ax.set_ylim(top=X.shape[0])
ax.set_yticks(np.arange(X.shape[0])+0.5, minor=False)
ax.set_yticklabels([i[:35] for i in X.index])
for tick in ax.get_yticklines(): tick.set_visible(True)
for tick in ax.get_yticklabels():
tick.set_fontsize(9)