In [ ]:
 
In [2]:
%load_ext autoreload
%autoreload 2
%load_ext sql 
%pylab inline
%matplotlib inline

import matplotlib.text as text

import scipy.interpolate as interp
import pandas as pd
#from mp.txpepa import *
#from bio.data.toxplorer import *
#import bio.hts.apredica as apr
#from bio.hts.htsdb import *
#from bio.data.toxplorer import *
import viz.clust as cv
from chem.clust import *
from sklearn import (manifold, datasets, decomposition, ensemble, lda,
                     random_projection)
from sklearn.metrics.pairwise import euclidean_distances,manhattan_distances
import statsmodels.api as sm
import numpy.linalg as LA
import bio.hts.traj as traj
CD = ChemDrawing()
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector
stats = importr('stats')
from sklearn.neighbors import KNeighborsClassifier
from ml.readacross import *

mng.register_connection("hts-db","htsdb",username="ishah",
                        password="xxx",host='localhost')
mng.register_connection("txp-db","toxplorerdb",username="ishah",
                        password="xxx",host='localhost')

%sql postgresql://ishah:xxxx@localhost/chemicals

DAT_DIR = '/share/home/ishah/projects/Chem/data/tables/'
PKL_DIR = '/share/home/ishah/projects/Chem/data/pickle/'
RES_DIR='/share/home/ishah/projects/Chem/data/results/'
FIG_DIR='/share/home/ishah/projects/Chem/figs/readacross/'

import pickle

tmstmp = time.strftime("%m-%d-%Y",time.localtime())
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The sql extension is already loaded. To reload it, use:
  %reload_ext sql
Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['product', 'fromstring', 'connect', 'copy', 'compress', 'text', 'interp', 'flatten', 'repeat']
`%matplotlib` prevents importing * from pylab and numpy

Initialization

In [3]:
# Start the parallel machine
from ml.mlearn import *
from ml.readacross import *
import IPython.parallel as PP
%reload_ext autoreload
%autoreload 2

lb_view=None
d_view =None

def initParallel(Data=None):
    RC = PP.Client(profile='galaxy_parallel')
    global lb_view
    global d_view
    d_view = RC[:]
    d_view.block = True
    lb_view = RC.load_balanced_view()
    lb_view.block = True
    d_view.execute("""
%load_ext autoreload
%autoreload 2
import ml.mlearn as ml
from ml.mlearn import *
from ml.readacross import *
from chem.clust import *
    """)
    
    if Data:
        d_view.push(Data)

Load the data from pickles

In [44]:
print "\n".join([i for i in os.listdir(PKL_DIR) if i.startswith('tx-tr')])
[BCc,BCTc,BCTb,Bio,Chm,Tox] = pd.read_pickle(PKL_DIR+'tx-tr-ch-11-16-2015.pkl')
tx-tr-ch-11-16-2015.pkl
tx-tr-ch-02-12-2015.pkl
In [48]:
 
Out[48]:
(8650, 8113)
In [4]:
#print "\n".join(os.listdir(PKL_DIR))
[A0,C0,C1,B1] = pd.read_pickle(PKL_DIR+'chm-bio-02-12-2015.pkl')
[T2,T_mt] = pd.read_pickle(PKL_DIR+'tox-02-12-2015.pkl')
MOLS = pd.read_pickle(PKL_DIR+'mols-02-12-2015.pkl')
[C2M,M2C,Cl_st,C2] = pd.read_pickle(PKL_DIR+'clust-02-12-2015.pkl')
C2 = C1.set_index('ID')
In [5]:
# Chemical fingerprints

from rdkit import Chem
from rdkit.DataStructs import *
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem import AllChem


FP1 = pd.DataFrame([np.array(AllChem.GetMorganFingerprintAsBitVect(i,3,1024)) for i in MOLS.values()])
FP1.index=MOLS.keys()
FP1.columns = ['mrgn_%d'%i for i in FP1.columns]

FP2 = pd.DataFrame([np.array(MACCSkeys.FingerprintMol(i)) for i in MOLS.values()])
FP2.index=MOLS.keys()
FP2.columns = ['mccs_%d'%i for i in FP2.columns]

FP3 = pd.DataFrame([np.array(AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(i)) for i in MOLS.values()])
FP3.index=MOLS.keys()
FP3.columns = ['tptr_%d'%i for i in FP3.columns]

FP0 = pd.merge(FP1,FP2,left_index=True,right_index=True)
FP0 = pd.merge(FP0,FP3,left_index=True,right_index=True)
FP0.index.names=['ID']
In [6]:
#Distance calculations

# Distance calculations
# make a binary rep
X = BCTb[Bio]
CID = X.index
X = X.fillna(0)
X[X!=0]=1
D_bio = pd.DataFrame(squareform(pdist(X,'jaccard')),
                     columns=CID,index=CID)
S_bio = 1-D_bio
In [14]:
S_bio.shape
Out[14]:
(8648, 8648)
In [17]:
X = BCTb[Chm]
CID = X.index
X = X.fillna(0)
X[X!=0]=1
D_chm = pd.DataFrame(squareform(pdist(X,'jaccard')),
                     columns=CID,index=CID)
S_chm = 1-D_chm
In [18]:
X = BCTb[Bio+Chm]
CID = X.index
X = X.fillna(0)
X[X!=0]=1
D_bc = pd.DataFrame(squareform(pdist(X,'jaccard')),
                    columns=CID,index=CID)
S_bc = 1-D_bc

S_bc.shape
Out[18]:
(8648, 8648)

Number of chemicals

Cluster Stability Analysis

In [13]:
initParallel(Data=dict(BCc=BCc,MOLS=MOLS))

d_view.execute("""
from chem.clust import *
CID = list(BCc.index)
M = [MOLS[i] for i in CID]
CC=ChemCluster()
CC.set_chem_ids(CID,M)
CC.clust_chems(fp='torsion')
""")    
Out[13]:
<AsyncResult: finished>
In [14]:
print "Starting Cluster stability analysis of chemicals " + time.strftime("%d/%m/%Y %H:%M",time.localtime())

send_email(txt="Start",subj="CC.clust_stability k=10:150:5")
P_ALL = []
for sf in [0.5]:
    d_view.push(dict(sf=sf))
    P = lb_view.map(lambda K: CC.clust_stability(k_cent=K,km_ninit=5,km_iter=30,
                                                 sample_frac=sf,sample_iter=10,
                                                 dbg=False),
                    range(10,150,5))
    #Res_df=pd.DataFrame(P)
    print "> sf=%3.2f " % sf + time.strftime("%H:%M",time.localtime())
    
    P_ALL += P
    
CL_stability_df = pd.DataFrame(P_ALL)
CL_stability_df.to_csv(RES_DIR+ time.strftime("chm-cluster-stability-%d-%m-%Y.csv",time.localtime()),index=False)

send_email(txt="Done",subj="CC.clust_stability k=10:150:5")
Starting Cluster stability analysis of chemicals 13/04/2015 12:41
> sf=0.50 13:10
In [18]:
# In case loading from previous run:-
#print os.listdir(RES_DIR)
#CL_stability_df = pd.read_csv(RES_DIR+'chm-cluster-stab-torsion-12-02-2015.csv')

CL_stability_df = pd.read_csv(RES_DIR+'chm-cluster-stability-13-04-2015.csv')
In [21]:
import statsmodels.api as sm
import scipy.interpolate as interp
 
#Res_df=pd.DataFrame()
#Res_df.sort(['sf','k'],inplace=True)
#Res_df.set_index(['sf'],inplace=True)
def myHyp(Z): return np.sqrt(np.sum(Z**2))
 
SF = CL_stability_df.sf.unique()
Res_df=CL_stability_df.groupby(by=['sf','k','km_ninit']).aggregate(dict(mn=np.mean,
                                                               sd=myHyp))
Res_df=Res_df.reset_index().set_index('sf')
fig = pl.figure(figsize=(8,8))
 
#for sf in set(Res_df.index):
for sf in SF:
    R_sf = Res_df.xs(sf)
    Y = sm.nonparametric.lowess(R_sf.mn,R_sf.k,return_sorted=False,it=30,delta=0.5)    
    #pl.ylim(0.1,0.6)
    Yb = sm.nonparametric.lowess(R_sf.mn-R_sf.sd,R_sf.k,return_sorted=False,it=30,delta=0.5)    
    Yt = sm.nonparametric.lowess(R_sf.mn+R_sf.sd,R_sf.k,return_sorted=False,it=30,delta=0.5)    
 
    pl.plot(R_sf.k,Y,label=sf,lw=2)
    pl.fill_between(R_sf.k,Yb,Yt,
        alpha=0.2,edgecolor='#888888',facecolor='#ababdf',
        linewidth=1,linestyle='dashdot',antialiased=True)
 
pl.legend(loc=2,fontsize=8)
pl.title("Cluster stability analysis")
pl.ylabel("Cluster stability score")
pl.xlabel("K")
 
 
Out[21]:
<matplotlib.text.Text at 0x13fdfc50>
In [11]:
# older appraoch
CID = list(BCc.index)
M = [MOLS[i] for i in CID]
CC=ChemCluster()
CC.set_chem_ids(CID,M)
CC.clust_chems(fp='torsion')
CC.find_centroids(k_cent=100,km_ninit=10,km_iter=50)

G1 = CC.clust_centroids(metric='euclidean',link='complete')
In [85]:
# New approach to visualize clusters

Cent = pd.DataFrame([BCTb.ix[I,Chm].median() for I in M2C.values()],index=M2C.keys())
In [203]:
import viz.clust as cv
Cent_Dst= pd.DataFrame(squareform(pdist(Cent,'euclidean')))
#Cent_Dst= 1-Cent_Sim
Cent_Lnk= linkage(Cent_Dst.as_matrix(),method='weighted')
Cent_Dnd= dendrogram(Cent_Lnk,no_plot=True)
Cent_G0 = vc.makeRadialGraph(Cent_Dst.as_matrix(),Cent_Lnk,Cent_Dnd,ret='graph')
#Cent_Lnk= linkage()

Count the size of each node

In [204]:
# Set the size of each leaf based on the cluster size
for i,v in Cent_G0.nodes(data=True):
     if v.has_key('lv'):
            v['cl']=v['lv']
            v['n']=len(M2C[v['cl']])
            
# the root as no inputs
r0 = [k for k,n in Cent_G0.in_degree_iter() if n==0][0]
# Update the node size basde on depth first search from root successors
V0 = Cent_G0.node 
for p,C in nx.depth_first_search.dfs_successors(Cent_G0,source=r0).iteritems():
    #print p,C
    V0[p]['n'] = sum([V0[i]['n'] for i in C])

#for cl,CID in M2C.iteritems(): Cent_G0.node[cl]['n']=len(CID)
#V0=dict(Cent_G0.nodes(data=True))

#[v for i,v in Cent_G0.nodes(data=True) if v.has_key('lv') and v.get('lv')==0][:5]
In [258]:
# Extend the reach of the 
for k,vi in V0.iteritems():
    vi['tx']=1.05*v['r']*np.cos(v['th'])
    vi['ty']=1.05*v['r']*np.sin(v['th'])
In [259]:
Cent_G0.node[100]
Out[259]:
{'cx': 503.75,
 'cy': 14.6,
 'd': 14.604035976814677,
 'dg': 182.2613065326633,
 'n': 80,
 'px': -0.72456906364043117,
 'py': -0.028611601604389417,
 'r': 0.72513374747775972,
 's': 4.0,
 'th': 3.1810598979816245,
 'tx': -0.76079751682245267,
 'ty': -0.030042181684608888}
In [337]:
# Distribution of chemicals in each cluster

X = [len(v) for v in M2C.values()]
pl.hist(X,30,log='y')
[k for k,v in M2C.iteritems() if len(v)>100]
Out[337]:
[19, 35]
In [327]:
CL_NAMEs = {85:'Acetanilide',
            0 :'PFAA',
            3 :'Pyrrlidone',
            4 :'Cyclodiene',
            5 :'Nitobenzene',
            16:'Benzoylurea',
            53:'Organophosphate',
            33:'Aryloxyphenoxypropionic',
            56:'Pyrethroid',
            34:'Pyrethroid'}
In [328]:
import viz.clust as vc


def myLab1(k,x):
    
    if x.has_key('n') and x['n']<25:
        y=str(x['n'])        
    return y

def mkProp(f=log):
    def myfun(x):
        y = 0
        if x.has_key('n'):
            y = f(x['n'])
            return y
    return myfun

def mkNodeRule(n_min=20):
    def myfun(x):
        if x.has_key('n'):
            if x['n']>=n_min:
                return True
            else:
                return False
    return myfun


def mkNodeTxt(n_min=20,cl=False,txt_rot=0,col='red',sz=16,dr=1.07,cl_info=CL_NAMEs):
    def myfun(v):
        y=''
        if v.has_key('n') and v['n']>=n_min:
            #y=str(v['n'])
            if cl and v.has_key('cl'): 
                y = 'CL%d' % v['cl']
                if cl_info.has_key(v['cl']):
                    y += ' '+ cl_info.get(v['cl']) 
        #y='%5.1f' % v['dg']
        #dth = ifthen(v['dg']>90 and v['dg']<270,-0.1,0.1)
        tx=dr*v['r']*np.cos(v['th'])
        ty=dr*v['r']*np.sin(v['th'])
        txt_rot = ifthen(v['dg']>90 and v['dg']<270,180,0)
        hz_aln = ifthen(v['dg']>90 and v['dg']<270,'right','left')
        va_aln = ifthen(v['dg']>0 and v['dg']<180,'bottom','top')
        
        txt = text.Text(tx,ty,
                    y,rotation=v['dg']+txt_rot,
                    color=col,zorder=10, ha = hz_aln, va = va_aln,
                    fontproperties=fm.FontProperties(size=sz))
        return txt
    return myfun

#def mkNodeText()

vc.drawRadialPrunedDend(Cent_G0,figsz=[30,30],show_labs=True,show_nodes=True,lc='grey',
                        vsc=0.01,node_col='orange',lab_size=14,
                        node_prop=mkProp(f=lambda i: 0.5*log(i)),
                        node_rule=mkNodeRule(n_min=2),
                        #node_lab=mkNodeLab(n_min=30,cl=True),
                        node_lab=mkNodeTxt(cl=True,dr=1.08,n_min=1),
                        txt_rot=0,
                        dbg=False,
                        save=FIG_DIR+'clus-dend-n.svg')
In [19]:
ChmNm1 = C1[['ID','chemical_name']]
ChmNm1.set_index('ID',inplace=True)
ChmNm1.ix['C60355'].chemical_name
Out[19]:
u'Acetamide'
In [20]:
from scipy.spatial.distance import cdist
CID = M2C[0]
nd_min= 3
S_i=S_chm.ix[CID,CID]
X_i=BCTb.ix[CID,Chm]

# Which features are present in this cluster?

N_d = X_i.sum()
C_i = np.where(N_d>nd_min)
X_i = X_i.ix[:,X_i.columns[C_i]]
nr = X_i.shape[0]
X_i.loc['centroid']=X_i.median()

D_i = pd.DataFrame(squareform(pdist(X_i,'jaccard')),index=X_i.index,columns=X_i.index)
d=D_i.ix['centroid']
d.sort()
cid=d.index[1]
C0.ix[cid]
Out[20]:
DSSTox_RID                                                                  79359
DSSTox_GSID                                                                 37708
DSSTox_CID                                                                  17708
TS_ChemName                                                   PFOA, ammonium salt
TS_ChemName_Synonyms            Ammonium perfluorooctanoate (PFOA, ammonium salt)
TS_CASRN                                                                3825-26-1
TS_Description                                           single chemical compound
ChemNote                                                        parent [335-67-1]
STRUCTURE_Shown                                                   tested chemical
STRUCTURE_Formula                                                      C8H4F15NO2
STRUCTURE_MW                                                             431.0989
STRUCTURE_ChemType                                                defined organic
STRUCTURE_DefinedOrganicForm                                                 salt
STRUCTURE_IUPAC                                 ammonium pentadecafluorooctanoate
STRUCTURE_SMILES                [NH4+].FC(F)(C(F)(F)C([O-])=O)C(F)(F)C(F)(F)C(...
STRUCTURE_SMILES_Desalt         FC(F)(C(F)(F)C(=O)O)C(F)(F)C(F)(F)C(F)(F)C(F)(...
STRUCTURE_InChIS_v0             InChI=1S/C8HF15O2.H3N/c9-2(10,1(24)25)3(11,12)...
STRUCTURE_InChIKey_v0                                 YOALFLHFSFEMLP-UHFFFAOYSA-N
Substance_modify_yyyymmdd                                            2.008043e+07
Name: C3825261, dtype: object
In [74]:
d.index
Out[74]:
Index([u'centroid', u'C3825261', u'C335762', u'C375951', u'C335671', u'C2058948', u'C375859', u'C307244', u'C4151502', u'C754916', u'C1763231', u'C2795393', u'C3871996', u'C29420493'], dtype='object')
In [9]:
def simGraph(S,r_i=None,n0=4,s0=0.5,levels=3,dbg=False,rect=(0,0,1000,1000)):
    """
    Input root node
    Returns shell layout
    """

    def neighbours(ri,ignore=[]):
        X1 = S.ix[ri,:]
        X1 = X1[X1>s0]
        ni = ifthen(n0<len(X1),n0,len(X1))
        X1.drop([ri]+[i for i in ignore if i in X1.index],inplace=True)
        X1.sort(ascending=False)
        X1 = X1[:ni]
        return X1
    
    def children(ri,l=0,L=2,V=[]):
        l += 1
        X1 = neighbours(ri,ignore=V)
        E1 = [(r_j,ri,{'weight':1-s_ij}) for r_j,s_ij in X1.iteritems()]
        V1 = V+list(X1.index)
        if dbg: print '>'*(l+1),ri,l,len(X1)
        
        if l>L: return V1,E1
        
        for v in X1.index:
            V2,E2 = children(v,l,L,V1)
            E1 += E2
            V1 += V2
            
        return list(set(V1)),E1
    
    V3,E3 = children(r_i,l=0,L=levels,V=[])
    

    _NV = NetViz(id='ChemClust',directed=True,
                      node_list=V3,
                      edge_list=E3)
    _NV.layout(alg='neato',xmin=rect[0],ymin=rect[1],xmax=rect[2],ymax=rect[3])
    
    return _NV,r_i

def clustGraph(CID,X_chm,S,n0=4,nd_min=3,**args):
    """
    Input cluster with distances
    Returns shell layout
    """
    nd_min= 3
    S_i=S.ix[CID,CID]
    X_i=X_chm.ix[CID]

    # Which features are present in this cluster?
    N_d = X_i.sum()
    #C_i = np.where(N_d>nd_min)
    #X_i = X_i.ix[:,X_i.columns[C_i]]
    X_i.loc['centroid']=X_i.median()

    # Now find the vector closest to the centroid
    D_i = pd.DataFrame(squareform(pdist(X_i,'jaccard')),index=X_i.index,columns=X_i.index)
    d=D_i.ix['centroid']
    d.sort()
    r_i=d.index[1]

    return simGraph(S,r_i,**args)
In [16]:
ChmNm1 = C1[['ID','chemical_name']]
ChmNm1.set_index('ID',inplace=True)
ChmNm1.ix['C60355'].chemical_name
Out[16]:
u'Acetamide'
In [ ]:
rc0='C83794'
rc0='C80057'
rc0='C335762'
rc0='C103902'
rc0='C50351'
rc0='C1763231'
rc0='C23184669'
#NV0=simGraph(CC,rc0,s0=0.35,n0=5,levels=2)
#CH0=dict(( (i,ChmNm1.ix[i].chemical_name) for i in NV0.G.nodes(data=False)))
#P=NV0.layout(alg='neato',xmin=0,xmax=1000,ymin=0,ymax=1000)
#CH0[rc0]
In [21]:
from matplotlib.patches import Ellipse, Circle, FancyArrowPatch,FancyBboxPatch
from textwrap import wrap
from chem.draw import *
STUDIES = ['mgr', 'chr', 'sac', 'sub', 'dev', 'rep', 'oth', 'acu', 'neu', 'dnt']
CH0 = None

def drawEffects0(ax,E0,x0,y0,w,h,z0=10,lab=False,fs=10):
    xi = x0
    yi = y0
    for e,v in E0.iteritems():
        fc = 'white'
        ec = 'grey'
        if v>0:
            fc = 'red'
            ec = 'white'
        elif v==0:
            fc = 'green'
            ec = 'white'
        else:
            fc = 'white'
            ec = 'grey'
            
        ax.add_patch(FancyBboxPatch((xi,yi),w,h,
                    boxstyle="round,pad=0.2",
                    ec=ec, fc=fc,fill=True, zorder=z0))
    
        if lab:
            ax.add_artist(text.Text(xi+w*0.5,yi+h+2,e,color=(0.2,0.2,0.2),
                            ha='left',va='bottom',zorder=z0,rotation=90.0,
                            fontproperties=fm.FontProperties(size=fs)))        
        xi += w
        
def vizChemClust(ax,r0,NV0,layout='neato',cl_id=None,AO=STUDIES,Mols=MOLS,ChNm=CH0,
                 ch_w=60,ch_h=60,lw=2,
                 save=False,fmt='svg',
                 xmin=0,ymin=0,W=1500,H=1500,lab_fs=12):
    ax.set_xlim(xmin,xmin+W*1.1)
    ax.set_ylim(ymin,ymin+H*1.1)

    if layout:
        NV0.layout(alg=layout,xmin=0,xmax=W*0.9,ymin=0,ymax=W*0.9)
        
    L0 = NV0.P
    # Store chemical bbox patches
    RECT={}
    
    # Get Tox data    
    T0=BCTb.ix[L0.keys(),AO]
    w,h=ch_w,ch_h
    for vi in NV0.G.nodes(data=False):
        xi,yi = L0[vi]
        xi,yi = xi-w*0.5,yi-w*0.5
        J0 = CD.mol2json(Mols[vi],size=(w,h))
        bg_rect = np.array([[xi,yi],[xi,yi+h],[xi+w,yi+h],[xi+w,yi]])
        ch_name = "\n".join(wrap(ChNm[vi],20))
        bb = mpatches.Polygon(bg_rect,edgecolor=ifthen(vi==r0,'red',(0.1,0.1,0.3,0.5)),
                              facecolor='white',
                              fill=True,zorder=5)
        RECT[vi]=bb
        ax.add_patch(bb)
        CD.drawChemBS(J0,ax,x0=xi,y0=yi,atom_r=2,z0=6,lw=0.8)
        #tx_pos = xi+w*0.5,yi+w
        tx_pos = xi+w*0.5,yi-2
        ax.add_artist(text.Text(tx_pos[0],tx_pos[1],ch_name,color=(0.2,0.2,0.2),
                                ha='center',va='top',zorder=6,
                                fontproperties=fm.FontProperties(size=lab_fs)))

        # Show tox information
        E1 = T0.ix[vi]
        dw = w/len(E1)
        drawEffects0(ax,E1,xi,yi+h+4,dw,dw*1)

    for v1,v2,EA in NV0.G.edges_iter(data=True):
        x1,y1 = L0[v1]
        x2,y2 = L0[v2]
        shrink=0.1*(w+h)
        ax.add_patch(FancyArrowPatch(posA=(x1,y1),posB=(x2,y2),
                                     patchA=RECT[v1],
                                     patchB=RECT[v2],
                                     color='grey',arrowstyle='<|-|>',
                                     mutation_scale=10.0,
                                     shrinkA=shrink,shrinkB=shrink,
                                     linewidth=lw,
                                     connectionstyle='angle3,angleA=95,angleB=0'
                                    ))
    
    E1[:] = 0
    drawEffects0(ax,E1,xmin+W-150,ymin+H-100,20,20,lab=True,fs=12)

    if save:
        chmnm = ChNm[r0].lower().replace(' ','-')
        if cl_id:
            chmnm = 'cl-%d-' % cl_id + chmnm
        fig.savefig(FIG_DIR+"nn-ra-%s-2.%s" % (chmnm,fmt),dpi=600)
In [54]:
#C1.ix[C1.chemical_name.apply(lambda i: i.find("PFOS")>-1)]
C1.ix[C1.chemical_name.apply(lambda i: i.find("Diuron")>-1)]
C2M['C330541']
CID = M2C[16]
In [55]:
CID
Out[55]:
[u'C330541',
 u'C709988',
 u'C330552',
 u'C113136779',
 u'C86479063',
 u'C103055078',
 u'C116714466',
 u'C4638486',
 u'C50657',
 u'C181640095',
 u'C264618442',
 u'C648917139']
In [56]:
pd.merge(C2.ix[CID],BCTb.ix[CID,STUDIES],left_index=True,right_index=True)
Out[56]:
DSSTox_GSID chemical_casrn chemical_name mgr chr sac sub dev rep oth acu neu dnt
ID
C330541 20446 330-54-1 Diuron 0.58 1.00 NaN 0.3 2.0 NaN NaN NaN NaN NaN
C709988 22111 709-98-8 Propanil 4.00 0.75 NaN 5.0 0.8 NaN NaN NaN NaN NaN
C330552 24163 330-55-2 Linuron 0.74 0.29 NaN NaN 5.0 NaN NaN NaN NaN NaN
C113136779 32600 113136-77-9 Cyclanilide 1.90 1.30 NaN 1.6 3.0 NaN NaN NaN NaN NaN
C86479063 32620 86479-06-3 Hexaflumuron NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
C103055078 34357 103055-07-8 Lufenuron NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
C116714466 34773 116714-46-6 Novaluron 74.00 1.10 NaN 0.7 100.0 NaN NaN NaN NaN NaN
C4638486 37749 4638-48-6 5-Chlorosalicylanilide NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
C50657 40362 50-65-7 Niclosamide NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
C181640095 47339 181640-09-5 SR144190 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
C264618442 47341 264618-44-2 SSR146977 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
C648917139 47376 648917-13-9 AVE2865 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

Search chemicals by similar names

In [ ]:
CID = C1.ix[C1.chemical_name.apply(lambda i: i.find("phenol")>-1),'ID']
X_cl=BCTb.ix[CID,Chm]
S_cl=S_chm.ix[CID,CID]
NV0,rc0=clustGraph(CID,X_cl,S_cl,s0=0.1,n0=5,levels=3)

CH0=dict(( (i,ChmNm1.ix[i].chemical_name) for i in NV0.G.nodes(data=False)))
fig = pl.figure(figsize=(10,10))
ax = pl.subplot(1,1,1)
ax.set_axis_off()
P=NV0.layout(alg='neato',xmin=0,xmax=1800,ymin=0,ymax=1800)
NV0.draw()
CH0[rc0]
#list(CID)

Search chemicals by clusterid

In [12]:
R= C1.ix[C1.chemical_name.apply(lambda i: i.lower().find("propiconazole")>-1)]
print R
C2M['C94361065']
             ID  DSSTox_GSID chemical_casrn  chemical_name
2059  C60207901        24280     60207-90-1  Propiconazole
Out[12]:
20
In [ ]:
cc_id=0
CID=M2C[cc_id]
X_cl=BCTb.ix[CID,Chm]
S_cl=S_chm.ix[CID,CID]
NV0,rc0=clustGraph(CID,X_cl,S_cl,s0=0.1,n0=5,levels=3)

CH0=dict(( (i,ChmNm1.ix[i].chemical_name) for i in NV0.G.nodes(data=False)))
fig = pl.figure(figsize=(10,10))
ax = pl.subplot(1,1,1)
ax.set_axis_off()
P=NV0.layout(alg='neato',xmin=0,xmax=1200,ymin=0,ymax=1200)
NV0.draw()
CH0[rc0]
In [63]:
#CD = ChemDrawing()
fig = pl.figure(figsize=(15,15))
ax = pl.subplot(1,1,1)
ax.set_axis_off()

vizChemClust(ax,rc0,NV0,cl_id=cc_id,layout='neato',ChNm=CH0,save=True,fmt='png',
             ch_w=150,ch_h=90,
             W=2000,H=2000,lw=1,
             lab_fs=12,xmin=100,ymin=0)
In [260]:
 
Out[260]:
C101530103 C107534963 C112281773 C114369436 C116255482 C119446683 C125116236 C125225287 C130726680 C131983727 ... C66246886 C67915315 C72479266 C74512122 C79983714 C83657243 C84625616 C86386734 C94361065 C99592399
0 1225 359 642 465 502 623 347 282 1078 267 ... 413 996 788 1120 878 436 848 593 640 549
1 1076 1029 378 1126 638 786 252 361 466 494 ... 788 715 253 583 504 214 896 950 1195 164

2 rows × 35 columns

Nearest neighbour viz </h3>

In [30]:
CD = ChemDrawing()
Out[30]:
array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])
In [402]:
from matplotlib.patches import Ellipse, Circle, FancyArrowPatch,FancyBboxPatch
from textwrap import wrap
from chem.draw import *
STUDIES = ['mgr', 'chr', 'sac', 'sub', 'dev', 'rep', 'oth', 'acu', 'neu', 'dnt']


def drawEffects(ax,E0,x0,y0,w,h,z0=10,lab=False,fs=10,
                act='T',
                col_act=(0.8,0.1,0.1),
                col_act_p=(1.0,0.5,0.1),
                col_inact=(0.1,0.8,0.1),
                col_inact_p=(0.1,0.1,0.8),
               ):
    xi = x0
    yi = y0
    for e,v in E0.iteritems():
        fc = 'white'
        ec = 'grey'
        if v>0:
            fc = ifthen(act=='T',col_act,col_act_p)
            ec = 'white'
        elif v==0:
            fc = ifthen(act=='T',col_inact,col_inact_p)
            ec = 'white'
        else:
            fc = 'grey'
            ec = 'white'
            
        ax.add_patch(FancyBboxPatch((xi,yi),w,h,
                    boxstyle="round,pad=0.2",
                    ec=ec, fc=fc,fill=True, zorder=z0))
    
        if lab:
            ax.add_artist(text.Text(xi+w*0.5,yi+h+2,e,color=(0.2,0.2,0.2),
                            ha='left',va='bottom',zorder=z0,rotation=90.0,
                            fontproperties=fm.FontProperties(size=fs)))        
        xi += w
        
def visChmNN(cid,Data,Sc,ax=None,c_knn=5,c_s0=0.3,Mols=MOLS,
             chem_sz=(40,60),CN=CH0,
             rs=10,r_min=100,
             atmr=5,ch_fs=12,
             th0=0.5*math.pi,th_tot=math.pi,
             lab_len=20,
             BBox=(-500,-500,500,500),Origin=[0,0],
             Effects=pd.DataFrame(),
             pred = False,
             save = False
            ):
    
    NN_0 = getKNN(cid,Sc,k0=c_knn,s0=c_s0,sim=True)
    NN_c = pd.DataFrame(NN_0.reset_index())
    NN_c.columns=['cid_i','s']
    NN_c['d']=1-NN_c.s
    NN_c.sort('d',inplace=True)
    dth = th_tot/(NN_c.shape[0]-1)
    xo,yo = Origin
    xmin,ymin,xmax,ymax=BBox

    r_max = abs(xmax-xo)-r_min
    
    #NN_c['r'] = r_max*(NN_c.s**2)*rs
    NN_c['r'] = r_min+r_max*NN_c.d*rs
    NN_c['th'] = th0+(np.arange(0,NN_c.shape[0]) * dth)
    NN_c['x'] = NN_c.r*np.cos(NN_c.th)
    NN_c['y'] = NN_c.r*np.sin(NN_c.th)
    NN_c['lx']= 0.5*(NN_c.x+xo)
    NN_c['ly']= 0.5*(NN_c.y+yo)
    
    NN_c['chem'] = NN_c.cid_i.apply(lambda i: CN.get(i))
    #return NN_c

    ax.set_xlim(xmin,xmax)
    ax.set_ylim(ymin,ymax)
    ax.set_axis_off()
    
    ch_w,ch_h = chem_sz
    ch_hw,ch_hh= 0.5*ch_w,0.5*ch_h
    # Center
    
    bg_rect = np.array([[xo-ch_hw,yo-ch_hh],[xo-ch_hw,yo+ch_hh],[xo+ch_hw,yo+ch_hh],[xo+ch_hw,yo-ch_hh]])
    ch_name = "\n".join(wrap(CN[cid],lab_len))
    tx_pos = xo,yo-ch_hh-2
    
    #ax.add_patch(FancyBboxPatch((xi,yi),w,w,
    #                            boxstyle="round,pad=0.3",mutation_scale=2,
    #                            ec=(0.1,0.1,0.3,0.5), fc='white',fill=True, zorder=5))
    ax.add_patch(mpatches.Polygon(bg_rect,edgecolor=(0.1,0.1,0.3,0.5),facecolor='white',fill=True,zorder=5))
    
    J0 = CD.mol2json(Mols[cid],size=chem_sz)
    CD.drawChemBS(J0,ax,x0=xo-ch_w*0.5,y0=yo-ch_h*0.5,atom_r=4,z0=6,lw=1)
    ax.add_artist(text.Text(tx_pos[0],tx_pos[1],ch_name,color=(0,0,0),
                            ha='center',va='top',zorder=6,
                            fontproperties=fm.FontProperties(size=ch_fs)))
    
    if Effects.shape[0]>0:
        E_i = Effects.ix[cid]
        dw = ch_w/len(E_i)
        drawEffects(ax,E_i,xo-ch_hw,yo+ch_hh+4,dw,dw*1)
        
    E_p=None
    if pred and Effects.shape[0]>0:
        E_p = E_i.copy()
        E_p[:] = None
        for e in Effects.columns:
            A_i = Effects.ix[NN_0.index,e].dropna()
            if len(A_i)==0: continue
            S_i = NN_0[A_i.index]
            E_p[e] = np.sum(A_i*S_i)/S_i.sum()
        drawEffects(ax,E_p,xo-ch_hw,yo+ch_hh+2*dw,dw,dw*1,act='P')
        
    for jj in range(NN_c.shape[0]):
        C = NN_c.irow(jj)
        bg_rect = np.array([[C.x-ch_hw,C.y-ch_hh],[C.x-ch_hw,C.y+ch_hh],[C.x+ch_hw,C.y+ch_hh],[C.x+ch_hw,C.y-ch_hh]])
        ch_name = "\n".join(wrap(C.chem,lab_len))
        tx_pos = C.x,C.y-ch_hh-2

        #ax.add_patch(FancyBboxPatch((xi,yi),w,w,
        #                            boxstyle="round,pad=0.3",mutation_scale=2,
        #                            ec=(0.1,0.1,0.3,0.5), fc='white',fill=True, zorder=5))
        ax.add_patch(mpatches.Polygon(bg_rect,edgecolor=(0.1,0.1,0.3,0.5),facecolor='white',fill=True,zorder=5))

        ax.add_patch(FancyArrowPatch(posA=(C.x,C.y),posB=(xo,yo),
                                     #patchB=p2,
                                     color='grey',arrowstyle='->',
                                     mutation_scale=15.0,
                                     shrinkA=60,shrinkB=60,
                                     linewidth=2,
                                     #connectionstyle='angle3,angleA=95,angleB=0'
                                    ))
        
        ax.add_artist(text.Text(C.lx,C.ly,'%3.2fc' % C.s,color=(1.0,0.1,0.1),
                        ha='center',va='top',zorder=10,
                        backgroundcolor='white',
                        fontproperties=fm.FontProperties(size=12,weight='bold')))

        J0 = CD.mol2json(Mols[C.cid_i],size=chem_sz)
        CD.drawChemBS(J0,ax,x0=C.x-ch_w*0.5,y0=C.y-ch_h*0.5,atom_r=atmr,z0=6,lw=1,xyoff=[2,-4])

        ax.add_artist(text.Text(tx_pos[0],tx_pos[1],ch_name,color=(0,0,0),
                        ha='center',va='top',zorder=6,
                        fontproperties=fm.FontProperties(size=ch_fs)))
        
        if Effects.shape[0]>0:
            E_i = Effects.ix[C.cid_i]
            dw = ch_w/len(E_i)
            drawEffects(ax,E_i,C.x-ch_hw,C.y+ch_hh+4,dw,dw*1)

    E_i[:] = None
    w=15
    drawEffects(ax,E_i,xmax-1.1*w*len(E_i),ymax-3*w,w,w,lab=True,fs=w*0.75)

    E_i = pd.Series([2,0,None],index=['Act (+)','Act (-)','Act (NT)'])
    drawEffects(ax,E_i,xmax-3*w*len(E_i),ymax-8*w,w,w,lab=True,fs=w*0.75,act='T')

    E_i = pd.Series([2,0],index=['Pred (+)','Pred (-)'])
    drawEffects(ax,E_i,xmax-1.5*w*len(E_i),ymax-8*w,w,w,lab=True,fs=w*0.75,act='P')

    if save:
        x = CN[cid].lower().replace(' ','-')
        fig.savefig(FIG_DIR+'cbai-%s-nn%d-1.svg' % (x,c_knn))
        fig.savefig(FIG_DIR+'cbai-%s-nn%d-1.png' % (x,c_knn),dpi=600)
        
    return NN_c,E_p
    
    
    
def visBioNN(cid,Data,Sb,b_knn=5,b_s0=0.3):
    NN_b = getKNN(cid,Sb,k0=b_knn,s0=b_s0,sim=True)
    

def visChmBioNN(cid,Data,Sc,Sb,c_knn=5,b_knn=5,c_s0=0.3,b_s0=0.3):
    NN_b = getKNN(cid,Sb,k0=b_knn,s0=b_s0,sim=True)
    NN_c = getKNN(cid,Sc,k0=c_knn,s0=c_s0,sim=True)
    
    
In [61]:
list(C1.ix[C1.chemical_name.apply(lambda i: i.lower().find("carbamate")>-1)].ID)
Out[61]:
[u'C137291',
 u'C598550',
 u'C13927770',
 u'C20624253',
 u'C136232',
 u'C14324551',
 u'C886748',
 u'C148185',
 u'C128041',
 u'C1111780',
 u'C55406536',
 u'C64006',
 u'C17945050',
 u'C23432624',
 u'C59719674']
In [352]:
len(M2C)
Out[352]:
100
In [355]:
CID1 =  C1.ix[C1.chemical_name.apply(lambda i: i.lower().find("carbamate")>-1),'ID']
M1= set([C2M.get(i) for i in CID1])
[(i,len(M2C.get(i))) for i in M1 if i]
#M2C[25]
Out[355]:
[(80, 41), (1, 47), (98, 32), (35, 369)]
In [408]:
cc_id = 16
CID = M2C[cc_id]
# Chem
X = BCTb.ix[CID,Chm]
X = X.fillna(0)
X[X!=0]=1
Sc1 = 1-pd.DataFrame(squareform(pdist(X,'jaccard')),columns=CID,index=CID)


# Bio
X = BCTb.ix[CID,Bio]
X = X.fillna(0)
X[X!=0]=1
Sb1 = 1-pd.DataFrame(squareform(pdist(X,'jaccard')),columns=CID,index=CID)

# BC
X = BCTb.ix[CID,Bio+Chm]
X = X.fillna(0)
X[X!=0]=1
Sbc1 = 1-pd.DataFrame(squareform(pdist(X,'jaccard')),columns=CID,index=CID)

CH0=dict(( (i,ChmNm1.ix[i].chemical_name) for i in CID ))

len(CID)
CH0
Out[408]:
{u'C103055078': u'Lufenuron',
 u'C113136779': u'Cyclanilide',
 u'C116714466': u'Novaluron',
 u'C181640095': u'SR144190',
 u'C264618442': u'SSR146977',
 u'C330541': u'Diuron',
 u'C330552': u'Linuron',
 u'C4638486': u'5-Chlorosalicylanilide',
 u'C50657': u'Niclosamide',
 u'C648917139': u'AVE2865',
 u'C709988': u'Propanil',
 u'C86479063': u'Hexaflumuron'}
In [410]:
fig = pl.figure(figsize=(15,15))
ax = pl.subplot(1,1,1)
X,E1 = visChmNN('C264618442',BCTb,Sc1,ax=ax,c_s0=0,chem_sz=(90,60),c_knn=9,
                rs=1.0,r_min=180,
                th0=0.5*math.pi,th_tot=1.3*math.pi,
                lab_len=15,ch_fs=12,
                Effects=BCTb.ix[CID,STUDIES],CN=CH0,
                pred=True,
                save=True)
In [396]:
fig = pl.figure(figsize=(15,15))
ax = pl.subplot(1,1,1)
X,E1 = visChmNN('C330541',BCTb,Sc1,ax=ax,c_s0=0,chem_sz=(90,60),c_knn=9,
                rs=1.0,r_min=180,
                th0=0.5*math.pi,th_tot=1.3*math.pi,
                lab_len=15,ch_fs=12,
                Effects=BCTb.ix[CID,STUDIES],CN=CH0,
                pred=True,
                save=True)
In [ ]:
getCh
In [397]:
fig = pl.figure(figsize=(15,15))
ax = pl.subplot(1,1,1)

X,E1 = visChmNN('C330541',BCTb,Sb1,ax=ax,c_s0=0,chem_sz=(90,60),c_knn=9,
                rs=0.8,r_min=180,
                th0=0.5*math.pi,th_tot=1.3*math.pi,
                lab_len=15,ch_fs=12,
                Effects=BCTb.ix[CID,STUDIES],CN=CH0,
                pred=True,
                save=False)
In [391]:
fig = pl.figure(figsize=(15,15))
ax = pl.subplot(1,1,1)

X,E1 = visChmNN('C872504',BCTb,Sbc1,ax=ax,c_s0=0,chem_sz=(90,60),c_knn=9,
                rs=0.8,r_min=180,
                th0=0.5*math.pi,th_tot=1.3*math.pi,
                lab_len=15,ch_fs=12,
                Effects=BCTb.ix[CID,STUDIES],CN=CH0,
                pred=True,
                save=False)

How many chemicals in the neighbourhood of each on and how close/far ?

In [222]:
S_df = CC._S_df
#np.triu_indices(10)
X_st = []

for cid in list(S_df.index):
    X=S_df.ix[cid].drop(cid)
    c_st=dict(ID=cid,n1=0,n2=0,n3=0,n4=0)
    if not np.any(X>0):
        X_st.append(c_st)
        continue
    
    
    c_st.update(dict(s_min=X[X>0].min(),s_max=X.max(),s_mn=X[X>0].mean(),
                     s_25p=np.percentile(X[X>0],25),s_5p=np.percentile(X[X>0],25),
                     s_75p=np.percentile(X[X>0],75),s_50p=np.percentile(X[X>0],50)))
        
    # How many chemicals in all that have similarity > minimum value ? 
    c_st['n1']=np.sum(X>0.5)
    
    # How many chemicals in all that have similarity > minimum value ? 
    c_st['n2']=np.sum(X>c_st['s_5p'])

    # How many chemicals in all that have similarity > 25percentil ? 
    c_st['n3']=np.sum(X>c_st['s_50p'])

    # How many chemicals in all that have similarity > 25percentil ? 
    c_st['n4']=np.sum(X>c_st['s_75p'])

    X_st.append(c_st)
    
CHN_st = pd.DataFrame(X_st)
In [209]:
#CHN_st.s_min.isnull().sum()
CHN_st.s_max.hist(bins=20)
Out[209]:
<matplotlib.axes.AxesSubplot at 0x1fcde510>
In [223]:
CHN_st.sort(columns=['n1','n2'],ascending=[0,0],inplace=True)
CHN_st.set_index('ID',inplace=True)

Develop alternate approach for clustering chemicals

1. Use hierarchical agglomerative clustering / try different linkage methods

2. Cut the dendrogram based on different criteria

3. Visualize

See additional information in Comp/Clust

In [359]:
from chem.clust import *
CID = list(BCc.index)
M = [MOLS[i] for i in CID]
CC=ChemCluster()
CC.set_chem_ids(CID,M)
G1=CC.clust_chems(fp='torsion')
In [491]:
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina

CC.fp2dist1()
dists = CC._FP_D
#nfps = len(CC._FP)
#fps = CC._FP
#for i in range(1,nfps):
#    sims = DataStructs.BulkTanimotoSimilarity(fps[i],fps[:i])
#    dists.extend([1-x for x in sims])
In [492]:
cs = Butina.ClusterData(CC._FP_D,len(CC._FP),0.6,isDistData=True)
L = np.array([len(i) for i in cs])

pl.hist(L,20)
pl.ylim(0,100)
np.sum(L==1)
Out[492]:
520
In [386]:
Res = []
for d0 in np.linspace(0,1,20):
    cs = Butina.ClusterData(CC._FP_D,len(CC._FP),d0,isDistData=True)
    L = np.array([len(i) for i in cs])
    Res.append(dict(d0=d0,n1=np.sum(L==1),nc=len(L),n_mn=np.mean(L),n_sd=np.std(L)))
Res_df = pd.DataFrame(Res)
pl.plot(Res_df.d0,Res_df.n1,label='n1')
pl.plot(Res_df.d0,Res_df.nc,label='nc')
pl.legend()
Out[386]:
<matplotlib.legend.Legend at 0x16c223d0>
In [388]:
pl.plot(Res_df.d0,Res_df.n_mn,label='n_mn')
pl.plot(Res_df.d0,Res_df.n_sd,label='n_sd')
pl.legend()
pl.ylim(0,100)
Out[388]:
(0, 100)

EPA Categories and clusters

In [ ]:
OECD_tb0 = pd.read_excel('/share/home/ishah/projects/Chem/data/categories/200115_chm-v1.xlsx',sheetname='Toolbox cat output')
OECD_tb0['ID']= OECD_tb0.CAS.apply(lambda i: 'C%s' % i.replace('-',''))
In [554]:
OECD_tb0.columns
Out[554]:
Index([u'CAS', u'NAME', u'SMILES', u'OECD HPV Chemical Categories', u'Substance Type', u'US-EPA New Chemical Categories', u'Groups of elements', u'CAS.1', u'ID'], dtype='object')
In [555]:
Z = pd.merge(OECD_tb0,C2M_df,left_on='ID',right_on='ID')
EPA2CL = pd.pivot_table(Z,rows='US-EPA New Chemical Categories',cols='CL',values='ID',aggfunc=len,fill_value=0)
In [559]:
fig = pl.figure(figsize=(15,13))
ax  = pl.subplot(1,1,1)
X = EPA2CL.copy()
X[X==0]=-10
ax.pcolor(X,cmap=cm.jet,vmin=-10,vmax=50,edgecolor='grey',lw=1)
ax.xaxis.tick_top()
ax.set_xticks(np.arange(X.shape[1])+0.5, minor=False)
ax.set_xticklabels(['%d' % i for i in X.columns],rotation=90)
for tick in ax.get_xticklines(): tick.set_visible(True)
for tick in ax.get_xticklabels(): tick.set_fontsize(9)

ax.yaxis.tick_left()
ax.set_ylim(top=X.shape[0])
ax.set_yticks(np.arange(X.shape[0])+0.5, minor=False)
ax.set_yticklabels([i[:35] for i in X.index])
for tick in ax.get_yticklines(): tick.set_visible(True)
for tick in ax.get_yticklabels(): 
    tick.set_fontsize(9)