GenRA Data preparation and loading

The data was downloaded from the NCCT Website into a directory DAT_DIR

In [5]:
%load_ext autoreload
%autoreload 2
%load_ext sql 
import matplotlib.text as text

import scipy.interpolate as interp
import pandas as pd
#from mp.txpepa import *
#from bio.data.toxplorer import *
#import bio.hts.apredica as apr
#from bio.hts.htsdb import *
#from bio.data.toxplorer import *
#import viz.clust as cv
#from chem.clust import *
from sklearn import (manifold, datasets, decomposition, ensemble, lda,
                     random_projection)
from sklearn.metrics.pairwise import euclidean_distances,manhattan_distances
import statsmodels.api as sm
import numpy.linalg as LA
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector
stats = importr('stats')
from sklearn.neighbors import KNeighborsClassifier
from genra.readacross import *

# Explicit stdlib imports: `time` (tmstmp below) and `os` are used by this
# notebook but were never imported by name, so they only worked if a star
# import happened to expose them.
import os
import time

# Register the two database connections. `mng` is not imported explicitly in
# this notebook -- presumably it comes from `from genra.readacross import *`
# (TODO confirm). The password can be supplied through the environment so it
# does not have to live in the notebook; the previous literal stays as fallback.
_db_password = os.environ.get('GENRA_DB_PASSWORD', 'xxx')
mng.register_connection("hts-db","htsdb",username="ishah",
                        password=_db_password,host='localhost')
mng.register_connection("txp-db","toxplorerdb",username="ishah",
                        password=_db_password,host='localhost')

#%sql postgresql://ishah:xxx@localhost/chemicals
#CD = ChemDrawing()

# Input tables and pickle output directory. Note that DAT_DIR is re-assigned
# in the "Identify data files" cell before any archive is read.
DAT_DIR = '/share/home/ishah/projects/Chem/data/tables/'
PKL_DIR = '/share/home/ishah/projects/Chem/data/pickle/'
import pickle

# MM-DD-YYYY stamp appended to the names of the pickle files written below
tmstmp = time.strftime("%m-%d-%Y",time.localtime())

Identify data files

In [7]:
import os
import zipfile

# Directory that holds the downloaded zip archives
DAT_DIR = '/share/home/ishah/projects/ToxCast/data/TX14/'

# FD maps every member name found in an archive to the archive file that
# contains it. Member names keep their in-archive directory prefix
# (e.g. 'toxrefdb/...'), which is the form loadTXdata() is called with.
FD = {}
for F in os.listdir(DAT_DIR):
    archive_path = os.path.join(DAT_DIR, F)
    # Skip sub-directories and stray non-zip files instead of crashing on them
    if not zipfile.is_zipfile(archive_path):
        continue
    print('\n> %s \n' % F)
    # Context manager guarantees the archive handle is closed
    with zipfile.ZipFile(archive_path,'r') as ZF1:
        members = [i.filename for i in ZF1.filelist]
    print("\n\t".join(members))

    for fn in members:
        FD[fn]=F
        
def loadTXdata(df,fd=FD,read_fn=pd.read_csv,dat_dir=DAT_DIR):
    """Extract one archive member and load it with `read_fn`.

    Parameters
    ----------
    df : str
        Member name exactly as stored in the archive (a key of `fd`).
    fd : dict
        Maps member name -> archive file name inside `dat_dir`.
    read_fn : callable
        Called with the path of the extracted file; its result is returned.
    dat_dir : str
        Directory containing the archives.

    Raises
    ------
    KeyError
        If `df` is not a key of `fd`.

    The member is extracted into the current working directory (default
    behaviour of ZipFile.extract) and is left there after loading.
    """
    import os

    zf = fd[df]
    # Close the archive as soon as the member has been extracted
    with zipfile.ZipFile(os.path.join(dat_dir, zf),'r') as archive:
        extracted_path = archive.extract(df)
    return read_fn(extracted_path)
> ChemicalFiles.zip 

TOX21IDs_v4b_23Oct2014_QCdetails.xlsx
	TOX21S_v4b_8599_23Oct2014.xlsx
	TOX21S_v4b_CID_structures.sdf
	ToxcastChemicalFiles_ReadMe_20141112.txt

> toxrefdb.zip 

toxrefdb/
	toxrefdb/README_ToxRefDB_20141106.docx
	toxrefdb/toxrefdb_endpoint_matrix_AUG2014_FOR_PUBLIC_RELEASE.csv
	toxrefdb/toxrefdb_nel_lel_noael_loael_summary_AUG2014_FOR_PUBLIC_RELEASE.csv
	toxrefdb/toxrefdb_study_tg_effect_endpoint_AUG2014_FOR_PUBLIC_RELEASE.csv

> Assay Annotation.zip 

ToxCast Assay Annotation  Study_Design_info_20141021.csv
	ToxCast Assay Annotation Assay_Target_Info_20141021.csv
	ToxCast_Assay_Annotation_Data_Users_Guide_20141021.pdf

> ToxCast_Tox21_Level5&6_20141022.zip 

ToxCast_Tox21_Level5&6_20141022.csv

> ToxCast_Summary_Files.zip 

AllResults_cyto_dist_141121.csv
	AllResults_fitc_Matrix_141121.csv
	AllResults_flags_141121.csv
	AllResults_hitc_Matrix_141121.csv
	AllResults_l4id_Matrix_141121.csv
	AllResults_logc_max_Matrix_141121.csv
	AllResults_logc_min_Matrix_141121.csv
	AllResults_max_mean_Matrix_141121.csv
	AllResults_max_med_Matrix_141121.csv
	AllResults_modl_ac10_Matrix_141121.csv
	AllResults_modl_acb_Matrix_141121.csv
	AllResults_modl_acc_Matrix_141121.csv
	AllResults_modl_ga_Matrix_141121.csv
	AllResults_modl_gw_Matrix_141121.csv
	AllResults_modl_la_Matrix_141121.csv
	AllResults_modl_lw_Matrix_141121.csv
	AllResults_modl_Matrix_141121.csv
	AllResults_modl_rmse_Matrix_141121.csv
	AllResults_modl_tp_Matrix_141121.csv
	AllResults_spid_Matrix_141121.csv
	AllResults_tested_Matrix_141121.csv
	AllResults_zscore_Matrix_141121.csv
	Assay_Summary_141121.csv
	Chemical_Summary_141121.csv

Load ToxCast Data

In [ ]:
# Assay summary, indexed by endpoint name/id, component id and source name
A0 = loadTXdata('Assay_Summary_141121.csv')
A0 = A0.set_index(['aenm','aeid','acid','assay_source_name'])

# Chemical inventory
C0 = loadTXdata('TOX21S_v4b_8599_23Oct2014.xlsx',read_fn=pd.read_excel)
# Strip quote characters from the CASRN, then derive ID = 'C' + CASRN without dashes
C0['TS_CASRN'] = C0['TS_CASRN'].apply(lambda casrn: casrn.replace("'",""))
C0['ID'] = C0['TS_CASRN'].apply(lambda casrn: 'C'+casrn.replace('-',''))
# C1: slim identifier table with the column names used by the toxicity data
C1 = C0[['ID','DSSTox_GSID','TS_CASRN','TS_ChemName']].rename(
    columns={'TS_CASRN':'chemical_casrn','TS_ChemName':'chemical_name'})
# C0 itself is looked up by ID from here on
C0 = C0.set_index('ID')
#Bioactivity
def _load_id_matrix(member):
    """Load one AllResults_*_Matrix csv and index it by its unnamed first column, renamed 'ID'."""
    matrix = loadTXdata(member)
    matrix = matrix.rename(columns={'Unnamed: 0':'ID'})
    return matrix.set_index('ID')

B0 = _load_id_matrix('AllResults_modl_ga_Matrix_141121.csv')
B1 = _load_id_matrix('AllResults_modl_la_Matrix_141121.csv')
# What was tested
Bt = _load_id_matrix('AllResults_tested_Matrix_141121.csv')

# Everything that is null is inactive - replace nulls with very high conc
B0[B0.isnull()]=6
B1[B1.isnull()]=6
# Everything that was not tested is set back to null.
# (The second statement used to repeat `B1[B1.isnull()]=6`, so untested cells
# in B1 were left looking like tested-but-inactive values.)
B0[Bt==0]=None
B1[Bt==0]=None

# Attach chemical identifiers to B0 and index by them
B0 = pd.merge(C1,B0.reset_index(),left_on='ID',right_on='ID')
B0 = B0.set_index(['ID','DSSTox_GSID','chemical_casrn','chemical_name'])


# Tox21 level 5&6 results in long format (one row per sample/assay pair)
T21 = loadTXdata('ToxCast_Tox21_Level5&6_20141022.csv')
T21 = T21.rename(columns={'spid':'sample_id','casn':'casrn','chnm':'chemical_name',
                          'code':'ID','aenm':'assay_name'})

# Wide sample x assay matrices: T21b from 'hitc', T21p from 'hill_ga'
T21b = T21.pivot_table(index=['sample_id','ID','casrn','chemical_name'],
                       columns='assay_name',values='hitc')
T21p = T21.pivot_table(index=['sample_id','ID','casrn','chemical_name'],
                       columns='assay_name',values='hill_ga')
In [29]:
# Persist assay, chemical and bioactivity tables. Binary mode plus a context
# manager so the file is flushed and closed even if pickling fails.
# NOTE(review): B0 is not part of the dump -- confirm that is intended.
with open(PKL_DIR+'chm-bio-'+tmstmp+'.pkl','wb') as pkl_out:
    pickle.dump([A0,C0,C1,B1],pkl_out)

Load ToxRefDB Data

In [8]:
# ToxRefDB study / treatment-group / effect / endpoint table
T1 = loadTXdata('toxrefdb/toxrefdb_study_tg_effect_endpoint_AUG2014_FOR_PUBLIC_RELEASE.csv')
T1 = T1.drop('Unnamed: 0',axis=1)

# ID = 'C' + CASRN with every '/' and '-' removed
T1['ID'] = T1['chemical_casrn'].apply(
    lambda casrn: 'C'+casrn.replace('/','-').replace('-',''))

# DSSTox_GSID = last '_'-separated token of chemical_id, unless the id contains 'CAS'.
# `ifthen` is defined outside this notebook -- presumably ifthen(cond, a, b)
# returns a when cond is true and b otherwise (TODO confirm).
T1['DSSTox_GSID'] = T1['chemical_id'].apply(
    lambda chem_id: ifthen('CAS' not in chem_id,chem_id.split('_')[-1],None))
In [9]:
T1.shape
Out[9]:
(184257, 68)
In [287]:
# Distinct chemical IDs that have at least one record with data_source 'opp_der'
is_opp_der = T1['data_source']=='opp_der'
Ph1 = T1.loc[is_opp_der,'ID'].unique()
len(Ph1)
Out[287]:
569
In [11]:
T1.columns
Out[11]:
Index([u'chemical_id', u'chemical_casrn', u'chemical_name', u'chemical_sets',
       u'data_source', u'entry_status_id', u'entry_status', u'entry_level_id',
       u'entry_level', u'usability', u'usability_desc', u'study_id',
       u'source_study_numeric_id', u'source_study_alphanumeric_id', u'year',
       u'citation', u'guideline_no', u'guideline_name', u'study_type_id',
       u'study_type', u'species_id', u'species', u'strain', u'comments_animal',
       u'admin_method', u'admin_route', u'dose_start', u'dose_start_unit',
       u'dose_end', u'dose_end_unit', u'lot_batch', u'purity', u'source',
       u'ldt', u'hdt', u'toxrefdb_study_dose_unit', u'no_doses_tested',
       u'tg_id', u'generation', u'gender', u'dosing_period', u'dose_level',
       u'dose', u'toxrefdb_tg_dose_unit', u'duration', u'duration_unit',
       u'no_animals', u'effect_id', u'effect_type_id', u'effect_type',
       u'effect_target_id', u'effect_target', u'effect_desc_id',
       u'effect_desc', u'effect_direction_id', u'direction',
       u'effect_free_text', u'target_site', u'focal_diffuse', u'loael',
       u'effect_category', u'endpoint_category', u'endpoint_type',
       u'endpoint_system', u'endpoint_target', u'endpoint_lifestage', u'ID',
       u'DSSTox_GSID'],
      dtype='object')
In [13]:
# Preview of the study identifiers: rows with index labels 0..50 (inclusive).
# `.loc` replaces `.ix`, which has been removed from pandas.
T1.loc[:50,['chemical_id','chemical_name','study_id','source_study_numeric_id', u'citation']]
Out[13]:
chemical_id chemical_name study_id source_study_numeric_id citation
0 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
1 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
2 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
3 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
4 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
5 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
6 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
7 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
8 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
9 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
10 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
11 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
12 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
13 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
14 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
15 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
16 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
17 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
18 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
19 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
20 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
21 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
22 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
23 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
24 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
25 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
26 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
27 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
28 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
29 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
30 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
31 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
32 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
33 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
34 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
35 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
36 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
37 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
38 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
39 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
40 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
41 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
42 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
43 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
44 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
45 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
46 DSSTox_GSID_40801 (+-)-Indoxacarb 3258 44477144 Breslin, W. (1997) Two Generation Reproduction...
47 DSSTox_GSID_20895 Mirex 6306 NaN Huff J (1990) TOXICOLOGY AND CARCINOGENESIS ST...
48 DSSTox_GSID_47282 (2R)-2-{4-[({[2-(1,3-benzodioxol-5-yloxy)pyrid... 7121 6448 Pfizer compound CP-671305:14 DAY EXPLORATORY T...
49 DSSTox_GSID_47282 (2R)-2-{4-[({[2-(1,3-benzodioxol-5-yloxy)pyrid... 7121 6448 Pfizer compound CP-671305:14 DAY EXPLORATORY T...
50 DSSTox_GSID_47282 (2R)-2-{4-[({[2-(1,3-benzodioxol-5-yloxy)pyrid... 7121 6448 Pfizer compound CP-671305:14 DAY EXPLORATORY T...

Categorize lesions

In [ ]:
# Effect -> lesion type
def _lesion_category(tags, prefix='les_cat:'):
    """Return the lower-cased text following `prefix` in the first tag that
    starts with it (case-insensitive), or None when no tag does."""
    for tag in tags:
        if tag.lower().startswith(prefix):
            return tag[len(prefix):].lower()
    return None

# Map lower-cased entity name -> lesion category. The category is taken from
# the tag that actually carries the 'les_cat:' prefix; previously the first
# tag of the entity was used whether or not it was the 'les_cat:' one.
Les_Cat = {}
for E in Entity.objects(tags__istartswith='les_cat:',name__exists=1):
    category = _lesion_category(E.tags)
    if category is not None:
        Les_Cat[E.name.lower()] = category

# Manual additions for effect descriptions that have no tagged entity
Les_Cat['abnormal lobation']='other'
Les_Cat['leukemia lymphocytic']='neoplasia'
Les_Cat['carcinoma nos'] = 'neoplasia'
Les_Cat['mixed tumor malignant'] = 'neoplasia'

# Replace missing descriptions by '' through a direct column assignment
# (the chained form `T1.effect_desc[mask] = ''` may write to a temporary copy).
T1['effect_desc'] = T1['effect_desc'].fillna('')
# Descriptions without a known category get None
T1['les_cat'] = T1['effect_desc'].apply(lambda desc: Les_Cat.get(desc.lower()))
In [5]:
# Distinct study type codes present in T1, lower-cased
[code.lower() for code in T1['study_type'].unique()]
Out[5]:
['mgr', 'chr', 'sac', 'sub', 'dev', 'rep', 'oth', 'acu', 'neu', 'dnt']
In [6]:
# Distinct CASRNs of the Chemical documents tagged 'css_rn'; display the first ten
css_chemicals = Chemical.objects(tags='css_rn')
CAS_rn = css_chemicals.distinct('casrn')
CAS_rn[:10]
Out[6]:
[u'122931-48-0',
 u'26225-79-6',
 u'50594-66-6',
 u'62476-59-9',
 u'63748-59-4',
 u'94128-03-7',
 u'94128-04-8',
 u'2395-00-8',
 u'33496-48-9',
 u'335-67-1']
In [7]:
# Number of CASRNs that occur both in ToxRefDB (T1) and in CAS_rn
toxref_casrn = set(T1['chemical_casrn'].unique())
len(toxref_casrn & set(CAS_rn))
Out[7]:
60
In [8]:
# Row filters on T1, built with vectorised operations: `isin` avoids one list
# scan per row, and the `.str` accessor yields False for missing values where
# the previous per-row lambdas would raise on NaN.
I1 = T1['chemical_casrn'].isin(CAS_rn)                            # chemical is in CAS_rn
I2 = T1['study_type'].str.lower().isin(['chr', 'sub', 'acu'])     # study type chr / sub / acu
I3 = T1['species'].str.lower()=='rat'                             # species is rat
I4 = I1 & I2 & I3
# Number of rows passing all three filters
I4.sum()
Out[8]:
7296
In [58]:
# Distinct treatment descriptions for the rows selected by I4.
# Boolean `.loc` selection replaces `.ix`, which has been removed from pandas.
TS1= T1.loc[I4,
      ['DSSTox_GSID','chemical_id', u'chemical_casrn', u'chemical_name',
      'study_type', u'species', u'strain','admin_method', u'admin_route',
      'dose', u'toxrefdb_tg_dose_unit', u'duration', u'duration_unit','loael']
      ].drop_duplicates()
TS1.shape
Out[58]:
(1596, 14)
In [172]:
# Treatment records for CASRN 4151-50-2 that also pass the I4 filters.
# `.loc` replaces `.ix`, which has been removed from pandas.
I5 = I4 & (T1['chemical_casrn']=='4151-50-2')
T1.loc[I5,['DSSTox_GSID','chemical_id', u'chemical_casrn', u'chemical_name',
      'study_type', u'species', u'strain','admin_method', u'admin_route',
      'dose', u'toxrefdb_tg_dose_unit', u'duration', u'duration_unit']]
Out[172]:
DSSTox_GSID chemical_id chemical_casrn chemical_name study_type species strain admin_method admin_route dose toxrefdb_tg_dose_unit duration duration_unit
In [173]:
# All treatment records for CASRN 4151-50-2, without the I4 filters.
# `.loc` replaces `.ix`, which has been removed from pandas.
I5 = T1['chemical_casrn']=='4151-50-2'
T1.loc[I5,['DSSTox_GSID','chemical_id', u'chemical_casrn', u'chemical_name',
      'study_type', u'species', u'strain','admin_method', u'admin_route',
      'dose', u'toxrefdb_tg_dose_unit', u'duration', u'duration_unit']]
Out[173]:
DSSTox_GSID chemical_id chemical_casrn chemical_name study_type species strain admin_method admin_route dose toxrefdb_tg_dose_unit duration duration_unit
162978 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
162979 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
162980 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
162981 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
162982 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
162983 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
162984 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
162985 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
162986 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
162987 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
162988 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
162989 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
162990 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
162991 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
162992 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
162993 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
162994 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
162995 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
162996 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
162997 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
162998 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
162999 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
163000 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
163001 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 7.50 mg/kg/day 90 day
163002 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 2.50 mg/kg/day 90 day
163003 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 2.50 mg/kg/day 90 day
163004 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 2.50 mg/kg/day 90 day
163005 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 2.50 mg/kg/day 90 day
163006 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 2.50 mg/kg/day 90 day
163007 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid SUB rat Sprague Dawley (CD) Feed Oral 2.50 mg/kg/day 90 day
... ... ... ... ... ... ... ... ... ... ... ... ... ...
163222 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 1.59 mg/kg/day NaN NaN
163223 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 1.59 mg/kg/day NaN NaN
163224 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 1.59 mg/kg/day NaN NaN
163225 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 0.15 mg/kg/day NaN NaN
163226 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 1.59 mg/kg/day 10 week
163227 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 1.59 mg/kg/day 10 week
163228 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 1.34 mg/kg/day NaN NaN
163229 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 1.34 mg/kg/day NaN NaN
163230 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 0.17 mg/kg/day NaN NaN
163231 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 0.15 mg/kg/day NaN NaN
163232 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 0.15 mg/kg/day 10 week
163233 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 0.53 mg/kg/day NaN NaN
163234 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 0.52 mg/kg/day NaN NaN
163235 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 0.52 mg/kg/day 10 week
163236 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 0.45 mg/kg/day NaN NaN
163237 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 0.45 mg/kg/day NaN NaN
163238 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 0.45 mg/kg/day 10 week
163239 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 0.18 mg/kg/day NaN NaN
163240 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 1.34 mg/kg/day NaN NaN
163241 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 1.34 mg/kg/day NaN NaN
163242 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 1.34 mg/kg/day NaN NaN
163243 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 1.34 mg/kg/day NaN NaN
163244 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 1.34 mg/kg/day NaN NaN
163245 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 1.34 mg/kg/day NaN NaN
163246 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 1.34 mg/kg/day NaN NaN
163247 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 1.34 mg/kg/day NaN NaN
163248 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 1.34 mg/kg/day NaN NaN
163249 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 1.34 mg/kg/day NaN NaN
163250 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 1.34 mg/kg/day NaN NaN
163251 32646 DSSTox_GSID_32646 4151-50-2 Sulfluramid MGR rat Sprague Dawley Feed Oral 1.34 mg/kg/day NaN NaN

274 rows × 13 columns

In [10]:
# Count the TS1 rows for every chemical / study type / dose / duration
# combination, with one column per dose unit.
TS11 = TS1.pivot_table(
    index=['DSSTox_GSID','chemical_id', u'chemical_casrn', u'chemical_name',
           'study_type','dose','duration','duration_unit'],
    columns='toxrefdb_tg_dose_unit',
    values='admin_route',
    aggfunc=len,
)
TS11.shape
Out[10]:
(1408, 1)
In [11]:
# Export the treatment summary twice: 'view' keeps the hierarchical index,
# 'data' is the same table with the index flattened into columns.
xlsx_path = '/share/home/ishah/tmp/toxref-rat-css-treatment-doses-v2.xlsx'
xl = pd.ExcelWriter(xlsx_path)
for sheet, frame in (('view', TS11), ('data', TS11.reset_index())):
    frame.to_excel(xl,sheet_name=sheet)
xl.close()
In [64]:
# Distinct values found in the loael column
T1['loael'].unique()
Out[64]:
array([ nan,   0.,  -1.])

Maximum treatment concentration for each study

In [18]:
# Figure out the maximum treatment concentration for each study type, i.e. the
# treatment concentration up to which there was no effect.
# TF lists the column groupings to pivot on: here one grouping, study_type only.
TF = [['study_type']]

from ml.mlearn import concat_df
def mk_str(x):
    """Normalise a column label into a lower-case, underscore-separated name.

    * tuple (including tuple subclasses): every element is normalised and the
      parts are joined with '_';
    * string: lower-cased, with '-' and ' ' replaced by '_';
    * anything else is returned unchanged.

    Type checks use isinstance so that unicode labels (Python 2) and subclasses
    are normalised as well; an exact `type(x) == str` comparison skips them.
    Tuple elements are expected to be strings.
    """
    try:
        text_types = (basestring,)   # Python 2: both str and unicode
    except NameError:
        text_types = (str,)          # Python 3

    def _norm(label):
        return label.lower().replace('-','_').replace(' ','_')

    if isinstance(x, tuple):
        return '_'.join([_norm(i) for i in x])
    if isinstance(x, text_types):
        return _norm(x)
    return x

# One chemical x grouping pivot of the 'dose' column per entry of TF.
# NOTE(review): the section heading speaks of the *maximum* treatment
# concentration, while the aggregation below is np.min -- confirm which one
# is intended before relying on T_mt.
dose_pivots = [
    pd.pivot_table(T1,index=['ID','DSSTox_GSID','chemical_name'],
                   columns=c_i,
                   values='dose',
                   aggfunc=np.min)
    for c_i in TF
]

# Outer-join the pivots on their shared row index; the first one seeds T_mt
T_mt = pd.DataFrame()
for T_i in dose_pivots:
    if T_mt.shape[0]==0:
        T_mt = T_i
    else:
        T_mt = pd.merge(T_mt,T_i,how='outer',left_index=True,right_index=True)
    
#I = [i for i in T2.columns if re.search('chr.+liver',i,re.I)]
#I

Aggregating toxicity

In [ ]:
# Toxicity -> factors
# Column groupings to pivot on. Only the most detailed grouping is active;
# coarser ones (study_type alone, study_type + effect_target,
# study_type + species + effect_target) were tried and are disabled.
TF = [['study_type','species','effect_target','les_cat']]
# If a chemical has an effect in a study, all other specific effects of that
# study that are NA will be set to zero.

from ml.mlearn import concat_df
def mk_str(x):
    """Normalise a column label into a lower-case, underscore-separated name.

    * tuple (including tuple subclasses): every element is normalised and the
      parts are joined with '_';
    * string: lower-cased, with '-' and ' ' replaced by '_';
    * anything else is returned unchanged.

    Type checks use isinstance so that unicode labels (Python 2) and subclasses
    are normalised as well; an exact `type(x) == str` comparison skips them.
    Tuple elements are expected to be strings.
    """
    try:
        text_types = (basestring,)   # Python 2: both str and unicode
    except NameError:
        text_types = (str,)          # Python 3

    def _norm(label):
        return label.lower().replace('-','_').replace(' ','_')

    if isinstance(x, tuple):
        return '_'.join([_norm(i) for i in x])
    if isinstance(x, text_types):
        return _norm(x)
    return x

# T2: chemical x effect matrix holding, for every column combination listed
# in TF, the smallest 'dose' value found among the matching T1 records.
T2 = pd.DataFrame()

for c_i in TF:
    T_i = pd.pivot_table(T1,
                         index=['ID','DSSTox_GSID','chemical_name'],
                         columns=c_i,
                         values='dose',
                         aggfunc=np.min)

    # The first pivot seeds T2; later ones are outer-joined on the row index
    if T2.shape[0]==0:
        T2 = T_i
        continue
    T2 = pd.merge(T2,T_i,how='outer',left_index=True,right_index=True)
    
#I = [i for i in T2.columns if re.search('chr.+liver',i,re.I)]
#I

Filling missing data

All toxic effects that are not significant are not reported. This produces a great deal of missing data. We need an approach to differentiate between unknown effects and effects that are not significant. We assume that if a particular guideline study was conducted but the effects were not reported then a chemical would be negative for that particular effect for that type of guideline study. However, this ignores the effect of species and sex.

In [20]:
# For every study type: if a chemical has at least one reported effect in that
# study type, its remaining missing effects for the same study type are set to
# 0. Chemicals with no record at all for the study type stay missing.
#
# Values are written with T2.loc[...] because the chained form
# `T2[(study)][I2] = 0` may modify a temporary copy and leave T2 unchanged.
study_of_column = [col[0] for col in T2.columns]   # first level = study type
missing = T2.isnull()
for study in set(study_of_column):
    in_study = [s == study for s in study_of_column]
    study_missing = missing.loc[:, in_study]
    # Rows with at least one non-missing value within this study type
    has_report = ~study_missing.all(axis=1)
    for col in study_missing.columns:
        T2.loc[study_missing[col] & has_report, col] = 0
In [1]:
T2.shape
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-93734ea7d50c> in <module>()
----> 1 T2.shape

NameError: name 'T2' is not defined
In [25]:
# Keep the columns whose second level equals 'rat' (the second level is the
# species when T2 was built with the study_type/species/effect_target/les_cat
# grouping).
# NOTE(review): despite the variable and file name, nothing here restricts the
# selection to liver effects -- confirm whether a filter on the effect target
# is missing.
I = [i for i in T2.columns if i[1]=='rat']
Rat_liver=T2[I]
# Binary mode + context manager so the pickle is flushed and closed
with open(PKL_DIR+'rat-liver-effects-'+tmstmp+'.pkl','wb') as pkl_out:
    pickle.dump(Rat_liver,pkl_out)
In [245]:
# Persist the toxicity matrices. Binary mode + context manager so the file is
# flushed and closed even if pickling fails.
with open(PKL_DIR+'tox-'+tmstmp+'.pkl','wb') as pkl_out:
    pickle.dump([T2,T_mt],pkl_out)
In [27]:
PKL_DIR
Out[27]:
'/share/home/ishah/projects/Chem/data/pickle/'
In [223]:
# Chemical IDs ('C' + CASRN digits) of the chemicals examined in the cells below
CID = [
    u'C335762', u'C307244', u'C375951', u'C1763231', u'C335671',
    u'C4151502', u'C375859', u'C2795393', u'C29420493', u'C3825261',
    u'C3871996', u'C754916', u'C2058948',
]
In [203]:
# Chemical inventory records for the selected IDs, one column per chemical.
# `.loc` replaces `.ix`, which has been removed from pandas.
C0.loc[CID].T
Out[203]:
ID C335762 C307244 C375951 C1763231 C335671 C4151502 C375859 C2795393 C29420493 C3825261 C3871996 C754916 C2058948
DSSTox_RID 78891 78892 78893 78894 78895 79022 79319 79357 79358 79359 79360 79421 82415
DSSTox_GSID 31860 31862 31863 31864 31865 32646 37303 37706 37707 37708 37709 38939 47553
DSSTox_CID 11860 11862 11863 11864 11865 12646 17303 17706 17707 17708 17709 18939 27553
TS_ChemName PFDA PFHxA PFNA PFOS PFOA Sulfluramid Perfluoroheptanoic acid PFOS-K PFBS-K PFOA, ammonium salt PFHS-K PFOSA Perfluoroundecanoic acid
TS_ChemName_Synonyms Perfluorodecanoic acid (PFDA) Perfluorohexanoic acid (PFHXA) Perfluorononanoic acid (PFNA) NaN Perfluorooctanoic acid (PFOA) NaN NaN Potassium perfluorooctanesulfonate (PFOS, K salt) Potassium perfluorobutanesulfonate (PFBS, K salt) Ammonium perfluorooctanoate (PFOA, ammonium salt) Potassium perfluorohexanesulfonate (PFHS, K salt) Perfluorooctanesulfonamide (PFOSA) NaN
TS_CASRN 335-76-2 307-24-4 375-95-1 1763-23-1 335-67-1 4151-50-2 375-85-9 2795-39-3 29420-49-3 3825-26-1 3871-99-6 754-91-6 2058-94-8
TS_Description single chemical compound single chemical compound single chemical compound single chemical compound single chemical compound single chemical compound single chemical compound single chemical compound single chemical compound single chemical compound single chemical compound single chemical compound single chemical compound
ChemNote NaN NaN NaN NaN NaN NaN NaN parent [1763-23-1] parent [375-73-5] parent [335-67-1] parent [355-46-4] NaN NaN
STRUCTURE_Shown tested chemical tested chemical tested chemical tested chemical tested chemical tested chemical tested chemical tested chemical tested chemical tested chemical tested chemical tested chemical tested chemical
STRUCTURE_Formula C10HF19O2 C6HF11O2 C9HF17O2 C8HF17O3S C8HF15O2 C10H6F17NO2S C7HF13O2 C8F17KO3S C4F9KO3S C8H4F15NO2 C6F13KO3S C8H2F17NO2S C11HF21O2
STRUCTURE_MW 514.0834 314.0534 464.0759 500.1296 414.0684 527.198 364.0609 538.22 338.1899 431.0989 438.2049 499.1448 564.0909
STRUCTURE_ChemType defined organic defined organic defined organic defined organic defined organic defined organic defined organic defined organic defined organic defined organic defined organic defined organic defined organic
STRUCTURE_DefinedOrganicForm parent parent parent parent parent parent parent salt salt salt salt parent parent
STRUCTURE_IUPAC nonadecafluorodecanoic acid undecafluorohexanoic acid heptadecafluorononanoic acid 1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,8-heptadecaflu... pentadecafluorooctanoic acid N-ethyl-1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,8-hept... tridecafluoroheptanoic acid potassium 1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,8-he... potassium 1,1,2,2,3,3,4,4,4-nonafluorobutane-1... ammonium pentadecafluorooctanoate potassium 1,1,2,2,3,3,4,4,5,5,6,6,6-tridecaflu... 1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,8-heptadecaflu... henicosafluoroundecanoic acid
STRUCTURE_SMILES FC(F)(C(F)(F)C(=O)O)C(F)(F)C(F)(F)C(F)(F)C(F)(... FC(F)(C(F)(F)C(=O)O)C(F)(F)C(F)(F)C(F)(F)F FC(F)(C(F)(F)C(=O)O)C(F)(F)C(F)(F)C(F)(F)C(F)(... FC(F)(C(F)(F)S(=O)(=O)O)C(F)(F)C(F)(F)C(F)(F)C... FC(F)(C(F)(F)C(=O)O)C(F)(F)C(F)(F)C(F)(F)C(F)(... FC(F)(C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(... FC(F)(C(F)(F)C(=O)O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F [K+].FC(F)(C(F)(F)S([O-])(=O)=O)C(F)(F)C(F)(F)... [K+].FC(F)(C(F)(F)S([O-])(=O)=O)C(F)(F)C(F)(F)F [NH4+].FC(F)(C(F)(F)C([O-])=O)C(F)(F)C(F)(F)C(... [K+].FC(F)(C(F)(F)S([O-])(=O)=O)C(F)(F)C(F)(F)... FC(F)(C(F)(F)S(N)(=O)=O)C(F)(F)C(F)(F)C(F)(F)C... FC(F)(C(F)(F)C(=O)O)C(F)(F)C(F)(F)C(F)(F)C(F)(...
STRUCTURE_SMILES_Desalt FC(F)(C(F)(F)C(=O)O)C(F)(F)C(F)(F)C(F)(F)C(F)(... FC(F)(C(F)(F)C(=O)O)C(F)(F)C(F)(F)C(F)(F)F FC(F)(C(F)(F)C(=O)O)C(F)(F)C(F)(F)C(F)(F)C(F)(... FC(F)(C(F)(F)S(=O)(=O)O)C(F)(F)C(F)(F)C(F)(F)C... FC(F)(C(F)(F)C(=O)O)C(F)(F)C(F)(F)C(F)(F)C(F)(... FC(F)(C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(... FC(F)(C(F)(F)C(=O)O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F FC(F)(C(F)(F)S(=O)(=O)O)C(F)(F)C(F)(F)C(F)(F)C... FC(F)(C(F)(F)S(=O)(=O)O)C(F)(F)C(F)(F)F FC(F)(C(F)(F)C(=O)O)C(F)(F)C(F)(F)C(F)(F)C(F)(... FC(F)(C(F)(F)S(=O)(=O)O)C(F)(F)C(F)(F)C(F)(F)C... FC(F)(C(F)(F)S(N)(=O)=O)C(F)(F)C(F)(F)C(F)(F)C... FC(F)(C(F)(F)C(=O)O)C(F)(F)C(F)(F)C(F)(F)C(F)(...
STRUCTURE_InChIS_v0 InChI=1S/C10HF19O2/c11-2(12,1(30)31)3(13,14)4(... InChI=1S/C6HF11O2/c7-2(8,1(18)19)3(9,10)4(11,1... InChI=1S/C9HF17O2/c10-2(11,1(27)28)3(12,13)4(1... InChI=1S/C8HF17O3S/c9-1(10,3(13,14)5(17,18)7(2... InChI=1S/C8HF15O2/c9-2(10,1(24)25)3(11,12)4(13... InChI=1S/C10H6F17NO2S/c1-2-28-31(29,30)10(26,2... InChI=1S/C7HF13O2/c8-2(9,1(21)22)3(10,11)4(12,... InChI=1S/C8HF17O3S.K/c9-1(10,3(13,14)5(17,18)7... InChI=1S/C4HF9O3S.K/c5-1(6,3(9,10)11)2(7,8)4(1... InChI=1S/C8HF15O2.H3N/c9-2(10,1(24)25)3(11,12)... InChI=1S/C6HF13O3S.K/c7-1(8,3(11,12)5(15,16)17... InChI=1S/C8H2F17NO2S/c9-1(10,3(13,14)5(17,18)7... InChI=1S/C11HF21O2/c12-2(13,1(33)34)3(14,15)4(...
STRUCTURE_InChIKey_v0 PCIUEQPBYFRTEM-UHFFFAOYSA-N PXUULQAPEKKVAH-UHFFFAOYSA-N UZUFPBIDKMEQEQ-UHFFFAOYSA-N YFSUTJLHUFNCNZ-UHFFFAOYSA-N SNGREZUHAYWORS-UHFFFAOYSA-N CCEKAJIANROZEO-UHFFFAOYSA-N ZWBAMYVPMDSJGQ-UHFFFAOYSA-N WFRUBUQWJYMMRQ-UHFFFAOYSA-M LVTHXRLARFLXNR-UHFFFAOYSA-M YOALFLHFSFEMLP-UHFFFAOYSA-N RSCGQEBKFSGWJT-UHFFFAOYSA-M RRRXPPIDPYTNJG-UHFFFAOYSA-N SIDINRCMMRKXGQ-UHFFFAOYSA-N
Substance_modify_yyyymmdd 2.008043e+07 2.008043e+07 2.008043e+07 2.008043e+07 2.008043e+07 2.008043e+07 2.008043e+07 2.008043e+07 2.008043e+07 2.008043e+07 2.008043e+07 2.008043e+07 2.010093e+07
In [ ]:
# Rows of the toxicity matrix T2 for the chemicals listed in CID.
# NOTE(review): `.ix` has been removed from pandas. Out[213] shows only 6 of
# the 13 IDs, so some CID entries appear to be absent from T2 -- a plain
# `.loc[CID]` may raise on those; filter CID to the IDs present before porting.
Y1 = T2.ix[CID]
#Y1[Y1.notnull()]
# Keep only the columns that hold a value for more than one of these chemicals
I=Y1.notnull().sum()>1
Y1 = T2.ix[CID,I]
#Y1.apply(lambda x: Y1.columns[x==x.min()][0],axis=0)
# Not the last expression of the cell, so this transpose is not displayed
Y1.T
# Drop the columns in which every value is either missing or zero, then show
# the remaining columns as rows (one column per chemical)
I = np.logical_not(np.all(np.logical_or(Y1.isnull(),Y1==0),axis=0))
Y1.ix[:,I].T
In [208]:
# Within each study type, a row that has at least one non-null value gets its
# remaining missing values in that study set to 0; rows with no values at all
# for the study stay NaN.
#
# The fill mask is built for the whole frame and applied with a single boolean
# assignment on Y1 itself. The previous chained form `Y1[study][mask] = 0`
# wrote into whatever `Y1[study]` returned, which pandas does not guarantee to
# be a view of Y1, so the update could be silently lost.
study_of_column = np.asarray(Y1.columns.get_level_values(0))
is_missing = Y1.isnull().values
fill_with_zero = np.zeros(is_missing.shape, dtype=bool)
for study in set(study_of_column):
    in_study = study_of_column == study
    missing_in_study = is_missing[:, in_study]
    # True for rows with at least one non-null value in this study.
    has_data = ~missing_in_study.all(axis=1)
    fill_with_zero[:, in_study] = missing_in_study & has_data[:, np.newaxis]
Y1[pd.DataFrame(fill_with_zero, index=Y1.index, columns=Y1.columns)] = 0
In [213]:
# Divide every column of Y1 by 1000 * molecular weight, take log10 and
# transpose, so rows are (study_type, effect_target) and columns are chemicals.
# NOTE(review): presumably this converts mg/kg doses to mol/kg - confirm units.
# NOTE(review): ID1 is not defined in this part of the notebook; verify it is
# set by an earlier cell before running.
mw_scale = 1000*C0.STRUCTURE_MW[ID1]
scaled_dose = Y1.apply(lambda dose: dose/mw_scale,axis=0)
X1 = np.log10(scaled_dose).T
# Infinite entries (e.g. log10 of a zero dose) are reset to 0.
X1[np.isinf(X1)]=0
X1
Out[213]:
ID C307244 C1763231 C4151502 C2795393 C29420493 C3825261
DSSTox_GSID 31862 31864 32646 37706 37707 37708
chemical_name Perfluorohexanoic acid Perfluorooctane sulfonic acid Sulfluramid Potassium perfluorooctanesulfonate Potassium nonafluoro-1-butanesulfonate Ammonium perfluorooctanoate
study_type effect_target
CHR Body Weight NaN NaN NaN -5.73096 NaN -4.482289
Liver NaN NaN NaN -6.73096 NaN -4.482289
DEV Body Weight NaN -5.699083 -5.244852 NaN NaN -3.935607
Bone NaN 0.000000 -4.598122 NaN NaN -3.935607
Clinical Signs NaN 0.000000 -4.598122 NaN NaN -3.458486
MGR Adrenal Gland NaN NaN -5.594869 0.00000 0.000000 -4.634577
Body Weight NaN NaN -5.594869 -6.12890 -2.529161 -5.634577
Brain NaN NaN -5.594869 0.00000 -2.529161 0.000000
Clinical Signs NaN NaN 0.000000 -5.52684 -2.529161 -4.634577
Food Consumption NaN NaN 0.000000 -5.52684 -3.052039 0.000000
Kidney NaN NaN -5.594869 0.00000 -3.052039 -5.634577
Liver NaN NaN -5.594869 0.00000 -3.052039 -5.634577
Offspring Survival-Late NaN NaN -5.594869 -5.52684 0.000000 -4.157456
Sexual Developmental Landmark NaN NaN -5.594869 0.00000 -2.529161 -4.157456
SUB Body Weight -4.497003 NaN -5.324034 0.00000 0.000000 -4.458486
Clinical Chemistry -3.798033 NaN -5.324034 -5.73096 -2.751009 -4.458486
Clinical Signs 0.000000 NaN -5.324034 0.00000 -2.751009 0.000000
Food Consumption -4.497003 NaN -5.324034 -5.73096 0.000000 -4.935607
Hematology -3.195974 NaN -4.846912 -5.73096 -3.228131 0.000000
Kidney -3.195974 NaN -4.356860 0.00000 -2.751009 -5.458486
Liver -3.195974 NaN -5.324034 -6.33302 0.000000 -4.935607
Spleen 0.000000 NaN -4.356860 0.00000 -3.751009 0.000000
Stomach 0.000000 NaN -4.846912 0.00000 -2.751009 0.000000
In [274]:
# All (study_type, effect_target) columns whose effect target starts with 'Liver'.
Liv = [(study, target) for study, target in T2.columns if target.startswith('Liver')]
In [275]:
# Preview the Liver columns for the first 20 rows of T2.
T2.ix[:20,Liv]
Out[275]:
study_type CHR DEV DNT MGR NEU OTH REP SAC SUB
effect_target Liver Liver Liver Liver Liver Liver Liver Liver Liver
ID DSSTox_GSID chemical_name
C100005 20281 1-Chloro-4-nitrobenzene NaN NaN NaN 0 NaN NaN NaN NaN 3.0
C100016 20961 4-Nitroaniline 1.5 NaN NaN NaN NaN NaN NaN NaN 10.0
C100027 21834 4-Nitrophenol NaN 0 NaN NaN NaN NaN NaN NaN NaN
C10016203 30698 alpha-Cyclodextrin NaN 0 NaN NaN NaN NaN NaN NaN NaN
C100210 26080 Terephthalic acid NaN NaN NaN NaN NaN NaN NaN NaN 0.0
C100378 21837 N,N-Diethylethanolamine NaN 0 NaN NaN NaN NaN NaN NaN 75.0
C100425 21284 Styrene NaN NaN 0 NaN NaN NaN NaN NaN NaN
C10043353 20194 Boric acid 0.0 163 NaN 4500 NaN NaN NaN NaN 250.0
C10049044 23958 Chlorine dioxide NaN NaN NaN 0 NaN NaN NaN NaN NaN
C100641 21842 Cyclohexanone oxime NaN NaN NaN NaN NaN NaN NaN NaN 287.0
C1007289 37495 Deisopropylatrazine NaN 0 NaN NaN NaN NaN NaN NaN 0.0
C100784201 34650 Halosulfuron-methyl 0.0 0 NaN 0 NaN NaN NaN NaN 497.0
C100970 20692 Methenamine NaN 0 NaN NaN NaN NaN NaN NaN NaN
C101053 20089 Anilazine NaN 0 NaN 0 NaN NaN NaN NaN 38.5
C101100 34232 Cloprop 25.0 0 NaN 250 NaN NaN NaN NaN NaN
C10118908 45033 Minocycline NaN NaN NaN NaN NaN NaN NaN 0 NaN
C101200480 24101 Tribenuron-methyl 0.0 500 NaN 0 NaN NaN NaN NaN 118.0
C101213 20764 Chlorpropham 350.0 1000 NaN 150 NaN NaN NaN NaN 276.0
C101542 25895 N-Phenyl-1,4-benzenediamine NaN 0 NaN NaN NaN NaN NaN NaN NaN
C101611 20869 4,4-Methylenebis(N,N-dimethylaniline) 187.5 NaN NaN NaN NaN NaN NaN 0 NaN
In [295]:
# Spot-check one chemical (ID 'C101200480'): dose by effect target (rows)
# against study type and species (columns). No aggfunc is passed, so
# pivot_table uses its default aggregation for cells with several rows.
pd.pivot_table(T11.ix['C101200480'],columns=['study_type','species'],index='effect_target',values='dose')
Out[295]:
study_type CHR DEV MGR SUB
species dog mouse rat rabbit rat rat dog rat
effect_target
Body Weight 51.740000 214 30.25 80 312.5000 55.111765 NaN 243.500000
Bone NaN NaN NaN NaN 382.8125 NaN NaN NaN
Brain NaN NaN NaN NaN NaN NaN NaN 243.500000
Clinical Chemistry 42.086667 NaN NaN NaN NaN NaN NaN 251.777778
Clinical Signs NaN 214 NaN 80 312.5000 NaN NaN 226.500000
Epididymis NaN 214 NaN NaN NaN NaN NaN NaN
Eye NaN NaN 62.50 NaN NaN NaN NaN NaN
Food Consumption NaN NaN NaN 80 312.5000 81.500000 NaN 243.500000
General NaN NaN NaN NaN 500.0000 NaN NaN NaN
Heart NaN NaN 62.50 NaN NaN NaN NaN 243.500000
Hematology NaN NaN NaN NaN NaN NaN 73.3 NaN
Kidney NaN 214 62.50 NaN NaN NaN NaN 243.500000
Liver NaN NaN NaN NaN 500.0000 NaN NaN 268.600000
Lung NaN NaN NaN NaN NaN 50.750000 NaN NaN
Mammary Gland NaN NaN 62.50 NaN NaN NaN NaN NaN
Maternal Wastage NaN NaN NaN 80 NaN NaN NaN NaN
Offspring Survival-Early NaN NaN NaN NaN 500.0000 NaN NaN NaN
Pancreas NaN NaN 62.50 NaN NaN NaN NaN NaN
Reproductive Performance NaN NaN NaN 80 NaN NaN NaN NaN
Seminal Vesicle NaN NaN 62.50 NaN NaN NaN NaN NaN
Skin NaN NaN 62.50 NaN NaN NaN NaN NaN
Spleen NaN NaN 62.50 NaN NaN 50.750000 NaN 174.400000
Stomach NaN NaN 62.50 NaN NaN NaN NaN NaN
Testes NaN 214 NaN NaN NaN NaN NaN 226.500000
Thyroid Gland NaN NaN NaN NaN NaN NaN 78.0 NaN
Urinalysis 29.810000 NaN NaN NaN NaN NaN NaN NaN
Uterus NaN NaN 62.50 NaN NaN NaN NaN NaN
[Not In List] NaN NaN NaN 80 NaN NaN NaN NaN
In [248]:
# Save T2 and T_mt to a date-stamped pickle. The file is opened in binary mode
# inside a `with` block so the handle is flushed and closed as soon as the
# dump finishes (the previous `file(...,'w')` handle was never closed).
with open(PKL_DIR+'tox-'+tmstmp+'.pkl','wb') as pkl_out:
    pickle.dump([T2,T_mt],pkl_out)
In [271]:
 
Out[271]:
MultiIndex(levels=[[u'ACU', u'CHR', u'DEV', u'DNT', u'MGR', u'NEU', u'OTH', u'REP', u'SAC', u'SUB'], [u'Abdominal Cavity', u'Active Avoidance', u'Adrenal Gland', u'Age Landmark', u'Aorta', u'Aortic arch', u'Artery (General)', u'Auditory Startle Reflex Habituation', u'Bile duct', u'Bladder', u'Blood', u'Blood vessel', u'Body Weight', u'Bone', u'Bone Marrow', u'Brain', u'Bronchus', u'Cervix', u'Classical conditioning', u'Clinical Chemistry', u'Clinical Signs', u'Clitoral Gland', u'Coagulating Gland', u'Coordination', u'Delayed Alternation', u'Developmental Landmark', u'Diaphragm', u'Ductus arteriosus', u'Ear', u'Epididymis', u'Esophagus', u'Estrous Cycle', u'Estrous cycle length', u'Eye', u'Face', u'Food Consumption', u'Gallbladder', u'General', u'Gonad', u'Great vessels', u'Hair Growth', u'Harderian Gland', u'Heart', u'Hematology', u'Innominate artery', u'Instrumental conditioning', u'Interparietal', u'Intestine Large', u'Intestine Small', u'Intestines', u'Kidney', u'Lacrimal Gland', u'Large Intestine', u'Larynx', u'Limb', u'Liver', u'Locomotion', u'Lung', u'Lymph Node', u'Mammary Gland', u'Maternal Wastage', u'Maze', u'Mesentery', u'Mortality', u'Motor activity', u'Mouth / Jaw', u'Nasal', u'Nerve', u'Nose', u'Offspring Survival-Early', u'Offspring Survival-Late', u'Oral Mucosa', u'Other', u'Ovary', u'Oviduct', u'Pancreas', u'Parathyroid', u'Parathyroid Gland', u'Passive Avoidance', u'Paw / Digit', u'Penis', u'Peritoneum', u'Pharynx', u'Pituitary Gland', u'Placenta', u'Pleura', u'Preputial Gland', u'Presphenoid', u'Prostate', u'Pulmonary artery', u'Radius', u'Reflexes', u'Reproductive Outcome', u'Reproductive Performance', u'Salivary glands', u'Seminal Vesicle', u'Sexual Developmental Landmark', u'Skeletal Muscle', u'Skin', u'Sperm Measure', u'Sperm morphology', u'Spinal cord', u'Spleen', u'Squamosal', u'Stomach', u'Subclavian artery', u'Testes', u'Thoracic Cavity', u'Thymus', u'Thyroid Gland', u'Tissue NOS', u'Tongue', u'Tooth', u'Trachea', u'Trunk', u'Ulna', 
u'Uncertain Primary Site', u'Ureter', u'Urethra', u'Urinalysis', u'Urinary Bladder', u'Uterus', u'Vagina', u'Water Consumption', u'Zygomatic', u'Zymbal's Gland', u'[Clinical]', u'[Not In List]']],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 12, 19, 20, 33, 35, 47, 48, 57, 63]],
           names=[u'study_type', u'effect_target'])

Chemical fingerprints

In [43]:
# Chemical fingerprints

from rdkit import Chem
from rdkit.DataStructs import *
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem import AllChem
from rdkit.Chem.SaltRemover import SaltRemover

# Extract the structure file from the zip archive recorded for it in FD and
# open it with RDKit.
# NOTE(review): DAT_DIR is reassigned elsewhere in this notebook; it must point
# at the directory holding the zip archives when this cell runs.
df  = 'TOX21S_v4b_CID_structures.sdf'
zf  = FD[df]
# The context manager closes the archive once the member has been extracted.
with zipfile.ZipFile(DAT_DIR+zf,'r') as ZF1:
    # No target path is given, so the file lands in the current working directory.
    d = ZF1.extract(df)
suppl = Chem.SDMolSupplier(d)

# Map DSSTox_CID to ID
CID2ID = dict(zip(C0.DSSTox_CID,C0.index))
MOLS = {}
for m in suppl:
    # The supplier yields None for records it could not parse.
    if m is None:
        continue
    if not m.HasProp('DSSTox_CID'):
        continue
    k = CID2ID.get(int(m.GetProp('DSSTox_CID')))
    # Skip only structures with no matching ID; an explicit None test keeps
    # any valid ID that happens to be falsy.
    if k is None:
        continue
    MOLS[k] = m
In [45]:
from rdkit.Chem import MACCSkeys

def _bit_table(make_fp, name_fmt):
    """Return a frame with one row per molecule in MOLS and one column per bit.

    make_fp  -- callable turning an RDKit molecule into a fingerprint
    name_fmt -- '%d'-style template used to name the bit columns
    """
    table = pd.DataFrame([np.array(make_fp(mol)) for mol in MOLS.values()])
    table.index = MOLS.keys()
    table.columns = [name_fmt % bit for bit in table.columns]
    return table

# Morgan fingerprints (radius 3, 1024 bits)
FP1 = _bit_table(lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol,3,1024), 'mrgn_%d')

# MACCS keys
FP2 = _bit_table(MACCSkeys.FingerprintMol, 'mccs_%d')

# Hashed topological torsions
FP3 = _bit_table(AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect, 'tptr_%d')

# Join the three fingerprint blocks side by side on the chemical ID index.
FP0 = FP1.merge(FP2,left_index=True,right_index=True)
FP0 = FP0.merge(FP3,left_index=True,right_index=True)
FP0.index.names=['ID']
In [34]:
# Disabled: dump of the chemical tables and fingerprint frames.
#pickle.dump([C0,FP0,FP1,FP2,FP3],file(PKL_DIR+'chm-'+tmstmp+'.pkl','w'))

# Save the ID -> RDKit molecule map. Binary mode plus a `with` block ensures
# the handle is flushed and closed (the previous `file(...,'w')` handle was
# never closed).
with open(PKL_DIR+'mols-'+tmstmp+'.pkl','wb') as pkl_out:
    pickle.dump(MOLS,pkl_out)

Store everything to tables

In [ ]:
# From here on DAT_DIR points at the tables directory (it is rebound, so any
# later cell using DAT_DIR sees this value).
DAT_DIR = '/share/home/ishah/projects/Chem/data/tables/'

# Export is switched off; flip the condition to True to rewrite the v1 tables.
if False:
    for frame, fname in [(T2, 'tox-v1.csv'),
                         (FP0, 'chmfp-v1.csv'),
                         (B1, 'bio-v1.csv'),
                         (T_mt, 'tox-max-trt-v1.csv')]:
        frame.to_csv(DAT_DIR+fname)

    # The chemical table goes to a single-sheet Excel workbook.
    W = pd.ExcelWriter(DAT_DIR+'chm-v1.xlsx')
    C1.to_excel(W,sheet_name='All')
    W.save()

Merge Chm, Bio, Tox

In [265]:
# Work on a copy of the toxicity table whose (study_type, effect_target) column
# tuples are flattened into single lower-case names such as
# 'sub_urinary_bladder'; spaces in the effect target become underscores.
T3 = T2.copy()
T3.columns = [study.lower() +'_'+ target.lower().replace(' ','_')
              for study, target in T3.columns]
Tox = T3.columns

Continuous

In [263]:
Bio=B0.columns
Tox=T2.columns
Chm=FP0.columns

X0 = B0.copy()
X0[X0<6]=1
X0[X0==6]=0

BCb = pd.merge(X0.reset_index(),FP0.reset_index(),how='outer',left_on='ID',right_on='ID')
BCTb= pd.merge(BCb,T3.reset_index().drop(['DSSTox_GSID','chemical_name'],axis=1),how='outer',
               left_on='ID',right_on='ID')

BCTb= BCTb.set_index(['ID']).drop(['chemical_name','chemical_casrn','DSSTox_GSID'],axis=1)
BCb = BCb.set_index(['ID']).drop(['chemical_name','chemical_casrn','DSSTox_GSID'],axis=1)
print 'All',BCTb.shape
print 'Bio & Chm',BCb.shape
All (8642, 4634)
Bio & Chm (8403, 4060)
In [267]:
# Check the merged table: rows for the chemicals in CID, last ten toxicity columns.
BCTb.ix[CID,Tox[-10:]]
Out[267]:
sub_trachea sub_uncertain_primary_site sub_ureter sub_urethra sub_urinalysis sub_urinary_bladder sub_uterus sub_vagina sub_water_consumption sub_[not_in_list]
ID
C335762 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
C307244 0 0 0 0 0 0 0 0 0 0
C375951 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
C1763231 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
C335671 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
C4151502 0 0 0 0 0 0 0 0 0 0
C375859 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
C2795393 0 0 0 0 0 0 0 0 0 0
C29420493 0 0 0 0 0 0 0 0 0 0
C3825261 0 0 0 0 0 0 0 0 0 0
C3871996 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
C754916 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
C2058948 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

Binary

In [268]:
# Recode B0 on a copy: values below 6 become 1, values equal to 6 become 0.
X0 = B0.copy()
X0[X0 < 6] = 1
X0[X0 == 6] = 0

# Identifier columns removed once ID has become the index.
id_columns = ['DSSTox_GSID','chemical_casrn','chemical_name']
tox_by_id = T3.reset_index().drop(['DSSTox_GSID','chemical_name'],axis=1)

# Inner joins on ID keep only chemicals present in every table being joined.
BCc = X0.reset_index().merge(FP0.reset_index(),how='inner',
                             left_on='ID',right_on='ID')
BCTc = BCc.merge(tox_by_id,how='inner',left_on='ID',right_on='ID')

BCc = BCc.set_index('ID').drop(id_columns,axis=1)
BCTc = BCTc.set_index('ID').drop(id_columns,axis=1)
BCc.shape,BCTc.shape
Out[268]:
((1792, 4060), (607, 4634))
In [269]:
# Show the date stamp that is embedded in the output file names below.
tmstmp
Out[269]:
'02-08-2016'

Pickle everything for use elsewhere

In [270]:
# Save the merged tables together with the Bio/Chm/Tox column labels. Binary
# mode plus a `with` block ensures the handle is flushed and closed (the
# previous `file(...,'w')` handle was never closed).
with open(PKL_DIR+'tx-tr-ch-'+tmstmp+'.pkl','wb') as pkl_out:
    pickle.dump([BCc,BCTc,BCTb,Bio,Chm,Tox],pkl_out)
In [47]:
# Save the outer-merged table on its own. Binary mode plus a `with` block
# ensures the handle is flushed and closed (the previous `file(...,'w')`
# handle was never closed).
with open(PKL_DIR+'BCTb-'+tmstmp+'.pkl','wb') as pkl_out:
    pickle.dump(BCTb,pkl_out)
In [48]:
# Also write the merged table as a date-stamped CSV.
# NOTE(review): DAT_DIR is reassigned more than once in this notebook; confirm
# it points at the intended output directory when this cell runs.
BCTb.to_csv(DAT_DIR+'BCTb-'+tmstmp+'.csv')
In [55]:
# List the output directory to confirm the export.
os.listdir(DAT_DIR)
Out[55]:
['tox-max-trt-v1.csv',
 'chm-v1.xlsx',
 'tox-v1.csv',
 'chm-v1.csv',
 'tox21-chm-v1.xlsx',
 'bio-v1.csv',
 'chmfp-v1.csv',
 'BCTb-02-12-2015.csv']

Load the data from pickles

In [ ]:
print("\n".join(os.listdir(PKL_DIR)))

def _load_pickle(fname):
    """Unpickle PKL_DIR+fname and close the file afterwards."""
    # pickle can run arbitrary code while loading: only read files that were
    # written by this notebook.
    with open(PKL_DIR+fname,'rb') as pkl_in:
        return pickle.load(pkl_in)

# NOTE(review): the dump cell in this notebook writes six objects
# ([BCc,BCTc,BCTb,Bio,Chm,Tox]) to 'tx-tr-ch-<date>.pkl', while five names are
# unpacked here. Confirm the layout of the 02-12-2015 file; a six-item file
# would raise ValueError on this line.
[BCc,BCTc,Bio,Chm,Tox] = _load_pickle('tx-tr-ch-02-12-2015.pkl')
[A0,C0,C1,B1] = _load_pickle('chm-bio-02-12-2015.pkl')
[T2,T_mt] = _load_pickle('tox-02-12-2015.pkl')