#--------------------------------------------------------------------------------------
#
# toxref_prep.R - code to preprocess the ToxRefDB data
#
# October 2013
# Richard Judson
#
# US EPA
# Questions, comments to: judson.richard@epa.gov, 919-541-3085
#
# Order of processing
#
#--------------------------------------------------------------------------------------
options(java.parameters = "-Xmx1000m")
library(grDevices)
library(RColorBrewer)
library(stringr)
library(fingerprint)
library(e1071)
library(diptest)
#library(xlsx)
#library(xlsxjars)
library(pheatmap)

source("utils.R")
#--------------------------------------------------------------------------------------
#
# Build the output file in long format
#
#--------------------------------------------------------------------------------------
# Write the long-format ToxRefDB chronic (CHR) table: one tab-delimited row per
# chemical and mapped endpoint, holding the lowest / highest dose tested
# (LDT / HDT) for the endpoint's species and the lowest effect level (LEL).
#
# do.read      if TRUE, reload the four input tables from ToxRefDB/ into the
#              globals TOXREF.NEW, TOXREF.NEW.SUM, TOXREFDB.0 and
#              TOXREF.ENDPOINT.MAP
# date.string  suffix of the output file
#              ToxRefDB/ToxRefDB_CHR_long_<date.string>.txt
#
# Besides those globals the function reads CNAMES (row names = chemical CODE)
# and calls uniquify(); neither is defined here.
# An endpoint with no recorded effect row gets the sentinel LEL 1000000.
build.toxrefdb.long <- function(do.read=FALSE,date.string="2013_10_24") {
	cat("==========================================================================\n")
	cat("build.toxrefdb.long\n")
	cat("==========================================================================\n")
	flush.console()
	if(do.read) {
		file <- "ToxRefDB/toxrefdb_study_tg_effect_summary.csv"
		TOXREF.NEW <<- read.csv(file,header=TRUE,sep=",",stringsAsFactors=FALSE,quote="\"",comment.char="")

		file <- "ToxRefDB/toxrefdb_study_endpoint_summary.csv"
		TOXREF.NEW.SUM <<- read.csv(file,header=TRUE,sep=",",stringsAsFactors=FALSE,quote="\"",comment.char="")

		file <- "ToxRefDB/ToxCast_Phase_1_ToxRefDB_20110110.txt"
		temp <- read.table(file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"",comment.char="")
		# row names are the chemical CODE: "C" followed by the CASRN without dashes
		rownames(temp) <- paste("C",str_replace_all(temp[,"CASRN"],"-",""),sep="")
		TOXREFDB.0 <<- temp

		file <- "ToxRefDB/toxref_endpoint_map_2013_10_22.txt"
		TOXREF.ENDPOINT.MAP <<- read.table(file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"",comment.char="")
	}

	# effect-level rows restricted to usable (data_usability <= 3) chronic studies
	toxref.new <- TOXREF.NEW[TOXREF.NEW[,"data_usability"]<=3,]
	toxref.new <- toxref.new[is.element(toxref.new[,"study_type"],"CHR"),]

	outfile <- paste("ToxRefDB/ToxRefDB_CHR_long_",date.string,".txt",sep="")
	s <- "CODE\tCASRN\tName\tEndpoint\tEndpoint_continuous\tLDT\tHDT\tLEL\n"
	cat(file=outfile,s,append=FALSE)
	casrn.list <- sort(uniquify(TOXREF.NEW.SUM[,"chemical_casrn"]))
	nendpoint <- dim(TOXREF.ENDPOINT.MAP)[1]
	# seq_along / seq_len: 1:n would iterate over c(1,0) when n is 0
	for(j in seq_along(casrn.list)) {
		casrn <- casrn.list[j]
		code <- paste("C",str_replace_all(casrn,"-",""),sep="")
		# match() gives an exact lookup; CNAMES[code,] would partially match row names
		cname <- CNAMES[match(code,rownames(CNAMES)),"ShortName"]
		stemp <- TOXREF.NEW.SUM[is.element(TOXREF.NEW.SUM[,"chemical_casrn"],casrn),]
		stemp <- stemp[is.element(stemp[,"study_type"],"CHR"),]
		stemp <- stemp[stemp[,"data_usability"]<=3,]
		if(is.na(cname)) cname <- stemp[1,"chemical_name"]
		print(cname)
		if(dim(stemp)[1]==0) next

		# dose range per species, taken over all usable chronic studies of the chemical
		ldt.rat <- NA
		hdt.rat <- NA
		ldt.mouse <- NA
		hdt.mouse <- NA
		stemp.rat <- stemp[is.element(stemp[,"species"],"rat"),]
		stemp.mouse <- stemp[is.element(stemp[,"species"],"mouse"),]
		if(dim(stemp.rat)[1]>0) {
			ldt.rat <- min(stemp.rat[,"min_dose"])
			hdt.rat <- max(stemp.rat[,"max_dose"])
		}
		if(dim(stemp.mouse)[1]>0) {
			ldt.mouse <- min(stemp.mouse[,"min_dose"])
			hdt.mouse <- max(stemp.mouse[,"max_dose"])
		}

		# the chemical filter does not depend on the endpoint, so apply it once here
		chem.effects <- toxref.new[is.element(toxref.new[,"chemical_casrn"],casrn),]

		for(i in seq_len(nendpoint)) {
			endpoint_subclass <- TOXREF.ENDPOINT.MAP[i,"endpoint_subclass"]
			if(is.na(endpoint_subclass)) next
			endpoint <- TOXREF.ENDPOINT.MAP[i,"endpoint_discrete"]
			endpoint.cont <- TOXREF.ENDPOINT.MAP[i,"endpoint_continuous"]
			species <- TOXREF.ENDPOINT.MAP[i,"species"]

			# only rat and mouse endpoints are emitted, and only when a dose range exists
			ldt <- NA
			hdt <- NA
			if(is.element(species,"rat")) {
				ldt <- ldt.rat
				hdt <- hdt.rat
			}
			if(is.element(species,"mouse")) {
				ldt <- ldt.mouse
				hdt <- hdt.mouse
			}
			if(is.na(ldt) || is.na(hdt)) next

			endpoint_category <- TOXREF.ENDPOINT.MAP[i,"endpoint_category"]
			endpoint_supercategory <- TOXREF.ENDPOINT.MAP[i,"endpoint_supercategory"]
			endpoint_subcategory <- TOXREF.ENDPOINT.MAP[i,"endpoint_subcategory"]

			temp <- chem.effects[is.element(chem.effects[,"species"],species),]
			temp <- temp[is.element(temp[,"endpoint_supercategory"],endpoint_supercategory),]
			temp <- temp[is.element(temp[,"endpoint_subcategory"],endpoint_subcategory),]
			temp <- temp[is.element(temp[,"endpoint_category"],endpoint_category),]
			temp <- temp[is.element(temp[,"endpoint_subclass"],endpoint_subclass),]

			# 1000000 marks "no effect observed" for this chemical / endpoint
			lel <- 1000000
			if(dim(temp)[1]>0) {
				lel <- min(temp[,"dose"])
			}
			s <- paste(code,"\t",casrn,"\t",cname,"\t",endpoint,"\t",endpoint.cont,"\t",ldt,"\t",hdt,"\t",lel,"\n",sep="")
			cat(file=outfile,s,append=TRUE)
			cat(s)
			flush.console()
		}
	}
}
#--------------------------------------------------------------------------------------
#
# Check a Toxrefdb endpoint for phase 1
#
#--------------------------------------------------------------------------------------
# Compare the Phase 1 ToxRefDB hit calls (global TOXREFDB.0) with the calls
# implied by the long-format file (global TOXREFDB.LONG) and write one row per
# chemical / endpoint to ToxRefDB/check_toxrefdb.txt.
#
# do.read      if TRUE, reload TOXREFDB.LONG and TOXREFDB.0 from disk
# date.string  date suffix of the long-format file to read (only used when
#              do.read is TRUE); the default is the file name that used to be
#              hard-coded here
#
# erflag column: 0 = calls agree, 1 = old call > new call, 2 = old call < new
# call, 3 = old call is NA while the new one is not.
# Also reads the global CNAMES (row names = chemical CODE).
check.toxrefdb <- function(do.read=FALSE,date.string="2013_10_22") {
	cat("==========================================================================\n")
	cat("check.toxrefdb\n")
	cat("==========================================================================\n")
	flush.console()
	if(do.read) {
		file <- paste("ToxRefDB/ToxRefDB_CHR_long_",date.string,".txt",sep="")
		TOXREFDB.LONG <<- read.table(file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"",comment.char="")

		file <- "phase_I/ToxCast_Phase_1_ToxRefDB_20110110.txt"
		temp <- read.table(file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"",comment.char="")
		# row names are the chemical CODE: "C" followed by the CASRN without dashes
		rownames(temp) <- paste("C",str_replace_all(temp[,"CASRN"],"-",""),sep="")
		TOXREFDB.0 <<- temp
	}
	outfile <- "ToxRefDB/check_toxrefdb.txt"
	# The header must name all 12 fields written below. With only 11 names,
	# read.table(header=TRUE) would silently use the CODE column as row names.
	s <- "CODE\tCASRN\tName\tEndpoint\tEndpoint_continuous\tLDT\tHDT\thitcall.0\thitcall.1\tLEL.0\tLEL.1\terflag\n"
	cat(file=outfile,s,append=FALSE)
	# columns 1-3 are identifiers; every remaining column name is tried as an endpoint
	endpoint.names <- names(TOXREFDB.0)[-(1:3)]
	for(i in seq_len(dim(TOXREFDB.0)[1])) {
		code <- rownames(TOXREFDB.0)[i]
		casrn <- TOXREFDB.0[i,"CASRN"]
		# match() gives an exact lookup; CNAMES[code,] would partially match row names
		cname <- CNAMES[match(code,rownames(CNAMES)),"ShortName"]
		temp <- TOXREFDB.LONG[is.element(TOXREFDB.LONG[,"CODE"],code),]
		if(dim(temp)[1]==0) next
		for(endpoint in endpoint.names) {
			temp.1 <- temp[is.element(temp[,"Endpoint"],endpoint),]
			if(dim(temp.1)[1]==0) next
			endpoint.cont <- temp.1[1,"Endpoint_continuous"]
			lel.1 <- temp.1[1,"LEL"]
			ldt <- temp.1[1,"LDT"]
			hdt <- temp.1[1,"HDT"]
			if(is.na(hdt) || is.na(ldt)) next

			# new call: anything below the 1000000 "no effect" sentinel is a hit
			hitcall.1 <- 0
			if(lel.1<1000000) hitcall.1 <- 1
			lel.0 <- NA
			if(!is.na(endpoint.cont)) lel.0 <- TOXREFDB.0[code,endpoint.cont]
			# old call: the sentinel 1000000 is mapped to 0 (inactive)
			hitcall.0 <- TOXREFDB.0[code,endpoint]
			if(!is.na(hitcall.0)) {
				if(hitcall.0 == 1000000) hitcall.0 <- 0
			}
			erflag <- 0
			if(!is.na(hitcall.0)) {
				if(hitcall.0>hitcall.1) erflag <- 1
				if(hitcall.0<hitcall.1) erflag <- 2
			} else {
				if(!is.na(hitcall.1)) erflag <- 3
			}
			s <- paste(code,"\t",casrn,"\t",cname,"\t",endpoint,"\t",endpoint.cont,"\t",ldt,"\t",hdt,"\t",hitcall.0,"\t",hitcall.1,"\t",lel.0,"\t",lel.1,"\t",erflag,"\n",sep="")
			cat(file=outfile,s,append=TRUE)
			# only disagreements are echoed to the console
			if(erflag>0) {
				cat(s)
				flush.console()
			}
		}
	}
	# interactive checkpoint kept from the original workflow
	browser()
}
#--------------------------------------------------------------------------------------
#
# Compare the old and new ToxCast
#
#--------------------------------------------------------------------------------------
# Compare Phase 1 AC50 values (global PHASE1, stored as -log10 molar) with the
# newer matrix (global PHASE2) for every chemical / assay pair present in both,
# and write one classified row per pair to phase_I/check_toxcast.txt.
#
# do.read  if TRUE, reload PHASE2 and PHASE1 from disk
#
# The flag column is one of: "both NA", "NA to value", "value to NA",
# "equal inactive", "equal active", "inactive to active", "active to inactive",
# "both active increased potency", "both active decreased potency",
# "equal active approximately" (ratio of the two AC50s <= 1.5).
# An AC50 of 1000000 is treated as inactive.
# Also reads the global CNAMES (row names = chemical CODE).
check.toxcast <- function(do.read=FALSE) {
	cat("==========================================================================\n")
	cat("check.toxcast\n")
	cat("==========================================================================\n")
	flush.console()
	if(do.read) {
		file <- "input/ToxCast_Tox21_ALL_AC50_Matrix_2013_03_22.txt"
		temp <- read.table(file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"",comment.char="")
		rownames(temp) <- temp[,"CODE"]
		PHASE2 <<- temp

		file <- "phase_I/ToxCast_Phase_1_20110110_assays.txt"
		temp <- read.table(file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"",comment.char="")
		# row names are the chemical CODE: "C" followed by the CASRN without dashes
		rownames(temp) <- paste("C",str_replace_all(temp[,"CASRN"],"-",""),sep="")
		PHASE1 <<- temp
	}

	inactive <- 1000000
	# classify one old / new AC50 pair (both on the same scale as `inactive`)
	classify <- function(old,new) {
		if(is.na(old)) {
			if(!is.na(new)) return("NA to value")
			return("both NA")
		}
		if(is.na(new)) return("value to NA")
		if(old==new) {
			if(old==inactive) return("equal inactive")
			return("equal active")
		}
		if(old==inactive && new<inactive) return("inactive to active")
		if(old<inactive && new==inactive) return("active to inactive")
		if(old>new) {
			# the new AC50 is lower, i.e. the chemical looks more potent
			if(old/new>1.5) return("both active increased potency")
			return("equal active approximately")
		}
		if(new/old>1.5) return("both active decreased potency")
		"equal active approximately"
	}

	outfile <- "phase_I/check_toxcast.txt"
	# first AC50 column was misspelled "AC5.0"
	s <- "CODE\tCASRN\tName\tAssay\tAC50.0\tAC50.1\tflag\n"
	cat(file=outfile,s,append=FALSE)
	# restrict to chemicals and assays present in both matrices
	code.list <- rownames(PHASE1)
	code.list <- code.list[is.element(code.list,rownames(PHASE2))]
	assay.list <- names(PHASE1)[-(1:3)]
	assay.list <- assay.list[is.element(assay.list,names(PHASE2))]
	nassay <- length(assay.list)

	# seq_along / seq_len: 1:n would iterate over c(1,0) when n is 0
	for(i in seq_along(code.list)) {
		code <- code.list[i]
		# match() gives an exact lookup; CNAMES[code,] would partially match row names
		cname.row <- match(code,rownames(CNAMES))
		casrn <- CNAMES[cname.row,"CASRN"]
		cname <- CNAMES[cname.row,"ShortName"]
		cat(cname,"\n")
		flush.console()
		# buffer one chemical's rows so the file is opened once per chemical
		rows <- character(nassay)
		for(j in seq_len(nassay)) {
			assay <- assay.list[j]
			# Phase 1 stores -log10(molar); the factor 1000000 rescales molar to micromolar
			ac50.0 <- PHASE1[code,assay]
			if(!is.na(ac50.0)) ac50.0 <- 10^(-ac50.0)*1000000
			ac50.1 <- PHASE2[code,assay]
			flag <- classify(ac50.0,ac50.1)
			rows[j] <- paste(code,"\t",casrn,"\t",cname,"\t",assay,"\t",format(ac50.0,digits=3),"\t",format(ac50.1,digits=3),"\t",flag,"\n",sep="")
		}
		if(nassay>0) cat(file=outfile,rows,sep="",append=TRUE)
	}
	# interactive checkpoint kept from the original workflow
	browser()
}
#--------------------------------------------------------------------------------------
#
# Summarize the old vs. new ToxCast comparison (check_toxcast.txt) per assay
#
#--------------------------------------------------------------------------------------
# Summarize the comparison file written by check.toxcast per assay: number of
# chemicals active in the old data (Hits.1), in the new data (Hits.2), in both,
# in either, and Similarity = Hits.both / Hits.either.
#
# do.read  if TRUE, reload phase_I/check_toxcast.txt into the global CHECK
#
# Writes phase_I/phase_I_II_assay_check.txt. Similarity is "NaN" for an assay
# with no hit in either data set (0 / 0).
check.toxcast.assay <- function(do.read=FALSE) {
	cat("==========================================================================\n")
	cat("check.toxcast.assay\n")
	cat("==========================================================================\n")
	flush.console()
	if(do.read) {
		file <- "phase_I/check_toxcast.txt"
		CHECK <<- read.table(file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"",comment.char="")
	}
	# flag values (as written by check.toxcast) grouped by what they imply
	flags.both <- c("both active decreased potency","both active increased potency","equal active","equal active approximately")
	flags.1 <- c("active to inactive",flags.both)
	flags.2 <- c(flags.both,"inactive to active")
	flags.either <- c("active to inactive","inactive to active",flags.both)

	assay.list <- sort(uniquify(CHECK[,"Assay"]))
	nassay <- length(assay.list)
	result <- as.data.frame(matrix(nrow=nassay,ncol=6))
	names(result) <- c("Assay","Hits.1","Hits.2","Hits.both","Hits.either","Similarity")
	if(nassay>0) result[,"Assay"] <- assay.list
	# seq_len: 1:nassay would iterate over c(1,0) and fabricate rows when nassay is 0
	for(i in seq_len(nassay)) {
		flags <- CHECK[is.element(CHECK[,"Assay"],assay.list[i]),"flag"]
		nboth <- sum(is.element(flags,flags.both))
		neither <- sum(is.element(flags,flags.either))
		result[i,"Hits.1"] <- sum(is.element(flags,flags.1))
		result[i,"Hits.2"] <- sum(is.element(flags,flags.2))
		result[i,"Hits.both"] <- nboth
		result[i,"Hits.either"] <- neither
		result[i,"Similarity"] <- format(nboth / neither,digits=2)
	}
	outfile <- "phase_I/phase_I_II_assay_check.txt"
	write.table(result,file=outfile, row.names=FALSE, append=FALSE, quote=FALSE, sep = "\t")
	# interactive checkpoint kept from the original workflow
	browser()
}
#--------------------------------------------------------------------------------------
#
# Summarize the old vs. new ToxCast comparison (check_toxcast.txt) per chemical
#
#--------------------------------------------------------------------------------------
# Summarize the comparison file written by check.toxcast per chemical: number
# of assays active in the old data (Hits.1), in the new data (Hits.2), in both,
# in either, and Similarity = Hits.both / Hits.either.
#
# do.read  if TRUE, reload phase_I/check_toxcast.txt into the global CHECK
#
# Writes phase_I/phase_I_II_chemical_check.txt. Similarity is "NaN" for a
# chemical with no hit in either data set (0 / 0).
# Also reads the global CNAMES (row names = chemical CODE).
check.toxcast.chemical <- function(do.read=FALSE) {
	cat("==========================================================================\n")
	cat("check.toxcast.chemical\n")
	cat("==========================================================================\n")
	flush.console()
	if(do.read) {
		file <- "phase_I/check_toxcast.txt"
		CHECK <<- read.table(file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"",comment.char="")
	}
	# flag values (as written by check.toxcast) grouped by what they imply
	flags.both <- c("both active decreased potency","both active increased potency","equal active","equal active approximately")
	flags.1 <- c("active to inactive",flags.both)
	flags.2 <- c(flags.both,"inactive to active")
	flags.either <- c("active to inactive","inactive to active",flags.both)

	code.list <- sort(uniquify(CHECK[,"CODE"]))
	nchem <- length(code.list)
	result <- as.data.frame(matrix(nrow=nchem,ncol=8))
	names(result) <- c("CODE","CASRN","Name","Hits.1","Hits.2","Hits.both","Hits.either","Similarity")
	if(nchem>0) {
		result[,"CODE"] <- code.list
		# match() gives an exact lookup; CNAMES[code,] would partially match row names
		cname.rows <- match(code.list,rownames(CNAMES))
		result[,"CASRN"] <- CNAMES[cname.rows,"CASRN"]
		result[,"Name"] <- CNAMES[cname.rows,"ShortName"]
	}
	# seq_len: 1:nchem would iterate over c(1,0) and fabricate rows when nchem is 0
	for(i in seq_len(nchem)) {
		flags <- CHECK[is.element(CHECK[,"CODE"],code.list[i]),"flag"]
		nboth <- sum(is.element(flags,flags.both))
		neither <- sum(is.element(flags,flags.either))
		result[i,"Hits.1"] <- sum(is.element(flags,flags.1))
		result[i,"Hits.2"] <- sum(is.element(flags,flags.2))
		result[i,"Hits.both"] <- nboth
		result[i,"Hits.either"] <- neither
		result[i,"Similarity"] <- format(nboth / neither,digits=2)
	}
	outfile <- "phase_I/phase_I_II_chemical_check.txt"
	write.table(result,file=outfile, row.names=FALSE, append=FALSE, quote=FALSE, sep = "\t")
	# interactive checkpoint kept from the original workflow
	browser()
}
#--------------------------------------------------------------------------------------
#
# Build the ToxRefDB matrix
#
#--------------------------------------------------------------------------------------
# Pivot the long-format ToxRefDB file into three chemical x endpoint matrices
# (LEL, HDT and LDT) and write each as a tab-delimited file
# ToxRefDB/ToxRefDB_CHR_INIT_continuous[_HDT|_LDT]_<date.string>.txt.
#
# do.read      if TRUE, reload the long-format file into the global
#              TOXREFDB.LONG and the chemical name table into CNAMES / CODE.LIST
# date.string  date suffix shared by the input and output file names
#
# Columns of every output: CODE, CASRN, Name, then one column per endpoint.
# CASRN / Name come from CNAMES when the code is known there, otherwise from
# the long-format rows. Cells with no long-format row stay NA.
toxrefdb.matrix.0 <- function(do.read=FALSE,date.string="2013_10_24") {
	cat("==========================================================================\n")
	cat("toxrefdb.matrix.0 \n")
	cat("==========================================================================\n")
	flush.console()
	if(do.read) {
		file <- paste("ToxRefDB/ToxRefDB_CHR_long_",date.string,".txt",sep="")
		TOXREFDB.LONG <<- read.table(file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"",comment.char="")

		file <- "input/ToxCastChemNamesMaster_2013_09_06.txt"
		temp <- read.table(file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="")
		cat("loaded Chemical Information\n"); flush.console()
		CODE.LIST <<- temp[,"CODE"]
		rownames(temp) <- temp[,"CODE"]
		CNAMES <<- temp
	}
	endpoint.list <- sort(uniquify(TOXREFDB.LONG[,"Endpoint"]))
	code.list <- sort(uniquify(TOXREFDB.LONG[,"CODE"]))
	nchem <- length(code.list)
	nendpoint <- length(endpoint.list)
	# matrix() fills with NA, so every endpoint column already starts out as NA.
	# The old per-endpoint reset loop was redundant and indexed TOXREFDB.LONG rows
	# with an endpoint-list index, so it has been dropped.
	result <- as.data.frame(matrix(nrow=nchem,ncol=3+nendpoint))
	names(result) <- c("CODE","CASRN","Name",endpoint.list)
	result[,"CODE"] <- code.list
	# match() gives an exact lookup; CNAMES[code,] would partially match row names
	cname.rows <- match(code.list,rownames(CNAMES))
	result[,"CASRN"] <- CNAMES[cname.rows,"CASRN"]
	result[,"Name"] <- CNAMES[cname.rows,"ShortName"]
	rownames(result) <- code.list
	result.l <- result
	result.h <- result
	ndata <- dim(TOXREFDB.LONG)[1]
	# seq_len: 1:ndata would iterate over c(1,0) when the long table is empty
	for(i in seq_len(ndata)) {
		code <- TOXREFDB.LONG[i,"CODE"]
		casrn <- TOXREFDB.LONG[i,"CASRN"]
		cname <- TOXREFDB.LONG[i,"Name"]
		endpoint <- TOXREFDB.LONG[i,"Endpoint"]
		# a later row for the same chemical / endpoint overwrites an earlier one
		result[code,endpoint] <- TOXREFDB.LONG[i,"LEL"]
		result.h[code,endpoint] <- TOXREFDB.LONG[i,"HDT"]
		result.l[code,endpoint] <- TOXREFDB.LONG[i,"LDT"]
		# fall back to the identifiers in the long file when CNAMES had none
		if(is.na(result[code,"CASRN"])) {
			result[code,"CASRN"] <- casrn
			result.h[code,"CASRN"] <- casrn
			result.l[code,"CASRN"] <- casrn
		}
		if(is.na(result[code,"Name"])) {
			result[code,"Name"] <- cname
			result.h[code,"Name"] <- cname
			result.l[code,"Name"] <- cname
		}

		if(i%%1000==0) {
			cat("processed ",i," of ",ndata," records\n")
			flush.console()
		}
	}
	# interactive checkpoint kept from the original workflow; it comes before the files are written
	browser()
	outfile <- paste("ToxRefDB/ToxRefDB_CHR_INIT_continuous_",date.string,".txt",sep="")
	write.table(result,file=outfile, row.names=FALSE, append=FALSE, quote=FALSE, sep = "\t")

	outfile <- paste("ToxRefDB/ToxRefDB_CHR_INIT_continuous_HDT_",date.string,".txt",sep="")
	write.table(result.h,file=outfile, row.names=FALSE, append=FALSE, quote=FALSE, sep = "\t")

	outfile <- paste("ToxRefDB/ToxRefDB_CHR_INIT_continuous_LDT_",date.string,".txt",sep="")
	write.table(result.l,file=outfile, row.names=FALSE, append=FALSE, quote=FALSE, sep = "\t")
}
#--------------------------------------------------------------------------------------
#
# fix the toxref file - if neoplastic is on, so should proliferative
#
#--------------------------------------------------------------------------------------
# Enforce the lesion hierarchy on the continuous (LEL) matrix: the LEL of a
# "2_PreneoplasticLesion" endpoint may not exceed that of the matching
# "3_NeoplasticLesion" endpoint, and the LEL of a "1_AnyLesion" endpoint may
# not exceed that of the matching "2_PreneoplasticLesion" endpoint.
#
# do.read      if TRUE, reload the INIT continuous matrix from disk into the
#              globals TOXREF.CHR.INIT.CONT and TOXREF.CHR.FIXED.CONT
# date.string  date suffix shared by the input and output file names
#
# The corrected matrix is written to
# ToxRefDB/ToxRefDB_CHR_FIXED_continuous_<date.string>.txt. As before, the
# corrections are made on a local copy: the global TOXREF.CHR.FIXED.CONT is
# not updated by this function.
fix.toxrefdb <- function(do.read=FALSE,date.string="2013_10_24") {
	cat("==========================================================================\n")
	cat("fix.toxrefdb\n")
	cat("==========================================================================\n")
	flush.console()
	if(do.read) {
		file <- paste("ToxRefDB/ToxRefDB_CHR_INIT_continuous_",date.string,".txt",sep="")
		temp <- read.table(file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"",comment.char="")
		rownames(temp) <- temp[,"CODE"]
		TOXREF.CHR.INIT.CONT <<- temp
		TOXREF.CHR.FIXED.CONT <<- temp
	}
	init <- TOXREF.CHR.INIT.CONT
	fixed <- TOXREF.CHR.FIXED.CONT
	# columns 1-3 are CODE, CASRN, Name; the rest are endpoints
	endpoint.list <- names(fixed)[-(1:3)]

	# For every endpoint whose name contains child.tag and whose parent endpoint
	# (child.tag replaced by parent.tag) exists, lower the child LEL to the parent
	# LEL wherever the child is higher. Parent LELs are read from parent.src.
	# Rows where either value is NA are left alone.
	cap.by.parent <- function(fixed,child.tag,parent.tag,parent.src) {
		for(end.1 in endpoint.list[grepl(child.tag,endpoint.list)]) {
			end.2 <- str_replace_all(end.1,child.tag,parent.tag)
			if(!is.element(end.2,endpoint.list)) next
			child.lel <- init[,end.1]
			parent.lel <- parent.src[,end.2]
			# the NA guard on the parent avoids `if(NA)` errors when only the child is present
			too.high <- which(!is.na(child.lel) & !is.na(parent.lel) & child.lel>parent.lel)
			for(j in too.high) {
				fixed[j,end.1] <- parent.lel[j]
				cat(end.1,":",end.2,TRUE,child.lel[j],parent.lel[j],"\n")
			}
		}
		fixed
	}

	# pass 1: preneoplastic <= neoplastic (parent values are the untouched INIT ones)
	fixed <- cap.by.parent(fixed,"2_PreneoplasticLesion","3_NeoplasticLesion",init)
	# pass 2: any lesion <= preneoplastic. The parent is read from the matrix
	# corrected in pass 1, so a lowered preneoplastic LEL carries down to "any
	# lesion". Reading INIT here, as the code used to, lost that cascade.
	fixed <- cap.by.parent(fixed,"1_AnyLesion","2_PreneoplasticLesion",fixed)

	outfile <-  paste("ToxRefDB/ToxRefDB_CHR_FIXED_continuous_",date.string,".txt",sep="")
	write.table(fixed,file=outfile, row.names=FALSE, append=FALSE, quote=FALSE, sep = "\t")
}
#--------------------------------------------------------------------------------------
#
# create the discrete ToxRefDB file
#
#--------------------------------------------------------------------------------------
# Turn the fixed continuous (LEL) matrix into a discrete hit-call matrix and
# write it to ToxRefDB/ToxRefDB_CHR_FIXED_discrete_<date.string>.txt.
#
# do.read      when true, load the fixed continuous file into the globals
#              TOXREF.CHR.FIXED.CONT and TOXREF.CHR.FIXED.DISC first
# date.string  date suffix shared by the input and output file names
#
# Per cell: NA or negative -> NA, 0 -> 0, (0, 2000) -> 1, >= 2000 -> 0.
toxrefdb.cont.to.disc <- function(do.read=F,date.string="2013_10_24") {
	rule <- "==========================================================================\n"
	cat(rule)
	cat("toxrefdb.cont.to.disc\n")
	cat(rule)
	flush.console()
	if(do.read) {
		infile <- paste("ToxRefDB/ToxRefDB_CHR_FIXED_continuous_",date.string,".txt",sep="")
		loaded <- read.table(infile,header=T,sep="\t",stringsAsFactors=F,quote="\"",comment="")
		rownames(loaded) <- loaded[,"CODE"]
		TOXREF.CHR.FIXED.CONT <<- loaded
		TOXREF.CHR.FIXED.DISC <<- loaded
	}

	# same four in-place replacements as before, in the same order
	to.hitcall <- function(lel) {
		lel <- replace(lel,is.na(lel),-1)
		lel <- replace(lel,lel>=2000,0)
		lel <- replace(lel,lel>0,1)
		replace(lel,lel<0,NA)
	}

	# work on a local copy; the global discrete matrix itself is not modified
	discrete <- TOXREF.CHR.FIXED.DISC
	last.col <- dim(TOXREF.CHR.FIXED.CONT)[2]
	for(end.1 in names(TOXREF.CHR.FIXED.CONT)[4:last.col]) {
		discrete[,end.1] <- to.hitcall(TOXREF.CHR.FIXED.CONT[,end.1])
	}

	outfile <- paste("ToxRefDB/ToxRefDB_CHR_FIXED_discrete_",date.string,".txt",sep="")
	write.table(discrete,file=outfile, row.names=F, append=FALSE, quote=F, sep = "\t")
}
##################################################################################
##################################################################################
##################################################################################
##################################################################################
#--------------------------------------------------------------------------------------
#
# Check a Toxrefdb endpoint for phase 1
#
#--------------------------------------------------------------------------------------
# Legacy check of a single ToxRefDB endpoint against the Phase 1 file.
#
# For every CASRN in the global TOXREFDB.0 this builds one result row holding
# the Phase 1 values for `endpoint` / `endpoint.disc` and the QC level, LDT,
# HDT and LEL derived from the globals TOXREF.NEW.SUM and TOXREF.NEW, then
# writes the table to ToxRefDB/ToxRefDB_check_<endpoint>.txt.
#
# do.read                 if TRUE, reload TOXREF.NEW.SUM and TOXREFDB.0 from disk
# endpoint                TOXREFDB.0 column copied to Phase.1.LEL
# endpoint.disc           TOXREFDB.0 column copied to Phase.1.HitCall
# study_type, species     filters applied to both TOXREF.NEW and TOXREF.NEW.SUM
# endpoint_category, endpoint_supercategory, endpoint_subcategory,
# endpoint_subclass       filters applied to TOXREF.NEW only
# max_study_qc            keep rows with data_usability <= this value
#
# NOTE(review): TOXREF.NEW is used below but its loader is commented out, so
# it must already exist in the global environment.
# NOTE(review): the "hdt" / "ldt" columns read here differ from the
# "min_dose" / "max_dose" columns used by build.toxrefdb.long - confirm the
# input files still carry them.
check.toxrefdb.old <- function(do.read=F,
							endpoint="CHR_Rat_LiverTumors",
							endpoint.disc="CHR_Rat_Liver_3_NeoplasticLesion",
							study_type="CHR",
							endpoint_category="Systemic",
							endpoint_supercategory="SystemicCarcinogenic",
							endpoint_subcategory="PathologyNeoplastic",
							species="rat",
							max_study_qc=3,
							endpoint_subclass="Liver") {
	# the banner prints the function's name without the ".old" suffix
	cat("==========================================================================\n")
	cat("check.toxrefdb\n")
	cat("==========================================================================\n")
    flush.console()
    if(do.read) {
#    	file <- "ToxRefDB/toxrefdb_study_tg_effect_summary.csv"
#    	temp <- read.csv(file,header=T,sep=",",stringsAsFactors=F,quote="\"",comment="")
#    	TOXREF.NEW <<- temp
    	
    	file <- "ToxRefDB/toxrefdb_study_endpoint_summary.csv"
    	temp <- read.csv(file,header=T,sep=",",stringsAsFactors=F,quote="\"",comment="")
    	TOXREF.NEW.SUM <<- temp
    	    	
    	file <- "phase_I/ToxCast_Phase_1_ToxRefDB_20110110.txt"
    	temp <- read.table(file,header=T,sep="\t",stringsAsFactors=F,quote="\"",comment="")
		# row names become the chemical CODE: "C" + CASRN with the dashes removed
		for(i in 1:dim(temp)[1]) {
			casrn <- temp[i,"CASRN"]
			code <- paste("C",str_replace_all(casrn,"-",""),sep="")
			rownames(temp)[i] <- code
		}
		TOXREFDB.0 <<- temp
    	
	}
	# tnew: effect rows matching the species, study type, endpoint filters and QC limit
	temp <- TOXREF.NEW[is.element(TOXREF.NEW[,"species"],species),]
	temp <- temp[is.element(temp[,"study_type"],study_type),]
	temp <- temp[is.element(temp[,"endpoint_supercategory"],endpoint_supercategory),]
	temp <- temp[is.element(temp[,"endpoint_subcategory"],endpoint_subcategory),]
	temp <- temp[is.element(temp[,"endpoint_category"],endpoint_category),]
	temp <- temp[is.element(temp[,"endpoint_subclass"],endpoint_subclass),]
	tnew <- temp[temp[,"data_usability"]<=max_study_qc,]

	# tnew.sum: summary rows matching only the species, study type and QC limit
	temp <- TOXREF.NEW.SUM[is.element(TOXREF.NEW.SUM[,"species"],species),]
	temp <- temp[is.element(temp[,"study_type"],study_type),]
	tnew.sum <- temp[temp[,"data_usability"]<=max_study_qc,]

	casrn.list <- sort(uniquify(TOXREFDB.0[,"CASRN"]))
	nchem <- length(casrn.list)
	result <- as.data.frame(matrix(nrow=nchem,ncol=10))
	names(result) <- c("CODE","CASRN","Name","Endpoint","Phase.1.LEL","Phase.1.HitCall","QCLevel","LDT","HDT","LEL")
	for(i in 1:nchem) {
		casrn <- casrn.list[i]
		code <- paste("C",str_replace_all(casrn,"-",""),sep="")
		result[i,"CODE"] <- code
		result[i,"CASRN"] <- casrn
		result[i,"Name"] <- CNAMES[code,"ShortName"]
		result[i,"Endpoint"] <- endpoint
		result[i,"Phase.1.LEL"] <- TOXREFDB.0[code,endpoint]
		result[i,"Phase.1.HitCall"] <- TOXREFDB.0[code,endpoint.disc]
		
		# first fill QC level and dose range from the study summary rows ...
		temp.2.sum <- tnew.sum[is.element(tnew.sum[,"chemical_casrn"],casrn),]
		if(dim(temp.2.sum)[1]>0) {
			result[i,"QCLevel"] <- min(temp.2.sum[,"data_usability"])
			result[i,"HDT"] <- max(temp.2.sum[,"hdt"])
			result[i,"LDT"] <- min(temp.2.sum[,"ldt"])
			print(result[i,])
			#browser()
		}
		# ... then, if effect rows exist for this endpoint, overwrite them from
		# those rows and add the LEL (minimum dose)
		temp.2 <- tnew[is.element(tnew[,"chemical_casrn"],casrn),]
		if(dim(temp.2)[1]>0) {
			result[i,"QCLevel"] <- min(temp.2[,"data_usability"])
			result[i,"HDT"] <- max(temp.2[,"hdt"])
			result[i,"LDT"] <- min(temp.2[,"ldt"])
			result[i,"LEL"] <- min(temp.2[,"dose"])
			print(result[i,])
			#browser()
		}		
	}
	outfile <- paste("ToxRefDB/ToxRefDB_check_",endpoint,".txt",sep="")
	write.table(result,file=outfile, row.names=F, append=FALSE, quote=F, sep = "\t")

	# drops into the interactive debugger after the file has been written
	browser()
}
#--------------------------------------------------------------------------------------
#
# Build the ToxRefDB index
#
#--------------------------------------------------------------------------------------
# Legacy: condense the ToxRefDB index (global TRINDEX.MASTER) into one
# tab-delimited row per chemical CODE and study id, written to
# ToxRefDB/ToxRefDB_small_index.txt and echoed to the console.
#
# do.read  if TRUE, reload TRINDEX.MASTER from the Excel workbook
#
# For each study the type, species, ldt, hdt and dose unit are taken from the
# first index row of that study.
# NOTE(review): read.xlsx2 is not defined in this file and the library(xlsx)
# line at the top is commented out, so do.read=TRUE needs that package loaded
# by the caller. The "startrow" / "endrow" argument names are passed as found;
# confirm them against the xlsx documentation.
build.toxrefdb.index.old <- function(do.read=FALSE) {
	cat("==========================================================================\n")
	cat("build.toxrefdb.index\n")
	cat("==========================================================================\n")
	flush.console()
	if(do.read) {
		file <- "ToxRefDB/ToxRefDB_Index_CDR_2013_08_08.xlsx"
		temp <- read.xlsx2(file,sheetIndex=1,startrow=1,endrow=10)
		TRINDEX.MASTER <<- temp
	}
	temp <- TRINDEX.MASTER

	file <- "ToxRefDB/ToxRefDB_small_index.txt"
	s <- "CODE\tCASRN\tName\tStudyType\tStudyID\tSpecies\tMinDose\tMaxDose\tUnits\n"
	cat(file=file,s,append=FALSE)
	code.list <- sort(uniquify(temp[,"CODE"]))
	# seq_along: 1:length(x) would iterate over c(1,0) when x is empty
	for(i in seq_along(code.list)) {
		code <- code.list[i]
		temp.1 <- temp[is.element(temp[,"CODE"],code),]
		cname <- temp.1[1,"chemical_name"]
		casrn <- temp.1[1,"chemical_casrn"]
		sid.list <- sort(uniquify(temp.1[,"study_id"]))
		for(j in seq_along(sid.list)) {
			sid <- sid.list[j]
			# the first index row of the study supplies all study-level fields
			temp.2 <- temp.1[is.element(temp.1[,"study_id"],sid),]
			stype <- temp.2[1,"study_type"]
			ldt <- temp.2[1,"ldt"]
			hdt <- temp.2[1,"hdt"]
			species <- temp.2[1,"species"]
			units <- temp.2[1,"dose_unit"]
			# as.character first, so a factor study id yields its label rather than its level code
			s <- paste(code,"\t",casrn,"\t",cname,"\t",stype,"\t",as.integer(as.character(sid)),"\t",species,"\t",ldt,"\t",hdt,"\t",units,"\n",sep="")
			cat(file=file,s,append=TRUE)
			cat(s)
		}
	}
}
#--------------------------------------------------------------------------------------
#
# Build the ToxRefDB chemical x endpoint matrix for one study type and species
#
#--------------------------------------------------------------------------------------
# Legacy: build a chemical x endpoint matrix for one study type and species
# from the small index (global INDEX) and the raw effect table (global
# RAW.DATA), and write it to ToxRefDB/ToxRefDB_matrix_<type>_<species>.txt.
#
# do.read  if TRUE, reload INDEX and RAW.DATA from disk
# type     study type to keep (INDEX column StudyType)
# species  species to keep (INDEX column Species)
#
# Endpoints are "<endpoint_subcategory>_<endpoint_subclass>" for the
# PathologyProliferative and PathologyNeoplastic subcategories. Each cell is
# the largest score seen for that chemical / endpoint:
#   0 = no effect row, 1 = effect at a dose above half the highest dose tested,
#   2 = effect at a lower dose, 3 = effect at the lowest dose tested.
# TotalEffects counts every raw effect row of the chemical's selected studies.
build.toxrefdb.type.old <- function(do.read=FALSE,type="CHR",species="rat") {
	cat("==========================================================================\n")
	# banner used to say "build.toxrefdb.index" (copy-paste from the index builder)
	cat("build.toxrefdb.type\n")
	cat("==========================================================================\n")
	flush.console()
	if(do.read) {
		# same spelling as the file written by build.toxrefdb.index.old; the old
		# "ToxrefDB_small_index.txt" only resolved on case-insensitive filesystems
		file <- "ToxRefDB/ToxRefDB_small_index.txt"
		INDEX <<- read.table(file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"",comment.char="")
		# this assignment creates a local copy of INDEX; the global keeps default row names
		rownames(INDEX) <- INDEX[,"StudyID"]

		file <- "ToxRefDB/ToxCast_ToxRefDB_CHR_rat_2013_08_08.txt"
		RAW.DATA <<- read.table(file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"",comment.char="")
	}
	# studies of the requested type and species that also have raw effect data
	sid.list.with.data <- sort(uniquify(RAW.DATA[,"study_id"]))
	temp.1 <- INDEX[is.element(INDEX[,"StudyType"],type),]
	sid.list <- sort(uniquify(temp.1[is.element(temp.1[,"Species"],species),"StudyID"]))
	sid.list <- sid.list[is.element(sid.list,sid.list.with.data)]
	temp.1 <- temp.1[is.element(temp.1[,"StudyID"],sid.list),]
	code.list <- sort(uniquify(temp.1[,"CODE"]))
	nchem <- length(code.list)
	cat("Number of chemicals:",nchem,"\n")

	cat("build the preliminary list of endpoints\n")
	flush.console()
	cat("Size of raw data: ",dim(RAW.DATA)[1],"\n")
	# from here on temp.1 holds the raw effect rows of the selected studies
	temp.1 <- RAW.DATA[is.element(RAW.DATA[,"study_id"],sid.list),]
	cat("Size with selected studies: ",dim(temp.1)[1],"\n")

	tumor.classes <- c("PathologyProliferative","PathologyNeoplastic")
	is.tumor <- is.element(temp.1[,"endpoint_subcategory"],tumor.classes)
	endpoint.list <- c()
	# guarded because paste() on zero-length input would return the single string "_"
	if(any(is.tumor)) {
		endpoint.list <- paste(temp.1[is.tumor,"endpoint_subcategory"],"_",temp.1[is.tumor,"endpoint_subclass"],sep="")
	}
	endpoint.list <- sort(uniquify(endpoint.list))
	cat("Number of endpoints: ",length(endpoint.list),"\n")
	result <- as.data.frame(matrix(nrow=nchem,ncol=8+length(endpoint.list)))
	names(result) <- c("CODE","CASRN","Name","Type","Species","MinDose","MaxDose","TotalEffects",endpoint.list)
	if(length(endpoint.list)>0) result[,endpoint.list] <- 0
	studies.with.rows <- temp.1[,"study_id"]
	# seq_len: 1:n would iterate over c(1,0) when n is 0
	for(i in seq_len(nchem)) {
		code <- code.list[i]
		itemp <- INDEX[is.element(INDEX[,"CODE"],code),]
		itemp <- itemp[is.element(itemp[,"Species"],species),]
		itemp <- itemp[is.element(itemp[,"StudyType"],type),]

		casrn <- itemp[1,"CASRN"]
		cname <- itemp[1,"Name"]
		# dose range over all index rows of this chemical / species / type
		ldt <- min(itemp[,"MinDose"])
		hdt <- max(itemp[,"MaxDose"])
		result[i,"CODE"] <- code
		result[i,"CASRN"] <- casrn
		result[i,"Name"] <- cname
		result[i,"Type"] <- type
		result[i,"Species"] <- species
		result[i,"MinDose"] <- ldt
		result[i,"MaxDose"] <- hdt

		count <- 0
		for(j in seq_len(dim(itemp)[1])) {
			sid <- itemp[j,"StudyID"]
			if(!is.element(sid,studies.with.rows)) next
			temp.2 <- temp.1[is.element(studies.with.rows,sid),]
			count <- count+dim(temp.2)[1]
			for(k in seq_len(dim(temp.2)[1])) {
				esc <- temp.2[k,"endpoint_subcategory"]
				if(!is.element(esc,tumor.classes)) next
				organ <- temp.2[k,"endpoint_subclass"]
				endpoint <- paste(esc,"_",organ,sep="")
				dose <- temp.2[k,"dose"]
				# score the effect dose: 3 at the LDT, 1 above HDT/2, otherwise 2
				value <- 2
				if(dose>hdt/2) value <- 1
				if(dose==ldt) value <- 3
				value.current <- result[i,endpoint]
				cat(esc,organ,"value:",value,"current:",value.current,"dose:",dose,"ldt:",ldt,"hdt:",hdt,"\n")
				if(value>value.current) result[i,endpoint] <- value
			}
		}
		result[i,"TotalEffects"] <- count
	}
	outfile <- paste("ToxRefDB/ToxRefDB_matrix_",type,"_",species,".txt",sep="")
	write.table(result,file=outfile, row.names=FALSE, append=FALSE, quote=FALSE, sep = "\t")
}
#--------------------------------------------------------------------------------------
#
# fix the toxref file - if neoplastic is on, so should proliferative
#
#--------------------------------------------------------------------------------------
# Legacy fix-up of the CHR rat matrix: for the columns at positions 54 to 92,
# raise the value of each "...Proliferative..." column to the value of its
# "...Neoplastic..." counterpart wherever the proliferative value is smaller,
# then write the result to ToxRefDB/ToxRefDB_matrix_CHR_rat_FIXED.txt.
#
# do.read  when true, load the matrix from disk into the globals
#          TOXREF.CHR.RAT and TOXREF.CHR.RAT.FIX first
#
# The changes are made on a local copy; the globals are not modified.
fix.toxrefdb.old <- function(do.read=F) {
	rule <- "==========================================================================\n"
	cat(rule)
	cat("fix.toxrefdb\n")
	cat(rule)
	flush.console()
	if(do.read) {
		infile <- "ToxRefDB/ToxRefDB_matrix_CHR_rat.txt"
		loaded <- read.table(infile,header=T,sep="\t",stringsAsFactors=F,quote="\"",comment="")
		rownames(loaded) <- loaded[,"CODE"]
		TOXREF.CHR.RAT <<- loaded
		TOXREF.CHR.RAT.FIX <<- loaded
	}
	# interactive checkpoint before any value is changed
	browser()
	patched <- TOXREF.CHR.RAT.FIX
	col.names <- names(patched)
	nchem <- dim(patched)[1]
	# positions 54..92 are hard-coded for the layout of the CHR rat matrix
	for(end.1 in col.names[54:92]) {
		end.2 <- str_replace_all(end.1,"Proliferative","Neoplastic")
		has.partner <- is.element(end.2,col.names)
		cat(end.1,":",end.2,has.partner,"\n")
		if(!has.partner) next
		for(j in 1:nchem) {
			neoplastic <- TOXREF.CHR.RAT[j,end.2]
			if(TOXREF.CHR.RAT[j,end.1]<neoplastic) {
				patched[j,end.1] <- neoplastic
			}
		}
	}
	outfile <- "ToxRefDB/ToxRefDB_matrix_CHR_rat_FIXED.txt"
	write.table(patched,file=outfile, row.names=F, append=FALSE, quote=F, sep = "\t")
}
