#--------------------------------------------------------------------------------------
#
# ToxCastDataPrep.R utilities for managing ToxCast data
#
# June 2014
# Richard Judson
#  
# US EPA
# Questions, comments to: judson.richard@epa.gov
#
#
#--------------------------------------------------------------------------------------
#library(grDevices)
#library(RColorBrewer)
library(RMySQL)
library(stringr)
library(reshape2)
library(data.table)
source("utils.R")
#--------------------------------------------------------------------------------------
#
# Compare the old and new AC50 files
#
#--------------------------------------------------------------------------------------
compAC50 <- function(to.file=TRUE,do.prep=FALSE) {
	# Compare the old and new AC50 / Emax result matrices assay by assay.
	#
	# Arguments:
	#   to.file  if TRUE the plots go to a PDF file; if FALSE they are drawn on the
	#            current device and browser() is entered after every assay.
	#   do.prep  if TRUE the input files are (re)read and published as the globals
	#            CHEMS, cytotox, assay.list, NEW, OLD, NEWE, OLDE and CYTOTOX.
	#            If FALSE, the globals OLD, NEW, OLDE, NEWE, CYTOTOX and CHEMS must
	#            already exist (e.g. from an earlier call with do.prep=TRUE).
	#
	# Side effects:
	#   * assigns the hit-call matrices OLD.hit and NEW.hit globally
	#   * writes one row per assay to ..._by_assay.txt (columns come from TxT(),
	#     which is expected to be defined in utils.R and to return $title / $sval)
	#   * writes one row per chemical x assay to ..._by_chemical.txt
	#   * changes par(mfrow, mar) on the active graphics device
	if(do.prep) {
		read.tsv <- function(path) read.table(path,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"",comment.char="")
		read.mat <- function(path) read.csv(path,header=TRUE,stringsAsFactors=FALSE,quote="\"",comment.char="")

		old.ac50 <- read.tsv("../oldnewComparison/ToxCast_ResultMatrix_E1K_AC50_2013_12_10.txt")
		new.ac50 <- read.mat("../input/fromPipeline/data/AllResults_AC50_Matrix_140828.csv")
		old.emax <- read.tsv("../oldnewComparison/ToxCast_ResultMatrix_E1K_Emax_2013_12_10.txt")
		new.emax <- read.mat("../input/fromPipeline/data/AllResults_max_med_Matrix_140828.csv")

		chems <- read.tsv("../input/ToxCast_GenericChemicals_2014_08_26.txt")
		rownames(chems) <- chems[,"CODE"]
		CHEMS <<- chems

		cyto <- read.tsv("../output/cytotox_dist.txt")
		rownames(cyto) <- cyto[,"CODE"]
		cytotox <<- cyto

		# The old files key chemicals by "CODE", the new ones by "code"
		rownames(old.ac50) <- old.ac50[,"CODE"]
		rownames(new.ac50) <- new.ac50[,"code"]
		rownames(old.emax) <- old.emax[,"CODE"]
		rownames(new.emax) <- new.emax[,"code"]

		# Keep only chemicals present in the old data, the new data and the cytotox table
		code.list.old <- rownames(old.ac50)
		code.list <- code.list.old
		code.list <- code.list[is.element(code.list,rownames(new.ac50))]
		code.list <- code.list[is.element(code.list,rownames(cyto))]
		code.list <- sort(unique(code.list))

		# Report how many old chemicals were dropped
		missing.codes <- code.list.old[!is.element(code.list.old,code.list)]
		print(dim(old.ac50[missing.codes,1:3]))

		# Assays are the columns shared by old and new, minus the identifier columns
		common.assays <- names(old.ac50)
		common.assays <- common.assays[is.element(common.assays,names(new.ac50))]
		common.assays <- common.assays[!is.element(common.assays,c("CODE","CASRN","ShortName"))]

		assay.list <<- common.assays
		NEW <<- new.ac50[code.list,common.assays]
		OLD <<- old.ac50[code.list,common.assays]

		NEWE <<- new.emax[code.list,common.assays]
		OLDE <<- old.emax[code.list,common.assays]
		CYTOTOX <<- cyto[code.list,]
	}
	cat("Dimensions of OLD: ",dim(OLD),"\n")
	cat("Dimensions of NEW: ",dim(NEW),"\n")

	# Turn an AC50 matrix into hit calls: NA stays NA, values >= 1000000 become 0,
	# remaining positive values become 1 (negative values end up as NA).
	hit.calls <- function(ac50) {
		hit <- ac50
		hit[is.na(ac50)] <- -1
		hit[ac50>=1000000] <- 0
		hit[hit>0] <- 1
		hit[hit<0] <- NA
		hit
	}
	OLD.hit <- hit.calls(OLD)
	NEW.hit <- hit.calls(NEW)
	OLD.hit <<- OLD.hit
	NEW.hit <<- NEW.hit

	plotfile <- "../oldnewComparison/ToxCast_fits_compare_old_new_2014_09_24.pdf"
	pdf.open <- FALSE
	if(to.file) {
		pdf(file=plotfile,width=7,height=10,pointsize=12,bg="white",paper="letter",pagecentre=TRUE)
		pdf.dev <- dev.cur()
		pdf.open <- TRUE
		# Safety net: make sure the PDF device is closed even if an error interrupts the loop
		on.exit(if(pdf.open) dev.off(pdf.dev),add=TRUE)
	}
	par(mfrow=c(3,2),mar=c(4,4,2,1))

	outfile <- "../oldnewComparison/ToxCast_fits_compare_old_new_2014_09_24_by_assay.txt"
	txt <- TxT(1,2,3,4)	# dummy call, only used to obtain the column titles
	cat(paste0("assay\t",txt$title,"\n"),file=outfile,append=FALSE)

	assays <- read.table("../input/AssayList.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"",comment.char="")
	rownames(assays) <- assays[,"Assay"]
	tech.list <- sort(unique(assays[,"Source"]))

	class.cols <- c("green","blue","gray","red")	# both, new only, old only, neither
	grid.at <- c(0.0001,0.001,0.01,0.1,1,10,100)
	ac50.range <- c(1e-4,1e4)
	log.rows <- list()	# per-assay tables, bound together once at the end

	for(tech in tech.list) {
		tech.assays <- assays[is.element(assays[,"Source"],tech),"Assay"]
		for(assay in tech.assays) {
			if(!(is.element(assay,names(OLD.hit)) && is.element(assay,names(NEW.hit)))) next

			# Chemicals with a usable hit call in both data sets
			hit.old <- OLD.hit[,assay]
			hit.new <- NEW.hit[,assay]
			keep <- !is.na(hit.old) & !is.na(hit.new)
			code.list <- rownames(OLD.hit)[keep]
			hit.old <- hit.old[keep]
			hit.new <- hit.new[keep]
			nchem <- length(code.list)

			# 2x2 agreement classes (hit calls are 0/1 at this point)
			masks <- list(
				hit.old==1 & hit.new==1,	# both
				hit.old==0 & hit.new==1,	# new only
				hit.old==1 & hit.new==0,	# old only
				hit.old==0 & hit.new==0)	# neither
			n.both <- sum(masks[[1]])
			n.new <- sum(masks[[2]])
			n.old <- sum(masks[[3]])
			n.neither <- sum(masks[[4]])

			txt <- TxT(n.both,n.new,n.old,n.neither)
			s <- paste0(assay,"\t",txt$sval,"\n")
			cat(s,file=outfile,append=TRUE)
			cat(s)
			flush.console()

			#---- AC50 panel ----
			# Index by code.list so the AC50 values line up with the hit-class masks
			ac50.old <- OLD[code.list,assay]
			ac50.new <- NEW[code.list,assay]
			# Values >= 1000000 are drawn (and logged) at 10000, the top of the plot range
			ac50.new[ac50.new>=1000000] <- 10000
			ac50.old[ac50.old>=1000000] <- 10000

			plot(ac50.new~ac50.old,log="xy",xlab="Old AC50",ylab="New AC50",cex.lab=1.2,cex.axis=1.2,xlim=c(1e-4,1e4),ylim=c(1e-4,1e4),main=paste(assay, "AC50"), type = "n")
			lines(ac50.range,ac50.range,lwd=2)
			# Decade grid lines; the line at 10 is emphasised
			for(g in grid.at) {
				if(g==10) {
					lines(c(g,g),ac50.range,col="black",lwd=2)
				} else {
					lines(c(g,g),ac50.range,col="gray")
				}
			}
			for(g in grid.at) {
				if(g==10) {
					lines(ac50.range,c(g,g),col="black",lwd=2)
				} else {
					lines(ac50.range,c(g,g),col="gray")
				}
			}

			for(k in 1:4) {
				ys <- ac50.new[masks[[k]]]
				xs <- ac50.old[masks[[k]]]
				points(ys~xs,pch=21,bg=class.cols[k],col=class.cols[k])
			}

			count.labels <- c(
				paste0("Both:     ",n.both),
				paste0("New only: ",n.new),
				paste0("Old only: ",n.old),
				paste0("Neither:  ",n.neither))
			y <- 5e2
			dy <- 0.2
			for(lab in count.labels) {
				text(1e-4,y,lab,pos=4)
				y <- y*dy
			}

			# R2 of log10(new) vs log10(old).
			# NOTE(review): because values were capped at 10000 above, the <6 cut never
			# removes anything, so inactives take part in the fit - confirm this is intended.
			xp <- log10(ac50.old)
			yp <- log10(ac50.new)
			ok <- is.finite(xp) & is.finite(yp) & xp<6 & yp<6
			xp <- xp[ok]
			yp <- yp[ok]
			if(length(xp)>2) {
				res <- lm(yp~xp)
				r2 <- summary(res)$r.squared
				text(1e-4,y,paste0("R2:       ",format(r2,digits=2)),pos=4)
			}

			#---- Emax panel ----
			emax.old <- OLDE[code.list,assay]
			emax.new <- NEWE[code.list,assay]

			# Only the old Emax is transformed for plotting; the raw value is what gets logged
			plot.old <- emax.old
			plot.new <- emax.new
			if(tech=="Attagene_cis" || tech=="Attagene_trans") plot.old <- log2(plot.old)
			if(tech=="BioSeek_up" || tech=="BioSeek_down") plot.old <- log10(plot.old)

			plot.max <- 150
			plot.min <- 0
			if(length(plot.new)>1 && length(plot.old)>1) {
				old.ok <- plot.old[!is.na(plot.old)]
				new.ok <- plot.new[!is.na(plot.new)]
				if(max(old.ok)< 10 || max(new.ok)< 10) plot.max <- max(c(old.ok,new.ok))
				if(min(old.ok)< -1 || min(new.ok)< -1) plot.min <- min(c(old.ok,new.ok))
			}
			if(is.na(plot.max)) plot.max <- 100
			if(plot.max == -Inf) plot.max <- 100
			if(plot.max == Inf) plot.max <- 100
			# Symmetric axes when negative values are present
			if(plot.min<0) plot.min <- - plot.max
			plot(plot.new~plot.old,xlab="Old Emax",ylab="New Emax",cex.lab=1.2,cex.axis=1.2,xlim=c(plot.min,plot.max),ylim=c(plot.min,plot.max),main=paste(assay," Emax"), type = "n")
			lines(c(plot.min,plot.max),c(plot.min,plot.max))

			if(plot.min<0) {
				lines(c(plot.max,plot.min),c(plot.min,plot.max))
				lines(c(0,0),c(plot.min,plot.max))
				lines(c(plot.min,plot.max),c(0,0))
			}

			# Drawn neither -> both so that concordant hits end up on top
			for(k in 4:1) {
				ys <- plot.new[masks[[k]]]
				xs <- plot.old[masks[[k]]]
				points(ys~xs,pch=21,col=class.cols[k])
			}

			#---- per-chemical log ----
			cytocut <- CYTOTOX[code.list,"cytotox.min.uM"]
			ctemp <- CHEMS[code.list,]

			# "HI" = AC50 below the cytotox cutoff, "LO" = at or above it; the string
			# "NA" is kept where the comparison is undefined (which() skips NA results)
			z.old <- rep("NA",nchem)
			z.old[which(ac50.old <  cytocut)] <- "HI"
			z.old[which(ac50.old >= cytocut)] <- "LO"
			z.new <- rep("NA",nchem)
			z.new[which(ac50.new <  cytocut)] <- "HI"
			z.new[which(ac50.new >= cytocut)] <- "LO"

			# Same colour coding as the plots
			color <- rep(NA_character_,nchem)
			color[masks[[4]]] <- "red"
			color[masks[[1]]] <- "green"
			color[masks[[3]]] <- "gray"
			color[masks[[2]]] <- "blue"

			log.rows[[length(log.rows)+1]] <- data.frame(
				assay=rep(assay,nchem),
				CODE=code.list,
				CASRN=ctemp[,"CASRN"],
				GSID=ctemp[,"dsstox_gsid"],
				Name=ctemp[,"short_name"],
				AC50_OLD=ac50.old,
				AC50_NEW=ac50.new,
				Emax_OLD=emax.old,
				Emax_NEW=emax.new,
				Hit_OLD=hit.old,
				Hit_NEW=hit.new,
				Color=color,
				ZCUT=cytocut,
				Z_OLD=z.old,
				Z_NEW=z.new,
				stringsAsFactors=FALSE)

			if(!to.file) browser()
		}
	}

	if(to.file) {
		dev.off(pdf.dev)
		pdf.open <- FALSE
	}
	# do.call(rbind, list()) is NULL, matching the previous "nothing logged" result
	log.data <- do.call(rbind,log.rows)
	logfile <- "../oldnewComparison/ToxCast_fits_compare_old_new_2014_09_24_by_chemical.txt"
	write.table(log.data,file=logfile, row.names=FALSE, append=FALSE, quote=FALSE, sep = "\t")
}
	
#--------------------------------------------------------------------------------------
#
# Summarize by assay source
#
#--------------------------------------------------------------------------------------
summarize <- function(to.file=TRUE) {
	# Plot, for each assay source, how the old and new hit calls overlap.
	#
	# Reads the by-assay statistics file written by compAC50() and the assay list,
	# then for every value of the "Source" column draws three panels:
	#   * histogram of (new+both)/(new+old+both)
	#   * histogram of (old+both)/(new+old+both)
	#   * scatter plot of the two fractions against each other
	# where both/old/new are taken from the TP/FN/FP columns. Assays with fewer
	# than 10 hits in total (new+old+both) are left out.
	#
	# Arguments:
	#   to.file  if TRUE the plots go to comp_summary.pdf; if FALSE they are drawn
	#            on the current device and browser() is entered after each source.
	#
	# Note: the function always ends in browser() (kept from the original
	# workflow) and changes par(mfrow, mar) on the active graphics device.
	assay.stats <- read.table("../oldnewComparison/ToxCast_fits_compare_old_new_2014_09_24_by_assay.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"",comment.char="")
	rownames(assay.stats) <- assay.stats[,"assay"]
	assays <- read.table("../input/AssayList.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"",comment.char="")
	rownames(assays) <- assays[,"Assay"]

	tech.list <- sort(unique(assays[,"Source"]))

	plotfile <- "../oldnewComparison/comp_summary.pdf"
	pdf.open <- FALSE
	if(to.file) {
		pdf(file=plotfile,width=7,height=10,pointsize=12,bg="white",paper="letter",pagecentre=TRUE)
		pdf.dev <- dev.cur()
		pdf.open <- TRUE
		# Safety net: close the PDF device even if plotting fails part-way through
		on.exit(if(pdf.open) dev.off(pdf.dev),add=TRUE)
	}
	par(mfrow=c(4,3),mar=c(4,4,2,1))

	breaks <- seq(0,1,by=0.1)
	for(tech in tech.list) {
		tech.assays <- assays[is.element(assays[,"Source"],tech),"Assay"]
		# Assays without a statistics row come back as all-NA rows and drop out of the plots
		tech.stats <- assay.stats[tech.assays,]
		n.both <- tech.stats[,"TP"]
		n.old <- tech.stats[,"FN"]
		n.new <- tech.stats[,"FP"]
		n.any <- n.new+n.old+n.both

		enough <- n.any>=10
		new.ratio <- ((n.new+n.both) / (n.new+n.both+n.old))[enough]
		old.ratio <- ((n.old+n.both) / (n.new+n.both+n.old))[enough]

		hist(new.ratio,main=paste(tech,": (new+both)/(new+old+both)"),cex.lab=1,cex.axis=1,breaks=breaks,xlab="fraction")
		hist(old.ratio,main=paste(tech,": (old+both)/(new+old+both)"),cex.lab=1,cex.axis=1,breaks=breaks,xlab="fraction")
		plot(new.ratio~old.ratio, xlim=c(0,1),ylim=c(0,1),xlab="(old+both)/(new+old+both)",ylab="(new+both)/(new+old+both)",cex.lab=1.0,cex.axis=1.0,main=tech)
		lines(c(0,1),c(0,1))

		if(!to.file) browser()
	}

	if(to.file) {
		dev.off(pdf.dev)
		pdf.open <- FALSE
	}
	browser()
}
#--------------------------------------------------------------------------------------
#
# Analyze the chemical-assay pairs where the old and new hit calls differ
#
#--------------------------------------------------------------------------------------
analyze.diffs <- function(do.prep=FALSE) {
	# Analyze the chemical x assay pairs on which the old and new hit calls differ.
	#
	# Prints to the console:
	#   * counts and fractions of pairs that are negative in both, positive in
	#     both, hit only in the old data, or hit only in the new data
	#   * for the old-only and new-only hits: how many have Z == "LO" and how
	#     many have an AC50 above 10
	#   * per assay source: mean Emax of concordant hits vs old-only / new-only
	#     hits (Emax capped at 100) with one-sided t-test p-values
	#
	# Arguments:
	#   do.prep  if TRUE, read the by-chemical file written by compAC50() into the
	#            global STATS and the model matrix into the global MODL. If FALSE,
	#            STATS must already exist.
	#
	# Note: the function always ends in browser() (kept from the original workflow).
	if(do.prep) {
		file <- "../oldnewComparison/ToxCast_fits_compare_old_new_2014_09_24_by_chemical.txt"
		STATS <<- read.table(file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"",comment.char="")

		file <- "../input/fromPipeline/data/AllResults_modl_Matrix_140828.csv"
		MODL <<- read.csv(file,header=TRUE,stringsAsFactors=FALSE,quote="\"",comment.char="")
	}
	assays <- read.table("../input/AssayList.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"",comment.char="")
	rownames(assays) <- assays[,"Assay"]

	ntot <- dim(STATS)[1]
	cat("Original size: ",ntot,"\n")

	# flag: 0 = neither, 1 = exactly one of old/new, 2 = both
	flag <- STATS[,"Hit_OLD"]+STATS[,"Hit_NEW"]
	flag.old.only <- STATS[,"Hit_OLD"]
	flag.old.only[STATS[,"Hit_NEW"]==1] <- 0
	flag.new.only <- STATS[,"Hit_NEW"]
	flag.new.only[STATS[,"Hit_OLD"]==1] <- 0

	neg.neg <- length(flag[flag==0])
	pos.pos <- length(flag[flag==2])
	pos.neg <- sum(flag.old.only)
	neg.pos <- sum(flag.new.only)

	cat("Both negative: ",neg.neg,"\t",format(neg.neg / ntot,digits=2),"\n")
	cat("Both positive: ",pos.pos,"\t",format(pos.pos / ntot,digits=2),"\n")
	cat("OLD only:      ",pos.neg,"\t",format(pos.neg / ntot,digits=2),"\n")
	cat("NEW only:      ",neg.pos,"\t",format(neg.pos / ntot,digits=2),"\n")

	stemp <- STATS[flag==1,]
	stemp.both <- STATS[flag==2,]
	flush.console()

	stemp.old <- stemp[stemp[,"Hit_OLD"]==1,]
	stemp.new <- stemp[stemp[,"Hit_NEW"]==1,]

	source.list <- sort(unique(assays[,"Source"]))

	cat("===============================\nOLD only\n===============================\n")
	ntot.old <- dim(stemp.old)[1]
	nlowz <- dim(stemp.old[is.element(stemp.old[,"Z_OLD"],"LO"),])[1]
	ngt10 <- dim(stemp.old[stemp.old[,"AC50_OLD"]>10,])[1]
	cat("Total: ",ntot.old,"\n")
	cat("low Z: ",nlowz,"\t",format(nlowz/ntot.old,digits=2),"\n")
	cat(">10uM: ",ngt10,"\t",format(ngt10/ntot.old,digits=2),"\n")

	cat("===============================\nNEW only\n===============================\n")
	ntot.new <- dim(stemp.new)[1]
	# The new-only statistics must come from the new-only rows (this used to read stemp.old)
	nlowz <- dim(stemp.new[is.element(stemp.new[,"Z_NEW"],"LO"),])[1]
	ngt10 <- dim(stemp.new[stemp.new[,"AC50_NEW"]>10,])[1]
	cat("Total: ",ntot.new,"\n")
	cat("low Z: ",nlowz,"\t",format(nlowz/ntot.new,digits=2),"\n")
	cat(">10uM: ",ngt10,"\t",format(ngt10/ntot.new,digits=2),"\n")

	cat("\n\nSource\tNall\tNold\tNnew\tmean(Emax-all)\tmean(Emax-old only)\tmean(Emax-new only)\tp(old-all)\tp(new-all)\n",sep="")
	for(asource in source.list) {
		source.assays <- assays[is.element(assays[,"Source"],asource),"Assay"]
		in.both <- is.element(stemp.both[,"assay"],source.assays)
		# Reference distribution: old and new Emax of the hits both data sets agree on
		etemp.all <- c(stemp.both[in.both,"Emax_OLD"],stemp.both[in.both,"Emax_NEW"])
		etemp.old <- stemp.old[is.element(stemp.old[,"assay"],source.assays),"Emax_OLD"]
		etemp.new <- stemp.new[is.element(stemp.new[,"assay"],source.assays),"Emax_NEW"]

		# Cap Emax at 100 before comparing the distributions
		etemp.old[etemp.old>100] <- 100
		etemp.new[etemp.new>100] <- 100
		etemp.all[etemp.all>100] <- 100

		if(length(etemp.old)>5 && length(etemp.new)>5 && length(etemp.all)>5) {
			# One-sided test: are discordant hits weaker (lower Emax) than concordant ones?
			t.old <- t.test(x=etemp.old,y=etemp.all,alternative="less")
			t.new <- t.test(x=etemp.new,y=etemp.all,alternative="less")
			cat(asource,"\t",length(etemp.all),"\t",length(etemp.old),"\t",length(etemp.new),"\t",format(mean(etemp.all),digits=2),"\t",format(mean(etemp.old),digits=2),"\t",format(mean(etemp.new),digits=2),"\t",format(t.old$p.value,digits=2),"\t",format(t.new$p.value,digits=2),"\n",sep="")
		}
	}

	browser()
}