#--------------------------------------------------------------------------------------
#
# toxcast_driver_v12.R - code to analyze the ToxCast data
#
# November 2014
# Richard Judson
#
# US EPA
# Questions, comments to: judson.richard@epa.gov, 919-541-3085
#
#--------------------------------------------------------------------------------------
options(java.parameters = "-Xmx1000m")
library(grDevices)
library(RColorBrewer)
library(stringr)
library(mixdist)
library(class)
library(lattice)
library(openxlsx)
library(DBI)
library(RMySQL)

source("utils.R")
source("diagnostics_v04.R")
source("genescore_v01.R")
source("pathway_v01.R")

# Database connection settings, stored as globals.
# NOTE(review): presumably consumed by run.query() in utils.R - confirm.
# NOTE(review): SERVER is assigned twice; the second assignment ("au.epa.gov")
# is the one in effect after this file is sourced.
SERVER <<- "134.67.216.114"
#SERVER <<- "134.67.216.45"
SERVER <<- "au.epa.gov"
DB <<- "dev_physchemdb"
USER <<- "rjudson"
# NOTE(review): the credential is kept in plain text in the source file.
PASSWORD <<- "password"

# Directory holding the pipeline CSV exports that prep.matrices() and
# prep.assay.defs() read.
VARMATDIR <<- "../input/varmats_151020_internal/"
# Date tag used as the default `suffix` in input and output file names.
VARMATDATE <<- "151020"
# Assay source groups that are retained; assays whose source_group is not in
# this list are filtered out of ASSAY.INFO and of the data matrices.
SOURCE.GROUPS <<- c("ACEA","ACEA_Cytotoxicity","APR_Cytotoxicity","APR_dn","APR_up","ATG_CIS","ATG_TRANS","BSK_Cytotoxicity","BSK_down","BSK_up","NVS_ADME","NVS_ADME_Activator","NVS_ENZ","NVS_ENZ_Activator","NVS_GPCR","NVS_IC","NVS_MP","NVS_NR","NVS_TR","OT","Tox21_BLA_Agonist","Tox21_BLA_Antagonist","Tox21_Cytotoxicity","Tox21_LUC_Agonist","Tox21_LUC_Antagonist")
#--------------------------------------------------------------------------------------
#
# Run all analyses
#
#--------------------------------------------------------------------------------------
run.all <- function(suffix=VARMATDATE,do.genescore=FALSE,do.diagnostics=FALSE) {
  # Run the whole workflow: load and scale the data, build the chemical
  # dictionary, save the R state, export the supplemental files and print the
  # summary statistics.
  #
  # suffix         - date tag of the input files to load
  # do.genescore   - if TRUE, also run the gene-score and reference-chemical steps
  # do.diagnostics - if TRUE, also run the diagnostics (with do.prep=TRUE)
  print.current.function()
  load.and.scale(suffix)
  chem.dictionary()
  # forward the suffix so the saved state carries the same tag as the loaded data
  save.state(suffix)
  export.files()
  print.summary.stats()
  if(do.diagnostics) run.diagnostics(do.prep=TRUE)
  if(do.genescore) {
    genescore.driver()
    ref.chems()
    ref.chems.summary()
  }
}
#--------------------------------------------------------------------------------------
#
# Save the state
#
#--------------------------------------------------------------------------------------
save.state <- function(suffix=VARMATDATE) {
  # Save the global data objects to ../input/Robjs_<suffix>.RData.
  #
  # suffix - date tag used in the file name (previously ignored in favour of
  #          VARMATDATE, which made the argument a no-op)
  #
  # NOTE(review): CYTOTOX is not in the saved set although
  # print.summary.stats() reads it - confirm whether it should be added.
  print.current.function()
  object.names <- c("ALL.FLAGS","ASSAY.INFO","ASSAY.LIST","CHEMS","CODE.LIST","GENE.FAMILY","MAT.AC10","MAT.AC50","MAT.AC50_loss","MAT.ACB","MAT.ACC","MAT.Emax","MAT.hitcall","MAT.logAC50","MAT.logAC50_loss","MAT.max_conc","MAT.min_conc","MAT.model","MAT.T","MAT.T.SCALED","MAT.tested","MAT.W","MAT.Z","MAT.Z.NORM","NASSAY","NCHEM","CHEM.DICT")
  file <- paste("../input/Robjs_",suffix,".RData",sep="")
  save(list=object.names,file=file)
}
#--------------------------------------------------------------------------------------
#
# Load the state
#
#--------------------------------------------------------------------------------------
load.state <- function(suffix=VARMATDATE) {
  # Load the objects stored in ../input/Robjs_<suffix>.RData into the global
  # environment.
  #
  # suffix - date tag used in the file name (previously ignored in favour of
  #          VARMATDATE, which made the argument a no-op)
  print.current.function()
  file <- paste("../input/Robjs_",suffix,".RData",sep="")
  load(file=file,envir=.GlobalEnv)
}
#--------------------------------------------------------------------------------------
#
# load all of the data and build scaled matrices
#
# CHEMS - matrix of all chemical information
# CODE.LIST - list of unique chemical codes, rownames of all matrices
# NCHEM - number of chemicals
#
# ASSAY.INFO - assay matrix
# ASSAY.LIST - vector of unique assay names - colnames of all matrices
# NASSAY - number of assays
#
# CYTOTOX - matrix of cytotox parameters for each chemical
#
# Matrices (NCHEM x NASSAY)
#
# MAT.AC50 (uM)
# MAT.AC50_loss
# MAT.ACB MAT.ACC  MAT.AC10
# MAT.hitcall
# MAT.logAC50 MAT.logAC50_loss
# MAT.min_conc  MAT.max_conc
# MAT.model
# MAT.Emax
# MAT.T
# MAT.T.SCALED (T scaled so that 95%-ile of hits have T=100%)
# MAT.tested
# MAT.W
# MAT.Z
# MAT.Z.NORM (shifted so that the first peak of the Z distribution is at ~ 0)
#
# Each time you use a matrix, you need to make sure that you are appropriately checking
# the tested and hitcall matrix
#
#--------------------------------------------------------------------------------------
load.and.scale <- function(suffix=VARMATDATE) {
  # Load the assay definitions and data matrices for `suffix`, then derive the
  # Z-score, shifted Z-score and scaled-top matrices. Results are left in
  # global variables by the functions called here.
  #
  # suffix - date tag of the input files; now forwarded to prep.assay.defs()
  #          as well, so assay definitions and matrices come from the same export
  print.current.function()
  prep.assay.defs(suffix)
  prep.matrices(suffix)
  z.calc()
  # writes ../output/source_Zshifts_original.xlsx, which shift.zscore() reads
  source.Zdist(to.file=TRUE,zmode="original")
  shift.zscore()
  scale.top.by.assay()
}
#--------------------------------------------------------------------------------------
#
# Create the full input AC50, T,B,Emax,... files, one row per chemical
#
#--------------------------------------------------------------------------------------
prep.matrices <- function(suffix=VARMATDATE) {
  # Read the chemical table, the cytotoxicity table and all chemical x assay
  # matrices for `suffix` from VARMATDIR, clean them and publish them as
  # globals.
  #
  # Globals read:    VARMATDIR, ASSAY.INFO, SOURCE.GROUPS
  # Globals written: PIPELINE.SAMPLES, PIPELINE.CHEMS, ALL.FLAGS, OLD.CHEMS,
  #                  CHEMS, CODE.LIST, NCHEM, CYTOTOX, MAT.* matrices,
  #                  ASSAY.LIST, NASSAY, GENE.FAMILY
  # Files written:   ../input/ToxCast_Chems_Master_<suffix>.xlsx
  print.current.function()
  cat("==========================================================================\n")
  cat("Prepare the matrices ...\n")
  cat("==========================================================================\n")
  flush.console()

  # read <VARMATDIR><stem><suffix>.csv; extra arguments go to read.csv
  read.pipeline.csv <- function(stem,...) {
    path <- paste(VARMATDIR,stem,suffix,".csv",sep="")
    read.csv(path,header=TRUE,stringsAsFactors=FALSE,quote="\"",comment.char="",...)
  }
  # remove embedded double quotes from names
  strip.quotes <- function(x) str_replace_all(x,"\"","")
  # replace NA cells of a data frame by `value`
  fill.na <- function(m,value) {
    m[is.na(m)] <- value
    m
  }
  # set the cells selected by the logical matrix `where` to `value`
  set.where <- function(m,where,value) {
    m[where] <- value
    m
  }

  #
  # chemicals
  #
  cat("Read in chemical data ...\n")
  flush.console()
  samples <- read.pipeline.csv("Chemical_Summary_")
  PIPELINE.SAMPLES <<- samples

  # one row per unique code, taking the first sample row of each code
  code.list <- sort(unique(samples[,"code"]))
  first.row <- match(code.list,samples[,"code"])
  chems <- data.frame(CODE=code.list,
                      CASRN=samples[first.row,"casn"],
                      Name=samples[first.row,"chnm"],
                      DSSTox_GSID=paste("DSSTox_",samples[first.row,"chid"],sep=""),
                      stringsAsFactors=FALSE)
  rownames(chems) <- chems[,"CODE"]
  PIPELINE.CHEMS <<- chems
  cat("Dimension of PIPELINE.CHEMS: ",dim(chems),"\n")
  flush.console()

  flags <- read.pipeline.csv("AllResults_flags_")
  flags[,"chnm"] <- strip.quotes(flags[,"chnm"])
  ALL.FLAGS <<- flags

  cat("Read in old chemical data ...\n")
  flush.console()
  old.chems <- read.xlsx("../input/ToxCast_GenericChemicals_2014_11_24.xlsx")
  rownames(old.chems) <- old.chems[,"CODE"]
  OLD.CHEMS <<- old.chems

  # create the unified chemicals set: pipeline chemicals annotated from OLD.CHEMS
  annot.cols <- c("target_gene","toxcast_reference_target_gene","use_category","use_super_category","structure_category","structure_super_category","Phase_I","Phase_II","E1K","Tox21","Use.Original","Use.Original.URL")
  unified <- chems
  old.row <- match(unified[,"CODE"],old.chems[,"CODE"])
  for(col in annot.cols) unified[,col] <- old.chems[old.row,col]

  # chemicals absent from OLD.CHEMS get these defaults; the other annotation
  # columns stay NA for them
  unknown <- is.na(old.row)
  if(any(unknown)) {
    defaults <- list(use_category="unknown",use_super_category="Other",structure_category="unknown",Phase_I=0,Phase_II=0,E1K=0,Tox21=0)
    for(col in names(defaults)) unified[unknown,col] <- defaults[[col]]
  }
  rownames(unified) <- unified[,"CODE"]
  NCHEM <<- nrow(unified)
  unified[,"Name"] <- strip.quotes(unified[,"Name"])

  # keep Phase I / Phase II chemicals only; which() drops rows whose phase flags
  # are NA instead of turning them into all-NA rows
  phase.count <- unified[,"Phase_I"] + unified[,"Phase_II"]
  chems.kept <- unified[which(phase.count>0),]
  CHEMS <<- chems.kept
  CODE.LIST <<- chems.kept[,"CODE"]

  outfile <- paste("../input/ToxCast_Chems_Master_",suffix,".xlsx",sep="")
  write.xlsx(chems.kept,file=outfile)
  cat("CHEMS read in\n")
  flush.console()

  #
  # assays
  #
  cat("Read in assay data ...\n")

  cyto <- read.pipeline.csv("cyto_dist_Matrix_")
  cyto <- cyto[,2:11]
  names(cyto) <- c("CASRN","Name","CODE","cytotox_median_raw","cytotox_mad","nhit","global_mad","cytotox_median_log", "cytotox_median_um","cytotox_lower_bound_um")
  rownames(cyto) <- cyto[,"CODE"]
  cyto[,"Name"] <- strip.quotes(cyto[,"Name"])
  CYTOTOX <<- cyto

  # read <stem>_Matrix_<suffix>.csv with chemical codes as row names
  read.varmat <- function(stem) read.pipeline.csv(paste(stem,"_Matrix_",sep=""),row.names=1)

  # the gain-AC50 matrix defines the chemical (row) and assay (column) sets
  ga <- read.varmat("modl_ga")
  code.list <- sort(row.names(ga))
  code.list <- code.list[is.element(code.list,chems.kept[,"CODE"])]
  assay.list <- sort(names(ga))
  allowed.assays <- ASSAY.INFO[is.element(ASSAY.INFO[,"source_group"],SOURCE.GROUPS),"assay"]
  assay.list <- assay.list[is.element(assay.list,allowed.assays)]
  # drop=FALSE keeps a data frame even when a single assay or chemical remains
  subset.varmat <- function(mat) mat[code.list,assay.list,drop=FALSE]

  temp.log_ac50 <- subset.varmat(ga)
  temp.tested <- subset.varmat(read.varmat("tested"))
  temp.hit <- subset.varmat(read.varmat("hitc"))
  temp.log_ac10 <- subset.varmat(read.varmat("modl_ac10"))
  temp.log_acc <- subset.varmat(read.varmat("modl_acc"))
  temp.log_acb <- subset.varmat(read.varmat("modl_acb"))
  temp.log_loss_ac50 <- subset.varmat(read.varmat("modl_la"))
  temp.emax <- subset.varmat(read.varmat("max_med"))
  temp.w <- subset.varmat(read.varmat("modl_gw"))
  temp.t <- subset.varmat(read.varmat("modl_tp"))
  temp.modl <- subset.varmat(read.varmat("modl"))
  temp.z <- subset.varmat(read.varmat("zscore"))
  temp.log_cmin <- subset.varmat(read.varmat("logc_min"))
  temp.log_cmax <- subset.varmat(read.varmat("logc_max"))
  cat("all assay files read in\n")
  flush.console()

  # missing potencies become log10 = 6 (i.e. 1e6 after exponentiation);
  # every other missing value becomes 0
  temp.log_ac10 <- fill.na(temp.log_ac10,6)
  temp.log_ac50 <- fill.na(temp.log_ac50,6)
  temp.log_loss_ac50 <- fill.na(temp.log_loss_ac50,6)
  temp.log_acc <- fill.na(temp.log_acc,6)
  temp.log_acb <- fill.na(temp.log_acb,6)
  temp.w <- fill.na(temp.w,0)
  temp.emax <- fill.na(temp.emax,0)
  temp.t <- fill.na(temp.t,0)
  temp.z <- fill.na(temp.z,0)
  temp.modl <- fill.na(temp.modl,0)
  temp.log_cmax <- fill.na(temp.log_cmax,0)
  temp.log_cmin <- fill.na(temp.log_cmin,0)
  temp.tested <- fill.na(temp.tested,0)
  temp.hit <- fill.na(temp.hit,0)
  cat("NA fixed\n")
  flush.console()

  # cells without a positive hit call get the inactive placeholder values
  no.hit <- temp.hit<=0
  temp.log_ac10 <- set.where(temp.log_ac10,no.hit,6)
  temp.log_ac50 <- set.where(temp.log_ac50,no.hit,6)
  temp.log_loss_ac50 <- set.where(temp.log_loss_ac50,no.hit,6)
  temp.log_acc <- set.where(temp.log_acc,no.hit,6)
  temp.log_acb <- set.where(temp.log_acb,no.hit,6)
  temp.w <- set.where(temp.w,no.hit,0)
  temp.t <- set.where(temp.t,no.hit,0)
  temp.z <- set.where(temp.z,no.hit,NA)
  cat("hit matrix applied\n")
  flush.console()

  # back-transform from log10
  temp.ac10 <- 10**(temp.log_ac10)
  temp.ac50 <- 10**(temp.log_ac50)
  temp.loss_ac50 <- 10**(temp.log_loss_ac50)
  temp.acc <- 10**(temp.log_acc)
  temp.acb <- 10**(temp.log_acb)
  temp.cmax <- 10**(temp.log_cmax)
  temp.cmin <- 10**(temp.log_cmin)
  cat("exponentiation\n")
  flush.console()

  # untested chemical/assay pairs are NA in every published matrix except
  # the tested matrix itself
  untested <- temp.tested<=0
  temp.ac10 <- set.where(temp.ac10,untested,NA)
  temp.ac50 <- set.where(temp.ac50,untested,NA)
  temp.acc <- set.where(temp.acc,untested,NA)
  temp.acb <- set.where(temp.acb,untested,NA)
  temp.loss_ac50 <- set.where(temp.loss_ac50,untested,NA)
  temp.emax <- set.where(temp.emax,untested,NA)
  temp.w <- set.where(temp.w,untested,NA)
  temp.t <- set.where(temp.t,untested,NA)
  temp.modl <- set.where(temp.modl,untested,NA)
  temp.z <- set.where(temp.z,untested,NA)
  temp.hit <- set.where(temp.hit,untested,NA)
  temp.cmax <- set.where(temp.cmax,untested,NA)
  temp.cmin <- set.where(temp.cmin,untested,NA)
  temp.log_loss_ac50 <- set.where(temp.log_loss_ac50,untested,NA)
  temp.log_ac50 <- set.where(temp.log_ac50,untested,NA)
  cat("test matrix applied\n")
  flush.console()

  # publish the matrices and report their dimensions
  report <- function(label,m) {
    cat(label,dim(m),"\n")
    flush.console()
  }
  MAT.AC50 <<- temp.ac50
  MAT.logAC50 <<- temp.log_ac50
  report("AC50: ",temp.ac50)

  MAT.AC50_loss <<- temp.loss_ac50
  MAT.logAC50_loss <<- temp.log_loss_ac50
  report("AC50_loss: ",temp.loss_ac50)

  MAT.Emax <<- temp.emax
  report("Emax: ",temp.emax)

  MAT.model <<- temp.modl
  report("model: ",temp.modl)

  MAT.hitcall <<- temp.hit
  report("hitcall: ",temp.hit)

  MAT.T <<- temp.t
  report("T: ",temp.t)

  MAT.W <<- temp.w
  report("W: ",temp.w)

  MAT.Z <<- temp.z
  report("Z: ",temp.z)

  MAT.min_conc <<- temp.cmin
  report("min_conc: ",temp.cmin)

  MAT.max_conc <<- temp.cmax
  report("max_conc: ",temp.cmax)

  MAT.AC10 <<- temp.ac10
  report("AC10: ",temp.ac10)

  MAT.ACC <<- temp.acc
  report("ACC: ",temp.acc)

  MAT.ACB <<- temp.acb
  report("ACB: ",temp.acb)

  MAT.tested <<- temp.tested
  report("tested: ",temp.tested)

  # restrict the chemical table to the chemicals present in the matrices
  final.codes <- rownames(temp.ac50)
  CODE.LIST <<- final.codes
  NCHEM <<- length(final.codes)
  CHEMS <<- chems.kept[final.codes,]

  final.assays <- colnames(temp.ac50)
  ASSAY.LIST <<- final.assays
  NASSAY <<- length(final.assays)

  GENE.FAMILY <<- read.xlsx("../input/gene_family.xlsx")
}
#--------------------------------------------------------------------------------------
#
# reload the assay definitions
#
#--------------------------------------------------------------------------------------
prep.assay.defs <- function(suffix=VARMATDATE) {
  # Build ASSAY.INFO: the pipeline assay summary for `suffix`, prefixed with the
  # columns assay, source_group, biological_process and intended_target taken
  # from the manually curated assay workbook. Only assays whose source_group is
  # in SOURCE.GROUPS are kept.
  #
  # The earlier version named column 3 twice, which left a stray copy of the
  # first summary column (named "temp.1[, 1]") in the table and appended
  # biological_process as the last column; the four leading columns are now
  # exactly the four named above.
  #
  # Globals read:    VARMATDIR, SOURCE.GROUPS
  # Globals written: ASSAY.INFO
  print.current.function()
  file <- paste(VARMATDIR,"Assay_Summary_",suffix,".csv",sep="")
  summary.info <- read.csv(file,header=TRUE,stringsAsFactors=FALSE,quote="\"",comment.char="")
  manual.info <- read.xlsx("../input/assay_manual_info_20160120.xlsx")

  rownames(summary.info) <- summary.info[,"assay_component_endpoint_name"]
  rownames(manual.info) <- manual.info[,"assay"]

  # row of the manual table for each assay, NA when the assay is not annotated
  assay <- summary.info[,"assay_component_endpoint_name"]
  manual.row <- match(assay,manual.info[,"assay"])

  # list the assays that have no manual annotation, one per line
  unannotated <- assay[is.na(manual.row)]
  if(length(unannotated)>0) cat(paste(unannotated,"\n",sep=""),sep="")

  info <- data.frame(assay=assay,
                     source_group=manual.info[manual.row,"source_group"],
                     biological_process=manual.info[manual.row,"biological_process"],
                     intended_target=manual.info[manual.row,"intended_target"],
                     summary.info,
                     stringsAsFactors=FALSE,check.names=FALSE)
  rownames(info) <- assay

  info <- info[is.element(info[,"source_group"],SOURCE.GROUPS),]
  ASSAY.INFO <<- info
}
#--------------------------------------------------------------------------------------
#
# recalculate the Z matrix
#
#--------------------------------------------------------------------------------------
z.calc <- function() {
  # Recompute the per-chemical cytotoxicity summary (CYTOTOX) from the
  # cytotoxicity / proliferation-decrease assays and rebuild MAT.Z as
  #   Z = -(logAC50 - cytotox_median_log) / global_mad
  # for every chemical/assay hit.
  #
  # Globals read:    ASSAY.INFO, MAT.logAC50, MAT.hitcall, CYTOTOX, CODE.LIST
  # Globals written: CYTOTOX, MAT.Z
  print.current.function()
  cytotox.processes <- c("cytotoxicity SRB","cytotoxicity BLA","proliferation decrease")
  cytotox.assay.set <- ASSAY.INFO[is.element(ASSAY.INFO[,"biological_process"],cytotox.processes),"assay"]
  # ignore annotated assays that have no column in the data matrices
  cytotox.assay.set <- intersect(cytotox.assay.set,colnames(MAT.logAC50))

  # drop=FALSE keeps two dimensions even when a single assay is selected
  atemp <- MAT.logAC50[,cytotox.assay.set,drop=FALSE]
  htemp <- MAT.hitcall[,cytotox.assay.set,drop=FALSE]
  atemp[is.na(atemp)] <- 6
  htemp[is.na(htemp)] <- 0
  # NOTE(review): negative hit calls, if present, lower this count - confirm
  # whether they should be clipped to 0 first
  rs <- rowSums(htemp)

  # defaults for chemicals with fewer than 2 cytotox hits: median 1000 uM (log10 = 3)
  cytotox <- CYTOTOX[CODE.LIST,]
  cytotox[,"cytotox_median_raw"] <- NA
  cytotox[,"cytotox_mad"] <- NA
  cytotox[,"global_mad"] <- NA
  cytotox[,"cytotox_median_log"] <- 3
  cytotox[,"cytotox_median_um"] <- 1000
  cytotox[,"cytotox_lower_bound_um"] <- NA
  cytotox[,"nhit"] <- rs

  # rows of cytotox are addressed by position, i.e. CODE.LIST is assumed to be
  # in the same order as the matrix rows
  logac50 <- as.matrix(atemp)
  for(i in which(rs>1)) {
    active <- logac50[i,]
    active <- active[active<6]
    center <- median(active)
    cytotox[i,"cytotox_median_raw"] <- center
    cytotox[i,"cytotox_median_log"] <- center
    cytotox[i,"cytotox_mad"] <- mad(active)
    cytotox[i,"cytotox_median_um"] <- 10**center
  }
  mad.global <- median(cytotox[,"cytotox_mad"],na.rm=TRUE)
  cytotox[,"global_mad"] <- mad.global
  cytotox[,"cytotox_lower_bound_um"] <- 10**(cytotox[,"cytotox_median_log"]-3*mad.global)
  CYTOTOX <<- cytotox

  zmat <- MAT.hitcall
  zmat[] <- NA
  for(i in seq_len(ncol(MAT.hitcall))) {
    hits <- MAT.hitcall[,i]
    zcol <- -(MAT.logAC50[,i]-cytotox[,"cytotox_median_log"])/mad.global
    # Z is defined only where the hit call is present and non-zero
    zcol[is.na(hits) | hits==0] <- NA
    zmat[,i] <- zcol
  }
  MAT.Z <<- zmat
}
#--------------------------------------------------------------------------------------
#
# Assay source summary table
# zmode=original or norm
# QC=OK
#--------------------------------------------------------------------------------------
source.Zdist <- function(to.file=F,zmode="norm") {
  # For every assay source group with more than one assay, plot the Z-score
  # distribution and the AC50 distribution of its hits, fit a two-component
  # normal mixture to the Z histogram and record the two fitted centers.
  #
  # to.file - if TRUE plot to ../plots/source_Zdist_<zmode>.pdf, otherwise plot
  #           to the current device and stop in browser() after each source
  # zmode   - "original" uses MAT.Z, "norm" uses MAT.Z.NORM
  #
  # Globals read:  ASSAY.INFO, MAT.Z, MAT.Z.NORM, MAT.hitcall, MAT.tested, MAT.AC50
  # Files written: ../output/source_Zshifts_<zmode>.xlsx
  #                (columns source_group, Center1, Center2)
  print.current.function()
  if(!is.element(zmode,c("original","norm"))) stop("zmode must be \"original\" or \"norm\"")
  if(to.file) {
    fname <- paste("../plots/source_Zdist_",zmode,".pdf",sep="")
    pdf(file=fname,width=7,height=10,pointsize=12,bg="white",paper="letter",pagecentre=TRUE)
    # close the device even if a later step fails, so the PDF is not left open
    on.exit(graphics.off(),add=TRUE)
  }
  par(mfrow=c(4,2),mar=c(4,4,2,2))

  source.list <- sort(uniquify(ASSAY.INFO[,"source_group"]))
  nsource <- length(source.list)

  if(zmode=="original") z <- MAT.Z
  else z <- MAT.Z.NORM
  z[MAT.hitcall==0] <- NA
  z[MAT.tested==0] <- NA
  z[MAT.Z==0] <- NA

  # only chemicals tested in at least 200 assays contribute
  well.tested <- rowSums(MAT.tested)>=200

  resmat <- as.data.frame(matrix(nrow=nsource,ncol=3))
  names(resmat) <- c("source_group","Center1","Center2")
  for(i in seq_len(nsource)) {
    src <- source.list[i]
    assay.list <- ASSAY.INFO[is.element(ASSAY.INFO[,"source_group"],src),"assay"]
    assay.list <- assay.list[is.element(assay.list,colnames(MAT.AC50))]
    cat(src," : ",length(assay.list),"\n")
    flush.console()
    # sources with a single assay are skipped; their resmat row stays NA
    if(length(assay.list)<=1) next

    print("fix conversion")
    atemp <- as.numeric(as.matrix(MAT.AC50[well.tested,assay.list]))
    ztemp <- as.numeric(as.matrix(z[well.tested,assay.list]))

    zmin <- -5
    zmax <- 15
    ztemp <- ztemp[!is.na(ztemp)]
    ztemp <- ztemp[ztemp>zmin]
    ztemp <- ztemp[ztemp<zmax]
    if(length(ztemp)<=10) next

    #
    # Z-score histogram and mixture fit
    #
    breaks <- seq(-6,zmax,by=0.2)
    x <- hist(ztemp,xlim=c(zmin,zmax),main=paste("Z distribution for",src),cex.lab=1.2,cex.axis=1.2,ylab="Chemical/Assay Hits",xlab="Z-score",breaks=breaks,freq=TRUE)

    # NOTE(review): the left bin edges are passed to mix(); confirm against the
    # mixdist grouped-data convention
    mymix <- cbind(x$breaks[1:length(x$counts)],x$counts)
    fit <- mix(mymix,mixparam(mu=c(0,zmax),sigma=c(2,2)),"norm")
    center1 <- fit[[1]][1,2]
    center2 <- fit[[1]][2,2]
    ymax <- max(x$counts)
    print(fit)
    if(zmode=="original") lines(c(center1,center1),c(0,ymax/5),lwd=3,col="red")
    if(zmode=="norm") lines(c(0,0),c(0,ymax),lwd=1,col="red")
    lines(c(3,3),c(0,ymax),lwd=2,col="red")
    lines(c(0,0),c(0,ymax),lwd=2,col="red")
    if(zmode=="original") text(7,ymax*0.9,paste("Peak 1:",format(center1,digits=2)),pos=4)
    resmat[i,1] <- src
    resmat[i,2] <- center1
    resmat[i,3] <- center2

    #
    # AC50 histogram on geometrically spaced bins
    #
    amin <- 1e-4
    amax <- 1e4
    atemp <- atemp[!is.na(atemp)]
    atemp <- atemp[atemp<amax]
    atemp <- atemp[atemp>amin]

    # 101 break points starting at amin, each 1.2 times the previous one;
    # extended below until the largest AC50 is covered
    breaksA <- cumprod(c(amin,rep(1.2,100)))
    while(max(breaksA)<max(atemp)) {
      breaksA <- c(breaksA,1.2*breaksA[length(breaksA)])
      cat("Added another point to breaksA",max(breaksA),"\n")
    }
    xA <- hist(atemp,breaks=breaksA,plot=FALSE)
    ymax <- 1.5*max(xA$counts)
    hist.log(breaksA,xA$counts,ylim=c(0,ymax),xlab="AC50 (uM)",ylab="Hits",main=src,1000000,1000000,1000000)
    if(!to.file) browser()
  }
  if(!to.file) browser()
  outfile <- paste("../output/source_Zshifts_",zmode,".xlsx",sep="")
  write.xlsx(resmat,outfile)
}
#--------------------------------------------------------------------------------------
#
# z-score: shift the data
#
# QC=OK
#--------------------------------------------------------------------------------------
shift.zscore <- function() {
  # Build MAT.Z.NORM: subtract from every assay column of MAT.Z the Center1
  # value recorded for the assay's source group in
  # ../output/source_Zshifts_original.xlsx (no shift when the source group is
  # not listed there). Cells that are NA in MAT.Z, or untested, or not a hit,
  # are NA in the result. The result is also written to
  # ../output/zscore_matrix_norm.xlsx.
  print.current.function()
  flush.console()
  zshift <- read.xlsx("../output/source_Zshifts_original.xlsx")

  # cells that must be NA in the result, determined before any shifting
  invalid <- is.na(MAT.Z) | is.na(MAT.tested) | is.na(MAT.hitcall) |
    MAT.tested<=0 | MAT.hitcall<=0

  znorm <- MAT.Z
  for(assay in names(MAT.Z)) {
    source <- ASSAY.INFO[is.element(ASSAY.INFO[,"assay"],assay),"source_group"]
    shift <- zshift[is.element(zshift[,"source_group"],source),"Center1"]
    if(length(shift)==0) shift <- 0
    cat(assay,":",source,":",shift,"\n")
    znorm[,assay] <- znorm[,assay] - shift
  }
  znorm[invalid] <- NA

  MAT.Z.NORM <<- znorm
  write.xlsx(znorm,"../output/zscore_matrix_norm.xlsx")
}
#--------------------------------------------------------------------------------------
#
# Scale the top
#
#--------------------------------------------------------------------------------------
scale.top.by.assay <- function() {
  # Build MAT.T.SCALED from MAT.T. For assays whose source group starts with
  # NVS, ACEA, Tox21 or OT the top is left as is; for all other assays the top
  # is multiplied so that the 95th percentile of the hits' tops (clipped to
  # 0..200) becomes 100. The scale factor and the scaled values are capped at 200.
  #
  # The loop runs over the columns of MAT.T and looks each assay up in
  # ASSAY.INFO by name. The earlier version walked the first NASSAY rows of
  # ASSAY.INFO, which need not be the assays present in the matrices.
  #
  # Globals read:    MAT.T, MAT.tested, MAT.hitcall, ASSAY.INFO
  # Globals written: MAT.T.SCALED
  print.current.function()

  tscale <- MAT.T
  unscaled.prefixes <- c("NVS","ACEA","Tox21","OT")

  for(assay in colnames(MAT.T)) {
    source <- ASSAY.INFO[match(assay,ASSAY.INFO[,"assay"]),"source_group"]
    # assays without a known source group are left unscaled
    if(is.na(source)) next

    if(any(substring(source,1,nchar(unscaled.prefixes))==unscaled.prefixes)) {
      scaler <- 1
    }
    else {
      top <- MAT.T[,assay]
      top[top>200] <- 200
      top[top<0] <- 0
      # mark everything that is not a tested hit with -1 so it is excluded below
      top[is.na(top)] <- -1
      top[MAT.tested[,assay]==0] <- -1
      top[MAT.hitcall[,assay]==0] <- -1
      tlist <- as.numeric(top[top>0])
      # element 20 of the 5%-step quantiles is the 95th percentile
      scaler <- 100/quantile(tlist,probs=seq(0,1,0.05))[20]
    }
    # no usable hits gives an NA quantile: fall back to no scaling
    if(is.na(scaler)) scaler <- 1
    if(scaler>200) scaler <- 200
    tscale[,assay] <- MAT.T[,assay]*scaler
    if(scaler!=1) {
      cat(assay,":",scaler,"\n")
      flush.console()
    }
  }
  tscale[tscale>200] <- 200
  MAT.T.SCALED <<- tscale
}
#--------------------------------------------------------------------------------------
#
# print the summary stats
#
#--------------------------------------------------------------------------------------
print.summary.stats <- function() {
  # Assemble a plain-text summary (matrix sizes, counts per chemical and assay
  # category, hit counts in the four Z x T quadrants, hit ratios with and
  # without cytotoxicity), write it to ../output/toxcast_summary_stats.txt and
  # print it to the console.
  #
  # Globals read: MAT.AC50, CHEMS, ASSAY.INFO, MAT.Z.NORM, MAT.T.SCALED,
  #               MAT.hitcall, MAT.tested, CYTOTOX
  print.current.function()
  file <- "../output/toxcast_summary_stats.txt"
  rule <- "===================================================\n"

  # number of distinct non-NA values plus one "value<TAB>count" line per value
  tabulate.values <- function(values) {
    distinct <- sort(unique(values))
    rows <- vapply(distinct,function(v) paste(v,"\t",sum(is.element(values,v)),"\n",sep=""),character(1),USE.NAMES=FALSE)
    list(n=length(distinct),text=paste(rows,collapse=""))
  }
  # one report section: rule, title, count line, per-value lines
  section <- function(title,label,values) {
    tab <- tabulate.values(values)
    paste(rule,title,label,tab$n,"\n\n",tab$text,sep="")
  }

  s <- paste(rule,"ToxCast Summary Statistics\n",rule,
             "Number of assays: ",dim(MAT.AC50)[2],"\n",
             "Number of chemicals: ",dim(MAT.AC50)[1],"\n",sep="")

  # Phase I / II chemicals; which() skips chemicals whose phase flags are NA
  phase.count <- CHEMS[,"Phase_I"]+CHEMS[,"Phase_II"]
  code.list <- CHEMS[which(phase.count>0),"CODE"]
  s <- paste(s,"Number of chemicals in Phase I,II: ",length(code.list),"\n",sep="")
  s <- paste(s,"===================================================\n\n",sep="")

  use.tab <- tabulate.values(CHEMS[,"use_category"])
  s <- paste(s,"stats on different chemical use categories\n",rule,
             "Number of use categories: ",use.tab$n,"\n\n",use.tab$text,sep="")

  s <- paste(s,
             section("stats on different chemical use supercategories\n","Number of use super_categories: ",CHEMS[,"use_super_category"]),
             section("stats on different chemical structure categories\n","Number of structure categories: ",CHEMS[,"structure_category"]),
             section("stats on different chemical structure supercategories classes\n","Number of structure super_categories: ",CHEMS[,"structure_super_category"]),
             section("stats on different assay sources\n","Number of assay sources: ",ASSAY.INFO[,"source_group"]),
             section("stats on different assay genes\n","Number of assay gene targets: ",ASSAY.INFO[,"intended_target"]),
             section("stats on different assay biological processes\n","Number of assay biological_processes: ",ASSAY.INFO[,"biological_process"]),
             sep="")

  #
  # hits by quadrant: Z >= 3 vs 0 < Z < 3, scaled top >= 50 vs 0 < top < 50
  #
  s <- paste(s,rule,"stats on total hits and hits in 4 quadrants\n",sep="")

  zmat <- MAT.Z.NORM[code.list,]
  tmat <- MAT.T.SCALED[code.list,]
  hitmat <- MAT.hitcall[code.list,]
  testmat <- MAT.tested[code.list,]

  testmat[is.na(testmat)] <- 0
  testmat[testmat<0] <- 0

  hitmat[is.na(hitmat)] <- 0
  hitmat[hitmat<0] <- 0
  hitmat[testmat==0] <- 0

  # a hit with Z <= 0 is lifted to a small positive value so that it still
  # counts in the low-Z quadrants; everything that is not a tested hit is 0
  zmat[zmat<=0] <- 0.0001
  zmat[is.na(zmat)] <- 0
  zmat[testmat<=0] <- 0
  zmat[hitmat<=0] <- 0

  tmat[is.na(tmat)] <- 0
  tmat[testmat<=0] <- 0
  tmat[hitmat<=0] <- 0
  tmat[tmat<0] <- 0

  # 0/1 indicator: cells selected by `drop` become 0, remaining positive cells 1
  flag <- function(m,drop) {
    m[drop] <- 0
    m[m>0] <- 1
    m
  }
  zmat.hi <- flag(zmat,zmat<3)
  zmat.lo <- flag(zmat,zmat>=3)
  tmat.hi <- flag(tmat,tmat<50)
  tmat.lo <- flag(tmat,tmat>=50)

  sq1 <- sum(tmat.hi*zmat.hi)
  sq2 <- sum(tmat.lo*zmat.hi)
  sq3 <- sum(tmat.hi*zmat.lo)
  sq4 <- sum(tmat.lo*zmat.lo)
  ntested <- sum(testmat)
  nhit <- sum(hitmat)
  nall <- dim(testmat)[1]*dim(testmat)[2]
  rq1 <- sq1 / nhit
  rq2 <- sq2 / nhit
  rq3 <- sq3 / nhit
  rq4 <- sq4 / nhit
  rtested <- ntested / nall
  rhit <- nhit / ntested

  s <- paste(s,"Total Cells: ",nall,"\n",sep="")
  s <- paste(s,"Tested:      ",ntested," : ",format(rtested,digits=2),"\n",sep="")
  s <- paste(s,"Hits:        ",nhit," : ",format(rhit,digits=2),"\n",sep="")
  s <- paste(s,"Z.hi x T.hi: ",sq1," : ",format(rq1,digits=2),"\n",sep="")
  s <- paste(s,"Z.hi x T.lo: ",sq2," : ",format(rq2,digits=2),"\n",sep="")
  s <- paste(s,"Z.lo x T.hi: ",sq3," : ",format(rq3,digits=2),"\n",sep="")
  s <- paste(s,"Z.lo x T.lo: ",sq4," : ",format(rq4,digits=2),"\n",sep="")

  #
  # hit ratios for chemicals with (>= 2 cytotox hits) and without cytotoxicity
  #
  s <- paste(s,rule,"stats on total with and without cytotox\n",sep="")

  code.cytotox.no <- CYTOTOX[CYTOTOX[,"nhit"]<2,"CODE"]
  code.cytotox.yes <- CYTOTOX[CYTOTOX[,"nhit"]>=2,"CODE"]
  code.cytotox.no <- code.cytotox.no[is.element(code.cytotox.no,code.list)]
  code.cytotox.yes <- code.cytotox.yes[is.element(code.cytotox.yes,code.list)]
  s <- paste(s,"Number of chemicals without cytotox: ",length(code.cytotox.no),"\n",sep="")
  s <- paste(s,"Number of chemicals with cytotox:    ",length(code.cytotox.yes),"\n",sep="")

  # mean and SD over chemicals of (number of hits)/(number of assays tested)
  hit.ratio <- function(codes) {
    hit <- MAT.hitcall[codes,]
    test <- MAT.tested[codes,]
    hit[is.na(hit)] <- 0
    test[is.na(test)] <- 0
    hit[hit<0] <- 0
    test[test<0] <- 0
    frac <- rowSums(hit)/rowSums(test)
    list(mean=mean(frac),sd=sd(frac))
  }
  ratio.no <- hit.ratio(code.cytotox.no)
  s <- paste(s,"Mean hit ratio and SD for chemicals without cytotox:  ",format(ratio.no$mean,digits=2)," : ",format(ratio.no$sd,digits=2),"\n",sep="")
  ratio.yes <- hit.ratio(code.cytotox.yes)
  s <- paste(s,"Mean hit ratio and SD for chemicals with cytotox:     ",format(ratio.yes$mean,digits=2)," : ",format(ratio.yes$sd,digits=2),"\n",sep="")
  s <- paste(s,rule,sep="")

  cat(file=file,s,append=FALSE)
  cat(s)
}
#--------------------------------------------------------------------------------------
#
# cas name gsid cid
#
#--------------------------------------------------------------------------------------
chem.dictionary <- function() {
  # Build the chemical dictionary (CODE, CASRN, Name, gsid, cid, MW) for every
  # code in CODE.LIST. gsid, cid and molecular weight are looked up one
  # chemical at a time through run.query() against "dsstox"; a failed lookup
  # leaves the remaining fields NA. The table is written to
  # ../input/chemical_dictionary_<VARMATDATE>.xlsx and stored in CHEM.DICT.
  #
  # Globals read:    CODE.LIST, CHEMS, VARMATDATE
  # Globals written: CHEM.DICT
  print.current.function()
  nchem <- length(CODE.LIST)

  # double embedded single quotes so a value cannot terminate the SQL literal
  sql.quote <- function(x) gsub("'","''",x,fixed=TRUE)

  name.list <- c("CODE","CASRN","Name","gsid","cid","MW")
  mat <- as.data.frame(matrix(nrow=nchem,ncol=length(name.list)))
  names(mat) <- name.list
  for(i in seq_len(nchem)) {
    code <- CODE.LIST[i]
    casrn <- CHEMS[code,"CASRN"]
    mat[i,"CODE"] <- code
    mat[i,"CASRN"] <- casrn
    mat[i,"Name"] <- CHEMS[code,"Name"]

    query <- paste("select distinct gsid from synonym_mv where identifier = '",sql.quote(casrn),"'",sep="")
    gsid <- run.query(query,"dsstox")[1,1]
    mat[i,"gsid"] <- gsid
    if(!is.na(gsid)) {
      query <- paste("select cid from cas_compound where gsid=",gsid,sep="")
      cid <- run.query(query,"dsstox")[1,1]
      mat[i,"cid"] <- cid
      if(!is.na(cid)) {
        query <- paste("select mol_weight from compound where cid=",cid,sep="")
        mat[i,"MW"] <- run.query(query,"dsstox")[1,1]
      }
    }
    if(i%%100==0) cat("rows: ",i,"\n")
  }
  fname <- paste("../input/chemical_dictionary_",VARMATDATE,".xlsx",sep="")
  write.xlsx(mat,file=fname)
  CHEM.DICT <<- mat
}
#--------------------------------------------------------------------------------------
#
# export the files needed for the paper
#
#-------------------------------------------------------------------------------------
export.files <- function() {
  # Write the chemical table and the assay table as the S1 / S2 workbooks
  # in ../output.
  print.current.function()
  write.xlsx(CHEMS,"../output/S1 ToxCast Chemicals.xlsx")
  write.xlsx(ASSAY.INFO,"../output/S2 ToxCast Assays.xlsx")
}
  