#--------------------------------------------------------------------------------------
#
# tcp2_v11.R - code to analyze the ToxCast data
#
# October 2014
# Richard Judson
#
# US EPA
# Questions, comments to: judson.richard@epa.gov, 919-541-3085
#
# Order of processing
#
# burst.model(to.file=T) # RUN ONCE
# prepMatrices(suffix="140729")
# cytotox.hm(to.file=T,cex.col=0.1,cex.row=0.8)
# cytotox.hit.dist()
# shift.zscore()
# assay.source.summary(do.prep=T,to.file=T,zmode="original")
# assay.source.summary(do.prep=T,to.file=T,zmode="norm")
# assay.summary(to.file=T,do.tanim=F)
# hit.dist(to.file=T,target.gene=NA)
# calc.genescore(do.prep=T,zcut=3)
# load.genescore()
# prep.genescore.matrix(cutoff=0)
# load.genescore.matrix()
# prep.pathway("ER")
# prep.pathway("AR")
#
#--------------------------------------------------------------------------------------
options(java.parameters = "-Xmx1000m")
library(grDevices)
library(RColorBrewer)
library(stringr)
library(mixdist)
source("utils.R")
library(class)
# Directory holding the pipeline CSV exports. prepMatrices() builds every input
# path by pasting this prefix directly onto a file name (sep=""), so the value
# must keep its trailing "/".
VARMATDIR <<- "../input/fromPipeline/varmats/"

#--------------------------------------------------------------------------------------
#
# load all of the data
#
# QC=OK
#--------------------------------------------------------------------------------------
load.data.all <- function() {
    # Convenience entry point: loads everything by delegating to
    # prepMatrices() with its default arguments. The call is kept as the last
    # expression so the (invisible) result of prepMatrices() is passed through
    # unchanged.
    prepMatrices()
}
#--------------------------------------------------------------------------------------
#
# Create the full input AC50, T,B,Emax,... files, one row per chemical
#
# suffix=tag embedded in the names of the input files (default "141024")
#
#--------------------------------------------------------------------------------------
prepMatrices <- function(suffix="141024") {
    # Read one release of the pipeline output and publish it as global objects.
    #
    # Args:
    #   suffix: tag embedded in the names of the input files under VARMATDIR
    #           (e.g. "Chemical_Summary_<suffix>.csv").
    #
    # Side effects (all via <<-):
    #   PIPELINE.SAMPLES, PIPELINE.CHEMS, ALL.FLAGS, OLD.CHEMS, CHEMS, CODE.LIST,
    #   NCHEM, CYTOTOX, the MAT.* tables (rows = chemicals, columns = assays),
    #   ASSAY.LIST, NASSAY and ASSAY.INFO.
    #   Also writes ../input/ToxCast_Chems_Master_<suffix>.txt.
    #
    # Requires stringr (str_replace_all) and the global VARMATDIR.
    rule <- "==========================================================================\n"
    cat(rule)
    cat("Prepare the matrices ...\n")
    cat(rule)
    cat("Read in chemical data ...\n")
    flush.console()

    # ---- local helpers -------------------------------------------------------
    # Remove embedded double quotes from chemical names.
    strip.quotes <- function(x) str_replace_all(x,"\"","")
    # Read VARMATDIR/<stem><suffix>.csv with the settings shared by all inputs.
    read.pipeline.csv <- function(stem,...) {
        file <- paste(VARMATDIR,stem,suffix,".csv",sep="")
        read.csv(file,header=TRUE,stringsAsFactors=FALSE,quote="\"",comment.char="",...)
    }
    # Read one AllResults_<tag>_Matrix_<suffix>.csv file; first column = row names.
    read.varmat <- function(tag) {
        read.pipeline.csv(paste("AllResults_",tag,"_Matrix_",sep=""),row.names=1)
    }
    # Print the dimensions of a table under the given label.
    report <- function(label,m) {
        cat(label,dim(m),"\n")
        flush.console()
    }

    # ---- chemicals: one row per unique sample code ---------------------------
    samples <- read.pipeline.csv("Chemical_Summary_")
    PIPELINE.SAMPLES <<- samples
    code.list <- sort(unique(samples[,"code"]))
    # For every code take the first sample row carrying it.
    first.row <- match(code.list,samples[,"code"])
    chems <- data.frame(CODE=code.list,
                        CASRN=samples[first.row,"casn"],
                        Name=samples[first.row,"chnm"],
                        DSSTox_GSID=paste("DSSTox_",samples[first.row,"chid"],sep=""),
                        stringsAsFactors=FALSE)
    rownames(chems) <- chems[,"CODE"]
    PIPELINE.CHEMS <<- chems
    cat("Dimension of PIPELINE.CHEMS: ",dim(chems),"\n")
    flush.console()

    flags <- read.pipeline.csv("AllResults_flags_")
    flags[,"chnm"] <- strip.quotes(flags[,"chnm"])
    ALL.FLAGS <<- flags

    old.chems <- read.table("../input/ToxCast_GenericChemicals_2014_08_26.txt",header=TRUE,sep="\t",
                            stringsAsFactors=FALSE,quote="\"",comment.char="")
    rownames(old.chems) <- old.chems[,"CODE"]
    OLD.CHEMS <<- old.chems

    # ---- unified chemical table: pipeline chemicals + legacy annotations -----
    # The lookup uses an exact match on CODE. Indexing OLD.CHEMS by an unknown
    # row name returns a one-row all-NA frame, so the former "exactly one row"
    # test was always true and the "unknown"/0 defaults were never applied.
    master <- chems
    old.row <- match(master[,"CODE"],old.chems[,"CODE"])
    known <- !is.na(old.row)
    for(col in c("target_gene","use_category","structure_category","structure_super_category")) {
        value <- old.chems[old.row,col]
        value[!known] <- "unknown"
        master[,col] <- value
    }
    for(col in c("Phase_I","Phase_II","E1K","Tox21")) {
        value <- old.chems[old.row,col]
        value[!known] <- 0
        master[,col] <- value
    }
    rownames(master) <- master[,"CODE"]
    NCHEM <<- dim(master)[1]
    master[,"Name"] <- strip.quotes(master[,"Name"])

    CHEMS <<- master
    CODE.LIST <<- master[,"CODE"]

    outfile <- paste("../input/ToxCast_Chems_Master_",suffix,".txt",sep="")
    write.table(master,file=outfile, row.names=FALSE, append=FALSE, quote=TRUE, sep = "\t")
    cat("CHEMS read in\n")
    flush.console()

    #
    # assays
    #
    cat("Read in assay data ...\n")

    cyto <- read.pipeline.csv("AllResults_cyto_dist_")
    cyto <- cyto[,2:11]
    names(cyto) <- c("CASRN","Name","CODE","cytotox_median_raw","cytotox_mad","nhit","global_mad","cytotox_median_log", "cytotox_median_um","cytotox_lower_bound_um")
    rownames(cyto) <- cyto[,"CODE"]
    cyto[,"Name"] <- strip.quotes(cyto[,"Name"])
    CYTOTOX <<- cyto

    temp.tested <- read.varmat("tested")
    temp.hit <- read.varmat("hitc")
    temp.log_ac50 <- read.varmat("modl_ga")
    temp.log_ac10 <- read.varmat("modl_ac10")
    temp.log_acc <- read.varmat("modl_acc")
    temp.log_acb <- read.varmat("modl_acb")
    temp.log_loss_ac50 <- read.varmat("modl_la")
    temp.emax <- read.varmat("max_med")
    temp.w <- read.varmat("modl_gw")
    temp.loss_w <- read.varmat("modl_lw")
    temp.t <- read.varmat("modl_tp")
    temp.modl <- read.varmat("modl")
    temp.z <- read.varmat("zscore")
    temp.log_cmin <- read.varmat("logc_min")
    temp.log_cmax <- read.varmat("logc_max")
    # MAT.ZSCORE is the z-score table exactly as read; keep a copy before the
    # NA filling / masking applied to temp.z below (avoids reading the file twice).
    temp.zscore <- temp.z

    cat("all assay files read in\n")
    flush.console()

    # Missing log-potencies become 6 (i.e. 10^6 after exponentiation below);
    # every other missing value becomes 0.
    temp.log_ac10[is.na(temp.log_ac10)] <- 6
    temp.log_ac50[is.na(temp.log_ac50)] <- 6
    temp.log_loss_ac50[is.na(temp.log_loss_ac50)] <- 6
    temp.log_acc[is.na(temp.log_acc)] <- 6
    temp.log_acb[is.na(temp.log_acb)] <- 6
    temp.w[is.na(temp.w)] <- 0
    temp.emax[is.na(temp.emax)] <- 0
    temp.loss_w[is.na(temp.loss_w)] <- 0
    temp.t[is.na(temp.t)] <- 0
    temp.z[is.na(temp.z)] <- 0
    temp.modl[is.na(temp.modl)] <- 0
    temp.log_cmax[is.na(temp.log_cmax)] <- 0
    temp.log_cmin[is.na(temp.log_cmin)] <- 0
    temp.tested[is.na(temp.tested)] <- 0
    cat("NA fixed\n")
    flush.console()

    # Where the hit call is 0, reset the fitted curve parameters to the same
    # placeholders.
    not.hit <- temp.hit==0
    temp.log_ac10[not.hit] <- 6
    temp.log_ac50[not.hit] <- 6
    temp.log_loss_ac50[not.hit] <- 6
    temp.log_acc[not.hit] <- 6
    temp.log_acb[not.hit] <- 6
    temp.w[not.hit] <- 0
    temp.loss_w[not.hit] <- 0
    temp.t[not.hit] <- 0
    cat("hit matrix applied\n")
    flush.console()

    # Back-transform from log10 to linear values.
    temp.ac10 <- 10^temp.log_ac10
    temp.ac50 <- 10^temp.log_ac50
    temp.loss_ac50 <- 10^temp.log_loss_ac50
    temp.acc <- 10^temp.log_acc
    temp.acb <- 10^temp.log_acb
    temp.cmax <- 10^temp.log_cmax
    temp.cmin <- 10^temp.log_cmin
    cat("exponentiation\n")
    flush.console()

    # Chemical/assay pairs that were never tested are NA in every exported table.
    # The mask is computed once, before temp.hit itself is masked.
    untested <- temp.tested==0
    temp.ac10[untested] <- NA
    temp.ac50[untested] <- NA
    temp.acc[untested] <- NA
    temp.acb[untested] <- NA
    temp.loss_ac50[untested] <- NA
    temp.emax[untested] <- NA
    temp.w[untested] <- NA
    temp.loss_w[untested] <- NA
    temp.t[untested] <- NA
    temp.modl[untested] <- NA
    temp.z[untested] <- NA
    temp.hit[untested] <- NA
    temp.cmax[untested] <- NA
    temp.cmin[untested] <- NA
    temp.log_loss_ac50[untested] <- NA
    temp.log_ac50[untested] <- NA
    cat("test matrix applied\n")
    flush.console()

    # ---- publish -------------------------------------------------------------
    MAT.AC50 <<- temp.ac50
    MAT.logAC50 <<- temp.log_ac50
    report("AC50: ",temp.ac50)

    MAT.AC50_loss <<- temp.loss_ac50
    MAT.logAC50_loss <<- temp.log_loss_ac50
    report("AC50_loss: ",temp.loss_ac50)

    MAT.Emax <<- temp.emax
    report("Emax: ",temp.emax)

    MAT.model <<- temp.modl
    report("model: ",temp.modl)

    MAT.hitcall <<- temp.hit
    report("hitcall: ",temp.hit)

    MAT.T <<- temp.t
    report("T: ",temp.t)

    MAT.W <<- temp.w
    report("W: ",temp.w)

    MAT.Z <<- temp.z
    report("Z: ",temp.z)

    MAT.min_conc <<- temp.cmin
    report("min_conc: ",temp.cmin)

    MAT.max_conc <<- temp.cmax
    # The label previously read "min_conc: " here.
    report("max_conc: ",temp.cmax)

    MAT.AC10 <<- temp.ac10
    report("AC10: ",temp.ac10)

    MAT.ACC <<- temp.acc
    report("ACC: ",temp.acc)

    MAT.ACB <<- temp.acb
    report("ACB: ",temp.acb)

    MAT.ZSCORE <<- temp.zscore
    report("ZSCORE: ",temp.zscore)

    ASSAY.LIST <<- colnames(temp.ac50)
    NASSAY <<- length(ASSAY.LIST)

    # Assay annotation, reordered to follow the column order of the matrices.
    # NOTE: this file name is fixed and does not use `suffix`.
    file <- paste(VARMATDIR,"Assay_Summary_modified_141031.csv",sep="")
    temp.assay <- read.csv(file,header=TRUE,stringsAsFactors=FALSE,quote="\"",comment.char="")
    rownames(temp.assay) <- temp.assay[,"Assay"]
    temp.assay <- temp.assay[ASSAY.LIST,]
    ASSAY.INFO <<- temp.assay

}
#--------------------------------------------------------------------------------------
#
# do the heatmap of the cytotox assays
#
# QC=OK
#--------------------------------------------------------------------------------------
cytotox.hm <- function(to.file=T,cex.col=0.1,cex.row=0.8) {
    # Heat map of the cytotoxicity assays (rows) against the chemicals flagged
    # Phase_I or Phase_II in CHEMS (columns). Cell value = -log10(AC50/1e6),
    # with missing values shown as 0.
    #
    # Args:
    #   to.file: if TRUE draw into ../plots/cytotox_heatmap.pdf, otherwise draw
    #            on the current device and pause in browser().
    #   cex.col, cex.row: label sizes passed to heatmap() as cexCol / cexRow.
    #
    # Reads globals ASSAY.LIST, CHEMS, CODE.LIST, MAT.AC50; sets CYTOTOX.ASSAYS.
    rule <- "==========================================================================\n"
    cat(rule,"cytotox.hm\n",rule,sep="")
    flush.console()

    # Keep only the listed cytotoxicity assays that are present in the data.
    listed <- read.table("../input/CytotoxAssays_20141030.txt",header=TRUE,sep="\t",
                         stringsAsFactors=FALSE,quote="\"",comment.char="")[,1]
    CYTOTOX.ASSAYS <<- listed[listed %in% ASSAY.LIST]

    if(to.file) {
        pdf(file="../plots/cytotox_heatmap.pdf",width=7,height=7,pointsize=12,bg="white",paper="letter",pagecentre=TRUE)
    }

    # Chemicals belonging to Phase I and/or Phase II.
    in.phase <- (CHEMS[,"Phase_I"]+CHEMS[,"Phase_II"])>0
    codes <- CODE.LIST[in.phase]
    codes <- codes[!is.na(codes)]

    potency <- t(as.matrix(MAT.AC50[codes,CYTOTOX.ASSAYS]))
    potency <- -log10(potency/1000000)
    potency[is.na(potency)] <- 0

    ward.cluster <- function(x) hclust(d=dist(x),method="ward")
    title <- paste(ncol(potency)," chemicals, ",nrow(potency)," assays")
    result <- heatmap(potency,margins=c(5,15),scale="none",main=title,
                      xlab="Chemicals",ylab="Assays",cexCol=cex.col,cexRow=cex.row,
                      col=brewer.pal(9,"Reds"),hclustfun=ward.cluster,
                      keep.dendro=TRUE,verbose=TRUE)

    if(to.file) dev.off()
    else browser()
}
#--------------------------------------------------------------------------------------
#
# prep pathway
#
#--------------------------------------------------------------------------------------
prep.pathway <- function(pathname="ER") {
    # Export the per-pathway input files under ../pathways/.
    #
    # Args:
    #   pathname: "ER", "AR" or "TR"; selects the assay set below.
    #
    # For each variable (AC50, W, T, Emax, hitcall, min_conc, max_conc, AC10,
    # ACB, ACC, Z) the pathway's assay columns are joined to the first four
    # columns of CHEMS and written to Pathway_<pathname>_<variable>.txt.
    # Except for "TR", only chemicals with an AC50 in every pathway assay are
    # kept. The matching flag, cytotox and chemical tables are written as well.
    #
    # Reads globals MAT.*, CHEMS, CYTOTOX and ER.FLAGS / AR.FLAGS / TR.FLAGS
    # (the *.FLAGS objects are not created in this part of the file).
    # Sets global CODE.TEMP to the exported chemical codes.
    assay.list <- switch(pathname,
        ER = c("NVS_NR_bER","NVS_NR_hER","NVS_NR_mERa",
               "OT_ER_ERaERa_0480","OT_ER_ERaERa_1440","OT_ER_ERaERb_0480","OT_ER_ERaERb_1440","OT_ER_ERbERb_0480","OT_ER_ERbERb_1440",
               "OT_ERa_EREGFP_0120","OT_ERa_EREGFP_0480",
               "ATG_ERa_TRANS_up","ATG_ERE_CIS_up",
               "Tox21_ERa_BLA_Agonist_ratio","Tox21_ERa_LUC_BG1_Agonist",
               "ACEA_T47D_80hr_Positive",
               "Tox21_ERa_BLA_Antagonist_ratio","Tox21_ERa_LUC_BG1_Antagonist"),
        AR = c("NVS_NR_hAR","NVS_NR_cAR",
               "OT_AR_ARSRC1_0480","OT_AR_ARSRC1_0960",
               "ATG_AR_TRANS_up",
               "Tox21_AR_BLA_Agonist_ratio","Tox21_AR_LUC_MDAKB2_Agonist",
               "Tox21_AR_BLA_Antagonist_ratio","Tox21_AR_LUC_MDAKB2_Antagonist"),
        TR = c("ATG_THRa1_TRANS_up","NVS_NR_hTRa","Tox21_TR_LUC_GH3_Agonist","Tox21_TR_LUC_GH3_Antagonist","NVS_GPCR_rTRH"),
        # Previously an unknown name surfaced as "object 'assay.list' not found".
        stop("prep.pathway: unknown pathname '",pathname,"' (expected \"ER\", \"AR\" or \"TR\")",call.=FALSE))

    # Assays whose top-of-curve (T) values are multiplied by 25 on export.
    scaled.top <- switch(pathname,
        ER = c("ATG_ERa_TRANS_up","ATG_ERE_CIS_up"),
        AR = "ATG_AR_TRANS_up",
        TR = "ATG_THRa1_TRANS_up")

    # Shows which of the pathway assays are present in the loaded data.
    print(is.element(assay.list,names(MAT.AC50)))
    cat("==========================================================================\n")
    cat("Prepare the ",pathname,"file ...\n")
    cat("==========================================================================\n")

    # Variables are exported in this order.
    var.mats <- list(AC50=MAT.AC50,W=MAT.W,T=MAT.T,Emax=MAT.Emax,hitcall=MAT.hitcall,
                     min_conc=MAT.min_conc,max_conc=MAT.max_conc,
                     AC10=MAT.AC10,ACB=MAT.ACB,ACC=MAT.ACC,Z=MAT.Z)

    # mask is 1 for a chemical with a missing AC50 in any pathway assay, else 0.
    # The same mask filters every variable, so it is computed once up front.
    mask <- as.numeric(rowSums(is.na(MAT.AC50[,assay.list]))>0)
    cat("Number of full rows: ",length(mask)-sum(mask)," out of ",length(mask),"\n")
    keep <- mask==0

    for(variable in names(var.mats)) {
        data.mat <- var.mats[[variable]][,assay.list]
        if(variable=="T") data.mat[,scaled.top] <- data.mat[,scaled.top]*25
        data.new <- cbind(CHEMS[,1:4],data.mat)
        if(pathname!="TR") data.new <- data.new[keep,]
        file <- paste("../pathways/Pathway_",pathname,"_",variable,".txt",sep="")
        write.table(data.new, file=file,row.names=FALSE,append=FALSE,quote=TRUE,sep = "\t")
        cat("Data written for variable: ",variable,"\n")
        flush.console()
    }

    # Codes of the exported chemicals ("TR" is not filtered).
    if(pathname=="TR") CODE.TEMP <<- CHEMS[,"CODE"]
    else CODE.TEMP <<- CHEMS[keep,"CODE"]

    flags <- switch(pathname,ER=ER.FLAGS,AR=AR.FLAGS,TR=TR.FLAGS)

    # Disabled step kept from the original: it would prepend Name / CODE / Assay
    # columns to the flag table, pausing in browser() after each row.
    doit <- FALSE
    if(doit) {
        flags <- cbind(flags[,1],flags)
        names(flags)[1] <- "Assay"
        flags <- cbind(flags[,1],flags)
        names(flags)[1] <- "CODE"
        flags <- cbind(flags[,1],flags)
        names(flags)[1] <- "Name"
        flags[,"Assay"] <- as.character(flags[,"Assay"])
        flags[,"CODE"] <- as.character(flags[,"CODE"])
        flags[,"Name"] <- as.character(flags[,"Name"])
        n <- dim(flags)[1]
        for(i in seq_len(n)) {
            aeid <- flags[i,"aeid"]
            aname <- ASSAY.INFO[is.element(ASSAY.INFO[,"aeid"],aeid),"Assay"]
            spid <- flags[i,"spid"]
            code <- PIPELINE.SAMPLES[is.element(PIPELINE.SAMPLES[,"spid"],spid),"code"]
            cname <- PIPELINE.SAMPLES[is.element(PIPELINE.SAMPLES[,"spid"],spid),"chnm"]
            flags[i,"Assay"] <- aname
            flags[i,"CODE"] <- code
            flags[i,"Name"] <- cname
            browser()
        }
    }
    file <- paste("../pathways/Pathway_",pathname,"_Flags.txt",sep="")
    write.table(flags, file=file,row.names=FALSE,append=FALSE,quote=TRUE,sep = "\t")
    cat("Flags written for pathway: ",pathname,"\n")

    cytotox.temp <- CYTOTOX[CODE.TEMP,]
    file <- paste("../pathways/Pathway_",pathname,"_Cytotox.txt",sep="")
    write.table(cytotox.temp, file=file,row.names=FALSE,append=FALSE,quote=TRUE,sep = "\t")
    cat("Cytotox written for pathway: ",pathname,"\n")

    chems.temp <- CHEMS[CODE.TEMP,]
    file <- paste("../pathways/Pathway_",pathname,"_Chemicals.txt",sep="")
    write.table(chems.temp, file=file,row.names=FALSE,append=FALSE,quote=TRUE,sep = "\t")
    cat("Chemicals written for pathway: ",pathname,"\n")

}
#--------------------------------------------------------------------------------------
#
# Model the burst - make an example plot
#
# QC=OK
#--------------------------------------------------------------------------------------
burst.model <- function(to.file=F,df=4,ncp=1,yscale=10,cmin=0.25,cmax=1.5) {
    # Draw the schematic "burst" figure: five stacked panels, each with a
    # non-central chi-square curve, a gray rectangle and red bars marking a
    # concentration range, plus the annotation text.
    #
    # Args:
    #   to.file: if TRUE draw into plots/burst_model.pdf, otherwise draw on the
    #            current device and pause in browser().
    #   df, ncp: parameters passed to dchisq() for the curve shape.
    #   yscale:  vertical scale factor applied to the density.
    #   cmin, cmax: offsets (log10 units) that place the two ends of the red
    #            range bar in each panel.
    # NOTE(review): the PDF path here is "plots/..." while the other plotting
    # functions in this file use "../plots/..." - confirm which is intended.
    rule <- "==========================================================================\n"
    cat(rule,"burst.modelt\n",rule,sep="")
    flush.console()
    if(to.file) {
        pdf(file="plots/burst_model.pdf",width=7,height=7,pointsize=12,bg="white",paper="letter",pagecentre=TRUE)
    }
    par(mfrow=c(1,1),mar=c(4,4,2,2))
    plot(1~1,xlim=c(1e-4,1e4),ylim=c(0,11),cex.lab=1.5,cex.axis=1.5,type="n",xlab="Concentration (uM)",ylab="Number of hits",log="x")

    # Abscissa grid shared by all panels; the curve's x positions do not depend
    # on the panel, so they are computed once.
    span <- 10
    grid <- seq(-10,span*5,by=span/100)
    curve.x <- 10^(-(grid/5+2+0.30))*1000000

    for(panel in 1:5) {
        floor.y <- 2*(panel-1)      # baseline of this panel
        bar.y <- 2*panel-0.5        # height of the red range bar
        conc.cmin <- 10^(-(cmin+panel))*1000000
        conc.cmax <- 10^(-(cmax+panel))*1000000
        conc.mid <- 10^(-0.5*(cmin+cmax+2*panel))*1000000

        rect(conc.cmax,floor.y,1e5,bar.y,col="gray")
        curve.y <- yscale*dchisq(10+grid-5*panel,df=df,ncp=ncp,log=FALSE) + floor.y
        lines(curve.y~curve.x,lwd=2)
        lines(c(1e-4,1e4),c(floor.y,floor.y),col="black")

        lines(c(conc.cmin,conc.cmax),c(bar.y,bar.y),col="red",lwd=5)
        lines(c(conc.mid,conc.mid),c(floor.y+1,bar.y),col="red",lwd=5)

        if(panel==4) arrows(x0=conc.mid,y0=7.1,x1=conc.mid/100,lwd=3,length=0.1)
    }

    lines(c(100,100),c(0,10),lwd=6)
    arrows(x0=100,y0=10,x1=1e-2,lwd=6)
    text(6e-3,10.5,"Concentration Range Tested",pos=4,cex=1.3)
    text(2e-2,5.5,"Cytotoxicity Range",cex=1.3,pos=4,col="red")
    text(0.3,8.5,"Burst",cex=1.3,pos=4,col="black")
    text(0.01,6.75,"Z (log units)",cex=1.3,pos=4,col="black")
    text(1e-4,9,"Chemical A: Most Potent",cex=1.1,pos=4,col="black")
    text(1e-4,1,"Chemical E: Least Potent",cex=1.1,pos=4,col="black")
    if(to.file) dev.off()
    else browser()
}
#--------------------------------------------------------------------------------------
#
# Assay source summary table
# zmode=original or norm
# QC=OK
#--------------------------------------------------------------------------------------
assay.source.summary <- function(do.prep=F,to.file=F,zmode="norm") {
    # Per assay source: histogram of z-scores with a two-component normal
    # mixture fit, and a log-binned histogram of AC50 values. The fitted
    # component centres are written to ../output/source_z_shifts_<zmode>.txt.
    #
    # Args:
    #   do.prep: if TRUE rebuild the chemical mask (chemicals with at least 200
    #            non-missing hit calls) and publish it as globals chem.mask and
    #            ctemp; if FALSE the existing global chem.mask is used.
    #   to.file: if TRUE plot into ../plots/by_source_z_dist_<zmode>.pdf;
    #            otherwise plot on the current device and pause in browser()
    #            after every source.
    #   zmode:   "original" uses MAT.ZSCORE, "norm" uses MAT.ZSCORE.NORM.
    #
    # Reads globals ASSAY.INFO, MAT.hitcall, MAT.AC50, MAT.ZSCORE and
    # MAT.ZSCORE.NORM; uses uniquify() and hist.log() (defined outside this
    # part of the file) and mix()/mixparam() from mixdist.

    # Fail early: an unsupported zmode used to surface later as
    # "object 'ztemp' not found", after the PDF device had been opened.
    if(length(zmode)!=1 || !is.element(zmode,c("original","norm"))) {
        stop("assay.source.summary: zmode must be \"original\" or \"norm\"",call.=FALSE)
    }
    if(to.file) {
        fname <- paste("../plots/by_source_z_dist_",zmode,".pdf",sep="")
        pdf(file=fname,width=7,height=10,pointsize=12,bg="white",paper="letter",pagecentre=TRUE)
        # Close the device even when a later step fails.
        on.exit(dev.off(),add=TRUE)
    }
    par(mfrow=c(4,2),mar=c(4,4,2,2))

    source.list <- sort(uniquify(ASSAY.INFO[,"Source"]))
    nsource <- length(source.list)

    if(do.prep) {
        # ctemp: 1 where a hit call exists, 0 where it is missing.
        ctemp <- MAT.hitcall
        ctemp[!is.na(MAT.hitcall)] <- 1
        ctemp[is.na(MAT.hitcall)] <- 0
        # chem.mask: 1 for chemicals with at least 200 hit calls, else 0.
        chem.mask <- rowSums(ctemp)
        chem.mask[chem.mask<200] <- 0
        chem.mask[chem.mask>0] <- 1
        chem.mask <<- chem.mask
        ctemp <<- ctemp
    }
    resmat <- as.data.frame(matrix(nrow=nsource,ncol=3))
    names(resmat) <- c("Source","Center1","Center2")

    for(i in seq_len(nsource)) {
        src <- source.list[i]
        assay.list <- ASSAY.INFO[is.element(ASSAY.INFO[,"Source"],src),"Assay"]
        assay.list <- assay.list[is.element(assay.list,colnames(MAT.AC50))]
        cat(src," : ",length(assay.list),"\n")
        flush.console()

        # These two sources are skipped; their resmat rows stay NA.
        if(is.element(src,c("Tox21_viability","ATG_CYTOTOX"))) next

        if(zmode=="original") ztemp <- MAT.ZSCORE[,assay.list]
        else ztemp <- MAT.ZSCORE.NORM[,assay.list]
        atemp <- MAT.AC50[,assay.list]
        if(length(assay.list)==1) {
            # A single column comes back as a plain vector.
            atemp <- atemp[chem.mask==1]
            ztemp <- ztemp[chem.mask==1]
        }
        else {
            atemp <- atemp[chem.mask==1,]
            ztemp <- ztemp[chem.mask==1,]
            print("fix conversion")
            atemp <- as.numeric(as.matrix(atemp))
            ztemp <- as.numeric(as.matrix(ztemp))
        }

        # ---- z-score histogram and mixture fit -------------------------------
        zmax <- 15
        zmin <- -5
        ztemp <- ztemp[!is.na(ztemp)]
        ztemp <- ztemp[ztemp> -5]
        ztemp <- ztemp[ztemp< zmax]
        zhist <- hist(ztemp,xlim=c(zmin,zmax),main=paste("Z dist for",src),cex.lab=1.2,cex.axis=1.2,
                      xlab="Z-score",breaks=seq(-6,zmax,by=0.2),freq=TRUE)

        # mix() is given the left edge of every bin together with its count.
        mymix <- cbind(zhist$breaks[seq_along(zhist$counts)],zhist$counts)
        fit <- mix(mymix,mixparam(mu=c(0,zmax),sigma=c(2,2)),"norm")
        center1 <- fit[[1]][1,2]
        center2 <- fit[[1]][2,2]
        ymax <- max(zhist$counts)
        print(fit)
        if(zmode=="original") lines(c(center1,center1),c(0,ymax/5),lwd=3,col="red")
        if(zmode=="norm") lines(c(0,0),c(0,ymax),lwd=1,col="red")
        lines(c(3,3),c(0,ymax),lwd=2,col="black")
        lines(c(0,0),c(0,ymax),lwd=2,col="black")
        if(zmode=="original") text(7,ymax*0.9,paste("Peak 1:",format(center1,digits=2)),pos=4)

        resmat[i,1] <- src
        resmat[i,2] <- center1
        resmat[i,3] <- center2

        # ---- AC50 histogram on geometric bins --------------------------------
        amin <- 1e-4
        amax <- 1e4
        atemp <- atemp[!is.na(atemp)]
        atemp <- atemp[atemp<amax]
        atemp <- atemp[atemp>amin]

        # 101 break points starting at 1e-4, each 1.2 times the previous one.
        # Uses its own index (k) so the source index i is left untouched.
        breaksA <- numeric(101)
        breaksA[1] <- 1e-4
        for(k in 2:101) breaksA[k] <- 1.2*breaksA[k-1]
        # Extend until the largest retained AC50 is covered.
        while(max(breaksA)<max(atemp)) {
            breaksA <- c(breaksA,1.2*breaksA[length(breaksA)])
            cat("Added another point to breaksA",max(breaksA),"\n")
        }
        xA <- hist(atemp,breaks=breaksA,plot=FALSE)
        ymax <- 1.5*max(xA$counts)
        hist.log(breaksA,xA$counts,ylim=c(0,ymax),xlab="AC50 (uM)",ylab="Hits",main=src,1000000,1000000,1000000)
        if(!to.file) browser()
    }
    if(!to.file) browser()
    outfile <- paste("../output/source_z_shifts_",zmode,".txt",sep="")
    write.table(resmat,file=outfile, row.names=FALSE, append=FALSE, quote=FALSE, sep = "\t")
}
#--------------------------------------------------------------------------------------
#
# Assay summary table
# to.file=write the plots to a PDF; do.tanim=also test structural similarity of selective hits
# QC=OK
#--------------------------------------------------------------------------------------
assay.summary <- function(to.file=F,do.tanim=F) {
    # Per-assay summary: numbers and fractions of tested chemicals, hits,
    # selective hits (normalised z-score >= 3) and potent selective hits
    # (also AC50 <= 10), a one-sided Wilcoxon test of the assay's z-scores
    # against all z-scores and, optionally, a structural-similarity test.
    # One z-score histogram is drawn per assay that has at least one hit.
    #
    # Args:
    #   to.file:  if TRUE plot into ../plots/by_assay_z_dist.pdf; otherwise
    #             plot on the current device and pause in browser() after
    #             every histogram.
    #   do.tanim: if TRUE compare the pairwise similarities (TANIM.STRUCTURE)
    #             of an assay's selective chemicals with all similarities.
    #
    # Reads globals ASSAY.LIST, ASSAY.INFO, MAT.AC50, MAT.hitcall,
    # MAT.ZSCORE.NORM, CODE.LIST and (if do.tanim) TANIM.STRUCTURE.
    # Sets global ASSAY.SUMMARY and writes ../output/assay_summary.txt.
    if(to.file) {
        fname <- "../plots/by_assay_z_dist.pdf"
        pdf(file=fname,width=7,height=10,pointsize=12,bg="white",paper="letter",pagecentre=TRUE)
        # Close the device even when a later step fails.
        on.exit(dev.off(),add=TRUE)
    }
    par(mfrow=c(4,2),mar=c(4,4,2,2))

    nassay <- length(ASSAY.LIST)
    # The table is sized from its column names. It used to be allocated with 13
    # columns for 12 names, which added an all-NA column named "NA" to the output.
    col.names <- c("Assay","Source","ntest","nhit","nhit.selective","nhit.potent.selective",
                   "hit.fraction","hit.fraction.selective","hit.fraction.potent.selective",
                   "p.selective","n.struct.sim","p.struct.sim")
    temp <- as.data.frame(matrix(nrow=nassay,ncol=length(col.names)))
    names(temp) <- col.names

    # Reference distributions. as.matrix() first, so a data frame is flattened
    # to a numeric vector (as.vector() alone would return a list).
    zdist <- as.vector(as.matrix(MAT.ZSCORE.NORM))
    zdist <- zdist[!is.na(zdist)]
    if(do.tanim) {
        sdist <- as.vector(as.matrix(TANIM.STRUCTURE))
        sdist <- sdist[!is.na(sdist)]
    }

    # chem.mask: 1 for chemicals with at least 200 non-missing hit calls.
    ctemp <- MAT.hitcall
    ctemp[!is.na(MAT.hitcall)] <- 1
    ctemp[is.na(MAT.hitcall)] <- 0
    chem.mask <- rowSums(ctemp)
    chem.mask[chem.mask<200] <- 0
    chem.mask[chem.mask>0] <- 1
    screened <- chem.mask==1

    for(i in seq_len(nassay)) {
        assay <- ASSAY.LIST[i]

        src <- ASSAY.INFO[is.element(ASSAY.INFO[,"Assay"],assay),"Source"][1]
        print(assay)
        flush.console()
        if(is.na(src)) {
            cat("No source: ",assay,"\n")
            browser()
        }
        atemp <- MAT.AC50[,assay][screened]
        atemp.disc <- MAT.hitcall[,assay][screened]
        ztemp <- MAT.ZSCORE.NORM[,assay][screened]

        # A chemical counts as tested when it has a hit call.
        tested <- !is.na(atemp.disc)
        ntest <- sum(tested)
        nhit <- sum(atemp.disc[tested])

        # Selective: z-score >= 3, among tested chemicals with a z-score.
        ztemp <- ztemp[tested]
        z.known <- !is.na(ztemp)
        za <- ztemp[z.known]
        z.mask <- as.numeric(za>=3)
        nhit.selective <- sum(z.mask)

        # Potent: AC50 <= 10, evaluated on the same chemicals.
        ac50 <- atemp[tested][z.known]
        p.mask <- as.numeric(ac50>0 & ac50<=10)
        nhit.potent.selective <- sum(p.mask*z.mask)

        temp[i,"Assay"] <- assay
        temp[i,"Source"] <- src
        temp[i,"ntest"] <- ntest
        temp[i,"nhit"] <- nhit
        temp[i,"nhit.selective"] <- nhit.selective
        temp[i,"nhit.potent.selective"] <- nhit.potent.selective
        temp[i,"hit.fraction"] <- nhit / max(1,ntest)
        temp[i,"hit.fraction.selective"] <- nhit.selective / max(1,ntest)
        temp[i,"hit.fraction.potent.selective"] <- nhit.potent.selective / max(1,ntest)

        # Are this assay's z-scores shifted up relative to all z-scores?
        # Needs at least 5 values; otherwise p = 1.
        p.spec <- 1
        if(length(za)>=5) {
            p.spec <- wilcox.test(za,zdist,alternative="greater")$p.value
            cat("p.spec: ",assay,":",format(p.spec,digits=3),"\n")
        }
        flush.console()
        temp[i,"p.selective"] <- p.spec

        # Structural similarity among selective chemicals (all chemicals here,
        # not only the screened ones).
        z.all <- MAT.ZSCORE.NORM[,assay]
        selective <- !is.na(z.all) & z.all>=3
        n.selective <- sum(selective)
        p.struct.sim <- 1
        if(n.selective>=5 && do.tanim) {
            # Held under its own name; this used to shadow the global CHEMS.
            sel.codes <- CODE.LIST[selective]
            in.set <- is.element(names(TANIM.STRUCTURE),sel.codes)
            smat <- TANIM.STRUCTURE[in.set,in.set]
            sdist.assay <- as.vector(as.matrix(smat))
            if(length(sdist.assay)>=5) {
                p.struct.sim <- t.test(sdist.assay,sdist,alternative="greater")$p.value
                cat(i,"p.struct.sim: ",assay,":",n.selective,":",format(p.struct.sim,digits=3),"\n")
            }
        }
        temp[i,"n.struct.sim"] <- n.selective
        temp[i,"p.struct.sim"] <- p.struct.sim

        if(nhit>0) {
            xmin <- -6
            zplot <- za[za> -5]
            zplot <- zplot[zplot< 8]
            if(length(zplot)>0) {
                # Widen the axis only when there are values above 6.
                xmax <- if(max(zplot)>6) 8 else 6
                breaks <- seq(-6,xmax,by=0.2)
                x <- hist(zplot,xlim=c(xmin,xmax),main=assay,cex.lab=1.2,cex.axis=1.2,xlab="Z-score",breaks=breaks,freq=TRUE)
                ymax <- max(x$counts)
                eps <- 0.1
                text(-6,ymax*(1-1*eps),paste("ntry=",ntest,sep=""),pos=4)
                text(-6,ymax*(1-2*eps),paste("nhit=",nhit,sep=""),pos=4)
                text(-6,ymax*(1-3*eps),paste("p.spec=",format(p.spec,digits=2),sep=""),pos=4)
                text(-6,ymax*(1-4*eps),paste("p.strSim=",format(p.struct.sim,digits=2),sep=""),pos=4)
                lines(c(0,0),c(0,ymax/4),lwd=2,col="red")
                if(!to.file) browser()
            }
        }
    }
    if(!to.file) browser()
    ASSAY.SUMMARY <<- temp
    outfile <- "../output/assay_summary.txt"
    write.table(temp,file=outfile, row.names=FALSE, append=FALSE, quote=FALSE, sep = "\t")
}
#--------------------------------------------------------------------------------------
#
# load the structure similarity matrix
#
# QC=OK
#--------------------------------------------------------------------------------------
load.tanim.structure <- function() {
    # Read the tab-delimited structure-similarity table from structure_input/
    # and publish it as the global TANIM.STRUCTURE, then report its dimensions.
    rule <- "==========================================================================\n"
    cat(rule)
    cat("load.tanim.structure\n")
    cat(rule)
    flush.console()
    TANIM.STRUCTURE <<- read.table("structure_input/ToxCast_Tanimoto_matrix_REDUCED_2013_03_05.txt",
                                   header=TRUE,sep="\t",stringsAsFactors=FALSE,
                                   quote="",comment.char="")
    cat("Structure similarity matrix: ",dim(TANIM.STRUCTURE),"\n")
}
#--------------------------------------------------------------------------------------
#
# build the heatmap for the biological processes by chemicals filtered by z
#
# --------------------------------------------------------------------------------------
bio.z.hm <- function(zcut=3,zset="LO",cutoff=0.5,to.file=F) {
    # Heatmap of biological process (rows) x Phase I/II chemical (columns).
    # Each cell is the fraction of the process's assays in which the chemical's
    # z-score passes the filter; fractions below 'cutoff' are zeroed.
    #
    # zcut    - z-score threshold
    # zset    - "LO" keeps z <= zcut, "HI" keeps z >= zcut (missing z never counts)
    # cutoff  - minimum fraction of assays for a cell to be shown
    # to.file - write the plot to ../plots/ instead of pausing in browser()
    #
    # Reads the globals CHEMS, MAT.ZSCORE and ASSAY.INFO.
    if(!is.element(zset,c("LO","HI"))) stop("zset must be \"LO\" or \"HI\", got: ",zset,call.=FALSE)

    code.list <- CHEMS[is.element(CHEMS[,"Phase_I"],1),"CODE"]
    code.list <- c(code.list,CHEMS[is.element(CHEMS[,"Phase_II"],1),"CODE"])
    code.list <- sort(unique(code.list))
    nchem <- length(code.list)

    # The original code also subset MAT.ZSCORE.NORM here but immediately
    # overwrote the result, so MAT.ZSCORE is the matrix actually used.
    zmat <- as.matrix(MAT.ZSCORE[code.list,])

    # Explicit logical masks instead of sentinel values: a z-score of exactly 0
    # is now counted for "LO", and "HI" stays binary for any zcut.
    if(zset=="LO") hit <- !is.na(zmat) & zmat<=zcut
    else hit <- !is.na(zmat) & zmat>=zcut

    bio.list <- sort(unique(ASSAY.INFO[,"biological_process"]))
    bio.list <- bio.list[!is.element(bio.list,c("cytotox other","assay QC","regulation of catalytic activity activator"))]
    nprocess <- length(bio.list)
    dtemp <- matrix(0,ncol=nprocess,nrow=nchem)
    rownames(dtemp) <- code.list
    for(i in seq_len(nprocess)) {
        bp <- bio.list[i]
        assay.list <- ASSAY.INFO[is.element(ASSAY.INFO[,"biological_process"],bp),"Assay"]
        assay.list <- assay.list[is.element(assay.list,colnames(hit))]
        # a process with no assays in the matrix keeps its zeros (avoids 0/0 = NaN)
        if(length(assay.list)>0) {
            dtemp[,i] <- as.numeric(rowSums(hit[,assay.list,drop=FALSE]) / length(assay.list))
        }
    }
    colnames(dtemp) <- bio.list

    if(to.file) {
        fname <- paste("../plots/bioZ_",zcut,"_",zset,"_hm.pdf",sep="")
        pdf(file=fname,width=7,height=7,pointsize=12,bg="white",paper="letter",pagecentre=TRUE)
    }
    dtemp[dtemp<cutoff] <- 0
    heatmap(t(as.matrix(dtemp)),margins=c(10,10),scale="none",main=paste("Biological process / Z: ",zcut,":",zset),
            xlab="",ylab="",cexCol=0.1,cexRow=0.8,col=brewer.pal(9,"Reds"),
            hclustfun=function(x) hclust(d=dist(x),method="ward.D"),keep.dendro=TRUE,verbose=TRUE)

    if(to.file) dev.off()
    else browser()
}
#--------------------------------------------------------------------------------------
#
# per assay, compare the mean W of chemicals with Z <= zcut1 against those with Z >= zcut2
#
# --------------------------------------------------------------------------------------
WxZ <- function(zcut1=3,zcut2=5,to.file=F) {
    # For every assay in ASSAY.LIST, compute the mean and sd of MAT.W over the
    # chemicals whose normalized z-score is <= zcut1 and over those whose
    # z-score is >= zcut2, then scatter the two means against each other with
    # sd error bars, colored by assay source.
    #
    # zcut1   - upper z bound of the low-Z group (x axis)
    # zcut2   - lower z bound of the high-Z group (y axis)
    # to.file - write the plot to plots/WxZ.pdf instead of pausing in browser()
    #
    # Reads the globals ASSAY.LIST, ASSAYS, MAT.W and MAT.ZSCORE.NORM.
    nassay <- length(ASSAY.LIST)
    result <- matrix(0,nrow=nassay,ncol=4)
    rownames(result) <- ASSAY.LIST

    for(i in seq_len(nassay)) {
        assay <- ASSAY.LIST[i]
        w <- MAT.W[,assay]
        z <- MAT.ZSCORE.NORM[,assay]
        # logical masks instead of recoding z to sentinel values (1/10/100),
        # so no real z value or threshold can collide with a sentinel
        lo <- !is.na(z) & z<=zcut1
        hi <- !is.na(z) & z>=zcut2
        result[i,] <- c(mean(w[lo]),sd(w[lo]),mean(w[hi]),sd(w[hi]))
    }

    # source -> point color; sources not listed (or missing) keep ""
    source.color <- c("ACEA"="orange",
                      "Apredica_down"="yellow","Apredica_up"="yellow",
                      "Odyssey Thera"="green",
                      "Attagene_cis"="blue","Attagene_trans"="blue",
                      "BioSeek_down"="violet","BioSeek_up"="violet",
                      "Tox21_LUC"="gray","Tox21_BLA"="gray",
                      "Tox21_LUC_viability"="gray","Tox21_BLA_viability"="gray",
                      "Novascreen_ADME"="red","Novascreen_ADME_act"="red",
                      "Novascreen_ENZ"="red","Novascreen_ENZ_act"="red",
                      "Novascreen_GPCR"="red","Novascreen_IC"="red",
                      "Novascreen_Misc"="red","Novascreen_NR"="red","Novascreen_TR"="red")
    color <- unname(source.color[as.character(ASSAYS[ASSAY.LIST,"Source"])])
    color[is.na(color)] <- ""

    if(to.file) {
        fname <- paste("plots/WxZ.pdf",sep="")
        pdf(file=fname,width=7,height=7,pointsize=12,bg="white",paper="letter",pagecentre=TRUE)
    }

    # only assays with all four statistics defined can be drawn
    ok <- rowSums(is.na(result))==0
    x <- result[ok,1]
    y <- result[ok,3]
    sd.x <- result[ok,2]
    sd.y <- result[ok,4]
    color <- color[ok]

    plot(y~x,type="p",xlim=c(0,8),ylim=c(0,8),
         xlab=paste("|W(Z<",zcut1,")|",sep=""),ylab=paste("|W(Z>",zcut2,")|",sep=""),
         cex.lab=1.2,cex.axis=1.2,main="WxZ")
    for(i in seq_along(x)) {
        lines(c(x[i]-sd.x[i],x[i]+sd.x[i]),c(y[i],y[i]))
        lines(c(x[i],x[i]),c(y[i]-sd.y[i],y[i]+sd.y[i]))
        points(y[i]~x[i],bg=color[i],pch=21)
    }
    lines(c(0,8),c(0,8))

    # hand-drawn legend in the upper-left corner
    legend.color <- c("ACEA"="orange","Apredica"="yellow","Attagen"="blue","BioSeek"="violet",
                      "Novascreen"="red","Odyssey Thera"="green","Tox21"="gray")
    y <- 8
    x <- 0.25
    dy <- 0.4
    for(k in seq_along(legend.color)) {
        text(x,y,names(legend.color)[k],pos=4)
        points(0,y,pch=21,bg=legend.color[[k]],cex=2)
        y <- y-dy
    }
    if(to.file) dev.off()
    else browser()
}
#--------------------------------------------------------------------------------------
#
# plot the APR no-recovery concentration vs. the cytotox burst center
#
# --------------------------------------------------------------------------------------
apr.vs.cytotox <- function(to.file=F) {
    # Scatter the APR no-recovery concentration (rs_norecov_mn +/- rs_norecov_sd
    # from APR/apr_norecover.txt) against the cytotox burst center
    # (cytotox.median +/- cytotox.mad.global from output/cytotox_dist.txt) for
    # every chemical CODE present in the APR table, on log-log axes.
    #
    # to.file - write the plot to plots/apr_vs_cytotox.pdf instead of pausing in browser()

    # read both inputs before opening the device, so a read failure cannot
    # leave a PDF device open
    temp <- read.table("APR/apr_norecover.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="",comment.char="")
    ctemp <- read.table("output/cytotox_dist.txt",header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="",comment.char="")
    rownames(ctemp) <- ctemp[,"CODE"]

    if(to.file) {
        fname <- "plots/apr_vs_cytotox.pdf"
        pdf(file=fname,width=7,height=7,pointsize=12,bg="white",paper="letter",pagecentre=TRUE)
    }

    plot(1~1,xlim=c(0.01,1000),ylim=c(0.01,1000),xlab="Burst Center (uM)",ylab="APR No-recover conc (uM)",cex.lab=1,cex.axis=1,log="xy",main="APR no-recovery vs. Cytotox",type="n")
    # seq_len() so that an empty APR table draws nothing instead of failing
    for(i in seq_len(nrow(temp))) {
        code <- temp[i,"CODE"]
        val <- temp[i,"rs_norecov_mn"]
        val.sd <- temp[i,"rs_norecov_sd"]
        bc <- ctemp[code,"cytotox.median"]
        bc.sd <- ctemp[code,"cytotox.mad.global"]
        # x -> 1e6 * 10^(-x); presumably converts a -log10(molar) value to uM
        # to match the axis label -- TODO confirm the units of cytotox.median
        bc.min <- 1000000*10^(-(bc+bc.sd))
        bc.max <- 1000000*10^(-(bc-bc.sd))
        bc <- 1000000*10^(-bc)
        # chemicals without a burst-center value are skipped
        if(!is.na(bc)) {
            points(val~bc)
            lines(c(bc.min,bc.max),c(val,val))
            lines(c(bc,bc),c(val-val.sd,val+val.sd))
        }
    }
    lines(c(0.001,1000),c(0.001,1000))
    if(to.file) dev.off()
    else browser()

}
#--------------------------------------------------------------------------------------
#
# look at assay-assay correlation as a function of Z
#
# cutoff - this is the minimum similarity between two assays in the z-matrix to use in clustering
# threshold - this is the minimum fraction of the assays in a cluster to call a chemical positive for that cluster
# --------------------------------------------------------------------------------------
zcorr.all <- function(cutoff=0.5,threshold=0.75) {
    # Run the zcorr -> zclust -> zclust.chem chain for two z windows:
    # first -100..3 (clustered and reported as 0..3), then 5..100.
    windows <- list(list(corr.zmin=-100,zmin=0,zmax=3),
                    list(corr.zmin=5,zmin=5,zmax=100))
    for(w in windows) {
        zcorr(zmin=w$corr.zmin,zmax=w$zmax,to.file=TRUE,cutoff=cutoff)
        zclust(zmin=w$zmin,zmax=w$zmax,cutoff=cutoff)
        zclust.chem(zmin=w$zmin,zmax=w$zmax,cutoff=cutoff,threshold=threshold,to.file=TRUE)
        flush.console()
    }
}
#--------------------------------------------------------------------------------------
#
# look at assay-assay correlation as a function of Z
#
# --------------------------------------------------------------------------------------
zcorr <- function(zmin=-100,zmax=1,to.file=F,cutoff) {
    # Assay-assay similarity within a z-score window.
    #
    # Non-cytotox assays are binarized (1 when zmin <= z <= zmax, else 0) for
    # chemicals with more than 10 non-missing cytotox AC50 values; rows and
    # columns with at most 2 ones are dropped; similarity is 1 minus the
    # "binary" distance between assay columns.
    #
    # zmin, zmax - inclusive z-score window
    # to.file    - also write the long-format table, the matrices and the
    #              heatmaps under burst/; otherwise pause in browser()
    # cutoff     - similarity at or above which two assays count as linked in
    #              the discrete (0/1) heatmap
    #
    # Always writes burst/sim_mat_<zmin>_<zmax>.txt (negative zmin is named 0).
    # May set the global MAT.ZSCORE.NORM if it does not exist yet.
    if(!exists("MAT.AC50")) prepMatrices()
    if(!exists("MAT.ZSCORE.NORM")) {
        file <- "output/zscore_matrix_norm.txt"
        MAT.ZSCORE.NORM <<- read.table(file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"",comment.char="")
    }
    file <- "input/AssayList.txt"
    assays <- read.table(file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"",comment.char="")
    rownames(assays) <- assays[,"Assay"]
    assays <- assays[names(MAT.AC50),]

    is.cytotox <- is.element(assays[,"Type"],c("cytotox","cytotox_fail"))
    # work on a local matrix; the global MAT.ZSCORE.NORM is left untouched
    zmat <- as.matrix(MAT.ZSCORE.NORM[,!is.cytotox,drop=FALSE])

    # number of cytotox assays with an AC50, per chemical
    ncyto <- rowSums(!is.na(as.matrix(MAT.AC50[,is.cytotox,drop=FALSE])))
    zmat <- zmat[ncyto>10,,drop=FALSE]
    cat("Dimension of MAT.ZSCORE.NORM: ",dim(zmat),"\n")

    zbin <- (!is.na(zmat) & zmat>=zmin & zmat<=zmax) + 0
    cat("sum of MAT.ZSCORE.NORM: ",sum(zbin),"\n")
    zbin <- zbin[rowSums(zbin)>2,,drop=FALSE]
    zbin <- zbin[,colSums(zbin)>2,drop=FALSE]
    cat("Dimension of MAT.ZSCORE.NORM: ",dim(zbin),"\n")

    simmat <- 1-as.matrix(dist(t(zbin),method="binary",diag=TRUE,upper=TRUE))
    zmin.name <- max(zmin,0)
    fname <- paste("burst/sim_mat_",zmin.name,"_",zmax,".txt",sep="")
    write.table(simmat,file=fname, row.names=TRUE, append=FALSE, quote=FALSE, sep = "\t")

    ward <- function(x) hclust(d=dist(x),method="ward.D")

    if(to.file) {
        # long format: one row per assay pair with similarity > 0, built in
        # one pass (row-major over simmat) instead of rbind() inside a loop
        nd <- dim(simmat)[1]
        sim <- as.vector(t(simmat))
        sel <- which(sim>0)
        simmat.cast <- data.frame(Assay.1=rep(colnames(simmat),each=nd)[sel],
                                  Assay.2=rep(colnames(simmat),times=nd)[sel],
                                  Similarity=sim[sel],
                                  Zmin=rep(zmin,length(sel)),
                                  Zmax=rep(zmax,length(sel)),
                                  stringsAsFactors=FALSE)
        fname <- paste("burst/zcorr_long_",zmin.name,"_",zmax,".txt",sep="")
        write.table(simmat.cast,file=fname, row.names=TRUE, append=FALSE, quote=FALSE, sep = "\t")

        fname <- paste("burst/zcorr_",zmin.name,"_",zmax,"_hm.pdf",sep="")
        pdf(file=fname,width=7,height=7,pointsize=12,bg="white",paper="letter",pagecentre=TRUE)
        fname <- paste("burst/zcorr_",zmin.name,"_",zmax,".txt",sep="")
        write.table(simmat,file=fname, row.names=TRUE, append=FALSE, quote=FALSE, sep = "\t")
    }
    heatmap(simmat,margins=c(10,10),scale="none",main=paste("Z-corr: ",zmin,":",zmax),
            xlab="",ylab="",cexCol=0.25,cexRow=0.25,col=brewer.pal(9,"Reds"),symm=TRUE,
            hclustfun=ward,keep.dendro=TRUE,verbose=TRUE)

    if(to.file) dev.off()
    else browser()

    # discrete version: 1 where similarity >= cutoff, restricted to assays
    # linked to at least one assay besides themselves
    simmat.2 <- simmat
    simmat.2[simmat.2<cutoff] <- 0
    simmat.2[simmat.2>0] <- 1
    rs <- rowSums(simmat.2)
    simmat.2 <- simmat.2[rs>1,rs>1,drop=FALSE]

    if(to.file) {
        fname <- paste("burst/zcorr_",zmin.name,"_",zmax,"_discrete_hm_",cutoff,".pdf",sep="")
        pdf(file=fname,width=7,height=7,pointsize=12,bg="white",paper="letter",pagecentre=TRUE)
        fname <- paste("burst/zcorr_",zmin.name,"_",zmax,"_discrete_",cutoff,".txt",sep="")
        write.table(simmat.2,file=fname, row.names=TRUE, append=FALSE, quote=FALSE, sep = "\t")
    }
    heatmap(simmat.2,margins=c(10,10),scale="none",main=paste("Z-corr: ",zmin,":",zmax),
            xlab="",ylab="",cexCol=0.25,cexRow=0.25,col=brewer.pal(9,"Reds"),symm=TRUE,
            hclustfun=ward,keep.dendro=TRUE,verbose=TRUE)

    if(to.file) dev.off()
    else browser()
}
#--------------------------------------------------------------------------------------
#
# do the clustering to get assays that are co-regulated
#
# --------------------------------------------------------------------------------------
zclust <- function(zmin=0,zmax=2,cutoff) {
    # Cluster assays from the similarity matrix saved by zcorr for this z
    # window and write one (assay, cluster) row per assay to
    # burst/cluster_members_cutree_<zmin>_<zmax>_<cutoff>.txt
    sim.file <- paste0("burst/sim_mat_",zmin,"_",zmax,".txt")
    simmat <- read.table(sim.file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="",comment.char="")

    # 0/1 link matrix: 1 where similarity >= cutoff; keep only assays whose
    # row sum exceeds 1 (i.e. linked to something besides themselves)
    linked <- simmat
    linked[linked<cutoff] <- 0
    linked[linked>0] <- 1
    n.links <- rowSums(linked)
    linked <- linked[n.links>1,n.links>1]

    dmat <- as.matrix(1-linked)
    tree <- hclust(as.dist(dmat),method="ward.D")
    cluster.id <- cutreeDynamic(tree,cutHeight=1,minClusterSize=1,method="hybrid",distM=dmat)

    output <- as.data.frame(cbind(names(linked),cluster.id))
    names(output) <- c("assay","cluster")
    out.file <- paste0("burst/cluster_members_cutree_",zmin,"_",zmax,"_",cutoff,".txt")
    write.table(output,file=out.file, row.names=FALSE, append=FALSE, quote=FALSE, sep = "\t")

    #result2 <- kmeans(simmat,centers=nassay/2,iter.max=100,nstart=10)
    #fits <- as.data.frame(cbind(names(simmat),fitted(result2,method="classes")))
    #file <- paste("burst/cluster_members_kmeans_",zmin,"_",zmax,"_",cutoff,".txt",sep="")
    #names(fits) <- c("assay","cluster")
    #write.table(fits,file=file, row.names=F, append=FALSE, quote=F, sep = "\t")
}
#--------------------------------------------------------------------------------------
#
# find the chemicals responsible for the clusters
#
# --------------------------------------------------------------------------------------
zclust.chem <- function(zmin=0,zmax=2,cutoff,threshold,to.file=F) {
    # For every assay cluster written by zclust, find the chemicals that are
    # active in the z window in more than threshold * (cluster size) of the
    # cluster's assays; draw a chemical x assay heatmap per cluster and write
    # all selected chemicals to
    # burst/zclust_chem_<zmin>_<zmax>_chemicals_<cutoff>_<threshold>.txt
    #
    # zmin, zmax - z window; zmin==0 means "any non-missing z below zmax",
    #              otherwise zmin < z < zmax
    # cutoff     - similarity cutoff used by zclust (selects the input file)
    # threshold  - fraction of a cluster's assays a chemical must exceed
    # to.file    - write heatmaps to PDF instead of pausing in browser()
    #
    # Reads the globals MAT.ZSCORE.NORM and CHEMS.
    file <- paste("burst/cluster_members_cutree_",zmin,"_",zmax,"_",cutoff,".txt",sep="")
    cassay <- read.table(file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="",comment.char="")
    ncl <- if(nrow(cassay)>0) max(cassay[,"cluster"]) else 0
    zrange <- paste(zmin," to ",zmax,sep="")

    pieces <- list()
    for(i in seq_len(ncl)) {
        assay.set <- cassay[is.element(cassay[,"cluster"],i),"assay"]
        if(length(assay.set)==0) next

        # drop=FALSE keeps a single-assay cluster as a one-column matrix
        z <- as.matrix(MAT.ZSCORE.NORM[,assay.set,drop=FALSE])
        if(zmin==0) hit <- !is.na(z) & z<zmax
        else hit <- !is.na(z) & z>zmin & z<zmax
        dtemp <- hit+0

        nmin <- threshold * ncol(dtemp)
        dtemp <- dtemp[rowSums(dtemp)>nmin,,drop=FALSE]

        if(nrow(dtemp)>1) {
            chem.list <- CHEMS[rownames(dtemp),]
            pieces[[length(pieces)+1]] <- data.frame(chem.list,cluster=i,z.range=zrange,
                                                     stringsAsFactors=FALSE,check.names=FALSE)

            # heatmap() needs at least 2 rows and 2 columns
            if(ncol(dtemp)>1) {
                if(to.file) {
                    fname <- paste("burst/zclust_chem_",zmin,"_",zmax,"_hm_",cutoff,"_",threshold,"_",i,".pdf",sep="")
                    pdf(file=fname,width=7,height=7,pointsize=12,bg="white",paper="letter",pagecentre=TRUE)
                }
                heatmap(dtemp,margins=c(10,10),scale="none",main=paste("Z-cluster-chem: ",zmin,":",zmax," cl: ",i),
                        labRow=chem.list[,"short_name"],xlab="",ylab="",cexCol=0.5,cexRow=0.5,col=brewer.pal(9,"Reds"),
                        hclustfun=function(x) hclust(d=dist(x),method="ward.D"),keep.dendro=TRUE,verbose=TRUE)

                if(to.file) dev.off()
                else browser()
            }
        }
    }

    # when no cluster qualifies, still write a header-only table
    if(length(pieces)>0) output <- do.call(rbind,pieces)
    else output <- data.frame(CHEMS[0,,drop=FALSE],cluster=integer(0),z.range=character(0),
                              stringsAsFactors=FALSE,check.names=FALSE)
    names(output) <- c(names(CHEMS),"cluster","z-range")
    fname <- paste("burst/zclust_chem_",zmin,"_",zmax,"_chemicals_",cutoff,"_",threshold,".txt",sep="")
    write.table(output,file=fname, row.names=FALSE, append=FALSE, quote=FALSE, sep = "\t")
}
#--------------------------------------------------------------------------------------
#
# summarize the Lo-Z and Hi-Z clusters: chemical x cluster heatmap plus entity and SIF network files
#
# --------------------------------------------------------------------------------------
zclust.summary <- function(cutoff=0.5,threshold=0.75,to.file=F) {
    # Combine the Lo-Z (0..3) and Hi-Z (5..100) chemical/cluster tables written
    # by zclust.chem, restricted to Phase I/II chemicals, and
    #   - draw a chemical x cluster membership heatmap,
    #   - write burst/entities.txt (entity <tab> type) for chemicals, assays,
    #     genes and clusters,
    #   - write burst/clusters.sif with "activates", "measures" and
    #     "is_member_of" edges.
    #
    # cutoff, threshold - select which zclust / zclust.chem output files to read
    # to.file           - write the heatmap to PDF instead of pausing in browser()
    #
    # Reads the globals CHEMS and ASSAYS.
    read.tab <- function(path) {
        read.table(path,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="",comment.char="")
    }
    # tab-joined lines; character(0) when any column is empty, because paste()
    # would otherwise recycle a zero-length column to ""
    tab.lines <- function(...) {
        cols <- lapply(list(...),as.character)
        if(any(vapply(cols,length,integer(1))==0)) return(character(0))
        do.call(paste,c(cols,list(sep="\t")))
    }
    # chemicals of one z window, Phase I/II only, cluster ids prefixed
    load.chems <- function(zmin,zmax,prefix) {
        fname <- paste("burst/zclust_chem_",zmin,"_",zmax,"_chemicals_",cutoff,"_",threshold,".txt",sep="")
        tab <- read.tab(fname)
        # which() drops rows whose phase flags are NA instead of creating NA rows
        tab <- tab[which(tab[,"Phase_I"]+tab[,"Phase_II"]>0),]
        tab[,"cluster"] <- paste(rep(prefix,nrow(tab)),tab[,"cluster"])
        tab
    }
    # "assay is_member_of cluster" edges of one z window
    member.lines <- function(zmin,zmax,prefix) {
        file <- paste("burst/cluster_members_cutree_",zmin,"_",zmax,"_",cutoff,".txt",sep="")
        cassay <- read.tab(file)
        tab.lines(cassay[,"assay"],"is_member_of",paste(rep(prefix,nrow(cassay)),cassay[,"cluster"]))
    }

    temp <- rbind(load.chems(0,3,"Lo-Z"),load.chems(5,100,"Hi-Z"))
    temp <- temp[!is.na(temp[,"short_name"]),]

    ch.list <- sort(unique(temp[,"short_name"]))
    cl.list <- sort(unique(temp[,"cluster"]))
    mat <- matrix(0,nrow=length(ch.list),ncol=length(cl.list))
    rownames(mat) <- ch.list
    colnames(mat) <- cl.list
    mat[cbind(temp[,"short_name"],temp[,"cluster"])] <- 1

    if(to.file) {
        # dedicated name: the original reused the per-cluster file pattern of
        # zclust.chem with a stale loop index as the cluster number
        fname <- paste("burst/zclust_summary_hm_",cutoff,"_",threshold,".pdf",sep="")
        pdf(file=fname,width=7,height=10,pointsize=12,bg="white",paper="letter",pagecentre=TRUE)
    }
    heatmap(mat,margins=c(2,10),scale="none",main=paste("Chemicals x Clusters"),
            xlab="",ylab="",cexCol=0.5,cexRow=0.5,col=brewer.pal(9,"Reds"),
            hclustfun=function(x) hclust(d=dist(x),method="ward.D"),keep.dendro=TRUE,verbose=FALSE)

    # entity table: chemicals, assays, genes, clusters
    gene.list <- sort(unique(ASSAYS[,"Gene_Process"]))
    cl.type <- ifelse(substr(cl.list,1,2)=="Lo","Cluster Lo-Z","Cluster Hi-Z")
    entity.lines <- c("entity\ttype",
                      tab.lines(CHEMS[,"short_name"],"chemical"),
                      tab.lines(ASSAYS[,"Assay"],"assay"),
                      tab.lines(gene.list,"gene"),
                      tab.lines(cl.list,cl.type))
    writeLines(entity.lines,"burst/entities.txt")

    # network edges
    has.gene <- !is.na(ASSAYS[,"Gene_Process"])
    sif.lines <- c(tab.lines(temp[,"short_name"],"activates",temp[,"cluster"]),
                   tab.lines(ASSAYS[has.gene,"Assay"],"measures",ASSAYS[has.gene,"Gene_Process"]),
                   member.lines(0,3,"Lo-Z"),
                   member.lines(5,100,"Hi-Z"))
    writeLines(sif.lines,"burst/clusters.sif")

    if(to.file) dev.off()
    else browser()
}
#--------------------------------------------------------------------------------------
#
# Build an SOM on selected structures
#
# chemset is ToxCast, E1K, Tox21
# navg is the average number of chemicals per cell
# rlen is the number of iterations of the SOM analysis
#
#--------------------------------------------------------------------------------------
build.som <- function(chemset="ToxCast",navg=10,rlen=10,do.debug=F) {
    # Train a square, hexagonal, toroidal SOM on the SMARTS matrix rows of the
    # selected chemical set and save it under structure_input/, together with
    # the list of chemical codes used.
    #
    # chemset  - "ToxCast" (Phase_I or Phase_II) or "E1K" (Phase_I, Phase_II or
    #            E1K); any other value prints a message and returns NULL
    # navg     - target average number of chemicals per cell; sets the grid side
    #            to floor(sqrt(nchem/navg))
    # rlen     - passed to som() as rlen
    # do.debug - use at most the first 100 chemicals
    #
    # Reads the global CHEMS; loads the global SMARTSMAT if it does not exist.
    if(!exists("SMARTSMAT")) {
        fname <- "structure_input/ToxCast_SMARTS_REDUCED_matrix_2013_03_05.txt"
        SMARTSMAT <<- read.table(fname,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="",comment.char="")
    }

    flag.columns <- switch(chemset,
                           ToxCast=c("Phase_I","Phase_II"),
                           E1K=c("Phase_I","Phase_II","E1K"),
                           NULL)
    if(is.null(flag.columns)) {
        # the original message referenced an undefined variable and errored
        cat("chemset: ",chemset," not implemented\n")
        return()
    }

    # chemicals flagged in any of the set's columns that also have SMARTS data
    in.set <- Reduce("|",lapply(flag.columns,function(col) is.element(CHEMS[,col],1)))
    code.list <- sort(unique(CHEMS[in.set,"CODE"]))
    code.list <- code.list[is.element(code.list,rownames(SMARTSMAT))]

    # head() never pads with NA when fewer than 100 chemicals are available
    if(do.debug) code.list <- head(code.list,100)

    smartsmat <- SMARTSMAT[code.list,]
    cs <- colSums(smartsmat)
    smartsmat <- smartsmat[,cs>1,drop=FALSE]

    nchem <- length(code.list)
    cat("Number of chemicals: ",nchem,"\n")
    cat("Dimension of smartsmat: ",dim(smartsmat),"\n")

    nside <- floor(sqrt(nchem/navg))
    cat("Length of side: ",nside,"\n")
    flush.console()
    filename <- paste("structure_input/SOM_",chemset,"_",nchem,"_",navg,"_",nside,"_",rlen,"_chemcodes.txt",sep="")
    write(code.list,file=filename)

    grid <- somgrid(xdim=nside,ydim=nside,topo="hexagonal")
    my.som <- som(as.matrix(smartsmat),grid=grid,rlen=rlen,keep.data=TRUE,toroidal=TRUE)
    filename <- paste("structure_input/SOM_",chemset,"_",nchem,"_",navg,"_",nside,"_",rlen,".rdata",sep="")
    save(my.som,file=filename)
}
#--------------------------------------------------------------------------------------
#
# Count the assay hits (AC50 <= 1000) of each Phase I / Phase II chemical that has
# SMARTS structure data and write the chemical table with the counts to
# output/phase_I_hits.txt
#
#--------------------------------------------------------------------------------------
bill.test <- function() {
    # Write CHEMS rows for the Phase I/II chemicals present in SMARTSMAT, with
    # an added "Hits" column counting the assays whose AC50 is non-missing,
    # positive and <= 1000, to output/phase_I_hits.txt.
    #
    # Reads the globals CHEMS, SMARTSMAT and MAT.AC50.
    code.list <- CHEMS[is.element(CHEMS[,"Phase_I"],1),"CODE"]
    code.list <- c(code.list,CHEMS[is.element(CHEMS[,"Phase_II"],1),"CODE"])
    code.list <- sort(unique(code.list))
    code.list <- code.list[is.element(code.list,rownames(SMARTSMAT))]

    ac50 <- as.matrix(MAT.AC50[code.list,])
    hits <- rowSums(!is.na(ac50) & ac50>0 & ac50<=1000)

    # name the new column directly; the original 'names(ctemp[k]) <- "Hits"'
    # renamed a temporary copy and left the column called "rs"
    ctemp <- cbind(CHEMS[code.list,],Hits=hits)
    outfile <- "output/phase_I_hits.txt"
    write.table(ctemp,file=outfile, row.names=FALSE, append=FALSE, quote=FALSE, sep = "\t")

}
#--------------------------------------------------------------------------------------
#
# do the SOM analysis
#
#--------------------------------------------------------------------------------------
zsom <- function(zmin=0,zmax=3,cutoff=0.5,threshold=0.75,rlen=10,to.file=F, do.debug=F, mode="structure_category") {
    # Map chemical groups onto a structure SOM of the Phase I/II chemicals.
    #
    # The SOM (global SOM.BIG) is trained on the SMARTS matrix if it does not
    # exist yet and saved to burst/SOM.BIG.rdata. Chemicals with their SOM cell
    # are written to burst/chemicals_with_som_classes.txt. Then one SOM panel
    # is drawn per group, colored by the fraction of each cell's chemicals that
    # belong to the group:
    #   mode=="structure_category" - groups are the structure_category values
    #   any other mode             - groups are the clusters in the zclust.chem
    #                                output for (zmin,zmax,cutoff,threshold)
    #
    # rlen     - passed to som() as rlen
    # to.file  - write the panels to PDF instead of pausing in browser()
    # do.debug - use at most the first 100 chemicals; in structure mode also
    #            skip classes with fewer than 3 chemicals
    #
    # NOTE(review): an existing SOM.BIG is reused as-is; its unit.classif must
    # line up with the chemical list built here -- confirm before mixing runs.
    if(!exists("SMARTSMAT")) {
        fname <- "structure_input/ToxCast_SMARTS_REDUCED_matrix_2013_03_05.txt"
        SMARTSMAT <<- read.table(fname,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="",comment.char="")
    }
    code.list <- CHEMS[is.element(CHEMS[,"Phase_I"],1),"CODE"]
    code.list <- c(code.list,CHEMS[is.element(CHEMS[,"Phase_II"],1),"CODE"])
    code.list <- sort(unique(code.list))
    code.list <- code.list[is.element(code.list,rownames(SMARTSMAT))]

    # head() never pads with NA when fewer than 100 chemicals are available
    if(do.debug) code.list <- head(code.list,100)
    nchem <- length(code.list)

    z.chems <- CHEMS[code.list,]
    z.smartsmat <- SMARTSMAT[code.list,]
    cs <- colSums(z.smartsmat)
    z.smartsmat <- z.smartsmat[,cs>0,drop=FALSE]

    cat("nchem: ",nchem,"\n")
    nside <- floor(sqrt(nchem/5))
    cat("Length of side: ",nside,"\n")
    cat("Dimension of z.smartsmat: ",dim(z.smartsmat),"\n")
    flush.console()
    grid <- somgrid(xdim=nside,ydim=nside,topo="hexagonal")
    if(!exists("SOM.BIG")) {
        temp <- som(as.matrix(z.smartsmat),grid=grid,rlen=rlen,keep.data=TRUE,toroidal=TRUE)
        SOM.BIG <<- temp
        save(SOM.BIG,file="burst/SOM.BIG.rdata")
    }
    z.chems <- cbind(z.chems,SOM.BIG$unit.classif)
    names(z.chems)[dim(z.chems)[2]] <- "SOM.CLASS"
    fname <- paste("burst/chemicals_with_som_classes.txt",sep="")
    write.table(z.chems,file=fname, row.names=FALSE, append=FALSE, quote=FALSE, sep = "\t")

    unit <- SOM.BIG$unit.classif
    ncell <- max(unit)
    # 0/1 vector over chemicals from a logical flag
    as.member <- function(flag) {
        m <- unit
        m[] <- 0
        m[flag] <- 1
        m
    }
    # per cell: fraction of the cell's chemicals that are members (NaN for empty cells)
    cell.fraction <- function(member) {
        vapply(seq_len(ncell),function(j) sum((unit==j)*member)/sum(unit==j),numeric(1))
    }

    if(mode=="structure_category") {
        if(to.file) {
            fname <- paste("burst/structure_class_som.pdf",sep="")
            pdf(file=fname,width=7,height=10,pointsize=12,bg="white",paper="letter",pagecentre=TRUE)
        }
        par(mfrow=c(4,3),mar=c(1,1,1,1))
        sclass.list <- sort(unique(z.chems[,"structure_category"]))

        for(i in seq_along(sclass.list)) {
            sclass <- sclass.list[i]
            in.class <- is.element(z.chems[,"structure_category"],sclass)
            chem.cluster <- sort(unique(z.chems[in.class,"CODE"]))
            cat("number in class ",sclass," = ",length(chem.cluster),"\n")
            if(do.debug && length(chem.cluster)<3) next

            cell.averages <- cell.fraction(as.member(in.class))
            pname <- paste(sclass,":",length(chem.cluster))
            plot.kohonen(SOM.BIG,type="property",property=cell.averages,main=pname,ncolors=20,zlim=c(0,1),palette.name = coolBlueHotRed)
            if(!to.file) browser(F)
        }
        if(to.file) dev.off()
        else browser()
    }
    else {
        if(zmin==0) zname <- "Lo-Z"
        else zname <- "Hi-Z"
        if(to.file) {
            fname <- paste("burst/zclust_som_",zmin,"_",zmax,"_",cutoff,"_",threshold,".pdf",sep="")
            pdf(file=fname,width=7,height=10,pointsize=12,bg="white",paper="letter",pagecentre=TRUE)
        }
        par(mfrow=c(4,3),mar=c(1,1,1,1))
        fname <- paste("burst/zclust_chem_",zmin,"_",zmax,"_chemicals_",cutoff,"_",threshold,".txt",sep="")
        ztab <- read.table(fname,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="",comment.char="")
        cat("original dimension of zclust: ",dim(ztab),"\n")
        ztab <- ztab[is.element(ztab[,"CODE"],code.list),]
        cat("final dimension of zclust: ",dim(ztab),"\n")
        cl.list <- sort(unique(ztab[,"cluster"]))
        cat("clusters: ",cl.list,"\n")

        for(i in seq_along(cl.list)) {
            cluster <- cl.list[i]
            # cluster membership does not depend on the cell, so compute it once
            code.cluster <- sort(unique(ztab[is.element(ztab[,"cluster"],cluster),"CODE"]))
            member <- as.member(is.element(code.list,code.cluster))
            cell.averages <- cell.fraction(member)

            pname <- paste(zname," clstr:",cluster," chems:",sum(member))
            plot.kohonen(SOM.BIG,type="property",property=cell.averages,main=pname,ncolors=20,zlim=c(0,1),palette.name = coolBlueHotRed)
            if(!to.file) browser(F)
        }
        if(to.file) dev.off()
        else browser()
    }
}
#--------------------------------------------------------------------------------------
#
# do the SOM analysis
#
#--------------------------------------------------------------------------------------
zsom.1 <- function(zmin=0,zmax=3,cutoff=0.5,threshold=0.75,rlen=10,to.file=F) {
    # Train (or reuse) a structure SOM on just the chemicals listed in the
    # zclust.chem output for (zmin,zmax,cutoff,threshold) and draw one SOM panel
    # per cluster. Each chemical is assigned the fraction of its SOM cell's
    # chemicals that belong to the cluster.
    #
    # The SOM is cached in the global SOM.LO (zmin==0) or SOM.HI (otherwise)
    # and saved under burst/.
    #
    # rlen    - passed to som() as rlen
    # to.file - write the panels to PDF instead of pausing in browser()
    #
    # NOTE(review): 'property' below has one value per chemical, whereas zsom
    # passes one value per SOM cell -- confirm which plot.kohonen expects.
    if(!exists("SMARTSMAT")) {
        fname <- "structure_input/ToxCast_SMARTS_REDUCED_matrix_2013_03_05.txt"
        SMARTSMAT <<- read.table(fname,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="",comment.char="")
    }
    fname <- paste("burst/zclust_chem_",zmin,"_",zmax,"_chemicals_",cutoff,"_",threshold,".txt",sep="")
    ztab <- read.table(fname,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="",comment.char="")
    clist <- sort(unique(ztab[,"CODE"]))

    clist <- clist[is.element(clist,rownames(SMARTSMAT))]
    nchem <- length(clist)
    cat("Length of clist: ",nchem,"\n")
    nside <- floor(sqrt(nchem/3))
    cat("Length of side: ",nside,"\n")
    smat <- SMARTSMAT[clist,]
    grid <- somgrid(xdim=nside,ydim=nside,topo="hexagonal")
    flush.console()
    if(zmin==0) {
        if(!exists("SOM.LO")) {
            temp <- som(as.matrix(smat),grid=grid,rlen=rlen,keep.data=TRUE,toroidal=TRUE)
            SOM.LO <<- temp
            save(SOM.LO,file="burst/SOM.LO.rdata")
        }
        SOM.ALL <- SOM.LO
        zname <- "Lo-Z"
    }
    else {
        if(!exists("SOM.HI")) {
            temp <- som(as.matrix(smat),grid=grid,rlen=rlen,keep.data=TRUE,toroidal=TRUE)
            SOM.HI <<- temp
            save(SOM.HI,file="burst/SOM.HI.rdata")
        }
        SOM.ALL <- SOM.HI
        zname <- "Hi-Z"
    }
    cat("original dimension of zclust: ",dim(ztab),"\n")
    ztab <- ztab[is.element(ztab[,"CODE"],clist),]
    cat("final dimension of zclust: ",dim(ztab),"\n")
    cl.list <- sort(unique(ztab[,"cluster"]))
    cat("clusters: ",cl.list,"\n")
    unit <- SOM.ALL$unit.classif

    if(to.file) {
        fname <- paste("burst/zclust_som_",zmin,"_",zmax,"_",cutoff,"_",threshold,".pdf",sep="")
        pdf(file=fname,width=7,height=10,pointsize=12,bg="white",paper="letter",pagecentre=TRUE)
    }
    par(mfrow=c(4,3),mar=c(1,1,1,1))
    for(i in seq_along(cl.list)) {
        cluster <- cl.list[i]
        chem.cluster <- sort(unique(ztab[is.element(ztab[,"cluster"],cluster),"CODE"]))
        cat("number in cluster ",i," = ",length(chem.cluster),"\n")

        # 0/1 membership per chemical, then the within-cell mean for every chemical
        member <- rep(0,nchem)
        names(member) <- clist
        member[chem.cluster] <- 1
        xval <- ave(member,unit,FUN=function(v) sum(v)/length(v))
        names(xval) <- clist

        pname <- paste(zname," clstr:",cluster," chems:",length(chem.cluster))
        plot.kohonen(SOM.ALL,type="property",property=xval,main=pname,contin=TRUE,ncolors=20,zlim=c(0,1),palette.name = coolBlueHotRed)
        if(!to.file) browser(F)
    }
    if(to.file) dev.off()
    else browser()

}
# Palette of n colors running from blue (low) to red (high): the rainbow
# hues from red through blue, reversed.
#
# n     - number of colors
# alpha - opacity passed to rainbow()
#
# Returns a character vector of n colors; rev() (rather than indexing with
# n:1) keeps the result empty for n = 0.
coolBlueHotRed <- function(n, alpha = 1) {
    rev(rainbow(n, end=4/6, alpha=alpha))
}

######################################################################################
######################################################################################
######################################################################################
######################################################################################
######################################################################################
######################################################################################
######################################################################################
######################################################################################
#--------------------------------------------------------------------------------------
#
# assay source summary
#
# QC=OK
#--------------------------------------------------------------------------------------
assay.source.count <- function() {
    # Count, per assay source, the assays of a counted type that are present
    # as columns of MAT.AC50MAT. The table is printed and written to
    # output/assay_source_count.txt.
    #
    # Reads globals ASSAY.INFO (columns Type, Source, Assay) and MAT.AC50MAT;
    # uniquify() comes from utils.R.
    cat("==========================================================================\n")
    cat("assay.source.count\n")
    cat("==========================================================================\n")
    flush.console()

    # Only these assay types are counted; everything else (including
    # cytotox_fail and bad_assay) is dropped. %in% never yields NA, so a
    # missing Type drops the row instead of selecting an all-NA row.
    counted.types <- c("cytotox","proliferation","gene","functional")
    atemp <- ASSAY.INFO[ASSAY.INFO[,"Type"] %in% counted.types,]

    assay.set.list <- sort(uniquify(atemp[,"Source"]))
    nset <- length(assay.set.list)
    asum <- as.data.frame(matrix(nrow=nset,ncol=3))
    asum[,1] <- assay.set.list

    # sources that get the "duplicate" note in the third column
    duplicated.sources <- c("Novascreen_ADME","Novascreen_ENZ")

    # seq_len() so that an empty source list skips the loop (1:0 would not)
    for(i in seq_len(nset)) {
        aset <- assay.set.list[i]
        cat(aset,"\n")
        assay.list <- atemp[is.element(atemp[,"Source"],aset),"Assay"]
        # count only assays that actually have a column in the AC50 matrix
        assay.list <- assay.list[is.element(assay.list,names(MAT.AC50MAT))]
        asum[i,2] <- length(assay.list)
        if(aset %in% duplicated.sources) asum[i,3] <- "Duplicate in activator direction"
    }
    names(asum) <- c("Source","Assays","Note")
    print(asum)
    outfile <- "output/assay_source_count.txt"
    write.table(asum,file=outfile, row.names=F, append=FALSE, quote=F, sep = "\t")
}
#--------------------------------------------------------------------------------------
#
# load data required for the calculations
#
# QC=OK
#--------------------------------------------------------------------------------------
old.load.data <- function(toxcast.export.date="2013_12_10",chemset="E1K") {
    # Legacy loader: reads the assay list, the per-variable ToxCast result
    # matrices, the QTOL table, the chemical table and the cytotox summary
    # from fixed relative paths and publishes them as globals via <<-.
    #
    # toxcast.export.date: date tag embedded in the result-matrix file names
    # chemset:             chemical-set tag embedded in the same file names
    #
    # Globals written: ASSAY.INFO, ASSAY.LIST, NAME.MAT, MAT.AC50, AC50MODMAT,
    # WMAT, TMAT, EMAXMAT, MAXCONCMAT, AC50MAT, QTOL, AC50MAT.LOG,
    # MAT.AC50.LOG, AC50MAT.DISC, MAT.hitcall, CODE.LIST, CHEMS, CYTOTOX.
    cat("==========================================================================\n")
    cat("load data\n")
    cat("==========================================================================\n")
    flush.console()

    file <- "input/AssayList.txt"
    ASSAY.INFO <<- read.table(file,header=T,sep="\t",stringsAsFactors=F,quote="\"")
    ASSAY.LIST <<- as.character(ASSAY.INFO[,1])
    # character mask: "1" keeps an assay, "0" drops it; assays whose
    # Gene_Process is "Cytotox_fail" are removed from ASSAY.LIST
    mask <- ASSAY.LIST
    mask[] <- 1
    mask[is.element(ASSAY.INFO[,"Gene_Process"],"Cytotox_fail")] <- 0
    ASSAY.LIST <<- ASSAY.LIST[mask==1]
    cat("loaded AssayList\t",length(ASSAY.LIST),"\n"); flush.console()

    # NOTE(review): the second entry "AC50_prep.pathway()" looks like the
    # result of an accidental search/replace; it is used verbatim in the file
    # name below - confirm the intended variable name.
    var.list <- c("AC50","AC50_prep.pathway()","W","T","Emax","max_conc","level7_ac50_hitcall","level8_ac50_hitcall")
    nvar <- length(var.list)

    # One file per variable. Only v = 1..6 are stored in a matrix global; the
    # two hitcall files (v = 7, 8) are still read, but their only effect is
    # to overwrite NAME.MAT.
    for(v in 1:nvar) {
        variable <- var.list[v]
        file <- paste("input/ToxCast_ResultMatrix_",chemset,"_",variable,"_",toxcast.export.date,".txt",sep="")
        cat("Load: ",file,"\n")
        flush.console()
        temp <- read.table(file,header=T,sep="\t",stringsAsFactors=F,quote="\"'")
        # first three columns are kept as the chemical name table (reassigned
        # on every iteration, so the last file read wins)
        NAME.MAT <<- temp[,1:3]
        rownames(temp) <- temp[,"CODE"]
        if(v==1) MAT.AC50 <<- temp[,ASSAY.LIST]
        if(v==2) AC50MODMAT <<- temp[,ASSAY.LIST]
        if(v==3) WMAT <<- temp[,ASSAY.LIST]
        if(v==4) TMAT <<- temp[,ASSAY.LIST]
        #if(v==5) AC50MAT <<- temp[,ASSAY.LIST]
        if(v==5) EMAXMAT <<- temp[,ASSAY.LIST]
        if(v==6) MAXCONCMAT <<- temp[,ASSAY.LIST]
    }
    # AC50MAT comes from a separate "AC50_level8" file rather than var.list
    file <- paste("input/ToxCast_ResultMatrix_",chemset,"_AC50_level8_",toxcast.export.date,".txt",sep="")
    temp <- read.table(file,header=T,sep="\t",stringsAsFactors=F,quote="\"'")
    rownames(temp) <- temp[,"CODE"]
    AC50MAT <<- temp[,ASSAY.LIST]
    file <- "input/actor_quantitative_tolerance.txt"
    temp <- read.table(file,header=T,sep="\t",stringsAsFactors=F,quote="\"")
    QTOL <<- temp
    cat("loaded QTOL\n"); flush.console()

    # -log10(value/1e6); presumably converts AC50 in uM to -log10(molar) -
    # TODO confirm the input units
    AC50MAT.LOG <<- -log10(AC50MAT/1000000)
    MAT.AC50.LOG <<- -log10(MAT.AC50/1000000)

    # discretised copies: every positive log value becomes 1, all other
    # entries (zero, negative, NA) are left unchanged
    temp <- AC50MAT.LOG
    temp[temp>0] <- 1
    AC50MAT.DISC <<- temp

    temp <- MAT.AC50.LOG
    temp[temp>0] <- 1
    MAT.hitcall <<- temp

    # NOTE(review): the chemical file name carries a hard-coded date and does
    # not follow toxcast.export.date
    file <- "input/ToxCast_Generic_Chemicals_2013_12_10.txt"
    temp <- read.table(file,header=T,sep="\t",stringsAsFactors=F,quote="",comment="")
    cat("loaded Chemical Information\n"); flush.console()
    cat("Original size: ",dim(temp),"\n")
    # restrict and order the chemical table by the CODE column of NAME.MAT
    CODE.LIST <<- NAME.MAT[,"CODE"]
    rownames(temp) <- temp[,"CODE"]
    CHEMS <<- temp[CODE.LIST,]
    cat("Final size: ",dim(CHEMS),"\n")

    # cytotox summary, read from the output directory and aligned to
    # CODE.LIST in the same way
    file <- "output/cytotox_dist.txt"
    temp <- read.table(file,header=T,sep="\t",stringsAsFactors=F,quote="",comment="")
    rownames(temp) <- temp[,"CODE"]
    temp <- temp[CODE.LIST,]
    CYTOTOX <<- temp

    cat("Unique chemicals: ",dim(CHEMS)[1],"\n")
}
#--------------------------------------------------------------------------------------
#
# do the heatmap of the assay coverage
#
# QC=OK
#--------------------------------------------------------------------------------------
assay.coverage.hm <- function(to.file=F,do.prep=T,cex.col=0.1,cex.row=0.1,max.rows=1700) {
    # Heatmap of assay coverage: 1 where a chemical/assay cell of MAT.AC50
    # holds a value, 0 where it is NA, clustered with Ward linkage.
    #
    # to.file:  TRUE writes plots/assay_coverage_heatmap.pdf, FALSE draws on
    #           the active device and then stops in browser()
    # do.prep:  TRUE rebuilds the global 0/1 matrix MAT from MAT.AC50; FALSE
    #           reuses the MAT left behind by an earlier call
    # cex.col, cex.row: label sizes handed to heatmap()
    # max.rows: maximum number of chemicals (leading rows of MAT) to plot;
    #           the default keeps the previous fixed limit of 1700
    cat("==========================================================================\n")
    cat("assay.coverage.hm\n")
    cat("==========================================================================\n")
    flush.console()

    if(to.file) {
        file <- "plots/assay_coverage_heatmap.pdf"
        pdf(file=file,width=7,height=7,pointsize=12,bg="white",paper="letter",pagecentre=T)
    }
    if(do.prep) {
        temp <- as.matrix(MAT.AC50)
        temp[!is.na(temp)] <- 1
        temp[is.na(temp)] <- 0
        print(dim(temp))
        MAT <<- temp
    }
    # Clamp to the rows that exist: indexing 1:1700 on a smaller matrix fails
    # with "subscript out of bounds".
    nrow.used <- min(max.rows,dim(MAT)[1])
    mat <- MAT[seq_len(nrow.used),,drop=FALSE]
    result <- heatmap(mat,margins=c(5,5),scale="none",main=paste("Assay Coverage",dim(mat)[1]," chemicals, ",dim(mat)[2]," assays"),
                      xlab="Assays",ylab="Chemicals",cexCol=cex.col,cexRow=cex.row,col=brewer.pal(9,"Reds"),
                      hclustfun=function(x) hclust(d=dist(x),method="ward.D"),keep.dendro=T,verbose=T)

    if(to.file) dev.off()
    else browser()
}
#--------------------------------------------------------------------------------------
#
# load chemicals
#
# QC=OK
#--------------------------------------------------------------------------------------
old.load.chemicals <- function() {
    # Legacy loader for the chemical table. Reads the generic-chemicals file,
    # keys it by CODE and keeps the rows named by the CODE column of the
    # global NAME.MAT. Publishes the globals CODE.LIST and CHEMS.
    chem.file <- "input/ToxCast_Generic_Chemicals_2013_12_10.txt"
    chem.table <- read.table(chem.file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="",comment.char="")
    cat("loaded Chemical Information\n")
    flush.console()
    cat("Original size: ",dim(chem.table),"\n")

    wanted.codes <- NAME.MAT[,"CODE"]
    CODE.LIST <<- wanted.codes
    rownames(chem.table) <- chem.table[,"CODE"]

    # row-name lookup: result follows the order of wanted.codes
    selected <- chem.table[wanted.codes,]
    CHEMS <<- selected
    cat("Final size: ",dim(selected),"\n")
}
#--------------------------------------------------------------------------------------
#
# z-score: load the data
#
# QC=OK
#--------------------------------------------------------------------------------------
old.load.zscore <- function() {
    # Legacy loader for the two-sided z-score table. The table is keyed by
    # its CODE column, reduced to the assay columns named in the global
    # ASSAY.LIST and published as the global Z.ALL.ALL.2SIDED.
    rule <- "==========================================================================\n"
    cat(rule,"load zscore\n",rule,sep="")
    flush.console()

    zscore.file <- "output/zscore_all_all_2sided.txt"
    zscore.table <- read.table(zscore.file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="",comment.char="")
    rownames(zscore.table) <- zscore.table[,"CODE"]
    Z.ALL.ALL.2SIDED <<- zscore.table[,ASSAY.LIST]
}

#--------------------------------------------------------------------------------------
#
# z-score normed: load the data
#
# QC=OK
#--------------------------------------------------------------------------------------
load.zscore.norm <- function() {
    # Read the normalised z-score matrix from output/zscore_matrix_norm.txt
    # and publish it unchanged as the global MAT.ZSCORE.NORM. Row names are
    # whatever read.table derives from the file; they are not reset here.
    rule <- "==========================================================================\n"
    cat(rule,"load zscore normed\n",rule,sep="")
    flush.console()

    norm.file <- "output/zscore_matrix_norm.txt"
    MAT.ZSCORE.NORM <<- read.table(norm.file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="",comment.char="")
}
#--------------------------------------------------------------------------------------
#
# Calculate the hit distribution by chemical
#
# QC=OK
#--------------------------------------------------------------------------------------
hit.dist <- function(to.file=F,target.gene=NA) {
    # Per-chemical hit distribution. For every chemical in CHEMS that has at
    # least one hit (AC50 < 1e6) in a non-cytotoxicity assay, draw a
    # histogram of those AC50s on a log axis via hist.log(), annotate it with
    # the cytotoxicity summary and mark the cytotoxicity-assay hits with red
    # stars. When target.gene is NA, one summary row per plotted chemical is
    # also written to ../output/by_chemical_hit_dist.txt.
    #
    # to.file:     TRUE sends the plots to a PDF; FALSE draws on the active
    #              device and stops in browser() after each chemical
    # target.gene: NA for all chemicals. Otherwise only chemicals with a
    #              non-zero logAC50 sum over the assays of that gene are
    #              plotted, and the gene's assays are overlaid as symbols.
    #
    # Reads globals ASSAY.INFO, CHEMS, CYTOTOX, CYTOTOX.ASSAYS, MAT.AC50 and
    # MAT.logAC50. Sets options(warn=1) as a side effect.
    cat("==========================================================================\n")
    cat("hit.dist\n")
    cat("==========================================================================\n")
    options(warn=1)
    flush.console()
    target.gene <- as.character(target.gene)
    by.gene <- !is.na(target.gene)
    nchem <- dim(CHEMS)[1]
    file <- "../output/by_chemical_hit_dist.txt"
    s <- "CODE\tCASRN\tName\tIntendedTarget\tStructureCategory\tUseCategory\tAssaysTested\tHits\tHitRatio\tSelectiveHits\tSelectiveHitRatio\tCytotoxAssaysTested\tCytotoxHits\tMinAC50\tCytotoxMedian\tCytotoxMin\n"
    if(!by.gene) cat(file=file,s,append=F)
    if(to.file) {
        fname <- "../plots/by_chemical_hit_dist.pdf"
        # NOTE(review): the per-gene file goes to "plots/" while the default
        # goes to "../plots/" - confirm which directory is intended.
        if(by.gene) fname <- paste("plots/by_chemical_hit_dist_",target.gene,".pdf",sep="")
        pdf(file=fname,width=7,height=10,pointsize=12,bg="white",paper="letter",pagecentre=T)
    }
    par(mfrow=c(4,2),mar=c(4,4,2,2))

    # NOTE(review): the chemical filter looks the gene up in "intended_target"
    # while the overlay looks it up in "Gene_Process" - confirm both are meant.
    target.assay.list <- NA
    overlay.assays <- character(0)
    if(by.gene) {
        target.assay.list <- ASSAY.INFO[is.element(ASSAY.INFO[,"intended_target"],target.gene),"Assay"]
        overlay.assays <- ASSAY.INFO[is.element(ASSAY.INFO[,"Gene_Process"],target.gene),"Assay"]
    }
    cytotox.assay.set <- CYTOTOX.ASSAYS

    # columns that are not cytotoxicity assays (column layout taken from
    # MAT.logAC50 and applied to MAT.AC50, as before)
    noncytotox <- !is.element(names(MAT.logAC50),cytotox.assay.set)

    # fill colour of the overlay symbol by assay source; anything else is white
    source.colors <- c("Novascreen"="black","Odyssey Thera"="green","NCGC"="gray","Attagene"="violet","ACEA"="red","BioSeek_up"="orange","BioSeek_down"="orange","Apredica"="yellow")

    for(i in seq_len(nchem)) {
        code <- CHEMS[i,"CODE"]
        casrn <- CHEMS[i,"CASRN"]
        cname <- CHEMS[i,"Name"]
        target <- CHEMS[i,"target_gene"]
        ccat <- CHEMS[i,"structure_category"]
        ucat <- CHEMS[i,"use_category"]

        cytotox.median <- as.numeric(as.character(CYTOTOX[code,"cytotox_median_um"]))
        cytotox.min <- as.numeric(as.character(CYTOTOX[code,"cytotox_lower_bound_um"]))
        # no cytotox estimate for this chemical: fall back to fixed values
        if(is.na(cytotox.median)) {
            cytotox.median <- 1000
            cytotox.min <- 100
        }

        # Target-gene mode: skip chemicals without activity in the gene's
        # assays. The original set a flag ("doit") here that was never read,
        # so the filter had no effect.
        if(by.gene) {
            target.temp <- MAT.logAC50[code,target.assay.list]
            target.temp[is.na(target.temp)] <- 0
            if(sum(target.temp)==0) next
        }

        # tested = non-NA AC50; hit = AC50 below 1e6
        tempA <- MAT.AC50[code,noncytotox]
        tempA <- tempA[!is.na(tempA)]
        ntry <- length(tempA)
        tempA <- sort(tempA[tempA<1000000])
        nhit <- length(tempA)
        # nothing to plot or report for a chemical without hits
        if(nhit==0) next

        hit.ratio <- nhit/ntry
        # "selective" hits are those with AC50 below the cytotox lower bound
        nhit.selective <- length(tempA[tempA<cytotox.min])
        hit.ratio.selective <- nhit.selective / ntry
        minAC50 <- min(tempA)

        tempA.cytotox <- MAT.AC50[code,cytotox.assay.set]
        tempA.cytotox <- tempA.cytotox[!is.na(tempA.cytotox)]
        cytotox.ntry <- length(tempA.cytotox)
        tempA.cytotox <- tempA.cytotox[tempA.cytotox<1000000]
        cytotox.nhit <- length(tempA.cytotox)

        # Geometric bin edges (factor 1.2) starting at 1e-4, or 1e-6 when the
        # smallest AC50 is below 1e-4; extended upward until they cover the
        # largest AC50. The edges are built by repeated multiplication so the
        # values are the same as before.
        breaksA.min <- 1e-4
        if(minAC50<breaksA.min) {
            breaksA.min <- breaksA.min/100
        }
        breaksA <- breaksA.min
        for(b in 1:80) breaksA <- c(breaksA,1.2*breaksA[length(breaksA)])
        while(max(breaksA)<max(tempA)) {
            breaksA <- c(breaksA,1.2*breaksA[length(breaksA)])
            cat("Added another point to breaksA",max(breaksA),"\n")
        }
        xA <- hist(tempA,breaks=breaksA,plot=F)
        ymax <- 1.5*max(xA$counts)

        # hist.log() does not use cytotox.max; NA is passed explicitly instead
        # of the undefined variable the original call relied on
        hist.log(breaksA,xA$counts,ylim=c(0,ymax),xlab="AC50 (uM)",ylab="Hits",main=paste(casrn,":",cname),cytotox.median=cytotox.median,cytotox.min=cytotox.min,cytotox.max=NA)
        eps <- 0.08
        xpmin <- breaksA.min
        text(xpmin,ymax*(1-1*eps),paste("ntry=",ntry,sep=""),pos=4)
        text(xpmin,ymax*(1-2*eps),paste("nhit=",nhit,sep=""),pos=4)
        # label says Z>3; the number shown is the count of hits below cytotox.min
        text(xpmin,ymax*(1-3*eps),paste("nhit (Z>3)=",nhit.selective,sep=""),pos=4)
        text(xpmin,ymax*(1-4*eps),paste("cytotox median=",format(cytotox.median,digits=2),sep=""),pos=4)
        text(xpmin,ymax*(1-5*eps),paste("cytotox min=",format(cytotox.min,digits=2),sep=""),pos=4)
        text(xpmin,ymax*(1-6*eps),paste("cytotox try=",cytotox.ntry,sep=""),pos=4)
        text(xpmin,ymax*(1-7*eps),paste("cytotox hit=",cytotox.nhit,sep=""),pos=4)
        # one red star per cytotoxicity-assay hit
        if(cytotox.nhit>0) {
            points(tempA.cytotox,rep(ymax*0.6,cytotox.nhit),pch="*",col="red",cex=2)
        }

        s <- paste(code,"\t",casrn,"\t",cname,"\t",target,"\t",ccat,"\t",ucat,"\t",ntry,"\t",nhit,"\t",format(hit.ratio,digits=2),"\t",nhit.selective,"\t",format(hit.ratio.selective,digits=2),"\t",cytotox.ntry,"\t",cytotox.nhit,"\t",format(minAC50,digits=3),"\t",format(cytotox.median,digits=2),"\t",format(cytotox.min,digits=2),"\n",sep="")
        cat(i,":",s)
        if(!by.gene) cat(file=file,s,append=T)
        flush.console()

        # Overlay one symbol per assay of the target gene: the shape encodes
        # the direction, the fill encodes the assay source.
        for(g in seq_along(overlay.assays)) {
            assay.g <- overlay.assays[g]
            ac50 <- MAT.AC50[code,assay.g]
            info.row <- is.element(ASSAY.INFO[,"Assay"],assay.g)
            # [1] guards against missing or duplicated metadata rows
            asource <- ASSAY.INFO[info.row,"Source"][1]
            direction <- ASSAY.INFO[info.row,"DirectionClass"][1]
            pch <- 21
            if(direction %in% c("up","agonist")) pch <- 24
            if(direction %in% c("down","antagonist")) pch <- 25

            color <- "white"
            if(asource %in% names(source.colors)) color <- source.colors[[asource]]
            # NOTE(review): rnorm(1,0.1) has mean 0.1 and sd 1; a small
            # vertical jitter, rnorm(1,0,0.1), may have been intended
            yval <- ymax*0.5*(1+0.2*rnorm(1,0.1))
            points(ac50,yval,pch=pch,bg=color,fg="black",cex=2)
        }
        if(!to.file) browser()
    }

    if(to.file) dev.off()
    else browser()
}
#--------------------------------------------------------------------------------------
#
# plot a histogram on a log scale (used by hit.dist for the per-chemical hit distribution)
#
# QC=OK
#--------------------------------------------------------------------------------------
hist.log <- function(x,y,ylim,xlab,ylab,main,cytotox.median, cytotox.min, cytotox.max) {
	# Draw a bar histogram with a logarithmic x axis from precomputed bins.
	#
	# x:    bin edges; bar k spans x[k] to x[k+1]
	# y:    bar heights, one per bin
	# ylim, xlab, ylab, main: handed to plot()
	# cytotox.median, cytotox.min: when cytotox.min is below 100, the region
	#       from cytotox.min to max(x) is shaded gray and a red vertical line
	#       is drawn at cytotox.median
	# cytotox.max: accepted for the caller's sake, never used
	bar.index <- 1:length(y)

	# empty frame first; shading and bars are layered on top of it
	plot(x[bar.index],y,type="n",col="gray40",lwd=2.5,log="x",xlab=xlab,ylim=ylim,ylab=ylab,main=main,cex.lab=1.2,cex.axis=1.2)

	shade.cytotox <- cytotox.min<100
	if(shade.cytotox) {
		rect(cytotox.min,ylim[2],max(x),0,col="gray80")
		lines(rep(cytotox.median,2),ylim,col="red",lwd=3)
	}

	# rect() is vectorised: one call draws every bar, in bin order
	rect(x[bar.index],y,x[bar.index+1],0)
	invisible(NULL)
}
#--------------------------------------------------------------------------------------
#
# build the gene-wise specific hit file
#
# QC=OK
#--------------------------------------------------------------------------------------
calc.genescore <- function(do.prep=T,zcut=3) {
    # Build the long-format gene score file
    # ../output/genescore_by_chemical_long.txt: one row per chemical/gene
    # pair whose summed, z-score-filtered logAC50 over the gene's assays is
    # non-zero.
    #
    # do.prep: TRUE rebuilds the globals STEMP (logAC50 masked to cells that
    #          were tested and have a normalised z-score >= zcut) and TTEMP
    #          (tested map); FALSE reuses the STEMP left by an earlier call
    # zcut:    z-score cutoff; cells below it contribute 0
    #
    # Reads globals CHEMS, ASSAY.INFO, ASSAY.LIST, MAT.hitcall,
    # MAT.ZSCORE.NORM and MAT.logAC50; uniquify() comes from utils.R.
    cat("==========================================================================\n")
    cat("calc.genescore\n")
    cat("==========================================================================\n")
    flush.console()

    outfile <- "../output/genescore_by_chemical_long.txt"
    header <- "CODE\tCASRN\tName\tStructureCategory\tUseCategory\tIntendedTarget\tGene\tDenominator\tGeneScore\n"
    cat(file=outfile,header,append=F)

    # keep chemicals flagged in at least one of Phase_I / Phase_II / E1K;
    # which() drops rows whose flags are NA instead of selecting NA codes
    ctemp <- CHEMS[,c("CODE","Phase_I","Phase_II","E1K")]
    mask <- ctemp[,"Phase_I"]+ctemp[,"Phase_II"]+ctemp[,"E1K"]
    code.list.in.phase <- ctemp[which(mask>0),"CODE"]
    code.set <- CHEMS[,"CODE"]
    cat("Length of code.set without phase filter: ",length(code.set),"\n")
    code.set <- code.set[is.element(code.set,code.list.in.phase)]
    cat("Length of code.set with phase filter: ",length(code.set),"\n")
    flush.console()
    code.set <- sort(code.set)
    gene.set <- sort(uniquify(toupper(ASSAY.INFO[,"intended_target"])))
    gene.set <- gene.set[!is.element(gene.set,c("CYTOTOX_FAIL","PROLIFERATION","NUCLEARSIZE","MITOFUNCTION","MITOTICARREST","CYTOTOX","CELLCYCLEARREST"))]

    ngene <- length(gene.set)
    if(do.prep) {
        # tested map: 1 where a hit call exists (0 or 1), 0 where it is NA
        try.map <- MAT.hitcall[,ASSAY.LIST]
        try.map[try.map==0] <- 1
        try.map[is.na(try.map)] <- 0

        # 0/1 filter: tested and normalised z-score at or above zcut
        ztemp <- MAT.ZSCORE.NORM[,ASSAY.LIST]
        ztemp[is.na(ztemp)] <- 0
        ztemp[try.map==0] <- 0
        ztemp[ztemp<zcut] <- 0
        ztemp[ztemp>0] <- 1

        temp <- MAT.logAC50[,ASSAY.LIST]*ztemp
        temp <- temp[code.set,]
        try.map <- try.map[code.set,]
        STEMP <<- temp
        TTEMP <<- try.map
        cat("finished preparing big matrices\n")
        flush.console()
    }

    # seq_len() so that an empty gene set skips the loop
    for(j in seq_len(ngene)) {
        gene <- gene.set[j]
        assay.set <- ASSAY.INFO[is.element(toupper(ASSAY.INFO[,"intended_target"]),gene),"Assay"]
        # Assays named "*_Activator*" or "*_up*" are subtracted from the
        # denominator (an assay matching both patterns counts twice, as before).
        slice <- sum(grepl("_Activator",assay.set)) + sum(grepl("_up",assay.set))
        # drop=FALSE keeps the chemical row names for single-assay genes
        atemp <- STEMP[,assay.set,drop=FALSE]

        cat("\n=======================================================\n")
        cat(gene,":",slice,":",dim(atemp),assay.set,"\n")
        cat("=======================================================\n")

        # Per-chemical sum and count of the non-NA cells, computed for all
        # chemicals at once.
        vals <- as.matrix(atemp[code.set,,drop=FALSE])
        n.values <- rowSums(!is.na(vals))
        score.sum <- rowSums(vals,na.rm=TRUE)
        hits <- which(n.values>0 & score.sum!=0)
        if(length(hits)==0) next

        codes <- code.set[hits]
        denominator <- n.values[hits]-slice
        denominator[denominator<=0] <- 1
        gene.score <- unname(score.sum[hits]/denominator)
        # format one value at a time so every score keeps its own 3-digit
        # representation (a vectorised format() would pad to a common width)
        score.text <- vapply(gene.score,function(v) format(v,digits=3),character(1))
        rows <- paste(codes,"\t",CHEMS[codes,"CASRN"],"\t",CHEMS[codes,"Name"],"\t",CHEMS[codes,"structure_category"],"\t",CHEMS[codes,"use_category"],"\t",CHEMS[codes,"target_gene"],"\t",gene,"\t",unname(denominator),"\t",score.text,"\n",sep="")

        # one append per gene instead of one per chemical/gene row
        cat(file=outfile,rows,sep="",append=T)

        # negative scores are echoed to the console for inspection
        negative <- which(gene.score<0)
        if(length(negative)>0) {
            cat(rows[negative],sep="")
            flush.console()
        }
    }
}
#--------------------------------------------------------------------------------------
#
# read in the genescore data
#
# QC=OK
#-------------------------------------------------------------------------------------
load.genescore <- function() {
    # Read the long-format gene score file and publish the rows with a usable
    # GeneScore (not NA, not Inf, not -Inf) as the global GENEDATA. The table
    # dimensions are printed before and after filtering.
    rule <- "==========================================================================\n"
    cat(rule,"load.genescore\n",rule,sep="")
    flush.console()

    score.file <- "../output/genescore_by_chemical_long.txt"
    scores <- read.table(score.file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"")
    print(dim(scores))
    flush.console()

    gs <- scores[,"GeneScore"]
    # rows with a zero score are deliberately kept
    usable <- !is.na(gs) & gs!=Inf & gs!= -Inf

    GENEDATA <<- scores[usable,]
    print(dim(GENEDATA))
    cat("read in GENEDATA\n")
    flush.console()
}
#--------------------------------------------------------------------------------------
#
# Plot the distribution of hits gene-wise for targets with good reference chemicals
#
# QC=OK
#-------------------------------------------------------------------------------------
 genescore.refchem.plot <- function(to.file=F) {
    # Horizontal box plot (log x axis) of the gene scores of every gene that
    # is both assayed (GENEDATA) and named as an intended target of some
    # chemical (CHEMS[,"IntendedTarget"]). Scores of the reference chemicals
    # of each gene are overlaid as red stars, and a coloured square at the
    # left edge flags how the best reference score compares with the box.
    #
    # to.file: TRUE writes plots/gene_refchem_plot.pdf; FALSE draws on the
    #          active device and stops in browser()
    #
    # Reads globals CHEMS and GENEDATA; uniquify() comes from utils.R.
    cat("==========================================================================\n")
    cat("genescore.refchem.plot\n")
    cat("==========================================================================\n")
    flush.console()

    # tokens of the IntendedTarget strings that are not gene names
    skip.tokens <- c("","[","]", "(?)","14","A")

    # all gene tokens named as an intended target of any chemical
    chemGene <- unlist(str_split(sort(uniquify(toupper(CHEMS[,"IntendedTarget"])))," "))
    chemGene <- chemGene[!is.element(chemGene,skip.tokens)]
    chemGene <- sort(uniquify(chemGene))

    # (chemical, gene) reference pairs, one per token of each chemical's
    # IntendedTarget; built in one pass instead of growing vectors in a loop
    tokens <- str_split(toupper(CHEMS[,"IntendedTarget"])," ")
    cg.genes <- unlist(tokens)
    cg.chems <- rep(CHEMS[,"CODE"],times=vapply(tokens,length,integer(1)))
    is.pair <- !is.element(cg.genes,c(skip.tokens,"UNKNOWN"))
    cg.genes <- cg.genes[is.pair]
    cg.chems <- cg.chems[is.pair]

    assayGene <- sort(uniquify(toupper(GENEDATA[,"Gene"])))
    assayGene <- assayGene[is.element(assayGene,chemGene)]
    print(assayGene)

    assayGene <- assayGene[!is.element(assayGene,c("HRH2"))]

    temp <- GENEDATA[is.element(GENEDATA[,"Gene"],assayGene),]

    # keep non-zero, finite, non-NA scores together with their gene labels
    az <- temp[,"GeneScore"]
    gene.group <- temp[, "Gene"]
    usable <- !is.na(az) & az!=0 & az!=Inf & az!= -Inf
    gene.group <- gene.group[usable]
    az <- az[usable]

    gsmax <- 9
    az[az>gsmax] <- gsmax

    if(to.file) {
        fname <- "plots/gene_refchem_plot.pdf"
        pdf(file=fname,width=7,height=10,pointsize=12,bg="white",paper="letter",pagecentre=T)
    }
    par(mfrow=c(1,1),mar=c(4,8,2,2))
    # gene score back to concentration: 10^-score * 1e6
    az.log <- 10**-az * 1000000
    box.res <- boxplot(az~gene.group,xlab="Gene Score",ylab="",cex.axis=1,cex.lab=1.2,horizontal=T,las=1,ylim=c(-1,gsmax),plot=F)

    # Take the gene order from the boxplot itself: row i of the plot is
    # box.res$names[i], so the overlay below cannot drift out of step with
    # the boxes, whatever order uniquify() returns.
    gene.group.unique <- box.res$names
    ngene <- length(gene.group.unique)

    box.res.log <- boxplot(az.log~gene.group,xlab="Gene Score (uM)",ylab="",main=paste(ngene," Genes"),cex.axis=1,cex.lab=1.2,horizontal=T,las=1,ylim=c(0.001,1000000),log="x")

    # vertical gray guide line at every decade
    for(decade in c(1000000,100000,10000,1000,100,10,1,0.1,0.01,0.001)) {
        lines(c(decade,decade),c(-100,100),col="gray")
    }

    x.flag <- 1e-3
    for(i in seq_len(ngene)) {
        gene <- gene.group.unique[i]
        gene.rows <- GENEDATA[is.element(GENEDATA[,"Gene"],gene),]

        # scores of this gene that belong to its reference chemicals
        ref.codes <- cg.chems[is.element(cg.genes,gene)]
        ref.scores <- gene.rows[is.element(gene.rows[,"CODE"],ref.codes),"GeneScore"]
        if(length(ref.scores)>0) {
            points(10**-ref.scores * 1000000,rep(i,length(ref.scores)),pch="*",col="red",cex=2)
        }
        # best reference score; the large negative floor means "none found"
        maxz <- max(c(-10000000,ref.scores))

        # stats[3,] is the median and stats[2,] the lower hinge of each box.
        # NOTE(review): both passing tiers are drawn green, as in the original;
        # the middle tier may have been meant to use a different colour.
        if(maxz > box.res$stats[3,i]) points(x.flag,i,pch=22,bg="green",cex=2)
        else if(maxz > box.res$stats[2,i]) points(x.flag,i,pch=22,bg="green",cex=2)
        else points(x.flag,i,pch=22,bg="red",cex=2)
    }

    if(to.file) dev.off()
    else browser()
}
#--------------------------------------------------------------------------------------
#
# Calculate some stats on the genescore
#
#-------------------------------------------------------------------------------------
genescore.stats <- function(do.prep=T,to.file=F) {
    # Per-gene summary of the gene scores, followed by a scatter plot of the
    # best score per gene against the fraction of chemicals with a positive
    # score.
    #
    # do.prep: TRUE rebuilds the per-gene table from GENEDATA, writes it to
    #          output/genescore_stats_1.txt and publishes it as GENE.SUMMARY;
    #          FALSE reuses the GENE.SUMMARY left by an earlier call
    # to.file: TRUE writes plots/genescore_2.pdf; FALSE draws on the active
    #          device and stops in browser()
    #
    # Reads globals GENEDATA and CHEMS; uniquify() comes from utils.R.
    cat("==========================================================================\n")
    cat("genescore.stats\n")
    cat("==========================================================================\n")
    flush.console()
    if(do.prep) {
        # First of the candidate column names that exists in tbl; falls back
        # to the first candidate so a missing column still fails loudly.
        pick.col <- function(tbl,candidates) {
            found <- candidates[candidates %in% colnames(tbl)]
            if(length(found)>0) found[1] else candidates[1]
        }
        # The gene score file written by calc.genescore has a "Name" column
        # and no "ShortName"; other functions in this file read the CHEMS
        # category columns under their snake_case names. Accept either
        # spelling, preferring the one this function used originally.
        name.col <- pick.col(GENEDATA,c("ShortName","Name"))
        struct.col <- pick.col(CHEMS,c("StructureCategory","structure_category"))
        use.col <- pick.col(CHEMS,c("UseCategory","use_category"))

        assayGene <- sort(uniquify(toupper(GENEDATA[,"Gene"])))
        assayGene <- assayGene[!is.element(assayGene,c("PROLIFERATION","NUCLEARSIZE","MITOFUNCTION","MITOTICARREST","CYTOTOX","CELLCYCLEARREST",""))]
        ngene <- length(assayGene)
        col.list <- c("gene","ntry","nhit.total","nhit.pos","max.genescore","max.code","max.chemname","max.intendedtarget","max.StructureCategory","max.UseCategory","target.match")
        geneSummary <- as.data.frame(matrix(nrow=ngene,ncol=length(col.list)))
        rownames(geneSummary) <- assayGene
        names(geneSummary) <- col.list

        for(i in seq_len(ngene)) {
            gene <- assayGene[i]
            temp <- GENEDATA[is.element(GENEDATA[,"Gene"],gene),]
            # three chemical codes are excluded from the statistics
            temp <- temp[!is.element(temp[,"CODE"],c("C8018017","C12427382","C36673162")),]
            stemp <- temp[,"GeneScore"]

            # row of the highest gene score for this gene
            index <- sort(temp[,"GeneScore"],index.return=T,decreasing=T)$ix[1]
            geneSummary[gene,"gene"] <- gene
            geneSummary[gene,"ntry"] <- length(stemp)
            # NOTE(review): nhit.total and nhit.pos are computed identically
            # (score > 0); nhit.total may have been meant to count score != 0.
            geneSummary[gene,"nhit.total"] <- length(stemp[stemp>0])
            geneSummary[gene,"nhit.pos"] <- length(stemp[stemp>0])
            geneSummary[gene,"max.genescore"] <- temp[index,"GeneScore"]
            geneSummary[gene,"max.code"] <- temp[index,"CODE"]
            geneSummary[gene,"max.chemname"] <- temp[index,name.col]
            geneSummary[gene,"max.intendedtarget"] <- temp[index,"IntendedTarget"]
            geneSummary[gene,"max.StructureCategory"] <- CHEMS[temp[index,"CODE"],struct.col]
            geneSummary[gene,"max.UseCategory"] <- CHEMS[temp[index,"CODE"],use.col]

            # does the top chemical name this gene among its intended targets?
            geneSummary[gene,"target.match"] <- "F"
            x <- str_split(toupper(temp[index,"IntendedTarget"])," ")
            if(is.element(gene,x[[1]])) geneSummary[gene,"target.match"] <- "T"
            print(gene)
        }
        outfile <- "output/genescore_stats_1.txt"
        write.table(geneSummary,file=outfile, row.names=F, append=FALSE, quote=F, sep = "\t")
        GENE.SUMMARY <<- geneSummary
    }
    if(to.file) {
        fname <- "plots/genescore_2.pdf"
        pdf(file=fname,width=7,height=7,pointsize=12,bg="white",paper="letter",pagecentre=T)
    }
    par(mfrow=c(1,1),mar=c(5,5,2,2))

    # only genes with more than 800 scored chemicals and a positive score
    temp <- GENE.SUMMARY[GENE.SUMMARY[,"ntry"]>800,]
    temp <- temp[temp[,"nhit.pos"]>0,]
    x <- temp[,"nhit.pos"]/temp[,"ntry"]
    y <- temp[,"max.genescore"]
    z <- temp[,"max.UseCategory"]
    g <- temp[,"gene"]
    # gene score back to concentration: 10^-score * 1e6
    ylog <- 10**-y * 1000000
    plot(ylog~x,type="p",cex.lab=1.5,cex.axis=1.5,xlab="f(GeneScore>0)",ylab="Max(GeneScore)",xlim=c(0.001,1),log="xy")
    # colour by use category of the top chemical: green pharma, red pesticide
    points(ylog[is.element(z,"Pharmaceutical")]~x[is.element(z,"Pharmaceutical")],pch=21,bg="green")
    pesticide.list <- c("Insecticide/Chemical intermediate" ,"Fungicide/Crop protection", "Fungicide/antimicrobial","degradate of endosulfan (CASRN 115-29-7)" ,"Degradate of Aldicarb (CASRN 116-06-3)","Herbicide","Insecticide","Biocide","Bactericide","Fungicide","Microbicide","microbiocide","Rodenticide")
    points(ylog[is.element(z,pesticide.list)]~x[is.element(z,pesticide.list)],pch=21,bg="red")
    # label the genes hit by at least 10% of the chemicals
    for(i in seq_along(g)) {
        if(x[i]>=0.1) {
            label <- g[i]
            if(label=="NR1I2") label <- "NR1I2 (PXR)"
            if(label=="NFE2L2") label <- "NFE2L2 (NRF2)"
            text(label=label,x=x[i],y=ylog[i],pos=4)
        }
    }
    npharma <- length(y[is.element(z,"Pharmaceutical")])
    npest <- length(y[is.element(z,pesticide.list)])
    ntot <- length(y)
    nother <- ntot-npest-npharma
    # hand-placed legend in the lower left corner
    points(x=0.001,y=1e-7,pch=21,bg="green",cex=1.2); text(label=paste("Pharmaceutical: ",npharma),x=0.001,y=1e-7,pos=4,cex=1.1)
    points(x=0.001,y=3.5e-7,pch=21,bg="red",cex=1.2); text(label=paste("Pesticide: ",npest),x=0.001,y=3.5e-7,pos=4,cex=1.1)
    points(x=0.001,y=1e-6,pch=21,cex=1.2,col="black"); text(label=paste("Other: ",nother),x=0.001,y=1e-6,pos=4,cex=1.1)

    if(to.file) dev.off()
    else browser()

}
#--------------------------------------------------------------------------------------
#
# prepare the genescore matrix
#
#
# QC=OK
#--------------------------------------------------------------------------------------
prep.genescore.matrix <- function(cutoff=0) {
    # Pivot the long-format gene scores (global GENEDATA) into a chemical x
    # gene matrix, publish it as the global GMATRIX and write it to
    # ../output/genescore_matrix.txt. Cells without a score are 0.
    #
    # cutoff: only scores strictly above this value are used
    #
    # Chemicals are restricted to those flagged Phase_I or Phase_II in the
    # global CHEMS; uniquify() comes from utils.R.
    cat("==========================================================================\n")
    cat("genescore.matrix.prep\n")
    cat("==========================================================================\n")
    flush.console()

    temp <- GENEDATA[GENEDATA[,"Denominator"]>0,]
    temp <- temp[temp[,"GeneScore"]>cutoff,]
    gene.list <- sort(uniquify(temp[,"Gene"]))
    gene.list <- gene.list[!is.element(gene.list,c("NUCLEARSIZE","MITOFUNCTION","CYTOTOX","FXR"))]

    cat("Length of gene list with 1 or more assays and at least one chemical at the target: ",length(gene.list),"\n")
    code.list <- sort(uniquify(temp[,"CODE"]))

    phase.mask <- CHEMS[,"Phase_I"]+CHEMS[,"Phase_II"]
    codes.ok <- CHEMS[phase.mask>0,"CODE"]
    code.list <- code.list[is.element(code.list,codes.ok)]
    cat("Length of code list: ",length(code.list),"\n")
    ngene <- length(gene.list)
    nchem <- length(code.list)
    mtemp <- matrix(0,nrow=nchem,ncol=ngene)
    rownames(mtemp) <- code.list
    colnames(mtemp) <- gene.list

    # Fill every (chemical, gene) cell in one assignment through a
    # two-column character index matched against the dimnames. Rows outside
    # the retained chemicals or genes are skipped; for repeated pairs the
    # last row wins, as with the former row-by-row loop.
    codes <- as.character(temp[,"CODE"])
    genes <- as.character(temp[,"Gene"])
    keep <- is.element(codes,code.list) & is.element(genes,gene.list)
    if(any(keep)) mtemp[cbind(codes[keep],genes[keep])] <- temp[keep,"GeneScore"]

    GMATRIX <<- mtemp
    # A leftover browser() call used to stop here, before the file was
    # written; it has been removed so the step runs unattended.
    outfile <- "../output/genescore_matrix.txt"
    write.table(mtemp,file=outfile, row.names=T, append=FALSE, quote=F, sep = "\t")
}
#--------------------------------------------------------------------------------------
#
# read in the genescore matrix
#
# QC=OK
#-------------------------------------------------------------------------------------
load.genescore.matrix <- function() {
    # Read the chemical x gene score matrix from
    # ../output/genescore_matrix.txt, publish it as the global GMATRIX and
    # print its dimensions.
    rule <- "==========================================================================\n"
    cat(rule,"load.genescore.matrix\n",rule,sep="")
    flush.console()

    matrix.file <- "../output/genescore_matrix.txt"
    gene.matrix <- read.table(matrix.file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"")
    GMATRIX <<- gene.matrix
    print(dim(gene.matrix))
}
#--------------------------------------------------------------------------------------
#
# do the heatmap of the gene-wise matrix
#
# Thresholds GMATRIX at cutoff, keeps chemicals (rows) and genes (columns) with at
# least minhit non-zero entries, draws a Ward-clustered heatmap (genes as heatmap
# rows, chemicals as heatmap columns) and writes the cluster membership of each
# chemical at 1..50 cut levels, plus its gene scores, to
# output/genescore_clusters_<cutoff>.txt in heatmap order.
#
# to.file: if TRUE plot to plots/genescore_matrix_heatmap_<cutoff>.pdf,
#          otherwise plot to the current device and stop in browser() at the end
# cex.col: heatmap column (chemical) label size
# cex.row: heatmap row (gene) label size
# cutoff:  gene scores below this value are set to 0
# minhit:  minimum number of non-zero entries for a row/column to be kept
#
# Globals read: GMATRIX, CHEMS (row names are used as chemical codes)
#
# QC=OK
#--------------------------------------------------------------------------------------
genescore.hm <- function(to.file=F,cex.col=0.1,cex.row=0.1,cutoff=5,minhit=2) {
    cat("==========================================================================\n")
    cat("genescore.hm\n")
    cat("==========================================================================\n")
    flush.console()

    if(to.file) {
        file <- paste("plots/genescore_matrix_heatmap_",cutoff,".pdf",sep="")
        pdf(file=file,width=7,height=7,pointsize=12,bg="white",paper="letter",pagecentre=T)
        # close the device on every exit path, including errors in clustering/plotting
        on.exit(dev.off(),add=TRUE)
    }

    # threshold, then keep rows/columns with enough hits
    temp <- GMATRIX
    temp[temp<cutoff] <- 0
    temp.disc <- temp
    temp.disc[temp.disc>0] <- 1
    rs <- rowSums(temp.disc)
    cs <- colSums(temp.disc)
    # drop=FALSE keeps the 2-d shape when only one column survives
    temp <- temp[rs>=minhit,cs>=minhit,drop=FALSE]
    print(dim(temp))
    if(nrow(temp)<2 || ncol(temp)<2) {
        # hclust() and heatmap() both need at least two objects on each axis
        stop("genescore.hm: fewer than 2 chemicals or 2 genes remain after filtering (cutoff=",
             cutoff,", minhit=",minhit,")",call.=FALSE)
    }
    code.list <- rownames(temp)
    name.list <- CHEMS[code.list,"ShortName"]
    pesticide.list <- c("Insecticide/Chemical intermediate" ,"Fungicide/Crop protection", "Fungicide/antimicrobial","degradate of endosulfan (CASRN 115-29-7)" ,"Degradate of Aldicarb (CASRN 116-06-3)","Herbicide","Insecticide","Biocide","Bactericide","Fungicide","Microbicide","microbiocide","Rodenticide")

    # use-category counts over all Phase I / Phase II chemicals (printed for reference)
    mask <- CHEMS[,"Phase_I"] + CHEMS[,"Phase_II"]
    use.list <- CHEMS[mask>0,"UseCategory"]
    use.list.pest <- use.list[is.element(use.list,pesticide.list)]
    use.list.pharma <- use.list[is.element(use.list,"Pharmaceutical")]
    cat("Total chemicals: ",length(use.list),"\n")
    cat("Total pesticides: ",length(use.list.pest),"\n")
    cat("Total pharamceuticals: ",length(use.list.pharma),"\n")

    # side colours: green = pharmaceutical, red = pesticide, white = other.
    # is.element() treats an NA use category as "other" instead of failing in if().
    use.by.code <- CHEMS[code.list,"UseCategory"]
    col.col <- rep("white",length(code.list))
    col.col[is.element(use.by.code,"Pharmaceutical")] <- "green"
    col.col[is.element(use.by.code,pesticide.list)] <- "red"
    pharma.count <- length(col.col[col.col=="green"])
    pest.count <- length(col.col[col.col=="red"])
    cat("Pharma hits: ",pharma.count," pesticide hits:",pest.count,"\n")

    # cluster membership of every chemical when the tree is cut into 1..nlevel groups;
    # cutree() requires k <= number of chemicals, so nlevel is capped accordingly
    hres <- hclust(d=dist(temp),method="ward.D")
    nlevel <- min(50,length(code.list))
    memb.mat <- as.data.frame(matrix(-1,nrow=length(code.list),ncol=nlevel))
    rownames(memb.mat) <- code.list
    for(i in seq_len(nlevel)) {
        names(memb.mat)[i] <- paste("cutlevel_",i,sep="")
        memb.mat[,i] <- cutree(hres,k=i)
    }

    # annotation columns placed in front of the cut levels
    csub <- CHEMS[code.list,]
    annot <- data.frame(CODE=code.list,
                        Name=csub[,"ShortName"],
                        UseCategory=csub[,"UseCategory"],
                        StructureCategory=csub[,"StructureCategory"],
                        IntendedTarget=csub[,"IntendedTarget"],
                        stringsAsFactors=FALSE)
    memb.mat <- cbind(annot,memb.mat)

    result <- heatmap(t(as.matrix(temp)),margins=c(5,5),scale="none",labCol=name.list,main=paste("Gene Score",dim(temp)[1]," chemicals, ",dim(temp)[2]," genes"),
            xlab="",ylab="",cexCol=cex.col,cexRow=cex.row,col=brewer.pal(9,"Reds"),
            hclustfun=function(x) hclust(d=dist(x),method="ward.D"),ColSideColors=col.col,keep.dendro=T,verbose=T)

    # heatmap rows are genes and heatmap columns are chemicals (the input was
    # transposed), so rowInd reorders the gene columns and colInd the chemical rows
    temp <- temp[,result$rowInd]
    memb.mat <- cbind(memb.mat,temp)
    memb.mat <- memb.mat[result$colInd,]
    outfile <- paste("output/genescore_clusters_",cutoff,".txt",sep="")
    write.table(memb.mat,file=outfile, row.names=F, append=FALSE, quote=F, sep = "\t")

    if(!to.file) browser()
}
#--------------------------------------------------------------------------------------
#
# do the heatmap of hits for a category
#
# For every StructureCategory in CHEMS with at least 5 chemicals, draws a
# Ward-clustered heatmap of AC50MAT.LOG values restricted to hits whose
# normalized Z-score is >= 3 (all other cells are 0). Assays without any such
# hit in the category are left out.
#
# to.file:   if TRUE plot to plots/category_heatmap_.pdf, otherwise plot to the
#            current device and stop in browser() after each heatmap and at the end
# do.filter: not used by this function
#
# Globals read: CHEMS, AC50MAT.LOG, MAT.ZSCORE.NORM
#
# QC=OK
#--------------------------------------------------------------------------------------
category.hm <- function(to.file=F,do.filter=F) {
    cat("==========================================================================\n")
    cat("category.hm\n")
    cat("==========================================================================\n")
    flush.console()

    if(to.file) {
        file <- paste("plots/category_heatmap_.pdf",sep="")
        pdf(file=file,width=7,height=7,pointsize=12,bg="white",paper="letter",pagecentre=T)
        # close the device on every exit path, including a failing heatmap
        on.exit(dev.off(),add=TRUE)
    }

    cat.list <- sort(uniquify(CHEMS[,"StructureCategory"]))
    # the first sorted value is skipped (presumably the empty-string category -
    # TODO confirm); [-1] also behaves for lists shorter than two entries
    cat.list <- cat.list[-1]
    for(i in seq_along(cat.list)) {
        category <- cat.list[i]
        code.list <- CHEMS[is.element(CHEMS[,"StructureCategory"],category),"CODE"]
        name.list <- CHEMS[code.list,"ShortName"]
        if(length(code.list)<5) next

        atemp <- AC50MAT.LOG[code.list,]
        ztemp <- MAT.ZSCORE.NORM[code.list,]
        # 0/1 mask of cells with Z >= 3; missing Z counts as no hit
        ztemp[is.na(ztemp)] <- 0
        ztemp[ztemp<3] <- 0
        ztemp[ztemp>0] <- 1
        atemp[is.na(atemp)] <- 0
        btemp <- atemp*ztemp
        cs <- colSums(btemp)
        # drop=FALSE keeps the 2-d shape when a single assay column survives
        btemp <- btemp[,cs>0,drop=FALSE]
        print(category)
        if(ncol(btemp)<2) {
            # heatmap() needs at least 2 rows and 2 columns; skip this category
            # instead of aborting the remaining ones
            cat("  skipped: fewer than 2 assays with hits\n")
            next
        }
        result <- heatmap(as.matrix(btemp),margins=c(10,10),scale="none",labRow=name.list,main=category,
                  xlab="",ylab="",cexCol=0.6,cexRow=0.8,col=brewer.pal(9,"Reds"),
                  hclustfun=function(x) hclust(d=dist(x),method="ward.D"),keep.dendro=T,verbose=T)

        if(!to.file) browser()
    }
    if(!to.file) browser()
}
#--------------------------------------------------------------------------------------
#
# print out the negative chemicals
#
# Finds the rows of AC50MAT.DISC whose non-missing entries sum to 0 and writes
# the matching rows of CHEMS to output/negative_chemicals.txt.
# Ends in browser().
#
# Globals read: AC50MAT.DISC (row names are used as chemical codes), CHEMS
#
# QC=OK
#--------------------------------------------------------------------------------------
negative.chems <- function() {
    cat("==========================================================================\n")
    cat("negative.chems\n")
    cat("==========================================================================\n")
    flush.console()

    # per-chemical sum with missing values ignored; a row that is entirely NA
    # sums to 0 and is therefore reported as negative as well
    hit.count <- rowSums(AC50MAT.DISC,na.rm=TRUE)
    code.list <- rownames(AC50MAT.DISC)[hit.count==0]

    temp <- CHEMS[code.list,]
    outfile <- paste("output/negative_chemicals.txt",sep="")
    write.table(temp,file=outfile, row.names=F, append=FALSE, quote=F, sep = "\t")

    browser()
}
#--------------------------------------------------------------------------------------
#
# calculate the most potent and specific hits
#
# For every gene column of GMATRIX, collects the chemicals whose gene score is
# >= cutoff. For each such chemical/gene pair it adds the mean normalized Z-score
# over the assays annotated to that gene, compares the gene against the words of
# the chemical's IntendedTarget text, and labels the pair as "Exact Match", one
# of the "Crosstalk ..." classes, or leaves the label empty.
#
# Writes four tab-delimited files (all named with the cutoff):
#   output/potent_specific_<cutoff>.txt                   one row per pair
#   output/potent_specific_summary_<cutoff>.txt           counts per match class
#   output/potent_specific_gene_summary_<cutoff>.txt      hits / mean score per gene
#   output/potent_specific_chemical_summary_<cutoff>.txt  hits / mean score per chemical
#
# cutoff: minimum gene score for a chemical/gene pair to be reported
#
# Globals read: GMATRIX, CHEMS, ASSAY.INFO, MAT.ZSCORE.NORM
# Ends in browser().
#
# QC=OK
#--------------------------------------------------------------------------------------
potent.specific <- function(cutoff=5) {
    cat("==========================================================================\n")
    cat("potent.specific\n")
    cat("==========================================================================\n")
    flush.console()

	# parallel vectors, one entry per (chemical, gene) pair that passes the cutoff
	code.list <- c()
	gene.list <- c()
	gs.list <- c()
	ngene <- dim(GMATRIX)[2]
	for(i in 1:ngene) {
		# NOTE(review): names() is NULL for a plain matrix, so GMATRIX presumably has
		# to be in data frame form here - confirm
		gene <- names(GMATRIX)[i]
		# temp is not read before it is reassigned further down
		temp <- GMATRIX[,gene]
		clist <- rownames(GMATRIX[GMATRIX[,gene]>=cutoff,])
		gslist <- GMATRIX[clist,gene]
		# gene name repeated once per selected chemical
		glist <- clist
		glist[] <- gene
		code.list <- c(code.list,clist)
		gene.list <- c(gene.list,glist)
		gs.list <- c(gs.list,gslist)
	}
	# result vectors, same length as code.list: mean Z, Z + gene score, match label
	z.list <- gs.list
	z.list[] <- 0
	combo.list <- z.list
	match.list <- z.list
	match.list[] <- ""
	cname.list <- CHEMS[code.list,"ShortName"]
	target.list <- CHEMS[code.list,"IntendedTarget"]
	target.class.list <- target.list
	target.class.list[] <- ""
	use.list <- CHEMS[code.list,"UseCategory"]
	str.list <- CHEMS[code.list,"StructureCategory"]

	# Gene families used for the crosstalk classification below.
	# ptgs.list, gaba.list, ppar.list, ache.list, mmp.list and slc.list are only
	# referenced by branches that are currently commented out.
	nr.list <- c("ESR1","ESR2","AR","PGR","NR3C1","NR3C2")
	gpcr.list <- sort(uniquify(toupper(ASSAY.INFO[is.element(ASSAY.INFO[,"Source"],"Novascreen_GPCR"),"Gene_Process"])))
	gpcr.list <- c(gpcr.list,"HTR1B","DRD3","BDKRB1","ADRA2A","DRD2","HTR2A")
	gpcr.list <- sort(uniquify(gpcr.list))
	ptgs.list <- c("PTGS1","PTGS2","PTGER2")
	gaba.list <- c("GABRA1","GABRA5","GABRAx")
	ppar.list <- c("PPARA","PPARD","PPARG","PPARx")
	ache.list <-c("ACHE","BCHE")
	mmp.list <- c("MMP1","MMP2","MMP7","MMP9","MMP13")
	ic.list <- sort(uniquify(toupper(ASSAY.INFO[is.element(ASSAY.INFO[,"Source"],"Novascreen_IC"),"Gene_Process"])))
	ic.list <- c(ic.list,sort(uniquify(toupper(ASSAY.INFO[is.element(ASSAY.INFO[,"Source"],"Novascreen_LGIC"),"Gene_Process"]))))
	ic.list <- c(ic.list,"SIGMAR1","ION_CHANNEL","KCNJx","KCNJX","KCNJ1")

	slc.list <- sort(uniquify(toupper(ASSAY.INFO[is.element(ASSAY.INFO[,"Source"],"Novascreen_TR"),"Gene_Process"])))
	slc.list <- c(slc.list,"SLC6A9","SLC6A4","SLC18A2","SLC6A2","SLC6A3")
	enz.list <- sort(uniquify(toupper(ASSAY.INFO[is.element(ASSAY.INFO[,"Source"],"Novascreen_ENZ"),"Gene_Process"])))
	dna.list <- c("DNA","TP53","H2AFX")
	mito.list <- c("mitochondrial","TSPO","mitochondria","Mitochondria","MITOCHONDRIA")

	n <- length(code.list)
	for(i in 1:n) {
		code <- code.list[i]
		gene <- toupper(gene.list[i])
		# presumably undoes a "-" -> "." change in the gene column name - confirm
		if(gene=="HLA.DRA") gene <- "HLA-DRA"
		assay.list <- ASSAY.INFO[is.element(toupper(ASSAY.INFO[,"Gene_Process"]),gene),"Assay"]

		# slice counts assay names matching "_Activator" and names matching "_up"
		# (a name matching both is counted twice); it is subtracted from the number
		# of assays to form the denominator of the mean Z
		slice <- 0
		for(k in 1:length(assay.list)) {
		      if(sum(grep("_Activator",assay.list[k]))>0) slice <- slice+1
		      if(sum(grep("_up",assay.list[k]))>0) slice <- slice+1
		}
		denom <- length(assay.list)-slice
		if(denom==0) denom <- 1
		# missing Z-scores contribute 0 to the sum
		z <- MAT.ZSCORE.NORM[code,assay.list]
		z[is.na(z)] <- 0

		zmean <- sum(as.numeric(z))	/denom
		z.list[i] <- zmean
		combo.list[i] <- z.list[i]+gs.list[i]
		# split the intended-target text into upper-case words; "ion channel" is
		# joined with "_" first so that it stays a single word
		temp <- target.list[i]
		temp <- str_replace_all(temp,"ion channel","ion_channel")
		temp <- str_replace_all(temp,"Ion channel","ION_CHANNEL")
		# NOTE(review): this pattern cannot match any more, the previous line has
		# already replaced every "Ion channel"
		temp <- str_replace_all(temp,"Ion channel Na","ION_CHANNEL")
		x <- str_split(toupper(temp)," ")
		#print(x[[1]])
		tlist <- x[[1]]
		tlist <- tlist[!is.element(tlist," ")]
		tlist <- tlist[!is.element(tlist,"]")]
		tlist <- tlist[!is.element(tlist,"[")]
		tlist <- tlist[!is.element(tlist,"(?)")]

		# Walk the target words until one of them yields a match label.
		# Every grep() below uses its first argument as a regular expression and
		# matches substrings, so these are "contains" tests rather than equality tests.
		matched <- F
		for(j in 1:length(tlist)) {
			if(!matched) {
				target <- tlist[j]
				# target class is set to GPCR whenever the word matches a GPCR gene
				# (AR and RNA excluded), whether or not a match label is assigned below
				if(sum(grep(target,gpcr.list))>0) {
					if(target!="AR" && target!="RNA") {
						target.class.list[i] <- "GPCR"
					}
				}
				#cat(gene,":",target,"\n")
				if(str_length(target)>0) {
					# target word found inside the gene name
					if(sum(grep(target,gene))>0) {
						match.list[i] <- "Exact Match"
						matched <- T
						#print(match.list[i])
					}
					# gene name found inside any target word; this if is independent of
					# the one above and starts the else-if chain that follows
					if(sum(grep(gene,tlist))>0) {
						match.list[i] <- "Exact Match"
						matched <- T
						#print(match.list[i])
					}
					# target word and gene both match the DNA list
					else if(sum(grep(target,dna.list))>0 && sum(grep(gene,dna.list))>0) {
						match.list[i] <- "Exact Match"
						matched <- T
						#print(match.list[i])
					}
					# target word and gene both match the mitochondria list
					else if(sum(grep(target,mito.list))>0 && sum(grep(gene,mito.list))>0) {
						match.list[i] <- "Exact Match"
						matched <- T
						#print(match.list[i])
					}
					# target word and gene both match the GPCR list; for the words AR and
					# RNA nothing is set, but the later branches are still skipped
					else if(sum(grep(target,gpcr.list))>0 && sum(grep(gene,gpcr.list))>0) {
						if(target!="AR" && target!="RNA") {
							match.list[i] <- "Crosstalk GPCR"
							matched <- T
							target.class.list[i] <- "GPCR"
							#print(match.list[i])
							#cat(gene,":",target,":",tlist,"\n")
							#browser()
						}
					}
					# target word and gene both match the nuclear receptor list
					else if(sum(grep(target,nr.list))>0 && sum(grep(gene,nr.list))>0) {
						match.list[i] <- "Crosstalk NR"
						matched <- T
						target.class.list[i] <- "NR"
						#print(match.list[i])
					}
					#else if(sum(grep(target,ppar.list))>0 && sum(grep(gene,ppar.list))>0) {
					#	match.list[i] <- "Crosstalk PPAR"
					#	matched <- T
					#	#print(match.list[i])
					#}
					# target word and gene both match the enzyme list
					else if(sum(grep(target,enz.list))>0 && sum(grep(gene,enz.list))>0) {
						match.list[i] <- "Crosstalk ENZ"
						matched <- T
						target.class.list[i] <- "ENZ"

						#print(match.list[i])
					}
					# target word and gene both match the ion channel list
					else if(sum(grep(target,ic.list))>0 && sum(grep(gene,ic.list))>0) {
						match.list[i] <- "Crosstalk IC"
						matched <- T
						target.class.list[i] <- "IC"

						#print(match.list[i])
					}
					#else if(sum(grep(target,ptgs.list))>0 && sum(grep(gene,ptgs.list))>0) {
					#	match.list[i] <- "Crosstalk PTGS"
					#	matched <- T
					#	#print(match.list[i])
					#}
					#else if(sum(grep(target,gaba.list))>0 && sum(grep(gene,gaba.list))>0) {
					#	match.list[i] <- "Crosstalk GABA"
					#	matched <- T
					#	print(match.list[i])
					#}
					#else if(sum(grep(target,mmp.list))>0 && sum(grep(gene,mmp.list))>0) {
					#	match.list[i] <- "Crosstalk MMP"
					#	matched <- T
					#	print(match.list[i])
					#}
					#else if(sum(grep(target,slc.list))>0 && sum(grep(gene,slc.list))>0) {
					#	match.list[i] <- "Crosstalk SLC"
					#	matched <- T
					#	#print(match.list[i])
					#}
					#else if(sum(grep(target,ache.list))>0 && sum(grep(gene,ache.list))>0) {
					#	match.list[i] <- "Crosstalk ACHE"
					#	matched <- T
					#	print(match.list[i])
					#}
					else {
					#	browser()
					}
				}
			}
		}
	}
    # one row per chemical/gene pair; cbind() of these vectors gives a character
    # matrix, so the numeric columns are stored as text (see as.numeric() below)
    results <- as.data.frame(cbind(code.list,cname.list,use.list,str.list,target.list,target.class.list,gene.list,as.numeric(gs.list),z.list,combo.list,match.list),stringsAsFactors=F)
    names(results) <- c("CODE","Name","UseCategory","StructureCategory","IntendedTarget","TargetClass","Gene","GeneScore","Zmean","PotentSelective","MatchType")
	outfile <- paste("output/potent_specific_",cutoff,".txt",sep="")
	write.table(results,file=outfile, row.names=F, append=FALSE, quote=F, sep = "\t")
	# summary by match class: number of pairs, percent of all pairs, distinct chemicals
	# (the Description column is created but never filled)
	class.list <- c("Exact Match","Crosstalk GPCR","Crosstalk NR","Crosstalk IC","Crosstalk ENZ")
	nclass <- length(class.list)
	res.sum <- as.data.frame(matrix(nrow=nclass+2,ncol=5))
	names(res.sum) <- c("Class","Description","Chemicals","Hits","Percent")
	for(i in 1:nclass) {
		my.class <- class.list[i]
		res.sum[i,"Class"] <- my.class
		temp <- results[is.element(results[,"MatchType"],my.class),]
		res.sum[i,"Hits"] <- dim(temp)[1]
		res.sum[i,"Percent"] <- format(100*dim(temp)[1]/dim(results)[1],digits=2)
		res.sum[i,"Chemicals"] <- length(uniquify(temp[,"CODE"]))
	}
	# pairs without a match label, split by whether the chemical has any intended
	# target text ("Other Target") or none ("No Target")
	temp <- results[is.element(results[,"MatchType"],""),]
	temp.other.target <- temp[!is.element(temp[,"IntendedTarget"],""),]
	temp.no.target <- temp[is.element(temp[,"IntendedTarget"],""),]
	res.sum[nclass+1,"Class"] <- "Other Target"
	res.sum[nclass+2,"Class"] <- "No Target"
	res.sum[nclass+1,"Hits"] <- dim(temp.other.target)[1]
	res.sum[nclass+2,"Hits"] <- dim(temp.no.target)[1]
	res.sum[nclass+1,"Percent"] <- format(100*dim(temp.other.target)[1]/dim(results)[1],digits=2)
	res.sum[nclass+2,"Percent"] <- format(100*dim(temp.no.target)[1]/dim(results)[1],digits=2)
	res.sum[nclass+1,"Chemicals"] <- length(uniquify(temp.other.target[,"CODE"]))
	res.sum[nclass+2,"Chemicals"] <- length(uniquify(temp.no.target[,"CODE"]))
	print(res.sum)
	outfile <- paste("output/potent_specific_summary_",cutoff,".txt",sep="")
	write.table(res.sum,file=outfile, row.names=F, append=FALSE, quote=F, sep = "\t")

	# summary by gene: number of pairs and mean gene score
	gene.list <- as.character(sort(uniquify(results[,"Gene"])))
	ngene <- length(gene.list)
	gene.sum <- as.data.frame(matrix(nrow=ngene,ncol=3))
	gene.sum[,1] <- gene.list
	names(gene.sum) <- c("Gene","Hits","MeanGS")
	for(i in 1:ngene) {
		gene <- gene.list[i]
		temp <- results[is.element(results[,"Gene"],gene),]
		gene.sum[i,"Hits"] <- dim(temp)[1]
		gene.sum[i,"MeanGS"] <- mean(as.numeric(temp[,"GeneScore"]))
	}
	outfile <- paste("output/potent_specific_gene_summary_",cutoff,".txt",sep="")
	write.table(gene.sum,file=outfile, row.names=F, append=FALSE, quote=F, sep = "\t")

	# summary by chemical: number of pairs, mean gene score and CHEMS annotation
	code.list <- as.character(sort(uniquify(results[,"CODE"])))
	nchem <- length(code.list)
	chem.sum <- as.data.frame(matrix(nrow=nchem,ncol=7))
	chem.sum[,1] <- code.list
	names(chem.sum) <- c("CODE","Name","UseCategory","StructureCategory","IntendedTarget","Hits","MeanGS")
	for(i in 1:nchem) {
		code <- code.list[i]
		temp <- results[is.element(results[,"CODE"],code),]
		chem.sum[i,"Hits"] <- dim(temp)[1]
		chem.sum[i,"MeanGS"] <- mean(as.numeric(temp[,"GeneScore"]))
		chem.sum[i,"Name"] <- CHEMS[code,"ShortName"]
		chem.sum[i,"UseCategory"] <- CHEMS[code,"UseCategory"]
		chem.sum[i,"StructureCategory"] <- CHEMS[code,"StructureCategory"]
		chem.sum[i,"IntendedTarget"] <- CHEMS[code,"IntendedTarget"]
	}
	outfile <- paste("output/potent_specific_chemical_summary_",cutoff,".txt",sep="")
	write.table(chem.sum,file=outfile, row.names=F, append=FALSE, quote=F, sep = "\t")

    browser()
}
#--------------------------------------------------------------------------------------
#
# summarize for a target class
#
# Prints the number and the short names of the chemicals whose IntendedTarget
# text contains a word that matches (as a regular expression, substring match)
# one of the genes of the target class. Ends in browser().
#
# target.class: class to summarize; only "GPCR" is implemented, any other value
#               selects no chemicals and raises a warning
# cutoff:       not used by this function
#
# Globals read: ASSAY.INFO (columns Source, Gene_Process),
#               CHEMS (columns CODE, IntendedTarget, ShortName; rows named by CODE)
#
# QC=OK
#--------------------------------------------------------------------------------------
target.class.summary <- function(target.class="GPCR",cutoff=5) {
    cat("==========================================================================\n")
    cat("target.class.summary\n")
    cat("==========================================================================\n")
    flush.console()

    # GPCR genes: everything assayed by Novascreen_GPCR plus a fixed set of extras
    gpcr.list <- sort(uniquify(toupper(ASSAY.INFO[is.element(ASSAY.INFO[,"Source"],"Novascreen_GPCR"),"Gene_Process"])))
    gpcr.list <- c(gpcr.list,"HTR1B","DRD3","BDKRB1","ADRA2A","DRD2","HTR2A")
    gpcr.list <- sort(uniquify(gpcr.list))

    do.gpcr <- identical(target.class,"GPCR")
    if(!do.gpcr) {
        warning("target.class.summary: only target.class=\"GPCR\" is implemented; no chemicals will be selected",call.=FALSE)
    }

    code.list <- CHEMS[,"CODE"]
    hit <- rep(FALSE,length(code.list))
    if(do.gpcr) {
        for(i in seq_along(code.list)) {
            temp <- CHEMS[code.list[i],"IntendedTarget"]
            # keep "ion channel" as a single word when splitting on spaces
            temp <- str_replace_all(temp,"ion channel","ion_channel")
            temp <- str_replace_all(temp,"Ion channel","ION_CHANNEL")
            tlist <- str_split(toupper(temp)," ")[[1]]
            # drop separators/markers, and empty or missing words, which would
            # otherwise reach if() as NA or match every gene
            tlist <- tlist[!is.element(tlist,c(" ","]","[","(?)"))]
            tlist <- tlist[!is.na(tlist) & tlist!=""]

            for(target in tlist) {
                # the words AR and RNA are never counted as a GPCR match
                if(is.element(target,c("AR","RNA"))) next
                if(length(grep(target,gpcr.list))>0) {
                    hit[i] <- TRUE
                    break
                }
            }
        }
    }

    code.list <- code.list[hit]
    cat("Chemicals targeting class:",target.class,":",length(code.list),"\n")
    name.list <- CHEMS[code.list,"ShortName"]
    print(name.list)
    #outfile <- paste("output/potent_specific_target_class_summary_",cutoff,"_",target.class,".txt",sep="")
    #write.table(chem.sum,file=outfile, row.names=F, append=FALSE, quote=F, sep = "\t")

    browser()
}
#--------------------------------------------------------------------------------------
#
# Emax x Z-score
#
# For every assay source with assays of type "gene" or "functional", pools the
# Emax values and normalized Z-scores of its assays, keeps the positions where
# both are present, draws a histogram of Emax and a scatter plot of Emax against Z.
#
# to.file: if TRUE plot to plots/emax_z.pdf, otherwise to the current device
#
# Globals read: ASSAY.INFO, EMAXMAT, MAT.ZSCORE.NORM
#
# NOTE(review): browser() is called after every histogram regardless of to.file;
# this looks like a debugging leftover - confirm before running unattended.
#
# QC=OK
#--------------------------------------------------------------------------------------
emax.z <- function(to.file=F) {
    rule <- "==========================================================================\n"
    cat(rule)
    cat("emax.z\n")
    cat(rule)
    flush.console()
    if(to.file) {
        pdf(file=paste("plots/emax_z.pdf",sep=""),width=7,height=10,pointsize=12,bg="white",paper="letter",pagecentre=T)
    }
    par(mfrow=c(3,2),mar=c(6,6,4,4))

    # flag assay types: gene/functional -> 1, cytotox-like -> 0, anything else
    # keeps its own type name and therefore never equals 1
    type.flag <- ASSAY.INFO[,"Type"]
    type.flag[is.element(type.flag,c("gene","functional"))] <- 1
    type.flag[is.element(type.flag,c("cytotox","cytotox_fail","proliferation"))] <- 0
    assay.info.sub <- ASSAY.INFO[type.flag==1,]

    source.list <- sort(uniquify(assay.info.sub[,"Source"]))
    for(s in 1:length(source.list)) {
        aset <- source.list[s]
        cat(aset,"\n")

        # assays of this source that have a column in EMAXMAT
        in.source <- is.element(assay.info.sub[,"Source"],aset)
        assay.list <- assay.info.sub[in.source,"Assay"]
        assay.list <- assay.list[is.element(assay.list,names(EMAXMAT))]

        emax.vals <- EMAXMAT[,assay.list]
        z.vals <- MAT.ZSCORE.NORM[,assay.list]
        # every source except "ACEA" is flattened to plain numeric vectors
        if(aset!="ACEA") {
            z.vals <- as.numeric(as.matrix(z.vals))
            emax.vals <- as.numeric(as.matrix(emax.vals))
        }

        # first drop positions without a Z-score, then positions without an Emax
        z.present <- !is.na(z.vals)
        emax.vals <- emax.vals[z.present]
        z.vals <- z.vals[z.present]
        cat(length(z.vals),"\n")
        emax.present <- !is.na(emax.vals)
        z.vals <- z.vals[emax.present]
        emax.vals <- emax.vals[emax.present]
        cat(length(z.vals),"\n")
        hist(emax.vals)

        browser()
        if(length(z.vals)>0) {
            plot(emax.vals~z.vals,xlab="Z",ylab="Emax",main=aset,cex.lab=1.5,cex.axis=1.5)
            if(!to.file) browser()
        }
    }
    if(to.file) {
        dev.off()
    } else {
        browser()
    }
}
#--------------------------------------------------------------------------------------
#
# Plot the RTK data
#
# For every chemical that has a peak value in vLiver.rat.peak.values, a row in
# output/cytotox_dist.txt and a row in RTK/CHR_MTD.txt, converts the cytotoxicity
# burst (mean and mean +/- 3 * cytotox.sd.global, from -log10 scale times 1e6) to
# a dose by dividing by the peak value, and plots it as a horizontal bar together
# with the chemical's highest "hdt" dose as a point. Chemicals are ordered bottom
# to top by that dose. Point colours: gray = dose inside the burst range,
# green = above it, red = below it, yellow = below it and either the intended
# target mentions ACHE or the chemical is Abamectin.
#
# to.file: if TRUE plot to plots/rtk_auc_with_burst_withToxRefDB.pdf, otherwise
#          plot to the current device and stop in browser() at the end
# ymin:    lower limit of the y axis
# ymax:    accepted for compatibility; the value passed in is ignored because the
#          upper limit is always set to (number of chemicals plotted) + 3
#
# Globals read: CHEMS
# Files read:   output/by_chemical_hit_dist.txt, output/cytotox_dist.txt,
#               RTK/CHR_MTD.txt, ../RatAUC/ratvalues-better-073012.RData
#
# QC=OK
#--------------------------------------------------------------------------------------
rtk.auc.plot <- function(to.file=F,ymin=0.00,ymax=0.2) {
    cat("==========================================================================\n")
    cat("rtk.plot\n")
    cat("==========================================================================\n")
    flush.console()
    # NOTE(review): adata is read but not used by the plot; kept so that it is
    # still available when inspecting the function in browser()
    file <- "output/by_chemical_hit_dist.txt"
    adata <- read.table(file,header=T,sep="\t",stringsAsFactors=F,quote="",comment.char="")
    file <- "output/cytotox_dist.txt"
    cytotox.data <- read.table(file,header=T,sep="\t",stringsAsFactors=F,quote="",comment.char="")
    rownames(cytotox.data) <- cytotox.data[,"CODE"]
    file <- "RTK/CHR_MTD.txt"
    trdata <- read.table(file,header=T,sep="\t",stringsAsFactors=F,quote="",comment.char="")
    # presumably defines vLiver.rat.peak.values (named by CASRN), which is not
    # created anywhere else in this function - confirm
    load("../RatAUC/ratvalues-better-073012.RData")
    if(to.file) {
        fname <- paste("plots/rtk_auc_with_burst_withToxRefDB.pdf",sep="")
        pdf(file=fname,width=7,height=10,pointsize=12,bg="white",paper="letter",pagecentre=T)
        # close the device on every exit path, including errors while plotting
        on.exit(dev.off(),add=TRUE)
    }
    par(mfrow=c(1,1),mar=c(6,6,4,4))

    # result vectors are allocated once and trimmed to the number of hits afterwards
    nchem <- length(vLiver.rat.peak.values)
    code.list <- character(nchem)
    cname.list <- character(nchem)
    burst.mean.list <- numeric(nchem)
    burst.max.list <- numeric(nchem)
    burst.min.list <- numeric(nchem)
    peak.mtd.list <- numeric(nchem)
    peak.lel.list <- numeric(nchem)
    ache.list <- logical(nchem)
    nhit <- 0

    for(i in seq_len(nchem)) {
        casrn <- names(vLiver.rat.peak.values)[i]
        # chemical code = "C" + CASRN without dashes
        code <- paste("C",str_replace_all(casrn,"-",""),sep="")
        peak <- vLiver.rat.peak.values[i]
        # the chemical must be known to all three data sources
        if(!is.element(code,CHEMS[,"CODE"])) next
        if(!is.element(code,cytotox.data[,"CODE"])) next
        if(!is.element(casrn,trdata[,"chemical_casrn"])) next

        temp <- CHEMS[is.element(CHEMS[,"CODE"],code),]
        cname <- temp[1,"ShortName"]
        target <- temp[1,"IntendedTarget"]
        ache <- F
        if(!is.na(target)) {
            if(sum(grep("ACHE",target))>0) ache <- T
        }

        burst.mean.log <- cytotox.data[code,"cytotox.mean"]
        burst.mad.log <- cytotox.data[code,"cytotox.sd.global"]
        if(is.na(burst.mean.log)) next
        if(burst.mean.log<4) next

        # 10^(-x) * 1e6 (presumably -log10(M) to uM - confirm)
        burst.mean <- 10**(-burst.mean.log)*1000000
        burst.max <- 10**(-(burst.mean.log-3*burst.mad.log))*1000000
        burst.min <- 10**(-(burst.mean.log+3*burst.mad.log))*1000000

        temp.2 <- trdata[is.element(trdata[,"chemical_casrn"],casrn),]
        if(dim(temp.2)[1]==0 || is.na(burst.mean)) next
        mtd <- max(temp.2[,"hdt"])
        # peak.lel is collected but not drawn (the LEL marker is commented out below)
        lel <- as.numeric(min(temp.2[,"lel_dose"]))

        nhit <- nhit+1
        code.list[nhit] <- code
        cname.list[nhit] <- cname
        burst.mean.list[nhit] <- burst.mean / peak
        burst.max.list[nhit] <- burst.max / peak
        burst.min.list[nhit] <- burst.min / peak
        peak.mtd.list[nhit] <- mtd
        peak.lel.list[nhit] <- lel
        ache.list[nhit] <- ache
    }
    keep <- seq_len(nhit)
    code.list <- code.list[keep]
    cname.list <- cname.list[keep]
    burst.mean.list <- burst.mean.list[keep]
    burst.max.list <- burst.max.list[keep]
    burst.min.list <- burst.min.list[keep]
    peak.mtd.list <- peak.mtd.list[keep]
    peak.lel.list <- peak.lel.list[keep]
    ache.list <- ache.list[keep]

    ##############################################################
    # the ymax argument is overwritten here (see header)
    ymax <- nhit+3

    # plot order: ascending dose; order() keeps positions aligned with the other
    # vectors even if a dose is NA (NA sorts last)
    index <- order(peak.mtd.list)
    plot(0~0,cex.lab=1.5,log="x",cex.axis=1.5,xlim=c(0.001,10000),ylim=c(ymin,ymax),xlab="Dose (mg/kg/day)", ylab="",type="n",main="")
    for(y in seq_len(nhit)) {
        k <- index[y]
        cname <- cname.list[k]
        ache <- ache.list[k]
        burst.max <- burst.max.list[k]
        burst.min <- burst.min.list[k]
        peak.mtd <- peak.mtd.list[k]

        # a comparison involving NA counts as "not below" / "not above"
        below <- !is.na(peak.mtd) && !is.na(burst.min) && peak.mtd<burst.min
        above <- !is.na(peak.mtd) && !is.na(burst.max) && peak.mtd>burst.max

        color <- "gray"
        if(below) color <- "red"
        if(above) color <- "green"
        if(color=="red" && (ache || is.element(cname,"Abamectin"))) color <- "yellow"

        points(peak.mtd,y,pch=21,bg=color,cex=2)
        #if(color!="gray") points(peak.lel,y,pch="*",bg="black",cex=2)
        # burst range as a horizontal bar with end caps
        lines(c(burst.min,burst.max),c(y,y),lwd=1)
        lines(c(burst.min,burst.min),c(y-0.2,y+0.2),lwd=1)
        lines(c(burst.max,burst.max),c(y-0.2,y+0.2),lwd=1)

        # label only the chemicals whose dose lies outside the burst range
        if(below) text(peak.mtd,y,cname,pos=2,cex=0.8)
        if(above) text(peak.mtd,y,cname,pos=4,cex=0.8)
    }
    text(1,ymax*0.99,paste("Total Chemicals: ",nhit),pos=4)
    xlegend <- 1e-3
    points(xlegend,ymax*0.95,pch=21,bg="gray",cex=2)
    points(xlegend,ymax*0.90,pch=21,bg="green",cex=2)
    points(xlegend,ymax*0.85,pch=21,bg="red",cex=2)
    points(xlegend,ymax*0.8,pch=21,bg="yellow",cex=2)
    #points(xlegend,ymax*0.75,pch="*",bg="black",cex=2)
    text(xlegend*0.5,ymax,"Conc @MTD",pos=4)
    text(xlegend,ymax*0.95,"~burst",pos=4)
    text(xlegend,ymax*0.9,">burst",pos=4)
    text(xlegend,ymax*0.85,"<burst",pos=4)
    text(xlegend,ymax*0.8,"<burst, Neurological-Limited",pos=4)
    #text(xlegend,ymax*0.75,"Systemic LEL",pos=4)
    if(!to.file) browser()

}
#--------------------------------------------------------------------------------------
#
# Examine the distribution of cytotox hits
#
# QC=OK
#--------------------------------------------------------------------------------------
cytotox.dist <- function(do.prep=T,to.file=F) {
    cat("==========================================================================\n")
    cat("cytotox.dist\n")
    cat("==========================================================================\n")
    flush.console()
    file <- "output/by_chemical_hit_dist.txt"
    temp <- read.table(file,header=T,sep="\t",stringsAsFactors=F,quote="\"")
    rownames(temp) <- temp[,"CODE"]
    if(to.file) {
        fname <- "plots/cytotox_dist_summary.pdf"
        pdf(file=fname,width=7,height=7,pointsize=12,bg="white",paper="letter",pagecentre=T)
    }
    par(mfrow=c(1,1),mar=c(4,4,3,3))

    temp.1 <- temp[temp[,"AssaysTested"]>500,]
    ymax <- 30
    zpm <- c()
    zpm.class <- c()
    nchem <- dim(temp.1)[1]
    cytotox.assay.set <- ASSAY.INFO[is.element(ASSAY.INFO[,"Type"],"cytotox"),"Assay"]
    cytotox.assay.set <- c(cytotox.assay.set,"APR_CellCycleArrest_24h_up")
    cytotox.assay.set <- c(cytotox.assay.set,"APR_CellCycleArrest_72h_up")
    cytotox.assay.set.small <- c(cytotox.assay.set,"ATG_NRF2_ARE_CIS")
    zt.all <- c()
    perc.z.in <- c()
    perc.z.out <- c()

    stress <- CYTOTOX
    stress <- cbind(stress,stress[,5])
    stress <- cbind(stress,stress[,5])
    stress <- cbind(stress,stress[,5])
    stress <- cbind(stress,stress[,5])
    stress <- cbind(stress,stress[,5])
    names(stress)[6] <- "cytotox"
    names(stress)[7] <- "APR_OxidativeStress_24h_up"
    names(stress)[8] <- "ATG_NRF2_ARE_CIS"
    names(stress)[9] <- "ATG_MRE_CIS"
    names(stress)[10] <- "ATG_HSE_CIS"
    stress[,"cytotox"] <- 1000000 * 10**(-stress[,"cytotox.mean"])
    stress[,"APR_OxidativeStress_24h_up"] <- MAT.AC50[,"APR_OxidativeStress_24h_up"]
    stress[,"ATG_NRF2_ARE_CIS"] <- MAT.AC50[,"ATG_NRF2_ARE_CIS"]
    stress[,"ATG_MRE_CIS"] <- MAT.AC50[,"ATG_MRE_CIS"]
    stress[,"ATG_HSE_CIS"] <- MAT.AC50[,"ATG_HSE_CIS"]
    stress <- stress[temp.1[,"CODE"],]
    if(do.prep) {
        for(i in 1:nchem) {
            code <- temp.1[i,"CODE"]
            burst.mean <- temp.1[code,"Burst.median"]
            burst.sd <- temp.1[code,"Burst.mad"]
            y <- 100*temp.1[code,"HitRatio"]
            denom <- temp.1[code,"AssaysTested"]
            if(!is.na(burst.mean) && !is.na(burst.sd)) {
                zpm <- c(zpm,y)
                if(burst.mean==3) zpm.class <- c(zpm.class,0)
                else {
                    zpm.class <- c(zpm.class,1)
                    zt <- MAT.ZSCORE.NORM[code,]
                    zt <- zt[!is.na(zt)]
                    zt.all <- c(zt.all,zt)
                    n <- length(zt)
                    perc.out <- 100*length(zt[zt>=3])/denom
                    perc.in <- 100*length(zt[zt<3])/denom
                    perc.z.in <- c(perc.z.in,perc.in)
                    perc.z.out <- c(perc.z.out,perc.out)
                }
            }
        }
        ZPM <<- zpm
        ZPM.CLASS <<- zpm.class
        PERC.Z.IN <<- perc.z.in
        PERC.Z.OUT <<- perc.z.out
        ZT.ALL <<- zt.all
    }
    h.0 <- ZPM[ZPM.CLASS==0]
    h.1 <- ZPM[ZPM.CLASS==1]
    n.0 <- length(h.0)
    n.1 <- length(h.1)
    result.01 <- t.test(h.0,h.1,alternative="less")

    zpm <- ZPM
    zpm.class <- ZPM.CLASS
    perc.z.out <- PERC.Z.OUT
    h.2 <- PERC.Z.OUT
    result.02 <- t.test(h.0,h.2,alternative="less")

    perc.z.out.class <- perc.z.out
    perc.z.out.class[] <- 2
    zpm <- c(zpm,perc.z.out)
    zpm.class <- c(zpm.class,perc.z.out.class)
    result <- boxplot(zpm~zpm.class,xlab="",ylab="%Hit (total)",names=c("Cytotox(-)","Cytotox(+)","Cytotox(+),Z>3"),cex.axis=1.2,cex.lab=1.2,ylim=c(0,35))
    text(label=paste("Cytotox(-): ",n.0,"chemicals"),x=0.5,y=20,pos=4,cex=1.1)
    text(label=paste("Cytotox(+): ",n.1,"chemicals"),x=0.5,y=17,pos=4,cex=1.1)
    text(label=paste("p: ",format(result.01$p.value,digits=2)),x=2,y=36,pos=1,cex=1.1)
    text(label=paste("p: ",format(result.02$p.value,digits=2)),x=3,y=36,pos=1,cex=1.1)
    text(label=paste("median: ",format(result$stats[3,1],digits=2),"%",sep=""),x=1,y=34,pos=1,cex=1.1)
    text(label=paste("median: ",format(result$stats[3,2],digits=2),"%",sep=""),x=2,y=34,pos=1,cex=1.1)
    text(label=paste("median: ",format(result$stats[3,3],digits=2),"%",sep=""),x=3,y=34,pos=1,cex=1.1)
    x <- stress[,"cytotox"]
    y <- stress[,"APR_OxidativeStress_24h_up"]
    plot(y~x,log="xy",xlab="Median Cytotoxicity AC50 (uM)",ylab="Stress AC50(uM)",xlim=c(0.1,200),ylim=c(0.1,200),cex.lab=1.2,cex.axis=1.2,type="n")
    lines(c(0.01,1000),c(0.01,1000),lwd=2)
    lines(c(0.1,1000),c(0.01,100),lwd=1)
    lines(c(0.01,100),c(0.1,1000),lwd=1)
	mask <- x
	mask[mask>=1000] <- 0
	mask[mask>0] <- 1
	denom <- x[mask==1]
    y <- stress[,"APR_OxidativeStress_24h_up"]
	points(y~x,pch=21,bg="red",cex=1)
	num <- y[mask==1]
	f.1 <- length(num[num>denom])/length(denom)

    y <- stress[,"ATG_NRF2_ARE_CIS"]
	points(y~x,pch=21,bg="green",cex=1)
	num <- y[mask==1]
	f.2 <- length(num[num>denom])/length(denom)

    y <- stress[,"ATG_MRE_CIS"]
	points(y~x,pch=21,bg="blue",cex=1)
	num <- y[mask==1]
	f.3 <- length(num[num>denom])/length(denom)

    y <- stress[,"ATG_HSE_CIS"]
	points(y~x,pch=21,bg="gray",cex=1)
	num <- y[mask==1]
	f.4 <- length(num[num>denom])/length(denom)

	text(x=0.08,y=150,"% Hits AC50 > cytotox median",cex=1.2,pos=4)
	text(x=0.1,y=100,paste("Oxidative Stress:",format(100*f.1,digits=3),"%"),pos=4); points(x=0.1,y=100,pch=21,bg="red",cex=1.5)
	text(x=0.1,y=70,paste("NRF2:",format(100*f.2,digits=3),"%"),pos=4); points(x=0.1,y=70,pch=21,bg="green",cex=1.5)
	text(x=0.1,y=50,paste("MRE:",format(100*f.3,digits=3),"%"),pos=4); points(x=0.1,y=50,pch=21,bg="blue",cex=1.5)
	text(x=0.1,y=35,paste("HSE:",format(100*f.4,digits=3),"%"),pos=4); points(x=0.1,y=35,pch=21,bg="gray",cex=1.5)


    par(mfrow=c(2,2),mar=c(4,4,3,3))

    x <- stress[,"cytotox"]
    y <- stress[,"APR_OxidativeStress_24h_up"]
	mask1 <- x
	mask1[mask1>=1000] <- 0
	mask1[mask1>0] <- 1
	mask1[is.na(x)] <- 0

	mask2 <- y
	mask2[mask2>=1000] <- 0
	mask2[mask2>0] <- 1
	mask2[is.na(y)] <- 0

	mask <- mask1*mask2
	denom <- x[mask==1]
	num <- y[mask==1]
	z <- num/denom
	z <- z[!is.na(z)]

	breaks.min <- 0.01
    breaksA <- breaks.min
    for(i in 1:14) breaksA <- c(breaksA,2*breaksA[length(breaksA)])
    z <- z[z<breaksA[length(breaksA)]]
    z <- z[z>breaksA[1]]
    xA <- hist(z,breaks=breaksA,plot=F)
    ymax <- max(xA$counts)
    hist.log(breaksA,xA$counts,ylim=c(0,ymax),xlab="AC50(stress)/AC50(Cytotox)",ylab="Hits",main="Oxidative Stress",1000000,1000000,1000000)
	lines(c(1,1),c(0,ymax),lwd=2,col="red")
	lines(c(10,10),c(0,ymax),lwd=1,col="red")
	lines(c(0.1,0.1),c(0,ymax),lwd=1,col="red")

    x <- stress[,"cytotox"]
    y <- stress[,"ATG_NRF2_ARE_CIS"]
	mask1 <- x
	mask1[mask1>=1000] <- 0
	mask1[mask1>0] <- 1
	mask1[is.na(x)] <- 0

	mask2 <- y
	mask2[mask2>=1000] <- 0
	mask2[mask2>0] <- 1
	mask2[is.na(y)] <- 0

	mask <- mask1*mask2
	denom <- x[mask==1]
	num <- y[mask==1]
	z <- num/denom
	z <- z[!is.na(z)]

	breaks.min <- 0.01
    breaksA <- breaks.min
    for(i in 1:14) breaksA <- c(breaksA,2*breaksA[length(breaksA)])
    z <- z[z<breaksA[length(breaksA)]]
    z <- z[z>breaksA[1]]
    xA <- hist(z,breaks=breaksA,plot=F)
    ymax <- max(xA$counts)
    hist.log(breaksA,xA$counts,ylim=c(0,ymax),xlab="AC50(stress)/AC50(Cytotox)",ylab="Hits",main="NRF2",1000000,1000000,1000000)
	lines(c(1,1),c(0,ymax),lwd=2,col="red")
	lines(c(10,10),c(0,ymax),lwd=1,col="red")
	lines(c(0.1,0.1),c(0,ymax),lwd=1,col="red")

    x <- stress[,"cytotox"]
    y <- stress[,"ATG_MRE_CIS"]
	mask1 <- x
	mask1[mask1>=1000] <- 0
	mask1[mask1>0] <- 1
	mask1[is.na(x)] <- 0

	mask2 <- y
	mask2[mask2>=1000] <- 0
	mask2[mask2>0] <- 1
	mask2[is.na(y)] <- 0

	mask <- mask1*mask2
	denom <- x[mask==1]
	num <- y[mask==1]
	z <- num/denom
	z <- z[!is.na(z)]

	breaks.min <- 0.01
    breaksA <- breaks.min
    for(i in 1:14) breaksA <- c(breaksA,2*breaksA[length(breaksA)])
    z <- z[z<breaksA[length(breaksA)]]
    z <- z[z>breaksA[1]]
    xA <- hist(z,breaks=breaksA,plot=F)
    ymax <- max(xA$counts)
    hist.log(breaksA,xA$counts,ylim=c(0,ymax),xlab="AC50(stress)/AC50(Cytotox)",ylab="Hits",main="MRE",1000000,1000000,1000000)
	lines(c(1,1),c(0,ymax),lwd=2,col="red")
	lines(c(10,10),c(0,ymax),lwd=1,col="red")
	lines(c(0.1,0.1),c(0,ymax),lwd=1,col="red")

    x <- stress[,"cytotox"]
    y <- stress[,"ATG_HSE_CIS"]
	mask1 <- x
	mask1[mask1>=1000] <- 0
	mask1[mask1>0] <- 1
	mask1[is.na(x)] <- 0

	mask2 <- y
	mask2[mask2>=1000] <- 0
	mask2[mask2>0] <- 1
	mask2[is.na(y)] <- 0

	mask <- mask1*mask2
	denom <- x[mask==1]
	num <- y[mask==1]
	z <- num/denom
	z <- z[!is.na(z)]

	breaks.min <- 0.01
    breaksA <- breaks.min
    for(i in 1:14) breaksA <- c(breaksA,2*breaksA[length(breaksA)])
    z <- z[z<breaksA[length(breaksA)]]
    z <- z[z>breaksA[1]]
    xA <- hist(z,breaks=breaksA,plot=F)
    ymax <- max(xA$counts)
    hist.log(breaksA,xA$counts,ylim=c(0,ymax),xlab="AC50(stress)/AC50(Cytotox)",ylab="Hits",main="HSE",1000000,1000000,1000000)
	lines(c(1,1),c(0,ymax),lwd=2,col="red")
	lines(c(10,10),c(0,ymax),lwd=1,col="red")
	lines(c(0.1,0.1),c(0,ymax),lwd=1,col="red")

    if(to.file) dev.off()
    else browser()
}
#--------------------------------------------------------------------------------------
#
# export the hits in a long format
#
# QC=OK
#--------------------------------------------------------------------------------------
export.chem.hits <- function(set.name="spill_analogs",code.list=c("C10042598","C104767","C108930","C111273","C111706","C111875","C112301","C112425","C112538","C112709","C112721","C116029","C123513","C143088","C2433149","C3452979","C36653824","C5349519","C629765","C70568604","C8000417","C96413","C98522")) {
    # Export the active (hit) chemical-assay pairs in long format, one line per pair.
    #
    # set.name:  label appended to the output file name
    #            (output/chemical_hits_long_<set.name>.txt). If NA, every chemical in
    #            CHEMS is exported to output/chemical_hits_long.txt instead.
    # code.list: chemical CODEs to export; ignored (replaced by CHEMS[,"CODE"]) when
    #            set.name is NA.
    #
    # Reads the globals CHEMS, MAT.hitcall, MAT.AC50, MAT.ZSCORE.NORM, EMAXMAT,
    # AC50MODMAT and ASSAY.INFO. Every exported line is also echoed to the console.
    cat("==========================================================================\n")
    cat("export.chem.hits\n")
    cat("==========================================================================\n")
    flush.console()
    file <- "output/chemical_hits_long.txt"
    if(!is.na(set.name)) file <- paste("output/chemical_hits_long_",set.name,".txt",sep="")
    s <- "CODE\tCASRN\tName\tShortName\tAssaysTested\tHits\tHitRatio\tCategory\tIntendedTarget\tGene\tAssay\tmodifier\tAC50\tEmax\tZ\n"
    cat(file=file,s,append=FALSE)
    if(is.na(set.name)) code.list <- CHEMS[,"CODE"]

    # seq_along: an empty code list must produce a header-only file, not iterate over 1:0
    for(i in seq_along(code.list)) {
        code <- code.list[i]
        casrn <- CHEMS[code,"CASRN"]
        cname <- CHEMS[code,"Name"]
        sname <- CHEMS[code,"ShortName"]
        target <- CHEMS[code,"IntendedTarget"]
        ccat <- CHEMS[code,"StructureCategory"]
        tline <- MAT.hitcall[code,]

        # assays tested = non-missing hit calls
        ntry <- sum(!is.na(tline))

        # hit calls with untested assays counted as 0
        hits <- tline
        hits[is.na(tline)] <- 0
        nhit <- sum(hits)

        hit.ratio <- nhit / max(1,ntry)
        if(nhit>0) {
            # NOTE(review): the hit mask comes from MAT.hitcall but is applied to the
            # column names of MAT.AC50 - assumes both share the same column order.
            alist <- names(MAT.AC50)[hits==1]
            # iterating over the values keeps the loop empty when no call equals exactly 1
            for(assay in alist) {
                gene <- ASSAY.INFO[is.element(ASSAY.INFO[,"Assay"],assay),"Gene_Process"]
                ac50 <- MAT.AC50[code,assay]
                emax <- EMAXMAT[code,assay]
                modifier <- AC50MODMAT[code,assay]
                z <- MAT.ZSCORE.NORM[code,assay]
                s <- paste(code,"\t",casrn,"\t",cname,"\t",sname,"\t",ntry,"\t",nhit,"\t",format(hit.ratio,digits=2),"\t",ccat,"\t",target,"\t",gene,"\t",assay,"\t",modifier,"\t",format(ac50,digits=3),"\t",format(emax,digits=3),"\t",format(z,digits=2),"\n",sep="")
                cat(s)
                cat(file=file,s,append=TRUE)
                flush.console()
            }
        }
    }
}
#######################################################################################
#######################################################################################
#######################################################################################
#######################################################################################
#######################################################################################
#######################################################################################
#######################################################################################
#--------------------------------------------------------------------------------------
#
# make a set of summary calculations
#
# >>> run once per update of assay data
#
#--------------------------------------------------------------------------------------
calc.1 <- function(do.load=F) {
    # Driver that regenerates the summary calculations and plots in a fixed order.
    # Intended to be run once per update of the assay data.
    #
    # do.load: if TRUE, call load.data() before anything else.
    #
    # Every step is a call into another routine of this script; outputs are
    # written by those routines (to.file=TRUE is passed wherever it is accepted).
    if(do.load) {
        load.data()
    }

    # phase set shared by several of the heat maps below
    phases.I.II <- c("Phase_I_V2","Phase_IIa","Phase_IIb","Phase_IIc")

    # hit distributions: all targets, then ESR1 only
    hit.dist(to.file=TRUE,target.gene=NA)
    hit.dist(to.file=TRUE,target.gene="ESR1")

    load.zscore()
    assay.summary(to.file=TRUE)
    export.chem.hits()

    # gene score calculations and plots
    genescore.hits(do.prep=TRUE)
    load.genescore()
    genescore.FDA(do.prep=TRUE)
    genescore.plots.1(to.file=TRUE)
    calc.genescore.matrix()
    load.genescore.matrix()
    genescore.rank()
    genescore.hm(to.file=TRUE)

    rtk.auc.plot(to.file=TRUE)
    promiscuity.by.category()
    source.summary.boxplot(to.file=TRUE)

    #
    # These only need to be done once
    #
    #####build.toxrefdb.index(do.read=T)
    #####build.toxrefdb.type(do.read=T,type="CHR",species="rat")
    toxcast.toxrefdb.assoc(do.read=TRUE)
    gene.chemsim(do.read=TRUE,cutoff1=1,cutoff2=7)
    gene.stats.summary(do.read=TRUE,to.file=TRUE)

    # heat maps
    hm.assay(phase.list=phases.I.II,hm.name="ToxCast Phase I and II",to.file=TRUE)
    hm.assay(phase.list="Phase_I_V2",hm.name="ToxCast Phase I",to.file=TRUE)
    hm.assay(phase.list=c(phases.I.II,"E1K"),hm.name="ToxCast E1K",to.file=TRUE)
    hm.cytotox(phase.list=phases.I.II,hm.name="Cytotox",to.file=TRUE)

    mw.dist(do.prep=TRUE,to.file=TRUE)
    hm.chemsim(to.file=TRUE)
    bruns.watson(to.file=TRUE)
    plot.hit.dist(do.prep=TRUE,to.file=TRUE)
    cytotox.dist(to.file=TRUE)

    # optional analyses, currently switched off
    ###one.genescore.plot(do.prep=T,to.file=T,min.score=7,gene="CYP19A1")
    ###genescore.hm.small(to.file=T,cutoff=10,cex.col=0.1,title="Mercury_Tin",code.list=c("C1118463","C1461229","C1461252","C2155706","C4342363","C683181","C753731","C76879","C587859","C62384","C7487947"))
    ###genescore.hm.small.bytarget(to.file=T,cutoff=5,cex.col=1,cex.row=0.5,title="Mitochondria",gene.list=c("TSPO","MITOFUNCTION"))
    ###assay.set(aname="PPAR",min.hit=2,assay.list=c("ATG_PPARa_TRANS","ATG_PPRE_CIS","NVS_NR_hPPARa","ATG_PPARd_TRANS","ATG_PPARg_TRANS","NVS_NR_hPPARg","Tox21_PPARg_BLA_Agonist_ratio","OT_PPARg_PPARgSRC1_1440"))
    ###promiscuity.features(do.prep=T)
    ###promiscuity.by.category()
    ###sa.neighbor()
    ###edsp21()
}
########################################################################################
#
# individual routines to load the data
#
########################################################################################
#--------------------------------------------------------------------------------------
#
# plot the range of Bruns Watson calls vs promiscuity
#
# QC=OK
#--------------------------------------------------------------------------------------
bruns.watson <- function(to.file=F,selective=T) {
    # Box plot of the observed gene-level hit fraction against the Bruns-Watson
    # class (clean / dirty / fail) read from BrunsWatson/BrunsWatsonOutput.txt.
    #
    # to.file:   if TRUE, draw into plots/bruns_watson_vs_true_promiscuty.pdf;
    #            otherwise draw on the current device and drop into browser().
    # selective: only changes the y-axis limit and where the p-value labels go.
    #
    # For each chemical in CODE.LIST the hit fraction is
    # (gene scores >= 4) / (number of gene scores) taken from GENEDATA; chemicals
    # with 200 or fewer gene scores are ignored. One-sided Wilcoxon tests compare
    # the classes pairwise and the p-values are printed and annotated on the plot.
    file <- "BrunsWatson/BrunsWatsonOutput.txt"
    bdata <- read.table(file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"",comment.char="")
    print(dim(bdata))

    bw.levels <- c("clean","dirty","fail")   # stored as class codes 0, 1, 2
    nchem <- length(CODE.LIST)
    hf <- rep(NA_real_,nchem)
    bw.class <- rep(NA_real_,nchem)
    gene.codes <- GENEDATA[,"CODE"]
    for(i in seq_len(nchem)) {
        code <- CODE.LIST[i]
        temp.gs <- GENEDATA[is.element(gene.codes,code),"GeneScore"]
        # sum(...,na.rm=TRUE): a missing gene score must not be counted as a hit
        top <- sum(temp.gs>=4,na.rm=TRUE)
        bot <- length(temp.gs)
        cat(code,top,bot,"\n")
        if(bot>200) {
            hf[i] <- top/bot
            # first matching row only, so a duplicated Title cannot yield a
            # length>1 condition; an unknown or missing Result leaves the class NA
            row <- match(code,bdata[,"Title"])
            if(!is.na(row)) bw.class[i] <- match(bdata[row,"Result"],bw.levels)-1
        }
    }
    keep <- !is.na(bw.class) & !is.na(hf)
    hf <- hf[keep]
    bw.class <- bw.class[keep]

    # one-sided Wilcoxon p-value; NA when either group is empty
    p.greater <- function(a,b) {
        if(length(a)==0 || length(b)==0) return(NA)
        wilcox.test(a,b,alternative="greater")$p.value
    }
    p.20 <- p.greater(hf[bw.class==2],hf[bw.class==0])
    p.21 <- p.greater(hf[bw.class==2],hf[bw.class==1])
    p.10 <- p.greater(hf[bw.class==1],hf[bw.class==0])

    cat("p.20:",p.20,"\n")
    cat("p.21:",p.21,"\n")
    cat("p.10:",p.10,"\n")

    if(to.file) {
        fname <- "plots/bruns_watson_vs_true_promiscuty.pdf"
        pdf(file=fname,width=7,height=7,pointsize=12,bg="white",paper="letter",pagecentre=TRUE)
    }
    par(mfrow=c(1,1),mar=c(4,4,2,2))
    ymax <- 0.4
    if(selective==T) ymax <- 0.25
    # fixed factor levels keep three boxes (and the three names) even if a class is absent
    bw.factor <- factor(bw.class,levels=0:2)
    boxplot(hf~bw.factor,xlab="Bruns-Watson Class",ylab="Hit Fraction",names=bw.levels,cex.axis=1.2,cex.lab=1.2,ylim=c(0,ymax),main="Actual vs. Predicted Promiscuity, Gene Hits")

    ytop <- 0.38
    delta <- 0.02
    if(selective==T) {
        ytop <- 0.14
        delta <- 0.01
    }
    text(1.2,ytop,paste("p(fail:clean):",format(p.20,digits=3)),pos=4)
    text(2.2,ytop,paste("p(fail:dirty):",format(p.21,digits=3)),pos=4)
    text(1.2,ytop-delta,paste("p(dirty:clean):",format(p.10,digits=3)),pos=4)

    if(to.file) dev.off()
    else browser()
}
#--------------------------------------------------------------------------------------
#
# Calculate promiscuity features
#
# QC=OK
#--------------------------------------------------------------------------------------
promiscuity.features <- function(do.prep=F) {
# NOTE: this routine is currently disabled. It prints the reminder
# "add gene selective" and returns NULL immediately; everything after the
# return() below is unreachable until those two lines are removed.
#
# What the disabled body does:
#   do.prep=TRUE: read the SMARTS feature matrix and output/by_chemical_hit_dist.txt,
#                 and cache SDATA, HF (Hits/AssaysTested) and HF.selective
#                 (SelectiveHits/AssaysTested) as globals.
#   Then, for every feature column of SDATA with a column sum of at least 5, compare
#   the hit fraction of chemicals with the feature (value 1) against those without
#   it (value 0) using a one-sided Wilcoxon test, for both HF and HF.selective, and
#   write one line per feature to output/promiscuity_features.txt.
print("add gene selective")
return()
	# ---- unreachable from here on (see note at the top of the function) ----
	if(do.prep) {
		file <- "structure_input/ToxCast_SMARTS_REDUCED_matrix_2013_03_05.txt"
		sdata <- read.table(file,header=T,sep="\t",stringsAsFactors=F,quote="\"")
		# order the chemicals by row name so sdata and cdata share the same row order
		chemset <- sort(rownames(sdata))
		sdata <- sdata[chemset,]
		SDATA <<- sdata

		file <- "output/by_chemical_hit_dist.txt"
		cdata <- read.table(file,header=T,sep="\t",stringsAsFactors=F,quote="\"")
		rownames(cdata) <- cdata[,"CODE"]
		cdata <- cdata[chemset,]

		HF <<- cdata[,"Hits"] / cdata[,"AssaysTested"]
		HF.selective <<- cdata[,"SelectiveHits"] / cdata[,"AssaysTested"]
	}

	# feature id -> SMARTS string and feature name (columns 1, 2, 3 of the index file)
	file <- "structure_input/SMARTS_index.txt"
	sindex <- read.table(file,header=T,sep="\t",stringsAsFactors=F,quote="\"",comment="")

	# drop chemicals whose hit fraction is NA; the same mask is applied to all three objects
	hf <- HF[!is.na(HF)]
	hfs <- HF.selective[!is.na(HF)]
	sdata <- SDATA[!is.na(HF),]
	fout <- "output/promiscuity_features.txt"
	s <- paste("FeatureID\tSMARTS\tFeatureName\tNchem\tmean.in\tsd.in\tmean.out\tsd.out\tp.value\tmean.in.selective\tsd.in.selective\tmean.out.selective\tsd.out.selective\tp.value.selective\n")
	cat(s,file=fout,append=F)
	nfeature <- dim(sdata)[2]
	for(i in 1:nfeature) {
		fpid <- names(sdata)[i]
		smarts <- sindex[is.element(sindex[,1],fpid),2]
		fpname <- sindex[is.element(sindex[,1],fpid),3]
		count <- sum(sdata[,i])
		# only features with a column sum of at least 5 are tested
		if(count>=5) {
			# all hits: chemicals with the feature (in) vs. without it (out)
			invals <- hf[sdata[,i]==1]
			mean.in <- mean(invals)
			sd.in <- sd(invals)
			outvals <- hf[sdata[,i]==0]
			mean.out <- mean(outvals)
			sd.out <- sd(outvals)
			p <- wilcox.test(invals,outvals,alternative="greater")$p.value

			# same comparison using the selective hit fraction
			invals.s <- hfs[sdata[,i]==1]
			mean.in.s <- mean(invals.s)
			sd.in.s <- sd(invals.s)
			outvals.s <- hfs[sdata[,i]==0]
			mean.out.s <- mean(outvals.s)
			sd.out.s <- sd(outvals.s)
			p.s <- wilcox.test(invals.s,outvals.s,alternative="greater")$p.value

			s <- paste(fpid,"\t",smarts,"\t",fpname,"\t",count,"\t",format(mean.in,digits=3),"\t",format(sd.in,digits=3),"\t",format(mean.out,digits=3),"\t",format(sd.out,digits=3),"\t",format(p,digits=3),"\t",format(mean.in.s,digits=3),"\t",format(sd.in.s,digits=3),"\t",format(mean.out.s,digits=3),"\t",format(sd.out.s,digits=3),"\t",format(p.s,digits=3),"\n",sep="")
			cat(s,file=fout,append=T)
			cat(s)
		}
		#browser()
	}
	#browser()
}
#--------------------------------------------------------------------------------------
#
# Calculate promiscuity by chemotype
#
# QC=OK
#--------------------------------------------------------------------------------------
promiscuity.chemotypes <- function(do.prep=F) {
# NOTE: this routine is currently disabled. It prints the reminder
# "add gene selective" and returns NULL immediately; everything after the
# return() below is unreachable until those two lines are removed.
#
# What the disabled body does:
#   do.prep=TRUE: read the chemotype table (rows named by CODE, feature columns
#                 from column 5 onward) and output/by_chemical_hit_dist.txt, and
#                 cache SDATA, HF (Hits/AssaysTested) and HF.selective
#                 (SelectiveHits/AssaysTested) as globals.
#   Then, for every chemotype column of SDATA with a column sum of at least 5,
#   compare the hit fraction of chemicals with the chemotype (value 1) against
#   those without it (value 0) using a one-sided Wilcoxon test, for both HF and
#   HF.selective, and write one line per chemotype to
#   output/promiscuity_by_chemotype.txt.
print("add gene selective")
return()
	# ---- unreachable from here on (see note at the top of the function) ----
	if(do.prep) {
		file <- "structure_input/Altimira_Chemotypes_2013_02_04.txt"
		sdata <- read.table(file,header=T,sep="\t",stringsAsFactors=F,quote="\"")
		rownames(sdata) <- sdata[,"CODE"]
		# keep only the feature columns (column 5 to the last)
		sdata <- sdata[,5:dim(sdata)[2]]
		# order the chemicals by row name so sdata and cdata share the same row order
		chemset <- sort(rownames(sdata))
		sdata <- sdata[chemset,]
		SDATA <<- sdata

		file <- "output/by_chemical_hit_dist.txt"
		cdata <- read.table(file,header=T,sep="\t",stringsAsFactors=F,quote="\"")
		rownames(cdata) <- cdata[,"CODE"]
		cdata <- cdata[chemset,]

		HF <<- cdata[,"Hits"] / cdata[,"AssaysTested"]
		HF.selective <<- cdata[,"SelectiveHits"] / cdata[,"AssaysTested"]
	}

	#file <- "structure_input/SMARTS_index.txt"
	#sindex <- read.table(file,header=T,sep="\t",stringsAsFactors=F,quote="\"",comment="")

	# drop chemicals whose hit fraction is NA; the same mask is applied to all three objects
	hf <- HF[!is.na(HF)]
	hfs <- HF.selective[!is.na(HF)]
	sdata <- SDATA[!is.na(HF),]
	fout <- "output/promiscuity_by_chemotype.txt"
	s <- paste("FeatureName\tNchem\tmean.in\tsd.in\tmean.out\tsd.out\tp.value\tmean.in.selective\tsd.in.selective\tmean.out.selective\tsd.out.selective\tp.value.selective\n")
	cat(s,file=fout,append=F)
	nfeature <- dim(sdata)[2]
	for(i in 1:nfeature) {
		fpid <- names(sdata)[i]
		#smarts <- sindex[is.element(sindex[,1],fpid),2]
		#fpname <- sindex[is.element(sindex[,1],fpid),3]
		count <- sum(sdata[,i])
		# only chemotypes with a column sum of at least 5 are tested
		if(count>=5) {
			# all hits: chemicals with the chemotype (in) vs. without it (out)
			invals <- hf[sdata[,i]==1]
			mean.in <- mean(invals)
			sd.in <- sd(invals)
			outvals <- hf[sdata[,i]==0]
			mean.out <- mean(outvals)
			sd.out <- sd(outvals)
			p <- wilcox.test(invals,outvals,alternative="greater")$p.value

			# same comparison using the selective hit fraction
			invals.s <- hfs[sdata[,i]==1]
			mean.in.s <- mean(invals.s)
			sd.in.s <- sd(invals.s)
			outvals.s <- hfs[sdata[,i]==0]
			mean.out.s <- mean(outvals.s)
			sd.out.s <- sd(outvals.s)
			p.s <- wilcox.test(invals.s,outvals.s,alternative="greater")$p.value

			s <- paste(fpid,"\t",count,"\t",format(mean.in,digits=3),"\t",format(sd.in,digits=3),"\t",format(mean.out,digits=3),"\t",format(sd.out,digits=3),"\t",format(p,digits=3),"\t",format(mean.in.s,digits=3),"\t",format(sd.in.s,digits=3),"\t",format(mean.out.s,digits=3),"\t",format(sd.out.s,digits=3),"\t",format(p.s,digits=3),"\n",sep="")
			cat(s,file=fout,append=T)
			cat(s)
		}
		#browser()
	}
	#browser()
}
#--------------------------------------------------------------------------------------
#
# Calculate promiscuity stats for the structure classes using gene score
#
# QC=OK
#--------------------------------------------------------------------------------------
promiscuity.by.category.by.genescore <- function(cutoff=5) {
    # Promiscuity statistics per structure category, based on gene scores.
    #
    # cutoff: a gene score >= cutoff counts as a hit.
    #
    # For each chemical in CODE.LIST the hit fraction is
    # (gene scores >= cutoff) / (number of gene scores) from GENEDATA; chemicals
    # with 200 or fewer gene scores are ignored. For every structure category
    # with more than 4 chemicals inside and more than 4 outside, the routine
    # writes the in-category mean and SD, a Z-value ((mean.in - mean.out)/sd.out)
    # and one-sided Wilcoxon p-values (hot = greater, cold = less) to
    # output/promiscuity_by_category_<cutoff>.txt, echoing each line to the console.
    n.codes <- length(CODE.LIST)
    hf <- rep(NA_real_,n.codes)
    s.class <- rep(NA_character_,n.codes)
    gene.codes <- GENEDATA[,"CODE"]   # hoisted: does not change inside the loop
    for(i in seq_len(n.codes)) {
        code <- CODE.LIST[i]
        s.class[i] <- CHEMS[code,"StructureCategory"]
        scores <- GENEDATA[is.element(gene.codes,code),"GeneScore"]
        bot <- length(scores)
        # sum(...,na.rm=TRUE): a missing gene score must not be counted as a hit
        if(bot>200) hf[i] <- sum(scores>=cutoff,na.rm=TRUE)/bot
    }
    # keep chemicals that have both a category and a hit fraction
    keep <- !is.na(s.class) & !is.na(hf)
    hf <- hf[keep]
    s.class <- s.class[keep]

    fout <- paste("output/promiscuity_by_category_",cutoff,".txt",sep="")
    s <- paste("Category\tNchem\tmean_HitRatio\tSD_HitRatio\tZ-value\tp-hot\tp-cold\n")
    cat(s,file=fout,append=FALSE)
    cat.set <- sort(uniquify(CHEMS[,"StructureCategory"]))
    for(i in seq_along(cat.set)) {
        category <- cat.set[i]
        print(category)
        in.cat <- is.element(s.class,category)
        ratio.in <- hf[in.cat]
        ratio.out <- hf[!in.cat]

        if(length(ratio.in)>4 && length(ratio.out)>4) {
            cmean.in <- mean(ratio.in)
            cmean.out <- mean(ratio.out)
            csd.in <- sd(ratio.in)
            csd.out <- sd(ratio.out)
            n.in <- length(ratio.in)
            z.value <- (cmean.in-cmean.out)/csd.out
            p.hot <- wilcox.test(ratio.in,ratio.out,alternative="greater")$p.value
            p.cold <- wilcox.test(ratio.in,ratio.out,alternative="less")$p.value
            s <- paste(category,"\t",n.in,"\t",format(cmean.in,digits=3),"\t",format(csd.in,digits=3),"\t",format(z.value,digits=3),"\t",format(p.hot,digits=3),"\t",format(p.cold,digits=3),"\n",sep="")
            cat(s,file=fout,append=TRUE)
            cat(s)
        }
    }
}
#--------------------------------------------------------------------------------------
#
# Look for structure category correlations with BSK SVM scores
#
#--------------------------------------------------------------------------------------
bsk.category <- function(do.prep.1=T,do.prep.2=F,do.calc=F) {
    # Look for structure-category correlations with the BSK SVM scores.
    #
    # do.prep.1: read the BSK SVM table and the sample table, attach CODE, Name,
    #            Category and a cytotoxicity Z value to every row, and store the
    #            result in the global BSK.SVM.1.
    # do.prep.2: from BSK.SVM.1 keep rows with Z >= 3 and collapse them to one row
    #            per chemical (colMax over the SVM columns 11:34); store the result
    #            in the global BSK.SVM.2.
    # do.calc:   for every SVM column and every category except the first, compare
    #            in-category vs. out-of-category scores with a one-sided Wilcoxon
    #            test and write input/bsk_svm_correlation.txt.
    #
    # The routine always ends in browser() for interactive inspection.
    svm.cols <- 11:34   # SVM score columns once the 4 annotation columns are prepended

    if(do.prep.1) {
        file <- "input/BSK_Supplemental_Table_8.txt"
        bsk.svm <- read.table(file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"",comment.char="")
        print(dim(bsk.svm))
        file <- "input/ToxCast_Samples_2014_01_13.txt"
        samples <- read.table(file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="",comment.char="")
        print(dim(samples))

        bsk.svm[,"Concentration"] <- bsk.svm[,"Concentration"]/1000

        tx.list <- bsk.svm[,"TXCode"]
        conc.list <- bsk.svm[,"Concentration"]
        n.tx <- length(tx.list)
        code.list <- rep(NA_character_,n.tx)
        name.list <- rep(NA_character_,n.tx)
        category.list <- rep(NA_character_,n.tx)
        # numeric from the start: avoids the numeric -> character -> numeric round trip
        z.list <- rep(NA_real_,n.tx)
        for(i in seq_len(n.tx)) {
            tx <- tx.list[i]
            stemp <- samples[is.element(samples[,"sample_id"],tx),]
            code <- stemp[1,"CODE"]
            code.list[i] <- code
            name.list[i] <- stemp[1,"ShortName"]
            category.list[i] <- CHEMS[code,"StructureCategory"]
            cmean <- CYTOTOX[code,"cytotox.mean"]
            if(is.na(cmean)) cmean <- 3   # default used when no cytotox mean is available
            csd <- CYTOTOX[code,"cytotox.sd.global"]
            conc <- conc.list[i]
            logconc <- -log10(conc/1000000)
            z.list[i] <- (logconc-cmean)/csd
        }
        z.list[is.na(z.list)] <- 0
        # prepend one column at a time so the final order is CODE, Name, Category, Z
        bsk.svm <- cbind(z.list,bsk.svm)
        bsk.svm <- cbind(category.list,bsk.svm)
        bsk.svm <- cbind(name.list,bsk.svm)
        bsk.svm <- cbind(code.list,bsk.svm)
        names(bsk.svm)[1:4] <- c("CODE","Name","Category","Z")
        BSK.SVM.1 <<- bsk.svm
    }
    if(do.prep.2) {
        z <- BSK.SVM.1[,"Z"]
        bsk.svm.2 <- BSK.SVM.1[!is.na(z) & z>=3,]
        code.list <- sort(uniquify(bsk.svm.2[,"CODE"]))
        # one row per chemical: first row of its block, SVM columns replaced by colMax
        rows <- vector("list",length(code.list))
        for(i in seq_along(code.list)) {
            temp <- bsk.svm.2[is.element(bsk.svm.2[,"CODE"],code.list[i]),]
            temp.row <- temp[1,]
            temp.row[,svm.cols] <- colMax(temp[,svm.cols])
            rows[[i]] <- temp.row
        }
        # bind once; an empty selection gives a zero-row table with the same columns
        if(length(rows)>0) bsk.svm.3 <- do.call(rbind,rows)
        else bsk.svm.3 <- bsk.svm.2[0,]
        BSK.SVM.2 <<- bsk.svm.3
    }

    if(do.calc) {
        # report the table actually used below (the original referenced BSK.SVM,
        # which this routine never creates)
        cat("BSK.SVM: ",dim(BSK.SVM.2),"\n")
        cat.list <- sort(uniquify(BSK.SVM.2[,"Category"]))
        svm.list <- names(BSK.SVM.2)[svm.cols]
        fout <- "input/bsk_svm_correlation.txt"
        s <- paste("SVM.Class\tCategory\tN.in\tmean.in\tN.out\tmean.out\tp.value\n")
        cat(s,file=fout,append=FALSE)

        clist <- as.character(BSK.SVM.2[,"Category"])
        for(svm in svm.list) {
            print(svm)
            temp <- BSK.SVM.2[,svm]
            # the first category is skipped, as in the original loop (2:ncat)
            # NOTE(review): presumably a blank/unassigned category - confirm
            for(j in seq_along(cat.list)[-1]) {
                scat <- cat.list[j]
                print(scat)
                in.cat <- is.element(clist,scat)
                in.val <- temp[in.cat]
                out.val <- temp[!in.cat]
                n.in <- length(in.val)
                mean.in <- mean(in.val)
                n.out <- length(out.val)
                mean.out <- mean(out.val)
                p.value <- wilcox.test(in.val,out.val,alternative="greater")$p.value
                s <- paste(svm,"\t",scat,"\t",n.in,"\t",format(mean.in,digits=2),"\t",n.out,"\t",format(mean.out,digits=2),"\t",format(p.value,digits=3),"\n",sep="")
                cat(s,file=fout,append=TRUE)
                cat(s)
            }
        }
    }
    browser()
}
#--------------------------------------------------------------------------------------
#
# Calculate promiscuity stats for the structure classes
#
# QC=OK
#--------------------------------------------------------------------------------------
promiscuity.by.category <- function() {
# NOTE: this routine is currently disabled. It prints the reminder
# "add gene selective" and returns NULL immediately; everything after the
# return() below is unreachable until those two lines are removed.
#
# What the disabled body does: for every structure category with more than 4
# chemicals inside and more than 4 outside, compare HitRatio (rows labelled
# "All") and SelectiveHitRatio (rows labelled "Selective") inside vs. outside the
# category, and write mean, SD, a Z-value and one-sided Wilcoxon p-values to
# output/promiscuity_by_category.txt.
#
# NOTE(review): the body uses `cdata`, but the lines that read it are commented
# out, so re-enabling the routine also requires restoring that load (or having a
# global `cdata` available).
print("add gene selective")
return()
	# ---- unreachable from here on (see note at the top of the function) ----
	#file <- "input/by_chemical_hit_dist.txt"
	#cdata <- read.table(file,header=T,sep="\t",stringsAsFactors=F,quote="\"")

	fout <- "output/promiscuity_by_category.txt"
	s <- paste("BurstClass\tCategory\tType\tNchem\tmean_HitRatio\tSD_HitRatio\tZ-value\tp-hot\tp-cold\n")
	cat(s,file=fout,append=F)
	cat.set <- sort(uniquify(cdata[,"StructureCategory"]))
	for(i in 1:length(cat.set)) {
		category <- cat.set[i]
		print(category)
		# chemicals inside / outside the current category
		temp.in <- cdata[is.element(cdata[,"StructureCategory"],category),]
		temp.out <- cdata[!is.element(cdata[,"StructureCategory"],category),]



		# --- all hits
		ratio.in <- temp.in[,"HitRatio"]
		ratio.out <- temp.out[,"HitRatio"]

		if(length(ratio.in)>4 && length(ratio.out)>4) {
			#print(length(ratio.in))
			#print(length(ratio.out))
			cmean.in <- mean(ratio.in)
			cmean.out <- mean(ratio.out)
			csd.in <- sd(ratio.in)
			csd.out <- sd(ratio.out)
			nchem <- length(ratio.in)
			# distance of the in-category mean from the out-of-category mean, in out-of-category SDs
			z.value <- (cmean.in-cmean.out)/csd.out
			p.hot <- wilcox.test(ratio.in,ratio.out,alternative="greater")$p.value
			p.cold <- wilcox.test(ratio.in,ratio.out,alternative="less")$p.value
			s <- paste("All\t",category,"\tCategory\t",nchem,"\t",format(cmean.in,digits=3),"\t",format(csd.in,digits=3),"\t",format(z.value,digits=3),"\t",format(p.hot,digits=3),"\t",format(p.cold,digits=3),"\n",sep="")
			cat(s,file=fout,append=T)
			cat(s)
		}
		# --- selective hits: same statistics on SelectiveHitRatio
		ratio.in <- temp.in[,"SelectiveHitRatio"]
		ratio.out <- temp.out[,"SelectiveHitRatio"]
		if(length(ratio.in)>4 && length(ratio.out)>4) {
			cmean.in <- mean(ratio.in)
			cmean.out <- mean(ratio.out)
			csd.in <- sd(ratio.in)
			csd.out <- sd(ratio.out)
			nchem <- length(ratio.in)
			z.value <- (cmean.in-cmean.out)/csd.out
			p.hot <- wilcox.test(ratio.in,ratio.out,alternative="greater")$p.value
			p.cold <- wilcox.test(ratio.in,ratio.out,alternative="less")$p.value
			s <- paste("Selective\t",category,"\tCategory\t",nchem,"\t",format(cmean.in,digits=3),"\t",format(csd.in,digits=3),"\t",format(z.value,digits=3),"\t",format(p.hot,digits=3),"\t",format(p.cold,digits=3),"\n",sep="")
			cat(s,file=fout,append=T)
			cat(s)
		}
	}

}
#--------------------------------------------------------------------------------------
#
# plot the range of hits as a function of MW
#
# QC=OK
#--------------------------------------------------------------------------------------
mw.dist <- function(do.prep=F,to.file=F) {
    # Box plots of the per-chemical hit fraction as a function of molecular weight.
    #
    # do.prep: if TRUE, recompute the hit fractions and MW bins and cache them in
    #          the globals FRACTION.HIT, FRACTION.HIT.PS, GROUPS, mw.min and mw.max.
    #          If FALSE, those globals must already exist from an earlier call.
    # to.file: if TRUE, draw into plots/mw_hit_dist.pdf; otherwise draw on the
    #          current device and drop into browser().
    #
    # Upper panel: hit calls from MAT.hitcall. Lower panel ("Potent and Selective
    # Hits"): assays with MAT.ZSCORE.NORM > 0. Both are divided by the number of
    # non-missing hit calls per chemical. The number above each box is the count
    # of chemicals in that MW bin; bins are (mw.min, mw.max].
    if(do.prep) {
        temp.hit <- MAT.hitcall
        temp.hit.ps <- MAT.ZSCORE.NORM
        temp.hit[is.na(temp.hit)] <- 0
        temp.hit[temp.hit>0] <- 1
        temp.hit.ps[is.na(temp.hit.ps)] <- 0
        temp.hit.ps[temp.hit.ps>0] <- 1

        # denominator: assays tested (non-missing hit call), at least 1 to avoid 0/0
        bot <- rowSums(!is.na(MAT.hitcall))
        bot[bot==0] <- 1

        fraction.hit <- rowSums(temp.hit)/bot
        fraction.hit.ps <- rowSums(temp.hit.ps)/bot

        bin.lo <- c(10,50,100,150,200,250,300,400,500,750,1000)
        bin.hi <- c(   50,100,150,200,250,300,400,500,750,1000,10000)
        mw.min <<- bin.lo
        mw.max <<- bin.hi

        # group 0 = no bin (MW missing or outside the overall range)
        groups <- fraction.hit
        groups[] <- 0
        nchem <- length(groups)
        # NOTE(review): CHEMSTRUCT is indexed by position, as in the original loop -
        # assumes its rows are in the same order as the rows of MAT.hitcall.
        mw <- CHEMSTRUCT[seq_len(nchem),"MW"]
        # the bins are contiguous, so cut() with right-closed intervals reproduces
        # the test MW > lo && MW <= hi; a missing MW yields NA instead of an error
        bin <- cut(mw,breaks=c(bin.lo,bin.hi[length(bin.hi)]),labels=FALSE)
        groups[!is.na(bin)] <- bin[!is.na(bin)]

        FRACTION.HIT <<- fraction.hit
        FRACTION.HIT.PS <<- fraction.hit.ps
        GROUPS <<- groups
    }
    in.bin <- GROUPS>0
    fraction.hit <- FRACTION.HIT[in.bin]
    fraction.hit.ps <- FRACTION.HIT.PS[in.bin]
    groups <- GROUPS[in.bin]
    ngroups <- length(mw.min)
    # fixed factor levels: one slot per MW bin even when a bin has no chemicals, so
    # the names= labels and the counts stay aligned with the boxes
    group.f <- factor(groups,levels=seq_len(ngroups))
    counts <- tabulate(groups,nbins=ngroups)
    if(to.file) {
        fname <- "plots/mw_hit_dist.pdf"
        pdf(file=fname,width=7,height=10,pointsize=12,bg="white",paper="letter",pagecentre=TRUE)
    }
    par(mfrow=c(2,1),mar=c(4,4,2,2))
    ymax <- 0.4
    boxplot(fraction.hit~group.f,xlab="MW range",ylab="Hit Fraction",names=mw.min,cex.axis=1.2,cex.lab=1.2,ylim=c(0,ymax),main="All Hits")
    text(seq_len(ngroups),ymax,counts,pos=1)

    boxplot(fraction.hit.ps~group.f,xlab="MW range",ylab="Hit Fraction",names=mw.min,cex.axis=1.2,cex.lab=1.2,ylim=c(0,ymax),main="Potent and Selective Hits")
    text(seq_len(ngroups),ymax,counts,pos=1)

    if(to.file) dev.off()
    else browser()
}
#--------------------------------------------------------------------------------------
#
# Load the phase I data
#
#--------------------------------------------------------------------------------------
loadPhaseI <- function() {
    # Load the Phase I data files (one per assay technology) and publish them as
    # globals:
    #   PHASE_I_CHEMS - the first three columns of the first file
    #   PHASE_I_DATA  - columns 4.. of every file, bound side by side
    #   PHASE_I_CODES - chemical codes: "C" + column 2 of PHASE_I_CHEMS with "-" removed
    # Rows whose code repeats an earlier row are dropped; row names of both
    # tables are the codes.
    #
    # NOTE(review): the files are bound column-wise without any key matching -
    # assumes every file lists the same chemicals in the same row order.
    cat("==========================================================================\n")
    cat("load phase I data ...\n")
    cat("==========================================================================\n")
    flush.console()
    mydate <- "20100129"
    techlist <- c("Novascreen","ACEA","Attagene","BioSeek","Cellumen","CellzDirect","NCGC")
    blocks <- vector("list",length(techlist))
    chems <- NULL
    for(i in seq_along(techlist)) {
        file <- paste("../input/Phase_I/ToxCast_",techlist[i],"_",mydate,".txt",sep="")
        temp <- read.table(file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="\"")
        print(file)
        print(dim(temp))
        if(i==1) chems <- temp[,1:3]
        # drop=FALSE keeps the column name when a file has a single data column
        blocks[[i]] <- temp[,4:dim(temp)[2],drop=FALSE]
    }
    adata <- do.call(cbind,blocks)

    code.list <- paste("C",str_replace_all(chems[,2],"-",""),sep="")
    # keep the first occurrence of every code; unlike a 2:n scan this is also
    # correct when there is only one chemical
    keep <- !duplicated(code.list)
    chems <- chems[keep,]
    adata <- adata[keep,]
    code.list <- code.list[keep]
    rownames(chems) <- code.list
    rownames(adata) <- code.list
    PHASE_I_CHEMS <<- chems
    PHASE_I_DATA <<- adata
    PHASE_I_CODES <<- code.list
}
#--------------------------------------------------------------------------------------
#
# compare the old and new minimum AC50 values
#
#--------------------------------------------------------------------------------------
comp.old.new <- function(to.file=F) {
    # Compare the old (Phase I, PHASE_I_DATA) and new (MAT.AC50) AC50 values for
    # the chemicals present in both PHASE_I_CODES and CODE.LIST.
    #
    # to.file: if TRUE, draw into ../plots/toxcast_old_new.pdf; otherwise draw on
    #          the current device and drop into browser().
    #
    # Four scatter plots (new vs. old) are drawn for all assays together and then
    # for each assay-name prefix pair: the per-chemical minimum AC50 (log axes)
    # and the 2nd, 3rd and 4th elements of quantile() (25%, 50%, 75%) of the
    # positive -log10(AC50/1e6) values. Missing AC50s are replaced by 1e6, which
    # maps to 0 on the log scale and is therefore excluded from the quantiles.

    # 25/50/75% quantiles of the positive -log10(AC50/1e6) values of every row;
    # returns a 3 x nrow matrix
    row.quartiles <- function(ac50) {
        logmat <- as.matrix(-log10(ac50/1000000))
        vapply(seq_len(nrow(logmat)),function(i) {
            vals <- logmat[i,]
            unname(quantile(vals[vals>0])[2:4])
        },numeric(3))
    }

    # one set of four plots: minimum values, then the three quantiles
    compare.panels <- function(data.old,data.new,min.title,label,qlim) {
        data.old[is.na(data.old)] <- 1000000
        data.new[is.na(data.new)] <- 1000000

        min.old <- rowMin(data.old)
        min.new <- rowMin(data.new)
        plot(min.new~min.old,log="xy",xlim=c(1e-4,100),ylim=c(1e-4,100),main=min.title,xlab="Phase I",ylab="Phase II")
        lines(c(1e-5,1e5),c(1e-5,1e5))

        q.old <- row.quartiles(data.old)
        q.new <- row.quartiles(data.new)
        for(k in 1:3) {
            old.k <- q.old[k,]
            new.k <- q.new[k,]
            title <- paste(paste("log(quantile ",k+1,") Values:",sep=""),label)
            plot(new.k~old.k,xlim=qlim,ylim=qlim,main=title,xlab="Phase I",ylab="Phase II")
            lines(c(0,8),c(0,8))
        }
    }

    if(to.file) {
        fname <- "../plots/toxcast_old_new.pdf"
        pdf(file=fname,width=7,height=7,pointsize=12,bg="white",paper="letter",pagecentre=TRUE)
    }
    par(mfrow=c(2,2),mar=c(4,4,2,2))

    code.list <- PHASE_I_CODES[is.element(PHASE_I_CODES,CODE.LIST)]

    ############################ all assays
    compare.panels(PHASE_I_DATA[code.list,],MAT.AC50[code.list,],"Minimum Values: All","All",c(4,8))

    ############################ per assay source (old prefix j pairs with new prefix j)
    prefix.list.old <- c("NVS","ATG","BSK","CLM","NCGC")
    prefix.list.new <- c("NVS","ATG","BSK","APR","Tox21")
    assay.old <- colnames(PHASE_I_DATA)
    assay.new <- colnames(MAT.AC50)
    for(j in seq_along(prefix.list.old)) {
        prefix.old <- prefix.list.old[j]
        prefix.new <- prefix.list.new[j]
        assay.old.sub <- assay.old[grep(prefix.old,assay.old)]
        assay.new.sub <- assay.new[grep(prefix.new,assay.new)]
        if(length(assay.old.sub)==0 || length(assay.new.sub)==0) {
            warning("comp.old.new: no assays match prefix pair ",prefix.old,"/",prefix.new,"; skipped")
            next
        }
        # drop=FALSE: a prefix that matches a single assay must stay a table
        compare.panels(PHASE_I_DATA[code.list,assay.old.sub,drop=FALSE],MAT.AC50[code.list,assay.new.sub,drop=FALSE],paste("Minimum Values: ",prefix.new),prefix.new,c(2,8))
    }

    if(to.file) dev.off()
    else browser()
}
#-----------------------------------------------------------------------------------
#
# median by row
#
#-----------------------------------------------------------------------------------
rowMed <- function(x) {
	# Apply median() along the first margin, i.e. once per row of x.
	# The result is handed back invisibly, so a bare top-level call prints nothing.
	invisible(apply(x,MARGIN=1,FUN=median))
}
#--------------------------------------------------------------------------------------
#
# z-score: shift the data
#
# QC=OK
#--------------------------------------------------------------------------------------
shift.zscore <- function() {
    # Subtract a per-source shift from every assay column of the global
    # MAT.ZSCORE, store the result in the global MAT.ZSCORE.NORM and write it
    # to ../output/zscore_matrix_norm.txt.
    #
    # Reads:  MAT.ZSCORE (columns addressed by names()), ASSAY.INFO (columns
    #         "Assay" and "Source"), ../output/source_z_shifts_original.txt
    #         (tab-delimited, columns "Source" and "Center1").
    # Writes: MAT.ZSCORE.NORM (global) and the output file named above.
    # Stops with an error naming the assay when the shift table does not
    # supply exactly one "Center1" value for that assay's source.
    cat("==========================================================================\n")
    cat("shift zscore\n")
    cat("==========================================================================\n")
    flush.console()
    file <- "../output/source_z_shifts_original.txt"

    zshift <- read.table(file,header=TRUE,sep="\t",stringsAsFactors=FALSE,quote="",comment.char="")
    ztemp <- MAT.ZSCORE
    assay.list <- names(MAT.ZSCORE)
    # seq_along() rather than 1:n, so an empty assay list skips the loop
    # instead of iterating over c(1,0)
    for(i in seq_along(assay.list)) {
        assay <- assay.list[i]
        # look up the source of this assay, then the shift recorded for that source
        assay.source <- ASSAY.INFO[is.element(ASSAY.INFO[,"Assay"],assay),"Source"]
        shift <- zshift[is.element(zshift[,"Source"],assay.source),"Center1"]
        cat(assay,":",assay.source,":",shift,"\n")
        # A missing shift would fail below with an unrelated message and several
        # shifts would be recycled down the column, so fail here with context.
        if(length(shift)!=1) {
            stop("shift.zscore: expected exactly one Center1 value for assay '",assay,
                 "' but found ",length(shift),call.=FALSE)
        }
        ztemp[,assay] <- ztemp[,assay] - shift
    }
    MAT.ZSCORE.NORM <<- ztemp
    outfile <- "../output/zscore_matrix_norm.txt"
    write.table(ztemp,file=outfile, row.names=TRUE, append=FALSE, quote=FALSE, sep = "\t")
}
########################################################################################
########################################################################################
########################################################################################
########################################################################################
########################################################################################
########################################################################################
########################################################################################
########################################################################################

#--------------------------------------------------------------------------------------
#
# plot histograms of hits by chemical and assay
#
# QC=OK
#--------------------------------------------------------------------------------------
plot.hit.dist <- function(do.prep=TRUE,to.file=FALSE) {
    # Draw two stacked histograms of the fraction of hits: one per row of
    # MAT.hitcall ("By Chemical") and one per column ("By Assay").
    #
    # do.prep: if TRUE, recompute the globals fraction.hit.by.chem and
    #          fraction.hit.by.assay from the global MAT.hitcall; if FALSE the
    #          existing globals are plotted as they are.
    # to.file: if TRUE, plot into plots/hit_dist.pdf and close the device;
    #          otherwise plot on the current device and drop into browser().
    if(do.prep) {
        # hit indicator: NA counts as 0, any positive value counts as 1
        # (non-positive values are left unchanged)
        temp.hit <- MAT.hitcall
        temp.hit[is.na(temp.hit)] <- 0
        temp.hit[temp.hit>0] <- 1

        # mask: 1 where a value is present, 0 where it is NA
        mask <- MAT.hitcall
        mask[!is.na(mask)] <- 1
        mask[is.na(mask)] <- 0

        # denominators of 0 are replaced by 1 to avoid dividing by zero
        bot <- rowSums(mask)
        bot[bot==0] <- 1
        top.hit <- rowSums(temp.hit)
        fraction.hit.by.chem <<- top.hit/bot

        bot <- colSums(mask)
        bot[bot==0] <- 1
        top.hit <- colSums(temp.hit)
        fraction.hit.by.assay <<- top.hit/bot
    }
    pdf.open <- FALSE
    if(to.file) {
        fname <- "plots/hit_dist.pdf"
        pdf(file=fname,width=7,height=10,pointsize=12,bg="white",paper="letter",pagecentre=TRUE)
        pdf.open <- TRUE
        # hist() stops with an error when a fraction lies outside the fixed
        # breaks; make sure the PDF device is not left open in that case
        on.exit(if(pdf.open) dev.off(), add=TRUE)
    }
    par(mfrow=c(2,1),mar=c(4,4,2,2))
    breaks <- seq(0,0.6,by=0.02)
    hist(fraction.hit.by.chem,xlab="Fraction Hits",ylab="Chemicals",cex.axis=1.2,cex.lab=1.2,main="By Chemical",freq=TRUE,breaks=breaks,ylim=c(0,1000))
    hist(fraction.hit.by.assay,xlab="Fraction Hits",ylab="Assays",cex.axis=1.2,cex.lab=1.2,main="By Assay",freq=TRUE,breaks=breaks,ylim=c(0,500))

    if(to.file) {
        # normal path: close the device here so the on.exit handler does nothing
        pdf.open <- FALSE
        dev.off()
    }
    else browser()
}
