#--------------------------------------------------------------------------------------
#
# ToxCast_varmats_loader_v01.R - code to load the ToxCast data from varmats and 
# generate all of the standard matrix files
#
# November 2014
# Richard Judson
#
# US EPA
# Questions, comments to: judson.richard@epa.gov, 919-541-3085
#--------------------------------------------------------------------------------------
#
# load all of the data and build scaled matrices
#
# CHEMS - matrix of all chemical information
# CODE.LIST - list of unique chemical codes, rownames of all matrices
# NCHEM - number of chemicals
#
# ASSAY.INFO - assay matrix
# ASSAY.LIST - vector of unique assay names - colnames of all matrices
# NASSAY - number of assays
#
# CYTOTOX - matrix of cytotoxicity parameters for each chemical
#
# Matrices (NCHEM x NASSAY)
# 
# MAT.AC50 (uM)
# MAT.AC50_loss     
# MAT.ACB MAT.ACC  MAT.AC10   
# MAT.hitcall       
# MAT.logAC50 MAT.logAC50_loss  
# MAT.min_conc  MAT.max_conc    
# MAT.model
# MAT.Emax
# MAT.T             
# MAT.T.SCALED (T scaled so that 95%-ile of hits have T=100%)
# MAT.tested        
# MAT.W
# MAT.Z             
# MAT.Z.NORM (shifted so that the first peak of the Z distribution is at ~0)
#
# Each time you use a matrix, you need to make sure that you are appropriately checking
# the tested and hitcall matrix
#
#--------------------------------------------------------------------------------------
load_and_scale <- function(suffix="141024") {
	prep_matrices(suffix)
	load_assay_defs()
	shift_zscore()
	scale_top_by_assay()
}
#--------------------------------------------------------------------------------------
#
# Create the full input AC50, T,B,Emax,... files, one row per chemical
#
#--------------------------------------------------------------------------------------
prep_matrices <- function(suffix="141024") {
    cat("==========================================================================\n")
    cat("Prepare the matrices ...\n")
    cat("==========================================================================\n")
    cat("Read in chemical data ...\n")
    flush.console()
    file <- paste(VARMATDIR,"Chemical_Summary_",suffix,".csv",sep="")
    temp <- read.csv(file,header=T,stringsAsFactors=F,quote="\"",comment.char="")
    PIPELINE.SAMPLES <<- temp
    code.list <- sort(unique(temp[,"code"]))
    nchem <- length(code.list)
    chems <- as.data.frame(matrix(nrow=nchem,ncol=4))
    names(chems) <- c("CODE","CASRN","Name","DSSTox_GSID")
    chems[,"CODE"] <- code.list
    rownames(chems) <- chems[,"CODE"]
    for(i in 1:nchem) {
        code <- code.list[i]
        ctemp <- temp[is.element(temp[,"code"],code),]
        chems[code,"CODE"] <- code
        chems[code,"CASRN"] <- ctemp[1,"casn"]
        chems[code,"Name"] <- ctemp[1,"chnm"]
        chems[code,"DSSTox_GSID"] <- paste("DSSTox_",ctemp[1,"chid"],sep="")
    }
    PIPELINE.CHEMS <<- chems
    cat("Dimension of PIPELINE.CHEMS: ",dim(PIPELINE.CHEMS),"\n")
    flush.console()

    file <- paste(VARMATDIR,"AllResults_flags_",suffix,".csv",sep="")
    temp <- read.csv(file,header=T,stringsAsFactors=F,quote="\"",comment.char="")
    x <- temp[,"chnm"]
    y <- str_replace_all(x,"\"","")
    temp[,"chnm"] <- y
    ALL.FLAGS <<- temp

	load_old_Chemical_defs()

    # create the unified chemicals set
    temp <- PIPELINE.CHEMS
    ntemp <- names(temp)
    temp <- cbind(temp,temp[,dim(temp)[2]])
    temp <- cbind(temp,temp[,dim(temp)[2]])
    temp <- cbind(temp,temp[,dim(temp)[2]])
    temp <- cbind(temp,temp[,dim(temp)[2]])
    temp <- cbind(temp,temp[,dim(temp)[2]])
    temp <- cbind(temp,temp[,dim(temp)[2]])
    temp <- cbind(temp,temp[,dim(temp)[2]])
    temp <- cbind(temp,temp[,dim(temp)[2]])
    temp <- cbind(temp,temp[,dim(temp)[2]])

    ntemp <- c(ntemp,"target_gene","use_category","use_super_category","structure_category","structure_super_category","Phase_I","Phase_II","E1K","Tox21")
    names(temp) <- ntemp
    temp[,"target_gene"] <- NA
    temp[,"use_category"] <- NA
    temp[,"use_super_category"] <- NA
    temp[,"structure_category"] <- NA
    temp[,"structure_super_category"] <- NA
    temp[,"Phase_I"] <- NA
    temp[,"Phase_II"] <- NA
    temp[,"E1K"] <- NA
    temp[,"Tox21"] <- NA
    rownames(temp) <- temp[,"CODE"]
    NCHEM <<- dim(temp)[1]

    for(i in 1:NCHEM) {
        code <- temp[i,"CODE"]
        temp2 <- OLD.CHEMS[code,]
        if(dim(temp2)[1]==1) {
            temp[i,"target_gene"] <- temp2[1,"target_gene"]
            temp[i,"use_category"] <- temp2[1,"use_category"]
            temp[i,"use_super_category"] <- temp2[1,"use_super_category"]
            temp[i,"structure_category"] <- temp2[1,"structure_category"]
            temp[i,"structure_super_category"] <- temp2[1,"structure_super_category"]
            temp[i,"Phase_I"] <- temp2[1,"Phase_I"]
            temp[i,"Phase_II"] <- temp2[1,"Phase_II"]
            temp[i,"E1K"] <- temp2[1,"E1K"]
            temp[i,"Tox21"] <- temp2[1,"Tox21"]
		}
        else {
            temp[i,"target_gene"] <- NA
            temp[i,"use_category"] <- "unknown"
            temp[i,"use_super_category"] <- "Other"
            temp[i,"structure_category"] <- "unknown"
            temp[i,"structure_super_category"] <- NA
            temp[i,"Phase_I"] <- 0
            temp[i,"Phase_II"] <- 0
            temp[i,"E1K"] <- 0
            temp[i,"Tox21"] <- 0
        }
    }

    x <- temp[,"Name"]
    y <- str_replace_all(x,"\"","")
    temp[,"Name"] <- y

    CHEMS <<- temp
    CODE.LIST <<- CHEMS[,"CODE"]

    outfile <- paste("../input/ToxCast_Chems_Master_",suffix,".txt",sep="")
    write.table(CHEMS,file=outfile, row.names=F, append=FALSE, quote=T, sep = "\t")
    cat("CHEMS read in\n")
    flush.console()

    #
    # assays
    #
    cat("Read in assay data ...\n")

    file <- paste(VARMATDIR,"AllResults_cyto_dist_",suffix,".csv",sep="")
    temp <- read.csv(file,header=T,stringsAsFactors=F,quote="\"",comment.char="")
    temp <- temp[,2:11]
    names(temp) <- c("CASRN","Name","CODE","cytotox_median_raw","cytotox_mad","nhit","global_mad","cytotox_median_log", "cytotox_median_um","cytotox_lower_bound_um")
    rownames(temp) <- temp[,"CODE"]
    x <- temp[,"Name"]
    y <- str_replace_all(x,"\"","")
    temp[,"Name"] <- y
    CYTOTOX <<- temp

    file <- paste(VARMATDIR,"AllResults_tested_Matrix_",suffix,".csv",sep="")
    temp.tested <- read.csv(file,header=T,stringsAsFactors=F,quote="\"",comment.char="", row.names = 1)

    file <- paste(VARMATDIR,"AllResults_hitc_Matrix_",suffix,".csv",sep="")
    temp.hit <- read.csv(file,header=T,stringsAsFactors=F,quote="\"",comment.char="", row.names = 1)

    file <- paste(VARMATDIR,"AllResults_modl_ga_Matrix_",suffix,".csv",sep="")
    temp.log_ac50 <- read.csv(file,header=T,stringsAsFactors=F,quote="\"",comment.char="", row.names = 1)

    file <- paste(VARMATDIR,"AllResults_modl_ac10_Matrix_",suffix,".csv",sep="")
    temp.log_ac10 <- read.csv(file,header=T,stringsAsFactors=F,quote="\"",comment.char="", row.names = 1)

    file <- paste(VARMATDIR,"AllResults_modl_acc_Matrix_",suffix,".csv",sep="")
    temp.log_acc <- read.csv(file,header=T,stringsAsFactors=F,quote="\"",comment.char="", row.names = 1)

    file <- paste(VARMATDIR,"AllResults_modl_acb_Matrix_",suffix,".csv",sep="")
    temp.log_acb <- read.csv(file,header=T,stringsAsFactors=F,quote="\"",comment.char="", row.names = 1)

    file <- paste(VARMATDIR,"AllResults_modl_la_Matrix_",suffix,".csv",sep="")
    temp.log_loss_ac50 <- read.csv(file,header=T,stringsAsFactors=F,quote="\"",comment.char="", row.names = 1)

    file <- paste(VARMATDIR,"AllResults_max_med_Matrix_",suffix,".csv",sep="")
    temp.emax <- read.csv(file,header=T,stringsAsFactors=F,quote="\"",comment.char="", row.names = 1)

    file <- paste(VARMATDIR,"AllResults_modl_gw_Matrix_",suffix,".csv",sep="")
    temp.w <- read.csv(file,header=T,stringsAsFactors=F,quote="\"",comment.char="", row.names = 1)

    file <- paste(VARMATDIR,"AllResults_modl_lw_Matrix_",suffix,".csv",sep="")
    temp.loss_w <- read.csv(file,header=T,stringsAsFactors=F,quote="\"",comment.char="", row.names = 1)

    file <- paste(VARMATDIR,"AllResults_modl_tp_Matrix_",suffix,".csv",sep="")
    temp.t <- read.csv(file,header=T,stringsAsFactors=F,quote="\"",comment.char="", row.names = 1)

    file <- paste(VARMATDIR,"AllResults_modl_Matrix_",suffix,".csv",sep="")
    temp.modl <- read.csv(file,header=T,stringsAsFactors=F,quote="\"",comment.char="", row.names = 1)

    file <- paste(VARMATDIR,"AllResults_zscore_Matrix_",suffix,".csv",sep="")
    temp.z <- read.csv(file,header=T,stringsAsFactors=F,quote="\"",comment.char="", row.names = 1)

    file <- paste(VARMATDIR,"AllResults_logc_min_Matrix_",suffix,".csv",sep="")
    temp.log_cmin <- read.csv(file,header=T,stringsAsFactors=F,quote="\"",comment.char="", row.names = 1)

    file <- paste(VARMATDIR,"AllResults_logc_max_Matrix_",suffix,".csv",sep="")
    temp.log_cmax <- read.csv(file,header=T,stringsAsFactors=F,quote="\"",comment.char="", row.names = 1)

    cat("all assay files read in\n")
    flush.console()

    temp.log_ac10[is.na(temp.log_ac10)] <- 6
    temp.log_ac50[is.na(temp.log_ac50)] <- 6
    temp.log_loss_ac50[is.na(temp.log_loss_ac50)] <- 6
    temp.log_acc[is.na(temp.log_acc)] <- 6
    temp.log_acb[is.na(temp.log_acb)] <- 6
    temp.w[is.na(temp.w)] <- 0
    temp.emax[is.na(temp.emax)] <- 0
    temp.loss_w[is.na(temp.loss_w)] <- 0
    temp.t[is.na(temp.t)] <- 0
    temp.z[is.na(temp.z)] <- 0
    temp.modl[is.na(temp.modl)] <- 0
    temp.log_cmax[is.na(temp.log_cmax)] <- 0
    temp.log_cmin[is.na(temp.log_cmin)] <- 0
    temp.tested[is.na(temp.tested)] <- 0
    cat("NA fixed\n")
    flush.console()

    temp.log_ac10[temp.hit==0] <- 6
    temp.log_ac50[temp.hit==0] <- 6
    temp.log_loss_ac50[temp.hit==0] <- 6
    temp.log_acc[temp.hit==0] <- 6
    temp.log_acb[temp.hit==0] <- 6
    temp.w[temp.hit==0] <- 0
    temp.loss_w[temp.hit==0] <- 0
    temp.t[temp.hit==0] <- 0
    cat("hit matrix applied\n")

    flush.console()
	temp.ac10 <- 10**(temp.log_ac10)
	temp.ac50 <- 10**(temp.log_ac50)
	temp.loss_ac50 <- 10**(temp.log_loss_ac50)
	temp.acc <- 10**(temp.log_acc)
	temp.acb <- 10**(temp.log_acb)
	temp.cmax <- 10**(temp.log_cmax)
	temp.cmin <- 10**(temp.log_cmin)
    cat("exponentiation\n")
	flush.console()

	temp.ac10[temp.tested==0] <- NA
	temp.ac50[temp.tested==0] <- NA
    temp.acc[temp.tested==0] <- NA
    temp.acb[temp.tested==0] <- NA
    temp.loss_ac50[temp.tested==0] <- NA
    temp.log_ac10[temp.tested==0] <- NA
    temp.log_acc[temp.tested==0] <- NA

    temp.emax[temp.tested==0] <- NA
    temp.w[temp.tested==0] <- NA
    temp.loss_w[temp.tested==0] <- NA
    temp.t[temp.tested==0] <- NA
    temp.modl[temp.tested==0] <- NA
    temp.z[temp.tested==0] <- NA
    temp.z[temp.z==0] <- NA
    temp.hit[temp.tested==0] <- NA
    temp.cmax[temp.tested==0] <- NA
    temp.cmin[temp.tested==0] <- NA

    temp.log_loss_ac50[temp.tested==0] <- NA
    temp.log_ac50[temp.tested==0] <- NA

    cat("test matrix applied\n")
    flush.console()

	MAT.AC50 <<- temp.ac50
	MAT.logAC50 <<- temp.log_ac50
    cat("AC50: ",dim(MAT.AC50),"\n")

	MAT.AC50_loss <<- temp.loss_ac50
	MAT.logAC50_loss <<- temp.log_loss_ac50
    cat("AC50_loss: ",dim(MAT.AC50_loss),"\n");flush.console()

	MAT.Emax <<- temp.emax
    cat("Emax: ",dim(MAT.Emax),"\n");flush.console()

	MAT.model <<- temp.modl
    cat("model: ",dim(MAT.model),"\n");flush.console()

    MAT.hitcall <<- temp.hit
    cat("hitcall: ",dim(MAT.hitcall),"\n");flush.console()

    MAT.T <<- temp.t
    cat("T: ",dim(MAT.T),"\n");flush.console()

    MAT.W <<- temp.w
    cat("W: ",dim(MAT.W),"\n");flush.console()

    MAT.Z <<- temp.z
    cat("Z: ",dim(MAT.Z),"\n");flush.console()

    MAT.min_conc <<- temp.cmin
    cat("min_conc: ",dim(MAT.min_conc),"\n");flush.console()

    MAT.max_conc <<- temp.cmax
    cat("min_conc: ",dim(MAT.max_conc),"\n");flush.console()

    MAT.AC10 <<- temp.ac10
    cat("AC10: ",dim(MAT.AC10),"\n");flush.console()

    MAT.ACC <<- temp.acc
    cat("ACC: ",dim(MAT.ACC),"\n");flush.console()

    MAT.ACB <<- temp.acb
    cat("ACB: ",dim(MAT.ACB),"\n");flush.console()

    MAT.tested <<- temp.tested
    cat("tested: ",dim(MAT.tested),"\n");flush.console()

    ASSAY.LIST <<- colnames(MAT.AC50)
    NASSAY <<- length(ASSAY.LIST)


    file <- "../input/gene_family.csv"
    gf <- read.csv(file,header=T,stringsAsFactors=F,quote="\"",comment.char="")
    GENE.FAMILY <<- gf
}
#--------------------------------------------------------------------------------------
#
# reload the assay definitions
#
#--------------------------------------------------------------------------------------
load_assay_defs <- function() {
	file <- paste(VARMATDIR,"Assay_Summary_modified_141031.csv",sep="")
	temp.assay <- read.csv(file,header=T,stringsAsFactors=F,quote="\"",comment.char="")
	rownames(temp.assay) <- temp.assay[,"Assay"]
	temp.assay <- temp.assay[ASSAY.LIST,]
	ASSAY.INFO <<- temp.assay
}
#--------------------------------------------------------------------------------------
#
# reload the chemical definitions
#
#--------------------------------------------------------------------------------------
load_old_chemical_defs <- function() {
	file <- "../input/ToxCast_GenericChemicals_2014_11_12.csv"
	temp.chems <- read.csv(file,header=T,stringsAsFactors=F,quote="\"",comment.char="")
	rownames(temp.chems) <- temp.chems[,"CODE"]
	OLD.CHEMS <<- temp.chems
}
#--------------------------------------------------------------------------------------
#
# z-score: shift the data
#
# QC=OK
#--------------------------------------------------------------------------------------
shift_zscore <- function() {
    cat("==========================================================================\n")
    cat("shift zscore\n")
    cat("==========================================================================\n")
    flush.console()
    file <- "../output/source_z_shifts_original.txt"

    zshift <- read.table(file,header=T,sep="\t",stringsAsFactors=F,quote="",comment="")
    ztemp <- MAT.Z
    assay.list <- names(MAT.Z)
    nassay <- length(assay.list)
    for(i in 1:nassay) {
        assay <- assay.list[i]
        source <- ASSAY.INFO[is.element(ASSAY.INFO[,"Assay"],assay),"Source"]
        shift <- zshift[is.element(zshift[,"Source"],source),"Center1"]
        cat(assay,":",source,":",shift,"\n")
        ztemp[,assay] <- ztemp[,assay] - shift
    }
    MAT.Z.NORM <<- ztemp
    outfile <- "../output/zscore_matrix_norm.txt"
    write.table(ztemp,file=outfile, row.names=T, append=FALSE, quote=F, sep = "\t")
}
#--------------------------------------------------------------------------------------
#
# Scale the top
#
#--------------------------------------------------------------------------------------
scale_top_by_assay <- function() {
    cat("==========================================================================\n")
    cat("scale.top.by.assay\n")
    cat("==========================================================================\n")

	tscale <- MAT.T

	for(i in 1:NASSAY) {
		assay <- ASSAY.INFO[i,"Assay"]
		source <- ASSAY.INFO[i,"Source"]

		scaler <- 0
		if(substr(source,1,3)=="NVS") scaler <- 1
		else if(substr(source,1,4)=="ACEA") scaler <- 1
		else if(substr(source,1,5)=="Tox21") scaler <- 1
		else if(substr(source,1,2)=="OT") scaler <- 1
		else {
			temp <- MAT.T[,assay]
			temp[temp>200] <- 200
			tested <- MAT.tested[,assay]
			hits <- MAT.hitcall[,assay]
			temp[temp<0] <- 0
			temp[is.na(temp)] <- -1
			temp[tested==0] <- -1
			temp[hits==0] <- -1
			tlist <- as.numeric(temp[temp>0])
			scaler <- 100/quantile(tlist,probs=seq(0,1,0.05))[20]
		}
		if(is.na(scaler)) scaler <- 1
		if(scaler>200) scaler <- 200
		tscale[,assay] <- MAT.T[,assay]*scaler
		if(scaler!=1) {
			cat(assay,":",scaler,"\n")
			flush.console()
		}
	}
	tscale[tscale>200] <- 200
	MAT.T.SCALED <<- tscale
}


#
#--------------------------------------------------------------------------------------

