#--------------------------------------------------------------------------------------
#
# Run all genescore functions
#
#--------------------------------------------------------------------------------------
genescore.driver <- function() {
  # Run the whole gene-score pipeline in order:
  #   1. calc.genescore        - build and write the long-format score table
  #   2. load.genescore        - read that table back
  #   3. prep.genescore.matrix - pivot it to a chemical x gene matrix
  #   4. load.genescore.matrix - read the matrix back
  #   5. genescore.hm          - cluster and draw the heatmap to a PDF
  # TRUE is spelled out because T is an ordinary variable that can be reassigned.
  calc.genescore(do.prep=TRUE,zcut=3)
  load.genescore()
  prep.genescore.matrix()
  load.genescore.matrix()
  genescore.hm(to.file=TRUE,cex.col=0.1,cex.row=0.1,cutoff=4,minhit=2)
}
#--------------------------------------------------------------------------------------
#
# build the gene-wise specific hit file
#
# QC=OK
#--------------------------------------------------------------------------------------
calc.genescore <- function(do.prep=TRUE,zcut=3) {
  # Build the long-format (chemical, gene) score table and write it to
  # ../output/genescore_by_chemical_long.xlsx.
  #
  # Args:
  #   do.prep: if TRUE, rebuild the globals STEMP (hit-masked 6-logAC50 matrix,
  #            rows restricted to the selected chemicals) and ASSAY.GENE
  #            (assay-to-gene map) from MAT.Z.NORM, MAT.tested, MAT.logAC50 and
  #            ASSAY.INFO. If FALSE, the existing STEMP and ASSAY.GENE globals
  #            are reused as they are.
  #   zcut:    entries of MAT.Z.NORM below this value count as inactive.
  #
  # For every gene, the score of a chemical is the sum of its non-NA STEMP
  # values over the gene's assays divided by (number of non-NA values - slice),
  # where slice counts assay names matching "_Activator" or "_up"; a
  # non-positive denominator is replaced by 1. Only chemicals with a non-zero
  # sum are written.
  #
  # Side effects: assigns STEMP and ASSAY.GENE (when do.prep), writes the xlsx.
  print.current.function()
  name.list <- c("CODE","CASRN","Name","StructureCategory","UseCategory","UseSuperCategory","IntendedTarget","Gene","Denominator","GeneScore")

  # Keep only chemicals flagged in at least one of Phase_I, Phase_II or E1K
  ctemp <- CHEMS[,c("CODE","Phase_I","Phase_II","E1K")]
  mask <- ctemp[,"Phase_I"]+ctemp[,"Phase_II"]+ctemp[,"E1K"]
  code.list.in.phase <- ctemp[mask>0,"CODE"]
  code.list <- CHEMS[,"CODE"]
  cat("Length of code.list without phase filter: ",length(code.list),"\n")
  code.list <- code.list[is.element(code.list,code.list.in.phase)]
  cat("Length of code.list with phase filter: ",length(code.list),"\n")
  flush.console()
  code.list <- sort(code.list)

  # intended_target may hold several space-separated genes; split them out
  gene.list.0 <- sort(uniquify(toupper(ASSAY.INFO[,"intended_target"])))
  gene.list <- sort(unique(as.character(unlist(str_split(gene.list.0," ")))))

  if(do.prep) {
    # 0/1 hit mask: NA or below-zcut z-scores and untested cells are non-hits
    ztemp <- MAT.Z.NORM
    ztemp[is.na(ztemp)] <- 0
    ztemp[ztemp<zcut] <- 0
    ztemp[MAT.tested==0] <- 0
    ztemp[ztemp>0] <- 1

    potency <- (6-MAT.logAC50)*ztemp
    STEMP <<- potency[code.list,]
    cat("finished preparing big matrices\n")
    flush.console()

    # One (Assay, Gene) row per gene named in each assay's intended_target
    assays <- ASSAY.LIST[seq_len(NASSAY)]
    genes.by.assay <- str_split(toupper(ASSAY.INFO[assays,"intended_target"])," ")
    assay.gene <- data.frame(Assay=as.character(rep(assays,lengths(genes.by.assay))),
                             Gene=as.character(unlist(genes.by.assay)),
                             stringsAsFactors=FALSE)
    assay.gene <- assay.gene[!is.na(assay.gene[,"Gene"]),]
    rownames(assay.gene) <- NULL
    # Assigned only here: with do.prep=FALSE there is nothing new to publish
    ASSAY.GENE <<- assay.gene
  }

  # Collect one data frame per gene and bind once at the end
  rows <- list()
  for(gene in gene.list) {
    assay.list <- ASSAY.GENE[is.element(ASSAY.GENE[,"Gene"],gene),"Assay"]
    slice <- sum(grepl("_Activator",assay.list))+sum(grepl("_up",assay.list))
    # drop=FALSE keeps a one-assay gene as a one-column matrix
    atemp <- as.matrix(STEMP[code.list,assay.list,drop=FALSE])
    cat("\n=======================================================\n")
    cat(gene,":",slice,":",dim(atemp),assay.list,"\n")
    cat("=======================================================\n")

    n.obs <- rowSums(!is.na(atemp))
    total <- rowSums(atemp,na.rm=TRUE)
    hit <- which(n.obs>0 & total!=0)
    if(length(hit)==0) next

    denominator <- n.obs[hit]-slice
    denominator[denominator<=0] <- 1
    code <- code.list[hit]
    rows[[length(rows)+1]] <- data.frame(
      CODE=code,
      CASRN=CHEMS[code,"CASRN"],
      Name=CHEMS[code,"Name"],
      StructureCategory=CHEMS[code,"structure_category"],
      UseCategory=CHEMS[code,"use_category"],
      UseSuperCategory=CHEMS[code,"use_super_category"],
      IntendedTarget=CHEMS[code,"target_gene"],
      Gene=rep(gene,length(hit)),   # the gene being scored, not a CHEMS field
      Denominator=as.numeric(denominator),
      GeneScore=as.numeric(total[hit]/denominator),
      stringsAsFactors=FALSE,row.names=NULL)
  }

  result <- do.call(rbind,rows)
  if(is.null(result)) {
    # No chemical scored on any gene: still write a table with the right header
    result <- as.data.frame(matrix(nrow=0,ncol=length(name.list)))
    names(result) <- name.list
  }
  file <- "../output/genescore_by_chemical_long.xlsx"
  write.xlsx(result,file)
}
#--------------------------------------------------------------------------------------
#
# read in the genescore data
#
# QC=OK
#-------------------------------------------------------------------------------------
load.genescore <- function() {
  # Read ../output/genescore_by_chemical_long.xlsx and publish the rows whose
  # GeneScore is neither NA nor +/-Inf as the global GENEDATA. The dimensions
  # before and after filtering are printed.
  print.current.function()
  scores <- read.xlsx("../output/genescore_by_chemical_long.xlsx")
  print(dim(scores))
  flush.console()

  gs <- scores[,"GeneScore"]
  # is.na() comes first so that an NA score yields TRUE rather than NA here
  unusable <- is.na(gs) | gs==Inf | gs== -Inf

  GENEDATA <<- scores[!unusable,]
  print(dim(GENEDATA))
  cat("read in GENEDATA\n")
  flush.console()
}
#--------------------------------------------------------------------------------------
#
# prepare the genescore matrix
#
#
# QC=OK
#--------------------------------------------------------------------------------------
prep.genescore.matrix <- function() {
  # Pivot the long-format GENEDATA table into a chemical x gene matrix of
  # GeneScore values (0 where a chemical has no row for a gene), publish it as
  # the global GMATRIX and write it to ../output/genescore_matrix.xlsx.
  # Only rows with Denominator > 0 and chemicals flagged Phase_I or Phase_II in
  # CHEMS are kept.
  print.current.function()
  temp <- GENEDATA[GENEDATA[,"Denominator"]>0,]
  gene.list <- sort(uniquify(temp[,"Gene"]))

  cat("Length of gene list with 1 or more assays and at least one chemical at the target: ",length(gene.list),"\n")
  code.list <- sort(uniquify(temp[,"CODE"]))

  phase.mask <- CHEMS[,"Phase_I"]+CHEMS[,"Phase_II"]
  codes.ok <- CHEMS[phase.mask>0,"CODE"]
  code.list <- code.list[is.element(code.list,codes.ok)]
  cat("Length of code list: ",length(code.list),"\n")

  mtemp <- matrix(0,nrow=length(code.list),ncol=length(gene.list))
  rownames(mtemp) <- code.list
  colnames(mtemp) <- gene.list

  # Fill every cell in one step using (row name, column name) pairs. Rows are
  # written in table order, so a repeated (CODE, Gene) pair keeps its last
  # score. An empty table simply leaves the matrix untouched.
  ok <- is.element(temp[,"CODE"],code.list) & is.element(temp[,"Gene"],gene.list)
  if(any(ok)) {
    cell <- cbind(as.character(temp[ok,"CODE"]),as.character(temp[ok,"Gene"]))
    mtemp[cell] <- temp[ok,"GeneScore"]
  }

  GMATRIX <<- mtemp
  outfile <- "../output/genescore_matrix.xlsx"
  write.xlsx(mtemp,outfile)
}
#--------------------------------------------------------------------------------------
#
# read in the genescore matrix
#
# QC=OK
#-------------------------------------------------------------------------------------
load.genescore.matrix <- function() {
  # Read ../output/genescore_matrix.xlsx, publish its contents as the global
  # GMATRIX and print the dimensions of what was read.
  print.current.function()
  gs.matrix <- read.xlsx("../output/genescore_matrix.xlsx")
  GMATRIX <<- gs.matrix
  print(dim(gs.matrix))
}
#--------------------------------------------------------------------------------------
#
# do the heatmap of the gene-wise matrix
#
# QC=OK
#--------------------------------------------------------------------------------------
genescore.hm <- function(to.file=FALSE,cex.col=0.1,cex.row=0.1,cutoff=2,minhit=2) {
  # Cluster the gene-score matrix (global GMATRIX), draw it as a heatmap and
  # write the cluster memberships to ../output/genescore_clusters_<cutoff>.xlsx.
  #
  # Args:
  #   to.file: if TRUE the plot goes to ../plots/genescore_hm_<cutoff>.pdf;
  #            otherwise it is drawn on the current device and the function
  #            stops in browser() afterwards.
  #   cex.col, cex.row: label sizes passed to heatmap() as cexCol / cexRow.
  #   cutoff:  scores below this value are set to 0.
  #   minhit:  rows (chemicals) and columns (genes) need at least this many
  #            non-zero scores after the cutoff to be kept.
  print.current.function()
  if(to.file) {
    file <- paste0("../plots/genescore_hm_",cutoff,".pdf")
    pdf(file=file,width=7,height=7,pointsize=12,bg="white",paper="letter",pagecentre=TRUE)
    # Close the device even when a later step fails
    on.exit(graphics.off(),add=TRUE)
  }
  temp <- GMATRIX
  temp[temp<cutoff] <- 0
  temp.disc <- temp
  temp.disc[temp.disc>0] <- 1
  rs <- rowSums(temp.disc)
  cs <- colSums(temp.disc)
  # drop=FALSE: a single surviving row or column must stay two-dimensional
  temp <- temp[rs>=minhit,cs>=minhit,drop=FALSE]
  print(dim(temp))
  if(nrow(temp)<2 || ncol(temp)<2) {
    stop("genescore.hm: fewer than 2 chemicals or 2 genes pass cutoff=",cutoff,
         " and minhit=",minhit,"; nothing to cluster",call.=FALSE)
  }
  code.list <- rownames(temp)
  name.list <- CHEMS[code.list,"Name"]

  # Totals are counted on use_super_category, the same field that drives the
  # side colours below, so the totals and the hit counts are comparable.
  mask <- CHEMS[,"Phase_I"] + CHEMS[,"Phase_II"]
  use.list <- CHEMS[mask>0,"use_category"]
  use.slist <- CHEMS[mask>0,"use_super_category"]
  use.list.pharma <- use.slist[is.element(use.slist,"Pharmaceutical")]
  use.list.pest <- use.slist[is.element(use.slist,"Pesticide")]
  cat("Total chemicals: ",length(use.list),"\n")
  cat("Total pesticides: ",length(use.list.pest),"\n")
  cat("Total pharamceuticals: ",length(use.list.pharma),"\n")

  # Side colours per chemical; %in% keeps chemicals with an NA category white
  use <- CHEMS[code.list,"use_super_category"]
  col.col <- rep("white",length(code.list))
  col.col[use %in% "Pharmaceutical"] <- "red"
  col.col[use %in% "Pesticide"] <- "black"
  pharma.count <- sum(col.col=="red")
  pest.count <- sum(col.col=="black")
  cat("Pharma hits: ",pharma.count," pesticide hits:",pest.count,"\n")

  # Cluster membership of every chemical when the tree is cut into 1..nlevel
  # groups; cutree() cannot produce more groups than there are chemicals.
  hres <- hclust(d=dist(temp),method="ward.D")
  nlevel <- min(50,length(code.list))
  memb.mat <- as.data.frame(matrix(-1,nrow=length(code.list),ncol=nlevel),stringsAsFactors=FALSE)
  rownames(memb.mat) <- code.list
  names(memb.mat) <- paste0("cutlevel_",seq_len(nlevel))
  for(i in seq_len(nlevel)) memb.mat[,i] <- cutree(hres,k=i)

  # Chemical annotation columns placed in front of the cut levels
  csub <- CHEMS[code.list,]
  annot <- data.frame(CODE=code.list,
                      Name=csub[,"Name"],
                      use_category=csub[,"use_category"],
                      use_super_category=csub[,"use_super_category"],
                      structure_category=csub[,"structure_category"],
                      target_gene=csub[,"target_gene"],
                      stringsAsFactors=FALSE)
  rownames(annot) <- code.list
  memb.mat <- cbind(annot,memb.mat)

  # The heatmap is drawn on the transpose: its rows are genes, its columns are
  # chemicals
  result <- heatmap(t(as.matrix(temp)),margins=c(5,5),scale="none",labCol=name.list,main=paste("Gene Score",dim(temp)[1]," chemicals, ",dim(temp)[2]," genes"),
                    xlab="",ylab="",cexCol=cex.col,cexRow=cex.row,col=brewer.pal(9,"Reds"),
                    hclustfun=function(x) hclust(d=dist(x),method="ward.D"),ColSideColors=col.col,keep.dendro=TRUE,verbose=FALSE)

  # Reorder genes (rowInd) and chemicals (colInd) to match the drawn heatmap
  temp <- temp[,result$rowInd]
  memb.mat <- cbind(memb.mat,temp)
  memb.mat <- memb.mat[result$colInd,]
  outfile <- paste0("../output/genescore_clusters_",cutoff,".xlsx")
  write.xlsx(memb.mat,outfile)

  # Interactive use: pause so the on-screen plot can be inspected
  if(!to.file) browser()
}
#--------------------------------------------------------------------------------------
#
# Analyze the reference chemicals
#
#--------------------------------------------------------------------------------------
ref.chems <- function() {
  # Tabulate gene scores of reference chemicals at their annotated target
  # genes and at other genes of the same gene family.
  #
  # Reads ../input/gene_family.csv (columns gene, gene_group) into the global
  # GENE.FAMILY. For every Phase_I / Phase_II chemical with a non-empty
  # toxcast_reference_target_gene, each space-separated target that is a column
  # of GMATRIX yields a "Reference Target" row. For each such row, every other
  # GMATRIX gene in the same gene_group with a score > 0 yields a
  # "Related Target" row. A missing score counts as 0.
  #
  # The table is published as the global REFCHEMS and written to
  # ../output/reference_chemicals.xlsx. GeneScore is stored as a number.
  print.current.function()
  file <- "../input/gene_family.csv"
  gf <- read.csv(file,header=TRUE,stringsAsFactors=FALSE,quote="\"",comment.char="")
  GENE.FAMILY <<- gf

  target.list <- names(GMATRIX)

  # Score of one chemical at one gene, with NA treated as 0
  lookup.score <- function(code,gene) {
    score <- GMATRIX[code,gene][1]
    if(is.na(score)) 0 else score
  }

  code.list <- CHEMS[is.element(CHEMS[,"Phase_I"],1),"CODE"]
  code.list <- c(code.list,CHEMS[is.element(CHEMS[,"Phase_II"],1),"CODE"])
  code.list <- sort(unique(code.list))

  c.list <- NULL
  n.list <- NULL
  g.list <- NULL
  f.list <- NULL
  s.list <- NULL
  temp <- CHEMS[code.list,]
  temp <- temp[!is.na(temp[,"toxcast_reference_target_gene"]),]
  temp <- temp[!is.element(temp[,"toxcast_reference_target_gene"],""),]

  # Pass 1: reference targets
  for(i in seq_len(nrow(temp))) {
    code <- temp[i,"CODE"]
    cname <- temp[i,"Name"]
    genes <- str_split(toupper(temp[i,"toxcast_reference_target_gene"])," ")[[1]]
    for(gene in genes[is.element(genes,target.list)]) {
      family <- gf[is.element(gf[,"gene"],gene),"gene_group"]
      c.list <- c(c.list,code)
      n.list <- c(n.list,cname)
      g.list <- c(g.list,gene)
      # Exactly one family entry per row keeps f.list aligned with the other
      # vectors: NA when the gene is not in the family table, the first match
      # when it is listed more than once
      f.list <- c(f.list,if(length(family)>0) family[1] else NA)
      s.list <- c(s.list,lookup.score(code,gene))
    }
  }

  n.ref <- length(c.list)
  t.list <- rep("Reference Target",n.ref)

  # Pass 2: active family members of each reference target
  for(i in seq_len(n.ref)) {
    family <- f.list[i]
    if(is.na(family)) next
    members <- gf[is.element(gf[,"gene_group"],family),"gene"]
    if(length(members)<=1) next
    for(gtemp in members) {
      if(!is.element(gtemp,target.list) || gtemp==g.list[i]) next
      score <- lookup.score(c.list[i],gtemp)
      if(score>0) {
        c.list <- c(c.list,c.list[i])
        n.list <- c(n.list,n.list[i])
        f.list <- c(f.list,family)
        g.list <- c(g.list,gtemp)
        t.list <- c(t.list,"Related Target")
        s.list <- c(s.list,score)
      }
    }
  }

  # Built column by column so GeneScore stays numeric and an empty result is
  # still a well-formed table
  result <- data.frame(CODE=as.character(c.list),
                       Name=as.character(n.list),
                       Type=as.character(t.list),
                       Gene=as.character(g.list),
                       GeneFamily=as.character(f.list),
                       GeneScore=as.numeric(s.list),
                       stringsAsFactors=FALSE)

  REFCHEMS <<- result
  outfile <- "../output/reference_chemicals.xlsx"
  write.xlsx(result,outfile)
}
#--------------------------------------------------------------------------------------
#
# Summarize the reference chemicals
#
#--------------------------------------------------------------------------------------
ref.chems.summary <- function() {
  # Summarise ../output/reference_chemicals.xlsx per gene and write the result
  # to ../output/reference_chemicals_summary.xlsx. The table read is also
  # published as the global REFCHEMS.
  #
  # Per gene:
  #   Positives            - "Reference Target" rows with GeneScore > 0
  #   Negatives            - the remaining "Reference Target" rows
  #   Off-target Positives - "Related Target" rows with GeneScore > 0
  #   Percent(Positive)    - 100 * Positives / (Positives + Negatives)
  #   Gene Family          - GeneFamily of the gene's last row
  # A missing GeneScore counts as not positive.
  print.current.function()
  file <- "../output/reference_chemicals.xlsx"
  rc <- read.xlsx(file)
  REFCHEMS <<- rc

  gene.list <- sort(unique(rc[,"Gene"]))
  ngene <- length(gene.list)
  cnames <- c("Target Gene","Positives","Negatives","Percent(Positive)","Off-target Positives","Gene Family")
  result <- as.data.frame(matrix(nrow=ngene,ncol=length(cnames)),stringsAsFactors=FALSE)
  names(result) <- cnames
  result[] <- NA
  result[,1] <- gene.list

  # Scores stored as text in the workbook are compared as numbers, not strings
  score <- rc[,"GeneScore"]
  if(!is.numeric(score)) score <- as.numeric(score)
  positive <- !is.na(score) & score>0
  # %in% never yields NA, so rows with a missing Type are simply not counted
  is.ref <- rc[,"Type"] %in% "Reference Target"
  is.rel <- rc[,"Type"] %in% "Related Target"

  for(i in seq_len(ngene)) {
    rows <- which(is.element(rc[,"Gene"],gene.list[i]))
    result[i,"Positives"] <- sum(is.ref[rows] & positive[rows])
    result[i,"Negatives"] <- sum(is.ref[rows] & !positive[rows])
    result[i,"Off-target Positives"] <- sum(is.rel[rows] & positive[rows])
    result[i,"Gene Family"] <- rc[rows[length(rows)],"GeneFamily"]
  }
  n.ref <- result[,"Positives"]+result[,"Negatives"]
  result[,"Percent(Positive)"] <- format(100*result[,"Positives"]/n.ref,digits=0)

  outfile <- "../output/reference_chemicals_summary.xlsx"
  write.xlsx(result,outfile)
}
