#--------------------------------------------------------------------------------------
#
# utils.R utilities for managing ToxCast data
#
# September 2016
# Richard Judson
#
# US EPA
# Questions, comments to: judson.richard@epa.gov
#
#
#--------------------------------------------------------------------------------------
library(stringi)
library(stringr)
library(RMySQL)
library(DBI)
library(openxlsx)
SERVER <<- "au.epa.gov"
#DB <<- "dev_toxval_actor"
#DB <<- "actor_2015q3"
USER <<- "rjudson"
PASSWORD <<- "password"
#-----------------------------------------------------------------------------------
#
# run a query
#
#-----------------------------------------------------------------------------------
run.query <- function(query,db,do.halt=F,do.print=F) {
  if(do.print) {
    print.current.function()
    cat("query: ",query,"\n")
    cat("db: ",db,"\n")
  }
  tryCatch({
    con <- dbConnect(drv=RMySQL::MySQL(),user=USER,password=PASSWORD,host=SERVER,dbname=db)
    if(do.print) {
      print(con)
      #browser()
    }
    rs <- dbSendQuery(con, query)
	  d1 <- dbFetch(rs, n = -1) 
	  if(do.print) {
	    print(d1)
	    flush.console()
	  }
	  dbHasCompleted(rs)
	  dbClearResult(rs)
	  dbDisconnect(con)
	  return(d1)
	}, warning = function(w) {
	  cat("WARNING: ",query," : [",w,"]\n",sep="")
	  dbDisconnect(con)
	  if(do.halt) browser()
	  return(NULL)
	}, error = function(e) {
	  cat("ERROR:",query,"\n")
	  print(e)
	  dbDisconnect(con)
	  if(do.halt) browser()
	  return(NULL)
	})
}
#-----------------------------------------------------------------------------------
#
# run a query
#
#-----------------------------------------------------------------------------------
run.insert <- function(query,db,do.halt=F,do.print=F,auto.increment.id=F) {
  if(do.print) {
    print.current.function()
    cat("query: ",query,"\n")
    cat("db: ",db,"\n")
  }
  id <- -1
  tryCatch({
    con <- dbConnect(drv=RMySQL::MySQL(),user=USER,password=PASSWORD,host=SERVER,dbname=db)
    rs <- dbSendQuery(con, query)
    dbHasCompleted(rs)
    dbClearResult(rs)
    if(auto.increment.id) {
      rs2 <- dbSendQuery(con, "select LAST_INSERT_ID()")
      d2 <- dbFetch(rs2, n = -1) 
      id <- d2[1,1]
      dbHasCompleted(rs2)
      dbClearResult(rs2)
    }     
    dbDisconnect(con)
  }, warning = function(w) {
    cat("WARNING:",query," : [",db,"]\n",sep="")
    if(do.halt) browser()
    dbDisconnect(con)
    if(auto.increment.id) return(-1)
  }, error = function(e) {
    cat("ERROR:",query," : [",db,"]\n",sep="")
    print(e)
    if(do.halt) browser()
    dbDisconnect(con)
    if(auto.increment.id) return(-1)
  })
  if(auto.increment.id) return(id)
}
#-----------------------------------------------------------------------------------
#
# run a query
#
#-----------------------------------------------------------------------------------
run.insert.table <- function(mat,table,db,do.halt=F,do.print=F) {
  if(do.print) {
    print.current.function()
    cat("mat: ",dim(mat),"\n")
    cat("table: ",table,"\n")
    cat("db: ",db,"\n")
  }
  tryCatch({
    con <- dbConnect(drv=RMySQL::MySQL(),user=USER,password=PASSWORD,host=SERVER,dbname=db)
    dbWriteTable(con,name=table,value=mat,field.type=NULL,row.names=F,overwrite=F,append=T)
    dbDisconnect(con)
  }, warning = function(w) {
    cat("WARNING:",table," : [",db,"]\n",sep="")
    dbDisconnect(con)
    if(do.halt) browser()
  }, error = function(e) {
    cat("ERROR:",table," : [",db,"]\n",sep="")
    print(e)
    dbDisconnect(con)
    if(do.halt) browser()
  })
}

#-----------------------------------------------------------------------------------
#
# escape a string
#
#-----------------------------------------------------------------------------------
string.escape <- function(x) {
  x <- str_replace_all(x,"\'","")
  x <- str_replace_all(x,"\"","")
  return(x)
}
#-----------------------------------------------------------------------------------
#
# test the interaction with the DB
#
#-----------------------------------------------------------------------------------
test.con <- function() {
	#ret <- run.query("SELECT distinct Db from db where Db like 'actor%'","mysql")
	#print(ret)
	ret <- run.query("show databases",DB)
	print(ret)
	ret <- run.query("show tables",DB)
	print(ret)
	ret <- run.query("desc pred_epi",DB)
	print(ret[,"Field"])
	#ret <- run.query("desc assay_category_cv",DB)
	#print(ret)
	#ret <- run.query("select category, description from assay_category_cv",DB)
	#print(ret)
	#ret <- run.query("desc assay_component",DB)
	#print(ret)
	
}
#--------------------------------------------------------------------------------------
#
# get info for a code
#
#--------------------------------------------------------------------------------------
probe.code <- function(code) {
	print(CHEMS[code,"Name"])
	temp <- MAT.logAC50[code,]
	name.list <- names(temp)
	mask <- temp
	mask[] <- 1
	mask[is.na(temp)] <- 0
	mask[temp==6] <- 0
	temp <- temp[mask==1]
	name.list <- name.list[mask==1]
	mat <- as.data.frame(cbind(name.list,as.numeric(temp)),stringsAsFactors=F)
	names(mat) <- c("assay","logAC50")
	mat <- mat[order(as.numeric(mat$logAC50)),]	
	print(mat)
}

#--------------------------------------------------------------------------------------
#
# get the name of the current functio nand pretty print it
#
#--------------------------------------------------------------------------------------
print.current.function <- function(comment.string=NA) {
	cat("=========================================\n")
	curcall <- sys.call(sys.parent(n=1))[[1]]
	cat(curcall,"\n")
	if(!is.na(comment.string))	cat(comment.string,"\n")
	cat("=========================================\n")
	flush.console()
}
#-----------------------------------------------------------------------------------
#
# do an or on a chunk of a matrix
#
#-----------------------------------------------------------------------------------
aORb <<- function(V,M) {
    res <- c()
    for(i in 1:dim(M)[1]) res <- c(res,sum(V|M[i,]))
    return(res)
}
#--------------------------------------------------------------------------------------
#
# uniquify - take a list and get the unique set
#
#--------------------------------------------------------------------------------------
uniquify <- function(x) {
	temp <- duplicated(x)
	y <- x[!temp]
	return(y)
}
#--------------------------------------------------------------
#
# calculate the paramteres for a 2 x 2 matrix
#
#--------------------------------------------------------------
TxT<<- function(tp,fp,fn,tn,do.p=TRUE) {

	sens<-tp/(tp+fn)
	spec<-tn/(tn+fp)
	ppv<-tp/(tp+fp)
	npv<-tn/(tn+fn)

	relative.risk <- (tp/(tp+fp)) / (fn/(tn+fn))
	odds.ratio <- (tp*tn)/(fp*fn)

	accuracy <- (tp+tn)/(tp+tn+fp+fn)
	x<-matrix(data=NA, nrow=2, ncol=2)
	x[1,1]<-tp
	x[1,2]<-fp
	x[2,1]<-fn
	x[2,2]<-tn
	if(is.infinite(relative.risk))  relative.risk <- 1000000
	if(is.infinite(odds.ratio))  odds.ratio <- 1000000
	if(is.infinite(sens))  sens <- 0
	if(is.infinite(spec))  spec <- 0
	if(is.nan(relative.risk))  relative.risk <- 1
	if(is.nan(odds.ratio))  odds.ratio <- 1
	if(is.nan(sens))  sens <- 0
	if(is.nan(spec))  spec <- 0
	if(is.na(sens))  sens <- 0
	if(is.na(spec))  spec <- 0
	or.ci.lwr <- 0
	or.ci.upr <- 1000000
	if(odds.ratio<1000000 && tp>0 && tn>0 && fp>0 && fn>0) {
		lnor <- log(odds.ratio)
		selnor <- sqrt(1/tp+1/tn+1/fp+1/fn)
		ln.upr <- lnor+1.96*selnor
		ln.lwr <- lnor-1.96*selnor
		or.ci.lwr <- exp(ln.lwr)
		or.ci.upr <- exp(ln.upr)
	}
	p.value<-1
	if(do.p==TRUE) {
		if(sens>0 && spec>0) {
			c<-fisher.test(x)
			p.value <- c$p.value
		}
	}
	ba <- 0.5*(sens+spec)

	sval<-paste(tp,"\t",fp,"\t",fn,"\t",tn,"\t",format(sens,digits=3),"\t",format(spec,digits=3),"\t",format(ba,digits=3),"\t",format(accuracy,digits=3),"\t",format(relative.risk,digits=3),"\t",format(odds.ratio,digits=3),"\t",format(or.ci.lwr,digits=3),"\t",format(or.ci.upr,digits=3),"\t",format(ppv,digits=3),"\t",format(npv,digits=3),"\t",format(p.value,digits=3),sep="")
	title<-paste("TP\tFP\tFN\tTN\tSens\tSpec\tBA\tAcrcy\tRelRsk\tOR\tCI.OR.LWR\tCI.OR.UPR\tPPV\tNPV\tp.value",sep="")

  name.list <- c("TP","FP","FN","TN","Sens","Spec","BA","Acrcy","RelRsk","OR","CI.OR.LWR","CI.OR.UPR","PPV","NPV","p.value")
  mat <- as.data.frame(matrix(nrow=1,ncol=length(name.list)))
  names(mat) <- name.list
  mat[1,"TP"] <- tp
  mat[1,"FP"] <- fp
  mat[1,"FN"] <- fn
  mat[1,"TN"] <- tn
  mat[1,"Sens"] <- sens
  mat[1,"Spec"] <- spec
  mat[1,"BA"] <- ba
  mat[1,"Acrcy"] <- accuracy
  mat[1,"RelRsk"] <- relative.risk
  mat[1,"OR"] <- odds.ratio
  mat[1,"CI.OR.LWR"] <- or.ci.lwr
  mat[1,"CI.OR.UPR"] <- or.ci.upr
  mat[1,"PPV"] <- ppv
  mat[1,"NPV"] <- npv
  mat[1,"p.value"] <- p.value
    
  r<<-list(a=tp,b=fp,c=fn,d=tn,sens=sens,spec=spec,ba=ba,accuracy=accuracy,relative.risk=relative.risk,odds.ratio=odds.ratio,or.ci.lwr=or.ci.lwr,or.ci.upr=or.ci.upr,ppv=ppv,npv=npv,p.value=p.value,sval=sval,title=title,mat=mat)
  r
}
#-----------------------------------------------------------------------------------
#
# chisq on a single variable
#   used as the inner function for factor selection
#
#-----------------------------------------------------------------------------------
chisq_1Var <- function(x,Class) {
	y <- Class[,1]
	a <-sum(x*y)
	b <- sum(x*(1-y))
	c <- sum((1-x)*y)
	d <- sum((1-x)*(1-y))
	txt <- TxT(a,b,c,d)
	return(txt)
}
#-----------------------------------------------------------------------------------
#
# minimum by col
#
#-----------------------------------------------------------------------------------
colMin <- function(x) {
	ret <- apply(x,FUN=min,MARGIN=2,na.rm=T)
}
#-----------------------------------------------------------------------------------
#
# maximum by col
#
#-----------------------------------------------------------------------------------
colMax <- function(x) {
	ret <- apply(x,FUN=max,MARGIN=2)
}
#-----------------------------------------------------------------------------------
#
# minimum by row
#
#-----------------------------------------------------------------------------------
rowMin <- function(x) {
	ret <- apply(x,FUN=min,MARGIN=1)
}
#-----------------------------------------------------------------------------------
#
# minimum by row
#
#-----------------------------------------------------------------------------------
rowMax <- function(x) {
	ret <- apply(x,FUN=max,MARGIN=1)
}
#-----------------------------------------------------------------------------------
#
# minimum by row
#
#-----------------------------------------------------------------------------------
rowSum <- function(x) {
	ret <- apply(x,FUN=sum,MARGIN=1)
}
#-----------------------------------------------------------------------------------
#
# median by row
#
#-----------------------------------------------------------------------------------
rowMed <- function(x) {
	ret <- apply(x,FUN=median,MARGIN=1)
}
#-----------------------------------------------------------------------------------
#
# contains
#
#-----------------------------------------------------------------------------------
contains <- function(x,query,debug=F) {
  if(debug) {
    print(x)
    print(query)
  }
  if(is.null(x)) return(FALSE)
  if(is.null(query)) return(FALSE)
  if(is.na(x)) return(FALSE)
  if(is.na(query)) return(FALSE)
  x <- stri_trans_tolower(x)
  query <- stri_trans_tolower(query)
  
  val <- sum(grep(query,x,fixed=T))
  if(val>0) return(TRUE)
  else return(FALSE)
}
#--------------------------------------------------------------------------------------
#
# plot a histogram on a log scale Calculate at the hit distribution by chemical
#
# QC=OK
#--------------------------------------------------------------------------------------
hist.log <- function(x,y,ylim,xlab,ylab,main,cytotox.median, cytotox.min, cytotox.max) {
  print.current.function()
  plot(x[1:length(y)],y,type="n",col="gray40",lwd=2.5,log="x",xlab=xlab,ylim=ylim,ylab=ylab,main=main,cex.lab=1.2,cex.axis=1.2)
  if(cytotox.min<100) {
    rect(cytotox.min,ylim[2],max(x),0,col="gray80")
    lines(c(cytotox.median,cytotox.median),ylim,col="red",lwd=3)
  }
  for(i in 1:length(y)) {
    rect(x[i],y[i],x[i+1],0)
  }
}
#-----------------------------------------------------------------------------------
#
# fix a casrn
#
#-----------------------------------------------------------------------------------
fix.casrn <- function(casrn,cname="",do.print=F) {
  if(do.print) cat("input: ",cname,":",casrn,"\n")
  if(contains(casrn,"NOCAS")) return(casrn)
  doit <- T
  while(doit) {
    if(substr(casrn,1,1)=="0") casrn <- substr(casrn,2,nchar(casrn))
    else doit <- F
  }
  
  if(!contains(casrn,"-")) {
    nc <- nchar(casrn)
    ctemp <- casrn
    right <- substr(ctemp,nc,nc)
    mid <- substr(ctemp,nc-2,nc-1)
    left <- substr(ctemp,1,nc-3)
    casrn <- paste(left,"-",mid,"-",right,sep="")
  }
  #cat("[",cname,"]\n",sep="")
  if(!is.na(cname)) {
    if(cname=="epsilon-Hexachlorocyclohexane (epsilon-HC)") casrn <- "6108-10-7"
    if(cname=="Captafol") casrn <- "2425-06-1"
    if(cname=="Hydrogen sulfide") casrn <- "7783-06-4"
    if(cname=="Picloram") casrn <- "1918-02-1"
    if(cname=="Dodine") casrn <- "2439-10-3"
  }
  if(do.print) cat("output: ",cname,":",casrn,"\n")
  return(casrn)
}
