#--------------------------------------------------------------------------------------
#
# physchem.R - code to prepare the physchem data
#
# November 2015
# Richard Judson
#
# US EPA
# Questions, comments to: judson.richard@epa.gov, 919-541-3085
#
#--------------------------------------------------------------------------------------
options(java.parameters = "-Xmx1000m")
library(grDevices)
library(RColorBrewer)
library(stringr)
library(mixdist)
library(class)
library(lattice)
library(openxlsx)
library(DBI)
library(RMySQL)

source("utils.R")
source("diagnostics_v01.R")
source("genescore_v01.R")
source("pathway_v01.R")
#source("burst_v01.R")
#source("rat_cancer_model_ivive.R")

# Global configuration, assigned with <<- like the other globals in this file.
# SERVER, DB, USER, PASSWORD and VARMATDIR are not referenced in the visible part
# of this file; presumably they are consumed by helpers sourced above
# (e.g. run.query in utils.R) - TODO confirm.
#
# Alternative servers are kept commented out; only the last assignment ever took
# effect, so the previously live first assignment is now commented out as well.
#SERVER <<- "134.67.216.114"
#SERVER <<- "134.67.216.45"
SERVER <<- "au.epa.gov"
DB <<- "dev_physchemdb"
USER <<- "rjudson"
# NOTE(review): credentials are hard-coded in source; consider reading them from
# the environment or a file that is not under version control.
PASSWORD <<- "password"

# VARMATDATE is used below to stamp the name of the physchem output spreadsheet.
VARMATDIR <<- "../input/varmats_151020_internal/"
VARMATDATE <<- "151020"

#--------------------------------------------------------------------------------------
#
# build the physchem table
#
# Reads the chemical dictionary spreadsheet and builds one row per chemical with
# the columns CODE, CASRN, Name, gsid, cid followed by one column per property.
# For every chemical with a non-missing cid, each property is filled with
# avg(result_mean) over the matching rows of model / model_results:
#   - the first group of properties is matched on model.model_class
#   - the second group of properties is matched on model.name
# Properties for which the database returns no value stay NA.
#
# Side effects: writes ../physchem/physchem_<VARMATDATE>.xlsx and stores the
# table in the global PHYSCHEM.
#
# Returns: the table (a data frame with CODE as row names), invisibly.
#
#--------------------------------------------------------------------------------------
load.physchem.1 <- function() {
	print.current.function()
	file <- "../input/chemical_dictionary_151020.xlsx"
	dict <- read.xlsx(file)
	nchem <- dim(dict)[1]

	# properties looked up by model.model_class (column name -> model class)
	prop.list.1 <- c("logBCF","logP","MW","MP","BP","VP","IP")
	mclass.list.1 <- c("Log BCF","LogP","MW","MP","BP","VP","IP")

	# properties looked up by model.name (column name -> model name)
	prop.list.2 <- c("logWaterSol","accptHB","donorHB","volume","PISA","PSA","SASA","ReactiveFunctionalGroups")
	model.list.2 <- c("aqueous solubility","accptHB","donorHB","volume","PISA","PSA","Solvent Accessible Surface Area","Reactive Functional Groups")

	# identifier columns come straight from the dictionary; property columns
	# start out as numeric NA
	id.cols <- c("CODE","CASRN","Name","gsid","cid")
	mat <- dict[,id.cols,drop=FALSE]
	for(prop in c(prop.list.1,prop.list.2)) mat[[prop]] <- rep(NA_real_,nchem)
	rownames(mat) <- mat[,"CODE"]

	# average result_mean for one chemical over all models whose `column`
	# (model_class or name) equals `key`
	avg.result <- function(cid,column,key) {
		query <- paste("select avg(result_mean) from model m, model_results mr where m.model_id = mr.model_id and mr.cid=",cid," and m.",column,"='",key,"'",sep="")
		run.query(query,"dev_physchemdb")[1][,1]
	}

	# fill the columns `props` of `tab`, where props[i] is selected in the
	# database by `column` = keys[i]; returns the updated table
	fill.props <- function(tab,props,keys,column) {
		stopifnot(length(props)==length(keys))
		for(i in seq_along(props)) {
			cat(props[i],":",keys[i],"\n")
			# seq_len() so that an empty dictionary performs no iterations
			for(j in seq_len(nrow(tab))) {
				if(j%%100==0) {cat(" chemicals: ",j,"\n"); flush.console()}
				cid <- tab[j,"cid"]
				if(is.na(cid)) next
				val <- avg.result(cid,column,keys[i])
				# rows are addressed by position: indexing by the CODE value
				# would turn into positional indexing if CODE were numeric
				if(length(val)==1 && !is.na(val)) tab[j,props[i]] <- val
			}
			flush.console()
		}
		tab
	}

	mat <- fill.props(mat,prop.list.1,mclass.list.1,"model_class")
	mat <- fill.props(mat,prop.list.2,model.list.2,"name")

	fname <- paste("../physchem/physchem_",VARMATDATE,".xlsx",sep="")
	write.xlsx(mat,file=fname)
	PHYSCHEM <<- mat
	invisible(mat)
}
#--------------------------------------------------------------------------------------
#
# Older variant of the physchem table builder.
#
# Builds a table with the columns CODE, CASRN, Name, gsid, cid followed by one
# column per property. For each property the matching model_id values are looked
# up by model name (and, for logP, by source); each chemical's value is then the
# mean of result_mean over those models in model_results.
#
# Arguments:
#   max.chem  maximum number of chemicals (dictionary rows, from the top) to
#             fill in; the default of 10 is the historical debugging limit,
#             pass Inf to process the whole dictionary
#   pause     if TRUE (the historical behaviour) drop into browser() after each
#             property, before the output file is written, and at the end
#
# Side effects: writes ../physchem/physchem_<VARMATDATE>.xlsx and stores the
# table in the global PHYSCHEM.
#
# Returns: the table (a data frame with CODE as row names), invisibly.
#
#--------------------------------------------------------------------------------------
load.physchem.old <- function(max.chem=10,pause=TRUE) {
	print.current.function()
	stopifnot(is.numeric(max.chem),length(max.chem)==1,!is.na(max.chem))
	file <- "../input/chemical_dictionary_151020.xlsx"
	dict <- read.xlsx(file)
	nchem <- dim(dict)[1]

	# property column -> condition selecting its models in the model table.
	# Lookups for pKa ('pKa','pKa (Acidic)'), dipole ('dipole') and
	# electron affinity ('Electron Affinity') were disabled in the original
	# code and are intentionally not included here.
	model.filter <- list(
		logP="name in ('Measured Log Kow','Estimated Log Kow','LogP','LogP RI','QPlogPo/w') and source in ('EPI SUITE','QikProp','ACD/Labs')",
		MW="name in ('molecular weight','mol_MW','MW')",
		MP="name in ('Measured MP (oC)','Melting Point','Estimated MP (oC)')",
		BP="name in ('Measured BP (oC)','Boiling Point','Estimated BP (oC)')",
		VP="name in ('Measured VP (mm Hg)','Estimated VP (mm Hg)')",
		logWatSol="name in ('Solubility','Solubility (pH)','aqueous solubility')",
		IP="name in ('Ionization Potent','Ionization Potential')",
		logBCF="name in ('Measured Log BCF (screening test)','Measured Log BCF (steady state test)','Estimated Log BCF')",
		polarizability="name in ('Polarizability')",
		accptHB="name in ('accptHB')",
		donorHB="name in ('donorHB')",
		volume="name in ('volume')",
		PISA="name in ('PISA')",
		PSA="name in ('PSA')",
		SASA="name in ('Solvent Accessible Surface Area')",
		ReactiveFunctionalGroups="name in ('Reactive Functional Groups')"
	)
	prop.list <- names(model.filter)

	# model ids per property; keeping them keyed by property means a lookup
	# that returns zero or several ids can no longer shift the ids of the
	# following properties
	ids.by.prop <- lapply(model.filter,function(filter) {
		query <- paste("select model_id from model where",filter)
		run.query(query,"dev_physchemdb")[1][,1]
	})

	# identifier columns come straight from the dictionary; property columns
	# start out as numeric NA
	id.cols <- c("CODE","CASRN","Name","gsid","cid")
	mat <- dict[,id.cols,drop=FALSE]
	for(prop in prop.list) mat[[prop]] <- rep(NA_real_,nchem)
	rownames(mat) <- mat[,"CODE"]

	# never index past the end of the table when the dictionary is short
	nchem.used <- min(nchem,max.chem)
	for(prop in prop.list) {
		id.list <- ids.by.prop[[prop]]
		print(prop)
		print(id.list)
		if(length(id.list)>0) {
			# the id list is the same for every chemical, so build it once
			id.string <- paste("(",paste(id.list,collapse=","),")",sep="")
			for(j in seq_len(nchem.used)) {
				cid <- mat[j,"cid"]
				# a missing cid would otherwise be pasted into the SQL as NA
				if(is.na(cid)) next
				query <- paste("select result_mean from model_results where model_id in ",id.string,"and cid=",cid)
				vals <- run.query(query,"dev_physchemdb")[1][,1]
				if(length(vals)>0) mat[j,prop] <- mean(vals)
			}
		}
		flush.console()
		if(pause) browser()
	}
	if(pause) browser()

	# Reference query for inspecting all results of one chemical by CASRN:
	#select m.name, m.source, m.description, m.model_class, mr.result_type, mr.raw_result, mr.result_mean, mr.result_min, mr.result_max, mr.result_unit
	#from dev_physchemdb.model m, dev_physchemdb.model_results mr
	#where mr.model_id = m.model_id
	#and mr.cid = (select cid from dsstox.cas a join dsstox.cas_compound b on a.gsid = b.gsid where casrn = '<CASRN>');

	fname <- paste("../physchem/physchem_",VARMATDATE,".xlsx",sep="")
	write.xlsx(mat,file=fname)
	PHYSCHEM <<- mat
	if(pause) browser()
	invisible(mat)
}

