#-------------------------------------------------------------------------------
# Script to generate package data from tcpl databases
#-------------------------------------------------------------------------------

## The data for this package was generated using invitrodb_v2 and tcpl version
## 1.0. This script will only run properly when the tcpl settings are 
## configured and the working directory is set to the package source files 
## directory.

library(data.table)
library(tcpl)

## Load in the food use chemical file (available in supplementary materials)
## Read the categorized chemical list and derive a chemical code from the CAS
## number: "C" followed by the CASN with whitespace, "-" and "_" stripped out
CL <- fread(file.path("inst", "Food_Chem_Categorized.csv"))
CL[ , code := paste0("C", gsub("\\s|-|_", "", casn))]

## Order the table by 'cat', then keep only the first row seen for each CASN
setkeyv(CL, "cat")
CL <- CL[!duplicated(casn)]

## Create the ToxCast variable matrices
p <- list(clib = c("toxcast:ph1v2_ph2_e1k_ph3"),
          srgx = "(?!^Tox21_1.*)(?!^Tox21_2.*)(?=^.*)")
vars <- c("tested", "modl_ga", "hitc", "zscore")

## One tcplVarMat call per variable; SIMPLIFY = FALSE guarantees a list named
## by 'vars' whatever the shapes of the individual results
TM <- mapply(tcplVarMat, vars, MoreArgs = p, SIMPLIFY = FALSE)

## The "zscore" result holds two elements, 'zscr' and 'zdst'. 'zdst' has to be
## taken out BEFORE the list element is overwritten with 'zscr'; afterwards
## TM$zscore no longer contains it.
CD <- TM$zscore$zdst
TM[["zscore"]] <- TM$zscore$zscr

## Put the rows of every matrix in the order of the chemical table
TM <- lapply(TM, function(x) x[CD$code, , drop = FALSE])

## setkey() re-sorts CD by reference, so the rows of CD and TM may no longer
## agree by position. Select the matrix rows by code so each chemical gets
## its own row sum over the tcplLoadAeid("burst_assay", 1) endpoints.
setkey(CD, code)
CD[ ,
    ntested := apply(TM$tested[code,
                               tcplLoadAeid("burst_assay", 1)$aenm,
                               drop = FALSE],
                     1, sum)]

## Restrict a variable matrix to the chemicals listed in CL and to the assay
## endpoints returned by tcplLoadAeid("burst_assay", 0).
##
## x: matrix whose row names are chemical codes (compared against CL$code)
##    and whose column names are endpoint names (compared against aenm)
##
## Returns the selected sub-matrix. drop = FALSE keeps the result a matrix
## (with its dimnames) even when only one row or one column is selected.
TM_subset <- function(x) {
  keep_rows <- rownames(x) %in% CL[ , code]
  keep_cols <- colnames(x) %in% tcplLoadAeid("burst_assay", 0)[ , aenm]
  x[keep_rows, keep_cols, drop = FALSE]
}

## Subset every matrix, then restrict the chemical table to the remaining
## rows and attach the CL category as column 'i'
TM <- lapply(TM, TM_subset)
CD <- CD[rownames(TM[[1]])]
CD <- merge(CD, CL[ , list(code, i = cat)], by = "code")

## Per-chemical threshold: cyto_pt - 3*global_mad when nhit > 2, otherwise 3.
## merge() returns CD sorted by code, which need not be the row order of the
## matrices, so the CD row for each matrix row is looked up by code instead of
## relying on position.
cd_row <- match(rownames(TM$modl_ga), CD$code)
stopifnot(!any(is.na(cd_row)))
cyto_pt <- CD[cd_row, ifelse(nhit > 2, cyto_pt - 3*global_mad, 3)]

## The threshold vector has one value per matrix row and is recycled down the
## columns, i.e. every endpoint is compared against the chemical's threshold
cyto <- TM$modl_ga >= unname(cyto_pt)

## Negative hit calls are set to 0; filt_hit is a copy of hitc in which the
## entries flagged by 'cyto' are zeroed as well
TM$hitc[TM$hitc < 0] <- 0
TM$filt_hit <- TM$hitc
TM$filt_hit[cyto] <- 0

save(CD, file = file.path("data", "CD.RData"))
save(TM, file = file.path("data", "TM.RData"))


#-------------------------------------------------------------------------------
# Script to generate SOM
#-------------------------------------------------------------------------------

library(rcdk)
library(kohonen)
library(data.table)

## Read the SMILES table and store the SMILES strings as a vector named by
## CAS number
smdat <- fread(file.path("inst", "DSSTox_SMILES.csv"))
smis <- setNames(smdat$smiles, smdat$casn)
save(smis, file = file.path("data", "smis.RData"))

## Get molecules from SMILES with kekulise = FALSE. Each SMILES is parsed on
## its own and the single resulting molecule unwrapped, which gives a list
## that carries the names of 'smis' (the CAS numbers).
mols <- lapply(smis, function(smi) parse.smiles(smi, kekulise = FALSE)[[1]])

## Add the information suggested by rcdk. Iterate by position rather than by
## name: mols[[name]] only ever returns the first element with that name, so
## a repeated CASN would leave the later molecule unconfigured.
for (i in seq_along(mols)) {
  do.aromaticity(mols[[i]])
  do.typing(mols[[i]])
  do.isotopes(mols[[i]])
}

## Get pubchem fingerprints as a molecule-by-bit matrix
fps_pubchem <- lapply(mols, get.fingerprint, type = "pubchem")
fpmat_pubchem <- fp.to.matrix(fps_pubchem)
rownames(fpmat_pubchem) <- names(fps_pubchem)
colnames(fpmat_pubchem) <- paste0("PUBCHEM", seq_len(ncol(fpmat_pubchem)))

## Get MACCS fingerprints as a molecule-by-bit matrix
fps_maccs <- lapply(mols, get.fingerprint, type = "maccs")
fpmat_maccs <- fp.to.matrix(fps_maccs)
rownames(fpmat_maccs) <- names(fps_maccs)
colnames(fpmat_maccs) <- paste0("MACCS", seq_len(ncol(fpmat_maccs)))

## Combine both fingerprint sets and keep only the bits that equal 1 for at
## least one molecule; drop = FALSE keeps fpmat a matrix even if just one
## column survives the filter
fpmat <- cbind(fpmat_pubchem, fpmat_maccs)
fpmat <- fpmat[ , colSums(fpmat == 1) > 0, drop = FALSE]

## Calculate SOM with kekulise = FALSE
## The RNG seed is set right before training; stime and etime record the
## wall-clock time immediately before and after the som() call.
set.seed(1234)
stime <- Sys.time()
SOM <- som(data = fpmat,
           grid = somgrid(xdim = 20, ydim = 24, topo = "hexagonal"),
           rlen = 1000)
etime <- Sys.time()
save(SOM, file = file.path("data", "SOM.RData"))

#-------------------------------------------------------------------------------
# Script to generate SOM results (SR)
#-------------------------------------------------------------------------------

## One row per molecule used to train the SOM: its CAS number (row name of
## the training data) and the map unit it was assigned to
SR <- data.table(casn = rownames(SOM$data), bin = SOM$unit.classif)

## Flag the molecules that also appear in the chemical table CD
SR[ , toxc := is.element(casn, CD$casn)]

## Key both tables on CASN and pull the category from CL into SR
setkeyv(SR, "casn")
setkeyv(CL, "casn")
SR <- CL[ , unique(.SD), .SDcols = c("cat", "casn")][SR]

## Logical indicator column for each of the categories 1, 2 and 3
SR[ , `:=`(cat1 = cat == 1,
           cat2 = cat == 2,
           cat3 = cat == 3)]

## Load the level 5 data, pass it through tcplPrepOtpt with the same chemical
## library and regular expression used for the variable matrices, drop the
## rows without a CASN and reduce the result with tcplSubsetChid
dat <- tcplLoadData(lvl = 5L)
dat <- tcplPrepOtpt(dat,
                    clib = c("toxcast:ph1v2_ph2_e1k_ph3"),
                    srgx = "(?!^Tox21_1.*)(?!^Tox21_2.*)(?=^.*)")
dat <- tcplSubsetChid(dat = dat[!is.na(casn)])

## pot = -log10(x/1e6), where x is 10^modl_ga for rows with hitc == 1 and 1e6
## for rows with any other hit call (so those rows get pot = 0); rows with a
## missing hitc get NA
dat[ , pot := -log10(ifelse(hitc == 1, 10^modl_ga, 1e6)/1e6)]

## Summarise 'dat' per CASN: sum and maximum of pot, number of rows with
## hitc == 1 (nhit), number of rows (N) and the fraction nhit/N (phit).
## keyby groups by CASN and leaves the result keyed (sorted) on it.
sp_all <- dat[ ,
              list(sump = sum(pot),
                   nhit = lw(hitc == 1),
                   N = .N,
                   maxp = max(pot)),
              keyby = casn]
sp_all[ , phit := nhit/N]

## Keep only the chemicals present in both CD and CL, then join the summary
## columns onto SR by CASN
sp <- sp_all[casn %in% intersect(CD$casn, CL$casn)]
setkeyv(sp, "casn")
SR <- sp[SR]

## Append a placeholder row (bin set, every other column NA) for each map unit
## that received no molecule, so that every unit of the grid appears in SR.
## The number of units is read from the trained map rather than hard-coded,
## which keeps this step in sync with the grid passed to som() (20 x 24 = 480).
setkey(SR, bin)
n_units <- nrow(SOM$grid$pts)
SR <- rbind(SR, SR[J(setdiff(seq_len(n_units), SR$bin))])
setkey(SR, bin)

save(SR, file = file.path("data", "SR.RData"))

