Processing the Thyroperoxidase (TPO) high throughput screening data through the pipeline requires several steps. First, the data needs to be brought into one separate table for each assay in multi conc, and a separate table for each assay in single conc. At this point, the data can be loaded into the pipeline and processed. Finally, the processed data can be retrieved from the pipeline database and analyzed.

The TPO dataset has morphed over time, with four separate ‘phases’ (including the retest and GUA overlap samples), collected by two different people, with five separate chemical orders and modifications to the protocol. This document seeks to document all of these events and adjustments to ensure the highest possible data fidelity and analysis transparency.

Data prep for tcpl load

Each ‘phase’ has a different dataset, and each data file is formatted slightly differently. To handle this, each data file is processed separately to get into the correct format. Next, all of the tables are merged together into one. Final processing includes remapping sample IDs. Phase I and II were performed under the older TX code formatted sample ids, which is reflected in the raw data. The new pipeline, however, requires that these sample IDs be mapped to the new TP based codes. This is accomplished using tcpl:::tcplSpidMap()

For reference, this document was compiled with the following version of R and R packages.

sessionInfo()
## R version 3.2.3 (2015-12-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux Workstation release 6.8 (Santiago)
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  grid      stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] sparkline_1.0    DT_0.1           tcpl_1.2.2       pander_0.6.0    
##  [5] gridExtra_2.2.1  ggplot2_2.1.0    stringr_1.0.0    data.table_1.9.6
##  [9] magrittr_1.5     dplyr_0.5.0     
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.5        knitr_1.13         munsell_0.4.3     
##  [4] colorspace_1.2-6   R6_2.1.2           plyr_1.8.4        
##  [7] tools_3.2.3        gtable_0.2.0       DBI_0.4-1         
## [10] htmltools_0.3.5    RMySQL_0.10.9      yaml_2.1.13       
## [13] assertthat_0.1     digest_0.6.9       tibble_1.1        
## [16] numDeriv_2014.2-1  RColorBrewer_1.1-2 formatR_1.4       
## [19] htmlwidgets_0.6    RSQLite_1.0.0      evaluate_0.9      
## [22] rmarkdown_0.9.6    stringi_1.1.1      scales_0.4.0      
## [25] chron_2.3-47

AUR TPO multi concentration

AUR TPO MC phase I

Collected_By Date_Collected Order_Number Test_Samples Plates
Katie Paul Friedman Unknown Unknown 100 12
# Read in data ----------

file.loc <- './Data/AUR TPO DR Phase I KP032 Compiled N from Dose Response_CompiledN.csv'
dat_mc_p1 <- data.table(read.table (file = file.loc, 
                                    header = TRUE, 
                                    sep=',', 
                                    comment.char="", 
                                    na.strings = c("na", "#VALUE!"), #to convert these to NA
                                    stringsAsFactors = FALSE))

# Rename columns to match uniform variable names ----------

setnames(dat_mc_p1, 
         c("COLUMN", 
           "ROW", 
           "EPA_SAMPLE_ID", 
           "ASSAY_CONC",
           "PLATE"), 
         c("coli", 
           "rowi",
           "spid",
           "conc",
           "apid"))

# Remove extra columns ----------

set(dat_mc_p1, 
    j=which(colnames(dat_mc_p1) %in% c("WELL", 
                                       "ALIQUOT_WELL_ID",
                                       "ALIQUOT_SOLVENT",
                                       "ALIQUOT_CONC",
                                       "ALIQUOT_CONC_UNIT",
                                       "ALIQUOT_PLATE_BARCODE_SOURCE",
                                       "SISTER_PLATE",
                                       "ASSAY_CONC_UNIT")), 
    value=NULL)

# Add new columns ----------

dat_mc_p1[, wllq := 1L]         
dat_mc_p1[, wllt := 't']            
dat_mc_p1[, srcf := 'AUR TPO DR Phase I KP032 Compiled N from Dose Response_CompiledN.csv']
dat_mc_p1[, acsn := 'AUR_TPO_DoseResponse_PhaseI']

# Modify column entries ----------

dat_mc_p1[is.na(spid), wllt := 'b']     
dat_mc_p1[spid %in% c('MMI', 'NAP', 'LUCINH1', 'LUCINH2'), wllt := 'c'] 
dat_mc_p1[spid == 'DMSO', wllt := 'n']
dat_mc_p1[spid == 'NEG', wllt := 'f']

# Melt 'N' columns to get rval ----------

dat_mc_p1 <- melt(dat_mc_p1, 
                  id.vars = c('rowi', 'coli', 'spid', 'conc', 'wllq', 
                              'wllt', 'srcf', 'acsn', 'apid'),
                  measure.vars = c('N1', 'N2', 'N3'),
                  variable.name='N', 
                  value.name='rval')

dat_mc_p1[, apid := paste("phase_1_plate", apid, N, sep = "_")]

set(dat_mc_p1, j=which(colnames(dat_mc_p1) == "N"), value=NULL)

# Clean up column types ----------

letter2number <- c("A" = 1L,
                   "B" = 2L,
                   "C" = 3L,
                   "D" = 4L,
                   "E" = 5L,
                   "F" = 6L,
                   "G" = 7L,
                   "H" = 8L,
                   "I" = 9L,
                   "J" = 10L,
                   "K" = 11L,
                   "L" = 12L,
                   "M" = 13L,
                   "N" = 14L,
                   "O" = 15L,
                   "P" = 16L,
                   "Q" = 17L,
                   "R" = 18L,
                   "S" = 19L,
                   "T" = 20L,
                   "U" = 21L,
                   "V" = 22L,
                   "W" = 23L,
                   "X" = 24L,
                   "Y" = 25L,
                   "Z" = 26L)

dat_mc_p1[, rowi := unname(letter2number[rowi])] 
dat_mc_p1[, rval := as.numeric(rval)]

# Set TX codes that were retested to wllq of 0 ----------
retest_spids <- c("TX000781", "TX000845", "TX001425", "TX002339", "TX003444", 
                  "TX003725", "TX004371", "TX004536", "TX006158", "TX007182", 
                  "TX007242", "TX008327", "TX008582", "TX008666", "TX009149", 
                  "TX009328", "TX009711", "TX009779", "TX011582", "TX011587", 
                  "TX011630", "TX011653", "TX015181", "TX015273", "TX015590", 
                  "TX015604", "TX109149", "TX111582", "TX111587", "TX209149", 
                  "TX211582", "TX211587")

dat_mc_p1[spid %in% retest_spids, wllq := 0]

AUR TPO MC phase II

Collected_By Date_Collected Order_Number Test_Samples Plates
Katie Paul Friedman Unknown Unknown 252 18
# Read in data ----------

file.loc <- './Data/L1_KP036_AUR_TPO_PhaseII_Dose_Response.csv'
dat_mc_p2 <- data.table(read.table (file = file.loc, 
                                    header = TRUE, 
                                    sep=',', 
                                    na.strings = c("NA", "EMPTY", "Kpaul"), 
                                    stringsAsFactors = FALSE)
                        [,1:14]) #There are blank columns after 14

# Rename columns to match uniform variable names ----------

setnames(dat_mc_p2, 
         c("COLUMN", 
           "ROW", 
           "EPA_SAMPLE_ID", 
           "ASSAY_CONC",
           "Plate_ID",
           "Response"), 
         c("coli", 
           "rowi",
           "spid",
           "conc",
           "apid",
           "rval"))

# Remove extra columns ----------

set(dat_mc_p2, 
    j=which(colnames(dat_mc_p2) %in% c("ALIQUOT_WELL_ID",
                                       "ALIQUOT_SOLVENT",
                                       "ALIQUOT_CONC",
                                       "ALIQUOT_CONC_UNIT",
                                       "ALIQUOT_PLATE_BARCODE",
                                       "SISTER_PLATE",
                                       "ASSAY_CONC_UNIT",
                                       "N")), 
    value=NULL)

# Add new columns ----------

dat_mc_p2[, wllq := 1L]
dat_mc_p2[, wllt := 't'] 
dat_mc_p2[, srcf := 'L1_KP036_AUR_TPO_PhaseII_Dose_Response.csv']
dat_mc_p2[, acsn := 'AUR_TPO_DoseResponse_PhaseII']

# Modify column entries ----------

dat_mc_p2[is.na(spid), wllt := 'b']
dat_mc_p2[spid %in% c('MMI', 'NAP', 'LUCINH1', 'LUCINH2'), wllt := 'c']
dat_mc_p2[spid == 'DMSO',  wllt := 'n']
dat_mc_p2[spid == 'NEG',  wllt := 'f']

#Plate 6 (18 after change below) seems to have error. 
#Values are much lower, and some dose response is in opposite direction. 
#Set wllq to 0L
dat_mc_p2[apid==6,wllq := 0L]

# Clean up column types ----------

dat_mc_p2[, rowi := unname(letter2number[rowi])]   #Use letter2number from chunk mc_phase1
dat_mc_p2[, apid := as.character(paste("phase_2_plate", apid, sep = "_"))]
dat_mc_p2[, rval := as.numeric(rval)]

# Set TX codes that were retested to wllq of 0 ----------

retest_spids <- c("TX000781", "TX000845", "TX001425", "TX002339", "TX003444", 
                  "TX003725", "TX004371", "TX004536", "TX006158", "TX007182", 
                  "TX007242", "TX008327", "TX008582", "TX008666", "TX009149", 
                  "TX009328", "TX009711", "TX009779", "TX011582", "TX011587", 
                  "TX011630", "TX011653", "TX015181", "TX015273", "TX015590", 
                  "TX015604", "TX109149", "TX111582", "TX111587", "TX209149", 
                  "TX211582", "TX211587")

dat_mc_p2[spid %in% retest_spids, wllq := 0]

AUR TPO MC retest with GUA additions

Collected_By Date_Collected Order_Number Test_Samples Plates
Steve Simmons July 2015 Unknown 82 6
# Read in Data ----------

file.loc <- './Data/AUR_level.0_master.csv'
dat_mc_retest <- data.table(read.table (file = file.loc, 
                                        header = TRUE, 
                                        sep=',', 
                                        comment.char="", 
                                        na.strings = c("na", "#VALUE!"), 
                                        stringsAsFactors = FALSE))

# Rename columns to match uniform variable names ----------

setnames(dat_mc_retest, 
         c("Column", 
           "Row", 
           "EPA_Sample_ID", 
           "Assay_conc..uM.",
           "RFU"), 
         c("coli", 
           "rowi",
           "spid",
           "conc",
           "rval"))

# Add new columns ----------

dat_mc_retest[, wllq := 1L ]
dat_mc_retest[, wllt := 't']
dat_mc_retest[, srcf := 'AUR_level.0_master.csv']
dat_mc_retest[, acsn := 'AUR_TPO_DoseResponse_PhaseII']
dat_mc_retest[, apid := paste(Replicate,AUR_Plate,Bioloigcal_sample,sep="_")]

# Clean up column types ----------

dat_mc_retest[, rowi := unname(letter2number[rowi])]   #Use letter2number from chunk mc_phase1
dat_mc_retest[, apid := as.character(apid)]    
dat_mc_retest[, rval := as.numeric(rval)]     

# Fix Lucinh formatting ----------

lucs <- c("LUCINH1", "LUCINH2", "LUCINH3", "LUCINH4",     
          "LUCINH5", "LUCINH6", "LUCINH7", "LUCINH8", "LUCINH9", "LUCINH10", "LUCINH11",    
          "LUCINH12", "LUCINH13")
dat_mc_retest[rowi==15L & spid %in% lucs, spid := "LUCINH1"]
dat_mc_retest[rowi==16L & spid %in% lucs, spid := "LUCINH2"]

# Modify column entries ----------

chem_controls <- c("MMI", "LUCINH1", "LUCINH2", "DCNQ", "DCNP", "PTU", "BP3")
dat_mc_retest[spid %in% chem_controls, wllt := 'c']  
dat_mc_retest[spid == 'DMSO', wllt := 'n']   
dat_mc_retest[spid == 'blank', wllt := 'f']   

# Remove extra rows ----------

#Keep only those with Microsomes, remove Buffer_only
dat_mc_retest <- dat_mc_retest[Bioloigcal_sample == 'microsomes']

# Remove extra columns ----------

set(dat_mc_retest, 
    j=which(colnames(dat_mc_retest) %in% c("Replicate", 
                                           "stock_conc..mM.", 
                                           "Aliquot_conc..mM.", 
                                           "Bioloigcal_sample", 
                                           "AUR_Plate")), 
    value=NULL)

AUR TPO mc combine and load

# Combine ----------

dat_mc <- rbindlist(list(dat_mc_p1, 
                         dat_mc_p2, 
                         dat_mc_retest),
                    use.names = TRUE)

# Map spids TX to TP ----------

dat_mc <- tcpl:::tcplSpidMap(dat_mc) #Function is not exported?
#Gives warning "The following source spids did not map to updated spids:"
#This is because newer assays with TP rather than TX don't need to map.
#The TP is simply left alone, the warning is fine.
#Remove extra columns added by tcplSpidMap
set(dat_mc, 
    j=which(colnames(dat_mc) %in% c("asid", 
                                    "aid", 
                                    "spid_source")), 
    value=NULL)

# Change NAP to DCNQ ----------

dat_mc[spid == "NAP", spid := "DCNQ"]
# Write level 0 data to the database ----------
tcplWriteLvl0(dat_mc, type="mc")

AUR TPO Single Concentration

AUR TPO sc phase I

Collected_By Date_Collected Order_Number Test_Samples Plates
Katie Paul Friedman Unknown Unknown 311 3

AUR TPO sc phase I data wrangling

This data was originally provided as a worksheet in excel document KP032 AURTPO ToxCast Phase I Single Pt Screen.xlsx I output tab Compiled N as a separate CSV file This CSV file is read in below, with file name KP032 AURTPO ToxCast Phase I Single Pt Screen_CompiledN.csv Looking at column 24, it seems as though this column marked as DMSO is either blank or NEG. I set this to blank so that it does not throw off normalization.

# Read in data ----------

file.loc <- './Data/KP032 AURTPO ToxCast Phase I Single Pt Screen_CompiledN.csv'
dat_sc_p1 <- data.table(read.table(file = file.loc, 
                                   header = TRUE, 
                                   sep=',', 
                                   na.strings = c("EMPTY", "Kpaul"), 
                                   stringsAsFactors = FALSE))


# Rename columns to match uniform variable names ----------

setnames(dat_sc_p1, 
         c("ROW", 
           "COLUMN", 
           "EPA_SAMPLE_ID", 
           "ASSAY_CONC"), 
         c("rowi", 
           "coli",
           "spid",
           "conc"))

# Remove extra columns ----------

set(dat_sc_p1, 
    j=which(colnames(dat_sc_p1) %in% c("WELL", 
                                       "ALIQUOT_WELL_ID",
                                       "ALIQUOT_SOLVENT",
                                       "ALIQUOT_CONC",
                                       "ALIQUOT_CONC_UNIT",
                                       "ALIQUOT_PLATE_BARCODE_SOURCE",
                                       "SISTER_PLATE",
                                       "ASSAY_CONC_UNIT")), 
    value=NULL)

# Add new columns ----------

dat_sc_p1[, wllq := 1L]                
dat_sc_p1[, wllt := 't']               
dat_sc_p1[, srcf := 'KP032 AURTPO ToxCast Phase I Single Pt Screen Compiled N.csv']
dat_sc_p1[, acsn := 'AUR_TPO_SingleConc_PhaseI']

# Modify column entries ----------

dat_sc_p1[is.na(spid), wllt := 'b'] 
dat_sc_p1[spid %in% c('MMI', 'NAP', 'LUCINH1', 'LUCINH2', 'MMB', 'DDZ', 'MBT'), 
          wllt := 'p'] 
dat_sc_p1[spid == 'DMSO', wllt := 'n']
dat_sc_p1[spid == 'NEG', wllt := 'f'] 
dat_sc_p1[coli == 24, wllt := 'b'] 
#See note above, coli == 24 seems to be blank or NEG, not DMSO

# Melt 'N' columns to get rval ----------

dat_sc_p1 <- melt(dat_sc_p1, 
                  id.vars = c('rowi', 'coli', 'spid', 'conc',
                              'wllq', 'wllt', 'srcf', 'acsn'),
                  measure.vars = c("N1", "N2", "N3"),
                  variable.name='apid', 
                  value.name='rval')

# Clean up column types ----------

dat_sc_p1[, rowi := unname(letter2number[rowi])]
dat_sc_p1[, rval := as.numeric(rval)]
dat_sc_p1[, apid := as.character(apid)]

AUR TPO sc phase II

Collected_By Date_Collected Order_Number Test_Samples Plates
Katie Paul Friedman Unknown Unknown 812 9

AUR TPO sc phase II data wrangling

This data was originally provided as a worksheet in excel document KP036 AUR-TPO Screening of the ToxCast Phase II Chemical Library022014.xlsx I output tab Longformat as a separate CSV file This CSV file is read in below, with file name KP036 AUR-TPO Screening of the ToxCast Phase II Chemical Library022014_Longformat.csv

# Read in data ----------

file.loc <- './Data/KP036 AUR-TPO Screening of the ToxCast Phase II Chemical Library022014_Longformat.csv'
dat_sc_p2 <- data.table(read.table (file = file.loc, 
                                    header = TRUE, 
                                    sep=',', 
                                    na.strings = c("BACKGROUND", "Kpaul", "NA"), 
                                    stringsAsFactors = FALSE))

# Rename columns to match uniform variable names ----------

setnames(dat_sc_p2, 
         c("COLUMN",
           "ROW", 
           "EPA_SAMPLE_ID", 
           "ASSAY_CONC",
           "plate_id",
           "Resp"), 
         c("coli", 
           "rowi",
           "spid",
           "conc",
           "apid",
           "rval"))

# Remove extra columns ----------

set(dat_sc_p2, j=which(colnames(dat_sc_p2) %in% c("ALIQUOT_WELL_ID",
                                                  "ALIQUOT_SOLVENT",
                                                  "ALIQUOT_CONC",
                                                  "ALIQUOT_CONC_UNIT",
                                                  "SISTER_PLATE",
                                                  "ASSAY_CONC_UNIT",
                                                  "ALIQUOT_PLATE_BARCODE",
                                                  "N",
                                                  "sample_tech_rep_id")), 
    value=NULL)

# Add new columns ----------

dat_sc_p2[, wllq := 1L]  
dat_sc_p2[, wllt := 't']    
dat_sc_p2[, srcf := 'KP036 AUR-TPO Screening of the ToxCast Phase II Chemical Library022014_Longformat.csv']         
dat_sc_p2[, acsn := 'AUR_TPO_SingleConc_PhaseII']

# Modify column entries ----------

dat_sc_p2[is.na(spid), wllt := 'b']     
dat_sc_p2[spid %in% c('MMI', 'NAP', 'LUCINH1', 'LUCINH2'), wllt := 'p']     
dat_sc_p2[spid == 'DMSO', wllt := 'n']   
dat_sc_p2[spid == 'NEG', wllt := 'f']   

# Clean up column types ----------

dat_sc_p2[, rowi := unname(letter2number[rowi])] 
#add 3 to apid to get it to start at 4, 
#this way it will be different than Phase I and not overwrite.
dat_sc_p2[, apid := as.character(apid + 3)]
dat_sc_p2[, rval := as.numeric(rval)] 

AUR TPO sc combine and load

# Combine ----------

dat_sc <- rbindlist(list(dat_sc_p1, 
                         dat_sc_p2),
                    use.names = TRUE)

# Map spids TX to TP ----------

dat_sc <- tcpl:::tcplSpidMap(dat_sc) #Function is not exported?
#Gives warning "The following source spids did not map to updated spids:"
#This is because newer assays with TP rather than TX don't need to map.
#The TP is simply left alone, the warning is fine.
#Remove extra columns added by tcplSpidMap
set(dat_sc, 
    j=which(colnames(dat_sc) %in% c("asid", 
                                    "aid", 
                                    "spid_source")), 
    value=NULL)

# Change NAP to DCNQ ----------

dat_sc[spid == "NAP", spid := "DCNQ"]
# Write level 0 data to the database ----------
tcplWriteLvl0(dat_sc, type="sc")

Cell Titer Glo (CTG)

CTG phase I

Collected_By Date_Collected Order_Number Test_Samples Plates
Katie Paul Friedman Unknown Unknown 100 12

CTG phase I data wrangling

This data was originally provided as a worksheet in excel document KP032 Compiled N from Dose Response.xlsx I output tab Longformat CTG as a separate CSV file. This CSV file is read in below, with file name KP032 Compiled N from Dose Response_LongformatCTG.csv The CSV file has values of #VALUE!, comment.char=“” allows these to be converted to NA. Only the first plate has values for concentration. I recalculate these, assuming the same dilution factor was used for all plates. This needs to be confirmed.

# Read in data ----------

file.loc <- './Data/KP032 Compiled N from Dose Response_LongformatCTG.csv'
dat_ctg_p1 <- data.table(read.table (file = file.loc, 
                                     header = TRUE, 
                                     sep=',', 
                                     comment.char="", 
                                     na.strings = c("na", "na_1", "#VALUE!"), 
                                     stringsAsFactors = FALSE))


# Populate conc column ----------
#Do this before next steps, using Aliquot conc values to propagate
#Need to confirm this is correct
#
#In .xlsx file, this seems to have been done using equation
#=((H9*0.35)/65)*1000 
#where H9 is Aliquot conc 

dat_ctg_p1[, conc := ((ALIQUOT_CONC*0.35)/65)*1000]

# Populate apid column ----------
dat_ctg_p1[, apid := paste("ctg_phase_1_plate", PLATE, "N", N, sep = "_")]

# Rename columns to match uniform variable names ----------

setnames(dat_ctg_p1, 
         c("COLUMN", 
           "ROW", 
           "EPA_SAMPLE_ID", 
           "resp"), 
         c("coli", 
           "rowi",
           "spid",
           "rval"))

# Remove extra columns ----------

set(dat_ctg_p1, 
    j=which(colnames(dat_ctg_p1) %in% c("sample_tech_rep_id",
                                        "sample_id",
                                        "rep_id",
                                        "WELL", 
                                        "ALIQUOT_WELL_ID",
                                        "ALIQUOT_SOLVENT",
                                        "ALIQUOT_CONC",
                                        "ALIQUOT_CONC_UNIT",
                                        "ALIQUOT_PLATE_BARCODE_SOURCE",
                                        "SISTER_PLATE",
                                        "ASSAY_CONC_UNIT",
                                        "PLATE",
                                        "N")), 
    value=NULL) 

# Add new columns -----------

dat_ctg_p1[, wllq := 1L]
dat_ctg_p1[, wllt := 't']
dat_ctg_p1[, srcf := 'KP032 Compiled N from Dose Response_LongformatCTG.csv'] 
dat_ctg_p1[, acsn := 'CellTiterGlo_PhaseI']

# Modify column entries ----------

dat_ctg_p1[is.na(spid), wllt := 'b']
dat_ctg_p1[spid %in% c('MMI', 
                       'NAP', 
                       'LUCINH1', 
                       'LUCINH2'), 
           wllt := 'c']
dat_ctg_p1[spid == 'DMSO', wllt := 'n']
dat_ctg_p1[spid == 'NEG', wllt := 'f']

# Clean up column types ----------

dat_ctg_p1[, rowi := unname(letter2number[rowi])]
dat_ctg_p1[, rval := as.numeric(rval)]

# Set TX codes that were retested to wllq of 0 ----------
#retest_spids from chunk mc_phase1
dat_ctg_p1[spid %in% retest_spids, wllq := 0]

CTG phase II

Collected_By Date_Collected Order_Number Test_Samples Plates
Katie Paul Friedman Unknown Unknown 252 18
# Read in data ----------

file.loc <- './Data/L1_KP036_CellTiterGlo_PhaseII_Dose_Response.csv'
dat_ctg_p2 <- data.table(read.table (file = file.loc, 
                                     header = TRUE, 
                                     sep=',', 
                                     na.strings = c("EMPTY", "Kpaul", "NA"), 
                                     stringsAsFactors = FALSE))

# Rename columns to match uniform variable names ----------

setnames(dat_ctg_p2, 
         c("COLUMN", 
           "ROW", 
           "EPA_SAMPLE_ID", 
           "ASSAY_CONC",
           "Plate_ID",
           "Response"), 
         c("coli", 
           "rowi",
           "spid",
           "conc",
           "apid",
           "rval"))

# Remove extra columns ----------

set(dat_ctg_p2, 
    j=which(colnames(dat_ctg_p2) %in% c("ALIQUOT_WELL_ID",
                                        "ALIQUOT_SOLVENT",
                                        "ALIQUOT_CONC",
                                        "ALIQUOT_CONC_UNIT",
                                        "ALIQUOT_PLATE_BARCODE",
                                        "SISTER_PLATE",
                                        "ASSAY_CONC_UNIT",
                                        "N")), 
    value=NULL)

# Add new columns ----------

dat_ctg_p2[, wllq := 1L]
dat_ctg_p2[, wllt := 't']
dat_ctg_p2[, srcf := 'L1_KP036_CellTiterGlo_PhaseII_Dose_Response.csv']
dat_ctg_p2[, acsn := 'CellTiterGlo_PhaseII']

# Modify column entries ----------

dat_ctg_p2[is.na(spid), wllt := 'b']
dat_ctg_p2[spid %in% c('MMI', 
                       'NAP', 
                       'LUCINH1', 
                       'LUCINH2'),
           wllt := 'c']
dat_ctg_p2[spid == 'DMSO', wllt := 'n']
dat_ctg_p2[spid == 'NEG', wllt := 'f']

# Clean up column types ----------

dat_ctg_p2[, rowi := unname(letter2number[rowi])]   #Use letter2number from chunk mc_phase1
dat_ctg_p2[, apid := as.character(paste("ctg_phase_2_plate", apid, sep = "_"))]
dat_ctg_p2[, rval := as.numeric(rval)]

# Set TX codes that were retested to wllq of 0 ----------
#retest_spids from chunk mc_phase1
dat_ctg_p2[spid %in% retest_spids, wllq := 0]

CTG retest with GUA additions

Collected_By Date_Collected Order_Number Test_Samples Plates
Steve Simmons July 2015 Unknown 82 6

CTG retest data wrangling

File provided by Steve Simmons June 19 2015

# Read in data ----------

file.loc <- './Data/CTG_level.0_master.csv'
dat_ctg_retest <- data.table(TPO <- read.table(file = file.loc, 
                                               header = TRUE, 
                                               sep=',', 
                                               comment.char="", 
                                               na.strings = c("na", "#VALUE!"), 
                                               stringsAsFactors = FALSE))

# Rename columns to match uniform variable names ----------

setnames(dat_ctg_retest, 
         c("Column", 
           "Row", 
           "EPA_Sample_ID", 
           "Assay_conc..uM.",
           "RLU"), 
         c("coli", 
           "rowi",
           "spid",
           "conc",
           "rval"))

# Add new columns ----------

dat_ctg_retest[, wllq := 1L]
dat_ctg_retest[, wllt := 't']
dat_ctg_retest[, srcf := 'CTG_level.0_master.csv']        
dat_ctg_retest[, acsn := 'CellTiterGlo_PhaseII']
dat_ctg_retest[, apid := paste(Replicate,AUR_Plate,sep="_")]

# Clean up column types ----------

dat_ctg_retest[, rowi := unname(letter2number[rowi])]
dat_ctg_retest[, rval := as.numeric(rval)]

# Fix Lucinh formatting ----------

lucs <- c("LUCINH1", "LUCINH2", "LUCINH3", "LUCINH4",     
          "LUCINH5", "LUCINH6", "LUCINH7", "LUCINH8", 
          "LUCINH9", "LUCINH10", "LUCINH11", "LUCINH12", 
          "LUCINH13")
dat_ctg_retest[rowi==15L & spid %in% lucs, spid := "LUCINH1"]
dat_ctg_retest[rowi==16L & spid %in% lucs, spid := "LUCINH2"]

# Modify column entries ----------

chem_controls <- c("MMI", "LUCINH1", "LUCINH2", "DCNQ", "DCNP", "PTU", "BP3")
dat_ctg_retest[spid %in% chem_controls, wllt := 'c']
dat_ctg_retest[spid == 'DMSO', wllt := 'n']
dat_ctg_retest[spid == 'blank', wllt := 'f']

# Remove extra columns ----------

set(dat_ctg_retest, 
    j=which(colnames(dat_ctg_retest) %in% c("Replicate", 
                                            "stock_conc..mM.", 
                                            "Aliquot_conc..mM.", 
                                            "Bioloigcal_sample", 
                                            "AUR_Plate")), 
    value=NULL)

CTG combine and load

# Combine ----------

dat_ctg <- rbindlist(list(dat_ctg_p1, 
                          dat_ctg_p2, 
                          dat_ctg_retest),
                     use.names = TRUE)

# Map spids TX to TP ----------

dat_ctg <- tcpl:::tcplSpidMap(dat_ctg) #Function is not exported?
#Gives warning "The following source spids did not map to updated spids:"
#This is because newer assays with TP rather than TX don't need to map.
#The TP is simply left alone, the warning is fine.
#Remove extra columns added by tcplSpidMap
set(dat_ctg, 
    j=which(colnames(dat_ctg) %in% c("asid", 
                                     "aid", 
                                     "spid_source")), 
    value=NULL)

# Change NAP to DCNQ ----------

dat_ctg[spid == "NAP", spid := "DCNQ"]
# Write level 0 data to the database ----------

tcplWriteLvl0(dat_ctg, type="mc")

Quantilum (QLI)

QLI phase I/II

Collected_By Date_Collected Order_Number Test_Samples Plates
Steve Simmons July 2015 Unknown 341 30

QLI phase I/II data wrangling

# Read in data ----------

file.loc <- './Data/KP032 plates_QLI_2.0.csv'
dat_qli_p1p2 <- data.table(read.table(file = file.loc, 
                                      header = TRUE, 
                                      sep=',', 
                                      comment.char="", 
                                      na.strings = c("na", "#VALUE!", "EMPTY", "Kpaul", "NA"), 
                                      stringsAsFactors = FALSE))

# Rename columns to match uniform variable names ----------

setnames(dat_qli_p1p2, 
         c("COLUMN",
           "ROW",
           "EPA_SAMPLE_ID",
           "ASSAY_CONC",
           "PLATE"), 
         c("coli",
           "rowi",
           "spid",
           "conc",
           "apid"))

# Remove extra columns ----------

set(dat_qli_p1p2, j=which(colnames(dat_qli_p1p2) %in% c("ALIQUOT_WELL_ID", 
                                                        "ALIQUOT_SOLVENT", 
                                                        "ALIQUOT_CONC", 
                                                        "ALIQUOT_CONC_UNIT", 
                                                        "SISTER_PLATE",
                                                        "ASSAY_CONC_UNIT",
                                                        "ALIQUOT_PLATE_BARCODE_SOURCE",
                                                        "WELL")), 
    value=NULL)

# Add new columns ----------

dat_qli_p1p2[ , wllq := 1L]
dat_qli_p1p2[ , wllt := 't']
dat_qli_p1p2[ , acsn := 'Quantilum_Inhibition_2']

#The first 4 chemical plates were phase 1, final 6 were phase 2
dat_qli_p1p2[apid %in% c(1:4) , srcf := 'KP032 plates_QLI_2.0.csv Phase I']
dat_qli_p1p2[apid %in% c(5:10) , srcf := 'KP032 plates_QLI_2.0.csv Phase II']

# Modify column entries ----------

dat_qli_p1p2[is.na(spid), wllt := 'b']
dat_qli_p1p2[spid %in% c('MMI', 'NAP', 'LUCINH1', 'LUCINH2'), wllt := 'c']
dat_qli_p1p2[spid == 'DMSO', wllt := 'n']
dat_qli_p1p2[spid == 'NEG', wllt := 'f']

# Melt data to get one column of response ----------
#N1, N2, N3 become values in N
#values from N1, N2, N3 become values in rval

dat_qli_p1p2 <- melt(dat_qli_p1p2, 
                     id.vars = c('rowi', 'coli', 'spid', 'conc',
                                 'wllq', 'wllt', 'srcf', 'acsn', 'apid'),
                     measure.vars=c("N1", "N2", "N3"),
                     variable.name='N', 
                     value.name='rval')

dat_qli_p1p2[srcf == "KP032 plates_QLI_2.0.csv Phase I", phase := "phase_1"]
dat_qli_p1p2[srcf == "KP032 plates_QLI_2.0.csv Phase II", phase := "phase_2"]

dat_qli_p1p2[, apid := paste("qli", phase, "plate", apid, N, sep = "_")]

# remove more columns ----------

set(dat_qli_p1p2, j=which(colnames(dat_qli_p1p2) %in% c("N", "phase")), value=NULL)

# Modify column entries ----------

dat_qli_p1p2[, rowi := unname(letter2number[rowi])]
dat_qli_p1p2[, rval := as.numeric(rval)]

# Set TX codes that were retested to wllq of 0 ----------
#retest_spids from chunk mc_phase1

dat_qli_p1p2[spid %in% retest_spids, wllq := 0]

QLI retest with GUA additions

Collected_By Date_Collected Order_Number Test_Samples Plates
Steve Simmons July 2015 Unknown 82 6
# Read in data ----------

file.loc <- './Data/QLI_level.0_master.csv'
dat_qli_retest <- data.table(read.table (file = file.loc, 
                                         header = TRUE, 
                                         sep=',', 
                                         comment.char="", 
                                         na.strings = c("na", "#VALUE!"), 
                                         stringsAsFactors = FALSE))

# Rename columns to match uniform variable names ----------

setnames(dat_qli_retest, 
         c("Column", 
           "Row", 
           "EPA_Sample_ID",
           "Assay_conc..uM.",
           "RLU"), 
         c("coli", 
           "rowi",
           "spid",
           "conc",
           "rval"))

# Add new columns ----------

dat_qli_retest[, wllq := 1L]
dat_qli_retest[, wllt := 't']
dat_qli_retest[, srcf := 'QLI_level.0_master.csv']
dat_qli_retest[, acsn := 'Quantilum_Inhibition_2']
dat_qli_retest[, apid := paste(Replicate,AUR_Plate,sep="_")]

# Clean up column types ----------

dat_qli_retest[, rowi := unname(letter2number[rowi])]
dat_qli_retest[, rval := as.numeric(rval)]

# Fix Lucinh formatting ----------

lucs <- c("LUCINH1", "LUCINH2", "LUCINH3", "LUCINH4",     
          "LUCINH5", "LUCINH6", "LUCINH7", "LUCINH8", 
          "LUCINH9", "LUCINH10", "LUCINH11", "LUCINH12", 
          "LUCINH13")
dat_qli_retest[rowi==15L & spid %in% lucs, spid := "LUCINH1"]
dat_qli_retest[rowi==16L & spid %in% lucs, spid := "LUCINH2"]

# Modify column entries ----------

chem_controls <- c("MMI", "LUCINH1", "LUCINH2", "DCNQ", "DCNP", "PTU", "BP3")
dat_qli_retest[spid %in% chem_controls, wllt := 'c']
dat_qli_retest[spid == 'DMSO', wllt := 'n']
dat_qli_retest[spid == 'blank', wllt := 'f']

# Remove extra columns ----------

set(dat_qli_retest, 
    j=which(colnames(dat_qli_retest) %in% c("Replicate", 
                                            "stock_conc..mM.", 
                                            "Bioloigcal_sample", 
                                            "AUR_Plate")), 
    value=NULL)

QLI combine and load

# Combine ----------

dat_qli <- rbindlist(list(dat_qli_p1p2, 
                          dat_qli_retest),
                     use.names = TRUE)

# Map spids TX to TP ----------

dat_qli <- tcpl:::tcplSpidMap(dat_qli) #Function is not exported?
#Gives warning "The following source spids did not map to updated spids:"
#This is because newer assays with TP rather than TX don't need to map.
#The TP is simply left alone, the warning is fine.
#Remove extra columns added by tcplSpidMap
set(dat_qli, 
    j=which(colnames(dat_qli) %in% c("asid", 
                                     "aid", 
                                     "spid_source")), 
    value=NULL)

# Change NAP to DCNQ ----------

dat_qli[spid == "NAP", spid := "DCNQ"]
# Write level 0 data to the database ----------
tcplWriteLvl0(dat_qli, type="mc")

Guaiacol TPO

Guaiacol TPO read and load

Collected_By Date_Collected Order_Number Test_Samples Plates
Steve Simmons June 2015 Unknown Unknown Unknown
# Read in data ----------

file.loc <- './Data/GUA_level.0_master.csv'
dat_gua <- data.table(read.table (file = file.loc, 
                                  header = TRUE, 
                                  sep=',', 
                                  comment.char="", 
                                  na.strings = c("na", "#VALUE!"), 
                                  stringsAsFactors = FALSE))

# Rename columns to match uniform variable names ----------
setnames(dat_gua, 
         c("Column",
           "Row",
           "EPA_Sample_ID",
           "Assay_conc..uM."), 
         c("coli",
           "rowi",
           "spid",
           "conc"
         ))

# Add new columns ----------

dat_gua[, wllq := 1L]
dat_gua[, wllt := 't']
dat_gua[, srcf := 'GUA_level.0_master.csv']
dat_gua[, acsn := 'GUA_TPO_DoseResponse']
dat_gua[, apid := paste(Replicate,GUA_Plate,Bioloigcal_sample,sep="_")]

# Caluclate rval ----------

final_cols <- c("t_145",   
                "t_145.1", 
                "t_145.2", 
                "t_145.3", 
                "t_145.4", 
                "t_145.5", 
                "t_145.6", 
                "t_145.7", 
                "t_145.8", 
                "t_145.9", 
                "t_146",   
                "t_146.1", 
                "t_146.2", 
                "t_146.3", 
                "t_146.4", 
                "t_146.5", 
                "t_146.6", 
                "t_146.7", 
                "t_146.8")
first_cols <- c("t_64", "t_64.25")

first_median_vect <- apply(dat_gua[, first_cols, with=FALSE], 1, function(x) median(x))
final_median_vect <- apply(dat_gua[, final_cols, with=FALSE], 1, function(x) median(x))
dat_gua[, rval := final_median_vect - first_median_vect]

# remove the time resolved columns ----------

set(dat_gua, j=which(colnames(dat_gua) %in% c(first_cols, final_cols)), value=NULL)

# Clean up column types ----------

dat_gua[, rowi := unname(letter2number[rowi])]
dat_gua[, rval := as.numeric(rval)]

# Modify column entries ----------

dat_gua[spid %in% c('MMI', 'PTU', 'LifeTech.AUR', 'BP3'), wllt := 'c']
dat_gua[spid == 'DMSO', wllt := 'n']

# Remove extra rows ----------
#Keep only those with Microsomes, remove Buffer_only
dat_gua <- dat_gua[Bioloigcal_sample == 'Microsomes']

# Remove extra columns ----------

set(dat_gua, 
    j=which(colnames(dat_gua) %in% c("Replicate", 
                                     "Aliquot_conc..mM.", 
                                     "Bioloigcal_sample", 
                                     "t_0", 
                                     "GUA_Plate")), 
    value=NULL)

# Add NA rows for blank wells that were unused ----------

#Only columns 1:6 and rows A:E were used. Set all others to NA rows
#Eventually rows will be 1:16 and columns 1:24
apids <- unique(dat_gua[, apid])
blank_data <- expand.grid(rowi=1:16, 
                          coli=1:24,
                          spid=NA,
                          conc=NA,
                          wllq=0L,
                          wllt='b',
                          srcf='GUA_level.0_master.csv',
                          acsn='GUA_TPO_DoseResponse',
                          apid=apids,
                          rval=NA,
                          stringsAsFactors=FALSE)
blank_data <- data.table(blank_data)
blank_data <- blank_data[!(rowi %in% 1:5 & coli %in% 1:6)]

#combine blank_data and dat_gua

dat_gua <- rbind(dat_gua, blank_data, use.names=TRUE, fill=FALSE)
# Write level 0 data to the database ----------

tcplWriteLvl0(dat_gua, type="mc")

Process data with ToxCast pipeline

AUR TPO multi concentration

AUR TPO MC complete run

The following block of code will set all of the proper methods for the pipeline and process the data with these settings.

# acid and aeid can be found using functions ----------
# tcpl::tcplLoadAsid()
# tcpl::tcplLoadAcid()
# tcpl::tcplLoadAeid()

acid <- 963
aeid <- 1508

# set methods level 2 ----------

tcplClearMthd(lvl = 2L, id = acid, type = "mc")
tcplAssignMthd(lvl = 2L,
               id = acid, 
               mthd_id = 1, 
               ordr = 1, 
               type = "mc") 

# set methods level 3 ----------


tcplClearMthd(lvl = 3L, id = aeid, type = "mc") 
tcplAssignMthd(lvl = 3L,
               id = aeid, 
               mthd_id = c(11, 28, 5),
               ordr = c(1,2,3),
               type = "mc")

# set methods level 5 ----------

tcplClearMthd(lvl = 5L, id = aeid, type = "mc") 
tcplAssignMthd(lvl = 5L,
               id = aeid,
               mthd_id = 2, 
               type = "mc") 

# set methods level 6 ----------

tcplClearMthd(lvl = 6L, id = aeid, type = "mc") 
tcplAssignMthd(lvl = 6L,
               id = aeid,
               mthd_id = c(6, 7, 8, 10, 11, 12, 15, 16), 
               type = "mc") 

# run all ----------

tcplRun(id = acid, 
        slvl = 1L, 
        elvl = 6L, 
        type = "mc",
        mc.cores = 1L)

# make aeid plots ----------

tcplMakeAeidPlts(aeid, lvl=6L)

The above analysis runs with the following settings. A complete list of available setting can be found here.

AUR TPO MC level 2 settings

acid mthd mthd_id ordr
963 none 1 1

There are no level 2 methods applied to this data set. This level is used to do a global scaling or shifting of the data, and is not necessary for the TPO dataset.

AUR TPO MC level 3 settings

aeid mthd mthd_id ordr
1508 bval.apid.nwlls.med 11 1
1508 pval.apid.f.min 28 2
1508 resp.pc 5 3

Level 3 is the step when the data is normalized. There are 3 methods applied during level 3.

  1. bval.apid.nwlls.med: calculates a baseline value (bval) by taking the median of the DMSO wells at the plate level.
  2. pval.apid.f.min: calculates a positive control value (pval) by taking the median of the ‘blank’ wells at the plate level.
    • In this assay, the blank wells were those with microsome, DMSO, but no H2O2 and therefore represent maximal inhibition while correcting for background fluorescence.
  3. resp.pc: calculates a response value from bval and pval using equation \[ resp = \frac{cval - bval}{pval - bval}*100 \] where cval is the measured response in raw fluorescence units for a particular well.

AUR TPO MC level 5 settings

aeid mthd mthd_id
1508 pc20 2

Level 5 is the step where the cutoff value is set, the winning model is chosen, and a hit call is determined. This analysis sets to cutoff to 20%.

AUR TPO MC level 6 settings

aeid mthd mthd_id nddr
1508 singlept.hit.high 6 0
1508 singlept.hit.mid 7 0
1508 multipoint.neg 8 0
1508 noise 10 0
1508 border.hit 11 0
1508 border.miss 12 0
1508 gnls.lowconc 15 0
1508 overfit.hit 16 0

Level 6 adds flags to the analysis. These do not change the hit call, but will flag a curve in it meets the criteria for a particular flag. The flags that are evaluated in this analysis are in the above table.

Cell Titer Glo multi concentration

CTG MC complete run

The following block of code will set all of the proper methods for the pipeline and process the data with these settings.

# acid and aeid can be found using functions ----------
# tcpl::tcplLoadAsid()
# tcpl::tcplLoadAcid()
# tcpl::tcplLoadAeid()

ctg_acid <- 964
ctg_aeid <- 1509

# set methods level 2 ----------

tcplClearMthd(lvl = 2L, id = ctg_acid, type = "mc")
tcplAssignMthd(lvl = 2L,
               id = ctg_acid, 
               mthd_id = 1, 
               ordr = 1, 
               type = "mc") 

# set methods level 3 ----------


tcplClearMthd(lvl = 3L, id = ctg_aeid, type = "mc") 
tcplAssignMthd(lvl = 3L,
               id = ctg_aeid, 
               mthd_id = c(11, 32, 5),
               ordr = c(1,2,3),
               type = "mc")

# set methods level 5 ----------

tcplClearMthd(lvl = 5L, id = ctg_aeid, type = "mc") 
tcplAssignMthd(lvl = 5L,
               id = ctg_aeid,
               mthd_id = 2, 
               type = "mc") 

# set methods level 6 ----------

tcplClearMthd(lvl = 6L, id = ctg_aeid, type = "mc") 
tcplAssignMthd(lvl = 6L,
               id = ctg_aeid,
               mthd_id = c(6, 7, 8, 10, 11, 12, 15, 16), 
               type = "mc") 

# run all ----------

tcplRun(id = ctg_acid, 
        slvl = 1L, 
        elvl = 6L, 
        type = "mc",
        mc.cores = 1L)

# make aeid plots ----------

tcplMakeAeidPlts(ctg_aeid, lvl=6L)

CTG MC level 2 settings

acid mthd mthd_id ordr
964 none 1 1

There are no level 2 methods applied to this data set. This level is used to do a global scaling or shifting of the data, and is not necessary for the TPO dataset.

CTG MC level 3 settings

aeid mthd mthd_id ordr
1509 bval.apid.nwlls.med 11 1
1509 pval.zero 32 2
1509 resp.pc 5 3

Level 3 is the step when the data is normalized. There are 3 methods applied during level 3.

  1. bval.apid.nwlls.med: calculates a baseline value (bval) by taking the median of the DMSO wells at the plate level.
  2. pval.zero: sets a positive control value (pval) to 0.
  3. resp.pc: calculates a response value from bval and pval using equation \[ resp = \frac{cval - bval}{pval - bval}*100 \] where cval is the measured response in raw fluorescence units for a particular well.

CTG MC level 5 settings

aeid mthd mthd_id
1509 bmad3 1
1509 pc20 2

Level 5 is the step where the cutoff value is set, the winning model is chosen, and a hit call is determined. This analysis sets to cutoff to 20%.

CTG MC level 6 settings

aeid mthd mthd_id nddr
1509 singlept.hit.high 6 0
1509 singlept.hit.mid 7 0
1509 multipoint.neg 8 0
1509 noise 10 0
1509 border.hit 11 0
1509 border.miss 12 0
1509 gnls.lowconc 15 0
1509 overfit.hit 16 0

Level 6 adds flags to the analysis. These do not change the hit call, but will flag a curve in it meets the criteria for a particular flag. The flags that are evaluated in this analysis are in the above table.

QLI multi concentration

QLI MC complete run

The following block of code will set all of the proper methods for the pipeline and process the data with these settings.

# acid and aeid can be found using functions ----------
# tcpl::tcplLoadAsid()
# tcpl::tcplLoadAcid()
# tcpl::tcplLoadAeid()

qli_acid <- 1146
qli_aeid <- 1848

# set methods level 2 ----------

tcplClearMthd(lvl = 2L, id = qli_acid, type = "mc")
tcplAssignMthd(lvl = 2L,
               id = qli_acid, 
               mthd_id = 1, 
               ordr = 1, 
               type = "mc") 

# set methods level 3 ----------


tcplClearMthd(lvl = 3L, id = qli_aeid, type = "mc") 
tcplAssignMthd(lvl = 3L,
               id = qli_aeid, 
               mthd_id = c(11, 32, 5),
               ordr = c(1,2,3),
               type = "mc")

# set methods level 5 ----------

tcplClearMthd(lvl = 5L, id = qli_aeid, type = "mc") 
tcplAssignMthd(lvl = 5L,
               id = qli_aeid,
               mthd_id = 2, 
               type = "mc") 

# set methods level 6 ----------

tcplClearMthd(lvl = 6L, id = qli_aeid, type = "mc") 
tcplAssignMthd(lvl = 6L,
               id = qli_aeid,
               mthd_id = c(6, 7, 8, 10, 11, 12, 15, 16), 
               type = "mc") 

# run all ----------

tcplRun(id = qli_acid, 
        slvl = 1L, 
        elvl = 6L, 
        type = "mc",
        mc.cores = 1L)

# make aeid plots ----------

tcplMakeAeidPlts(qli_aeid, lvl=6L)

QLI MC level 2 settings

acid mthd mthd_id ordr
1146 none 1 1

There are no level 2 methods applied to this data set. This level is used to do a global scaling or shifting of the data, and is not necessary for the TPO dataset.

QLI MC level 3 settings

aeid mthd mthd_id ordr
1848 bval.apid.nwlls.med 11 1
1848 pval.zero 32 2
1848 resp.pc 5 3

Level 3 is the step when the data is normalized. There are 3 methods applied during level 3.

  1. bval.apid.nwlls.med: calculates a baseline value (bval) by taking the median of the DMSO wells at the plate level.
  2. pval.zero: sets a positive control value (pval) to 0.
  3. resp.pc: calculates a response value from bval and pval using equation \[ resp = \frac{cval - bval}{pval - bval}*100 \] where cval is the measured response in raw fluorescence units for a particular well.

QLI MC level 5 settings

aeid mthd mthd_id
1848 pc20 2

Level 5 is the step where the cutoff value is set, the winning model is chosen, and a hit call is determined. This analysis sets to cutoff to 20%.

QLI MC level 6 settings

aeid mthd mthd_id nddr
1848 singlept.hit.high 6 0
1848 singlept.hit.mid 7 0
1848 multipoint.neg 8 0
1848 noise 10 0
1848 border.hit 11 0
1848 border.miss 12 0
1848 gnls.lowconc 15 0
1848 overfit.hit 16 0

Level 6 adds flags to the analysis. These do not change the hit call, but will flag a curve in it meets the criteria for a particular flag. The flags that are evaluated in this analysis are in the above table.

GUA TPO multi concentration

GUA MC complete run

The following block of code will set all of the proper methods for the pipeline and process the data with these settings.

# acid and aeid can be found using functions ----------
# tcpl::tcplLoadAsid()
# tcpl::tcplLoadAcid()
# tcpl::tcplLoadAeid()

gua_acid <- 1128
gua_aeid <- 1824

# set methods level 2 ----------

tcplClearMthd(lvl = 2L, id = gua_acid, type = "mc")
tcplAssignMthd(lvl = 2L,
               id = gua_acid, 
               mthd_id = 1, 
               ordr = 1, 
               type = "mc") 

# set methods level 3 ----------


tcplClearMthd(lvl = 3L, id = gua_aeid, type = "mc") 
tcplAssignMthd(lvl = 3L,
               id = gua_aeid, 
               mthd_id = c(11, 32, 5),
               ordr = c(1,2,3),
               type = "mc")

# set methods level 5 ----------

tcplClearMthd(lvl = 5L, id = gua_aeid, type = "mc") 
tcplAssignMthd(lvl = 5L,
               id = gua_aeid,
               mthd_id = 2, 
               type = "mc") 

# set methods level 6 ----------

tcplClearMthd(lvl = 6L, id = gua_aeid, type = "mc") 
tcplAssignMthd(lvl = 6L,
               id = gua_aeid,
               mthd_id = c(6, 7, 8, 10, 11, 12, 15, 16), 
               type = "mc") 

# run all ----------

tcplRun(id = gua_acid, 
        slvl = 1L, 
        elvl = 6L, 
        type = "mc",
        mc.cores = 1L)

# make aeid plots ----------

tcplMakeAeidPlts(gua_aeid, lvl=6L)

GUA TPO MC level 2 settings

acid mthd mthd_id ordr
1128 none 1 1

There are no level 2 methods applied to this data set. This level is used to do a global scaling or shifting of the data, and is not necessary for the TPO dataset.

GUA TPO MC level 3 settings

aeid mthd mthd_id ordr
1824 bval.apid.nwlls.med 11 1
1824 pval.zero 32 2
1824 resp.pc 5 3

Level 3 is the step when the data is normalized. There are 3 methods applied during level 3.

  1. bval.apid.nwlls.med: calculates a baseline value (bval) by taking the median of the DMSO wells at the plate level.
  2. pval.zero: sets a positive control value (pval) to 0.
  3. resp.pc: calculates a response value from bval and pval using equation \[ resp = \frac{cval - bval}{pval - bval}*100 \] where cval is the measured response in raw fluorescence units for a particular well.

GUA TPO MC level 5 settings

aeid mthd mthd_id
1824 pc20 2

Level 5 is the step where the cutoff value is set, the winning model is chosen, and a hit call is determined. This analysis sets to cutoff to 20%.

GUA TPO MC level 6 settings

aeid mthd mthd_id nddr
1824 singlept.hit.high 6 0
1824 singlept.hit.mid 7 0
1824 multipoint.neg 8 0
1824 noise 10 0
1824 border.hit 11 0
1824 border.miss 12 0
1824 gnls.lowconc 15 0
1824 overfit.hit 16 0

Level 6 adds flags to the analysis. These do not change the hit call, but will flag a curve in it meets the criteria for a particular flag. The flags that are evaluated in this analysis are in the above table.

AUR TPO single concentration

Results analysis

Plate diagnostics

Parameters of interest are Z’-factor and CV. We calculate these for every plate. Because each plate and assay and a different layout, the exact method varies.

acids <- c(963L, 964L, 1146L, 1128L)
dat_L0_mc <- tcplPrepOtpt(tcplLoadData(lvl=0L, fld = "acid", val = acids, type = "mc"))
dat_L0_sc <- tcplPrepOtpt(tcplLoadData(lvl=0L, fld = "acid", val = acids, type = "sc"))
dat_L0_mc[, acid := paste(acid, "_mc", sep="")]
dat_L0_sc[, acid := paste(acid, "_sc", sep="")]
dat_L0 <- rbindlist(list(dat_L0_mc, dat_L0_sc), 
                    use.names = TRUE, 
                    fill = TRUE)
acids <- c(963L, 964L, 1146L, 1128L)
dat_L0_mc <- tcplPrepOtpt(tcplLoadData(lvl=0L, fld = "acid", val = acids, type = "mc"))
dat_L0_sc <- tcplPrepOtpt(tcplLoadData(lvl=0L, fld = "acid", val = acids, type = "sc"))
dat_L0_mc[, acid := paste(acid, "_mc", sep="")]
dat_L0_sc[, acid := paste(acid, "_sc", sep="")]
dat_L0 <- rbindlist(list(dat_L0_mc, dat_L0_sc), 
                    use.names = TRUE, 
                    fill = TRUE)

# Database column 24 for phase I should be blank, not DMSO

dat_L0[srcf == "KP032 AURTPO ToxCast Phase I Single Pt Screen Compiled N.csv" &
         coli == 24,
       wllt := 'f']

dat_mmi_all <- data.table()
dat_qli_all <- data.table()
dat_nap_all <- data.table()
dat_mmi_fit_all <- data.table()
dat_ctg_fit_all <- data.table()
dat_qli_fit_all <- data.table()
sum_stats <- list()
i <- 1 #used in list of table stats before rbindlist after for loops
for(assay in unique(dat_L0[, acid])){
  for(plate in unique(dat_L0[acid==assay, apid])){
    dmso_sd <- sd(dat_L0[acid==assay & apid == plate & wllt=='n', rval])
    dmso_mean <- mean(dat_L0[acid==assay & apid == plate & wllt=='n', rval])
    CV <- dmso_sd/dmso_mean
    
    dmso_mad <- mad(dat_L0[acid==assay & apid == plate & wllt=='n', rval])
    dmso_median <- median(dat_L0[acid==assay & apid == plate & wllt=='n', rval])
    CV_robust <- dmso_mad/dmso_median
    
    pval <- NA
    psd <- NA
    bval <- NA
    bsd <- NA
    mmiconcs <- NA
    mmi_high_concs <- NA
    z_prime_factor <- NA
    
    if(assay %in% c("963_sc", "963_mc", "1128_mc")){
      mmiconcs <- sort(dat_L0[acid==assay & 
                                apid == plate & 
                                spid == "MMI",
                              conc])
      mmi_high_concs <- tail(mmiconcs,2)
      pval <- mean(dat_L0[acid==assay & 
                            apid == plate & 
                            spid == "MMI" & 
                            conc %in% mmi_high_concs, 
                          rval])
      psd <- sd(dat_L0[acid==assay & 
                         apid == plate & 
                         spid == "MMI" & 
                         conc %in% mmi_high_concs, 
                       rval])
      pval_med <- median(dat_L0[acid==assay & 
                                  apid == plate & 
                                  spid == "MMI" & 
                                  conc %in% mmi_high_concs, 
                                rval])
      pmad <- mad(dat_L0[acid==assay & 
                           apid == plate & 
                           spid == "MMI" & 
                           conc %in% mmi_high_concs, 
                         rval])
    }
    else if(assay == "964_mc"){
      mmiconcs <- sort(dat_L0[acid==assay & 
                                apid == plate & 
                                spid == "DCNQ", 
                              conc])
      mmi_high_concs <- tail(mmiconcs,2)
      pval <- mean(dat_L0[acid==assay & 
                            apid == plate & 
                            spid == "DCNQ" & 
                            conc %in% mmi_high_concs, 
                          rval])
      psd <- sd(dat_L0[acid==assay & 
                         apid == plate & 
                         spid == "DCNQ" & 
                         conc %in% mmi_high_concs, 
                       rval])
      pval_med <- median(dat_L0[acid==assay & 
                                  apid == plate & 
                                  spid == "DCNQ" & 
                                  conc %in% mmi_high_concs, 
                                rval])
      pmad <- mad(dat_L0[acid==assay & 
                           apid == plate & 
                           spid == "DCNQ" & 
                           conc %in% mmi_high_concs, 
                         rval])
    }
    else if(assay == "1146_mc"){
      mmiconcs <- sort(dat_L0[acid==assay & 
                                apid == plate & 
                                spid == "LUCINH2", 
                              conc])
      mmi_high_concs <- tail(mmiconcs,2)
      pval <- mean(dat_L0[acid==assay & 
                            apid == plate & 
                            spid == "LUCINH2" & 
                            conc %in% mmi_high_concs, 
                          rval])
      psd <- sd(dat_L0[acid==assay & 
                         apid == plate & 
                         spid == "LUCINH2" & 
                         conc %in% mmi_high_concs, 
                       rval])
      pval_med <- median(dat_L0[acid==assay & 
                                  apid == plate & 
                                  spid == "LUCINH2" & 
                                  conc %in% mmi_high_concs, 
                                rval])
      pmad <- mad(dat_L0[acid==assay & 
                           apid == plate & 
                           spid == "LUCINH2" & 
                           conc %in% mmi_high_concs, 
                         rval])
    }
    
    #Z' factor    
    bval <- mean(dat_L0[acid == assay & 
                          apid == plate & 
                          wllt == "n", 
                        rval])
    bsd <- sd(dat_L0[acid == assay & 
                       apid == plate & 
                       wllt == "n", 
                     rval])
    z_prime_factor <- (1 - (3*psd + 3*bsd)/abs(pval - bval))

    #Robust Z' factor
    bval_med <- median(dat_L0[acid == assay & 
                                apid == plate & 
                                wllt == "n", 
                              rval])
    bmad <- mad(dat_L0[acid == assay & 
                         apid == plate & 
                         wllt == "n", 
                       rval])
    z_prime_factor_robust <- (1 - (3*pmad + 3*bmad)/abs(pval_med - bval_med))

    stats <- data.table(acid = assay, 
                        apid = plate, 
                        CVar = CV, 
                        zpf = z_prime_factor, 
                        CVrobust = CV_robust, 
                        zpf_robust = z_prime_factor_robust)
    sum_stats[[i]] <- stats
    i <- i + 1
  }
}
dat_sum_stats <- rbindlist(sum_stats)
summary_table <- dat_sum_stats[,.(Mean_CVar = mean(CVar, na.rm=T),
                                  Mean_zpf = mean(zpf, na.rm=T),
                                  Mean_CVrobust = mean(CVrobust, na.rm=T),
                                  Mean_zpf_robust = mean(zpf_robust, na.rm=T),
                                  SD_CVrobust = sd(CVrobust, na.rm=T),
                                  SD_zpf_robust = sd(zpf_robust, na.rm=T)
                                  ),
                               by = acid]

Single concentration

The main use of the single concentration screen was to find likely hits at high concentration to cut down on the number of chemicals that would need to be tested in dose response.

datL0sc_prep <- tcplPrepOtpt(tcplLoadData(lvl = 0L, "acid", 963, type="sc"))
datL4mc_prep <- tcplPrepOtpt(tcplLoadData(lvl = 4L, "aeid", 1508, type="mc"))

# Database column 24 for phase I should be blank, not DMSO
datL0sc_prep[srcf == "KP032 AURTPO ToxCast Phase I Single Pt Screen Compiled N.csv" &
               coli == 24,
             wllt := 'f']

plates <- unique(datL0sc_prep[,apid])
for (plate in plates){
  mmiconcs <- sort(datL0sc_prep[apid == plate & spid == "MMI",conc])
  mmi_high_concs <- tail(mmiconcs,2)
  pval <- mean(datL0sc_prep[apid == plate & spid == "MMI" & conc %in% mmi_high_concs, rval])
  psd <- sd(datL0sc_prep[apid == plate & spid == "MMI" & conc %in% mmi_high_concs, rval])
  bval <- mean(datL0sc_prep[apid == plate & wllt == "n", rval])
  bsd <- sd(datL0sc_prep[apid == plate & wllt == "n", rval])
  z_prime_factor <- (1 - (3*psd + 3*bsd)/abs(pval - bval))
  
  datL0sc_prep[apid == plate, resp := (rval - bval)/(0 - bval)*100]
  datL0sc_prep[apid == plate, plate_z_prime_factor := z_prime_factor]
  
  datL0sc_prep[apid == plate, plate_psd := psd]
  datL0sc_prep[apid == plate, plate_bsd := bsd]
  
}


mean_zpf <- mean(unique(datL0sc_prep[, plate_z_prime_factor]))
sd_zpf <- sd(unique(datL0sc_prep[, plate_z_prime_factor]))

mmiconcs <- sort(unique(datL0sc_prep[spid == "MMI",conc]))
mmi_high_concs <- tail(mmiconcs,2)

psd <- sd(datL0sc_prep[spid == "MMI" & conc %in% mmi_high_concs, resp])
bsd <- sd(datL0sc_prep[wllt == "n", resp])
pval <- mean(datL0sc_prep[spid == "MMI" & conc %in% mmi_high_concs, resp])
bval <- mean(datL0sc_prep[wllt == "n", resp])
z_prime_factor <- (1 - (3*psd + 3*bsd)/abs(pval - bval))
#Generate column of random numbers, for random sorting
num_row <- dim(datL0sc_prep)[1]
datL0sc_prep[, sortval := runif(n=num_row)]

spids <- unique(datL0sc_prep[, spid])
for (chem in spids){
  datL0sc_prep[spid == chem, resp_mean := mean(datL0sc_prep[spid == chem, resp])]
}

dat_sc_mean <- data.table(spid = unique(datL0sc_prep[,spid]))
for (chem in dat_sc_mean[,spid]){
  dat_sc_mean[spid == chem, casn := unique(datL0sc_prep[spid == chem, casn])]
  dat_sc_mean[spid == chem, chnm := unique(datL0sc_prep[spid == chem, chnm])]
  dat_sc_mean[spid == chem, resp_mean := unique(datL0sc_prep[spid == chem, resp_mean])]
}
dat_sc_mean[resp_mean > 20, activity := "Active"]
dat_sc_mean[resp_mean < 20, activity := "Inactive"]

dat_sc_mean_plot <- data.table(spid = unique(datL0sc_prep[wllt=='t',spid]))
for (chem in dat_sc_mean_plot[, spid]){
  dat_sc_mean_plot[spid == chem,
                   resp_mean := unique(datL0sc_prep[spid == chem, resp_mean])]
  dat_sc_mean_plot[spid == chem,
                   sortval := unique(datL0sc_prep[spid == chem, sortval])]
}

single_conc_plot_mean <- ggplot(dat_sc_mean_plot, 
                                aes(x=reorder(spid, 
                                              sortval),
                                                            
                                    y=resp_mean))+
  geom_point(size=2) +
  theme_bw() +
  scale_x_discrete(breaks=NULL) +
  xlab("Sample ID") +
  ylab("Percent Activity") +
  scale_y_continuous(breaks = seq(from = -40, 
                                  to = 100, 
                                  by = 20),
                     limits = c(-55, 105), 
                     expand=c(0,0)) +
  geom_hline(yintercept = 20, color = "red") +
  geom_hline(yintercept = 3*bsd, color = "blue") +
  geom_hline(yintercept = -3*bsd, color = "blue") +
  ggtitle(NULL)

single_conc_plot_mean_hist <- ggplot(dat_sc_mean_plot, 
                                     aes(x=resp_mean))+
  geom_histogram(binwidth=5) +
  coord_cartesian(xlim = c(10, 20)) +
  theme_bw() +
  scale_y_discrete(breaks=NULL) +
  scale_x_continuous(breaks = seq(from = -40, to = 100, by = 20),
                     limits = c(-55, 105), 
                     expand=c(0,0)) +
  xlab(NULL) +
  ylab("Count") +
  coord_flip() +
  geom_vline(xintercept = 20, color = "red") +
  geom_vline(xintercept = 3*bsd, color = "blue") +
  geom_vline(xintercept = -3*bsd, color = "blue") +
  ggtitle(NULL)



layout <- matrix(c(1,1,1,2), nrow=1, byrow=TRUE)
singleplotlist <- list(single_conc_plot_mean, single_conc_plot_mean_hist)

multiplot(plotlist = singleplotlist, layout = layout)

#Get list of spids where more than one spid per cas
rep_casn <- c()
non_na_cas <- unique(datL0sc_prep[!(is.na(casn)),casn])
for(cas in non_na_cas){
  spid_num <- length(unique(datL0sc_prep[casn == cas, spid]))
  if(spid_num > 1){
    rep_casn <- c(rep_casn, cas)
  }
}

#Figure
dat <- datL0sc_prep[casn %in% rep_casn]
spids <- unique(dat[,spid])
for (chem in spids){
  dat[spid == chem,resp_mean := mean(dat[spid == chem, resp])]
  dat[spid == chem,resp_sd := sd(dat[spid == chem, resp])]
}

spids <- unique(dat[,spid])

dat_spid <- data.table(spid = spids)

for (chem in spids){
  dat_spid[spid == chem,casn := unique(dat[spid == chem, casn])]
  dat_spid[spid == chem,resp_mean := unique(dat[spid == chem,resp_mean])]
  dat_spid[spid == chem,resp_sd := unique(dat[spid == chem,resp_sd])]
}

for (chem in dat_spid[,casn]){
  reps <- as.numeric(length(dat_spid[casn == chem,spid]))
  dat_spid[casn == chem,chem_rep := reps/10] 
}

num_row <- dim(dat_spid)[1]
dat_spid[,sortval := runif(n=num_row)]

pos <- position_dodge(width=0.25)
single_conc_rep_mean_sd <- ggplot(dat_spid, aes(x=casn,
                                                y=resp_mean,
                                                ymin = resp_mean - resp_sd,
                                                ymax = resp_mean + resp_sd,
                                                group = spid))+
  geom_point(size=2, position=pos, stat="identity") +
  geom_errorbar(position=pos, aes(width=chem_rep*2.5), show_guide=FALSE) +
  theme_bw() +
  theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5)) +
  scale_y_continuous(breaks=seq(from = -20, to = 100, by = 20)) +
  xlab("CAS") +
  ylab("Percent Activity") +
  coord_cartesian(ylim=c(-25,100)) +
  geom_hline(yintercept = 20, color = "red")

print(single_conc_rep_mean_sd)

Multi concentration

Fetch data for all MC analysis

aeids <- c(1508L, ctg_aeid, qli_aeid, gua_aeid)
dat_L5 <- tcplPrepOtpt(tcplLoadData(lvl = 5L,
                                    fld = "aeid",
                                    val = aeids,
                                    type = "mc"))
#Set chnm and code to spid if NA
dat_L5[is.na(chnm), chnm := dat_L5[is.na(chnm), spid]]
dat_L5[is.na(code), code := dat_L5[is.na(code), spid]]
dat_L4 <- tcplLoadData(lvl=4L, fld = "aeid", val = aeids, type = "mc")
dat_L3 <- tcplLoadData(lvl=3L, fld = "aeid", val = aeids, type = "mc")

AUR TPO AUC calculation and images

# Calculate AUC for each sample id ----------

dat_L5[hitc == 1L & modl == "hill", 
       AUCval := mapply(function(lower, 
                                 upper, 
                                 hill_tp, 
                                 hill_ga, 
                                 hill_gw) integrate(hill_curve, 
                                                    lower, 
                                                    upper, 
                                                    hill_tp=hill_tp, 
                                                    hill_ga=hill_ga, 
                                                    hill_gw=hill_gw)$value,
                                  lower = dat_L5[hitc == 1L & modl == "hill", logc_min], 
                                  upper = dat_L5[hitc == 1L & modl == "hill", logc_max], 
                                  hill_tp = dat_L5[hitc == 1L & modl == "hill", hill_tp], 
                                  hill_ga = dat_L5[hitc == 1L & modl == "hill", hill_ga], 
                                  hill_gw = dat_L5[hitc == 1L & modl == "hill", hill_gw])]

dat_L5[hitc == 1L & modl == "gnls", 
       AUCval := mapply(function(lower, 
                                 upper, 
                                 top, 
                                 ga, 
                                 gw, 
                                 la, 
                                 lw) integrate(gnls_curve, 
                                               lower, 
                                               upper, 
                                               top=top, 
                                               ga=ga, 
                                               gw=gw, 
                                               la=la, 
                                               lw=lw)$value,
                                  lower = dat_L5[hitc == 1L & modl == "gnls", logc_min], 
                                  upper = dat_L5[hitc == 1L & modl == "gnls", logc_max], 
                                  top = dat_L5[hitc == 1L & modl == "gnls", gnls_tp], 
                                  ga = dat_L5[hitc == 1L & modl == "gnls", gnls_ga], 
                                  gw = dat_L5[hitc == 1L & modl == "gnls", gnls_gw],
                                  la = dat_L5[hitc == 1L & modl == "gnls", gnls_la], 
                                  lw = dat_L5[hitc == 1L & modl == "gnls", gnls_lw])]
dat_L5[is.na(AUCval), AUCval:=0]

# Get the median AUC for each CAS code ----------

dat_L5[, AUC_med := median(AUCval),
       by = .(code, aeid)]

We can now plot individual curves with the AUC value included and the AUC area shaded to highlight what is being integrated. For publication, we can combine these plots with a fourth plot that includes all of the AUC values.

chem_1 <- "TP0001207H21"
chem_2 <- "TP0001441G03"
chem_3 <- "TP0001206D06" 

curve_1 <- ggcurve(chem_1, dat_L5, dat_L4, dat_L3, aeid)
curve_2 <- ggcurve(chem_2, dat_L5, dat_L4, dat_L3, aeid)
curve_3 <- ggcurve(chem_3, dat_L5, dat_L4, dat_L3, aeid)

# Find x positions of plotted curves ----------

dat_L5_unique <- unique(dat_L5[aeid == 1508L & !(is.na(casn)), .(casn, AUC_med)])
setkey(dat_L5_unique, AUC_med)
AUC_med_vect <- dat_L5_unique[,AUC_med]
auc_val_1 <- dat_L5[aeid == 1508L & spid ==chem_1, AUC_med]
auc_val_2 <- dat_L5[aeid == 1508L & spid ==chem_2, AUC_med]
auc_val_3 <- dat_L5[aeid == 1508L & spid ==chem_3, AUC_med]
auc_pos_1 <- which(AUC_med_vect %in% auc_val_1)
auc_pos_2 <- which(AUC_med_vect %in% auc_val_2)
auc_pos_3 <- which(AUC_med_vect %in% auc_val_3)
num_casn <- length(unique(dat_L5[aeid == 1508L, casn]))
auc_end_1 <- floor(num_casn/6L)
auc_end_2 <- floor(num_casn/2L)
auc_end_3 <- floor(num_casn*5.5/6L)
AUC_plot <- ggplot(dat_L5_unique, 
                   aes(x=reorder(casn, AUC_med),
                       y=AUC_med))+
  geom_bar(stat="identity", width=1) +
  theme_bw() +
  geom_segment(aes(x = auc_pos_1, 
                   y = auc_val_1+1, 
                   xend = auc_end_1, 
                   yend = 260), 
               arrow = arrow(type = "closed", angle = 30)) +
  geom_segment(aes(x = auc_pos_2, 
                   y = auc_val_2+1, 
                   xend = auc_end_2, 
                   yend = 260), 
               arrow = arrow(type = "closed", angle = 30)) +
  geom_segment(aes(x = auc_pos_3, 
                   y = auc_val_3+1, 
                   xend = auc_end_3, 
                   yend = 260), 
               arrow = arrow(type = "closed", angle = 30)) +
  scale_x_discrete(breaks=NULL) +
  xlab("Chemical") +
  ylab("AUR-TPO Activity (AUC)") 

AUC_plot_list <- list(curve_1, curve_2, curve_3, AUC_plot)  

layout <- matrix(c(1,2,3,
                   4,4,4,
                   4,4,4,
                   4,4,4), 
                 nrow=4, byrow=TRUE)

AUR and GUA TPO comparison

# Read GUA hit calls from Steve's literature search ----------

dat_gua_file <- data.table(read.csv("Data/Fig6_data.csv", stringsAsFactors = F))
dat_gua_file <- dat_gua_file[!(is.na(GUA_hitc))] #Removes extra blank rows at bottom
setnames(dat_gua_file, 
         c("Chemical.name",
           "CASRN",
           "GUA_hitc"), 
         c("chnm",
           "casn",
           "gua_hitc")
         )
gua_lit_casrns <- unique(dat_gua_file[,casn])

# Merge gua_hitc value into dat_L5 subset for only AUR TPO ----------

dat_L5_auc_gua <- dat_L5[aeid == 1508L & casn %in% gua_lit_casrns]
dat_L5_auc_gua <- merge(dat_L5_auc_gua, 
                        dat_gua_file[,.(casn, gua_hitc)], 
                        by = "casn")

dat_auc_gua_plot <- unique(dat_L5_auc_gua[, .(casn, AUC_med, gua_hitc)])

# Add arrow values if gua_hitc is 1 but AUR AUC is 0 ----------

for(cas in unique(dat_auc_gua_plot[,casn])){
  if(dat_auc_gua_plot[casn == cas, gua_hitc] == 1 & 
       dat_auc_gua_plot[casn == cas, AUC_med] == 0){
    dat_auc_gua_plot[casn == cas, cas_label := cas]
    dat_auc_gua_plot[casn == cas, arrow_bottom := 5]
    dat_auc_gua_plot[casn == cas, arrow_top := 50]
  }else{
    dat_auc_gua_plot[casn == cas, cas_label := NA_character_]
    dat_auc_gua_plot[casn == cas, arrow_bottom := NA_integer_]
    dat_auc_gua_plot[casn == cas, arrow_top := NA_integer_]
  }
}

dat_auc_gua_plot[,gua_hitc := as.factor(gua_hitc)]

# Plot AUC vs GUA hitc ----------

AUC_plot_GUA <- ggplot(dat_auc_gua_plot, aes(x=reorder(casn, AUC_med),
                                             y=AUC_med,
                                             label = cas_label,
                                             color = gua_hitc,
                                             fill = gua_hitc))+
  geom_bar(stat="identity", width=1) +
  theme_bw() +
  geom_segment(aes(x = reorder(casn, AUC_med), 
                   y = arrow_top, 
                   xend = reorder(casn, AUC_med), 
                   yend = arrow_bottom), 
               arrow = arrow(type = "closed", angle = 10)) +#
  geom_text(angle = 90, 
            hjust=0, 
            y = 51, 
            size = 3, 
            color = "black") +
  theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5)) +
  theme(panel.grid.major.x = element_blank()) +
  theme(legend.title = element_text(size=22)) +
  theme(legend.text = element_text(size=18)) +
  scale_colour_manual(values=c("gray", "blue4"),
                      name = "GUA Activity",
                      breaks=c(0,1),
                      labels=c("Inactive", "Active")) +
  scale_fill_manual(values=c("gray", "blue4"),
                    name = "GUA Activity",
                    breaks=c(0,1),
                    labels=c("Inactive", "Active")) +
  theme(legend.justification=c(0,1), legend.position=c(0,1)) +
  xlab("CAS") +
  ylab("AUR-TPO Activity (AUC)")

# Compare modl_acc AUR vs GUA ----------

aur_cas <- unique(dat_L5[aeid == 1508L & !(is.na(casn)),casn])
gua_cas <- unique(dat_L5[aeid == gua_aeid & !(is.na(casn)),casn])
common_cas <- intersect(gua_cas, aur_cas)

dat_gua_dot_plot <- data.table(casn = common_cas)

for (cas in common_cas){
  aur_acc_val <- median(dat_L5[aeid == 1508L & 
                                 casn == cas & 
                                 hitc == 1L, 
                               modl_acc], 
                        na.rm = T)
  if(is.na(aur_acc_val)){ 
    aur_acc_val <- 5
  }
  gua_acc_val <- median(dat_L5[aeid == gua_aeid & 
                                 casn == cas & 
                                 hitc == 1L, 
                               modl_acc], 
                        na.rm = T)
  if(is.na(gua_acc_val)){  
    gua_acc_val <- 5
  }
  dat_gua_dot_plot[casn == cas, aur_acc := aur_acc_val]
  dat_gua_dot_plot[casn == cas, gua_acc := gua_acc_val]
}

AUR_GUA_dot_plot <- ggplot(dat_gua_dot_plot, aes(x=aur_acc,
                                                 y=gua_acc))+
  geom_point(size=3) +
  theme_bw() +
  theme(axis.text.x = element_text(angle=90, 
                                   hjust=1, 
                                   vjust=0.5)) +
  theme(legend.title = element_text(size=22)) +
  theme(legend.text = element_text(size=18)) +
  expand_limits(x = c(-2, 5)) +
  expand_limits(y = c(-2, 5)) +
  scale_x_continuous(breaks=c(-2, 0, 2, 4, 5), 
                     labels=c("-2", "0", "2", "4", "Inactive")) +
  scale_y_continuous(breaks=c(-2, 0, 2, 4, 5), 
                     labels=c("-2", "0", "2", "4", "Inactive")) +
  coord_equal(ratio=1) +
  geom_abline(slope=1, intercept=0) +
  xlab("AUR-TPO Threshold Potency (log uM)") +
  ylab("GUA-TPO Threshold Potency (log uM)") 

# Build inset with hitc table ----------

gua_plus_aur_plus <- length(dat_L5_auc_gua[hitc == 1 & gua_hitc == 1,casn])
gua_plus_aur_neg  <- length(dat_L5_auc_gua[hitc == 0 & gua_hitc == 1,casn])
gua_neg_aur_plus  <- length(dat_L5_auc_gua[hitc == 1 & gua_hitc == 0,casn])
gua_neg_aur_neg   <- length(dat_L5_auc_gua[hitc == 0 & gua_hitc == 0,casn])

balance_table_rows <- c("GUA Active", "GUA Inactive")
balance_table <- data.frame(row.names = balance_table_rows, 
                                AUR_Active = c(gua_plus_aur_plus, gua_neg_aur_plus),
                                AUR_Inactive = c(gua_plus_aur_neg, gua_neg_aur_neg))

val_table_theme <- ttheme_minimal(core=list(bg_params = list(fill = NA, col=NA),
                                            fg_params=list(cex = 1.5)), 
                                  colhead=list(fg_params=list(cex = 1.5)),
                                  rowhead=list(fg_params=list(fontface=2L, 
                                                              cex = 1.5)))

balance_table_grob <- tableGrob(balance_table, 
                                theme = val_table_theme, 
                                cols = c("AUR\nActive", 
                                         "AUR\nInactive"))

ten_by_ten <- data.table(x=1:10, y=1:10)
inset_table_plot <- ggplotGrob(ggplot(ten_by_ten, aes(x = x, y = y)) + 
                                 geom_blank() + 
                                 theme_bw() +
                                 theme(axis.line=element_blank(),
                                       axis.text.x=element_blank(),
                                       axis.text.y=element_blank(),
                                       axis.ticks=element_blank(),
                                       axis.title.x=element_blank(),
                                       axis.title.y=element_blank(),
                                       legend.position="none",
                                       panel.background=element_blank(),
                                       panel.border=element_blank(),
                                       panel.grid.major=element_blank(),
                                       panel.grid.minor=element_blank(),
                                       plot.background=element_blank(),
                                       plot.title = element_text(size = 5)) +
                                 theme(plot.background = element_rect(colour = "black")) +
                                 annotation_custom(grob = balance_table_grob))

# Add inset table to AUC subplot ----------

AUC_plot_GUA_inset <- ggplot(dat_auc_gua_plot, 
                             aes(x=reorder(casn, AUC_med),
                                 y=AUC_med,
                                 label = cas_label,
                                 color = gua_hitc,
                                 fill = gua_hitc))+
  geom_bar(stat="identity", width=1) +
  theme_bw() +
  geom_segment(aes(x = reorder(casn, AUC_med), 
                   y = arrow_top, 
                   xend = reorder(casn, AUC_med), 
                   yend = arrow_bottom), 
               arrow = arrow(type = "closed", 
                             angle = 10)) +
  geom_text(angle = 90, 
            hjust=0, 
            y = 51, 
            size = 3, 
            color = "black") +
  theme(axis.text.x = element_text(angle=90, 
                                   hjust=1, 
                                   vjust=0.5)) +
  theme(panel.grid.major.x = element_blank()) +
  theme(legend.title = element_text(size=22)) +
  theme(legend.text = element_text(size=18)) +
  scale_colour_manual(values=c("gray", "blue4"),
                      name = "GUA Activity",
                      breaks=c(0,1),
                      labels=c("Inactive", "Active")) +
  scale_fill_manual(values=c("gray", "blue4"),
                    name = "GUA Activity",
                    breaks=c(0,1),
                    labels=c("Inactive", "Active")) +
  theme(legend.justification=c(1,1), legend.position=c(0.9,0.97)) +
  xlab("CAS") +
  ylab("AUR-TPO Activity (AUC)") +
  annotate(geom="text", 
           x = 5, 
           y = 250, 
           label = "A", 
           color = "black", 
           hjust = 0.5, 
           vjust = 1, 
           size = 12) +
  annotation_custom(grob = inset_table_plot, 
                    xmin = 30, 
                    xmax = 65, 
                    ymin = 175, 
                    ymax = 250)

# Add label to modl_acc subplot ----------

AUR_GUA_dot_plot_multi <- AUR_GUA_dot_plot + 
  annotate(geom="text", 
           x = -2, 
           y = 5, 
           label = "B", 
           color = "black", 
           hjust = 0, 
           vjust = 1, 
           size = 12) 
# Multiplot print ----------

layout <- matrix(c(1,1,2), nrow=1, byrow=TRUE)
singleplotlist <- list(AUC_plot_GUA_inset, AUR_GUA_dot_plot_multi)
multiplot(plotlist = singleplotlist, layout = layout)

AUR TPO selectivity

Selectivity is one method to interpret the potency of the AUR TPO assay in the context of other assays. As a loss of signal assay, one concern is that non-specific enzyme interactions, rather than direct TPO inhibition, could cause a drop in activity. This is explored in the code and images below.

# Pull in cytotox individual data ----------

file.loc <- './Data/Assay_Summary_modified_141031.csv'
dat_cytotox_indiv <- data.table(read.table(file = file.loc, 
                                           header = TRUE, 
                                           sep=',', 
                                           na.strings = c("NA"), 
                                           stringsAsFactors = FALSE))

all_assays <- tcplLoadAeid()

#The database had a change where Tox21_* assays are now names TOX21_*. 
#The above cytotoxicity file uses the old names.
#First use regex to convert Tox21 to TOX21, then continue.
cytoassay <- c("proliferation decrease", 
               "cytotoxicity SRB", 
               "cytotoxicity BLA")
cyto_aenms <- dat_cytotox_indiv[biological_process %in% cytoassay, Assay]
cyto_aenms <- gsub("Tox21_", "TOX21_", cyto_aenms)
cytoaeids <- all_assays[aenm %in% cyto_aenms, aeid]
cyto_dat_L5 <- tcplPrepOtpt(tcplLoadData(lvl = 5L, fld = "aeid", cytoaeids))
cyto_dat_L5 <- cyto_dat_L5[hitc == 1]

tpocodes <- unique(dat_L5[aeid == 1508L,code])
cyto_dat_L5 <- cyto_dat_L5[code %in% tpocodes]

cyto_dat_L5[, aenm := "cytotox"]

dat_L5_with_cyto <- rbind(dat_L5[!(aeid %in% 1824L)], 
                          cyto_dat_L5, 
                          use.names=TRUE, 
                          fill=TRUE)


# Make Assay a factor to ensure stable order ----------

dat_L5_with_cyto[, aenm := factor(aenm, 
                                  levels = c("NCCT_TPO_AUR_dn",
                                             "NCCT_QuantiLum_inhib_2_dn",
                                             "NCCT_HEK293T_CellTiterGLO",
                                             "cytotox"))]


# Calculate specificity ----------

num_cytotox <- length(unique(dat_L5_with_cyto[aenm == "cytotox", aeid]))
unique_chem <- unique(dat_L5_with_cyto[,chnm])
# And specificity column for ac20 
for (chemical in unique_chem){
  cyto_modl_acc <- dat_L5_with_cyto[hitc == 1L & 
                                      chnm == chemical & 
                                      aenm == "cytotox", 
                                    modl_acc]
  num_cyto_hits <- length(cyto_modl_acc)
  cyto_modl_acc_padded <- rep(cyto_modl_acc, length.out=num_cytotox)
  cyto_modl_acc_padded[(num_cyto_hits + 1):num_cytotox] <- 5
  
  vect_modl_acc <- dat_L5_with_cyto[hitc == 1L & 
                                      chnm == chemical & 
                                      !(aenm %in% c("NCCT_TPO_AUR_dn", 
                                                    "cytotox")), 
                                    modl_acc]
  vect_modl_acc <- c(vect_modl_acc, median(cyto_modl_acc_padded))
  
  
  tpo_modl_acc <- dat_L5_with_cyto[hitc == 1L & 
                                     chnm == chemical & 
                                     aenm == "NCCT_TPO_AUR_dn", 
                                   modl_acc]
  min_tpo_modl_acc <- 5
  if (length(tpo_modl_acc[!is.na(tpo_modl_acc)]) > 0){
    min_tpo_modl_acc <- min(tpo_modl_acc, na.rm=T)
  }
  
  modl_acc_min <- min(vect_modl_acc, na.rm=T)
  specificity_by_modl_acc <- modl_acc_min - min_tpo_modl_acc
  dat_L5_with_cyto[chnm == chemical, specificity_modl_acc := specificity_by_modl_acc]
}

The issue of selectivity is explored by comparing the potency in AUR TPO to other assays. The parameter used for potency is the concentration at which the winning model has a response equal to the cutoff for that assay, modl_acc. We use the equation \[ selectivity = min(CTG_{ACC},QLI_{ACC},cyto_med) - min(AUR_{ACC},5) \] Here, CTGACC, QLIACC, AURACC are the modl_acc values for the Cell Titer Glo, Quantilum Luciferase, and AUR TPO assays, respectively.

The value of cyto_med is determined by grouping the modl_acc values for the 36 cytotoxicity assays in ToxCast. If the winning model for a cytotoxicity assay has hitc = 0, the value of modl_acc is set to 5. The value of cyto_med is then set to the median of modl_acc values for these 36 assays

# Make Plots ----------

# Setup colors, better than default of red, blue, and green ----------

# The palette with grey:
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", 
               "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# The palette with black:
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", 
                "#F0E442", "#0072B2", "#D55E00", "#CC79A7")


# Assay, acc, specificity to qli and ctg, individual cyto ----------

tpo_hits <- unique(dat_L5_with_cyto[aenm == "NCCT_TPO_AUR_dn" & hitc==1L, chnm])
dat_L5_with_cyto_hits <- dat_L5_with_cyto[chnm %in% tpo_hits & hitc == 1L]
full_specificity_plot <- ggplot(dat_L5_with_cyto_hits, 
                                aes(x=modl_acc, 
                                    y=reorder(chnm, specificity_modl_acc), 
                                    group = aenm, 
                                    color = aenm, 
                                    shape = aenm)) + 
  geom_point() +
  ylab("Chemical") +
  xlab("modl_acc") +
  theme_bw() +
  theme(legend.position="top", legend.title=element_blank()) +
  scale_x_continuous(breaks=seq(-6,5,1)) +
  scale_colour_manual(values=cbbPalette) +
  scale_shape_manual(values=c(19,17,15,8)) +
  guides(colour = guide_legend(nrow = 3)) 


file.dir <- paste("documents/images/", Sys.Date(), "/", sep="")
file.name <- paste("/full_specificity_plot_", Sys.Date(), ".pdf", sep="")
file.path <- paste(file.dir, file.name, sep="")
dir.create(path=file.dir, showWarnings = FALSE, recursive = TRUE)
pdf(file.path, width=12, height=70)
print(full_specificity_plot)
dev.off()

# Plots for actual manuscript image ----------

specificity_sort <- sort(unique(dat_L5_with_cyto_hits[,specificity_modl_acc]))
specificity_1_cutoff <- which(specificity_sort>1)[1]
specificity_0_cutoff <- which(specificity_sort>0)[1]

manu_1_arrow_x <- 4.5

size_box <- 20
top_first_box <- length(tpo_hits)
bottom_first_box <- top_first_box - size_box
top_second_box <- specificity_1_cutoff
bottom_second_box <- top_second_box - size_box
top_third_box <- specificity_0_cutoff
bottom_third_box <- top_third_box - size_box

# Image 1 ----------  

manuscript_1 <- ggplot(dat_L5_with_cyto_hits, 
                       aes(x=modl_acc, 
                           y=reorder(chnm, specificity_modl_acc), 
                           group = aenm, 
                           color = aenm, 
                           shape = aenm)) + 
  geom_point() +
  ylab("Chemical") +
  xlab("Concentration at\nThreshold\n(log uM)") +
  annotate("rect", 
           xmin = -6, 
           xmax = 5, 
           ymin = bottom_first_box, 
           ymax = top_first_box, 
           fill = "red", 
           alpha = 0.1, 
           size = 1, 
           color = "red") +
  annotate("rect", 
           xmin = -6, 
           xmax = 5, 
           ymin = bottom_second_box,  
           ymax = top_second_box, 
           fill = "red", 
           alpha = 0.1, 
           size = 1, 
           color = "red") +
  annotate("rect", 
           xmin = -6, 
           xmax = 5, 
           ymin = bottom_third_box,   
           ymax = top_third_box,  
           fill = "red", 
           alpha = 0.1, 
           size = 1, 
           color = "red") +
  geom_segment(aes(x = manu_1_arrow_x, 
                   y = top_first_box, 
                   xend = manu_1_arrow_x, 
                   yend = specificity_1_cutoff),
               color = "black",
               arrow = arrow(type = "closed", 
                             angle = 20, 
                             ends="both")) +
  geom_segment(aes(x = manu_1_arrow_x, 
                   y = specificity_1_cutoff, 
                   xend = manu_1_arrow_x, 
                   yend = specificity_0_cutoff),
               color = "black",
               arrow = arrow(type = "closed", 
                             angle = 20, 
                             ends="both")) +
  geom_segment(aes(x = manu_1_arrow_x, 
                   y = specificity_0_cutoff, 
                   xend = manu_1_arrow_x, 
                   yend = 0),
               color = "black",
               arrow = arrow(type = "closed", 
                             angle = 20, 
                             ends="both")) +
  annotate(geom="text", 
           x = 3.25, 
           y = mean(c(top_first_box, top_second_box)), 
           label = "High Selectivity", 
           color = "black",
           size = 7, 
           fontface="bold", 
           angle=90, 
           vjust=0.5) +
  annotate(geom="text", 
           x = 3.25, 
           y = mean(c(top_third_box, top_second_box)), 
           label = "Low Selectivity", 
           color = "black",
           size = 7, 
           fontface="bold", 
           angle=90, 
           vjust=0.5) +
  annotate(geom="text", 
           x = 3.25, 
           y = mean(c(0, top_third_box)), 
           label = "No Selectivity", 
           color = "black",
           size = 7, 
           fontface="bold", 
           angle=90, 
           vjust=0.5) +
  geom_hline(yintercept = specificity_1_cutoff, size = 1) +
  geom_hline(yintercept = specificity_0_cutoff, size = 1) +
  theme_bw() +
  theme(legend.position="none") +
  theme(axis.text.y = element_blank(), 
        axis.ticks.y = element_blank()) +
  scale_x_continuous(breaks=seq(-6,5,1)) +
  coord_cartesian(xlim = c(-6, 5)) +
  scale_colour_manual(values=cbbPalette,
                      labels=c("AUR TPO", "QLI", "CTG", "Cytotox")) +
  scale_shape_manual(values=c(19,17,15,8),
                     labels=c("AUR TPO", "QLI", "CTG", "Cytotox"))  


# Image 2a ----------   

manuscript_2a <- ggplot(dat_L5_with_cyto_hits, 
                        aes(x=modl_acc, 
                            y=reorder(strtrim(chnm, 30), specificity_modl_acc), 
                            group = aenm, 
                            color = aenm, 
                            shape = aenm)) + 
  coord_cartesian(xlim = c(-6, 5),
                  ylim = c(bottom_first_box:top_first_box)) +
  geom_point() +
  xlab("Concentration at Threshold (log uM)") +
  theme_bw() +
  theme(legend.position="none") +
  theme(axis.title.y = element_blank()) +
  theme(axis.text.y = element_text(family = "mono", 
                                   face = "bold",
                                   size=c(rep(0,bottom_first_box-1), 
                                          rep(12,size_box+1))),
        axis.ticks.y = element_line(size = c(rep(0,bottom_first_box), 
                                             rep(1,size_box+1)))) +
  scale_x_continuous(breaks=seq(-6,5,1)) +
  scale_colour_manual(values=cbbPalette,
                      labels=c("AUR TPO", "QLI", "CTG", "Cytotox")) +
  scale_shape_manual(values=c(19,17,15,8),
                     labels=c("AUR TPO", "QLI", "CTG", "Cytotox")) +
  annotate(geom="text", 
           x = -5, 
           y = top_first_box-1, 
           label = "A", 
           color = "black", 
           hjust = 0, 
           vjust = 1, 
           size = 10)

# Image 2b ----------  

manuscript_2b <- ggplot(dat_L5_with_cyto_hits, 
                        aes(x=modl_acc, 
                            y=reorder(strtrim(chnm, 30), specificity_modl_acc), 
                            group = aenm, 
                            color = aenm, 
                            shape = aenm)) + 
  geom_point() +
  xlab("Concentration at Threshold (log uM)") +
  theme_bw() +
  theme(legend.position="none") +
  theme(axis.title.y = element_blank()) +
  theme(axis.text.y = element_text(family = "mono", 
                                   face = "bold",
                                   size=c(rep(0,bottom_second_box-1), 
                                          rep(12,size_box+1), 
                                          rep(0,top_first_box-top_second_box))),
        axis.ticks.y = element_line(size = c(rep(0,bottom_second_box-1), 
                                             rep(1,size_box+1), 
                                             rep(0,top_first_box-top_second_box)))) +
  scale_x_continuous(breaks=seq(-6,5,1)) +
  coord_cartesian(xlim = c(-6, 5),
                  ylim = c(bottom_second_box:top_second_box)) +
  scale_colour_manual(values=cbbPalette,
                      labels=c("AUR TPO", "QLI", "CTG", "Cytotox")) +
  scale_shape_manual(values=c(19,17,15,8),
                     labels=c("AUR TPO", "QLI", "CTG", "Cytotox")) +
  annotate(geom="text", 
           x = -5, 
           y = top_second_box-1, 
           label = "B", 
           color = "black", 
           hjust = 0, 
           vjust = 1, 
           size = 10)

# Image 2c ----------  

manuscript_2c <- ggplot(dat_L5_with_cyto_hits, 
                        aes(x=modl_acc, 
                            y=reorder(strtrim(chnm, 30), specificity_modl_acc), 
                            group = aenm, 
                            color = aenm, 
                            shape = aenm)) + 
  geom_point() +
  xlab("Concentration at Threshold (log uM)") +
  theme_bw() +
  theme(legend.position="none") +
  theme(axis.title.y = element_blank()) +
  theme(axis.text.y = element_text(family = "mono", 
                                   face = "bold",
                                   size=c(rep(0,bottom_third_box-1), 
                                          rep(12,size_box+1),
                                          rep(0,top_first_box-top_third_box))),
        axis.ticks.y = element_line(size = c(rep(0,bottom_third_box-1), 
                                             rep(1,size_box+1),
                                             rep(0,top_first_box-top_third_box)))) +
  scale_x_continuous(breaks=seq(-6,5,1)) +
  coord_cartesian(xlim = c(-6, 5),
                  ylim = c(bottom_third_box:top_third_box)) +
  scale_colour_manual(values=cbbPalette,
                      labels=c("AUR TPO", "QLI", "CTG", "Cytotox")) +
  scale_shape_manual(values=c(19,17,15,8),
                     labels=c("AUR TPO", "QLI", "CTG", "Cytotox")) +
  annotate(geom="text", 
           x = -5, 
           y = top_third_box-1, 
           label = "C", 
           color = "black", 
           hjust = 0, 
           vjust = 1, 
           size = 10)


layout <- matrix(c(1,2,2,2,
                   1,3,3,3,
                   1,4,4,4), nrow=3, byrow=TRUE)




file.dir <- paste("documents/images/", Sys.Date(), "/", sep="")
file.name <- paste("/Selectivity_figure_with_subsets_", Sys.Date(), ".png", sep="")
file.path <- paste(file.dir, file.name, sep="")
dir.create(path=file.dir, showWarnings = FALSE, recursive = TRUE)
png(file.path, width = 5100, height = 6600, res=480)
multiplot(manuscript_1, manuscript_2a, manuscript_2b, manuscript_2c, layout = layout)
dev.off()
multiplot(manuscript_1, manuscript_2a, manuscript_2b, manuscript_2c, layout = layout)

Appendix

Model parameters and hitcall values

file.dir <- paste("documents/tables/", Sys.Date(), "/", sep="")
file.name <- paste("/AUR_TPO_selectivity_potency_AUC_", Sys.Date(), ".tsv", sep="")
file.path <- paste(file.dir, file.name, sep="")
dir.create(path=file.dir, showWarnings = FALSE, recursive = TRUE)

dat_app_result <- dat_L5[, .(aeid, chnm, casn, spid, modl, modl_ga, modl_acc, modl_tp, hitc, AUCval)]
dat_app_result <- merge(dat_app_result, 
                        dat_L5_with_cyto[, .(aeid, spid, specificity_modl_acc)],
                        by = c("spid", "aeid"))

write.table(dat_app_result, 
            sep='\t', 
            file = file.path, 
            quote=FALSE, 
            row.names = FALSE)

AUR TPO fit values

CTG fit values

QLI fit values

GUA fit values

All methods and flags available

Function definitions

I make use of multiple functions in the above analysis, but leave the definition here for document clarity.

# ggcurve based on R/manuscript/ggcurve.R ----------
# 
ggcurve <- function(chem, dat_L5, dat_L4, dat_L3, assay){
  dat_L5 <- dat_L5[assay == aeid]
  dat_L4 <- dat_L4[assay == aeid]
  dat_L3 <- dat_L3[assay == aeid]
  chemname <- unique(dat_L5[spid == chem, chnm])
  chem_AUC <- round(dat_L5[spid == chem, AUCval], 1)
  
  conc <- dat_L3[spid==chem, logc]
  resp <- dat_L3[spid==chem, resp]
  
  
  fit_hill_tp <- dat_L4[spid==chem, hill_tp]
  fit_hill_ga <- dat_L4[spid==chem, hill_ga]
  fit_hill_gw <- dat_L4[spid==chem, hill_gw]
  
  dat_mmi <- data.table(logc = conc, resp = resp)
  
  xmin <- min(conc)
  xmax <- max(conc)
  curve_max <- max(resp, 120)
  curve_min <- min(resp, -20)
  range_curve <- curve_max - curve_min
  plot_max <- curve_max + 0.15*range_curve
  plot_min <- curve_min - 0.15*range_curve
  
  curves_plot <- ggplot(dat_mmi, aes(x=logc, y=resp))
  curves_plot <- curves_plot +
    stat_function(fun = hill_curve, 
                  args=list(hill_tp = fit_hill_tp, 
                            hill_ga = fit_hill_ga, 
                            hill_gw = fit_hill_gw),
                  alpha=1, 
                  color = "black", 
                  size=3) +
    stat_function(fun = hill_curve, 
                  args=list(hill_tp = fit_hill_tp, 
                            hill_ga = fit_hill_ga, 
                            hill_gw = fit_hill_gw),
                  alpha=0.4, 
                  color = "black", 
                  geom="ribbon",
                  mapping = aes(ymin = 0, 
                                ymax = ..y.., 
                                linetype = NA)) +
    theme_bw() +
    geom_point(aes(size=3)) +
    annotate(geom="text", 
             x = 0.5, 
             y = 110, 
             label = paste("AUC : ", chem_AUC), 
             color = "black", hjust = 0.5) +
    theme(legend.position="none", 
          legend.title=element_blank()) +
    ylab("Percent Activity") +
    xlab("Log Concentration (uM)") +
    coord_cartesian(ylim = c(plot_min,plot_max)) +
    ggtitle(chemname)
  
  return(curves_plot)
  
}

# Hill curve and gnls curve ----------

hill_curve <- function(hill_tp, hill_ga, hill_gw, lconc){
  return(hill_tp/(1+10^((hill_ga - lconc)*hill_gw)))
}
gnls_curve <- function(top, ga, gw, la, lw, lconc){
  gain <- 1/(1+10^((ga - lconc)*gw))
  loss <- 1/(1+10^((lconc - la)*lw))
  return(top*gain*loss)
}

# Multiplot ----------
# multiplot was obtained from
# http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_%28ggplot2%29/
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  require(grid)
  
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  
  numPlots = length(plots)
  
  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }
  
  if (numPlots==1) {
    print(plots[[1]])
    
  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    
    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}