Processing the Thyroperoxidase (TPO) high throughput screening data through the pipeline requires several steps. First, the data needs to be brought into one separate table for each assay in multi conc, and a separate table for each assay in single conc. At this point, the data can be loaded into the pipeline and processed. Finally, the processed data can be retrieved from the pipeline database and analyzed.
The TPO dataset has morphed over time, with four separate ‘phases’ (including the retest and GUA overlap samples), collected by two different people, with five separate chemical orders and modifications to the protocol. This document seeks to document all of these events and adjustments to ensure the highest possible data fidelity and analysis transparency.
Each ‘phase’ has a different dataset, and each data file is formatted slightly differently. To handle this, each data file is processed separately to get into the correct format. Next, all of the tables are merged together into one. Final processing includes remapping sample IDs. Phase I and II were performed under the older TX code formatted sample ids, which is reflected in the raw data. The new pipeline, however, requires that these sample IDs be mapped to the new TP based codes. This is accomplished using tcpl:::tcplSpidMap()
For reference, this document was compiled with the following version of R and R packages.
sessionInfo()
## R version 3.2.3 (2015-12-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux Workstation release 6.8 (Santiago)
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel grid stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] sparkline_1.0 DT_0.1 tcpl_1.2.2 pander_0.6.0
## [5] gridExtra_2.2.1 ggplot2_2.1.0 stringr_1.0.0 data.table_1.9.6
## [9] magrittr_1.5 dplyr_0.5.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.5 knitr_1.13 munsell_0.4.3
## [4] colorspace_1.2-6 R6_2.1.2 plyr_1.8.4
## [7] tools_3.2.3 gtable_0.2.0 DBI_0.4-1
## [10] htmltools_0.3.5 RMySQL_0.10.9 yaml_2.1.13
## [13] assertthat_0.1 digest_0.6.9 tibble_1.1
## [16] numDeriv_2014.2-1 RColorBrewer_1.1-2 formatR_1.4
## [19] htmlwidgets_0.6 RSQLite_1.0.0 evaluate_0.9
## [22] rmarkdown_0.9.6 stringi_1.1.1 scales_0.4.0
## [25] chron_2.3-47
| Collected_By | Date_Collected | Order_Number | Test_Samples | Plates |
|---|---|---|---|---|
| Katie Paul Friedman | Unknown | Unknown | 100 | 12 |
# Read in data ----------
file.loc <- './Data/AUR TPO DR Phase I KP032 Compiled N from Dose Response_CompiledN.csv'
dat_mc_p1 <- data.table(read.table (file = file.loc,
header = TRUE,
sep=',',
comment.char="",
na.strings = c("na", "#VALUE!"), #to convert these to NA
stringsAsFactors = FALSE))
# Rename columns to match uniform variable names ----------
setnames(dat_mc_p1,
c("COLUMN",
"ROW",
"EPA_SAMPLE_ID",
"ASSAY_CONC",
"PLATE"),
c("coli",
"rowi",
"spid",
"conc",
"apid"))
# Remove extra columns ----------
set(dat_mc_p1,
j=which(colnames(dat_mc_p1) %in% c("WELL",
"ALIQUOT_WELL_ID",
"ALIQUOT_SOLVENT",
"ALIQUOT_CONC",
"ALIQUOT_CONC_UNIT",
"ALIQUOT_PLATE_BARCODE_SOURCE",
"SISTER_PLATE",
"ASSAY_CONC_UNIT")),
value=NULL)
# Add new columns ----------
dat_mc_p1[, wllq := 1L]
dat_mc_p1[, wllt := 't']
dat_mc_p1[, srcf := 'AUR TPO DR Phase I KP032 Compiled N from Dose Response_CompiledN.csv']
dat_mc_p1[, acsn := 'AUR_TPO_DoseResponse_PhaseI']
# Modify column entries ----------
dat_mc_p1[is.na(spid), wllt := 'b']
dat_mc_p1[spid %in% c('MMI', 'NAP', 'LUCINH1', 'LUCINH2'), wllt := 'c']
dat_mc_p1[spid == 'DMSO', wllt := 'n']
dat_mc_p1[spid == 'NEG', wllt := 'f']
# Melt 'N' columns to get rval ----------
dat_mc_p1 <- melt(dat_mc_p1,
id.vars = c('rowi', 'coli', 'spid', 'conc', 'wllq',
'wllt', 'srcf', 'acsn', 'apid'),
measure.vars = c('N1', 'N2', 'N3'),
variable.name='N',
value.name='rval')
dat_mc_p1[, apid := paste("phase_1_plate", apid, N, sep = "_")]
set(dat_mc_p1, j=which(colnames(dat_mc_p1) == "N"), value=NULL)
# Clean up column types ----------
letter2number <- c("A" = 1L,
"B" = 2L,
"C" = 3L,
"D" = 4L,
"E" = 5L,
"F" = 6L,
"G" = 7L,
"H" = 8L,
"I" = 9L,
"J" = 10L,
"K" = 11L,
"L" = 12L,
"M" = 13L,
"N" = 14L,
"O" = 15L,
"P" = 16L,
"Q" = 17L,
"R" = 18L,
"S" = 19L,
"T" = 20L,
"U" = 21L,
"V" = 22L,
"W" = 23L,
"X" = 24L,
"Y" = 25L,
"Z" = 26L)
dat_mc_p1[, rowi := unname(letter2number[rowi])]
dat_mc_p1[, rval := as.numeric(rval)]
# Set TX codes that were retested to wllq of 0 ----------
retest_spids <- c("TX000781", "TX000845", "TX001425", "TX002339", "TX003444",
"TX003725", "TX004371", "TX004536", "TX006158", "TX007182",
"TX007242", "TX008327", "TX008582", "TX008666", "TX009149",
"TX009328", "TX009711", "TX009779", "TX011582", "TX011587",
"TX011630", "TX011653", "TX015181", "TX015273", "TX015590",
"TX015604", "TX109149", "TX111582", "TX111587", "TX209149",
"TX211582", "TX211587")
dat_mc_p1[spid %in% retest_spids, wllq := 0]
| Collected_By | Date_Collected | Order_Number | Test_Samples | Plates |
|---|---|---|---|---|
| Katie Paul Friedman | Unknown | Unknown | 252 | 18 |
# Read in data ----------
file.loc <- './Data/L1_KP036_AUR_TPO_PhaseII_Dose_Response.csv'
dat_mc_p2 <- data.table(read.table (file = file.loc,
header = TRUE,
sep=',',
na.strings = c("NA", "EMPTY", "Kpaul"),
stringsAsFactors = FALSE)
[,1:14]) #There are blank columns after 14
# Rename columns to match uniform variable names ----------
setnames(dat_mc_p2,
c("COLUMN",
"ROW",
"EPA_SAMPLE_ID",
"ASSAY_CONC",
"Plate_ID",
"Response"),
c("coli",
"rowi",
"spid",
"conc",
"apid",
"rval"))
# Remove extra columns ----------
set(dat_mc_p2,
j=which(colnames(dat_mc_p2) %in% c("ALIQUOT_WELL_ID",
"ALIQUOT_SOLVENT",
"ALIQUOT_CONC",
"ALIQUOT_CONC_UNIT",
"ALIQUOT_PLATE_BARCODE",
"SISTER_PLATE",
"ASSAY_CONC_UNIT",
"N")),
value=NULL)
# Add new columns ----------
dat_mc_p2[, wllq := 1L]
dat_mc_p2[, wllt := 't']
dat_mc_p2[, srcf := 'L1_KP036_AUR_TPO_PhaseII_Dose_Response.csv']
dat_mc_p2[, acsn := 'AUR_TPO_DoseResponse_PhaseII']
# Modify column entries ----------
dat_mc_p2[is.na(spid), wllt := 'b']
dat_mc_p2[spid %in% c('MMI', 'NAP', 'LUCINH1', 'LUCINH2'), wllt := 'c']
dat_mc_p2[spid == 'DMSO', wllt := 'n']
dat_mc_p2[spid == 'NEG', wllt := 'f']
#Plate 6 (18 after change below) seems to have error.
#Values are much lower, and some dose response is in opposite direction.
#Set wllq to 0L
dat_mc_p2[apid==6,wllq := 0L]
# Clean up column types ----------
dat_mc_p2[, rowi := unname(letter2number[rowi])] #Use letter2number from chunk mc_phase1
dat_mc_p2[, apid := as.character(paste("phase_2_plate", apid, sep = "_"))]
dat_mc_p2[, rval := as.numeric(rval)]
# Set TX codes that were retested to wllq of 0 ----------
retest_spids <- c("TX000781", "TX000845", "TX001425", "TX002339", "TX003444",
"TX003725", "TX004371", "TX004536", "TX006158", "TX007182",
"TX007242", "TX008327", "TX008582", "TX008666", "TX009149",
"TX009328", "TX009711", "TX009779", "TX011582", "TX011587",
"TX011630", "TX011653", "TX015181", "TX015273", "TX015590",
"TX015604", "TX109149", "TX111582", "TX111587", "TX209149",
"TX211582", "TX211587")
dat_mc_p2[spid %in% retest_spids, wllq := 0]
| Collected_By | Date_Collected | Order_Number | Test_Samples | Plates |
|---|---|---|---|---|
| Steve Simmons | July 2015 | Unknown | 82 | 6 |
# Read in Data ----------
file.loc <- './Data/AUR_level.0_master.csv'
dat_mc_retest <- data.table(read.table (file = file.loc,
header = TRUE,
sep=',',
comment.char="",
na.strings = c("na", "#VALUE!"),
stringsAsFactors = FALSE))
# Rename columns to match uniform variable names ----------
setnames(dat_mc_retest,
c("Column",
"Row",
"EPA_Sample_ID",
"Assay_conc..uM.",
"RFU"),
c("coli",
"rowi",
"spid",
"conc",
"rval"))
# Add new columns ----------
dat_mc_retest[, wllq := 1L ]
dat_mc_retest[, wllt := 't']
dat_mc_retest[, srcf := 'AUR_level.0_master.csv']
dat_mc_retest[, acsn := 'AUR_TPO_DoseResponse_PhaseII']
dat_mc_retest[, apid := paste(Replicate,AUR_Plate,Bioloigcal_sample,sep="_")]
# Clean up column types ----------
dat_mc_retest[, rowi := unname(letter2number[rowi])] #Use letter2number from chunk mc_phase1
dat_mc_retest[, apid := as.character(apid)]
dat_mc_retest[, rval := as.numeric(rval)]
# Fix Lucinh formatting ----------
lucs <- c("LUCINH1", "LUCINH2", "LUCINH3", "LUCINH4",
"LUCINH5", "LUCINH6", "LUCINH7", "LUCINH8", "LUCINH9", "LUCINH10", "LUCINH11",
"LUCINH12", "LUCINH13")
dat_mc_retest[rowi==15L & spid %in% lucs, spid := "LUCINH1"]
dat_mc_retest[rowi==16L & spid %in% lucs, spid := "LUCINH2"]
# Modify column entries ----------
chem_controls <- c("MMI", "LUCINH1", "LUCINH2", "DCNQ", "DCNP", "PTU", "BP3")
dat_mc_retest[spid %in% chem_controls, wllt := 'c']
dat_mc_retest[spid == 'DMSO', wllt := 'n']
dat_mc_retest[spid == 'blank', wllt := 'f']
# Remove extra rows ----------
#Keep only those with Microsomes, remove Buffer_only
dat_mc_retest <- dat_mc_retest[Bioloigcal_sample == 'microsomes']
# Remove extra columns ----------
set(dat_mc_retest,
j=which(colnames(dat_mc_retest) %in% c("Replicate",
"stock_conc..mM.",
"Aliquot_conc..mM.",
"Bioloigcal_sample",
"AUR_Plate")),
value=NULL)
# Combine ----------
dat_mc <- rbindlist(list(dat_mc_p1,
dat_mc_p2,
dat_mc_retest),
use.names = TRUE)
# Map spids TX to TP ----------
dat_mc <- tcpl:::tcplSpidMap(dat_mc) #Function is not exported?
#Gives warning "The following source spids did not map to updated spids:"
#This is because newer assays with TP rather than TX don't need to map.
#The TP is simply left alone, the warning is fine.
#Remove extra columns added by tcplSpidMap
set(dat_mc,
j=which(colnames(dat_mc) %in% c("asid",
"aid",
"spid_source")),
value=NULL)
# Change NAP to DCNQ ----------
dat_mc[spid == "NAP", spid := "DCNQ"]
# Write level 0 data to the database ----------
tcplWriteLvl0(dat_mc, type="mc")
| Collected_By | Date_Collected | Order_Number | Test_Samples | Plates |
|---|---|---|---|---|
| Katie Paul Friedman | Unknown | Unknown | 311 | 3 |
This data was originally provided as a worksheet in excel document KP032 AURTPO ToxCast Phase I Single Pt Screen.xlsx I output tab Compiled N as a separate CSV file This CSV file is read in below, with file name KP032 AURTPO ToxCast Phase I Single Pt Screen_CompiledN.csv Looking at column 24, it seems as though this column marked as DMSO is either blank or NEG. I set this to blank so that it does not throw off normalization.
# Read in data ----------
file.loc <- './Data/KP032 AURTPO ToxCast Phase I Single Pt Screen_CompiledN.csv'
dat_sc_p1 <- data.table(read.table(file = file.loc,
header = TRUE,
sep=',',
na.strings = c("EMPTY", "Kpaul"),
stringsAsFactors = FALSE))
# Rename columns to match uniform variable names ----------
setnames(dat_sc_p1,
c("ROW",
"COLUMN",
"EPA_SAMPLE_ID",
"ASSAY_CONC"),
c("rowi",
"coli",
"spid",
"conc"))
# Remove extra columns ----------
set(dat_sc_p1,
j=which(colnames(dat_sc_p1) %in% c("WELL",
"ALIQUOT_WELL_ID",
"ALIQUOT_SOLVENT",
"ALIQUOT_CONC",
"ALIQUOT_CONC_UNIT",
"ALIQUOT_PLATE_BARCODE_SOURCE",
"SISTER_PLATE",
"ASSAY_CONC_UNIT")),
value=NULL)
# Add new columns ----------
dat_sc_p1[, wllq := 1L]
dat_sc_p1[, wllt := 't']
dat_sc_p1[, srcf := 'KP032 AURTPO ToxCast Phase I Single Pt Screen Compiled N.csv']
dat_sc_p1[, acsn := 'AUR_TPO_SingleConc_PhaseI']
# Modify column entries ----------
dat_sc_p1[is.na(spid), wllt := 'b']
dat_sc_p1[spid %in% c('MMI', 'NAP', 'LUCINH1', 'LUCINH2', 'MMB', 'DDZ', 'MBT'),
wllt := 'p']
dat_sc_p1[spid == 'DMSO', wllt := 'n']
dat_sc_p1[spid == 'NEG', wllt := 'f']
dat_sc_p1[coli == 24, wllt := 'b']
#See note above, coli == 24 seems to be blank or NEG, not DMSO
# Melt 'N' columns to get rval ----------
dat_sc_p1 <- melt(dat_sc_p1,
id.vars = c('rowi', 'coli', 'spid', 'conc',
'wllq', 'wllt', 'srcf', 'acsn'),
measure.vars = c("N1", "N2", "N3"),
variable.name='apid',
value.name='rval')
# Clean up column types ----------
dat_sc_p1[, rowi := unname(letter2number[rowi])]
dat_sc_p1[, rval := as.numeric(rval)]
dat_sc_p1[, apid := as.character(apid)]
| Collected_By | Date_Collected | Order_Number | Test_Samples | Plates |
|---|---|---|---|---|
| Katie Paul Friedman | Unknown | Unknown | 812 | 9 |
This data was originally provided as a worksheet in excel document KP036 AUR-TPO Screening of the ToxCast Phase II Chemical Library022014.xlsx I output tab Longformat as a separate CSV file This CSV file is read in below, with file name KP036 AUR-TPO Screening of the ToxCast Phase II Chemical Library022014_Longformat.csv
# Read in data ----------
file.loc <- './Data/KP036 AUR-TPO Screening of the ToxCast Phase II Chemical Library022014_Longformat.csv'
dat_sc_p2 <- data.table(read.table (file = file.loc,
header = TRUE,
sep=',',
na.strings = c("BACKGROUND", "Kpaul", "NA"),
stringsAsFactors = FALSE))
# Rename columns to match uniform variable names ----------
setnames(dat_sc_p2,
c("COLUMN",
"ROW",
"EPA_SAMPLE_ID",
"ASSAY_CONC",
"plate_id",
"Resp"),
c("coli",
"rowi",
"spid",
"conc",
"apid",
"rval"))
# Remove extra columns ----------
set(dat_sc_p2, j=which(colnames(dat_sc_p2) %in% c("ALIQUOT_WELL_ID",
"ALIQUOT_SOLVENT",
"ALIQUOT_CONC",
"ALIQUOT_CONC_UNIT",
"SISTER_PLATE",
"ASSAY_CONC_UNIT",
"ALIQUOT_PLATE_BARCODE",
"N",
"sample_tech_rep_id")),
value=NULL)
# Add new columns ----------
dat_sc_p2[, wllq := 1L]
dat_sc_p2[, wllt := 't']
dat_sc_p2[, srcf := 'KP036 AUR-TPO Screening of the ToxCast Phase II Chemical Library022014_Longformat.csv']
dat_sc_p2[, acsn := 'AUR_TPO_SingleConc_PhaseII']
# Modify column entries ----------
dat_sc_p2[is.na(spid), wllt := 'b']
dat_sc_p2[spid %in% c('MMI', 'NAP', 'LUCINH1', 'LUCINH2'), wllt := 'p']
dat_sc_p2[spid == 'DMSO', wllt := 'n']
dat_sc_p2[spid == 'NEG', wllt := 'f']
# Clean up column types ----------
dat_sc_p2[, rowi := unname(letter2number[rowi])]
#add 3 to apid to get it to start at 4,
#this way it will be different than Phase I and not overwrite.
dat_sc_p2[, apid := as.character(apid + 3)]
dat_sc_p2[, rval := as.numeric(rval)]
# Combine ----------
dat_sc <- rbindlist(list(dat_sc_p1,
dat_sc_p2),
use.names = TRUE)
# Map spids TX to TP ----------
dat_sc <- tcpl:::tcplSpidMap(dat_sc) #Function is not exported?
#Gives warning "The following source spids did not map to updated spids:"
#This is because newer assays with TP rather than TX don't need to map.
#The TP is simply left alone, the warning is fine.
#Remove extra columns added by tcplSpidMap
set(dat_sc,
j=which(colnames(dat_sc) %in% c("asid",
"aid",
"spid_source")),
value=NULL)
# Change NAP to DCNQ ----------
dat_sc[spid == "NAP", spid := "DCNQ"]
# Write level 0 data to the database ----------
tcplWriteLvl0(dat_sc, type="sc")
| Collected_By | Date_Collected | Order_Number | Test_Samples | Plates |
|---|---|---|---|---|
| Katie Paul Friedman | Unknown | Unknown | 100 | 12 |
This data was originally provided as a worksheet in excel document KP032 Compiled N from Dose Response.xlsx I output tab Longformat CTG as a separate CSV file. This CSV file is read in below, with file name KP032 Compiled N from Dose Response_LongformatCTG.csv The CSV file has values of #VALUE!, comment.char=“” allows these to be converted to NA. Only the first plate has values for concentration. I recalculate these, assuming the same dilution factor was used for all plates. This needs to be confirmed.
# Read in data ----------
file.loc <- './Data/KP032 Compiled N from Dose Response_LongformatCTG.csv'
dat_ctg_p1 <- data.table(read.table (file = file.loc,
header = TRUE,
sep=',',
comment.char="",
na.strings = c("na", "na_1", "#VALUE!"),
stringsAsFactors = FALSE))
# Populate conc column ----------
#Do this before next steps, using Aliquot conc values to propagate
#Need to confirm this is correct
#
#In .xlsx file, this seems to have been done using equation
#=((H9*0.35)/65)*1000
#where H9 is Aliquot conc
dat_ctg_p1[, conc := ((ALIQUOT_CONC*0.35)/65)*1000]
# Populate apid column ----------
dat_ctg_p1[, apid := paste("ctg_phase_1_plate", PLATE, "N", N, sep = "_")]
# Rename columns to match uniform variable names ----------
setnames(dat_ctg_p1,
c("COLUMN",
"ROW",
"EPA_SAMPLE_ID",
"resp"),
c("coli",
"rowi",
"spid",
"rval"))
# Remove extra columns ----------
set(dat_ctg_p1,
j=which(colnames(dat_ctg_p1) %in% c("sample_tech_rep_id",
"sample_id",
"rep_id",
"WELL",
"ALIQUOT_WELL_ID",
"ALIQUOT_SOLVENT",
"ALIQUOT_CONC",
"ALIQUOT_CONC_UNIT",
"ALIQUOT_PLATE_BARCODE_SOURCE",
"SISTER_PLATE",
"ASSAY_CONC_UNIT",
"PLATE",
"N")),
value=NULL)
# Add new columns -----------
dat_ctg_p1[, wllq := 1L]
dat_ctg_p1[, wllt := 't']
dat_ctg_p1[, srcf := 'KP032 Compiled N from Dose Response_LongformatCTG.csv']
dat_ctg_p1[, acsn := 'CellTiterGlo_PhaseI']
# Modify column entries ----------
dat_ctg_p1[is.na(spid), wllt := 'b']
dat_ctg_p1[spid %in% c('MMI',
'NAP',
'LUCINH1',
'LUCINH2'),
wllt := 'c']
dat_ctg_p1[spid == 'DMSO', wllt := 'n']
dat_ctg_p1[spid == 'NEG', wllt := 'f']
# Clean up column types ----------
dat_ctg_p1[, rowi := unname(letter2number[rowi])]
dat_ctg_p1[, rval := as.numeric(rval)]
# Set TX codes that were retested to wllq of 0 ----------
#retest_spids from chunk mc_phase1
dat_ctg_p1[spid %in% retest_spids, wllq := 0]
| Collected_By | Date_Collected | Order_Number | Test_Samples | Plates |
|---|---|---|---|---|
| Katie Paul Friedman | Unknown | Unknown | 252 | 18 |
# Read in data ----------
file.loc <- './Data/L1_KP036_CellTiterGlo_PhaseII_Dose_Response.csv'
dat_ctg_p2 <- data.table(read.table (file = file.loc,
header = TRUE,
sep=',',
na.strings = c("EMPTY", "Kpaul", "NA"),
stringsAsFactors = FALSE))
# Rename columns to match uniform variable names ----------
setnames(dat_ctg_p2,
c("COLUMN",
"ROW",
"EPA_SAMPLE_ID",
"ASSAY_CONC",
"Plate_ID",
"Response"),
c("coli",
"rowi",
"spid",
"conc",
"apid",
"rval"))
# Remove extra columns ----------
set(dat_ctg_p2,
j=which(colnames(dat_ctg_p2) %in% c("ALIQUOT_WELL_ID",
"ALIQUOT_SOLVENT",
"ALIQUOT_CONC",
"ALIQUOT_CONC_UNIT",
"ALIQUOT_PLATE_BARCODE",
"SISTER_PLATE",
"ASSAY_CONC_UNIT",
"N")),
value=NULL)
# Add new columns ----------
dat_ctg_p2[, wllq := 1L]
dat_ctg_p2[, wllt := 't']
dat_ctg_p2[, srcf := 'L1_KP036_CellTiterGlo_PhaseII_Dose_Response.csv']
dat_ctg_p2[, acsn := 'CellTiterGlo_PhaseII']
# Modify column entries ----------
dat_ctg_p2[is.na(spid), wllt := 'b']
dat_ctg_p2[spid %in% c('MMI',
'NAP',
'LUCINH1',
'LUCINH2'),
wllt := 'c']
dat_ctg_p2[spid == 'DMSO', wllt := 'n']
dat_ctg_p2[spid == 'NEG', wllt := 'f']
# Clean up column types ----------
dat_ctg_p2[, rowi := unname(letter2number[rowi])] #Use letter2number from chunk mc_phase1
dat_ctg_p2[, apid := as.character(paste("ctg_phase_2_plate", apid, sep = "_"))]
dat_ctg_p2[, rval := as.numeric(rval)]
# Set TX codes that were retested to wllq of 0 ----------
#retest_spids from chunk mc_phase1
dat_ctg_p2[spid %in% retest_spids, wllq := 0]
| Collected_By | Date_Collected | Order_Number | Test_Samples | Plates |
|---|---|---|---|---|
| Steve Simmons | July 2015 | Unknown | 82 | 6 |
File provided by Steve Simmons June 19 2015
# Read in data ----------
file.loc <- './Data/CTG_level.0_master.csv'
dat_ctg_retest <- data.table(TPO <- read.table(file = file.loc,
header = TRUE,
sep=',',
comment.char="",
na.strings = c("na", "#VALUE!"),
stringsAsFactors = FALSE))
# Rename columns to match uniform variable names ----------
setnames(dat_ctg_retest,
c("Column",
"Row",
"EPA_Sample_ID",
"Assay_conc..uM.",
"RLU"),
c("coli",
"rowi",
"spid",
"conc",
"rval"))
# Add new columns ----------
dat_ctg_retest[, wllq := 1L]
dat_ctg_retest[, wllt := 't']
dat_ctg_retest[, srcf := 'CTG_level.0_master.csv']
dat_ctg_retest[, acsn := 'CellTiterGlo_PhaseII']
dat_ctg_retest[, apid := paste(Replicate,AUR_Plate,sep="_")]
# Clean up column types ----------
dat_ctg_retest[, rowi := unname(letter2number[rowi])]
dat_ctg_retest[, rval := as.numeric(rval)]
# Fix Lucinh formatting ----------
lucs <- c("LUCINH1", "LUCINH2", "LUCINH3", "LUCINH4",
"LUCINH5", "LUCINH6", "LUCINH7", "LUCINH8",
"LUCINH9", "LUCINH10", "LUCINH11", "LUCINH12",
"LUCINH13")
dat_ctg_retest[rowi==15L & spid %in% lucs, spid := "LUCINH1"]
dat_ctg_retest[rowi==16L & spid %in% lucs, spid := "LUCINH2"]
# Modify column entries ----------
chem_controls <- c("MMI", "LUCINH1", "LUCINH2", "DCNQ", "DCNP", "PTU", "BP3")
dat_ctg_retest[spid %in% chem_controls, wllt := 'c']
dat_ctg_retest[spid == 'DMSO', wllt := 'n']
dat_ctg_retest[spid == 'blank', wllt := 'f']
# Remove extra columns ----------
set(dat_ctg_retest,
j=which(colnames(dat_ctg_retest) %in% c("Replicate",
"stock_conc..mM.",
"Aliquot_conc..mM.",
"Bioloigcal_sample",
"AUR_Plate")),
value=NULL)
# Combine ----------
dat_ctg <- rbindlist(list(dat_ctg_p1,
dat_ctg_p2,
dat_ctg_retest),
use.names = TRUE)
# Map spids TX to TP ----------
dat_ctg <- tcpl:::tcplSpidMap(dat_ctg) #Function is not exported?
#Gives warning "The following source spids did not map to updated spids:"
#This is because newer assays with TP rather than TX don't need to map.
#The TP is simply left alone, the warning is fine.
#Remove extra columns added by tcplSpidMap
set(dat_ctg,
j=which(colnames(dat_ctg) %in% c("asid",
"aid",
"spid_source")),
value=NULL)
# Change NAP to DCNQ ----------
dat_ctg[spid == "NAP", spid := "DCNQ"]
# Write level 0 data to the database ----------
tcplWriteLvl0(dat_ctg, type="mc")
| Collected_By | Date_Collected | Order_Number | Test_Samples | Plates |
|---|---|---|---|---|
| Steve Simmons | July 2015 | Unknown | 341 | 30 |
# Read in data ----------
file.loc <- './Data/KP032 plates_QLI_2.0.csv'
dat_qli_p1p2 <- data.table(read.table(file = file.loc,
header = TRUE,
sep=',',
comment.char="",
na.strings = c("na", "#VALUE!", "EMPTY", "Kpaul", "NA"),
stringsAsFactors = FALSE))
# Rename columns to match uniform variable names ----------
setnames(dat_qli_p1p2,
c("COLUMN",
"ROW",
"EPA_SAMPLE_ID",
"ASSAY_CONC",
"PLATE"),
c("coli",
"rowi",
"spid",
"conc",
"apid"))
# Remove extra columns ----------
set(dat_qli_p1p2, j=which(colnames(dat_qli_p1p2) %in% c("ALIQUOT_WELL_ID",
"ALIQUOT_SOLVENT",
"ALIQUOT_CONC",
"ALIQUOT_CONC_UNIT",
"SISTER_PLATE",
"ASSAY_CONC_UNIT",
"ALIQUOT_PLATE_BARCODE_SOURCE",
"WELL")),
value=NULL)
# Add new columns ----------
dat_qli_p1p2[ , wllq := 1L]
dat_qli_p1p2[ , wllt := 't']
dat_qli_p1p2[ , acsn := 'Quantilum_Inhibition_2']
#The first 4 chemical plates were phase 1, final 6 were phase 2
dat_qli_p1p2[apid %in% c(1:4) , srcf := 'KP032 plates_QLI_2.0.csv Phase I']
dat_qli_p1p2[apid %in% c(5:10) , srcf := 'KP032 plates_QLI_2.0.csv Phase II']
# Modify column entries ----------
dat_qli_p1p2[is.na(spid), wllt := 'b']
dat_qli_p1p2[spid %in% c('MMI', 'NAP', 'LUCINH1', 'LUCINH2'), wllt := 'c']
dat_qli_p1p2[spid == 'DMSO', wllt := 'n']
dat_qli_p1p2[spid == 'NEG', wllt := 'f']
# Melt data to get one column of response ----------
#N1, N2, N3 become values in N
#values from N1, N2, N3 become values in rval
dat_qli_p1p2 <- melt(dat_qli_p1p2,
id.vars = c('rowi', 'coli', 'spid', 'conc',
'wllq', 'wllt', 'srcf', 'acsn', 'apid'),
measure.vars=c("N1", "N2", "N3"),
variable.name='N',
value.name='rval')
dat_qli_p1p2[srcf == "KP032 plates_QLI_2.0.csv Phase I", phase := "phase_1"]
dat_qli_p1p2[srcf == "KP032 plates_QLI_2.0.csv Phase II", phase := "phase_2"]
dat_qli_p1p2[, apid := paste("qli", phase, "plate", apid, N, sep = "_")]
# remove more columns ----------
set(dat_qli_p1p2, j=which(colnames(dat_qli_p1p2) %in% c("N", "phase")), value=NULL)
# Modify column entries ----------
dat_qli_p1p2[, rowi := unname(letter2number[rowi])]
dat_qli_p1p2[, rval := as.numeric(rval)]
# Set TX codes that were retested to wllq of 0 ----------
#retest_spids from chunk mc_phase1
dat_qli_p1p2[spid %in% retest_spids, wllq := 0]
| Collected_By | Date_Collected | Order_Number | Test_Samples | Plates |
|---|---|---|---|---|
| Steve Simmons | July 2015 | Unknown | 82 | 6 |
# Read in data ----------
file.loc <- './Data/QLI_level.0_master.csv'
dat_qli_retest <- data.table(read.table (file = file.loc,
header = TRUE,
sep=',',
comment.char="",
na.strings = c("na", "#VALUE!"),
stringsAsFactors = FALSE))
# Rename columns to match uniform variable names ----------
setnames(dat_qli_retest,
c("Column",
"Row",
"EPA_Sample_ID",
"Assay_conc..uM.",
"RLU"),
c("coli",
"rowi",
"spid",
"conc",
"rval"))
# Add new columns ----------
dat_qli_retest[, wllq := 1L]
dat_qli_retest[, wllt := 't']
dat_qli_retest[, srcf := 'QLI_level.0_master.csv']
dat_qli_retest[, acsn := 'Quantilum_Inhibition_2']
dat_qli_retest[, apid := paste(Replicate,AUR_Plate,sep="_")]
# Clean up column types ----------
dat_qli_retest[, rowi := unname(letter2number[rowi])]
dat_qli_retest[, rval := as.numeric(rval)]
# Fix Lucinh formatting ----------
lucs <- c("LUCINH1", "LUCINH2", "LUCINH3", "LUCINH4",
"LUCINH5", "LUCINH6", "LUCINH7", "LUCINH8",
"LUCINH9", "LUCINH10", "LUCINH11", "LUCINH12",
"LUCINH13")
dat_qli_retest[rowi==15L & spid %in% lucs, spid := "LUCINH1"]
dat_qli_retest[rowi==16L & spid %in% lucs, spid := "LUCINH2"]
# Modify column entries ----------
chem_controls <- c("MMI", "LUCINH1", "LUCINH2", "DCNQ", "DCNP", "PTU", "BP3")
dat_qli_retest[spid %in% chem_controls, wllt := 'c']
dat_qli_retest[spid == 'DMSO', wllt := 'n']
dat_qli_retest[spid == 'blank', wllt := 'f']
# Remove extra columns ----------
set(dat_qli_retest,
j=which(colnames(dat_qli_retest) %in% c("Replicate",
"stock_conc..mM.",
"Bioloigcal_sample",
"AUR_Plate")),
value=NULL)
# Combine ----------
dat_qli <- rbindlist(list(dat_qli_p1p2,
dat_qli_retest),
use.names = TRUE)
# Map spids TX to TP ----------
dat_qli <- tcpl:::tcplSpidMap(dat_qli) #Function is not exported?
#Gives warning "The following source spids did not map to updated spids:"
#This is because newer assays with TP rather than TX don't need to map.
#The TP is simply left alone, the warning is fine.
#Remove extra columns added by tcplSpidMap
set(dat_qli,
j=which(colnames(dat_qli) %in% c("asid",
"aid",
"spid_source")),
value=NULL)
# Change NAP to DCNQ ----------
dat_qli[spid == "NAP", spid := "DCNQ"]
# Write level 0 data to the database ----------
tcplWriteLvl0(dat_qli, type="mc")
| Collected_By | Date_Collected | Order_Number | Test_Samples | Plates |
|---|---|---|---|---|
| Steve Simmons | June 2015 | Unknown | Unknown | Unknown |
# Read in data ----------
file.loc <- './Data/GUA_level.0_master.csv'
dat_gua <- data.table(read.table (file = file.loc,
header = TRUE,
sep=',',
comment.char="",
na.strings = c("na", "#VALUE!"),
stringsAsFactors = FALSE))
# Rename columns to match uniform variable names ----------
setnames(dat_gua,
c("Column",
"Row",
"EPA_Sample_ID",
"Assay_conc..uM."),
c("coli",
"rowi",
"spid",
"conc"
))
# Add new columns ----------
dat_gua[, wllq := 1L]
dat_gua[, wllt := 't']
dat_gua[, srcf := 'GUA_level.0_master.csv']
dat_gua[, acsn := 'GUA_TPO_DoseResponse']
dat_gua[, apid := paste(Replicate,GUA_Plate,Bioloigcal_sample,sep="_")]
# Caluclate rval ----------
final_cols <- c("t_145",
"t_145.1",
"t_145.2",
"t_145.3",
"t_145.4",
"t_145.5",
"t_145.6",
"t_145.7",
"t_145.8",
"t_145.9",
"t_146",
"t_146.1",
"t_146.2",
"t_146.3",
"t_146.4",
"t_146.5",
"t_146.6",
"t_146.7",
"t_146.8")
first_cols <- c("t_64", "t_64.25")
first_median_vect <- apply(dat_gua[, first_cols, with=FALSE], 1, function(x) median(x))
final_median_vect <- apply(dat_gua[, final_cols, with=FALSE], 1, function(x) median(x))
dat_gua[, rval := final_median_vect - first_median_vect]
# remove the time resolved columns ----------
set(dat_gua, j=which(colnames(dat_gua) %in% c(first_cols, final_cols)), value=NULL)
# Clean up column types ----------
dat_gua[, rowi := unname(letter2number[rowi])]
dat_gua[, rval := as.numeric(rval)]
# Modify column entries ----------
dat_gua[spid %in% c('MMI', 'PTU', 'LifeTech.AUR', 'BP3'), wllt := 'c']
dat_gua[spid == 'DMSO', wllt := 'n']
# Remove extra rows ----------
#Keep only those with Microsomes, remove Buffer_only
dat_gua <- dat_gua[Bioloigcal_sample == 'Microsomes']
# Remove extra columns ----------
set(dat_gua,
j=which(colnames(dat_gua) %in% c("Replicate",
"Aliquot_conc..mM.",
"Bioloigcal_sample",
"t_0",
"GUA_Plate")),
value=NULL)
# Add NA rows for blank wells that were unused ----------
#Only columns 1:6 and rows A:E were used. Set all others to NA rows
#Eventually rows will be 1:16 and columns 1:24
apids <- unique(dat_gua[, apid])
blank_data <- expand.grid(rowi=1:16,
coli=1:24,
spid=NA,
conc=NA,
wllq=0L,
wllt='b',
srcf='GUA_level.0_master.csv',
acsn='GUA_TPO_DoseResponse',
apid=apids,
rval=NA,
stringsAsFactors=FALSE)
blank_data <- data.table(blank_data)
blank_data <- blank_data[!(rowi %in% 1:5 & coli %in% 1:6)]
#combine blank_data and dat_gua
dat_gua <- rbind(dat_gua, blank_data, use.names=TRUE, fill=FALSE)
# Write level 0 data to the database ----------
tcplWriteLvl0(dat_gua, type="mc")
The following block of code will set all of the proper methods for the pipeline and process the data with these settings.
# acid and aeid can be found using functions ----------
# tcpl::tcplLoadAsid()
# tcpl::tcplLoadAcid()
# tcpl::tcplLoadAeid()
acid <- 963
aeid <- 1508
# set methods level 2 ----------
tcplClearMthd(lvl = 2L, id = acid, type = "mc")
tcplAssignMthd(lvl = 2L,
id = acid,
mthd_id = 1,
ordr = 1,
type = "mc")
# set methods level 3 ----------
tcplClearMthd(lvl = 3L, id = aeid, type = "mc")
tcplAssignMthd(lvl = 3L,
id = aeid,
mthd_id = c(11, 28, 5),
ordr = c(1,2,3),
type = "mc")
# set methods level 5 ----------
tcplClearMthd(lvl = 5L, id = aeid, type = "mc")
tcplAssignMthd(lvl = 5L,
id = aeid,
mthd_id = 2,
type = "mc")
# set methods level 6 ----------
tcplClearMthd(lvl = 6L, id = aeid, type = "mc")
tcplAssignMthd(lvl = 6L,
id = aeid,
mthd_id = c(6, 7, 8, 10, 11, 12, 15, 16),
type = "mc")
# run all ----------
tcplRun(id = acid,
slvl = 1L,
elvl = 6L,
type = "mc",
mc.cores = 1L)
# make aeid plots ----------
tcplMakeAeidPlts(aeid, lvl=6L)
The above analysis runs with the following settings. A complete list of available setting can be found here.
| acid | mthd | mthd_id | ordr |
|---|---|---|---|
| 963 | none | 1 | 1 |
There are no level 2 methods applied to this data set. This level is used to do a global scaling or shifting of the data, and is not necessary for the TPO dataset.
| aeid | mthd | mthd_id | ordr |
|---|---|---|---|
| 1508 | bval.apid.nwlls.med | 11 | 1 |
| 1508 | pval.apid.f.min | 28 | 2 |
| 1508 | resp.pc | 5 | 3 |
Level 3 is the step when the data is normalized. There are 3 methods applied during level 3.
| aeid | mthd | mthd_id |
|---|---|---|
| 1508 | pc20 | 2 |
Level 5 is the step where the cutoff value is set, the winning model is chosen, and a hit call is determined. This analysis sets to cutoff to 20%.
| aeid | mthd | mthd_id | nddr |
|---|---|---|---|
| 1508 | singlept.hit.high | 6 | 0 |
| 1508 | singlept.hit.mid | 7 | 0 |
| 1508 | multipoint.neg | 8 | 0 |
| 1508 | noise | 10 | 0 |
| 1508 | border.hit | 11 | 0 |
| 1508 | border.miss | 12 | 0 |
| 1508 | gnls.lowconc | 15 | 0 |
| 1508 | overfit.hit | 16 | 0 |
Level 6 adds flags to the analysis. These do not change the hit call, but will flag a curve in it meets the criteria for a particular flag. The flags that are evaluated in this analysis are in the above table.
The following block of code will set all of the proper methods for the pipeline and process the data with these settings.
# acid and aeid can be found using functions ----------
# tcpl::tcplLoadAsid()
# tcpl::tcplLoadAcid()
# tcpl::tcplLoadAeid()
ctg_acid <- 964
ctg_aeid <- 1509
# set methods level 2 ----------
tcplClearMthd(lvl = 2L, id = ctg_acid, type = "mc")
tcplAssignMthd(lvl = 2L,
id = ctg_acid,
mthd_id = 1,
ordr = 1,
type = "mc")
# set methods level 3 ----------
tcplClearMthd(lvl = 3L, id = ctg_aeid, type = "mc")
tcplAssignMthd(lvl = 3L,
id = ctg_aeid,
mthd_id = c(11, 32, 5),
ordr = c(1,2,3),
type = "mc")
# set methods level 5 ----------
tcplClearMthd(lvl = 5L, id = ctg_aeid, type = "mc")
tcplAssignMthd(lvl = 5L,
id = ctg_aeid,
mthd_id = 2,
type = "mc")
# set methods level 6 ----------
tcplClearMthd(lvl = 6L, id = ctg_aeid, type = "mc")
tcplAssignMthd(lvl = 6L,
id = ctg_aeid,
mthd_id = c(6, 7, 8, 10, 11, 12, 15, 16),
type = "mc")
# run all ----------
tcplRun(id = ctg_acid,
slvl = 1L,
elvl = 6L,
type = "mc",
mc.cores = 1L)
# make aeid plots ----------
tcplMakeAeidPlts(ctg_aeid, lvl=6L)
| acid | mthd | mthd_id | ordr |
|---|---|---|---|
| 964 | none | 1 | 1 |
There are no level 2 methods applied to this data set. This level is used to do a global scaling or shifting of the data, and is not necessary for the TPO dataset.
| aeid | mthd | mthd_id | ordr |
|---|---|---|---|
| 1509 | bval.apid.nwlls.med | 11 | 1 |
| 1509 | pval.zero | 32 | 2 |
| 1509 | resp.pc | 5 | 3 |
Level 3 is the step when the data is normalized. There are 3 methods applied during level 3.
| aeid | mthd | mthd_id |
|---|---|---|
| 1509 | bmad3 | 1 |
| 1509 | pc20 | 2 |
Level 5 is the step where the cutoff value is set, the winning model is chosen, and a hit call is determined. This analysis sets to cutoff to 20%.
| aeid | mthd | mthd_id | nddr |
|---|---|---|---|
| 1509 | singlept.hit.high | 6 | 0 |
| 1509 | singlept.hit.mid | 7 | 0 |
| 1509 | multipoint.neg | 8 | 0 |
| 1509 | noise | 10 | 0 |
| 1509 | border.hit | 11 | 0 |
| 1509 | border.miss | 12 | 0 |
| 1509 | gnls.lowconc | 15 | 0 |
| 1509 | overfit.hit | 16 | 0 |
Level 6 adds flags to the analysis. These do not change the hit call, but will flag a curve in it meets the criteria for a particular flag. The flags that are evaluated in this analysis are in the above table.
The following block of code will set all of the proper methods for the pipeline and process the data with these settings.
# acid and aeid can be found using functions ----------
# tcpl::tcplLoadAsid()
# tcpl::tcplLoadAcid()
# tcpl::tcplLoadAeid()
qli_acid <- 1146
qli_aeid <- 1848
# set methods level 2 ----------
tcplClearMthd(lvl = 2L, id = qli_acid, type = "mc")
tcplAssignMthd(lvl = 2L,
id = qli_acid,
mthd_id = 1,
ordr = 1,
type = "mc")
# set methods level 3 ----------
tcplClearMthd(lvl = 3L, id = qli_aeid, type = "mc")
tcplAssignMthd(lvl = 3L,
id = qli_aeid,
mthd_id = c(11, 32, 5),
ordr = c(1,2,3),
type = "mc")
# set methods level 5 ----------
tcplClearMthd(lvl = 5L, id = qli_aeid, type = "mc")
tcplAssignMthd(lvl = 5L,
id = qli_aeid,
mthd_id = 2,
type = "mc")
# set methods level 6 ----------
tcplClearMthd(lvl = 6L, id = qli_aeid, type = "mc")
tcplAssignMthd(lvl = 6L,
id = qli_aeid,
mthd_id = c(6, 7, 8, 10, 11, 12, 15, 16),
type = "mc")
# run all ----------
tcplRun(id = qli_acid,
slvl = 1L,
elvl = 6L,
type = "mc",
mc.cores = 1L)
# make aeid plots ----------
tcplMakeAeidPlts(qli_aeid, lvl=6L)
| acid | mthd | mthd_id | ordr |
|---|---|---|---|
| 1146 | none | 1 | 1 |
There are no level 2 methods applied to this data set. This level is used to do a global scaling or shifting of the data, and is not necessary for the TPO dataset.
| aeid | mthd | mthd_id | ordr |
|---|---|---|---|
| 1848 | bval.apid.nwlls.med | 11 | 1 |
| 1848 | pval.zero | 32 | 2 |
| 1848 | resp.pc | 5 | 3 |
Level 3 is the step when the data is normalized. There are 3 methods applied during level 3.
| aeid | mthd | mthd_id |
|---|---|---|
| 1848 | pc20 | 2 |
Level 5 is the step where the cutoff value is set, the winning model is chosen, and a hit call is determined. This analysis sets to cutoff to 20%.
| aeid | mthd | mthd_id | nddr |
|---|---|---|---|
| 1848 | singlept.hit.high | 6 | 0 |
| 1848 | singlept.hit.mid | 7 | 0 |
| 1848 | multipoint.neg | 8 | 0 |
| 1848 | noise | 10 | 0 |
| 1848 | border.hit | 11 | 0 |
| 1848 | border.miss | 12 | 0 |
| 1848 | gnls.lowconc | 15 | 0 |
| 1848 | overfit.hit | 16 | 0 |
Level 6 adds flags to the analysis. These do not change the hit call, but will flag a curve in it meets the criteria for a particular flag. The flags that are evaluated in this analysis are in the above table.
The following block of code will set all of the proper methods for the pipeline and process the data with these settings.
# acid and aeid can be found using functions ----------
# tcpl::tcplLoadAsid()
# tcpl::tcplLoadAcid()
# tcpl::tcplLoadAeid()
gua_acid <- 1128
gua_aeid <- 1824
# set methods level 2 ----------
tcplClearMthd(lvl = 2L, id = gua_acid, type = "mc")
tcplAssignMthd(lvl = 2L,
id = gua_acid,
mthd_id = 1,
ordr = 1,
type = "mc")
# set methods level 3 ----------
tcplClearMthd(lvl = 3L, id = gua_aeid, type = "mc")
tcplAssignMthd(lvl = 3L,
id = gua_aeid,
mthd_id = c(11, 32, 5),
ordr = c(1,2,3),
type = "mc")
# set methods level 5 ----------
tcplClearMthd(lvl = 5L, id = gua_aeid, type = "mc")
tcplAssignMthd(lvl = 5L,
id = gua_aeid,
mthd_id = 2,
type = "mc")
# set methods level 6 ----------
tcplClearMthd(lvl = 6L, id = gua_aeid, type = "mc")
tcplAssignMthd(lvl = 6L,
id = gua_aeid,
mthd_id = c(6, 7, 8, 10, 11, 12, 15, 16),
type = "mc")
# run all ----------
tcplRun(id = gua_acid,
slvl = 1L,
elvl = 6L,
type = "mc",
mc.cores = 1L)
# make aeid plots ----------
tcplMakeAeidPlts(gua_aeid, lvl=6L)
| acid | mthd | mthd_id | ordr |
|---|---|---|---|
| 1128 | none | 1 | 1 |
There are no level 2 methods applied to this data set. This level is used to do a global scaling or shifting of the data, and is not necessary for the TPO dataset.
| aeid | mthd | mthd_id | ordr |
|---|---|---|---|
| 1824 | bval.apid.nwlls.med | 11 | 1 |
| 1824 | pval.zero | 32 | 2 |
| 1824 | resp.pc | 5 | 3 |
Level 3 is the step when the data is normalized. There are 3 methods applied during level 3.
| aeid | mthd | mthd_id |
|---|---|---|
| 1824 | pc20 | 2 |
Level 5 is the step where the cutoff value is set, the winning model is chosen, and a hit call is determined. This analysis sets to cutoff to 20%.
| aeid | mthd | mthd_id | nddr |
|---|---|---|---|
| 1824 | singlept.hit.high | 6 | 0 |
| 1824 | singlept.hit.mid | 7 | 0 |
| 1824 | multipoint.neg | 8 | 0 |
| 1824 | noise | 10 | 0 |
| 1824 | border.hit | 11 | 0 |
| 1824 | border.miss | 12 | 0 |
| 1824 | gnls.lowconc | 15 | 0 |
| 1824 | overfit.hit | 16 | 0 |
Level 6 adds flags to the analysis. These do not change the hit call, but will flag a curve in it meets the criteria for a particular flag. The flags that are evaluated in this analysis are in the above table.
Parameters of interest are Z’-factor and CV. We calculate these for every plate. Because each plate and assay and a different layout, the exact method varies.
acids <- c(963L, 964L, 1146L, 1128L)
dat_L0_mc <- tcplPrepOtpt(tcplLoadData(lvl=0L, fld = "acid", val = acids, type = "mc"))
dat_L0_sc <- tcplPrepOtpt(tcplLoadData(lvl=0L, fld = "acid", val = acids, type = "sc"))
dat_L0_mc[, acid := paste(acid, "_mc", sep="")]
dat_L0_sc[, acid := paste(acid, "_sc", sep="")]
dat_L0 <- rbindlist(list(dat_L0_mc, dat_L0_sc),
use.names = TRUE,
fill = TRUE)
acids <- c(963L, 964L, 1146L, 1128L)
dat_L0_mc <- tcplPrepOtpt(tcplLoadData(lvl=0L, fld = "acid", val = acids, type = "mc"))
dat_L0_sc <- tcplPrepOtpt(tcplLoadData(lvl=0L, fld = "acid", val = acids, type = "sc"))
dat_L0_mc[, acid := paste(acid, "_mc", sep="")]
dat_L0_sc[, acid := paste(acid, "_sc", sep="")]
dat_L0 <- rbindlist(list(dat_L0_mc, dat_L0_sc),
use.names = TRUE,
fill = TRUE)
# Database column 24 for phase I should be blank, not DMSO
dat_L0[srcf == "KP032 AURTPO ToxCast Phase I Single Pt Screen Compiled N.csv" &
coli == 24,
wllt := 'f']
dat_mmi_all <- data.table()
dat_qli_all <- data.table()
dat_nap_all <- data.table()
dat_mmi_fit_all <- data.table()
dat_ctg_fit_all <- data.table()
dat_qli_fit_all <- data.table()
sum_stats <- list()
i <- 1 #used in list of table stats before rbindlist after for loops
for(assay in unique(dat_L0[, acid])){
for(plate in unique(dat_L0[acid==assay, apid])){
dmso_sd <- sd(dat_L0[acid==assay & apid == plate & wllt=='n', rval])
dmso_mean <- mean(dat_L0[acid==assay & apid == plate & wllt=='n', rval])
CV <- dmso_sd/dmso_mean
dmso_mad <- mad(dat_L0[acid==assay & apid == plate & wllt=='n', rval])
dmso_median <- median(dat_L0[acid==assay & apid == plate & wllt=='n', rval])
CV_robust <- dmso_mad/dmso_median
pval <- NA
psd <- NA
bval <- NA
bsd <- NA
mmiconcs <- NA
mmi_high_concs <- NA
z_prime_factor <- NA
if(assay %in% c("963_sc", "963_mc", "1128_mc")){
mmiconcs <- sort(dat_L0[acid==assay &
apid == plate &
spid == "MMI",
conc])
mmi_high_concs <- tail(mmiconcs,2)
pval <- mean(dat_L0[acid==assay &
apid == plate &
spid == "MMI" &
conc %in% mmi_high_concs,
rval])
psd <- sd(dat_L0[acid==assay &
apid == plate &
spid == "MMI" &
conc %in% mmi_high_concs,
rval])
pval_med <- median(dat_L0[acid==assay &
apid == plate &
spid == "MMI" &
conc %in% mmi_high_concs,
rval])
pmad <- mad(dat_L0[acid==assay &
apid == plate &
spid == "MMI" &
conc %in% mmi_high_concs,
rval])
}
else if(assay == "964_mc"){
mmiconcs <- sort(dat_L0[acid==assay &
apid == plate &
spid == "DCNQ",
conc])
mmi_high_concs <- tail(mmiconcs,2)
pval <- mean(dat_L0[acid==assay &
apid == plate &
spid == "DCNQ" &
conc %in% mmi_high_concs,
rval])
psd <- sd(dat_L0[acid==assay &
apid == plate &
spid == "DCNQ" &
conc %in% mmi_high_concs,
rval])
pval_med <- median(dat_L0[acid==assay &
apid == plate &
spid == "DCNQ" &
conc %in% mmi_high_concs,
rval])
pmad <- mad(dat_L0[acid==assay &
apid == plate &
spid == "DCNQ" &
conc %in% mmi_high_concs,
rval])
}
else if(assay == "1146_mc"){
mmiconcs <- sort(dat_L0[acid==assay &
apid == plate &
spid == "LUCINH2",
conc])
mmi_high_concs <- tail(mmiconcs,2)
pval <- mean(dat_L0[acid==assay &
apid == plate &
spid == "LUCINH2" &
conc %in% mmi_high_concs,
rval])
psd <- sd(dat_L0[acid==assay &
apid == plate &
spid == "LUCINH2" &
conc %in% mmi_high_concs,
rval])
pval_med <- median(dat_L0[acid==assay &
apid == plate &
spid == "LUCINH2" &
conc %in% mmi_high_concs,
rval])
pmad <- mad(dat_L0[acid==assay &
apid == plate &
spid == "LUCINH2" &
conc %in% mmi_high_concs,
rval])
}
#Z' factor
bval <- mean(dat_L0[acid == assay &
apid == plate &
wllt == "n",
rval])
bsd <- sd(dat_L0[acid == assay &
apid == plate &
wllt == "n",
rval])
z_prime_factor <- (1 - (3*psd + 3*bsd)/abs(pval - bval))
#Robust Z' factor
bval_med <- median(dat_L0[acid == assay &
apid == plate &
wllt == "n",
rval])
bmad <- mad(dat_L0[acid == assay &
apid == plate &
wllt == "n",
rval])
z_prime_factor_robust <- (1 - (3*pmad + 3*bmad)/abs(pval_med - bval_med))
stats <- data.table(acid = assay,
apid = plate,
CVar = CV,
zpf = z_prime_factor,
CVrobust = CV_robust,
zpf_robust = z_prime_factor_robust)
sum_stats[[i]] <- stats
i <- i + 1
}
}
dat_sum_stats <- rbindlist(sum_stats)
summary_table <- dat_sum_stats[,.(Mean_CVar = mean(CVar, na.rm=T),
Mean_zpf = mean(zpf, na.rm=T),
Mean_CVrobust = mean(CVrobust, na.rm=T),
Mean_zpf_robust = mean(zpf_robust, na.rm=T),
SD_CVrobust = sd(CVrobust, na.rm=T),
SD_zpf_robust = sd(zpf_robust, na.rm=T)
),
by = acid]
The main use of the single concentration screen was to find likely hits at high concentration to cut down on the number of chemicals that would need to be tested in dose response.
datL0sc_prep <- tcplPrepOtpt(tcplLoadData(lvl = 0L, "acid", 963, type="sc"))
datL4mc_prep <- tcplPrepOtpt(tcplLoadData(lvl = 4L, "aeid", 1508, type="mc"))
# Database column 24 for phase I should be blank, not DMSO
datL0sc_prep[srcf == "KP032 AURTPO ToxCast Phase I Single Pt Screen Compiled N.csv" &
coli == 24,
wllt := 'f']
plates <- unique(datL0sc_prep[,apid])
for (plate in plates){
mmiconcs <- sort(datL0sc_prep[apid == plate & spid == "MMI",conc])
mmi_high_concs <- tail(mmiconcs,2)
pval <- mean(datL0sc_prep[apid == plate & spid == "MMI" & conc %in% mmi_high_concs, rval])
psd <- sd(datL0sc_prep[apid == plate & spid == "MMI" & conc %in% mmi_high_concs, rval])
bval <- mean(datL0sc_prep[apid == plate & wllt == "n", rval])
bsd <- sd(datL0sc_prep[apid == plate & wllt == "n", rval])
z_prime_factor <- (1 - (3*psd + 3*bsd)/abs(pval - bval))
datL0sc_prep[apid == plate, resp := (rval - bval)/(0 - bval)*100]
datL0sc_prep[apid == plate, plate_z_prime_factor := z_prime_factor]
datL0sc_prep[apid == plate, plate_psd := psd]
datL0sc_prep[apid == plate, plate_bsd := bsd]
}
mean_zpf <- mean(unique(datL0sc_prep[, plate_z_prime_factor]))
sd_zpf <- sd(unique(datL0sc_prep[, plate_z_prime_factor]))
mmiconcs <- sort(unique(datL0sc_prep[spid == "MMI",conc]))
mmi_high_concs <- tail(mmiconcs,2)
psd <- sd(datL0sc_prep[spid == "MMI" & conc %in% mmi_high_concs, resp])
bsd <- sd(datL0sc_prep[wllt == "n", resp])
pval <- mean(datL0sc_prep[spid == "MMI" & conc %in% mmi_high_concs, resp])
bval <- mean(datL0sc_prep[wllt == "n", resp])
z_prime_factor <- (1 - (3*psd + 3*bsd)/abs(pval - bval))
#Generate column of random numbers, for random sorting
num_row <- dim(datL0sc_prep)[1]
datL0sc_prep[, sortval := runif(n=num_row)]
spids <- unique(datL0sc_prep[, spid])
for (chem in spids){
datL0sc_prep[spid == chem, resp_mean := mean(datL0sc_prep[spid == chem, resp])]
}
dat_sc_mean <- data.table(spid = unique(datL0sc_prep[,spid]))
for (chem in dat_sc_mean[,spid]){
dat_sc_mean[spid == chem, casn := unique(datL0sc_prep[spid == chem, casn])]
dat_sc_mean[spid == chem, chnm := unique(datL0sc_prep[spid == chem, chnm])]
dat_sc_mean[spid == chem, resp_mean := unique(datL0sc_prep[spid == chem, resp_mean])]
}
dat_sc_mean[resp_mean > 20, activity := "Active"]
dat_sc_mean[resp_mean < 20, activity := "Inactive"]
dat_sc_mean_plot <- data.table(spid = unique(datL0sc_prep[wllt=='t',spid]))
for (chem in dat_sc_mean_plot[, spid]){
dat_sc_mean_plot[spid == chem,
resp_mean := unique(datL0sc_prep[spid == chem, resp_mean])]
dat_sc_mean_plot[spid == chem,
sortval := unique(datL0sc_prep[spid == chem, sortval])]
}
single_conc_plot_mean <- ggplot(dat_sc_mean_plot,
aes(x=reorder(spid,
sortval),
y=resp_mean))+
geom_point(size=2) +
theme_bw() +
scale_x_discrete(breaks=NULL) +
xlab("Sample ID") +
ylab("Percent Activity") +
scale_y_continuous(breaks = seq(from = -40,
to = 100,
by = 20),
limits = c(-55, 105),
expand=c(0,0)) +
geom_hline(yintercept = 20, color = "red") +
geom_hline(yintercept = 3*bsd, color = "blue") +
geom_hline(yintercept = -3*bsd, color = "blue") +
ggtitle(NULL)
single_conc_plot_mean_hist <- ggplot(dat_sc_mean_plot,
aes(x=resp_mean))+
geom_histogram(binwidth=5) +
coord_cartesian(xlim = c(10, 20)) +
theme_bw() +
scale_y_discrete(breaks=NULL) +
scale_x_continuous(breaks = seq(from = -40, to = 100, by = 20),
limits = c(-55, 105),
expand=c(0,0)) +
xlab(NULL) +
ylab("Count") +
coord_flip() +
geom_vline(xintercept = 20, color = "red") +
geom_vline(xintercept = 3*bsd, color = "blue") +
geom_vline(xintercept = -3*bsd, color = "blue") +
ggtitle(NULL)
layout <- matrix(c(1,1,1,2), nrow=1, byrow=TRUE)
singleplotlist <- list(single_conc_plot_mean, single_conc_plot_mean_hist)
multiplot(plotlist = singleplotlist, layout = layout)
#Get list of spids where more than one spid per cas
rep_casn <- c()
non_na_cas <- unique(datL0sc_prep[!(is.na(casn)),casn])
for(cas in non_na_cas){
spid_num <- length(unique(datL0sc_prep[casn == cas, spid]))
if(spid_num > 1){
rep_casn <- c(rep_casn, cas)
}
}
#Figure
dat <- datL0sc_prep[casn %in% rep_casn]
spids <- unique(dat[,spid])
for (chem in spids){
dat[spid == chem,resp_mean := mean(dat[spid == chem, resp])]
dat[spid == chem,resp_sd := sd(dat[spid == chem, resp])]
}
spids <- unique(dat[,spid])
dat_spid <- data.table(spid = spids)
for (chem in spids){
dat_spid[spid == chem,casn := unique(dat[spid == chem, casn])]
dat_spid[spid == chem,resp_mean := unique(dat[spid == chem,resp_mean])]
dat_spid[spid == chem,resp_sd := unique(dat[spid == chem,resp_sd])]
}
for (chem in dat_spid[,casn]){
reps <- as.numeric(length(dat_spid[casn == chem,spid]))
dat_spid[casn == chem,chem_rep := reps/10]
}
num_row <- dim(dat_spid)[1]
dat_spid[,sortval := runif(n=num_row)]
pos <- position_dodge(width=0.25)
single_conc_rep_mean_sd <- ggplot(dat_spid, aes(x=casn,
y=resp_mean,
ymin = resp_mean - resp_sd,
ymax = resp_mean + resp_sd,
group = spid))+
geom_point(size=2, position=pos, stat="identity") +
geom_errorbar(position=pos, aes(width=chem_rep*2.5), show_guide=FALSE) +
theme_bw() +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5)) +
scale_y_continuous(breaks=seq(from = -20, to = 100, by = 20)) +
xlab("CAS") +
ylab("Percent Activity") +
coord_cartesian(ylim=c(-25,100)) +
geom_hline(yintercept = 20, color = "red")
print(single_conc_rep_mean_sd)
aeids <- c(1508L, ctg_aeid, qli_aeid, gua_aeid)
dat_L5 <- tcplPrepOtpt(tcplLoadData(lvl = 5L,
fld = "aeid",
val = aeids,
type = "mc"))
#Set chnm and code to spid if NA
dat_L5[is.na(chnm), chnm := dat_L5[is.na(chnm), spid]]
dat_L5[is.na(code), code := dat_L5[is.na(code), spid]]
dat_L4 <- tcplLoadData(lvl=4L, fld = "aeid", val = aeids, type = "mc")
dat_L3 <- tcplLoadData(lvl=3L, fld = "aeid", val = aeids, type = "mc")
# Calculate AUC for each sample id ----------
dat_L5[hitc == 1L & modl == "hill",
AUCval := mapply(function(lower,
upper,
hill_tp,
hill_ga,
hill_gw) integrate(hill_curve,
lower,
upper,
hill_tp=hill_tp,
hill_ga=hill_ga,
hill_gw=hill_gw)$value,
lower = dat_L5[hitc == 1L & modl == "hill", logc_min],
upper = dat_L5[hitc == 1L & modl == "hill", logc_max],
hill_tp = dat_L5[hitc == 1L & modl == "hill", hill_tp],
hill_ga = dat_L5[hitc == 1L & modl == "hill", hill_ga],
hill_gw = dat_L5[hitc == 1L & modl == "hill", hill_gw])]
dat_L5[hitc == 1L & modl == "gnls",
AUCval := mapply(function(lower,
upper,
top,
ga,
gw,
la,
lw) integrate(gnls_curve,
lower,
upper,
top=top,
ga=ga,
gw=gw,
la=la,
lw=lw)$value,
lower = dat_L5[hitc == 1L & modl == "gnls", logc_min],
upper = dat_L5[hitc == 1L & modl == "gnls", logc_max],
top = dat_L5[hitc == 1L & modl == "gnls", gnls_tp],
ga = dat_L5[hitc == 1L & modl == "gnls", gnls_ga],
gw = dat_L5[hitc == 1L & modl == "gnls", gnls_gw],
la = dat_L5[hitc == 1L & modl == "gnls", gnls_la],
lw = dat_L5[hitc == 1L & modl == "gnls", gnls_lw])]
dat_L5[is.na(AUCval), AUCval:=0]
# Get the median AUC for each CAS code ----------
dat_L5[, AUC_med := median(AUCval),
by = .(code, aeid)]
We can now plot individual curves with the AUC value included and the AUC area shaded to highlight what is being integrated. For publication, we can combine these plots with a fourth plot that includes all of the AUC values.
chem_1 <- "TP0001207H21"
chem_2 <- "TP0001441G03"
chem_3 <- "TP0001206D06"
curve_1 <- ggcurve(chem_1, dat_L5, dat_L4, dat_L3, aeid)
curve_2 <- ggcurve(chem_2, dat_L5, dat_L4, dat_L3, aeid)
curve_3 <- ggcurve(chem_3, dat_L5, dat_L4, dat_L3, aeid)
# Find x positions of plotted curves ----------
dat_L5_unique <- unique(dat_L5[aeid == 1508L & !(is.na(casn)), .(casn, AUC_med)])
setkey(dat_L5_unique, AUC_med)
AUC_med_vect <- dat_L5_unique[,AUC_med]
auc_val_1 <- dat_L5[aeid == 1508L & spid ==chem_1, AUC_med]
auc_val_2 <- dat_L5[aeid == 1508L & spid ==chem_2, AUC_med]
auc_val_3 <- dat_L5[aeid == 1508L & spid ==chem_3, AUC_med]
auc_pos_1 <- which(AUC_med_vect %in% auc_val_1)
auc_pos_2 <- which(AUC_med_vect %in% auc_val_2)
auc_pos_3 <- which(AUC_med_vect %in% auc_val_3)
num_casn <- length(unique(dat_L5[aeid == 1508L, casn]))
auc_end_1 <- floor(num_casn/6L)
auc_end_2 <- floor(num_casn/2L)
auc_end_3 <- floor(num_casn*5.5/6L)
AUC_plot <- ggplot(dat_L5_unique,
aes(x=reorder(casn, AUC_med),
y=AUC_med))+
geom_bar(stat="identity", width=1) +
theme_bw() +
geom_segment(aes(x = auc_pos_1,
y = auc_val_1+1,
xend = auc_end_1,
yend = 260),
arrow = arrow(type = "closed", angle = 30)) +
geom_segment(aes(x = auc_pos_2,
y = auc_val_2+1,
xend = auc_end_2,
yend = 260),
arrow = arrow(type = "closed", angle = 30)) +
geom_segment(aes(x = auc_pos_3,
y = auc_val_3+1,
xend = auc_end_3,
yend = 260),
arrow = arrow(type = "closed", angle = 30)) +
scale_x_discrete(breaks=NULL) +
xlab("Chemical") +
ylab("AUR-TPO Activity (AUC)")
AUC_plot_list <- list(curve_1, curve_2, curve_3, AUC_plot)
layout <- matrix(c(1,2,3,
4,4,4,
4,4,4,
4,4,4),
nrow=4, byrow=TRUE)
# Read GUA hit calls from Steve's literature search ----------
dat_gua_file <- data.table(read.csv("Data/Fig6_data.csv", stringsAsFactors = F))
dat_gua_file <- dat_gua_file[!(is.na(GUA_hitc))] #Removes extra blank rows at bottom
setnames(dat_gua_file,
c("Chemical.name",
"CASRN",
"GUA_hitc"),
c("chnm",
"casn",
"gua_hitc")
)
gua_lit_casrns <- unique(dat_gua_file[,casn])
# Merge gua_hitc value into dat_L5 subset for only AUR TPO ----------
dat_L5_auc_gua <- dat_L5[aeid == 1508L & casn %in% gua_lit_casrns]
dat_L5_auc_gua <- merge(dat_L5_auc_gua,
dat_gua_file[,.(casn, gua_hitc)],
by = "casn")
dat_auc_gua_plot <- unique(dat_L5_auc_gua[, .(casn, AUC_med, gua_hitc)])
# Add arrow values if gua_hitc is 1 but AUR AUC is 0 ----------
for(cas in unique(dat_auc_gua_plot[,casn])){
if(dat_auc_gua_plot[casn == cas, gua_hitc] == 1 &
dat_auc_gua_plot[casn == cas, AUC_med] == 0){
dat_auc_gua_plot[casn == cas, cas_label := cas]
dat_auc_gua_plot[casn == cas, arrow_bottom := 5]
dat_auc_gua_plot[casn == cas, arrow_top := 50]
}else{
dat_auc_gua_plot[casn == cas, cas_label := NA_character_]
dat_auc_gua_plot[casn == cas, arrow_bottom := NA_integer_]
dat_auc_gua_plot[casn == cas, arrow_top := NA_integer_]
}
}
dat_auc_gua_plot[,gua_hitc := as.factor(gua_hitc)]
# Plot AUC vs GUA hitc ----------
AUC_plot_GUA <- ggplot(dat_auc_gua_plot, aes(x=reorder(casn, AUC_med),
y=AUC_med,
label = cas_label,
color = gua_hitc,
fill = gua_hitc))+
geom_bar(stat="identity", width=1) +
theme_bw() +
geom_segment(aes(x = reorder(casn, AUC_med),
y = arrow_top,
xend = reorder(casn, AUC_med),
yend = arrow_bottom),
arrow = arrow(type = "closed", angle = 10)) +#
geom_text(angle = 90,
hjust=0,
y = 51,
size = 3,
color = "black") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5)) +
theme(panel.grid.major.x = element_blank()) +
theme(legend.title = element_text(size=22)) +
theme(legend.text = element_text(size=18)) +
scale_colour_manual(values=c("gray", "blue4"),
name = "GUA Activity",
breaks=c(0,1),
labels=c("Inactive", "Active")) +
scale_fill_manual(values=c("gray", "blue4"),
name = "GUA Activity",
breaks=c(0,1),
labels=c("Inactive", "Active")) +
theme(legend.justification=c(0,1), legend.position=c(0,1)) +
xlab("CAS") +
ylab("AUR-TPO Activity (AUC)")
# Compare modl_acc AUR vs GUA ----------
aur_cas <- unique(dat_L5[aeid == 1508L & !(is.na(casn)),casn])
gua_cas <- unique(dat_L5[aeid == gua_aeid & !(is.na(casn)),casn])
common_cas <- intersect(gua_cas, aur_cas)
dat_gua_dot_plot <- data.table(casn = common_cas)
for (cas in common_cas){
aur_acc_val <- median(dat_L5[aeid == 1508L &
casn == cas &
hitc == 1L,
modl_acc],
na.rm = T)
if(is.na(aur_acc_val)){
aur_acc_val <- 5
}
gua_acc_val <- median(dat_L5[aeid == gua_aeid &
casn == cas &
hitc == 1L,
modl_acc],
na.rm = T)
if(is.na(gua_acc_val)){
gua_acc_val <- 5
}
dat_gua_dot_plot[casn == cas, aur_acc := aur_acc_val]
dat_gua_dot_plot[casn == cas, gua_acc := gua_acc_val]
}
AUR_GUA_dot_plot <- ggplot(dat_gua_dot_plot, aes(x=aur_acc,
y=gua_acc))+
geom_point(size=3) +
theme_bw() +
theme(axis.text.x = element_text(angle=90,
hjust=1,
vjust=0.5)) +
theme(legend.title = element_text(size=22)) +
theme(legend.text = element_text(size=18)) +
expand_limits(x = c(-2, 5)) +
expand_limits(y = c(-2, 5)) +
scale_x_continuous(breaks=c(-2, 0, 2, 4, 5),
labels=c("-2", "0", "2", "4", "Inactive")) +
scale_y_continuous(breaks=c(-2, 0, 2, 4, 5),
labels=c("-2", "0", "2", "4", "Inactive")) +
coord_equal(ratio=1) +
geom_abline(slope=1, intercept=0) +
xlab("AUR-TPO Threshold Potency (log uM)") +
ylab("GUA-TPO Threshold Potency (log uM)")
# Build inset with hitc table ----------
gua_plus_aur_plus <- length(dat_L5_auc_gua[hitc == 1 & gua_hitc == 1,casn])
gua_plus_aur_neg <- length(dat_L5_auc_gua[hitc == 0 & gua_hitc == 1,casn])
gua_neg_aur_plus <- length(dat_L5_auc_gua[hitc == 1 & gua_hitc == 0,casn])
gua_neg_aur_neg <- length(dat_L5_auc_gua[hitc == 0 & gua_hitc == 0,casn])
balance_table_rows <- c("GUA Active", "GUA Inactive")
balance_table <- data.frame(row.names = balance_table_rows,
AUR_Active = c(gua_plus_aur_plus, gua_neg_aur_plus),
AUR_Inactive = c(gua_plus_aur_neg, gua_neg_aur_neg))
val_table_theme <- ttheme_minimal(core=list(bg_params = list(fill = NA, col=NA),
fg_params=list(cex = 1.5)),
colhead=list(fg_params=list(cex = 1.5)),
rowhead=list(fg_params=list(fontface=2L,
cex = 1.5)))
balance_table_grob <- tableGrob(balance_table,
theme = val_table_theme,
cols = c("AUR\nActive",
"AUR\nInactive"))
ten_by_ten <- data.table(x=1:10, y=1:10)
inset_table_plot <- ggplotGrob(ggplot(ten_by_ten, aes(x = x, y = y)) +
geom_blank() +
theme_bw() +
theme(axis.line=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
axis.title.x=element_blank(),
axis.title.y=element_blank(),
legend.position="none",
panel.background=element_blank(),
panel.border=element_blank(),
panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),
plot.background=element_blank(),
plot.title = element_text(size = 5)) +
theme(plot.background = element_rect(colour = "black")) +
annotation_custom(grob = balance_table_grob))
# Add inset table to AUC subplot ----------
AUC_plot_GUA_inset <- ggplot(dat_auc_gua_plot,
aes(x=reorder(casn, AUC_med),
y=AUC_med,
label = cas_label,
color = gua_hitc,
fill = gua_hitc))+
geom_bar(stat="identity", width=1) +
theme_bw() +
geom_segment(aes(x = reorder(casn, AUC_med),
y = arrow_top,
xend = reorder(casn, AUC_med),
yend = arrow_bottom),
arrow = arrow(type = "closed",
angle = 10)) +
geom_text(angle = 90,
hjust=0,
y = 51,
size = 3,
color = "black") +
theme(axis.text.x = element_text(angle=90,
hjust=1,
vjust=0.5)) +
theme(panel.grid.major.x = element_blank()) +
theme(legend.title = element_text(size=22)) +
theme(legend.text = element_text(size=18)) +
scale_colour_manual(values=c("gray", "blue4"),
name = "GUA Activity",
breaks=c(0,1),
labels=c("Inactive", "Active")) +
scale_fill_manual(values=c("gray", "blue4"),
name = "GUA Activity",
breaks=c(0,1),
labels=c("Inactive", "Active")) +
theme(legend.justification=c(1,1), legend.position=c(0.9,0.97)) +
xlab("CAS") +
ylab("AUR-TPO Activity (AUC)") +
annotate(geom="text",
x = 5,
y = 250,
label = "A",
color = "black",
hjust = 0.5,
vjust = 1,
size = 12) +
annotation_custom(grob = inset_table_plot,
xmin = 30,
xmax = 65,
ymin = 175,
ymax = 250)
# Add label to modl_acc subplot ----------
AUR_GUA_dot_plot_multi <- AUR_GUA_dot_plot +
annotate(geom="text",
x = -2,
y = 5,
label = "B",
color = "black",
hjust = 0,
vjust = 1,
size = 12)
# Multiplot print ----------
layout <- matrix(c(1,1,2), nrow=1, byrow=TRUE)
singleplotlist <- list(AUC_plot_GUA_inset, AUR_GUA_dot_plot_multi)
multiplot(plotlist = singleplotlist, layout = layout)
Selectivity is one method to interpret the potency of the AUR TPO assay in the context of other assays. As a loss of signal assay, one concern is that non-specific enzyme interactions, rather than direct TPO inhibition, could cause a drop in activity. This is explored in the code and images below.
# Pull in cytotox individual data ----------
file.loc <- './Data/Assay_Summary_modified_141031.csv'
dat_cytotox_indiv <- data.table(read.table(file = file.loc,
header = TRUE,
sep=',',
na.strings = c("NA"),
stringsAsFactors = FALSE))
all_assays <- tcplLoadAeid()
#The database had a change where Tox21_* assays are now names TOX21_*.
#The above cytotoxicity file uses the old names.
#First use regex to convert Tox21 to TOX21, then continue.
cytoassay <- c("proliferation decrease",
"cytotoxicity SRB",
"cytotoxicity BLA")
cyto_aenms <- dat_cytotox_indiv[biological_process %in% cytoassay, Assay]
cyto_aenms <- gsub("Tox21_", "TOX21_", cyto_aenms)
cytoaeids <- all_assays[aenm %in% cyto_aenms, aeid]
cyto_dat_L5 <- tcplPrepOtpt(tcplLoadData(lvl = 5L, fld = "aeid", cytoaeids))
cyto_dat_L5 <- cyto_dat_L5[hitc == 1]
tpocodes <- unique(dat_L5[aeid == 1508L,code])
cyto_dat_L5 <- cyto_dat_L5[code %in% tpocodes]
cyto_dat_L5[, aenm := "cytotox"]
dat_L5_with_cyto <- rbind(dat_L5[!(aeid %in% 1824L)],
cyto_dat_L5,
use.names=TRUE,
fill=TRUE)
# Make Assay a factor to ensure stable order ----------
dat_L5_with_cyto[, aenm := factor(aenm,
levels = c("NCCT_TPO_AUR_dn",
"NCCT_QuantiLum_inhib_2_dn",
"NCCT_HEK293T_CellTiterGLO",
"cytotox"))]
# Calculate specificity ----------
num_cytotox <- length(unique(dat_L5_with_cyto[aenm == "cytotox", aeid]))
unique_chem <- unique(dat_L5_with_cyto[,chnm])
# And specificity column for ac20
for (chemical in unique_chem){
cyto_modl_acc <- dat_L5_with_cyto[hitc == 1L &
chnm == chemical &
aenm == "cytotox",
modl_acc]
num_cyto_hits <- length(cyto_modl_acc)
cyto_modl_acc_padded <- rep(cyto_modl_acc, length.out=num_cytotox)
cyto_modl_acc_padded[(num_cyto_hits + 1):num_cytotox] <- 5
vect_modl_acc <- dat_L5_with_cyto[hitc == 1L &
chnm == chemical &
!(aenm %in% c("NCCT_TPO_AUR_dn",
"cytotox")),
modl_acc]
vect_modl_acc <- c(vect_modl_acc, median(cyto_modl_acc_padded))
tpo_modl_acc <- dat_L5_with_cyto[hitc == 1L &
chnm == chemical &
aenm == "NCCT_TPO_AUR_dn",
modl_acc]
min_tpo_modl_acc <- 5
if (length(tpo_modl_acc[!is.na(tpo_modl_acc)]) > 0){
min_tpo_modl_acc <- min(tpo_modl_acc, na.rm=T)
}
modl_acc_min <- min(vect_modl_acc, na.rm=T)
specificity_by_modl_acc <- modl_acc_min - min_tpo_modl_acc
dat_L5_with_cyto[chnm == chemical, specificity_modl_acc := specificity_by_modl_acc]
}
The issue of selectivity is explored by comparing the potency in AUR TPO to other assays. The parameter used for potency is the concentration at which the winning model has a response equal to the cutoff for that assay, modl_acc. We use the equation \[ selectivity = min(CTG_{ACC},QLI_{ACC},cyto_med) - min(AUR_{ACC},5) \] Here, CTGACC, QLIACC, AURACC are the modl_acc values for the Cell Titer Glo, Quantilum Luciferase, and AUR TPO assays, respectively.
The value of cyto_med is determined by grouping the modl_acc values for the 36 cytotoxicity assays in ToxCast. If the winning model for a cytotoxicity assay has hitc = 0, the value of modl_acc is set to 5. The value of cyto_med is then set to the median of modl_acc values for these 36 assays
# Make Plots ----------
# Setup colors, better than default of red, blue, and green ----------
# The palette with grey:
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73",
"#F0E442", "#0072B2", "#D55E00", "#CC79A7")
# The palette with black:
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
"#F0E442", "#0072B2", "#D55E00", "#CC79A7")
# Assay, acc, specificity to qli and ctg, individual cyto ----------
tpo_hits <- unique(dat_L5_with_cyto[aenm == "NCCT_TPO_AUR_dn" & hitc==1L, chnm])
dat_L5_with_cyto_hits <- dat_L5_with_cyto[chnm %in% tpo_hits & hitc == 1L]
full_specificity_plot <- ggplot(dat_L5_with_cyto_hits,
aes(x=modl_acc,
y=reorder(chnm, specificity_modl_acc),
group = aenm,
color = aenm,
shape = aenm)) +
geom_point() +
ylab("Chemical") +
xlab("modl_acc") +
theme_bw() +
theme(legend.position="top", legend.title=element_blank()) +
scale_x_continuous(breaks=seq(-6,5,1)) +
scale_colour_manual(values=cbbPalette) +
scale_shape_manual(values=c(19,17,15,8)) +
guides(colour = guide_legend(nrow = 3))
file.dir <- paste("documents/images/", Sys.Date(), "/", sep="")
file.name <- paste("/full_specificity_plot_", Sys.Date(), ".pdf", sep="")
file.path <- paste(file.dir, file.name, sep="")
dir.create(path=file.dir, showWarnings = FALSE, recursive = TRUE)
pdf(file.path, width=12, height=70)
print(full_specificity_plot)
dev.off()
# Plots for actual manuscript image ----------
specificity_sort <- sort(unique(dat_L5_with_cyto_hits[,specificity_modl_acc]))
specificity_1_cutoff <- which(specificity_sort>1)[1]
specificity_0_cutoff <- which(specificity_sort>0)[1]
manu_1_arrow_x <- 4.5
size_box <- 20
top_first_box <- length(tpo_hits)
bottom_first_box <- top_first_box - size_box
top_second_box <- specificity_1_cutoff
bottom_second_box <- top_second_box - size_box
top_third_box <- specificity_0_cutoff
bottom_third_box <- top_third_box - size_box
# Image 1 ----------
manuscript_1 <- ggplot(dat_L5_with_cyto_hits,
aes(x=modl_acc,
y=reorder(chnm, specificity_modl_acc),
group = aenm,
color = aenm,
shape = aenm)) +
geom_point() +
ylab("Chemical") +
xlab("Concentration at\nThreshold\n(log uM)") +
annotate("rect",
xmin = -6,
xmax = 5,
ymin = bottom_first_box,
ymax = top_first_box,
fill = "red",
alpha = 0.1,
size = 1,
color = "red") +
annotate("rect",
xmin = -6,
xmax = 5,
ymin = bottom_second_box,
ymax = top_second_box,
fill = "red",
alpha = 0.1,
size = 1,
color = "red") +
annotate("rect",
xmin = -6,
xmax = 5,
ymin = bottom_third_box,
ymax = top_third_box,
fill = "red",
alpha = 0.1,
size = 1,
color = "red") +
geom_segment(aes(x = manu_1_arrow_x,
y = top_first_box,
xend = manu_1_arrow_x,
yend = specificity_1_cutoff),
color = "black",
arrow = arrow(type = "closed",
angle = 20,
ends="both")) +
geom_segment(aes(x = manu_1_arrow_x,
y = specificity_1_cutoff,
xend = manu_1_arrow_x,
yend = specificity_0_cutoff),
color = "black",
arrow = arrow(type = "closed",
angle = 20,
ends="both")) +
geom_segment(aes(x = manu_1_arrow_x,
y = specificity_0_cutoff,
xend = manu_1_arrow_x,
yend = 0),
color = "black",
arrow = arrow(type = "closed",
angle = 20,
ends="both")) +
annotate(geom="text",
x = 3.25,
y = mean(c(top_first_box, top_second_box)),
label = "High Selectivity",
color = "black",
size = 7,
fontface="bold",
angle=90,
vjust=0.5) +
annotate(geom="text",
x = 3.25,
y = mean(c(top_third_box, top_second_box)),
label = "Low Selectivity",
color = "black",
size = 7,
fontface="bold",
angle=90,
vjust=0.5) +
annotate(geom="text",
x = 3.25,
y = mean(c(0, top_third_box)),
label = "No Selectivity",
color = "black",
size = 7,
fontface="bold",
angle=90,
vjust=0.5) +
geom_hline(yintercept = specificity_1_cutoff, size = 1) +
geom_hline(yintercept = specificity_0_cutoff, size = 1) +
theme_bw() +
theme(legend.position="none") +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
scale_x_continuous(breaks=seq(-6,5,1)) +
coord_cartesian(xlim = c(-6, 5)) +
scale_colour_manual(values=cbbPalette,
labels=c("AUR TPO", "QLI", "CTG", "Cytotox")) +
scale_shape_manual(values=c(19,17,15,8),
labels=c("AUR TPO", "QLI", "CTG", "Cytotox"))
# Image 2a ----------
manuscript_2a <- ggplot(dat_L5_with_cyto_hits,
aes(x=modl_acc,
y=reorder(strtrim(chnm, 30), specificity_modl_acc),
group = aenm,
color = aenm,
shape = aenm)) +
coord_cartesian(xlim = c(-6, 5),
ylim = c(bottom_first_box:top_first_box)) +
geom_point() +
xlab("Concentration at Threshold (log uM)") +
theme_bw() +
theme(legend.position="none") +
theme(axis.title.y = element_blank()) +
theme(axis.text.y = element_text(family = "mono",
face = "bold",
size=c(rep(0,bottom_first_box-1),
rep(12,size_box+1))),
axis.ticks.y = element_line(size = c(rep(0,bottom_first_box),
rep(1,size_box+1)))) +
scale_x_continuous(breaks=seq(-6,5,1)) +
scale_colour_manual(values=cbbPalette,
labels=c("AUR TPO", "QLI", "CTG", "Cytotox")) +
scale_shape_manual(values=c(19,17,15,8),
labels=c("AUR TPO", "QLI", "CTG", "Cytotox")) +
annotate(geom="text",
x = -5,
y = top_first_box-1,
label = "A",
color = "black",
hjust = 0,
vjust = 1,
size = 10)
# Image 2b ----------
manuscript_2b <- ggplot(dat_L5_with_cyto_hits,
aes(x=modl_acc,
y=reorder(strtrim(chnm, 30), specificity_modl_acc),
group = aenm,
color = aenm,
shape = aenm)) +
geom_point() +
xlab("Concentration at Threshold (log uM)") +
theme_bw() +
theme(legend.position="none") +
theme(axis.title.y = element_blank()) +
theme(axis.text.y = element_text(family = "mono",
face = "bold",
size=c(rep(0,bottom_second_box-1),
rep(12,size_box+1),
rep(0,top_first_box-top_second_box))),
axis.ticks.y = element_line(size = c(rep(0,bottom_second_box-1),
rep(1,size_box+1),
rep(0,top_first_box-top_second_box)))) +
scale_x_continuous(breaks=seq(-6,5,1)) +
coord_cartesian(xlim = c(-6, 5),
ylim = c(bottom_second_box:top_second_box)) +
scale_colour_manual(values=cbbPalette,
labels=c("AUR TPO", "QLI", "CTG", "Cytotox")) +
scale_shape_manual(values=c(19,17,15,8),
labels=c("AUR TPO", "QLI", "CTG", "Cytotox")) +
annotate(geom="text",
x = -5,
y = top_second_box-1,
label = "B",
color = "black",
hjust = 0,
vjust = 1,
size = 10)
# Image 2c ----------
manuscript_2c <- ggplot(dat_L5_with_cyto_hits,
aes(x=modl_acc,
y=reorder(strtrim(chnm, 30), specificity_modl_acc),
group = aenm,
color = aenm,
shape = aenm)) +
geom_point() +
xlab("Concentration at Threshold (log uM)") +
theme_bw() +
theme(legend.position="none") +
theme(axis.title.y = element_blank()) +
theme(axis.text.y = element_text(family = "mono",
face = "bold",
size=c(rep(0,bottom_third_box-1),
rep(12,size_box+1),
rep(0,top_first_box-top_third_box))),
axis.ticks.y = element_line(size = c(rep(0,bottom_third_box-1),
rep(1,size_box+1),
rep(0,top_first_box-top_third_box)))) +
scale_x_continuous(breaks=seq(-6,5,1)) +
coord_cartesian(xlim = c(-6, 5),
ylim = c(bottom_third_box:top_third_box)) +
scale_colour_manual(values=cbbPalette,
labels=c("AUR TPO", "QLI", "CTG", "Cytotox")) +
scale_shape_manual(values=c(19,17,15,8),
labels=c("AUR TPO", "QLI", "CTG", "Cytotox")) +
annotate(geom="text",
x = -5,
y = top_third_box-1,
label = "C",
color = "black",
hjust = 0,
vjust = 1,
size = 10)
layout <- matrix(c(1,2,2,2,
1,3,3,3,
1,4,4,4), nrow=3, byrow=TRUE)
file.dir <- paste("documents/images/", Sys.Date(), "/", sep="")
file.name <- paste("/Selectivity_figure_with_subsets_", Sys.Date(), ".png", sep="")
file.path <- paste(file.dir, file.name, sep="")
dir.create(path=file.dir, showWarnings = FALSE, recursive = TRUE)
png(file.path, width = 5100, height = 6600, res=480)
multiplot(manuscript_1, manuscript_2a, manuscript_2b, manuscript_2c, layout = layout)
dev.off()
multiplot(manuscript_1, manuscript_2a, manuscript_2b, manuscript_2c, layout = layout)
file.dir <- paste("documents/tables/", Sys.Date(), "/", sep="")
file.name <- paste("/AUR_TPO_selectivity_potency_AUC_", Sys.Date(), ".tsv", sep="")
file.path <- paste(file.dir, file.name, sep="")
dir.create(path=file.dir, showWarnings = FALSE, recursive = TRUE)
dat_app_result <- dat_L5[, .(aeid, chnm, casn, spid, modl, modl_ga, modl_acc, modl_tp, hitc, AUCval)]
dat_app_result <- merge(dat_app_result,
dat_L5_with_cyto[, .(aeid, spid, specificity_modl_acc)],
by = c("spid", "aeid"))
write.table(dat_app_result,
sep='\t',
file = file.path,
quote=FALSE,
row.names = FALSE)
I make use of multiple functions in the above analysis, but leave the definition here for document clarity.
# ggcurve based on R/manuscript/ggcurve.R ----------
#
ggcurve <- function(chem, dat_L5, dat_L4, dat_L3, assay){
dat_L5 <- dat_L5[assay == aeid]
dat_L4 <- dat_L4[assay == aeid]
dat_L3 <- dat_L3[assay == aeid]
chemname <- unique(dat_L5[spid == chem, chnm])
chem_AUC <- round(dat_L5[spid == chem, AUCval], 1)
conc <- dat_L3[spid==chem, logc]
resp <- dat_L3[spid==chem, resp]
fit_hill_tp <- dat_L4[spid==chem, hill_tp]
fit_hill_ga <- dat_L4[spid==chem, hill_ga]
fit_hill_gw <- dat_L4[spid==chem, hill_gw]
dat_mmi <- data.table(logc = conc, resp = resp)
xmin <- min(conc)
xmax <- max(conc)
curve_max <- max(resp, 120)
curve_min <- min(resp, -20)
range_curve <- curve_max - curve_min
plot_max <- curve_max + 0.15*range_curve
plot_min <- curve_min - 0.15*range_curve
curves_plot <- ggplot(dat_mmi, aes(x=logc, y=resp))
curves_plot <- curves_plot +
stat_function(fun = hill_curve,
args=list(hill_tp = fit_hill_tp,
hill_ga = fit_hill_ga,
hill_gw = fit_hill_gw),
alpha=1,
color = "black",
size=3) +
stat_function(fun = hill_curve,
args=list(hill_tp = fit_hill_tp,
hill_ga = fit_hill_ga,
hill_gw = fit_hill_gw),
alpha=0.4,
color = "black",
geom="ribbon",
mapping = aes(ymin = 0,
ymax = ..y..,
linetype = NA)) +
theme_bw() +
geom_point(aes(size=3)) +
annotate(geom="text",
x = 0.5,
y = 110,
label = paste("AUC : ", chem_AUC),
color = "black", hjust = 0.5) +
theme(legend.position="none",
legend.title=element_blank()) +
ylab("Percent Activity") +
xlab("Log Concentration (uM)") +
coord_cartesian(ylim = c(plot_min,plot_max)) +
ggtitle(chemname)
return(curves_plot)
}
# Hill curve and gnls curve ----------
hill_curve <- function(hill_tp, hill_ga, hill_gw, lconc){
return(hill_tp/(1+10^((hill_ga - lconc)*hill_gw)))
}
gnls_curve <- function(top, ga, gw, la, lw, lconc){
gain <- 1/(1+10^((ga - lconc)*gw))
loss <- 1/(1+10^((lconc - la)*lw))
return(top*gain*loss)
}
# Multiplot ----------
# multiplot was obtained from
# http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_%28ggplot2%29/
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
require(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}