1 Minnesota Public Health CEC List

283 CEC list
Added the guidance list (1/21/2021)
Added the candidate pool list (1/21/2021)
Determined the unique union and list membership after mapping CASRN to DTXSID

mn.cec <- as.data.table(read.xlsx('./source/mn_CEC_full_list_opera_dashboard.xlsx', sheet = 1, colNames = TRUE))
mn.guid <- as.data.table(read.xlsx('./source/mdh_guidance_list_opera_test_dashboard.xlsx', sheet=1, colNames = TRUE))
mn.cand <- as.data.table(read.xlsx('./source/mdh_candidate_pool_opera_test_dashboard.xlsx', sheet=1, colNames = TRUE))

mn.cec[, list_membership := paste('CEC_283')]
mn.guid[, list_membership := paste('Guidance_List')]
mn.cand[, list_membership := paste('Candidate_Pool')]

# make mn.chem union list

mn.chem <- rbind(mn.cec, mn.guid, mn.cand)
mn.chem[, list_memberships := paste0(unique(list_membership), collapse=", "), by=list(DTXSID)]
#unique(mn.chem$list_memberships)
mn.chem[,c('list_membership','INPUT','FOUND_BY'):= NULL]
mn.chem <- unique(mn.chem)
mn.dtxsid <- unique(mn.chem$DTXSID)#283

2 Assemble the high-throughput screening (HTS) bioactivity dataset

Only 184/283 substances have ToxCast/Tox21 data in the ToxCast database invitrodb version 3.3 (released Aug 2020).
Data cleaning: similar to previous work
remove curve-fits that might be less reproducible: 3 caution flags or more are removed
remove curve-fits that might be less quantitatively informative: remove fit categories for fits where the efficacy is near the cut-off and the AC50 is less than the concentration range screened

chems <- tcplLoadChem(field='dsstox_substance_id',val=unique(mn.chem[,DTXSID]))
mn.chem[,in.toxcast := 'no']
mn.chem[DTXSID %in% chems[,dsstox_substance_id], in.toxcast := 'yes']
datatable(mn.chem[,c('DTXSID','in.toxcast')], filter='top', 
          options=list(pagelength=25, autoWidth=FALSE,  scrollX=TRUE, initComplete = JS(
    "function(settings, json) {",
    "$('body').css({'font-family': 'Calibri'});",
    "}"
  )))

#nrow(mn.chem[in.toxcast=='yes']) #270

mc5 <- tcplPrepOtpt(tcplLoadData(val=chems$spid, lvl=5, type = 'mc',fld='spid'))
mc6 <- tcplPrepOtpt(tcplLoadData(lvl=6, fld='m4id', val=mc5$m4id, type='mc'))
setDT(mc6)
mc6_mthds <- mc6[ , .( mc6_mthd_id = paste(mc6_mthd_id, collapse=",")), by = m4id]
mc6_flags <- mc6[ , .( flag = paste(flag, collapse=";")), by = m4id]
mc5$mc6_flags <- mc6_mthds$mc6_mthd_id[match(mc5$m4id, mc6_mthds$m4id)]
mc5[, flag.length := ifelse(!is.na(mc6_flags), count.fields(textConnection(mc6_flags), sep =','), NA)]

# filter the dataset, with coarse filters
mc5[hitc==1 & flag.length < 3, use.me := 1]
mc5[hitc==1 & is.na(flag.length), use.me := 1]
mc5[hitc==1 & flag.length >= 3, use.me := 0]
mc5[fitc %in% c(36,45), use.me := 0]
mc5[hitc==-1, use.me := 0] # make hitc interpretable as a positive sum
mc5[use.me==0, modl_ga := as.numeric(NA)]
mc5[use.me==0, hitc := 0]
mc5[hitc==0, modl_ga := as.numeric(NA)]

mc5[hitc==1,ac50_uM := ifelse(!is.na(modl_ga), 10^modl_ga, NA)]
mc5.hitc1 <- mc5[hitc==1]

cerapp <- as.data.table(dbGetQuery(con, "select model_generic_chemical_cerapp_score.*
                                       from prod_internal_invitrodb_v3_3.model_generic_chemical_cerapp_score;"))

compara <- as.data.table(dbGetQuery(con, "select model_generic_chemical_compara_scores.*
                                         from prod_internal_invitrodb_v3_3.model_generic_chemical_compara_scores;"))

er <- as.data.table(dbGetQuery(con, "select * from prod_internal_invitrodb_v3_3.model_generic_chemical_er_scores;"))
ar <- as.data.table(dbGetQuery(con, "select * from prod_internal_invitrodb_v3_3.model_generic_chemical_ar_scores;"))

3 High-throughput screening (HTS) domain of applicability

3.1 Hit-rate in ToxCast/Tox21

It can be helpful to take a look at hit-rate in ToxCast/Tox21. A low hit-rate can indicate a number of potential issues: lack of bioactivity; lack of in vitro bioavailability and/or lack of amenability of the physicochemical properties to HTS; chemical degradation in sample or assay; limited screening of the biological space (i.e., biological target is absent); requirement for metabolism.

What we see in examination of hit-rate for this set are some chemicals that may require manual selection of a reasonable concentration value as a POD.

For example, meprobamate is a GABA-active compound that was not screened in GABA-related assays (only screened as part of Tox21 chemical set), and has zero active assays after filtering.
We could substitute the highest concentration screened (100 uM) or take the therapeutic range (10-25 mg/L, with 10 mg/L equivalent to 46 uM). In latter part of this work (for AED calculation), we use 46 uM.
We also see some disinfection by-products where we may want to select bioactivity values from the in vitro genotoxicity literature (see DBP example table based on Plewa et al. 2010). For now, these are just listed here, but we could do more using these values. We might have to mine the literature for an appropriate HTTK dataset for DBPs.

load(file='./output/mn_mc5.RData')

hit.rates <- mc5[ , list(
  active.percent = round((lw(hitc==1)/.N)*100,2), #active percent
  inactive.percent = round((lw(hitc==0)/.N)*100,2), #inactive percent
  total.assay.screened  = .N, #total number of aeids tested in mc
  active.assay.count  = as.double(lw(hitc==1)),  # active count
  inactive.assay.count  = as.double(lw(hitc==0))  #inactive count
), by = list(chnm, casn, dsstox_substance_id)]

#range(hit.rates$total.assay.screened) #235 to 1993
#median(hit.rates$total.assay.screened) #395.5

hit.rates <- hit.rates[order(active.percent)]

datatable(hit.rates, filter='top', 
          options=list(pagelength=25, autoWidth=FALSE,  scrollX=TRUE, initComplete = JS(
    "function(settings, json) {",
    "$('body').css({'font-family': 'Calibri'});",
    "}"
  )))

3.2 Physico-chemical properties

What if some chemicals fall on the extremes for domain of applicability for the current HTS?

tolyltriazole is an ill-defined mixture of benzotriazoles. Should we map to all constituents instead?
germanium is elemental; should we map to a salt instead? I looked at some salts and we didn't have data there either.

mn.chem[,log10VP := log10(as.numeric(VAPOR_PRESSURE_MMHG_OPERA_PRED))]
col.num <- c('AVERAGE_MASS','OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED')
mn.chem[, (col.num) := lapply (.SD, as.numeric), .SDcols = col.num ]

hit.rate.opera <- merge.data.table(mn.chem,
                                   hit.rates,
                                   by.x='DTXSID',
                                   by.y='dsstox_substance_id')

hit.rate.opera[,log10VP.caution := ifelse(log10VP > 1, "Caution - High VP", NA)]
hit.rate.opera[,logP.caution := NA]
hit.rate.opera[OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED < -2|OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED > 7,logP.caution := "Caution - Bioavailability based on LogP"]
#hit.rate.opera[!is.na(log10VP.caution)]
datatable(hit.rate.opera[active.percent < 2, c('PREFERRED_NAME','DTXSID','log10VP','AVERAGE_MASS','OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED',
                            'active.percent','total.assay.screened','active.assay.count')],
          filter='top', 
          options=list(pagelength=25, autoWidth=FALSE,  scrollX=TRUE,
                       initComplete = JS(
    "function(settings, json) {",
    "$('body').css({'font-family': 'Calibri'});",
    "}"
  ))) %>% formatStyle('log10VP',
                      backgroundColor = styleInterval(c(-1,1), 
                                                      c('green','yellow','red')),
                      fontWeight = 'bold') %>% formatStyle('OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED',
                backgroundColor = styleInterval(c(-2,1,7,11), c('red','yellow','green','yellow','red')),
                fontWeight = 'bold')

3.3 Low hit-rate and physicochemical cautions

For some substances, very low active hit-rate coincides with cautions on log10 vapor pressure (semi-volatiles).

datatable(hit.rate.opera[active.percent < 0.5, c('DTXSID','PREFERRED_NAME','AVERAGE_MASS','log10VP','OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED','logP.caution','log10VP.caution','active.percent','total.assay.screened')],
          filter='top', 
          options=list(pagelength=25, autoWidth=FALSE,  scrollX=TRUE,
                       initComplete = JS(
    "function(settings, json) {",
    "$('body').css({'font-family': 'Calibri'});",
    "}"
  ))) %>% formatStyle('log10VP',
                      backgroundColor = styleInterval(c(-1,1), 
                                                      c('green','yellow','red')),
                      fontWeight = 'bold') %>% formatStyle('OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED',
                backgroundColor = styleInterval(c(-2,1,7,11), c('red','yellow','green','yellow','red')),
                fontWeight = 'bold')

4 Bioactivity:exposure ratio

4.1 Calculate 5th percentile ToxCast AC50

For a first pass, get a bioactivity threshold (5th percentile). We've used this before in APCRA and TSCA work.

mc5[,ac50_5p := quantile(ac50_uM, probs=c(0.05),na.rm=TRUE), by=list(dsstox_substance_id)]
mc5.dist <- unique(mc5[,c('casn','dsstox_substance_id','chnm','ac50_5p')])
mc5.dist[chnm=='Meprobamate', ac50_5p := 45.8]
datatable(mc5.dist, filter='top', 
          options=list(pagelength=25, autoWidth=FALSE,  scrollX=TRUE, initComplete = JS(
    "function(settings, json) {",
    "$('body').css({'font-family': 'Calibri'});",
    "}"
  )))

4.2 Calculate AED (mg/kg/day) for 5th percentile AC50 for the 95th percentile

Using 3compss model with Monte Carlo for interindividual toxicokinetic variability.
95th percentile from this is used to equate a lower bound on the dose that would result in the same plasma conc as the in vitro conc value.
Here I've chosen the species: Human.
only 4 of the 14 substances with ToxCast data have HTTK data.
should we search the literature to try to extract some clearance values or max/steady state clearance values?
should we consider POD-QSAR approaches for repeat dose since the data landscape is sparse for empirical BER?
The boxplots represent all of the AEDs based on all AC50 values.
The circles represent the AED based on the 5th percentile AC50.

# 5th percentile value
mc5.dist.df <- as.data.frame(subset(mc5.dist, casn %in% get_cheminfo(species='Human'))) #only 127/264 covered by empirical

load_sipes2017() # load in silico data

## Loading predictions from Sipes et al. (2017) for 8758 chemicals.
## Existing data are not being overwritten.
## Please wait...

mc5.dist.df <- as.data.frame(subset(mc5.dist, casn %in% get_cheminfo(species='Human'))) # only 260/264 - much better

4.3 AEDs using HTTK (including in silico)

We get much better coverage of the chemical space if we include the Sipes et al 2017 in silico HTTK dataset.

mc5.dist[chnm=='Meprobamate', ac50_5p := 45.8]

new.row <- mc5[chnm=='Meprobamate'& aeid==759]
new.row <- new.row[c(1),]
new.row[,ac50_uM := 45.8]
new.row[,c('ac50_5p'):= NULL]

mc5.hitc1 <- rbind(mc5.hitc1, new.row, fill=TRUE)

# 5th percentile value
mc5.dist.df <- as.data.frame(subset(mc5.dist, casn %in% get_cheminfo(species='Human', model='pbtk'))) #257 substances

aed.hu.css.pbtk.df=data.frame()
for (i in 1:nrow (mc5.dist.df))
{

  aed.hu.pbtk.95 <-(calc_mc_oral_equiv(conc=mc5.dist.df$ac50_5p[i], 
                                     chem.cas=mc5.dist.df$casn[i], 
                                     which.quantile=c(0.95),
                                     model = 'pbtk',
                                     species='Human',
                                     restrictive.clearance=T, 
                                     output.units='mgpkgpday'))
  
    aed.hu.pbtk.50 <-(calc_mc_oral_equiv(conc=mc5.dist.df$ac50_5p[i], 
                                     chem.cas=mc5.dist.df$casn[i], 
                                     which.quantile=c(0.50),
                                     model = 'pbtk',
                                     species='Human',
                                     restrictive.clearance=T, 
                                     output.units='mgpkgpday'))
    aed.hu.css.95 <-(calc_mc_oral_equiv(conc=mc5.dist.df$ac50_5p[i], 
                                     chem.cas=mc5.dist.df$casn[i], 
                                     which.quantile=c(0.95),
                                     model = '3compartmentss',
                                     species='Human',
                                     restrictive.clearance=T, 
                                     output.units='mgpkgpday'))
  
    aed.hu.css.50 <-(calc_mc_oral_equiv(conc=mc5.dist.df$ac50_5p[i], 
                                     chem.cas=mc5.dist.df$casn[i], 
                                     which.quantile=c(0.50),
                                     model = '3compartmentss',
                                     species='Human',
                                     restrictive.clearance=T, 
                                     output.units='mgpkgpday'))
    
  aed.hu.css.pbtk.df<-rbind(aed.hu.css.pbtk.df,
                     cbind(
                           mc5.dist.df$dsstox_substance_id[i], 
                           mc5.dist.df$casn[i],
                           mc5.dist.df$chnm[i],
                           mc5.dist.df$ac50_5p[i], 
                           aed.hu.pbtk.95,
                           aed.hu.pbtk.50,
                           aed.hu.css.95,
                           aed.hu.css.50))
}

# all ac50 values
mc5.df <- as.data.frame(subset(mc5.hitc1, casn %in% get_cheminfo(species='Human', model='pbtk')))

all.aed.hu.css.df=data.frame()
for (i in 1:nrow (mc5.df))
{

  aed.ac50.hu.css.95 <-(calc_mc_oral_equiv(conc=mc5.df$ac50_uM[i], 
                                     chem.cas=mc5.df$casn[i], 
                                     which.quantile=c(0.95),
                                     species='Human',
                                     restrictive.clearance=T, 
                                     output.units='mgpkgpday',
                                     model='3compartmentss'))
  all.aed.hu.css.df<-rbind(all.aed.hu.css.df,
                     cbind(
                           mc5.df$dsstox_substance_id[i], 
                           mc5.df$casn[i],
                           mc5.df$chnm[i],
                           mc5.df$ac50_uM[i], 
                           aed.ac50.hu.css.95))
}
save(all.aed.hu.css.df,
     aed.hu.css.pbtk.df,
     file='./output/aed_from_hu_httk_insilico_21jan2021.RData')

load('./output/aed_from_hu_httk_insilico_21jan2021.RData')

aed.hu.css.pbtk.dt <- as.data.table(aed.hu.css.pbtk.df)
all.aed.hu.css.dt <- as.data.table(all.aed.hu.css.df)

setnames(all.aed.hu.css.dt,
         c('V1','V2','V3','V4'),
         c('dsstox_substance_id','casn','chnm', 'ac50_uM'))

setnames(aed.hu.css.pbtk.dt,
         c('V1','V2','V3','V4'),
         c('dsstox_substance_id','casn','chnm', 'ac50_5p'))

col.num <- c('ac50_5p', 'aed.hu.css.95', 'aed.hu.css.50','aed.hu.pbtk.95','aed.hu.pbtk.50')
aed.hu.css.pbtk.dt[, (col.num) := lapply (.SD, as.numeric), .SDcols = col.num ]
aed.hu.css.pbtk.dt <- aed.hu.css.pbtk.dt[order(-aed.hu.css.95)]

col.num <- c('ac50_uM', 'aed.ac50.hu.css.95')
all.aed.hu.css.dt[, (col.num) := lapply (.SD, as.numeric), .SDcols = col.num ]
all.aed.hu.css.dt <- all.aed.hu.css.dt[order(-aed.ac50.hu.css.95)]

all.aed.hu.css.dt$AED_5p <- aed.hu.css.pbtk.dt$aed.hu.css.95[match(all.aed.hu.css.dt$dsstox_substance_id,
                                                              aed.hu.css.pbtk.dt$dsstox_substance_id)]
hit.rates$AED_5p <- aed.hu.css.pbtk.dt$aed.hu.css.95[match(hit.rates$dsstox_substance_id,
                                                              aed.hu.css.pbtk.dt$dsstox_substance_id)]

p <- ggplot(data=all.aed.hu.css.dt, 
            aes(x=reorder(factor(chnm),-AED_5p), 
                y=aed.ac50.hu.css.95, text = paste(chnm,', ',aed.ac50.hu.css.95, 'mg/kg/day')))+
  geom_boxplot(outlier.shape=4)+
  geom_point(aes(x=reorder(factor(chnm),-AED_5p), 
                 y=AED_5p), color="#95D840FF", shape=17, size = 2) +
  xlab('')+
  ylab('log10-mg/kg/day')+
  theme_bw() +
    theme(legend.position="none",
        legend.title = element_blank(),
        axis.text.x = element_text(angle=45, vjust=0.5),
        plot.title = element_text(hjust=0.5))+
  scale_y_continuous(trans='log10',
                         breaks= c(0.000000001,
                                    0.00000001,
                                    0.0000001,
                                   0.000001,
                                   0.00001,
                                   0.0001,
                                   0.001,
                                   0.01,
                                   0.1,
                                   1,
                                   10,
                                   100,
                                   1000),
                         labels = c(0.000000001,
                                    0.00000001,
                                    0.0000001,
                                   0.000001,
                                   0.00001,
                                   0.0001,
                                   0.001,
                                   0.01,
                                   0.1,
                                   1,
                                   10,
                                   100,
                                   1000
                                    ))+
  coord_flip(ylim=c(0.000000001,1000))+ ggtitle('AEDs using HTTK in silico')

fig <- ggplotly(p, tooltip=c('text'), height=2500, width=700)

fig

4.4 Now let's look at BER based on SEEM3 and the AEDs from HTTK-insilico

Note that this is now sorted by BER. A BER < 10,000 might suggest an interesting substance for further investigations.
The SEEM3 upper 95th %-ile (estimate of the median) and the AED based on the 5th percentile AC50 (green triangle) were used to calculate the BER.
The teal linerange represents the median to upper 95th %-ile estimate of the median from ExpoCast SEEM3 values for exposure.
The boxplots represent all of the AEDs based on the AC50s available.

seem3 <- fread('./source/SupTable1-all.chem.preds-2018-04-23.txt')

seem.mn <- seem3[dsstox_substance_id %in% aed.hu.css.pbtk.dt[,dsstox_substance_id]]
seem.mn$chnm <- aed.hu.css.pbtk.dt$chnm[match(seem.mn$dsstox_substance_id,
                                          aed.hu.css.pbtk.dt$dsstox_substance_id)]
seem.mn$AED_5p <- all.aed.hu.css.dt$AED_5p[match(seem.mn$dsstox_substance_id,
                                                    all.aed.hu.css.dt$dsstox_substance_id)]

seem.mn[,seem3.num := 10^seem3]
seem.mn[,seem3.u95.num := 10^seem3.u95]
seem.mn[,ber := AED_5p/seem3.u95.num]


all.aed.hu.css.dt$ber <- seem.mn$ber[match(all.aed.hu.css.dt$dsstox_substance_id,
                                           seem.mn$dsstox_substance_id)]

all.aed.hu.css.dt[is.na(ber), ber := 1e10]
all.aed.hu.css.dt$seem3.num <- seem.mn$seem3.num[match(all.aed.hu.css.dt$dsstox_substance_id,
                                           seem.mn$dsstox_substance_id)]
all.aed.hu.css.dt$seem3.u95.num <- seem.mn$seem3.u95.num[match(all.aed.hu.css.dt$dsstox_substance_id,
                                           seem.mn$dsstox_substance_id)]

r <- ggplot(data=all.aed.hu.css.dt[ber < 10000], 
            aes(x=reorder(factor(chnm),-ber), 
                y=aed.ac50.hu.css.95,
                text = paste(chnm,
                             ', ','POD-NAM =',aed.ac50.hu.css.95, 'mg/kg/day', ',', 'SEEM3.upper95 = ', seem3.u95.num, 'mg/kg/day')))+
  #geom_boxplot(outlier.shape=4)+
  geom_point(aes(x=reorder(factor(chnm),-ber), 
                 y=AED_5p), color="#95D840FF", shape=17, size = 2) +
  geom_linerange(
                 aes(x=reorder(factor(chnm),-ber),
                     ymin=seem3.num,
                     ymax=seem3.u95.num), color="#1F968BFF", lty='dashed')+
  xlab('')+
  ylab('log10-mg/kg/day')+
  theme_bw() +
    theme(
        legend.title = element_blank(),
        axis.text.x = element_text(angle=45, vjust=0.5),
        plot.title = element_text(hjust=0.5))+
  scale_y_continuous(trans='log10',
                         breaks= c(0.000000001,
                                    0.00000001,
                                    0.0000001,
                                   0.000001,
                                   0.00001,
                                   0.0001,
                                   0.001,
                                   0.01,
                                   0.1,
                                   1,
                                   10,
                                   100,
                                   1000),
                         labels = c(0.000000001,
                                    0.00000001,
                                    0.0000001,
                                   0.000001,
                                   0.00001,
                                   0.0001,
                                   0.001,
                                   0.01,
                                   0.1,
                                   1,
                                   10,
                                   100,
                                   1000
                                    ))+
  coord_flip(ylim=c(0.000000001,1000))+ ggtitle('Bioactivity:exposure ratio plot')

fig3 <- ggplotly(r, tooltip=c('text','seem3.upper95'), width=800, height = 1500)
fig3

5 Create Endocrine and Developmental Hazard Flags

Match the logic in the recent comments provided to the EPA program office partners on a prioritization and screening workflow using ER and AR models.
If ToxCast ER/AR pathway models available: these scores trump CERAPP and COMPARA.
ToxCast ER/AR pathway model positives: => 0.1
Grouped equivocals (0.001-0.1) with the negative to enable easier prioritization.
For ToxCast AR pathway model (older version to match what is in invitrodb version 3.3 and CompTox Chemicals Dashboard), require confidence flags to be > 2 for a positive.
Highlight substances with no data - could be a data gap?
Use the Stemina DevTox platform from ToxCast for developmental hazard flag.

5.1 ER, AR, and Devtox Hazard flags

er.ar <- merge.data.table(mn.chem[,c('DTXSID','PREFERRED_NAME','CASRN')],
                                cerapp[,c('CASRN','CHEMICAL_NAME','input_SMILES','InChI_Key',
                                                'Potency_class_2_binding', 'Potency_class_2_agonist', 'Potency_class_2_antagonist','consensus_2_binding', 'consensus_2_agonist','consensus_2_antagonist')],
                                by = 'CASRN',
                                all.x=TRUE,
                          all.y=FALSE)

er.ar <- merge.data.table(er.ar,
                                compara[, c('dsstox_substance_id','consensus_binding', 'consensus_agonist','consensus_antagonist')],
                                by.x='DTXSID',
                                by.y='dsstox_substance_id',
                                all.x=TRUE,
                          all.y=FALSE)

colnames(er) <- paste0('er_model_', colnames(er))
er.ar <- merge.data.table(er.ar,
                                er,
                                by.x='CASRN',
                                by.y='er_model_CASRN',
                                all.x=TRUE,
                          all.y=FALSE)

colnames(ar) <- paste0('ar_model_', colnames(ar))
er.ar <- merge.data.table(er.ar,
                                ar,
                                by.x='CASRN',
                                by.y='ar_model_CASRN',
                                all.x=TRUE,
                          all.y=FALSE)

col.num <- c('consensus_2_binding', 'consensus_2_agonist', 'consensus_2_antagonist')
er.ar[, (col.num) := lapply (.SD, as.numeric), .SDcols = col.num ]
# indicate any active result from compara consensus qsar
er.ar[is.na(consensus_binding & consensus_agonist & consensus_antagonist),compara.flag := 2] # if data not available
er.ar[consensus_binding==0 & consensus_agonist==0 & consensus_antagonist==0, compara.flag := 0] # if data are all negative
er.ar[consensus_binding==1|consensus_agonist==1|consensus_antagonist==1, compara.flag := 1] # if any mode is positive

# indicate any active result from cerapp consensus qsar
er.ar[is.na(consensus_2_agonist & consensus_2_binding & consensus_2_antagonist),cerapp.flag := 2]
er.ar[consensus_2_agonist==0 & consensus_2_binding==0 & consensus_2_antagonist==0, cerapp.flag := 0]
er.ar[consensus_2_agonist==1|consensus_2_binding==1|consensus_2_antagonist==1, cerapp.flag := 1]

# create positive, equivocal, and negative code on ToxCast AR model
er.ar[is.na(ar_model_Agonist_AUC & ar_model_Antagonist_AUC), toxcast.ar.flag := 2] # data not available
er.ar[ar_model_Agonist_AUC < 0.1 & ar_model_Antagonist_AUC < 0.1|ar_model_Antagonist_AUC > 0.1 & ar_model_Antagonist_Confidence_Score <= 2, toxcast.ar.flag := 0] # grouped equivocals into the negative space
er.ar[ar_model_Agonist_AUC >= 0.1|ar_model_Antagonist_AUC >= 0.1 & ar_model_Antagonist_Confidence_Score >2, 
            toxcast.ar.flag := 1]

# create positive, equivocal, and negative code on ToxCast ER model
er.ar[is.na(er_model_Agonist_AUC & er_model_Antagonist_AUC), toxcast.er.flag := 2] # no data available
er.ar[er_model_Agonist_AUC< 0.1 & er_model_Antagonist_AUC < 0.1, toxcast.er.flag := 0] # grouped equivocals into the negative space
er.ar[er_model_Agonist_AUC >= 0.1|er_model_Antagonist_AUC >= 0.1, toxcast.er.flag := 1]

# require toxcast er model data to trump cerapp call
er.ar[toxcast.er.flag ==2 & cerapp.flag==2, final.er.flag :=0.1] # no data
er.ar[toxcast.er.flag ==2 & cerapp.flag==0, final.er.flag :=0] # only cerapp data and it's negative
er.ar[toxcast.er.flag ==2 & cerapp.flag==1, final.er.flag :=0.5] # only cerapp data and it's positive
er.ar[toxcast.er.flag ==0 & cerapp.flag %in% c(0,1,2), final.er.flag := 0] # er model available and negative
er.ar[toxcast.er.flag ==1 & cerapp.flag %in% c(0,1,2), final.er.flag := 1] # er model available and positive

# require toxcast ar model data to trump compara call
er.ar[toxcast.ar.flag ==2 & compara.flag==2, final.ar.flag :=0.1] # no data
er.ar[toxcast.ar.flag ==2 & compara.flag==0, final.ar.flag :=0] # only cerapp data and it's negative
er.ar[toxcast.ar.flag ==2 & compara.flag==1, final.ar.flag :=0.5] # only cerapp data and it's positive
er.ar[toxcast.ar.flag ==0 & compara.flag %in% c(0,1,2), final.ar.flag := 0] # er model available and negative
er.ar[toxcast.ar.flag ==1 & compara.flag %in% c(0,1,2), final.ar.flag := 1] # er model available and positive

er.ar[, ed.flag.sum := sum(final.er.flag, final.ar.flag), by=list(DTXSID)]

er.ar.long <- melt.data.table(er.ar,
                                    id.vars = c('DTXSID', 'CASRN','PREFERRED_NAME', 'ed.flag.sum',
                                                'er_model_pseudo_AC50_median','er_model_pseudo_AC50_min','er_model_Agonist_AUC',
                                                'er_model_Antagonist_AUC',
                                                'ar_model_pseudo_AC50_median','ar_model_pseudo_AC50_min',
                                                'ar_model_Agonist_AUC','ar_model_Antagonist_AUC'),
                                    measure.vars = c('cerapp.flag',
                                                     'compara.flag',
                                                     'toxcast.er.flag',
                                                     'toxcast.ar.flag',
                                                     'final.ar.flag',
                                                     'final.er.flag'),
                                    variable.name = 'flags',
                                    value.name = 'flag.values')

er.ar.long2 <- melt.data.table(er.ar.long,
                                     id.vars = c('DTXSID','CASRN','PREFERRED_NAME','ed.flag.sum'),
                                     measure.vars = c('er_model_pseudo_AC50_median', 'er_model_pseudo_AC50_min',
                                                      'er_model_Agonist_AUC','er_model_Antagonist_AUC',
                                                      'ar_model_pseudo_AC50_median','ar_model_pseudo_AC50_min',
                                                      'ar_model_Agonist_AUC','ar_model_Antagonist_AUC'),
                                     variable.name = 'toxcast_models',
                                     value.name = 'toxcast_model_values')

er.ar.long <- er.ar.long[order(-ed.flag.sum)]


er.ar.long2 <- unique(er.ar.long2) # this is for potency values. Maybe return to this later.

sc2.stm <- tcplPrepOtpt(tcplLoadData(lvl=2,type='sc',fld='aeid',val=c(1691,1858)))
#write.csv(sc2.stm, './source/sc2_4todd_stm_newmthd_28jul2020.csv')
mc5.stm <- tcplPrepOtpt(tcplLoadData(lvl=5,type='mc',fld='aeid',val=c(1691,1858)))

sc <- sc2.stm$dsstox_substance_id
mc <- mc5.stm$dsstox_substance_id

stm.tested <- c(sc,mc)
stm.dtxsids <- unique(stm.tested)


flags <- as.data.table(er.ar)
flags <- flags[,c('CASRN','DTXSID','PREFERRED_NAME','compara.flag', 'cerapp.flag', 'toxcast.ar.flag', 'toxcast.er.flag', 'final.er.flag', 'final.ar.flag', 'ed.flag.sum')]
flags[,stm.tested := 0]
flags[DTXSID %in% stm.dtxsids, stm.tested := 1]

stm.mc6 <- tcplPrepOtpt(tcplLoadData(lvl=6, fld='m4id', val=mc5.stm$m4id, type='mc'))
setDT(stm.mc6)
mc6_mthds <- stm.mc6[ , .( mc6_mthd_id = paste(mc6_mthd_id, collapse=",")), by = m4id]
mc6_flags <- stm.mc6[ , .( flag = paste(flag, collapse=";")), by = m4id]
mc5.stm$mc6_flags <- mc6_mthds$mc6_mthd_id[match(mc5.stm$m4id, mc6_mthds$m4id)]
mc5.stm[, flag.length := ifelse(!is.na(mc6_flags), count.fields(textConnection(mc6_flags), sep =','), NA)]

# filter the dataset, with coarse filters

mc5.stm[hitc==1 & flag.length < 3, use.me := 1]
mc5.stm[hitc==1 & is.na(flag.length), use.me := 1]
mc5.stm[hitc==1 & flag.length >= 3, use.me := 0]
mc5.stm[fitc %in% c(36,45), use.me := 0]
mc5.stm[hitc==-1, use.me := 0] # make hitc interpretable as a positive sum
mc5.stm[use.me==0, modl_ga := as.numeric(NA)]
mc5.stm[use.me==0, hitc := 0]
mc5.stm[hitc==0, modl_ga := as.numeric(NA)]

dev.flag <- dcast.data.table(mc5.stm, dsstox_substance_id + chnm ~ aenm,
                                  value.var = 'modl_ga',
                                  fun.aggregate= max)

dev.flag[,stm.cyto.dist := ifelse(!is.na(STM_H9_Viability_norm), STM_H9_Viability_norm - STM_H9_OrnCyssISnorm_ratio_dn, 3 - STM_H9_OrnCyssISnorm_ratio_dn)]

dev.flag[,dev.flag := 0]
dev.flag[!is.na(STM_H9_OrnCyssISnorm_ratio_dn), dev.flag := 1]
dev.flag[,dev.flag.specific := 0]
dev.flag[stm.cyto.dist >0.25, dev.flag.specific := 1]

flags <- merge.data.table(flags,
                          dev.flag,
                          by.x='DTXSID',
                          by.y='dsstox_substance_id',
                          all.x=TRUE,
                          all.y=FALSE)
flags[stm.tested==1 & is.na(dev.flag), dev.flag := 0]
flags[stm.tested==1 & is.na(dev.flag.specific), dev.flag.specific := 0]


flags <- flags[,c('chnm') := NULL]

flags$ber <- all.aed.hu.css.dt$ber[match(flags$DTXSID,all.aed.hu.css.dt$dsstox_substance_id)]

setnames(flags, c('final.ar.flag','final.er.flag','dev.flag','dev.flag.specific'), c('AR','ER','DEV','DEV-S'))
flags[stm.tested==0 & is.na(DEV), DEV := 0.1]
flags[stm.tested==0 & is.na(`DEV-S`), `DEV-S` := 0.1]


aed.tbl.ed <- melt.data.table(flags,
                              id.vars = c('DTXSID',
                                          'CASRN',
                                          'PREFERRED_NAME',
                                          'ber'),
                              measure.vars = c(
                                               'AR',
                                               'ER',
                                               'DEV',
                                               'DEV-S'),
                              variable.name = 'cats',
                              value.name = 'cat_values')





aed.tbl.ed <- aed.tbl.ed[order(-ber)]

chnm.order <- factor(unique(aed.tbl.ed$PREFERRED_NAME), levels = unique(aed.tbl.ed$PREFERRED_NAME), ordered=TRUE)
aed.tbl.ed[,chnm.factor := factor(PREFERRED_NAME) ]

5.2 Hazard Flags for Chemicals with BER

6 BER + Hazard flags

Chemicals are ordered from smallest BER to largest to aid in prioritization.
AR and ER are integrated flags that consider COMPARA, CERAPP, and ToxCast ER and AR models.
DEV and DEV-S are developmental and specific developmental findings in the Stemina assay.

7 Create spreadsheet

mn.output <- merge.data.table(mn.chem[,c('DTXSID','PREFERRED_NAME','CASRN','list_memberships', 'in.toxcast')],
                              aed.hu.css.pbtk.dt,
                              by.x='DTXSID',
                              by.y='dsstox_substance_id',all.x=TRUE)

ber.tbl <- unique(all.aed.hu.css.dt[,c('dsstox_substance_id',
                                                   'seem3.num',
                                                   'seem3.u95.num',
                                                   'ber')])
mn.output <- merge.data.table(mn.output,
                              ber.tbl,
                              by.x='DTXSID',
                              by.y='dsstox_substance_id',
                              all.x=TRUE)
                              
                              
mn.output <- merge.data.table(mn.output,
                              hit.rate.opera[,c('DTXSID',
                                                'AVERAGE_MASS',
                                                'log10VP',
                                                'log10VP.caution',
                                                'OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED',
                                                'logP.caution',
                                                'active.percent',
                                                'inactive.percent',
                                                'total.assay.screened',
                                                'active.assay.count',
                                                'inactive.assay.count')],
                              by.x='DTXSID',
                              by.y='DTXSID',
                              all.x=TRUE)

mn.output <- merge.data.table(mn.output,
                              flags[,c('DTXSID','ER','AR','DEV','DEV-S')],
                              by.x='DTXSID',
                              by.y='DTXSID',
                              all.x=TRUE)

#write.xlsx(mn.output, file='./output/cec_283+cand+guid_HTS_BER_flags_20JAN2021.xlsx')

MN Public Health List

Katie Paul Friedman

Jan 22, 2021