##  Compare_AERMOD_test_cases.R
##  June 03, 2021
##
##  Read .rdata files created with the "Process_AERMOD_test_cases_output.R" script for all test cases from 
##  two sets of AERMET/AERMOD model configurations and loop through cases to create model comparison plots 
##  and statistics for each case.
##    MODIFIED: 
##  Generates a file "TestCaseDifferences_BestFitSlope.csv", which contains 
##  the ratio of Best Fit slop for the testcase concentrations and percent difference of those slopes.
##
##    WSP 8/09/23

## clear Environments and console:
rm(list = ls(all.names = TRUE))

## clear global environments:
rm(list=ls())

##  Load R packages
library(ggplot2)
require(gridExtra)
require(reshape2)

################################
## User Defined Section 
################################

## Define test case 1 identifier
# testcase_names1 <- c("aermet_24142_aermod_24142")
# testcase_names2 <- c("aermet_23132_aermod_23132")

testcase_names1 <- c("aermet_24142_aermod_24142")
testcase_names2 <- c("aermet_23132_aermod_24142")

## Set working directory, it will identify the directory this script is located in
## Uncomment the second line if you would like to define your own working directory
working_dir <- normalizePath(getwd(),winslash="/",mustWork=NA) 
#working_dir <- 'D:/23132_Release_Files/aermod/TestCasesDeliverable/TestCases/'
setwd(working_dir)

################################
## End of User Defined Section 
################################

for (j in 1:length(testcase_names1)){
  
  testcase_name1 <- testcase_names1[j]
  testcase_name2 <- testcase_names2[j]
  
  print(paste("Now examining", testcase_name1, "and", testcase_name2))
  
  ## Define rdata directory for test case.  
  rdata_dir1 <- paste(working_dir,testcase_name1, "rdata/",sep="/")
  rdata_dir2 <- paste(working_dir,testcase_name2, "rdata/",sep="/")
  
  ## Define output directory.  Create directory if it does not exist.
  plots_dir <- paste(working_dir, "/plots_", testcase_name1, "_v_", testcase_name2, sep="")
  if(!dir.exists(plots_dir)) {
    dir.create(plots_dir,showWarnings = FALSE)
  }
  
  ## Saving all of the plots directories to an array to use as column names for failing test report
  if (!exists("plots_dir_array")) {
    plots_dir_array <- c(paste(testcase_name1, "_v_", testcase_name2, sep=""))
  }
  else {
    plots_dir_array <- c(plots_dir_array, paste(testcase_name1, "_v_", testcase_name2, sep=""))
  }
  
  ## Define AERMOD test cases
  cases <- c("AERMOD-BALDWIN45_01H","AERMOD-BALDWINHORIZ_01H","AERMOD-BALDWINVERT_01H","AERTEST_01H",
             "ALLSRCS_AREA_01H","ALLSRCS_AREAP_01H","ALLSRCS_BLINE_01H","ALLSRCS_CIRC_01H",
             "ALLSRCS_LINE_01H","ALLSRCS_OPENPIT_01H","ALLSRCS_RLINEB_01H","ALLSRCS_RLINEB2_01H",
             "ALLSRCS_RLINEBA_01H","ALLSRCS_RLINEDE_01H","ALLSRCS_STACK_01H","ALLSRCS_STACKDW_01H",
             "ALLSRCS_VOL_01H","BLP_URBAN_2S26_01H","BLP_URBAN_2S29_01H",
             "CAPPED_NOSTD_STACK2C_01H","CAPPED_NOSTD_STACK2CE_01H","CAPPED_NOSTD_STACK2H_01H",
             "CAPPED_NOSTD_STACK2HE_01H","CAPPED_STACK1_01H","CAPPED_STACK1C_01H",
             "CAPPED_STACK1C0_01H","CAPPED_STACK1H_01H","CAPPED_STACK2_01H","CAPPED_STACK2C_01H",
             "CAPPED_STACK2C0_01H","CAPPED_STACK2CE_01H","CAPPED_STACK2H_01H","FLATELEV_ELEV_STK_01H",
             "FLATELEV_FLAT_STK_01H","HRDOW_STACK1_01H","HRDOW_STACK2_01H","HRDOW_STACK3_01H",
             "HRDOW_STACK4_01H","HRDOW_STACK5_01H","HRDOW_STACK6_01H","IN_URBAN_01H",
             "LOVETT_01H","LOVETT_24H","LVT24_ELEV","LVT24_FLAT","MCR_01H","MCR_03H","MCR_24H",
             "MULTURB_STACK1_01H","MULTURB_STACK2_01H","MULTURB_STACK3_01H","MULTURB_STACK4_01H",
             "NO2_ARM2_PPB_01H","NO2_ARM2_UGM3_01H","NO2_GRSM_PPB_01H","NO2_GRSM_UGM3_01H",
             "NO2_OLM_PPB_01H","NO2_OLM_UGM3_01H","NO2_PVMRM_PPB_01H","NO2_PVMRM_UGM3_01H",
             "NO2_TTRM_PPB_01H","NO2_TTRM_UGM3_01H","NO2_ARM2_PPB_BKG","NO2_ARM2_UGM3_BKG",
             "NO2_GRSM_PPB_BKG","NO2_GRSM_UGM3_BKG","NO2_OLM_PPB_BKG","NO2_OLM_UGM3_BKG",
             "NO2_PVMRM_PPB_BKG","NO2_PVMRM_UGM3_BKG","NO2_TTRM_PPB_BKG","NO2_TTRM_UGM3_BKG",
             "NO2_ARM2_PPB_SRC","NO2_ARM2_UGM3_SRC","NO2_GRSM_PPB_SRC","NO2_GRSM_UGM3_SRC",
             "NO2_OLM_PPB_SRC","NO2_OLM_UGM3_SRC","NO2_PVMRM_PPB_SRC","NO2_PVMRM_UGM3_SRC",
             "NO2_TTRM_PPB_SRC","NO2_TTRM_UGM3_SRC","OLM_01H","OLMGRP_01H","OPENPITS_PITGAS_01H",
             "OPENPITS_PITPRT1_01H","OPENPITS_PITPRT2_01H","PSDCRED_NAAQS_01H","PSDCRED_PSDINC_01H",
             "PVMRM_01H","SURFCOAL_01H", "SURFCOAL_ACTVTY_01H","SURFCOAL_NPIT_01H",
             "SURFCOAL_OTHERS_01H","SURFCOAL_ROADS_01H","SURFCOAL_SPIT_01H",
             "TEST1_BASE_CART_3COND_SNC","TEST3_BASE_CART_3COND_SNC_BAR",
             "TEST4_BASE_CART_3COND_SNC_DEP","TEST20_URBAN_CART_3COND_SNC","TESTGAS_01H",
             "TESTGAS2_01H","TESTPART_01H","TESTPM10_01H","TESTPM10_MULTYR_01H","TESTPRT2_01H","TESTPRT2_MON"
  )
  # cases <- c("TESTPRT2_01H","TESTPRT2_MON")  
  count = 0 
  missing_count = 0
  
  for (i in 1:length(cases)) {
    print(cases[i])
    
    case1_postdata <- paste(rdata_dir1, cases[i], ".rdata", sep = "")
    if (!file.exists(case1_postdata)) {
      print(paste(case1_postdata, "does not exist."))
      case1 <- FALSE
    } else {
      load(case1_postdata)
      case1_postdata <- case_postdata
      #case1_postdata <- case_postdata[!duplicated(case_postdata),]
      case1 <-TRUE
    }
    
    case2_postdata <- paste(rdata_dir2, cases[i], ".rdata", sep = "")
    if (!file.exists(case2_postdata)) {
      print(paste(case2_postdata, "does not exist."))
      case2 <- FALSE
    } else {
      load(case2_postdata)
      case2 <- TRUE
      case2_postdata <- case_postdata
      #case2_postdata <- case_postdata[!duplicated(case_postdata),]
    }
    
    # if (nrow(case1_postdata) != nrow(case2_postdata)){
    #   print("Cannot compare case, postfiles different lengths")
    #   case1 <- FALSE
    # }
    
    if (case1 & case2) {
      count = count + 1
      print(count)
      #  Combine data for each test case by DATE and RECEPTOR
      
      comb_data <- merge(case1_postdata, case2_postdata[!duplicated(case2_postdata[, c("date", "Xcoord", "Ycoord")]), ],
                         by = c("date", "Xcoord", "Ycoord"), suffixes = c(".1", ".2"), all.x = T)
      
      ptitle <- paste(cases[i], "All ", sep = " - ")
      comp_avgconc <- ggplot(data = comb_data, aes(x = avgconc.1, y = avgconc.2)) +
        geom_point() + theme_bw() +
        xlab(testcase_name1) + ylab(testcase_name2) + ggtitle(ptitle) +
        geom_abline(slope=1, intercept = 0,size=1.3, na.rm = FALSE, show.legend = NA)+ 
        geom_abline(slope=2, intercept = 0,size=1.3, na.rm = FALSE, show.legend = NA)+ 
        geom_abline(slope=0.5, intercept = 0,size=1.3, na.rm = FALSE, show.legend = NA)+ 
        geom_smooth(method = "lm", se = TRUE, color = "blue",linetype="dashed",size=1.5) +
        ylim(c(0, NA)) + xlim(c(0, NA))
      
      
      case1_postdata_max <- aggregate(case1_postdata$avgconc, 
                                      by = list(case1_postdata$Xcoord,case1_postdata$Ycoord), 
                                      max)
      names(case1_postdata_max) <- c("Xcoord","Ycoord","avgconc")
      
      case2_postdata_max <- aggregate(case2_postdata$avgconc, 
                                      by = list(case2_postdata$Xcoord,case2_postdata$Ycoord), 
                                      max)
      names(case2_postdata_max) <- c("Xcoord","Ycoord","avgconc")
      
      comb_data_max <- merge(case1_postdata_max, case2_postdata_max, by = c("Xcoord", "Ycoord"), suffixes = c(".1", ".2"), all = T)
      
      ptitle <- paste(cases[i], "Max ", sep = " - ")    
      comp_avgconc_max <-  ggplot(data = comb_data_max, aes(x = avgconc.1, y = avgconc.2)) +
        geom_point() + theme_bw() +
        xlab(testcase_name1) + ylab(testcase_name2) + ggtitle(ptitle) +
        geom_abline(slope=1, intercept = 0,size=1.3, na.rm = FALSE, show.legend = NA)+ 
        geom_abline(slope=2, intercept = 0,size=1.3, na.rm = FALSE, show.legend = NA)+ 
        geom_abline(slope=0.5, intercept = 0,size=1.3, na.rm = FALSE, show.legend = NA)+ 
        geom_smooth(method = "lm", se = TRUE, color = "blue",linetype="dashed",size=1.5) +
        ylim(c(0, NA)) + xlim(c(0, NA))
      
      
      # XY plots, max
      ptitle <- paste(cases[i], "Max Case 1", sep = " - ")    
      case1_xy_max <- ggplot(case1_postdata_max, aes(x=Xcoord,y=Ycoord,color=avgconc)) + 
        geom_point() + scale_color_gradientn(colours = c("blue","orange","red")) +
        ggtitle(ptitle) + theme_bw()+ 
        theme(axis.title=element_text(size=10), plot.title=element_text(size=10))
      
      ptitle <- paste(cases[i], "Max Case 2", sep = " - ")    
      case2_xy_max <- ggplot(case2_postdata_max, aes(x=Xcoord,y=Ycoord,color=avgconc)) + 
        geom_point() + scale_color_gradientn(colours = c("blue","orange","red")) +
        ggtitle(ptitle) + theme_bw()+ 
        theme(axis.title=element_text(size=10), plot.title=element_text(size=10))
      
      
      
      comb_data$abs_delt <- comb_data$avgconc.1 - comb_data$avgconc.2
      
      delta_postdata_max <- aggregate(comb_data$abs_delt, 
                                      by = list(case1_postdata$Xcoord,case1_postdata$Ycoord), 
                                      max)
      names(delta_postdata_max) <- c("Xcoord","Ycoord","abs_delt")
      
      #xy plots, delta and ratio    
      ptitle <- paste(cases[i], "Max Delt, case (1 - 2)", sep = " - ")    
      delta_xy_max <- ggplot(delta_postdata_max, aes(x=Xcoord,y=Ycoord,color=abs_delt)) + 
        geom_point() + scale_color_gradientn(colours = c("blue","orange","red")) +
        ggtitle(ptitle) + theme_bw()+ 
        theme(axis.title=element_text(size=10), plot.title=element_text(size=10))
      
      
      comb_data$ratio <- comb_data$avgconc.1 / comb_data$avgconc.2   
      comb_data$ratio[is.nan(comb_data$ratio)] <- 0
      comb_data$ratio[is.infinite(comb_data$ratio)] <- 0
      ratio_postdata_max <- aggregate(comb_data$ratio, 
                                      by = list(case1_postdata$Xcoord,case1_postdata$Ycoord), 
                                      max)
      names(ratio_postdata_max) <- c("Xcoord","Ycoord","ratio")
      
      ptitle <- paste(cases[i], "Max Ratio, case (1/2)", sep = " - ")    
      ratio_xy_max <- ggplot(ratio_postdata_max, aes(x=Xcoord,y=Ycoord,color=ratio)) + 
        geom_point() + scale_color_gradientn(colours = c("blue","orange","red")) +
        ggtitle(ptitle) + theme_bw() + 
        theme(axis.title=element_text(size=10),plot.title=element_text(size=10))
      
      
      fname <- paste(plots_dir, "/", cases[i], ".png", sep = "")
      png(filename = fname, height = 8, width = 10, units = "in", res = 200)
      grid.arrange(comp_avgconc, comp_avgconc_max,
                   case1_xy_max, case2_xy_max, 
                   delta_xy_max, ratio_xy_max, nrow = 3)
      dev.off()
      
      
      ###################################################################################################
      ##  Calculate statistics
      ##
      print("Calculating statistics.")
      # Calculate number of data points in run
      nobs <- nrow(case1_postdata)
      # Calculate number of hours in run
      hour <- case1_postdata[, "date"]
      hours <- split(case1_postdata, case1_postdata$date)
      nhours <- length(hours)
      # Calculate number of receptors in run
      allreceps <- as.data.frame(case1_postdata[, 1:2])
      receps <- unique(allreceps)
      nreceps <- length(receps[, 1])
      
      print("Calculating mean concentrations")
      # Calculate mean concentrations
      conc1mean <- (mean(case1_postdata[, "avgconc"], trim = 0, na.rm = TRUE))
      conc2mean <- (mean(case2_postdata[, "avgconc"], trim = 0, na.rm = TRUE))
      # Calculate the average ratio between concentrations
      conc1 <- as.data.frame(case1_postdata[, "avgconc"], na.rm = TRUE)
      conc2 <- as.data.frame(case2_postdata[, "avgconc"], na.rm = TRUE)
      allconcs <- cbind(conc1, conc2)
      conccolnames <- c("conc1", "conc2")
      avgconcratio <- conc1mean / conc2mean
      # Calculate the average difference between concentrations (mean bias)
      concdiff <- with(allconcs, conc1 - conc2)
      meanbias <- mean(concdiff[, 1], trim = 0, na.rm = TRUE)
      # Calculate the average absolute difference between concentrations (mean error)
      concabsdiff <- with(allconcs, abs(conc1 - conc2))
      meanerror <- mean(concabsdiff[, 1], trim = 0, na.rm = TRUE)
      # Calculate the average relative difference between concentrations
      concreldiff <- with(allconcs, ((conc1 - conc2) / conc1), na.rm = TRUE)
      avgreldiff <- mean(concreldiff[, 1], trim = 0, na.rm = TRUE)
      # Calculate the normalized mean bias
      concdiffsum <- sum(concdiff[, 1], trim = 0, na.rm = TRUE)
      conc2sum <- sum(conc2[, 1], trim = 0, na.rm = TRUE)
      normmeanbias <- (concdiffsum / conc2sum)
      # Calculate the normalized mean error
      concabsdiffsum <- sum(concabsdiff[, 1], trim = 0, na.rm = TRUE)
      normmeanerror <- (concabsdiffsum / conc2sum)
      # Calculate the fractional bias
      concavg <- ((conc1 + conc2) / 2)
      concavgsum <- sum(concavg[, 1], trim = 0, na.rm = TRUE)
      fracbias <- mean(concdiffsum / concavgsum)
      # Calculate the fractional error
      fracerror <- mean(concabsdiffsum / concavgsum)
      # Calculate the correlation coefficient
      corrcoeff <- cor(conc1, conc2)
      # Calculate the robust highest concentration (RHC) based on top 26 values
      # where:  RHC = X(R) + Theta * log((3R-1)/2)
      conc1desc <- conc1[order(conc1[, 1], decreasing = TRUE), ]
      conc2desc <- conc2[order(conc1[, 1], decreasing = TRUE), ]
      conc1top26 <- as.data.frame(conc1desc[1:26])
      conc2top26 <- as.data.frame(conc2desc[1:26])
      XBAR1 <- mean(conc1top26[1:25, ])
      XBAR2 <- mean(conc2top26[1:25, ])
      X1 <- conc1top26[26, ]
      X2 <- conc2top26[26, ]
      THETA1 <- (XBAR1 - X1)
      THETA2 <- (XBAR2 - X2)
      R <- (26)
      RHC1 <- (X1 + THETA1 * log10((3 * R - 1) / 2))
      RHC2 <- (X2 + THETA2 * log10((3 * R - 1) / 2))
      # Calculate the fractional bias based on top 26 values
      concdifftop26 <- with(allconcs, conc1top26 - conc2top26)
      concdiffsumtop26 <- sum(concdifftop26[, 1], trim = 0, na.rm = TRUE)
      concavgtop26 <- ((conc1top26 + conc2top26) / 2)
      concavgsumtop26 <- sum(concavg[, 1], trim = 0, na.rm = TRUE)
      fracbiastop26 <- mean(concdiffsumtop26 / concavgsumtop26)
      print("Completed Calculating statistics.")
      
      
      casename <- c(cases[i])
      
      stats <- c(
        casename, nhours, nreceps, nobs, conc1mean, conc2mean, avgconcratio, meanbias, meanerror,
        avgreldiff, normmeanbias, normmeanerror, fracbias, fracerror, corrcoeff, RHC1, RHC2, fracbiastop26
      )
      ## Every time you run through the test cases, if it is the first time, we have to reset
      ## all_stats to remove the past test cases in older versions.
      if (!exists("all_stats") || i == 1) {
        all_stats <- stats
      } else {
        all_stats <- rbind(all_stats, stats)
      }
      ## the last formula gives the percent difference in means between concentration 1 and concentration 2
      failed_stats <- c(casename, testcase_names1[j], testcase_names2[j], conc1mean, conc2mean, avgconcratio, ((conc1mean - conc2mean) / ((conc1mean + conc2mean ) / 2)))
      ##Failed tests will be a collection of all the tests that have significant differences in avgconcratio.
      if (!exists("all_failed_stats")) {
      all_failed_stats <- failed_stats
      }
      else {
        all_failed_stats <- rbind(all_failed_stats, failed_stats)
      }
      
      
      ###################################################################################################
      ##  Write to .csv
      ##
      # Remove case data
      #rm(case1_postdata, case2_postdata, comp_avgconc, fname)
      print("Removed case data.")
    }
    else {
      missing_count = missing_count + 1
    }
  }
  
  print("Combining statistics for file saving.")
  all_stats_df <- as.data.frame(all_stats,row.names=FALSE)
  colnames_df <- c("Case Name","nhours","nreceps","nobs","conc1mean","conc2mean","avgconcratio","meanbias","meanerror",
                   "avgreldiff","normmeanbias","normmeanerror","fracbias","fracerror","corrcoeff","RHC1","RHC2",
                   "fracbias-top26")
  
  write.table(all_stats_df,file=paste(plots_dir,"comparison_stats.csv",sep="/"), sep=",",col.names=colnames_df,row.names=FALSE)
  print("Compare_AERMOD_test_cases.R processing complete")
  
  if (!count == length(cases)){
    print(paste(missing_count, " cases were not compared."))
  }
}

## The rest of the script will create a spreadsheet titled "TestCaseDifferences_BestFitSlope.csv" that analyzes
## differences in the versions of each test case's mean.


## Now that all of the tests are computed, we can compare the results of all the test cases and each version.
## The smaller the margin of error, the more test cases that will be added to the failing test report.
## The test cases compute average concentration. They compare two models; call them V1 and V2.
## If you graph these points, with the coordinates(V1.concentration, V2.concentration), then they
## should graph to a straight line with a slope of 1. Any graphs that do NOT have a slope of 1
## are "failing". So, with the margin of error of .001, any slopes that are either less than
## .999 or greater than 1.001 would be "failing".
margin_of_error <- .001
for (i in 1: length(cases)) {
  formatted_failed_tests <- c(cases[i])
  for (j in 1:length(testcase_names1)) {
    ## find each corresponding slope line for each test case and each version and format it to correspond with each test case name.
    formatted_failed_tests <- c(formatted_failed_tests, all_failed_stats[5 * (length(cases) * length(testcase_names1)) + i + length(cases) * (j - 1)], 
                                all_failed_stats[6 * length(cases) * length(testcase_names1) + i + length(cases) * (j - 1)]) ## Adding the percent difference
    
    ## A bit of an explanation on the complicated formulas: Looking at the data, you can essentially treat it as a 2-d spreadsheet. So, the first part of the
    ## formula (n * length(cases) * length(testcase_names1))) will go over to the 1st entry in the nth column! Now, in order to traverse that column, you have to add i.
    ## But, in addition, each column has multiple test case versions. So, you have to repeat this for each test case version hence the j variable.
    ## Since you start at the top of the column, assuming an index starts at one, you can use (j - 1) * length(testcases) to move up and down the length of the test cases and find each
    ## version of one test case.
  }
  
  
  for (j in 1:length(testcase_names1)) {
    ## only add to the final failed tests if the slope of the line is outside of the margin of error.
    if (all_failed_stats[5 * (length(testcase_names1) * length(cases)) + i + length(cases) * (j - 1)] < 1 - margin_of_error ||
        all_failed_stats[5 * (length(testcase_names1) * length(cases)) + i + length(cases) * (j - 1)] > 1 + margin_of_error) {
      if (!exists("finished_failed_tests")) {
        finished_failed_tests <- formatted_failed_tests
      }
      else {
        finished_failed_tests <- rbind(finished_failed_tests, formatted_failed_tests)
      }
    }
  }
}



## Export to CSV file if any tests failed
if (exists("finished_failed_tests")) {
  
  finished_failed_tests_df <- as.data.frame(finished_failed_tests,row.names=FALSE)
  formatted_colnames_df <- c("Case Name")
  
  for (i in 1: length(testcase_names1)) {
    ## Setting up the column names depending on the number of test cases and names
    formatted_colnames_df <- c(formatted_colnames_df, plots_dir_array[i], "Percent difference between means")
  }
  
  ## Note on the table: The first column is the test case name, then the following columns are paired in two.
  ## Of the pair, the first column is a ratio of conc1mean / conc2mean between the two versions.
  ## The second column in the pair is the percent difference between conc1mean and conc2mean.
  write.table(finished_failed_tests_df,file=paste(working_dir,"TestCaseDifferences_BestFitSlope.csv", sep="/"), sep=",",col.names=formatted_colnames_df,row.names=FALSE)

} else {
  write.table("No Failed Tests",file=paste(working_dir,"TestCaseDifferences_BestFitSlope.csv", sep="/"), sep=",",col.names=FALSE,row.names=FALSE)
}



##
########################################################################################################################