# Update activity tables for NEI
# Doug Jackson
# doug.jackson@erg.com
import datetime
import glob
import os
import re

import mysql.connector
import numpy as np
import pandas as pd
import pymysql

##########################################################################################
# Constants
##########################################################################################
# Directory holding the SQL scripts; the run section changes into it before doing any work
workingDir = "/Users/djackson/Documents/NEI/activityUpdates"

# SQL scripts executed by the run section (all stored in workingDir)
loadFF10scriptFile = workingDir + "/Load_FF10_datasets.sql"
populateCDBSscriptFile = workingDir + "/Populate_CDBs_from_FF10.sql"
hotellingScriptFile = workingDir + "/hotellinghours.sql"

# Database that receives the FF10 data, and the hotelling FF10 CSV merged into the summary
FF10database = "ff10data"
hotelingFF10file = workingDir + "/fromEPA/Hotelling_EPA_Default_2014_v3.csv"

# Directory containing the county databases (CDBs) to process; alternate sets kept for reference
#dBdir = "/Volumes/ERG_ARCHIVE/NEI/activityUpdates/2014_nonsubmittedCDBs_2011activity_TN"
#dBdir = "/Volumes/ERG_ARCHIVE/NEI/activityUpdates/2014_nonsubmittedCDBs_2011activity"
dBdir = "/Volumes/ERG_ARCHIVE/NEI/islandUpdates/2014_island_cdbs"

# MySQL data directory (the CDBs are symlinked into it) and login credentials
MySQLdir = "/usr/local/mysql/data/"
MySQLuser = "root"
MySQLpassword = "moves"

# File-name prefix for the summary CSVs written at the end of the run
summaryStatsPrefix = "summaryStats_19jul16"

# Optional cap on the number of databases processed (used by the commented-out slices)
#maxNumDBs = 10

##########################################################################################
# Functions
##########################################################################################
def queryDB(cnx, query):
    """Run a query and return its complete result set as a DataFrame.

    Args:
        cnx: Open database connection. Its cursors must expose a
            ``column_names`` attribute after ``execute``.
        query: SQL text to execute.

    Returns:
        pandas.DataFrame with one row per fetched record, with columns
        labeled by the cursor's ``column_names``.
    """
    cursor = cnx.cursor()
    try:
        cursor.execute(query)
        columnNames = cursor.column_names
        return pd.DataFrame(cursor.fetchall(), columns=columnNames)
    finally:
        # Release the cursor whether or not the query succeeded
        cursor.close()

def noReturnDB(cnx, query):
    """Execute a statement whose result is not needed.

    Args:
        cnx: Open database connection.
        query: SQL text to execute.

    Returns:
        None. No commit is issued here; that is left to the caller.
    """
    cursor = cnx.cursor()
    try:
        cursor.execute(query)
    finally:
        # Release the cursor whether or not the statement succeeded
        cursor.close()
    
##########################################################################################
# Run
##########################################################################################
os.chdir(workingDir)

# Read the names of all of the MOVES databases
dBnames = [os.path.basename(p) for p in glob.glob(os.path.join(dBdir, "*"))]
#dBnames = dBnames[0:maxNumDBs]
numDBs = len(dBnames)

# Remove any symbolic links to these databases. islink (rather than exists) also
# catches dangling links, and leaves a real directory of the same name untouched.
for d in dBnames:
    linkPath = os.path.join(MySQLdir, d)
    if os.path.islink(linkPath):
        os.remove(linkPath)

# Update the date stamp on the databases
dateStamp = datetime.datetime.now().strftime("%Y%m%d")
dateStampPattern = re.compile(r"\d{8}")
for d in dBnames:
    stampMatch = dateStampPattern.search(d)
    if stampMatch is None:
        # Nothing to replace, so the entry keeps its current name
        print("No 8-digit date stamp in", d, "- not renamed")
        continue
    newDBname = d.replace(stampMatch.group(0), dateStamp)
    if newDBname != d:
        os.rename(os.path.join(dBdir, d), os.path.join(dBdir, newDBname))

# Re-read the database names, since the renaming above changed them
dBnames = [os.path.basename(p) for p in glob.glob(os.path.join(dBdir, "*"))]
#dBnames = dBnames[0:maxNumDBs]
numDBs = len(dBnames)

# Create symbolic links to the databases
for d in dBnames:
    linkPath = os.path.join(MySQLdir, d)
    if os.path.islink(linkPath) and not os.path.exists(linkPath):
        # A dangling link left over from an earlier run would block os.symlink
        os.remove(linkPath)
    if not os.path.lexists(linkPath):
        os.symlink(os.path.join(dBdir, d), linkPath)

# Read the script that loads the FF10 data into the database
with open(loadFF10scriptFile, "r") as fH:
    loadFF10script = fH.read()

# Connect to the database and run the FF10 script; the connection is closed even
# if the script fails.
connection = pymysql.connect(user=MySQLuser, passwd=MySQLpassword, host='127.0.0.1',
                             database=FF10database)
try:
    cursor = connection.cursor()
    # NOTE(review): the whole script is sent in a single execute() call. If it
    # holds several statements this presumably relies on the driver accepting
    # multi-statement queries - confirm for the installed PyMySQL version.
    cursor.execute(loadFF10script)
    connection.commit()
finally:
    connection.close()

# Loop through the CDBs: populate each one from the FF10 data, then record VMT,
# population, and hotelling-hour totals next to the corresponding FF10 totals.

# Read the MySQL script templates once; only the CDBPLACEHOLDER substitution
# differs from one CDB to the next.
with open(populateCDBSscriptFile, "r") as fH:
    populateCDBStemplate = fH.read()
# The -sig is necessary to remove the BOM, \ufeff
with open(hotellingScriptFile, "r", encoding="utf-8-sig") as fH:
    hotellingTemplate = fH.read()

# The county ID is the run of digits between "c" and "y2014" in the CDB name
countyIDpattern = re.compile(r"c(\d+)y2014")

summaryRows = []
count = 0
for CDBname in dBnames:

    count += 1
    print("Processing", count, "of", numDBs)

    # Check the name before touching the database, so an entry that is not a
    # county database is reported and skipped instead of failing part-way through.
    countyMatch = countyIDpattern.search(CDBname)
    if countyMatch is None:
        print("Skipping", CDBname, "- no c<countyID>y2014 tag in its name")
        continue
    countyID = countyMatch.group(1)

    # Remove residual ibd files from previous times this script was run
    for f in glob.glob(os.path.join(MySQLdir, CDBname, "*.ibd")):
        os.remove(f)

    # Point the MySQL scripts at this CDB
    populateCDBSscript = populateCDBStemplate.replace("CDBPLACEHOLDER", CDBname)
    hotellingScript = hotellingTemplate.replace("CDBPLACEHOLDER", CDBname)

    # Connect to the database and run both scripts; always close the connection
    connection = pymysql.connect(user=MySQLuser, passwd=MySQLpassword, host='127.0.0.1',
                                 database=FF10database)
    try:
        cursor = connection.cursor()
        cursor.execute("set sql_mode='';")
        cursor.execute(populateCDBSscript)
        connection.commit()
        cursor.execute(hotellingScript)
        connection.commit()
    finally:
        connection.close()

    # Calculate the VMT and population sums in the FF10 data.
    # countyID consists of digits only (see the pattern above), so building the
    # query text by concatenation cannot inject SQL.
    cnx = mysql.connector.connect(user=MySQLuser, password=MySQLpassword, host='127.0.0.1', database=FF10database)
    try:
        thisVMTfF10 = queryDB(cnx, "SELECT * FROM vmtff10 WHERE region_cd='" + countyID + "';")
        thisPopFF10 = queryDB(cnx, "SELECT * FROM popff10 WHERE region_cd='" + countyID + "';")
    finally:
        cnx.close()
    thisVMTfF10sum = np.sum(np.array(thisVMTfF10["ann_value"]))
    thisPopFF10sum = np.sum(np.array(thisPopFF10["ann_value"]))

    # Calculate the VMT and population sums in the CDB, and read the hotellinghours table
    cnx = mysql.connector.connect(user=MySQLuser, password=MySQLpassword, host='127.0.0.1', database=CDBname)
    try:
        thisVMTcDBsum = queryDB(cnx, "SELECT sum(VMT) FROM sourceTypeYearVMT")["sum(VMT)"][0]
        thisPopCDBsum = queryDB(cnx, "SELECT sum(sourceTypePopulation) FROM sourceTypeYear")["sum(sourceTypePopulation)"][0]
        thisHotellingHoursSum = queryDB(cnx, "SELECT sum(hotellingHours) FROM hotellingHours")["sum(hotellingHours)"][0]
    finally:
        cnx.close()

    # Fill with zeros if the VMT, population, or hotelling hours are missing.
    # The CDB sums come back as None when there is nothing to sum.
    if thisVMTcDBsum is None:
        thisVMTcDBsum = 0
    if thisPopCDBsum is None:
        thisPopCDBsum = 0
    if thisHotellingHoursSum is None:
        thisHotellingHoursSum = 0
    # The FF10 sums are zero-like when no rows matched; normalize them to a plain 0
    if not thisVMTfF10sum:
        thisVMTfF10sum = 0
    if not thisPopFF10sum:
        thisPopFF10sum = 0

    # Save the summary stats for this county
    summaryRows.append({"countyID": countyID,
                        "VMTfF10sum": thisVMTfF10sum,
                        "VMTcDBsum": thisVMTcDBsum,
                        "VMTdiff": thisVMTcDBsum - thisVMTfF10sum,
                        "popFF10sum": thisPopFF10sum,
                        "popCDBsum": thisPopCDBsum,
                        "hotellingHoursSum": thisHotellingHoursSum,
                        "popDiff": thisPopCDBsum - thisPopFF10sum})

# Build the summary table once, rather than re-concatenating it on every iteration
summaryStats = pd.DataFrame(summaryRows)

# Put the summary columns in reporting order and store the county IDs as integers
summaryColumns = ["countyID", "VMTfF10sum", "VMTcDBsum", "VMTdiff", "popFF10sum",
                  "popCDBsum", "popDiff", "hotellingHoursSum"]
summaryStats = summaryStats[summaryColumns]
summaryStats["countyID"] = list(map(int, summaryStats["countyID"].tolist()))
summaryStats.to_csv(summaryStatsPrefix + ".csv", index=False)

# Total the hotelling FF10 annual values per region (the first 15 lines of the file are skipped)
hotelingFF10 = pd.read_csv(hotelingFF10file, skiprows=15)
hotelingFF10grp = hotelingFF10.groupby(["region_cd"], as_index=False)
hotelingFF10 = hotelingFF10grp["ann_value"].sum().rename(columns={"region_cd":"countyID"})

# Attach the per-county hotelling totals; counties without a match keep their row
summaryStats = summaryStats.merge(hotelingFF10, on="countyID", how="left")
summaryStats.to_csv(summaryStatsPrefix + "_Merged.csv", index=False)

################################################################################
# Verify that the CDBs have the correct numbers of files: print a dot for each
# CDB holding exactly 138 entries, and report the actual count for any other.
for CDBname in dBnames:
    numFiles = len(glob.glob(os.path.join(dBdir, CDBname, "*")))
    if numFiles == 138:
        print(".", end="")
        continue
    print("numFiles in", CDBname, "is", numFiles)