# Update activity tables for NEI
# Doug Jackson
# doug.jackson@erg.com
import os
import pymysql
import pandas as pd
import re
import mysql.connector
import numpy as np
import glob
import datetime as dT
import time

##########################################################################################
# Constants
##########################################################################################
# Directory the script changes into (os.chdir) before doing any work. Relative paths used
# later -- user.cnf, the temporary SQL scripts, and the summary CSV files -- resolve here.
workingDir = "/Users/djackson/Documents/NEI/activityUpdates"

# Be sure to update the file paths and the number of lines to skip.
# Should test the loadFF10script in MySQL Workbench before running.
# Input files should have Unix line endings if running on a Mac.
# loadFF10scriptFile is piped as-is into the mysql command-line client.
# populateCDBSscriptFile and hotellingScriptFile are templates: the text CDBPLACEHOLDER
# (and, in the hotelling script, CALCYEARPLACEHOLDER) is substituted for each database.
loadFF10scriptFile = "/Users/djackson/Documents/NEI/activityUpdates/Load_FF10_datasets.sql"
populateCDBSscriptFile = "/Users/djackson/Documents/NEI/activityUpdates/Populate_CDBs_from_FF10.sql"
hotellingScriptFile = "/Users/djackson/Documents/NEI/activityUpdates/hotellinghours.sql"

# Name of the MySQL database holding the FF10 tables (popff10, vmtff10, hotff10)
FF10database = "ff10data"
# Hotelling FF10 CSV; read with pandas after skipping its first 19 lines
hotelingFF10file = "/Users/djackson/Documents/WA5-08/Task1/2014v2_activity/HOTELLING_NEI_v2_2014_candidate_20170630_30jun2017_v3_withHeader.csv"

# Directory whose entries are treated as the MOVES databases (CDBs), one per entry.
# Each entry name must contain an 8-digit date stamp and a "c<digits>y<NEIyear>" county tag.
dBdir = "/Volumes/ERG_ARCHIVE/NEI/CDBs_etc_12jul17/allCountyCDBs_reranActivityUpdates_25aug17"

# Flag to indicate whether calc_year in the FF10 activity tables should be changed to calcYear
changeCalcYear = False
# calcYear is also substituted for CALCYEARPLACEHOLDER in the hotelling script,
# regardless of changeCalcYear
calcYear = 2014

# Year string used to pull the county ID out of each database name and to name the output files
NEIyear = "2014"

# Directory in which symbolic links to the databases are created and later removed
# (presumably the MySQL server's data directory -- confirm for your installation)
MySQLdir = "/usr/local/mysql/data/"

# Output file name prefix: "<prefix>.csv" and "<prefix>_Merged.csv" are written
summaryStatsPrefix = "summaryStats_25aug17_" + NEIyear

# CSV of previously recorded file counts; merged on a "dB" column and compared via its
# "numFiles" column (assumed column names -- verify against the file)
numFilesFile = "/Users/djackson/Documents/WA5-08/Task1/updateCDBsFiles_12jul17/dBfileCounts.txt"

#maxNumDBs = 10

##########################################################################################
# Functions
##########################################################################################
def queryDB(cnx, query):
    """Run a query that returns rows and hand the result back as a DataFrame.

    Parameters:
        cnx: an open database connection exposing cursor().
        query: SQL text to execute.

    Returns:
        pandas.DataFrame with one row per fetched record and one column per
        result column, named after the result columns.

    The cursor is always closed before returning, including when the query fails.
    """
    cursor = cnx.cursor()
    try:
        cursor.execute(query)
        # mysql.connector cursors expose column_names; fall back to the DB-API
        # description attribute so other drivers work too.
        columnNames = getattr(cursor, "column_names", None)
        if columnNames is None:
            columnNames = [col[0] for col in cursor.description]
        return pd.DataFrame(cursor.fetchall(), columns=list(columnNames))
    finally:
        cursor.close()

def noReturnDB(cnx, query):
    """Execute a statement whose result is not needed.

    Parameters:
        cnx: an open database connection exposing cursor().
        query: SQL text to execute.

    Returns None. No commit is issued here; the caller decides when to commit.
    The cursor is always closed, including when the statement fails.
    """
    cursor = cnx.cursor()
    try:
        cursor.execute(query)
    finally:
        cursor.close()
    
##########################################################################################
# Run
##########################################################################################
# Work from workingDir so that the relative paths used later (user.cnf, the temporary
# SQL scripts, the summary CSV files) resolve there.
os.chdir(workingDir)

# Read the names of all of the MOVES databases (every entry directly under dBdir)
dBnames = [os.path.basename(p) for p in glob.glob(os.path.join(dBdir, "*"))]
#dBnames = dBnames[0:maxNumDBs]
numDBs = len(dBnames)

# Read the number of files in each database before anything is changed.
# "dB" is the part of the database name before the first underscore.
# The rows are collected first and the DataFrame is built once: DataFrame.append was
# removed in pandas 2.0, and appending inside a loop copies the frame on every pass.
# Passing columns= keeps the "dB"/"numFiles" columns even when there are no databases.
numFilesBeforeDF = pd.DataFrame(
    [{"dB": d.split("_")[0],
      "numFiles": len(glob.glob(os.path.join(dBdir, d, "*")))}
     for d in dBnames],
    columns=["dB", "numFiles"])
    
# Remove any symbolic links to these databases.
# islink (not exists) is used on purpose: exists() follows the link, so a dangling link
# left over from an earlier run would be skipped and would later make os.symlink fail.
# It also guarantees that only links, never real files or directories, are removed.
for d in dBnames:
    linkPath = os.path.join(MySQLdir, d)
    if os.path.islink(linkPath):
        os.remove(linkPath)

# Update the date stamp on the databases.
# Every name is checked before anything is renamed, so a name without an 8-digit date
# stamp stops the run with a clear message instead of failing part-way through.
dateStamp = dT.datetime.now().strftime("%Y%m%d")
dateStampPattern = re.compile(r"\d{8}")
renames = []
for d in dBnames:
    stampMatch = dateStampPattern.search(d)
    if stampMatch is None:
        raise ValueError("No 8-digit date stamp found in database name: " + d)
    renames.append((d, d.replace(stampMatch.group(0), dateStamp)))
for oldDBname, newDBname in renames:
    os.rename(os.path.join(dBdir, oldDBname), os.path.join(dBdir, newDBname))

# Read the names of all of the MOVES databases again, now that they have been renamed
dBnames = [os.path.basename(p) for p in glob.glob(os.path.join(dBdir, "*"))]
#dBnames = dBnames[0:maxNumDBs]
numDBs = len(dBnames)

# Create symbolic links to the databases in MySQLdir.
# lexists is true for any existing entry, including a dangling link, so os.symlink is
# never asked to overwrite something.
for d in dBnames:
    linkPath = os.path.join(MySQLdir, d)
    if not os.path.lexists(linkPath):
        os.symlink(os.path.join(dBdir, d), linkPath)

# Load the FF10 data by piping the SQL script into the mysql command-line client
# (run through os.system, as the pymysql approach is unreliable). Credentials come from
# user.cnf in the current directory. The exit status is printed but not acted on.
loadFF10command = "mysql --defaults-extra-file=user.cnf < " + loadFF10scriptFile
returnStatus = os.system(loadFF10command)
print("loadFF10scriptFile returnStatus=", returnStatus)
 
## Connect to the database
#connection = pymysql.connect(user='root', passwd='moves', host='127.0.0.1',
#                                 database=FF10database) 
#cursor = connection.cursor()
#
## Read the FF10 data into the database
#with open(loadFF10scriptFile, "r") as fH:
#    loadFF10script = fH.read()
#
## Run the FF10 script
#cursor.execute(loadFF10script)
#connection.commit()
#
#connection.close()

# Try some settings that might prevent the MySQL connection from dying:
# raise the three global timeout variables to 28800, committing after each statement.
timeoutStatements = (
    "SET GLOBAL connect_timeout=28800",
    "SET GLOBAL wait_timeout=28800",
    "SET GLOBAL interactive_timeout=28800",
)
cnx = mysql.connector.connect(user="root", password="moves", host='127.0.0.1', database=FF10database)
cursor = cnx.cursor()
for timeoutStatement in timeoutStatements:
    cursor.execute(timeoutStatement)
    cnx.commit()
cnx.close()

# Change the calcYear, if desired: overwrite calc_year in every row of each FF10
# activity table, committing after each table.
if changeCalcYear:
    cnx = mysql.connector.connect(user="root", password="moves", host='127.0.0.1', database=FF10database)
    cursor = cnx.cursor()

    for ff10table in ("popff10", "vmtff10", "hotff10"):
        cursor.execute("UPDATE " + ff10table + " SET calc_year=" + str(calcYear))
        cnx.commit()

    cnx.close()
    
# Read the MySQL script templates once, up front. They are identical for every CDB, so
# there is no need to re-read them from disk on each pass through the loop.
with open(populateCDBSscriptFile, "r") as fH:
    populateCDBStemplate = fH.read()
# The -sig is necessary to remove the BOM, \ufeff
with open(hotellingScriptFile, "r", encoding="utf-8-sig") as fH:
    hotellingTemplate = fH.read()

# Loop through the CDBs, running both scripts against each one.
# A failing script does not stop the run; the affected CDBs are listed at the end.
failedCDBs = []
for count, CDBname in enumerate(dBnames, start=1):

    print("Processing", count, "of", numDBs)

    # Remove residual ibd files from previous times this script was run
    for f in glob.glob(os.path.join(MySQLdir, CDBname, "*.ibd")):
        os.remove(f)

    # Fill in the placeholders for this CDB
    populateCDBSscript = populateCDBStemplate.replace("CDBPLACEHOLDER", CDBname)
    hotellingScript = hotellingTemplate.replace("CDBPLACEHOLDER", CDBname)
    hotellingScript = hotellingScript.replace("CALCYEARPLACEHOLDER", str(calcYear))

    # Write the scripts to files and run using os.system, as the pymysql approach is unreliable
    with open("tempPopulateCDBSscript.sql", "w") as fH:
        print(populateCDBSscript, file=fH)
    with open("tempHotellingScript.sql", "w") as fH:
        print(hotellingScript, file=fH)

    thisCDBfailed = False
    returnStatus = os.system("mysql --defaults-extra-file=user.cnf < " + os.path.join(workingDir, "tempPopulateCDBSscript.sql"))
    if returnStatus != 0:
        print("Failed to run tempPopulateCDBSscript. returnStatus=", returnStatus)
        thisCDBfailed = True
    returnStatus = os.system("mysql --defaults-extra-file=user.cnf < " + os.path.join(workingDir, "tempHotellingScript.sql"))
    if returnStatus != 0:
        print("Failed to run tempHotellingScript. returnStatus=", returnStatus)
        thisCDBfailed = True
    if thisCDBfailed:
        failedCDBs.append(CDBname)

# Repeat the failures in one place so they are not lost in the per-CDB progress output
if failedCDBs:
    print("MySQL scripts failed for", len(failedCDBs), "CDB(s):", ", ".join(failedCDBs))
    
#    # Wrap this operation in a try because it often fails
#    success = False
#    tryCount = 0
#    while not success and tryCount<100:
#        try:
#            # Connect to the database
#            connection = pymysql.connect(user='root', passwd='moves', host='127.0.0.1',
#                                             database=FF10database) 
#            cursor = connection.cursor()
#        
#            cursor.execute("set sql_mode='';")
#            cursor.execute(populateCDBSscript)
#            connection.commit()
#            cursor.execute(hotellingScript)
#            connection.commit()
#            
#            connection.close()
#            success = True
#        except:
#            print("MySQL connection was lost. Waiting a bit and then trying again...")
#            time.sleep(10)
#            tryCount+=1          

def _runWithRetry(database, runQueries, description, maxTries=100, waitSeconds=10):
    """Connect to `database`, call runQueries(cnx) and return whatever it returns.

    A MySQL error (mysql.connector.Error) is treated as a dropped connection: the
    attempt is abandoned, the script waits `waitSeconds`, and a fresh connection is
    opened. Any other exception is a programming or data error and is not retried.
    The connection is closed after every attempt, successful or not.

    Raises RuntimeError once `maxTries` attempts have failed, so a permanent problem
    (for example a missing table) cannot make the script loop forever.
    """
    lastError = None
    for _ in range(maxTries):
        cnx = None
        try:
            cnx = mysql.connector.connect(user="root", password="moves", host='127.0.0.1', database=database)
            return runQueries(cnx)
        except mysql.connector.Error as err:
            lastError = err
            print("Could not run " + description + ". Reconnecting and trying again...")
            print(err)
            time.sleep(waitSeconds)
        finally:
            if cnx is not None:
                try:
                    cnx.close()
                except mysql.connector.Error:
                    # The connection is already unusable; there is nothing left to release
                    pass
    raise RuntimeError("Giving up on " + description + " for database " + database
                       + " after " + str(maxTries) + " tries") from lastError


def _readFF10sums(cnx, countyID):
    """Return (VMT sum, population sum) of ann_value for one county in the FF10 tables."""
    vmtRows = queryDB(cnx, "SELECT * FROM vmtff10 WHERE region_cd='" + countyID + "';")
    popRows = queryDB(cnx, "SELECT * FROM popff10 WHERE region_cd='" + countyID + "';")
    return np.sum(np.array(vmtRows["ann_value"])), np.sum(np.array(popRows["ann_value"]))


def _readCDBsums(cnx):
    """Return (VMT sum, population sum, hotelling hours sum) from one CDB.

    Each value is None when MySQL's sum() has no rows to add up.
    """
    vmtSum = queryDB(cnx, "SELECT sum(VMT) FROM sourceTypeYearVMT")["sum(VMT)"][0]
    popSum = queryDB(cnx, "SELECT sum(sourceTypePopulation) FROM sourceTypeYear")["sum(sourceTypePopulation)"][0]
    hotellingSum = queryDB(cnx, "SELECT sum(hotellingHours) FROM hotellingHours")["sum(hotellingHours)"][0]
    return vmtSum, popSum, hotellingSum


# Loop through the CDBs calculating summaries
summaryColumns = ["countyID", "VMTfF10sum", "VMTcDBsum", "VMTdiff", "popFF10sum",
                  "popCDBsum", "popDiff", "hotellingHoursSum"]
summaryRows = []
# The county ID is the run of digits between "c" and "y<NEIyear>" in the CDB name
countyPattern = re.compile(r"c(\d+)y" + re.escape(NEIyear))
for count, CDBname in enumerate(dBnames, start=1):

    print("Processing", count, "of", numDBs)

    countyMatch = countyPattern.search(CDBname)
    if countyMatch is None:
        raise ValueError("Could not find a county ID (c<digits>y" + NEIyear + ") in database name: " + CDBname)
    countyID = countyMatch.group(1)

    # Calculate the VMT and population sums in the FF10 data
    thisVMTfF10sum, thisPopFF10sum = _runWithRetry(
        FF10database, lambda cnx: _readFF10sums(cnx, countyID), "FF10 sums queries")

    # Calculate the VMT and population sums in the CDB, and read the hotellinghours table
    thisVMTcDBsum, thisPopCDBsum, thisHotellingHoursSum = _runWithRetry(
        CDBname, _readCDBsums, "CDB sums queries")

    # Fill with zeros if the VMT, population, or hotelling hours are missing
    if thisVMTcDBsum is None:
        thisVMTcDBsum = 0
    if thisPopCDBsum is None:
        thisPopCDBsum = 0
    if thisHotellingHoursSum is None:
        thisHotellingHoursSum = 0
    if not thisVMTfF10sum:
        thisVMTfF10sum = 0
    if not thisPopFF10sum:
        thisPopFF10sum = 0

    # Save the summary stats for this county; countyID is stored as an integer so it
    # can be matched against the numeric region codes later on
    summaryRows.append({"countyID": int(countyID),
                        "VMTfF10sum": thisVMTfF10sum,
                        "VMTcDBsum": thisVMTcDBsum,
                        "VMTdiff": thisVMTcDBsum - thisVMTfF10sum,
                        "popFF10sum": thisPopFF10sum,
                        "popCDBsum": thisPopCDBsum,
                        "popDiff": thisPopCDBsum - thisPopFF10sum,
                        "hotellingHoursSum": thisHotellingHoursSum})

# Build the frame once, with a fixed column order (this also works when there are no CDBs)
summaryStats = pd.DataFrame(summaryRows, columns=summaryColumns)
summaryStats.to_csv(summaryStatsPrefix + ".csv", index=False)

# Read the hotelling FF10 file (skipping its first 19 lines), total ann_value per
# region_cd, and attach that total to the summary stats by county. Counties with no
# hotelling rows keep their summary row (left join) with a missing ann_value.
hotelingFF10grp = pd.read_csv(hotelingFF10file, skiprows=19).groupby(["region_cd"], as_index=False)
hotelingFF10 = hotelingFF10grp["ann_value"].sum().rename(columns={"region_cd":"countyID"})

summaryStats = summaryStats.merge(hotelingFF10, on="countyID", how="left")
summaryStats.to_csv(summaryStatsPrefix + "_Merged.csv", index=False)

# Remove any symbolic links to these databases.
# islink (not exists) is used on purpose: exists() follows the link and so misses a
# dangling one, and islink ensures only links, never real directories, are removed.
for d in dBnames:
    linkPath = os.path.join(MySQLdir, d)
    if os.path.islink(linkPath):
        os.remove(linkPath)
        
################################################################################
# Count the files in each database again now that the updates have run.
# The rows are collected first and the DataFrame is built once: DataFrame.append was
# removed in pandas 2.0, and appending inside a loop copies the frame on every pass.
numFilesAfterDF = pd.DataFrame(
    [{"dB": d.split("_")[0],
      "numFiles": len(glob.glob(os.path.join(dBdir, d, "*")))}
     for d in dBnames],
    columns=["dB", "numFiles"])

# Put the before and after counts side by side (numFiles_before / numFiles_after)
numFilesDF = pd.merge(numFilesBeforeDF, numFilesAfterDF, on="dB", how="outer", suffixes=("_before", "_after"))

# Verify that the CDBs have the correct numbers of files
# Read the number of files previously extracted from the original databases
# (the file is expected to provide "dB" and "numFiles" columns -- verify against the file)
numFiles = pd.read_csv(numFilesFile)

numFilesDF = pd.merge(numFilesDF, numFiles, on="dB", how="left")
numFilesDF = numFilesDF.rename(columns={"numFiles":"numFiles_orig"})

# Report every database whose current file count differs from the original count.
# A database missing from numFilesFile has no original count and is reported as well.
numFilesMismatch = numFilesDF.loc[numFilesDF["numFiles_after"]!=numFilesDF["numFiles_orig"]]
print(numFilesMismatch)
