#!/usr/bin/env python
# Allocates the gridded reports to annual and monthly county and state reports.
#
# 10/15/2014 James Beidler <beidler.james@epa.gov>

from datetime import datetime, timedelta
from optparse import OptionParser
import os, csv, sys
import time
import pandas as pd 
import numpy as np
#import cProfile, pstats 

def open_file(file_name, access_type = 'r'):
	"""
	Open a file, reporting a failed open with a message that names the file.

	file_name: path of the file to open
	access_type: mode string handed to open() (default 'r')

	Returns the open file object; the caller is responsible for closing it.
	Raises IOError if the file cannot be opened.
	"""
	try:
		handle = open(file_name, access_type)
	except IOError:
		# Call-style raise is valid syntax on both Python 2 and Python 3
		raise IOError('%s not available for access.' % file_name)
	return handle

def check_ev(ev_name):
	"""
	Look up a required environment variable.

	ev_name: name of the environment variable

	Returns the value of the variable.
	Raises KeyError naming the variable if it is not set.
	"""
	try:
		return os.environ[ev_name]
	except KeyError:
		# Call-style raise is valid syntax on both Python 2 and Python 3
		raise KeyError('Environment variable "%s" is not defined.' % ev_name)

class MovesReport(object):
	"""
	General moves report object
	Container for a pandas DF held in self.rep_df.  The frames built by
	read_month are in long format with the columns
	fips, state, county, scc, species and value.
	"""
	def __init__(self, rep_df=None):
		# Build the empty default per instance instead of sharing one DF,
		# created when the class is defined, across every MovesReport
		if rep_df is None:
			rep_df = pd.DataFrame()
		self.rep_df = rep_df

	def _name_repfile(self, run_type, sector, esdate, grid, spec, case, home):
		"""
		Set the infile name for the REP file based on the SMOKE conventions.
		Returns <home>/reports/smkmerge/<sector>/<run_type>/rep_mole_....txt
		"""
		infile_name = 'rep_mole_%s_%s_%s_%s_%s_%s.txt' %(run_type, sector, esdate, grid, spec, case)
		inpath = os.path.join(home, 'reports/smkmerge/%s/%s' %(sector, run_type))
		return os.path.join(inpath, infile_name)

	def _get_pol_cols(self, pols, units):
		'''
		Name every pollutant column and choose which columns to read.

		pols: pollutant names from the report header, one per data column
		units: units strings from the report header, parallel to pols

		Mass columns get a "_t" (tons) or "_g" (grams) tag, mole columns keep
		the bare pollutant name.  Only one mass column is used per pollutant,
		and a mole column is used only when the pollutant has no mass column.

		Returns (names, use_cols): the name of every column in file order and
		the subset of names to read.
		Raises ValueError if the lengths differ, a units string is not
		recognized, or no usable column is found.
		'''
		if len(pols) != len(units):
			raise ValueError('Number of pollutants and units columns do not match in header.')

		names = []
		use_cols = []
		mol_list = []
		for pol, unit in zip(pols, units):

			# Deal with CAS numbers by tacking an S to the front
			try:
				int(pol)
			except ValueError:
				pass
			else:
				pol = 'S' + pol

			# Put moles in a separate mole list
			if 'mol' in unit:
				out_pol = pol
				mol_list.append(out_pol)
			else:
				# Label tons and grams as units
				if 'tons' in unit:
					out_pol = pol + '_t'
				elif 'g/' in unit:
					out_pol = pol + '_g'
				else:
					# Without this branch the name from the previous column
					# would be silently reused for this one
					raise ValueError('Unrecognized units "%s" for pollutant %s in header.' % (unit, pol))

				# Avoid doubling up on two mass-based numbers
				if '%s_t' % pol not in use_cols and '%s_g' % pol not in use_cols:
					use_cols.append(out_pol)

			names.append(out_pol)

		# Add mole-based columns to the use columns if there are no mass-based entries
		for pol in mol_list:
			if '%s_t' % pol not in use_cols and '%s_g' % pol not in use_cols:
				use_cols.append(pol)

		if not use_cols or not names:
			raise ValueError('Could not find any columns to use')

		return names, use_cols

	def _strip(self, txt):
		"""
		Strip surrounding whitespace from txt; values without a strip method
		are returned unchanged.
		"""
		try:
			return txt.strip()
		except AttributeError:
			return txt

	def _read_repfile(self, rep_file, mon):
		"""
		Read one rep file into a long-format DF with the columns
		fips, state, county, scc, species and value.

		rep_file: path of the report file
		mon: not used by this method; kept for the existing call signature

		The '#' comment lines are scanned for the units line and the column
		header line, which together define the column names and the columns
		to load.  Requires self.head_line to be set (read_month sets it).
		Raises ValueError if the units or column header line cannot be found.
		"""
		n_head = len(self.head_line)
		units = None
		col_names = None
		with open_file(rep_file) as f:
			for line in f:
				if not line.startswith('#'):
					continue
				# Empty cells are dropped, so a units line whose leading
				# cells are blank is left holding only the units entries
				cells = [cell.strip() for cell in line[1:].strip().split(';') if cell.strip()]
				if len(cells) <= n_head:
					continue
				if '/day]' in cells[n_head]:
					units = cells[:]
				if self.head_line[0] in cells[0].lower():
					if units is None:
						raise ValueError('Could not find a units line before the column header in %s' % rep_file)
					pols = cells[n_head:]
					pol_names, self.species_list = self._get_pol_cols(pols, units)
					col_names = self.head_line + pol_names
					use_cols = ['fips','state','county','scc'] + self.species_list
					rep_dtype = {'fips': '|S6', 'state': '|S25', 'county': '|S40', 'scc': '|S10'}
					for pol in pol_names:
						rep_dtype.setdefault(pol, 'f')
					break

			if col_names is None:
				raise ValueError('Could not find the column header line in %s' % rep_file)

			f.seek(0)
			# NOTE(review): skiprows=6 assumes the data start after six header
			# rows -- confirm against the report format
			rep_df = pd.read_csv(f, skiprows=6, delimiter=';', skipinitialspace=True, usecols=use_cols, converters={'state': self._strip, 'county': self._strip}, dtype=rep_dtype, names=col_names)
			# Wide to long transform
			rep_df = pd.melt(rep_df, id_vars=['fips','state','county','scc'], var_name='species')
		return rep_df

	def read_month(self, year, mon, r_days, run_type = '', sector = '', grid = '', spec = '', case = '', home=None):
		"""
		Read every report file of one month for one run type and store the
		values, summed by fips, state, county, scc and species, in self.rep_df.

		year, mon: year and month number of the month to read
		r_days: number of days between consecutive report files
		run_type, sector, grid, spec, case, home: used to build the file names

		Raises ValueError if r_days is less than 1 or if this object already
		holds report data.
		"""
		if r_days < 1:
			# A step of zero days would never leave the month
			raise ValueError('Length in days of each report chunk must be at least 1.')

		self.head_line = ['date', 'fips', 'state', 'county', 'scc']
		today = datetime.strptime('%s%0.2d01' %(year, mon), '%Y%m%d')

		# Collect the frames and concatenate once instead of re-copying the
		# growing month frame for every file
		frames = [pd.DataFrame(columns=['fips','state','county','scc','species','value'])]
		while today.month == mon:
			rep_file = self._name_repfile(run_type, sector, datetime.strftime(today, '%Y%m%d'), grid, spec, case, home)
			print(rep_file)
			frames.append(self._read_repfile(rep_file, mon))
			today = today + timedelta(r_days)

		mon_df = pd.concat(frames, ignore_index=True)
		mon_df = mon_df.groupby(['fips','state','county','scc','species'], sort=False, as_index=False).sum()

		if self.rep_df.empty:
			self.rep_df = mon_df
		else:
			raise ValueError('Report data already exists. Cannot store new monthly data.')

	def sum_runs(self, in_df):
		'''
		Sum together the object DF with another DF by fips, state, county,
		scc and species.  If the object DF is empty it simply becomes in_df.
		'''
		if not self.rep_df.empty:
			self.rep_df = pd.concat((self.rep_df, in_df), ignore_index=True)
			self.rep_df = self.rep_df.groupby(['fips','state','county','scc','species'], sort=False, as_index=False).sum()
		else:
			self.rep_df = in_df

def open_outfile(mon = None, area_type = 'state', by_scc = False, run_name = '', case=None, sector=None, grid=None, home=None, moles=False):
	"""
	Build the outfile name for a report and print where it will be written.
	Nothing is opened here; only the path is returned.

	mon: month number for a monthly report; a false value means annual
	area_type: area label used in the file name
	by_scc: add the "scc" tag to the file name
	run_name: optional run type prefix placed directly in front of the case
	case, sector, grid: name parts of the report file
	home: project root; the file goes in <home>/reports/smkmerge/<sector>
	moles: use the "rep_moles" prefix instead of "rep"

	Returns the full path of the report file.
	"""
	if mon:
		# Lower-case abbreviated month name, e.g. 2 -> feb
		duration = datetime.strftime(datetime.strptime('%s' % mon, '%m'), '%b').lower()
	else:
		duration = 'annual'

	rep_prefix = 'rep_moles' if moles else 'rep'
	if by_scc:
		rep_prefix += '_scc'

	outfile_name = '%s_%s_%s_%s_%s%s_%s.txt' % (rep_prefix, area_type, duration, sector, run_name, case, grid)
	outfile_name = os.path.join(home, 'reports/smkmerge/%s' % sector, outfile_name)

	# Single-argument print call behaves the same on Python 2 and Python 3
	print('Writing %s %s to: %s' % (area_type, duration, outfile_name))

	return outfile_name

def conv2tons(in_df, molec_dct):
	'''
	Convert the species values to tons

	in_df: long-format DF with a "species" and a "value" column
	molec_dct: mapping of mole-based species name to the factor that turns
		moles into grams (presumably the molecular weight -- confirm against
		the molecular weight file)

	Species tagged "_g" are converted from grams, species tagged "_t" are
	already tons, and untagged species are converted from moles.  The unit
	tag is removed from the species names.

	Returns a converted copy; in_df is not modified.
	Raises KeyError if a mole-based species is missing from molec_dct.
	'''
	grams_to_tons = 0.00000110231131

	rep_df = in_df.copy()

	# Work out the factor and the output name of every original label first.
	# Selecting rows by the renamed label instead would convert rows twice
	# whenever a mass species and a mole species share a base name.
	factors = {}
	out_names = {}
	for species in pd.unique(rep_df['species'].values.ravel()):
		if species.endswith('_g'):
			factors[species] = grams_to_tons
			out_names[species] = species[:-2]
		elif species.endswith('_t'):
			factors[species] = 1
			out_names[species] = species[:-2]
		else:
			try:
				mol_fac = molec_dct[species]
			except KeyError:
				raise KeyError('Could not find species %s in molecular weight file' % species)
			factors[species] = mol_fac * grams_to_tons   # moles to tons
			out_names[species] = species

	# Apply by position (.values) so a non-unique index cannot break alignment
	labels = rep_df['species']
	new_values = rep_df['value'].values * labels.map(factors).values
	new_labels = labels.map(out_names).values
	rep_df['value'] = new_values
	rep_df['species'] = new_labels

	return rep_df

def write_rep(in_df, mon = None, area_type = 'state', by_scc = False, run_name = '', case=None, sector=None, grid=None, home=None, moles=False):
	'''
	Pivot the long-format report so each species becomes a column and write
	it to the report file named by open_outfile.

	Rows are keyed by state for the "state" area type and by fips, state and
	county otherwise, plus scc when by_scc is set.  Values sharing a key are
	summed.  A leading "month" column is added when mon is given.
	'''
	# Row keys of the output table
	if area_type != 'state':
		row_keys = ['fips','state','county']
	else:
		row_keys = ['state']
	if by_scc:
		row_keys = row_keys + ['scc']

	# Long to wide: one column per species
	table = pd.pivot_table(in_df, values='value', columns='species', index=row_keys, aggfunc=np.sum)
	table = table.reset_index()

	if mon:
		table.insert(0, 'month', str(mon))

	target = open_outfile(mon, area_type, by_scc, run_name, case, sector, grid, home, moles)
	table.to_csv(target, index=False)

def write_emf(in_df, area_type = 'state', run_name = '', case=None, sector=None, grid=None, home=None):
	'''
	Write the EMF-style annual report

	in_df: DF whose six columns are, in order, fips, state, county, scc,
		species and the annual value (they are relabeled by position)
	area_type: "state" sums by State; anything else sums by FIPS, State
		and County
	run_name: optional run type prefix placed directly in front of the grid
	case, sector, grid: name parts of the report file
	home: project root; the file goes in <home>/reports/annual_report

	Raises IOError if the output file cannot be opened for writing.
	'''
	outfile_name = 'annual_%s_%s_%s%s_cmaq_cb05_soa_%s_emf.csv' %(case, sector, run_name, grid, area_type)
	outfile_name = os.path.join(home, 'reports/annual_report', outfile_name)

	# Check up front that the target can be written, then release the handle
	# right away; to_csv below opens the path itself
	open_file(outfile_name, 'w').close()

	print('Writing %s annual EMF to: %s' % (area_type, outfile_name))

	out_df = in_df.copy()
	out_df.columns = ['FIPS','State','County','SCC','Species','ann_emis']
	out_df.insert(4, 'Sector', sector)

	if area_type == 'state':
		group_by = ['State','Sector','Species']
	else:
		group_by = ['FIPS','State','County','Sector','Species']

	# Sum only the emissions column; the remaining text columns are not
	# written out and should not be aggregated
	out_cols = group_by + ['ann_emis',]
	out_df = out_df[out_cols].groupby(group_by, as_index=False).sum()
	out_df.to_csv(outfile_name, index=False, columns=out_cols)

def _mole_rows(rep_df):
	# Select the mole-based rows: species names without a _t or _g mass tag
	species = rep_df['species']
	return rep_df[(species.str.endswith('_t') == False) & (species.str.endswith('_g') == False)]

def _write_report_set(mass_df, mol_df, mon=None, run_name='', emf=False, case=None, sector=None, grid=None, home=None):
	# Write the county and state reports, without and with SCC detail, for
	# one period; optionally also the EMF annual report (emf=True) and the
	# mole reports (mol_df is not None)
	for area_type in ['county', 'state']:
		for by_scc in (False, True):
			write_rep(mass_df, mon=mon, area_type=area_type, by_scc=by_scc, run_name=run_name, case=case, sector=sector, grid=grid, home=home)
		if emf:
			write_emf(mass_df, area_type, run_name, case=case, sector=sector, grid=grid, home=home)
		if mol_df is not None:
			for by_scc in (False, True):
				write_rep(mol_df, mon=mon, area_type=area_type, by_scc=by_scc, run_name=run_name, case=case, sector=sector, grid=grid, home=home, moles=True)

def __main__():
	"""
	Command line entry point.

	Reads the monthly reports of every requested run type, writes the
	monthly county and state reports and, when all twelve months are
	processed, the annual reports and the EMF-style annual reports.  Reports
	are written per run type when type splitting is on and combined over the
	run types otherwise.

	Requires the environment variables BASE_YEAR, GRID, SECTOR, EMF_SPC,
	CASE and PROJECT_ROOT.
	"""
	# Set up the options parser
	parser = OptionParser(usage = 'usage: %prog [options]')
	parser.add_option('-m', '--month', dest='run_month', help='Run the one specified month.  Use a two character month code (ie. 02 or 10)', metavar='MONTH', default='')
	parser.add_option('-w', '--molec_dct', dest='molec_dct', help='Path to molecular weight dictionary.', default='/garnet/oaqps/smoke/test/smoke3.6/scripts/annual_report/parameter_file_cmaq_cb6.txt')
	parser.add_option('-s', '--typesplit', action='store_true', dest='type_split', help='Split the output reports by run type rather than combining into a single output report file.', default=False)
	parser.add_option('-r', '--runtype', dest='run_type', help='Run single run type of either RPP, RPD, or RPV.  Automatically turns on the type splitting.', default='')
	parser.add_option('-d', '--days', dest='rdays', help='Length in days of each report chunk (ie. 1-31) that makes up a month of reports. Default of one day in each report.', default=1)
	parser.add_option('-l', '--moles', dest='moles', action='store_true', help='Produce reports with moles only, no mass conversion.', default=False)
	(options, args) = parser.parse_args()

	# Set the global variables based on EMF set environment variables
	year = check_ev('BASE_YEAR')
	grid = check_ev('GRID')
	sector = check_ev('SECTOR')
	spec = check_ev('EMF_SPC')
	case = check_ev('CASE')
	home = check_ev('PROJECT_ROOT')

	# Process the parser options
	type_split = options.type_split
	if options.run_month:
		mon_list = [int(options.run_month),]
	else:
		mon_list = list(range(1,13))
	if options.run_type:
		run_types = [options.run_type,]
		type_split = True
	else:
		run_types = ['RPD', 'RPP', 'RPV', 'RPH']

	r_days = int(options.rdays)

	# The molecular weight file is Python source that defines molecDct.
	# NOTE: this executes the file as code, so only point -w at a trusted
	# file.  It is run in its own namespace because exec() cannot create
	# function locals on Python 3.
	molec_ns = {}
	with open(options.molec_dct) as molec_file:
		exec(molec_file.read(), molec_ns)
	try:
		molecDct = molec_ns['molecDct']
	except KeyError:
		raise KeyError('Molecular weight file %s does not define molecDct.' % options.molec_dct)

	# Name parts shared by every report that gets written
	common = dict(case=case, sector=sector, grid=grid, home=home)

	#### Main loop

	start_time = time.time()

	annual_dict = {}

	# Annual reports are only produced when all twelve months are processed
	write_annual = len(mon_list) == 12

	for mon in mon_list:
		mon_rep = MovesReport()
		for run_type in run_types:
			# Read in the report for that month and type
			run_rep = MovesReport()
			run_rep.read_month(year, mon, r_days, run_type, sector, grid, spec, case, home)

			if type_split:
				# Split out the run types for the monthly reports rather than combine them
				mon_mass_out = conv2tons(run_rep.rep_df, molecDct)
				mon_mol_out = _mole_rows(run_rep.rep_df) if options.moles else None
				_write_report_set(mon_mass_out, mon_mol_out, mon=mon, run_name='%s_' % run_type, **common)
			else:
				mon_rep.sum_runs(run_rep.rep_df)

			# Add together monthly runs for annual reports as needed
			if write_annual:
				if run_type not in annual_dict:
					annual_dict[run_type] = MovesReport(run_rep.rep_df)
				else:
					annual_dict[run_type].sum_runs(run_rep.rep_df)

		# Write the combined monthly report
		if not type_split:
			mon_mass_out = conv2tons(mon_rep.rep_df, molecDct)
			# The mole report comes from the combined month, not from the
			# last run type that happened to be read
			mon_mol_out = _mole_rows(mon_rep.rep_df) if options.moles else None
			_write_report_set(mon_mass_out, mon_mol_out, mon=mon, **common)

	if write_annual:
		if type_split:
			# Split out the annual reports and write by type
			for run_type in run_types:
				ann_df = annual_dict[run_type].rep_df
				ann_mass_out = conv2tons(ann_df, molecDct)
				ann_mol_out = _mole_rows(ann_df) if options.moles else None
				_write_report_set(ann_mass_out, ann_mol_out, run_name='%s_' % run_type, emf=True, **common)
		else:
			ann_rep = MovesReport()
			# Create the combined reports
			for run_type in run_types:
				ann_rep.sum_runs(annual_dict[run_type].rep_df)

			# Write the combined annual reports; they carry no run type
			# prefix, so run_name keeps its empty default
			ann_mass_out = conv2tons(ann_rep.rep_df, molecDct)
			ann_mol_out = _mole_rows(ann_rep.rep_df) if options.moles else None
			_write_report_set(ann_mass_out, ann_mol_out, emf=True, **common)
	print('%s seconds elapsed' % (time.time() - start_time))

# Run the report allocation only when executed as a script, not on import.
# The commented-out lines are an optional cProfile wrapper around the run.
if __name__ == '__main__':
#	pr = cProfile.Profile()
#	pr.enable()
	__main__()
#	pr.disable()
#	ps = pstats.Stats(pr).sort_stats('cumulative')
#	ps.print_stats(50)

