#!C:/Python27/ArcGIS10.5/python.exe
#TODO: Set your python interpreter above.

import xml.etree.cElementTree as et
import pandas as pd
from sqlalchemy import create_engine
import os
from progress.bar import Bar
import sys


#TODO: set the location of the wcproduction.xml file and the dbname to output.
sourceXML="K:/hbarrett/wcproduction.xml"
dbname = "ocd.db"

#How many records to accumulate before each bulk insert into the db.
iterate_amount=1000

#Prompt the user to make sure they want to recreate the DB. Only proceed if yes or y.
yes = {'yes','y'}
if os.path.exists(dbname):
    print("This application will DELETE and recreate the "+dbname+" database. Do you want to continue?")
    #strip() tolerates accidental leading/trailing whitespace in the answer.
    choice = raw_input().strip().lower()
    if choice in yes:
        os.remove(dbname)
    else:
        #sys.exit() instead of quit(): quit() is injected by the site module for
        #interactive use and is not guaranteed to exist when run as a script.
        sys.exit()


class OilBar(Bar):
    '''Progress bar that shows the estimated time to complete processing all these oil records.

    This is necessary for large files like the XML we are parsing here; otherwise
    we would be staring at a blank screen for a day.'''
    message = 'Processing'  # label shown to the left of the bar
    fill = '*'              # character used for the filled portion of the bar
    # percent complete plus the ETA computed by the progress package
    suffix = '%(percent).1f%% Done.  Estimated time to complete:%(eta_td)s'

def extract_local_tag(qname):
    '''Return the local tag for an element, i.e. the part of a Clark-notation
    qualified name ("{namespace}tag") after the closing brace. A name with no
    namespace prefix is returned unchanged.'''
    return qname.rpartition('}')[2]

def save_oil_records(fulldf,db_engine):
    '''Params:
        fulldf - a full dataframe ready for processing.
        db_engine - a sqlalchemy db engine (or DBAPI connection pandas accepts).

    Function: Formats columns in the data frame to the correct sql data type,
    and appends them to the oil_log table of the supplied db engine.

    Returns: A new empty dataframe (so the caller can reuse the variable), or raises on error.'''
    #Name each column group once instead of repeating the nine-column list.
    numeric_cols = ["api_st_cde", "api_cnty_cde", "api_well_idn", "pool_idn",
                    "prodn_mth", "prodn_yr", "ogrid_cde", "prod_amt", "prodn_day_num"]
    date_cols = ["eff_dte", "mod_dte"]
    fulldf[numeric_cols] = fulldf[numeric_cols].apply(pd.to_numeric)
    fulldf[date_cols] = fulldf[date_cols].apply(pd.to_datetime)
    fulldf.to_sql('oil_log', db_engine, if_exists='append')
    return pd.DataFrame()

#To get an estimate of the number of records we need to process without actually processing them.
#Note: The source xml has null bytes after every char due to the file format it was saved under
    #For example <\x00r\x00o\x00o\x00t\x00 \x00x\x00m\x00l\x00 instead of <root xml
    #Because of this we double every character count below.
#os.path.getsize gives the byte count directly -- no need to open/seek/close the file.
#Remove the 2230 characters (4460 bytes) that make up the schema definition.
remaining_size = os.path.getsize(sourceXML) - 4460
#Divide by the max size of a wcproduction element (507 chars = 1014 bytes).
#// keeps this an integer division under both Python 2 and 3.
estimate_of_records = remaining_size // 1014


#Create a progress bar 
bar = OilBar('Processing', max=estimate_of_records)

#Create a sqlite sqlalchemy db connection.
disk_engine = create_engine('sqlite:///'+ dbname)

#idx is the index for every record we will insert.
idx=0

#count is used to iterate n number of records before we insert.
count=0

#This is the schema tag we want to ignore.
ixid_uri = 'urn:schemas-microsoft-com:sql:SqlRowSet1'

#Create empty Data frame.
fulldf = pd.DataFrame()

#Open the xml file; the with-block guarantees it is closed even if parsing fails.
with open(sourceXML) as xml_file:
    #Record dictionaries accumulated since the last database insert.
    batch = []
    #For each element end-event...
    for event, elem in et.iterparse(xml_file):
        #...only the 'wcproduction' elements in the SqlRowSet1 namespace carry data.
        if elem.tag == et.QName(ixid_uri, 'wcproduction'):
            #Create and populate a dictionary with the local tag name and text of each child.
            #Example: {'eff_dte': '1973-10-31T00:00:00', 'prodn_day_num': '99', ..., 'api_st_cde': '30'}
            d = {}
            for child in elem:
                #child.text is None for empty elements (<tag/>); store None instead of crashing.
                d[extract_local_tag(child.tag)] = child.text.strip() if child.text else None
            #Free the element now that its contents are copied out; without this,
            #iterparse keeps the whole document tree in memory and a multi-GB
            #file would exhaust RAM.
            elem.clear()
            batch.append(d)
            idx = idx + 1
            count = count + 1
            #Insert exactly iterate_amount records per batch (checking after the
            #increment avoids the off-by-one that produced 1001-row batches).
            if count == iterate_amount:
                #Build one DataFrame per batch -- far cheaper than appending
                #row-by-row -- indexed to match each record's global position.
                fulldf = save_oil_records(pd.DataFrame(batch, index=range(idx - count, idx)), disk_engine)
                batch = []
                count = 0
                bar.next(iterate_amount)
    #If records remain after the loop (count is not 0), insert the final partial batch
    #and advance the bar by the leftover amount before finishing.
    if count > 0:
        fulldf = save_oil_records(pd.DataFrame(batch, index=range(idx - count, idx)), disk_engine)
        bar.next(count)
        count = 0
    bar.finish()

