rm( list=ls() )
set.seed( "20150226" )

sink("finalModel-console.out", append=FALSE, split=FALSE)

### CHANGE LINE BELOW TO YOUR WORKING DIRECTORY ###
setwd( "/Users/mike/Documents/Classes/OIT 367/Project/" )

source( 'http://www.stanford.edu/~bayati/oit367/T367_utilities_10.R' )
source( 'cv_utilities.R' ) # Some useful utilities for CV and data processing

normalize = FALSE # TRUE to normalize variables

# To save computation time, we cache the processed data. If given a filename,
# the script assumes that cached data exists there and reads it. Otherwise, we
# regenerate the processed data (from raw data) and save it to write.path
data.file = "cleaned-avg_trends-seasons-weather-avg_ticket.csv"
write.path = "cleaned-all.csv"
if ( is.null( data.file ) ) {
  all.data = read.data( lags=c(1:10) )
  if ( !is.null( write.path ) ) {
    write.csv( all.data, file=write.path, row.names=FALSE )  
  }
} else {
  all.data = read.csv( data.file )
  all.data$Week = as.Date( all.data$Week, "%Y-%m-%d")
}

# Sort the data by increasing time and make sure the show name variable is
# stored as a factor
all.data = all.data[order( all.data$Week ),]
all.data$Show.name = as.factor( all.data$Show.name )

# Placeholder variables
data.to.use = all.data
response = "This.Week.s.Gross"

# The core model building logic
build.model = function( train, test, summarize=FALSE, return.rmse=TRUE ) {
  
  # First, get rid of any "forbidden" variables (i.e. those that contain future
  # information)
  train$X = NULL
  train$X.1 = NULL
  train$X.2 = NULL
  train$Diff.. = NULL
  train$Gross...of.Potential = NULL
  train$Average.Ticket = NULL
  train$This.Week.. = NULL
  train$Diff...1 = NULL
  train$Category = NULL
  train$Total.Gross = NULL
  train$Last.Week.. = NULL
  train$SeatsSold = NULL
  train$Last.Week.s.Gross = NULL
  train$Year = NULL
  train$MonthNUM = NULL
  train$QuarterFull = NULL
  
  test$X = NULL
  test$X.1 = NULL
  test$X.2 = NULL
  test$Diff.. = NULL
  test$Gross...of.Potential = NULL
  test$Average.Ticket = NULL
  test$This.Week.. = NULL
  test$Diff...1 = NULL
  test$Category = NULL
  test$Total.Gross = NULL
  test$Last.Week.. = NULL
  test$SeatsSold = NULL
  test$Last.Week.s.Gross = NULL
  test$Year = NULL
  test$MonthNUM = NULL
  test$QuarterFull = NULL

  # Clean and NAs in the data
  train = fix.na( train )
  test = fix.na( test )
  
  
  # If we've been told to normalize the variables, replace each one with the
  # normalization of itself (otherwise, this loop is effectively a noop)
  cols = colnames( train )
  types = sapply( train, class )
  train.preds = train
  for ( i in 1:length( cols ) ) {
    if ( types[i] == "numeric" || types[i] == "integer" && cols[i] != "Week" ) {
      scaled = scale( train[, cols[i]] )
      if ( attr( scaled, "scaled:scale" ) != 0 && normalize ) {
        train[, cols[i]] = scaled
        train.preds[, cols[i]] = scaled
        
        test[, cols[i]] = ( test[, cols[i]] -
                             attr( scaled, "scaled:center" ) ) /
                             attr( scaled, "scaled:scale" )
      }
    }
  }

  # Ensure that the training and test show name factors have the same number of
  # levels, to avoid cryptic errors in predict
  levels = unique( union( train$Show.name, test$Show.name ) )
  
  # Build a model matrix for the training set -- the library buildModel function
  # raised errors for some reason, so we have to roll our own matrix
  train.response = train.preds$This.Week.s.Gross
  train.preds$This.Week.s.Gross = NULL
  x = model.matrix( ~ . +
                      (.)*This.Week.s.Gross_lag_1 +
                      (.)*This.Week.s.Gross_lag_2 +
                      (.)*This.Week.s.Gross_lag_3 + 
                      (.)*This.Week.s.Gross_lag_4 + 
                      (.)*This.Week.s.Gross_lag_5 + 
                      (.)*This.Week.s.Gross_lag_6 +
                      (.)*This.Week.s.Gross_lag_7 +
                      XmasFlags*YearNum*This.Week.s.Gross_lag_1 +
                      I(TotalSeats^2) + I(Per^2) + I(Num.Shows^2), train.preds )
  
  # Build a LASSO model
  m = cv.glmnet( x=x, y=train.response, alpha=1, family="gaussian", type="mse" )

  # If we've been told to print summary statistics about the model, do that now
  if ( summarize ) {
    print( summary( m ) )
    print( coef( m ) )
    plot( m )
  }

  # Same as above, build a model matrix for the test set
  test.preds = test
  test.preds$This.Week.s.Gross = NULL
  newx = model.matrix( ~ . +
                         (.)*This.Week.s.Gross_lag_1 +
                         (.)*This.Week.s.Gross_lag_2 +
                         (.)*This.Week.s.Gross_lag_3 + 
                         (.)*This.Week.s.Gross_lag_4 + 
                         (.)*This.Week.s.Gross_lag_5 + 
                         (.)*This.Week.s.Gross_lag_6 +
                         (.)*This.Week.s.Gross_lag_7 +
                         XmasFlags*YearNum*This.Week.s.Gross_lag_1 +
                         I(TotalSeats^2) + I(Per^2) + I(Num.Shows^2),
                       test.preds )

  # Predict on the test set, being careful to manage data types
  preds = predict( m, newx=newx )
  preds = as.vector( preds )
  preds[is.na( preds )] = median( preds, na.rm=TRUE )

  # "Denormalize" the variables, if we've normalized
  scale.factor = 1
  center.factor = 0

  if ( normalize ) {
    scale.factor = attr( train$This.Week.s.Gross, "scaled:scale" )
    center.factor = attr( train$This.Week.s.Gross, "scaled:center" )
  }

  preds = ( preds * scale.factor ) + center.factor
  test$This.Week.s.Gross = ( test$This.Week.s.Gross * scale.factor ) +
                              center.factor

  # Calculate per-show RMSE and output it, if we're summarizing
  show.rmse = sqrt( mean( ( preds - test$This.Week.s.Gross )^2 ) )
  if ( summarize ) {
    cat( "Show RMSE = ", show.rmse, "\n", sep="" )
  }
  
  # Build the prediction set and calculate RMSE
  shows.preds = data.frame( Week=test$Week, This.Week.s.Gross=preds )
  preds = aggregate( shows.preds$This.Week.s.Gross, list( shows.preds$Week ),
                     sum )[, 2]
  actuals = aggregate( test$This.Week.s.Gross, list( test$Week ), sum )[, 2]
  
  rmse = sqrt( mean( ( preds - actuals )^2 ) )
  
  # Return either the predictions or the RMSE, depending on the call paramters
  if ( return.rmse ) return( c( rmse, show.rmse ) )
  else return( preds )
}

# Run the cross validation on the model and output some useful standardized
# charts about the fit
rmse = cross.validate.time.rmse( data.to.use, build.model,
                            response=response, k=20, train.len=min( 5000,
                                            floor( nrow( data.to.use ) / 2 ) ) )

show.rmse = rmse$Show.RMSE
rmse = rmse$RMSE

mean.rmse = mean( rmse )
se.rmse = sd( rmse ) / sqrt( length( rmse ) )

mean.show.rmse = mean( show.rmse )
se.show.rmse = sd( show.rmse ) / sqrt( length( show.rmse ) )

train_frac = 0.8
train.data = data.to.use[1:( train_frac * nrow( data.to.use ) ),]
test.data = data.to.use[( train_frac * nrow( data.to.use ) + 1 ):
                          nrow( data.to.use ),]

preds = build.model( train.data, test.data, return.rmse=FALSE )

unscaled = test.data

actuals = aggregate( unscaled$This.Week.s.Gross,
                     list( unscaled$Week ), sum )
weeks = actuals[, 1]
actuals = actuals[, 2]

plot.fit( weeks, NULL, actuals - preds,
          main=paste( "Residuals (RMSE = ",
                      format( mean.rmse, digits=3, scientific=TRUE ), ", SE = ",
                      format( se.rmse, digits=3, scientific=TRUE ), ")",
                      sep="" ), xlab="Week", ylab=response )

plot.fit( weeks, actuals, preds,
          main=paste( "Model vs. test set (RMSE = ",
                      format( mean.rmse, digits=3, scientific=TRUE ), ", SE = ",
                      format( se.rmse, digits=3, scientific=TRUE ), ")",
                      sep="" ), xlab="Week", ylab=response )

build.model( data.to.use, data.to.use, summarize=TRUE )

cat( "Show RMSE = ", mean.show.rmse, ", SE = ", se.show.rmse, "\n" )