rm( list=ls() ) set.seed( "20150226" ) sink("finalModel-console.out", append=FALSE, split=FALSE) ### CHANGE LINE BELOW TO YOUR WORKING DIRECTORY ### setwd( "/Users/mike/Documents/Classes/OIT 367/Project/" ) source( 'http://www.stanford.edu/~bayati/oit367/T367_utilities_10.R' ) source( 'cv_utilities.R' ) # Some useful utilities for CV and data processing normalize = FALSE # TRUE to normalize variables # To save computation time, we cache the processed data. If given a filename, # the script assumes that cached data exists there and reads it. Otherwise, we # regenerate the processed data (from raw data) and save it to write.path data.file = "cleaned-avg_trends-seasons-weather-avg_ticket.csv" write.path = "cleaned-all.csv" if ( is.null( data.file ) ) { all.data = read.data( lags=c(1:10) ) if ( !is.null( write.path ) ) { write.csv( all.data, file=write.path, row.names=FALSE ) } } else { all.data = read.csv( data.file ) all.data$Week = as.Date( all.data$Week, "%Y-%m-%d") } # Sort the data by increasing time and make sure the show name variable is # stored as a factor all.data = all.data[order( all.data$Week ),] all.data$Show.name = as.factor( all.data$Show.name ) # Placeholder variables data.to.use = all.data response = "This.Week.s.Gross" # The core model building logic build.model = function( train, test, summarize=FALSE, return.rmse=TRUE ) { # First, get rid of any "forbidden" variables (i.e. those that contain future # information) train$X = NULL train$X.1 = NULL train$X.2 = NULL train$Diff.. = NULL train$Gross...of.Potential = NULL train$Average.Ticket = NULL train$This.Week.. = NULL train$Diff...1 = NULL train$Category = NULL train$Total.Gross = NULL train$Last.Week.. = NULL train$SeatsSold = NULL train$Last.Week.s.Gross = NULL train$Year = NULL train$MonthNUM = NULL train$QuarterFull = NULL test$X = NULL test$X.1 = NULL test$X.2 = NULL test$Diff.. = NULL test$Gross...of.Potential = NULL test$Average.Ticket = NULL test$This.Week.. = NULL test$Diff...1 = NULL test$Category = NULL test$Total.Gross = NULL test$Last.Week.. = NULL test$SeatsSold = NULL test$Last.Week.s.Gross = NULL test$Year = NULL test$MonthNUM = NULL test$QuarterFull = NULL # Clean and NAs in the data train = fix.na( train ) test = fix.na( test ) # If we've been told to normalize the variables, replace each one with the # normalization of itself (otherwise, this loop is effectively a noop) cols = colnames( train ) types = sapply( train, class ) train.preds = train for ( i in 1:length( cols ) ) { if ( types[i] == "numeric" || types[i] == "integer" && cols[i] != "Week" ) { scaled = scale( train[, cols[i]] ) if ( attr( scaled, "scaled:scale" ) != 0 && normalize ) { train[, cols[i]] = scaled train.preds[, cols[i]] = scaled test[, cols[i]] = ( test[, cols[i]] - attr( scaled, "scaled:center" ) ) / attr( scaled, "scaled:scale" ) } } } # Ensure that the training and test show name factors have the same number of # levels, to avoid cryptic errors in predict levels = unique( union( train$Show.name, test$Show.name ) ) # Build a model matrix for the training set -- the library buildModel function # raised errors for some reason, so we have to roll our own matrix train.response = train.preds$This.Week.s.Gross train.preds$This.Week.s.Gross = NULL x = model.matrix( ~ . + (.)*This.Week.s.Gross_lag_1 + (.)*This.Week.s.Gross_lag_2 + (.)*This.Week.s.Gross_lag_3 + (.)*This.Week.s.Gross_lag_4 + (.)*This.Week.s.Gross_lag_5 + (.)*This.Week.s.Gross_lag_6 + (.)*This.Week.s.Gross_lag_7 + XmasFlags*YearNum*This.Week.s.Gross_lag_1 + I(TotalSeats^2) + I(Per^2) + I(Num.Shows^2), train.preds ) # Build a LASSO model m = cv.glmnet( x=x, y=train.response, alpha=1, family="gaussian", type="mse" ) # If we've been told to print summary statistics about the model, do that now if ( summarize ) { print( summary( m ) ) print( coef( m ) ) plot( m ) } # Same as above, build a model matrix for the test set test.preds = test test.preds$This.Week.s.Gross = NULL newx = model.matrix( ~ . + (.)*This.Week.s.Gross_lag_1 + (.)*This.Week.s.Gross_lag_2 + (.)*This.Week.s.Gross_lag_3 + (.)*This.Week.s.Gross_lag_4 + (.)*This.Week.s.Gross_lag_5 + (.)*This.Week.s.Gross_lag_6 + (.)*This.Week.s.Gross_lag_7 + XmasFlags*YearNum*This.Week.s.Gross_lag_1 + I(TotalSeats^2) + I(Per^2) + I(Num.Shows^2), test.preds ) # Predict on the test set, being careful to manage data types preds = predict( m, newx=newx ) preds = as.vector( preds ) preds[is.na( preds )] = median( preds, na.rm=TRUE ) # "Denormalize" the variables, if we've normalized scale.factor = 1 center.factor = 0 if ( normalize ) { scale.factor = attr( train$This.Week.s.Gross, "scaled:scale" ) center.factor = attr( train$This.Week.s.Gross, "scaled:center" ) } preds = ( preds * scale.factor ) + center.factor test$This.Week.s.Gross = ( test$This.Week.s.Gross * scale.factor ) + center.factor # Calculate per-show RMSE and output it, if we're summarizing show.rmse = sqrt( mean( ( preds - test$This.Week.s.Gross )^2 ) ) if ( summarize ) { cat( "Show RMSE = ", show.rmse, "\n", sep="" ) } # Build the prediction set and calculate RMSE shows.preds = data.frame( Week=test$Week, This.Week.s.Gross=preds ) preds = aggregate( shows.preds$This.Week.s.Gross, list( shows.preds$Week ), sum )[, 2] actuals = aggregate( test$This.Week.s.Gross, list( test$Week ), sum )[, 2] rmse = sqrt( mean( ( preds - actuals )^2 ) ) # Return either the predictions or the RMSE, depending on the call paramters if ( return.rmse ) return( c( rmse, show.rmse ) ) else return( preds ) } # Run the cross validation on the model and output some useful standardized # charts about the fit rmse = cross.validate.time.rmse( data.to.use, build.model, response=response, k=20, train.len=min( 5000, floor( nrow( data.to.use ) / 2 ) ) ) show.rmse = rmse$Show.RMSE rmse = rmse$RMSE mean.rmse = mean( rmse ) se.rmse = sd( rmse ) / sqrt( length( rmse ) ) mean.show.rmse = mean( show.rmse ) se.show.rmse = sd( show.rmse ) / sqrt( length( show.rmse ) ) train_frac = 0.8 train.data = data.to.use[1:( train_frac * nrow( data.to.use ) ),] test.data = data.to.use[( train_frac * nrow( data.to.use ) + 1 ): nrow( data.to.use ),] preds = build.model( train.data, test.data, return.rmse=FALSE ) unscaled = test.data actuals = aggregate( unscaled$This.Week.s.Gross, list( unscaled$Week ), sum ) weeks = actuals[, 1] actuals = actuals[, 2] plot.fit( weeks, NULL, actuals - preds, main=paste( "Residuals (RMSE = ", format( mean.rmse, digits=3, scientific=TRUE ), ", SE = ", format( se.rmse, digits=3, scientific=TRUE ), ")", sep="" ), xlab="Week", ylab=response ) plot.fit( weeks, actuals, preds, main=paste( "Model vs. test set (RMSE = ", format( mean.rmse, digits=3, scientific=TRUE ), ", SE = ", format( se.rmse, digits=3, scientific=TRUE ), ")", sep="" ), xlab="Week", ylab=response ) build.model( data.to.use, data.to.use, summarize=TRUE ) cat( "Show RMSE = ", mean.show.rmse, ", SE = ", se.show.rmse, "\n" )