library( zoo )
library( timeDate )

# Runs a time-series cross-validation with the given parameters and model.
# build.model( train, test ) should return a vector of predictions for test;
# it may optionally attach a "show.rmse" attribute holding a per-show RMSE.
cross.validate.time = function( data, build.model, response="Gross", k=NULL,
                                train.len=NULL, test.len=NULL,
                                min.train.length=400 ) {
  if ( is.null( k ) )
    k = floor( nrow( data ) / min.train.length )
  if ( is.null( train.len ) )
    train.len = round( nrow( data ) / k )
  if ( is.null( test.len ) )
    test.len = min( round( train.len * 0.5 ), nrow( data ) - train.len )

  rmse = rep( -1, k )
  show.rmse = rep( -1, k )
  for ( i in 1:k ) {
    # Pick a random training window, then test on the rows immediately after
    # it (the -1 keeps the test window inside the data)
    train.start = sample( nrow( data ) - ( train.len + test.len ) - 1, 1,
                          replace=FALSE )
    test.start = train.start + train.len + 1
    train = data[train.start:( train.start + train.len ),]
    test = data[test.start:( test.start + test.len ),]

    preds = build.model( train, test )
    if ( !is.null( attr( preds, "show.rmse" ) ) ) {
      show.rmse[i] = attr( preds, "show.rmse" )
    }
    rmse[i] = sqrt( mean( ( preds - test[, response] )^2 ) )
  }
  return( list( RMSE=rmse, Show.RMSE=show.rmse ) )
}

# Runs a time-series cross-validation with the given parameters and model, but
# assumes build.model returns c( overall RMSE, show-level RMSE ) instead of
# predictions
cross.validate.time.rmse = function( data, build.model, response="Gross",
                                     k=NULL, train.len=NULL, test.len=NULL,
                                     min.train.length=400 ) {
  if ( is.null( k ) )
    k = floor( nrow( data ) / min.train.length )
  if ( is.null( train.len ) )
    train.len = round( nrow( data ) / k )
  if ( is.null( test.len ) )
    test.len = min( round( train.len * 0.5 ), nrow( data ) - train.len )

  rmse = rep( -1, k )
  show.rmse = rep( -1, k )
  for ( i in 1:k ) {
    train.start = sample( nrow( data ) - ( train.len + test.len ) - 1, 1,
                          replace=FALSE )
    test.start = train.start + train.len + 1
    train = data[train.start:( train.start + train.len ),]
    test = data[test.start:( test.start + test.len ),]

    rmses = build.model( train, test )
    rmse[i] = rmses[1]
    show.rmse[i] = rmses[2]
  }
  return( list( RMSE=rmse, Show.RMSE=show.rmse ) )
}
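# Example (a minimal sketch, not part of the pipeline): cross-validating any
# model that maps a training window to predictions on the following test
# window.  The data frame "df" and its columns are hypothetical.
#
# build.mean = function( train, test ) {
#   # Predict every test week's gross as the mean of the training window
#   return( rep( mean( train$Gross ), nrow( test ) ) )
# }
# cv = cross.validate.time( df, build.mean, response="Gross" )
# cv$RMSE   # one RMSE per random train/test split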
# Produces a basic plot of a time-series fit: predictions in red, actuals in
# green.  residuals / plot.residuals are accepted but not currently used.
plot.fit = function( xvals, yvals, preds, residuals=NULL,
                     plot.residuals=FALSE, ... ) {
  plot( range( xvals ), range( c( yvals, preds ) ), type='n', ... )
  lines( xvals, preds, col="red", lwd=2 )
  lines( xvals, yvals, col="green", lwd=2 )
}

# Aggregates raw per-show data into one row of total Broadway grosses per
# week, for aggregate models
total.gross.by.week = function( data, lags=c( 1, 2 ) ) {
  Total.Gross = aggregate( data$This.Week.s.Gross, list( data$Week ), sum )
  Week = Total.Gross$Group.1
  Week.of.Year = as.factor( format( Week + 3, "%U" ) )
  Total.Gross = Total.Gross[,2]
  Total.Seats = aggregate( data$TotalSeats, list( data$Week ), sum )[,2]
  Avg.Top.Ticket = aggregate( data$Top.Ticket, list( data$Week ), mean )[,2]
  Avg.Ticket = Total.Gross / Total.Seats
  Avg.Ticket[Avg.Ticket == Inf] = 0
  Total.Performances = aggregate( data$Per, list( data$Week ), sum )[,2]

  Num.Shows = rep( 0, length( Week ) )
  for ( w in 1:length( Week ) ) {
    Num.Shows[w] = length( unique( data$Show.name[data$Week == Week[w]] ) )
  }
  Num.Musicals = rep( 0, length( Week ) )
  for ( w in 1:length( Week ) ) {
    Num.Musicals[w] = length( unique( data$Show.name[data$Week == Week[w] &
                                                     data$Category == "musical"] ) )
  }

  Median.Weeks.Since.Open = aggregate( data$Weeks.since.open,
                                       list( data$Week ), median )[,2]
  Mean.Weeks.Since.Open = aggregate( data$Weeks.since.open,
                                     list( data$Week ), mean )[,2]
  Max.Weeks.Since.Open = aggregate( data$Weeks.since.open,
                                    list( data$Week ), max )$x
  Min.Weeks.Since.Open = aggregate( data$Weeks.since.open,
                                    list( data$Week ), min )[,2]

  # Coerce DJIA to numeric; anything unparseable becomes 0
  data$DJIA = suppressWarnings( as.numeric( as.character( data$DJIA ) ) )
  data$DJIA[is.na( data$DJIA )] = 0
  DJIA = aggregate( data$DJIA, list( data$Week ), mean )[,2]
  PRCP = aggregate( data$PRCP, list( data$Week ), mean )[,2]
  SNWD = aggregate( data$SNWD, list( data$Week ), mean )[,2]
  TMIN = aggregate( data$TMIN, list( data$Week ), min )[,2]
  TMAX = aggregate( data$TMAX, list( data$Week ), max )[,2]

  All.Broadway = data.frame( cbind( Week, Week.of.Year, Total.Gross,
                                    Total.Seats, Total.Performances, Num.Shows,
                                    Avg.Top.Ticket, Median.Weeks.Since.Open,
                                    Mean.Weeks.Since.Open, Max.Weeks.Since.Open,
                                    Min.Weeks.Since.Open, Num.Musicals,
                                    Avg.Ticket, DJIA, PRCP, SNWD, TMIN,
                                    TMAX ) )
  All.Broadway$Week = Week  # restore the Date class lost inside cbind()
  All.Broadway = add.lags( All.Broadway, variable="Total.Gross", lags=lags )
  All.Broadway = add.lags( All.Broadway, variable="Avg.Ticket",
                           lags=c( 1, 2, 3 ) )
  return( All.Broadway )
}

# Adds lags of the given variable as new columns, grouping by group.by (if
# given) so that lags never cross group boundaries
#
# WARNING: Runs really slowly when given a group.by, likely because it's not
# implemented particularly efficiently
add.lags = function( data, variable, lags=c( 1 ), group.by=NULL,
                     time.field="Week", remove.NAs=TRUE ) {
  if ( !is.null( group.by ) ) {
    data = data[order( data[, group.by], data[, time.field] ),]
  }
  for ( l in lags ) {
    col.name = paste( variable, "lag", l, sep="_" )
    if ( is.null( group.by ) ) {
      data[, col.name] = rep( NA, nrow( data ) )
      data[, col.name][( l + 1 ):nrow( data )] =
        data[, variable][1:( nrow( data ) - l )]
    } else {
      data[, col.name] = rep( NA, nrow( data ) )
      cur.group = NA
      for ( i in 1:nrow( data ) ) {
        group = data[i, group.by]
        if ( is.na( cur.group ) || group != cur.group ) {
          # New group: remember where it starts, so we know which lags exist
          cur.group = group
          start.i = i
        } else if ( i - start.i >= l ) {
          data[i, col.name] = data[i - l, variable]
        }
      }
    }
  }

  # Clean NAs automatically, if we've been told to
  if ( remove.NAs ) {
    if ( is.null( group.by ) ) {
      data = data[( max( lags ) + 1 ):nrow( data ),]
    } else {
      clean.rows = rep( TRUE, nrow( data ) )
      for ( i in 1:nrow( data ) ) {
        for ( l in lags ) {
          col.name = paste( variable, "lag", l, sep="_" )
          if ( is.na( data[i, col.name] ) )
            clean.rows[i] = FALSE
        }
      }
      data = data[clean.rows,]
    }
  }
  return( data )
}
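# Example (a minimal sketch): adding per-show lags to a toy data frame.  The
# column names below are hypothetical.
#
# toy = data.frame( Show.name=rep( c( "A", "B" ), each=3 ),
#                   Week=as.Date( "2014-01-05" ) + rep( 0:2 * 7, 2 ),
#                   Gross=c( 10, 12, 11, 20, 22, 21 ) )
# add.lags( toy, variable="Gross", lags=c( 1 ), group.by="Show.name" )
# # Each show keeps only the rows where Gross_lag_1 exists, so lags never
# # leak across shows.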
# A generic "problem solver" function for cleaning data: every cell where
# test.func() returns TRUE is replaced with a sensible default for the
# column's type (numeric.func for numeric columns, factor.val for
# factors/characters, FALSE for logicals)
fix.probs = function( data, test.func, numeric.func=median,
                      factor.val="Unknown" ) {
  cleaned = data
  for ( i in 1:ncol( cleaned ) ) {
    type = class( cleaned[[i]] )[1]
    if ( sum( test.func( cleaned[, i] ) ) == 0 )
      next
    if ( type == "character" ) {
      cleaned[, i] = as.factor( cleaned[, i] )
      type = "factor"
    }
    if ( type == "factor" ) {
      # Add factor.val as a new level, then assign it to the flagged cells
      levels( cleaned[, i] )[length( levels( cleaned[, i] ) ) + 1] = factor.val
      cleaned[, i][test.func( cleaned[, i] )] = factor.val
    } else if ( type == "integer" || type == "numeric" ) {
      # numeric.func may be a constant or a summary function like median
      if ( is.numeric( numeric.func ) )
        v = numeric.func
      else
        v = numeric.func( cleaned[, i], na.rm=TRUE )
      cleaned[, i][test.func( cleaned[, i] )] = v
    } else if ( type == "logical" ) {
      cleaned[, i][test.func( cleaned[, i] )] = FALSE
    }
  }
  return( cleaned )
}

# Removes NAs in data: just fix.probs() with is.na as the test
fix.na = function( data, numeric.func=median, factor.val="Unknown" ) {
  return( fix.probs( data, is.na, numeric.func=numeric.func,
                     factor.val=factor.val ) )
}
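# Example (a minimal sketch): the same cleaner handles NAs and any other
# "problem" values.  The data frame here is hypothetical.
#
# df = data.frame( x=c( 1, NA, 3 ), y=c( "a", NA, "b" ),
#                  stringsAsFactors=FALSE )
# fix.na( df )          # NA -> column median for x, "Unknown" level for y
# fix.probs( df, function( v ) is.na( v ) | v == 3, numeric.func=0 )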
# Aggregates and adds Google Trends data to data, as separate columns
add.trends = function( data ) {
  # Files containing semi-raw Trends data (from the Python script): one row
  # per show, one column per week relative to opening
  trends.files = list( name=read.csv( "trends-name.csv" ),
                       tickets=read.csv( "trends-name-tickets.csv" ),
                       broadway=read.csv( "trends-name-broadway.csv" ),
                       name.rel=read.csv( "trends-name-relative.csv" ),
                       tickets.rel=read.csv( "trends-name-tickets-relative.csv" ),
                       broadway.rel=read.csv( "trends-name-broadway-relative.csv" ) )

  # First, clean up the data: empty and "N/A" cells become 0
  for ( f in names( trends.files ) ) {
    for ( c in 2:ncol( trends.files[[f]] ) ) {
      to.replace = sapply( trends.files[[f]][,c],
                           FUN=function( x ) { return( x == "" || x == "N/A" ) } )
      trends.files[[f]][,c] = replace( trends.files[[f]][,c], to.replace, 0 )
      trends.files[[f]][,c] = as.numeric( as.character( trends.files[[f]][,c] ) )
    }
  }

  # Compute the pre-opening baselines: mean search volume over the 1, 3, and
  # 6 months and the 1 year before each show opened
  baselines = list()
  for ( f in names( trends.files ) ) {
    n.shows = nrow( trends.files[[f]] )
    baselines[[f]] = list( one.mos.pre=rep( 1, n.shows ),
                           three.mos.pre=rep( 1, n.shows ),
                           six.mos.pre=rep( 1, n.shows ),
                           one.yr.pre=rep( 1, n.shows ) )
  }

  # Week w (relative to opening) lives in column w + 104 + 2 of the CSVs
  week.to.col = function( w ) {
    return( w + 104 + 2 )
  }

  for ( f in names( baselines ) ) {
    for ( i in 1:nrow( trends.files[[f]] ) ) {
      baselines[[f]]$one.mos.pre[i] =
        mean( as.numeric( trends.files[[f]][i, week.to.col( -5 ):week.to.col( -1 )] ) )
      baselines[[f]]$three.mos.pre[i] =
        mean( as.numeric( trends.files[[f]][i, week.to.col( -13 ):week.to.col( -1 )] ) )
      baselines[[f]]$six.mos.pre[i] =
        mean( as.numeric( trends.files[[f]][i, week.to.col( -25 ):week.to.col( -1 )] ) )
      baselines[[f]]$one.yr.pre[i] =
        mean( as.numeric( trends.files[[f]][i, week.to.col( -53 ):week.to.col( -1 )] ) )
    }
  }

  # Parameters for lag terms, denormalization denominators, and average terms
  lags = 1:10
  terms = c( "name", "tickets", "broadway" )
  denoms = c( "1-mo", "3-mo", "6-mo", "1-yr" )
  averages = c( 2, 4, 6, 12, 26 )

  # Create columns for all of these terms (note the complexity: one column
  # per term x denominator x lag)
  for ( f in terms ) {
    for ( p in denoms ) {
      for ( l in lags ) {
        col.name = paste( f, "-", l, ":", p, sep="" )
        data[, col.name] = rep( 0, nrow( data ) )
      }
    }
  }

  # Add columns comparing each shorter pre-open baseline to the 1-year one
  for ( f in terms ) {
    for ( d in denoms ) {
      e = "1-yr"
      if ( d == e )
        next
      col.name = paste( f, "-", d, ":", e, sep="" )
      data[, col.name] = rep( 0, nrow( data ) )
    }
  }

  # And a couple more: relative-query lags and rolling averages
  for ( f in c( "tickets", "broadway" ) ) {
    for ( l in lags ) {
      col.name = paste( f, "-", l, "-percent-of-tot", sep="" )
      data[, col.name] = rep( 0, nrow( data ) )
    }
    for ( a in averages ) {
      col.name = paste( f, "-", a, "-week-average", sep="" )
      data[, col.name] = rep( 0, nrow( data ) )
    }
  }

  # Now fill those columns
  max.week = 623  # Maximum week to search for, to speed things up a little
  for ( i in 1:nrow( data ) ) {
    cur.week = data[i, "Weeks.since.open"]
    show.i = which.max( as.character( trends.files$name.rel[,1] ) ==
                        as.character( data[i, "Show.name"] ) )

    # Add the fixed pre-open average columns
    for ( f in terms ) {
      for ( d in denoms ) {
        e = "1-yr"
        if ( d == e )
          next
        col.name = paste( f, "-", d, ":", e, sep="" )
        denom = baselines[[f]]$one.yr.pre[show.i]
        numer = 0
        if ( d == "1-mo" ) numer = baselines[[f]]$one.mos.pre[show.i]
        if ( d == "3-mo" ) numer = baselines[[f]]$three.mos.pre[show.i]
        if ( d == "6-mo" ) numer = baselines[[f]]$six.mos.pre[show.i]
        v = numer / denom
        if ( !is.null( v ) && !is.na( v ) && v != Inf ) {
          data[i, col.name] = v
        }
      }
    }

    if ( cur.week > max.week )
      next

    # Grab the relative query rows, we'll need them soon
    name.rel.row = trends.files$name.rel[show.i,]
    tickets.rel.row = trends.files$tickets.rel[show.i,]
    broadway.rel.row = trends.files$broadway.rel[show.i,]

    # Add the relative query lags
    for ( l in lags ) {
      tickets.rel = as.numeric( tickets.rel.row[week.to.col( cur.week - l )] ) /
        as.numeric( name.rel.row[week.to.col( cur.week - l )] )
      broadway.rel = as.numeric( broadway.rel.row[week.to.col( cur.week - l )] ) /
        as.numeric( name.rel.row[week.to.col( cur.week - l )] )
      if ( !is.na( tickets.rel ) && tickets.rel != Inf ) {
        data[i, paste( "tickets-", l, "-percent-of-tot", sep="" )] = tickets.rel
      }
      if ( !is.na( broadway.rel ) && broadway.rel != Inf ) {
        data[i, paste( "broadway-", l, "-percent-of-tot", sep="" )] = broadway.rel
      }
    }

    # Now add some averaged relative terms
    for ( a in averages ) {
      tickets.rel =
        mean( as.numeric( tickets.rel.row[week.to.col( cur.week - a - 1 ):
                                          week.to.col( cur.week - 1 )] ) ) /
        mean( as.numeric( name.rel.row[week.to.col( cur.week - a - 1 ):
                                       week.to.col( cur.week - 1 )] ) )
      broadway.rel =
        mean( as.numeric( broadway.rel.row[week.to.col( cur.week - a - 1 ):
                                           week.to.col( cur.week - 1 )] ) ) /
        mean( as.numeric( name.rel.row[week.to.col( cur.week - a - 1 ):
                                       week.to.col( cur.week - 1 )] ) )
      if ( !is.na( tickets.rel ) && tickets.rel != Inf ) {
        data[i, paste( "tickets-", a, "-week-average", sep="" )] = tickets.rel
      }
      if ( !is.na( broadway.rel ) && broadway.rel != Inf ) {
        data[i, paste( "broadway-", a, "-week-average", sep="" )] = broadway.rel
      }
    }

    # Finally, add the absolute terms, divided by all of the different
    # denominators we're trying
    for ( f in terms ) {
      for ( p in denoms ) {
        denom = NULL
        if ( p == "1-mo" ) denom = baselines[[f]]$one.mos.pre[show.i]
        if ( p == "3-mo" ) denom = baselines[[f]]$three.mos.pre[show.i]
        if ( p == "6-mo" ) denom = baselines[[f]]$six.mos.pre[show.i]
        if ( p == "1-yr" ) denom = baselines[[f]]$one.yr.pre[show.i]
        for ( l in lags ) {
          num = as.numeric( trends.files[[f]][show.i, week.to.col( cur.week - l )] )
          rel = as.numeric( num / denom )
          if ( !is.null( rel ) && !is.na( rel ) && rel != Inf ) {
            col.name = paste( f, "-", l, ":", p, sep="" )
            data[i, col.name] = rel
          }
        }
      }
    }
  }
  return( data )
}
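# Example (a minimal sketch of the feature scheme above; week.to.col is
# internal to add.trends and the numbers are made up): for a show in its 30th
# week, the column "tickets-2:6-mo" holds the search volume for
# "<name> tickets" in week 28, divided by that query's average volume over
# the six months before opening.
#
# week.to.col( 28 )   # -> 134, the CSV column holding week 28
# 42 / 17.5           # -> 2.4, i.e. interest is 2.4x the pre-open baseline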
# Process the raw data (takes a long time, so we usually cache the result...)
read.data = function( lags=c( 1:20 ) ) {
  # Read the raw grosses data and do a little type cleaning
  all.data = read.csv( "grosses.csv" )
  all.data$Week = as.Date( all.data$Week, format="%m/%d/%y" )
  all.data = all.data[order( all.data$Show.name, all.data$Week ),]

  # Add lags of the response variable
  all.data = add.lags( all.data, "This.Week.s.Gross", lags=lags,
                       group.by="Show.name" )

  # Add some summary columns about how many shows are running each week
  Num.Shows = rep( 0, length( all.data$Week ) )
  Num.Musicals = rep( 0, length( all.data$Week ) )
  for ( w in 1:length( all.data$Week ) ) {
    Num.Shows[w] = length( unique( all.data$Show.name[all.data$Week ==
                                                      all.data$Week[w]] ) )
    Num.Musicals[w] = length( unique( all.data$Show.name[
      all.data$Week == all.data$Week[w] & all.data$Category == "musical"] ) )
  }

  # Bin the category variable
  all.data$Is.Play = all.data$Category == "play"
  all.data$Is.Musical = all.data$Category == "musical"

  # Aggregate total grosses
  all.data$Num.Shows = Num.Shows
  all.data$Num.Musicals = Num.Musicals
  all.data$Total.Gross = rep( 0, nrow( all.data ) )
  for ( i in 1:nrow( all.data ) ) {
    all.data$Total.Gross[i] = sum( all.data$This.Week.s.Gross[
      all.data$Week == all.data$Week[i]] )
  }
  all.data = all.data[order( all.data$Week ),]

  # Add trends and seasonality predictors
  all.data = add.trends( all.data )
  all.data = add.seasonality( all.data )
  return( all.data )
}

# Adds seasonality predictors: month, quarter, season, year, and holiday flags
add.seasonality = function( data ) {
  data$Month = format( data$Week, "%b" )  # can also use as.yearmon
  data$MonthNUM = as.numeric( format( data$Week, "%m" ) )
  # Shift by one month before converting to quarters, so quarters (and hence
  # seasons) start in December, March, June, and September
  data$QuarterFull = as.yearqtr( as.yearmon( data$Week ) + 1/12 )
  data$QuarterNum = factor( format( data$QuarterFull, "%q" ), levels=1:4 )
  data$Season = factor( format( data$QuarterFull, "%q" ), levels=1:4,
                        labels=c( "winter", "spring", "summer", "fall" ) )
  data$Year = format( data$Week, "%Y" )
  data$YearNum = as.numeric( format( data$Week, "%Y" ) ) - 1983

  # Tag each week that falls within 5 days of a public holiday.  The full
  # holiday list (G-7 financial centers, likely meaningless as a predictor)
  # is loaded but currently unused; we only flag NYSE holidays, Christmas,
  # and Thanksgiving below.
  allHolidays = as.data.frame( holiday( 1984:2014, listHolidays() ) )
  colnames( allHolidays )[1] = "Date"
  n = length( data$Week )
  m = length( allHolidays$Date )

  # Create flags for NYSE holidays
  NYSEHolidays = as.data.frame( holidayNYSE( 1984:2014 ) )
  colnames( NYSEHolidays )[1] = "Date"
  o = length( NYSEHolidays$Date )
  data$NYSEHolidayFlags = 0
  for ( j in 1:o ) {
    for ( i in 1:n ) {
      if ( abs( data$Week[i] - as.Date( NYSEHolidays$Date[j] ) ) < 6 ) {
        data$NYSEHolidayFlags[i] = 1
      }
    }
  }

  # Create flags for Christmas weeks
  Xmas = as.data.frame( holiday( 1984:2014, "ChristmasDay" ) )
  colnames( Xmas )[1] = "Date"
  p = length( Xmas$Date )
  data$XmasFlags = 0
  for ( j in 1:p ) {
    for ( i in 1:n ) {
      if ( abs( data$Week[i] - as.Date( Xmas$Date[j] ) ) < 6 ) {
        data$XmasFlags[i] = 1
      }
    }
  }

  # Create flags for Thanksgiving weeks
  Thanksgiv = as.data.frame( holiday( 1984:2014, "USThanksgivingDay" ) )
  colnames( Thanksgiv )[1] = "Date"
  q = length( Thanksgiv$Date )
  data$ThanksgivFlags = 0
  for ( j in 1:q ) {
    for ( i in 1:n ) {
      if ( abs( data$Week[i] - as.Date( Thanksgiv$Date[j] ) ) < 6 ) {
        data$ThanksgivFlags[i] = 1
      }
    }
  }
  return( data )
}
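# Example (a minimal sketch): the one-month shift above maps December into
# the next quarter, so December, January, and February all land in "winter":
#
# d = as.Date( c( "2013-12-15", "2014-01-15", "2014-03-15" ) )
# format( as.yearqtr( as.yearmon( d ) + 1/12 ), "%q" )   # "1" "1" "2"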
# Adds weather and DJIA data from a separate file, joined on week
add.weather = function( data, file="newGrosses.csv" ) {
  weather = read.csv( file )
  weather$Week = as.Date( weather$Week, "%m/%d/%y" )
  weather$DJIA = suppressWarnings( as.numeric( as.character( weather$DJIA ) ) )
  weather$DJIA[is.na( weather$DJIA )] = 0

  # Aggregate to one value per week
  DJIA = aggregate( weather$DJIA, list( weather$Week ), mean )
  PRCP = aggregate( weather$PRCP, list( weather$Week ), mean )
  SNWD = aggregate( weather$SNWD, list( weather$Week ), mean )
  TMIN = aggregate( weather$TMIN, list( weather$Week ), min )
  TMAX = aggregate( weather$TMAX, list( weather$Week ), max )

  data$DJIA = rep( -1, nrow( data ) )
  data$PRCP = rep( -1, nrow( data ) )
  data$SNWD = rep( -1, nrow( data ) )
  data$TMAX = rep( -1, nrow( data ) )
  data$TMIN = rep( -1, nrow( data ) )
  for ( i in 1:nrow( data ) ) {
    w = data[i, "Week"]
    # match() finds the aggregated row whose week equals w
    data[i, "DJIA"] = DJIA[match( w, DJIA[,1] ), 2]
    data[i, "PRCP"] = PRCP[match( w, PRCP[,1] ), 2]
    data[i, "SNWD"] = SNWD[match( w, SNWD[,1] ), 2]
    data[i, "TMAX"] = TMAX[match( w, TMAX[,1] ), 2]
    data[i, "TMIN"] = TMIN[match( w, TMIN[,1] ), 2]
  }
  return( data )
}
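# End-to-end usage (a minimal sketch; the cache file name and the model
# formula are assumptions, not part of this script):
#
# all.data = read.data()                # slow: parses grosses, lags, trends
# all.data = add.weather( all.data )
# saveRDS( all.data, "all-data.rds" )   # cache, since read.data() is slow
# # all.data = readRDS( "all-data.rds" )
#
# weekly = total.gross.by.week( all.data )
# build.lm = function( train, test ) {
#   fit = lm( Total.Gross ~ Total.Gross_lag_1 + Total.Gross_lag_2, data=train )
#   return( predict( fit, newdata=test ) )
# }
# cv = cross.validate.time( weekly, build.lm, response="Total.Gross" )
# mean( cv$RMSE )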