library( zoo )
library( timeDate )

# Runs a time-series cross-validation with the given parameters and model.
# build.model( train, test ) should return a vector of predictions for test;
# it may optionally attach a "show.rmse" attribute holding a per-show RMSE.
cross.validate.time = function( data, build.model, response="Gross", k=NULL,
                                train.len=NULL, test.len=NULL,
                                min.train.length=400 ) {
  if ( is.null( k ) )
    k = floor( nrow( data ) / min.train.length )
  if ( is.null( train.len ) )
    train.len = round( nrow( data ) / k )
  if ( is.null( test.len ) )
    test.len = min( round( train.len * 0.5 ), nrow( data ) - train.len )

  rmse = rep( -1, k )
  show.rmse = rep( -1, k )
  for ( i in 1:k ) {
    # Pick a random training window, then test on the rows immediately after
    # it (the -1 keeps the test window inside the data)
    train.start = sample( nrow( data ) - ( train.len + test.len ) - 1, 1,
                          replace=FALSE )
    test.start = train.start + train.len + 1
    train = data[train.start:( train.start + train.len ),]
    test = data[test.start:( test.start + test.len ),]

    preds = build.model( train, test )
    if ( !is.null( attr( preds, "show.rmse" ) ) ) {
      show.rmse[i] = attr( preds, "show.rmse" )
    }
    rmse[i] = sqrt( mean( ( preds - test[, response] )^2 ) )
  }
  return( list( RMSE=rmse, Show.RMSE=show.rmse ) )
}

# Runs a time-series cross-validation with the given parameters and model, but
# assumes build.model returns c( overall RMSE, show-level RMSE ) instead of
# predictions
cross.validate.time.rmse = function( data, build.model, response="Gross",
                                     k=NULL, train.len=NULL, test.len=NULL,
                                     min.train.length=400 ) {
  if ( is.null( k ) )
    k = floor( nrow( data ) / min.train.length )
  if ( is.null( train.len ) )
    train.len = round( nrow( data ) / k )
  if ( is.null( test.len ) )
    test.len = min( round( train.len * 0.5 ), nrow( data ) - train.len )

  rmse = rep( -1, k )
  show.rmse = rep( -1, k )
  for ( i in 1:k ) {
    train.start = sample( nrow( data ) - ( train.len + test.len ) - 1, 1,
                          replace=FALSE )
    test.start = train.start + train.len + 1
    train = data[train.start:( train.start + train.len ),]
    test = data[test.start:( test.start + test.len ),]

    rmses = build.model( train, test )
    rmse[i] = rmses[1]
    show.rmse[i] = rmses[2]
  }
  return( list( RMSE=rmse, Show.RMSE=show.rmse ) )
}
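# Example (a minimal sketch, not part of the pipeline): cross-validating any
# model that maps a training window to predictions on the following test
# window.  The data frame "df" and its columns are hypothetical.
#
# build.mean = function( train, test ) {
#   # Predict every test week's gross as the mean of the training window
#   return( rep( mean( train$Gross ), nrow( test ) ) )
# }
# cv = cross.validate.time( df, build.mean, response="Gross" )
# cv$RMSE   # one RMSE per random train/test split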
# Produces a basic plot of a time-series fit: predictions in red, actuals in
# green.  residuals / plot.residuals are accepted but not currently used.
plot.fit = function( xvals, yvals, preds, residuals=NULL,
                     plot.residuals=FALSE, ... ) {
  plot( range( xvals ), range( c( yvals, preds ) ), type='n', ... )
  lines( xvals, preds, col="red", lwd=2 )
  lines( xvals, yvals, col="green", lwd=2 )
}

# Aggregates raw per-show data into one row of total Broadway grosses per
# week, for aggregate models
total.gross.by.week = function( data, lags=c( 1, 2 ) ) {
  Total.Gross = aggregate( data$This.Week.s.Gross, list( data$Week ), sum )
  Week = Total.Gross$Group.1
  Week.of.Year = as.factor( format( Week + 3, "%U" ) )
  Total.Gross = Total.Gross[,2]
  Total.Seats = aggregate( data$TotalSeats, list( data$Week ), sum )[,2]
  Avg.Top.Ticket = aggregate( data$Top.Ticket, list( data$Week ), mean )[,2]
  Avg.Ticket = Total.Gross / Total.Seats
  Avg.Ticket[Avg.Ticket == Inf] = 0
  Total.Performances = aggregate( data$Per, list( data$Week ), sum )[,2]

  Num.Shows = rep( 0, length( Week ) )
  for ( w in 1:length( Week ) ) {
    Num.Shows[w] = length( unique( data$Show.name[data$Week == Week[w]] ) )
  }
  Num.Musicals = rep( 0, length( Week ) )
  for ( w in 1:length( Week ) ) {
    Num.Musicals[w] = length( unique( data$Show.name[data$Week == Week[w] &
                                                     data$Category == "musical"] ) )
  }

  Median.Weeks.Since.Open = aggregate( data$Weeks.since.open,
                                       list( data$Week ), median )[,2]
  Mean.Weeks.Since.Open = aggregate( data$Weeks.since.open,
                                     list( data$Week ), mean )[,2]
  Max.Weeks.Since.Open = aggregate( data$Weeks.since.open,
                                    list( data$Week ), max )$x
  Min.Weeks.Since.Open = aggregate( data$Weeks.since.open,
                                    list( data$Week ), min )[,2]

  # Coerce DJIA to numeric; anything unparseable becomes 0
  data$DJIA = suppressWarnings( as.numeric( as.character( data$DJIA ) ) )
  data$DJIA[is.na( data$DJIA )] = 0
  DJIA = aggregate( data$DJIA, list( data$Week ), mean )[,2]
  PRCP = aggregate( data$PRCP, list( data$Week ), mean )[,2]
  SNWD = aggregate( data$SNWD, list( data$Week ), mean )[,2]
  TMIN = aggregate( data$TMIN, list( data$Week ), min )[,2]
  TMAX = aggregate( data$TMAX, list( data$Week ), max )[,2]

  All.Broadway = data.frame( cbind( Week, Week.of.Year, Total.Gross,
                                    Total.Seats, Total.Performances, Num.Shows,
                                    Avg.Top.Ticket, Median.Weeks.Since.Open,
                                    Mean.Weeks.Since.Open, Max.Weeks.Since.Open,
                                    Min.Weeks.Since.Open, Num.Musicals,
                                    Avg.Ticket, DJIA, PRCP, SNWD, TMIN,
                                    TMAX ) )
  All.Broadway$Week = Week  # restore the Date class lost inside cbind()
  All.Broadway = add.lags( All.Broadway, variable="Total.Gross", lags=lags )
  All.Broadway = add.lags( All.Broadway, variable="Avg.Ticket",
                           lags=c( 1, 2, 3 ) )
  return( All.Broadway )
}

# Adds lags of the given variable as new columns, grouping by group.by (if
# given) so that lags never cross group boundaries
#
# WARNING: Runs really slowly when given a group.by, likely because it's not
# implemented particularly efficiently
add.lags = function( data, variable, lags=c( 1 ), group.by=NULL,
                     time.field="Week", remove.NAs=TRUE ) {
  if ( !is.null( group.by ) ) {
    data = data[order( data[, group.by], data[, time.field] ),]
  }
  for ( l in lags ) {
    col.name = paste( variable, "lag", l, sep="_" )
    if ( is.null( group.by ) ) {
      data[, col.name] = rep( NA, nrow( data ) )
      data[, col.name][( l + 1 ):nrow( data )] =
        data[, variable][1:( nrow( data ) - l )]
    } else {
      data[, col.name] = rep( NA, nrow( data ) )
      cur.group = NA
      for ( i in 1:nrow( data ) ) {
        group = data[i, group.by]
        if ( is.na( cur.group ) || group != cur.group ) {
          # New group: remember where it starts, so we know which lags exist
          cur.group = group
          start.i = i
        } else if ( i - start.i >= l ) {
          data[i, col.name] = data[i - l, variable]
        }
      }
    }
  }

  # Clean NAs automatically, if we've been told to
  if ( remove.NAs ) {
    if ( is.null( group.by ) ) {
      data = data[( max( lags ) + 1 ):nrow( data ),]
    } else {
      clean.rows = rep( TRUE, nrow( data ) )
      for ( i in 1:nrow( data ) ) {
        for ( l in lags ) {
          col.name = paste( variable, "lag", l, sep="_" )
          if ( is.na( data[i, col.name] ) )
            clean.rows[i] = FALSE
        }
      }
      data = data[clean.rows,]
    }
  }
  return( data )
}
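# Example (a minimal sketch): adding per-show lags to a toy data frame.  The
# column names below are hypothetical.
#
# toy = data.frame( Show.name=rep( c( "A", "B" ), each=3 ),
#                   Week=as.Date( "2014-01-05" ) + rep( 0:2 * 7, 2 ),
#                   Gross=c( 10, 12, 11, 20, 22, 21 ) )
# add.lags( toy, variable="Gross", lags=c( 1 ), group.by="Show.name" )
# # Each show keeps only the rows where Gross_lag_1 exists, so lags never
# # leak across shows.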
# A generic "problem solver" function for cleaning data: every cell where
# test.func() returns TRUE is replaced with a sensible default for the
# column's type (numeric.func for numeric columns, factor.val for
# factors/characters, FALSE for logicals)
fix.probs = function( data, test.func, numeric.func=median,
                      factor.val="Unknown" ) {
  cleaned = data
  for ( i in 1:ncol( cleaned ) ) {
    type = class( cleaned[[i]] )[1]
    if ( sum( test.func( cleaned[, i] ) ) == 0 )
      next
    if ( type == "character" ) {
      cleaned[, i] = as.factor( cleaned[, i] )
      type = "factor"
    }
    if ( type == "factor" ) {
      # Add factor.val as a new level, then assign it to the flagged cells
      levels( cleaned[, i] )[length( levels( cleaned[, i] ) ) + 1] = factor.val
      cleaned[, i][test.func( cleaned[, i] )] = factor.val
    } else if ( type == "integer" || type == "numeric" ) {
      # numeric.func may be a constant or a summary function like median
      if ( is.numeric( numeric.func ) )
        v = numeric.func
      else
        v = numeric.func( cleaned[, i], na.rm=TRUE )
      cleaned[, i][test.func( cleaned[, i] )] = v
    } else if ( type == "logical" ) {
      cleaned[, i][test.func( cleaned[, i] )] = FALSE
    }
  }
  return( cleaned )
}

# Removes NAs in data: just fix.probs() with is.na as the test
fix.na = function( data, numeric.func=median, factor.val="Unknown" ) {
  return( fix.probs( data, is.na, numeric.func=numeric.func,
                     factor.val=factor.val ) )
}
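# Example (a minimal sketch): the same cleaner handles NAs and any other
# "problem" values.  The data frame here is hypothetical.
#
# df = data.frame( x=c( 1, NA, 3 ), y=c( "a", NA, "b" ),
#                  stringsAsFactors=FALSE )
# fix.na( df )          # NA -> column median for x, "Unknown" level for y
# fix.probs( df, function( v ) is.na( v ) | v == 3, numeric.func=0 )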
# Aggregates and adds Google Trends data to data, as separate columns
add.trends = function( data ) {
  # Files containing semi-raw Trends data (from the Python script): one row
  # per show, one column per week relative to opening
  trends.files = list( name=read.csv( "trends-name.csv" ),
                       tickets=read.csv( "trends-name-tickets.csv" ),
                       broadway=read.csv( "trends-name-broadway.csv" ),
                       name.rel=read.csv( "trends-name-relative.csv" ),
                       tickets.rel=read.csv( "trends-name-tickets-relative.csv" ),
                       broadway.rel=read.csv( "trends-name-broadway-relative.csv" ) )

  # First, clean up the data: empty and "N/A" cells become 0
  for ( f in names( trends.files ) ) {
    for ( c in 2:ncol( trends.files[[f]] ) ) {
      to.replace = sapply( trends.files[[f]][,c],
                           FUN=function( x ) { return( x == "" || x == "N/A" ) } )
      trends.files[[f]][,c] = replace( trends.files[[f]][,c], to.replace, 0 )
      trends.files[[f]][,c] = as.numeric( as.character( trends.files[[f]][,c] ) )
    }
  }

  # Compute the pre-opening baselines: mean search volume over the 1, 3, and
  # 6 months and the 1 year before each show opened
  baselines = list()
  for ( f in names( trends.files ) ) {
    n.shows = nrow( trends.files[[f]] )
    baselines[[f]] = list( one.mos.pre=rep( 1, n.shows ),
                           three.mos.pre=rep( 1, n.shows ),
                           six.mos.pre=rep( 1, n.shows ),
                           one.yr.pre=rep( 1, n.shows ) )
  }

  # Week w (relative to opening) lives in column w + 104 + 2 of the CSVs
  week.to.col = function( w ) {
    return( w + 104 + 2 )
  }

  for ( f in names( baselines ) ) {
    for ( i in 1:nrow( trends.files[[f]] ) ) {
      baselines[[f]]$one.mos.pre[i] =
        mean( as.numeric( trends.files[[f]][i, week.to.col( -5 ):week.to.col( -1 )] ) )
      baselines[[f]]$three.mos.pre[i] =
        mean( as.numeric( trends.files[[f]][i, week.to.col( -13 ):week.to.col( -1 )] ) )
      baselines[[f]]$six.mos.pre[i] =
        mean( as.numeric( trends.files[[f]][i, week.to.col( -25 ):week.to.col( -1 )] ) )
      baselines[[f]]$one.yr.pre[i] =
        mean( as.numeric( trends.files[[f]][i, week.to.col( -53 ):week.to.col( -1 )] ) )
    }
  }

  # Parameters for lag terms, denormalization denominators, and average terms
  lags = 1:10
  terms = c( "name", "tickets", "broadway" )
  denoms = c( "1-mo", "3-mo", "6-mo", "1-yr" )
  averages = c( 2, 4, 6, 12, 26 )

  # Create columns for all of these terms (note the complexity: one column
  # per term x denominator x lag)
  for ( f in terms ) {
    for ( p in denoms ) {
      for ( l in lags ) {
        col.name = paste( f, "-", l, ":", p, sep="" )
        data[, col.name] = rep( 0, nrow( data ) )
      }
    }
  }

  # Add columns comparing each shorter pre-open baseline to the 1-year one
  for ( f in terms ) {
    for ( d in denoms ) {
      e = "1-yr"
      if ( d == e )
        next
      col.name = paste( f, "-", d, ":", e, sep="" )
      data[, col.name] = rep( 0, nrow( data ) )
    }
  }

  # And a couple more: relative-query lags and rolling averages
  for ( f in c( "tickets", "broadway" ) ) {
    for ( l in lags ) {
      col.name = paste( f, "-", l, "-percent-of-tot", sep="" )
      data[, col.name] = rep( 0, nrow( data ) )
    }
    for ( a in averages ) {
      col.name = paste( f, "-", a, "-week-average", sep="" )
      data[, col.name] = rep( 0, nrow( data ) )
    }
  }

  # Now fill those columns
  max.week = 623  # Maximum week to search for, to speed things up a little
  for ( i in 1:nrow( data ) ) {
    cur.week = data[i, "Weeks.since.open"]
    show.i = which.max( as.character( trends.files$name.rel[,1] ) ==
                        as.character( data[i, "Show.name"] ) )

    # Add the fixed pre-open average columns
    for ( f in terms ) {
      for ( d in denoms ) {
        e = "1-yr"
        if ( d == e )
          next
        col.name = paste( f, "-", d, ":", e, sep="" )
        denom = baselines[[f]]$one.yr.pre[show.i]
        numer = 0
        if ( d == "1-mo" ) numer = baselines[[f]]$one.mos.pre[show.i]
        if ( d == "3-mo" ) numer = baselines[[f]]$three.mos.pre[show.i]
        if ( d == "6-mo" ) numer = baselines[[f]]$six.mos.pre[show.i]
        v = numer / denom
        if ( !is.null( v ) && !is.na( v ) && v != Inf ) {
          data[i, col.name] = v
        }
      }
    }

    if ( cur.week > max.week )
      next

    # Grab the relative query rows, we'll need them soon
    name.rel.row = trends.files$name.rel[show.i,]
    tickets.rel.row = trends.files$tickets.rel[show.i,]
    broadway.rel.row = trends.files$broadway.rel[show.i,]

    # Add the relative query lags
    for ( l in lags ) {
      tickets.rel = as.numeric( tickets.rel.row[week.to.col( cur.week - l )] ) /
        as.numeric( name.rel.row[week.to.col( cur.week - l )] )
      broadway.rel = as.numeric( broadway.rel.row[week.to.col( cur.week - l )] ) /
        as.numeric( name.rel.row[week.to.col( cur.week - l )] )
      if ( !is.na( tickets.rel ) && tickets.rel != Inf ) {
        data[i, paste( "tickets-", l, "-percent-of-tot", sep="" )] = tickets.rel
      }
      if ( !is.na( broadway.rel ) && broadway.rel != Inf ) {
        data[i, paste( "broadway-", l, "-percent-of-tot", sep="" )] = broadway.rel
      }
    }

    # Now add some averaged relative terms
    for ( a in averages ) {
      tickets.rel =
        mean( as.numeric( tickets.rel.row[week.to.col( cur.week - a - 1 ):
                                          week.to.col( cur.week - 1 )] ) ) /
        mean( as.numeric( name.rel.row[week.to.col( cur.week - a - 1 ):
                                       week.to.col( cur.week - 1 )] ) )
      broadway.rel =
        mean( as.numeric( broadway.rel.row[week.to.col( cur.week - a - 1 ):
                                           week.to.col( cur.week - 1 )] ) ) /
        mean( as.numeric( name.rel.row[week.to.col( cur.week - a - 1 ):
                                       week.to.col( cur.week - 1 )] ) )
      if ( !is.na( tickets.rel ) && tickets.rel != Inf ) {
        data[i, paste( "tickets-", a, "-week-average", sep="" )] = tickets.rel
      }
      if ( !is.na( broadway.rel ) && broadway.rel != Inf ) {
        data[i, paste( "broadway-", a, "-week-average", sep="" )] = broadway.rel
      }
    }

    # Finally, add the absolute terms, divided by all of the different
    # denominators we're trying
    for ( f in terms ) {
      for ( p in denoms ) {
        denom = NULL
        if ( p == "1-mo" ) denom = baselines[[f]]$one.mos.pre[show.i]
        if ( p == "3-mo" ) denom = baselines[[f]]$three.mos.pre[show.i]
        if ( p == "6-mo" ) denom = baselines[[f]]$six.mos.pre[show.i]
        if ( p == "1-yr" ) denom = baselines[[f]]$one.yr.pre[show.i]
        for ( l in lags ) {
          num = as.numeric( trends.files[[f]][show.i, week.to.col( cur.week - l )] )
          rel = as.numeric( num / denom )
          if ( !is.null( rel ) && !is.na( rel ) && rel != Inf ) {
            col.name = paste( f, "-", l, ":", p, sep="" )
            data[i, col.name] = rel
          }
        }
      }
    }
  }
  return( data )
}
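# Example (a minimal sketch of the feature scheme above; week.to.col is
# internal to add.trends and the numbers are made up): for a show in its 30th
# week, the column "tickets-2:6-mo" holds the search volume for
# "<name> tickets" in week 28, divided by that query's average volume over
# the six months before opening.
#
# week.to.col( 28 )   # -> 134, the CSV column holding week 28
# 42 / 17.5           # -> 2.4, i.e. interest is 2.4x the pre-open baseline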
# Process the raw data (takes a long time, so we usually cache the result...)
read.data = function( lags=c( 1:20 ) ) {
  # Read the raw grosses data and do a little type cleaning
  all.data = read.csv( "grosses.csv" )
  all.data$Week = as.Date( all.data$Week, format="%m/%d/%y" )
  all.data = all.data[order( all.data$Show.name, all.data$Week ),]

  # Add lags of the response variable
  all.data = add.lags( all.data, "This.Week.s.Gross", lags=lags,
                       group.by="Show.name" )

  # Add some summary columns about how many shows are running each week
  Num.Shows = rep( 0, length( all.data$Week ) )
  Num.Musicals = rep( 0, length( all.data$Week ) )
  for ( w in 1:length( all.data$Week ) ) {
    Num.Shows[w] = length( unique( all.data$Show.name[all.data$Week ==
                                                      all.data$Week[w]] ) )
    Num.Musicals[w] = length( unique( all.data$Show.name[
      all.data$Week == all.data$Week[w] & all.data$Category == "musical"] ) )
  }

  # Bin the category variable
  all.data$Is.Play = all.data$Category == "play"
  all.data$Is.Musical = all.data$Category == "musical"

  # Aggregate total grosses
  all.data$Num.Shows = Num.Shows
  all.data$Num.Musicals = Num.Musicals
  all.data$Total.Gross = rep( 0, nrow( all.data ) )
  for ( i in 1:nrow( all.data ) ) {
    all.data$Total.Gross[i] = sum( all.data$This.Week.s.Gross[
      all.data$Week == all.data$Week[i]] )
  }
  all.data = all.data[order( all.data$Week ),]

  # Add trends and seasonality predictors
  all.data = add.trends( all.data )
  all.data = add.seasonality( all.data )
  return( all.data )
}

# Adds seasonality predictors: month, quarter, season, year, and holiday flags
add.seasonality = function( data ) {
  data$Month = format( data$Week, "%b" )  # can also use as.yearmon
  data$MonthNUM = as.numeric( format( data$Week, "%m" ) )
  # Shift by one month before converting to quarters, so quarters (and hence
  # seasons) start in December, March, June, and September
  data$QuarterFull = as.yearqtr( as.yearmon( data$Week ) + 1/12 )
  data$QuarterNum = factor( format( data$QuarterFull, "%q" ), levels=1:4 )
  data$Season = factor( format( data$QuarterFull, "%q" ), levels=1:4,
                        labels=c( "winter", "spring", "summer", "fall" ) )
  data$Year = format( data$Week, "%Y" )
  data$YearNum = as.numeric( format( data$Week, "%Y" ) ) - 1983

  # Tag each week that falls within 5 days of a public holiday.  The full
  # holiday list (G-7 financial centers, likely meaningless as a predictor)
  # is loaded but currently unused; we only flag NYSE holidays, Christmas,
  # and Thanksgiving below.
  allHolidays = as.data.frame( holiday( 1984:2014, listHolidays() ) )
  colnames( allHolidays )[1] = "Date"
  n = length( data$Week )
  m = length( allHolidays$Date )

  # Create flags for NYSE holidays
  NYSEHolidays = as.data.frame( holidayNYSE( 1984:2014 ) )
  colnames( NYSEHolidays )[1] = "Date"
  o = length( NYSEHolidays$Date )
  data$NYSEHolidayFlags = 0
  for ( j in 1:o ) {
    for ( i in 1:n ) {
      if ( abs( data$Week[i] - as.Date( NYSEHolidays$Date[j] ) ) < 6 ) {
        data$NYSEHolidayFlags[i] = 1
      }
    }
  }

  # Create flags for Christmas weeks
  Xmas = as.data.frame( holiday( 1984:2014, "ChristmasDay" ) )
  colnames( Xmas )[1] = "Date"
  p = length( Xmas$Date )
  data$XmasFlags = 0
  for ( j in 1:p ) {
    for ( i in 1:n ) {
      if ( abs( data$Week[i] - as.Date( Xmas$Date[j] ) ) < 6 ) {
        data$XmasFlags[i] = 1
      }
    }
  }

  # Create flags for Thanksgiving weeks
  Thanksgiv = as.data.frame( holiday( 1984:2014, "USThanksgivingDay" ) )
  colnames( Thanksgiv )[1] = "Date"
  q = length( Thanksgiv$Date )
  data$ThanksgivFlags = 0
  for ( j in 1:q ) {
    for ( i in 1:n ) {
      if ( abs( data$Week[i] - as.Date( Thanksgiv$Date[j] ) ) < 6 ) {
        data$ThanksgivFlags[i] = 1
      }
    }
  }
  return( data )
}
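# Example (a minimal sketch): the one-month shift above maps December into
# the next quarter, so December, January, and February all land in "winter":
#
# d = as.Date( c( "2013-12-15", "2014-01-15", "2014-03-15" ) )
# format( as.yearqtr( as.yearmon( d ) + 1/12 ), "%q" )   # "1" "1" "2"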
# Adds weather and DJIA data from a separate file, joined on week
add.weather = function( data, file="newGrosses.csv" ) {
  weather = read.csv( file )
  weather$Week = as.Date( weather$Week, "%m/%d/%y" )
  weather$DJIA = suppressWarnings( as.numeric( as.character( weather$DJIA ) ) )
  weather$DJIA[is.na( weather$DJIA )] = 0

  # Aggregate to one value per week
  DJIA = aggregate( weather$DJIA, list( weather$Week ), mean )
  PRCP = aggregate( weather$PRCP, list( weather$Week ), mean )
  SNWD = aggregate( weather$SNWD, list( weather$Week ), mean )
  TMIN = aggregate( weather$TMIN, list( weather$Week ), min )
  TMAX = aggregate( weather$TMAX, list( weather$Week ), max )

  data$DJIA = rep( -1, nrow( data ) )
  data$PRCP = rep( -1, nrow( data ) )
  data$SNWD = rep( -1, nrow( data ) )
  data$TMAX = rep( -1, nrow( data ) )
  data$TMIN = rep( -1, nrow( data ) )
  for ( i in 1:nrow( data ) ) {
    w = data[i, "Week"]
    # match() finds the aggregated row whose week equals w
    data[i, "DJIA"] = DJIA[match( w, DJIA[,1] ), 2]
    data[i, "PRCP"] = PRCP[match( w, PRCP[,1] ), 2]
    data[i, "SNWD"] = SNWD[match( w, SNWD[,1] ), 2]
    data[i, "TMAX"] = TMAX[match( w, TMAX[,1] ), 2]
    data[i, "TMIN"] = TMIN[match( w, TMIN[,1] ), 2]
  }
  return( data )
}
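# End-to-end usage (a minimal sketch; the cache file name and the model
# formula are assumptions, not part of this script):
#
# all.data = read.data()                # slow: parses grosses, lags, trends
# all.data = add.weather( all.data )
# saveRDS( all.data, "all-data.rds" )   # cache, since read.data() is slow
# # all.data = readRDS( "all-data.rds" )
#
# weekly = total.gross.by.week( all.data )
# build.lm = function( train, test ) {
#   fit = lm( Total.Gross ~ Total.Gross_lag_1 + Total.Gross_lag_2, data=train )
#   return( predict( fit, newdata=test ) )
# }
# cv = cross.validate.time( weekly, build.lm, response="Total.Gross" )
# mean( cv$RMSE )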