#!/usr/bin/python
import sys
import re
import os.path
import requests
import StringIO
import csv
import datetime
import time
import string
import xlsxwriter
import unicodedata
from bs4 import BeautifulSoup
from openpyxl import Workbook
from openpyxl import load_workbook
from nltk.corpus import wordnet
from uncapped import uncapped_words
from cookies2 import google_cookies
#### CONSTANTS ####
header_rows = 1 # Number of header rows
suffixes = ["", " tickets", " broadway"] # Trends suffixes
request_timeout = 30 # Seconds to wait before the next request
cache_trends = 1 # 1 to cache trends in local files (to avoid throttling)
trends_begin = datetime.date(2004, 1, 1)
get_trends = 1
min_weeks_for_trends = 0 # To allow excluding shorter shows
grosses_path = "Grosses" # Where to cache grosses
trends_path = "Trends" # Where to cache trends data
###################
def read_grosses_date(s):
return datetime.datetime.strptime(s, "%m/%d/%Y").date()
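# A quick illustrative check of the date format this expects (the example date
# is made up):
#   >>> read_grosses_date("06/07/2015")
#   datetime.date(2015, 6, 7)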
# Grabs and parses the grosses files from Broadway World, caching one .xlsx
# workbook per show under the given path
def download_grosses(path):
print "Downloading grosses"
shows = fetch_show_list()
headers = { "Host" : "www.broadwayworld.com",
"User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9;" +\
"rv:35.0) Gecko/20100101 Firefox/35.0",
"Accept" : "text/html,application/xhtml+xml,application/xml;" +\
"q=0.9,*/*;q=0.8",
"Accept-Language" : "en-US,en;q=0.5",
"Accept-Encoding" : "gzip, deflate",
"Connection" : "keep-alive" }
cookies = { "CFID" : "1140731393",
"CFTOKEN" : "9bec5cd3391afedc-9E3CFFDA-E588-8F4F-" +\
"FDD65315D403E598" }
i = 1
for s in shows.keys():
sys.stdout.write("\r\033[K")
sys.stdout.write("\r.Fetching grosses...show " + str(i) + " of " +\
str(len(shows)) + ", '" + s + "'")
sys.stdout.flush()
show_p = os.path.join(path, "".join(c for c in s if c.isalnum() or
c in string.whitespace) +\
" Grosses.xlsx")
if (os.path.isfile(show_p)):
continue
r = requests.get(shows[s], headers=headers, cookies=cookies)
cleaned = r.content.replace("", "")
soup = BeautifulSoup(cleaned)
wb = xlsxwriter.Workbook(show_p)
ws = wb.add_worksheet()
rows = soup.find_all("tr")
r_i = 0
for r in rows:
if (r_i == 0):
ws.write(r_i, 0, "Show name")
else:
ws.write(r_i, 0, s)
cols = r.find_all("td")
c_i = 1
for c in cols:
ws.write(r_i, c_i, c.text)
c_i += 1
r_i += 1
wb.close()
time.sleep(1) # To play nice with the BW servers
i += 1
sys.stdout.write("\r\033[K")
sys.stdout.write("\r.Fetching grosses...done! Fetched " + str(len(shows)) +\
" grosses to " + str(path) + "\n")
# Grabs a list of shows from the Broadway World index (no caching)
def fetch_show_list():
show_url_prefix = "http://www.broadwayworld.com/grossesshowexcel.cfm?show="
show_url_suffix = "&all=on"
url_stem = "http://www.broadwayworld.com/grossesbyshow.cfm?letter="
urls = list((url_stem + c for c in string.ascii_lowercase + "1"))
shows = {}
i = 1
for url in urls:
sys.stdout.write("\r\033[K")
sys.stdout.write("\r.Fetching show list...url " + str(i) + " of " +\
str(len(urls)) + " (" + url + ")")
sys.stdout.flush()
r = requests.get(url)
page = r.text
        show_url_stem = "http://www.broadwayworld.com/grosses/"
        # Match links of the form <a href=".../grosses/SLUG">Show Name</a>,
        # capturing the slug (for the Excel export URL) and the show name
        show_p = re.compile(re.escape("<a href=\"" + show_url_stem) +
                            r"([^\"]+)" + re.escape("\">") +
                            r"((?:[^\s\<]\s*)+)" +
                            re.escape("</a>"))
m = show_p.findall(page)
for s in m:
shows[s[1]] = show_url_prefix + s[0] + show_url_suffix
time.sleep(1)
i += 1
#if (i > 1):
# break
#print(shows)
sys.stdout.write("\r\033[K")
sys.stdout.write("\r.Fetching show list...done\n")
sys.stdout.flush()
return shows
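# Illustrative shape of the dict fetch_show_list() returns (the show title and
# slug below are made up):
#   { "Example Show" :
#     "http://www.broadwayworld.com/grossesshowexcel.cfm?show=ExampleShow&all=on" }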
# Parses the show name from the filename (assumed to be the show name followed
# by 'Grosses'). Then we attempt to split the show name into tokens by the
# following rules: 1) capital letters denote a new token, and 2) if a token
# ends with a word on the MLA's list of words not to capitalize in a title and
# (a) the token itself is not in the NLTK WordNet corpus but (b) the token
# without the uncapitalized word (i.e. the stem) is, we split that token into
# its stem and the uncapitalized word.
#
# Then we rejoin the title with spaces (an example is sketched after this
# function).
def parse_show_name(fn, i):
p = re.compile("(.+)\s*Grosses")
m = p.search(fn)
if (m != None and m.group(1) != None):
flat_name = m.group(1)
else:
print "WARNING: Couldn't find name for show in file '" + fn + \
"' defaulting to 'Unknown show " + str(i) + "'"
return "Unknown show " + str(i)
    # If the name already has spaces, then just assume it's the show name,
    # convert it to title case and return it
    if (len(flat_name.split()) > 1):
        return flat_name.strip().title()
tokens = re.findall('[A-Z][^A-Z]*', flat_name)
words = []
# Check every token in the title to see if it might have an uncapitalized
# word lurking on its end
for t in tokens:
w = t # Assume the token is a word until proven otherwise
for u in uncapped_words:
if (t.endswith(u)): # We've found a potential lurker...
stem = t.split(u)[0]
# We only split the token if the stem is a valid word and the
# original token is not
if (wordnet.synsets(stem) and not wordnet.synsets(t)):
words.append(stem)
w = u
break # Just take the first match
        words.append(w) # Reconstruct the list of words
return " ".join(words)
def cached_trends_path(terms):
if (not isinstance(terms, basestring)):
terms = ",".join(terms)
    # Replace path separators so the terms can be used as a filename
    fname = terms.replace("/", "-")
    fname = fname.replace("\\", "-")
    return os.path.join(trends_path, fname + "_trends.csv")
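# For example, the combined query ["example show", "example show tickets"]
# (illustrative terms) would be cached at
# "Trends/example show,example show tickets_trends.csv"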
def is_cached(terms):
# If we're caching and the file exists, then return true
return cache_trends and os.path.isfile(cached_trends_path(terms))
# Grab all the Google trends data for a show
def fetch_google_trends(name):
# Normalize the name before we go, to strip out weird characters
name = unicodedata.normalize('NFKD', name).encode('ascii','ignore')
sys.stdout.write("..Beginning Google trends request for '" + name + "'\n")
terms = list((name + s for s in suffixes))
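    # Also query all the terms together in a single request so that Google
    # returns them on a shared (relative) scale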
terms.append(terms[:])
trends = {}
for t in terms:
sys.stdout.write("...Requesting '" + str(t) + "' - ")
cached = is_cached(t) # Need to check before we get the request
if (request_timeout > 0 and not cached):
sys.stdout.write("now sleeping " +\
str(request_timeout) + " " +\
"seconds before next request...")
sys.stdout.flush()
time.sleep(request_timeout)
elif (cached):
sys.stdout.write("cached - ")
report = google_trends_request(t)
if (not isinstance(t, basestring)):
q_suff = " relative"
else:
q_suff = ""
data = parse_trends_csv(report, q_suff=q_suff)
for k in data.keys():
if (k in trends):
trends[k] = dict(trends[k].items() + data[k].items())
else:
trends[k] = dict(data[k].items())
sys.stdout.write("done!\n")
sys.stdout.flush()
# Fill in gaps in the data with the last seen value.
# Also keeps track of max value seen so far to do de-normalization, but this
# feature is unused, since we do something similar in R instead
max_val = {}
all_terms = {} # In theory, we already know this, but...
all_dates = trends.keys()
all_dates.sort()
for d in all_dates:
for t in trends[d].keys():
all_terms[t] = 1
last_seen = dict(zip(all_terms.keys(), [0 for t in all_terms.keys()]))
for d in all_dates:
for t in trends[d].keys():
last_seen[t] = trends[d][t]
if (not t in max_val or max_val[t] < trends[d][t]):
max_val[t] = trends[d][t]
# Fill in the gaps with the last-seen value
for t in all_terms.keys():
if (not t in trends[d]):
trends[d][t] = last_seen[t]
return trends
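# Illustrative shape of what fetch_google_trends() returns, keyed by week- or
# month-ending date (the term and values are made up; the terms queried
# together get a " relative" suffix on their names):
#   { datetime.date(2015, 1, 10) : { "example show"          : 42,
#                                     "example show tickets" : 7,
#                                     "example show relative" : 40, ... },
#     ... }
# Dates where Google omitted a term are filled with the last value seen for
# that term (0 if the term has not been seen yet).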
# Issue a Trends query and cache it (or return the cached value if it's already
# cached)
def google_trends_request(terms):
url = "http://www.google.com/trends/trendsReport"
if (not isinstance(terms, basestring)):
terms = ",".join(terms)
# If we're caching and the file exists, then use the local copy
if (cache_trends and is_cached(terms)):
return open(cached_trends_path(terms)).read()
params = { "hl" : "en-US",
"q" : terms,
"tz" : "",
"content" : 1,
"export" : 1 }
r = requests.get(url, params=params, cookies=google_cookies)
if (cache_trends):
        with open(cached_trends_path(terms), "w") as f:
            f.write(unicodedata.normalize('NFKD',
                                          r.text).encode('ascii', 'ignore'))
return r.text
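# Usage sketch (illustrative terms): a plain string requests a single term,
# while a list such as ["example show", "example show tickets"] is joined with
# commas and requested as one combined report. The raw CSV text is cached
# under trends_path when cache_trends is set.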
# Takes the Trends CSV from the Google query and parses it into a dict
def parse_trends_csv(report, q_suff=""):
f = StringIO.StringIO(report)
reader = csv.reader(f)
in_block = 0
data = {}
    week_str = r"\d{4}-\d{2}-\d{2}"
    week_p = re.compile(week_str + r"\s+-\s+(" + week_str + r")")
    month_str = r"(\d{4}-\d{2})"
    month_p = re.compile(month_str)
for row in reader:
if (row == None or not row):
if (in_block):
break
else:
continue
if (not in_block and row[0] != "Interest over time"):
continue
elif (not in_block and row[0] == "Interest over time"):
in_block = 1
continue
elif (in_block and row[0] == "Week"):
queries = [r.lower() + q_suff for r in row[1:]]
date_p = week_p
continue
elif (in_block and row[0] == "Month"):
queries = [r.lower() + q_suff for r in row[1:]]
date_p = month_p
continue
elif (in_block and row[0] == ""):
in_block = 0
break
# Now we're in the block
m = date_p.search(row[0])
if (m == None or m.group(1) == None):
print "Couldn't find date in cell '" + row[0] + "'"
sys.exit()
else:
s = m.group(1)
# Convert monthly to weekly starting at first day of the month,
# which we will eventually transform to the end of the month
if (not re.compile(week_str).match(s)):
s += "-01"
end_date = datetime.datetime.strptime(s, "%Y-%m-%d").date()
        # Now push forward one month and pull back one day, if it's monthly,
        # to land on the last day of the month (timedelta has no 'months' arg)
        if (not re.compile(week_str).match(s)):
            if (end_date.month == 12):
                next_month = end_date.replace(year=end_date.year + 1, month=1)
            else:
                next_month = end_date.replace(month=end_date.month + 1)
            end_date = next_month - datetime.timedelta(days=1)
if (end_date in data):
print "Date " + str(end_date) + " already seen!"
sys.exit()
out = []
for v in row[1:]:
try:
out.append(int(v))
except ValueError:
out.append(v)
data[end_date] = dict(zip(queries, out))
return data
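# A rough sketch of the "Interest over time" block this parser expects in the
# Trends CSV export (term and values are illustrative):
#   Interest over time
#   Week,example show
#   2015-01-04 - 2015-01-10,42
#   2015-01-11 - 2015-01-17,37
# which parses to:
#   { datetime.date(2015, 1, 10) : { "example show" : 42 },
#     datetime.date(2015, 1, 17) : { "example show" : 37 } }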
# Parse args
if (len(sys.argv) < 2):
print "usage: " + sys.argv[0] + "