#!/usr/bin/python
import sys
import re
import os.path
import requests
import StringIO
import csv
import datetime
import time
import string
import xlsxwriter
import unicodedata
from bs4 import BeautifulSoup
from openpyxl import Workbook
from openpyxl import load_workbook
from nltk.corpus import wordnet
from uncapped import uncapped_words
from cookies2 import google_cookies

#### CONSTANTS ####
header_rows = 1                           # Number of header rows
suffixes = ["", " tickets", " broadway"]  # Trends suffixes
request_timeout = 30                      # Seconds to wait before the next request
cache_trends = 1            # 1 to cache trends in local files (to avoid throttling)
trends_begin = datetime.date(2004, 1, 1)  # Google Trends data starts in 2004
get_trends = 1              # 0 to skip fetching Google Trends entirely
min_weeks_for_trends = 0    # To allow excluding shorter shows
grosses_path = "Grosses"    # Where to cache grosses
trends_path = "Trends"      # Where to cache trends data
###################


def read_grosses_date(s):
    return datetime.datetime.strptime(s, "%m/%d/%Y").date()


# Grabs and parses the grosses files from Broadway World
def download_grosses(path):
    print "Downloading grosses"
    shows = fetch_show_list()
    headers = {
        "Host"            : "www.broadwayworld.com",
        "User-Agent"      : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; " +\
                            "rv:35.0) Gecko/20100101 Firefox/35.0",
        "Accept"          : "text/html,application/xhtml+xml,application/xml;" +\
                            "q=0.9,*/*;q=0.8",
        "Accept-Language" : "en-US,en;q=0.5",
        "Accept-Encoding" : "gzip, deflate",
        "Connection"      : "keep-alive"
    }
    cookies = {
        "CFID"    : "1140731393",
        "CFTOKEN" : "9bec5cd3391afedc-9E3CFFDA-E588-8F4F-" +\
                    "FDD65315D403E598"
    }
    for i, s in enumerate(shows.keys(), 1):
        sys.stdout.write("\r\033[K")
        sys.stdout.write("\r.Fetching grosses...show " + str(i) + " of " +\
                         str(len(shows)) + ", '" + s + "'")
        sys.stdout.flush()
        show_p = os.path.join(path,
                              "".join(c for c in s
                                      if c.isalnum() or c in string.whitespace) +\
                              " Grosses.xlsx")
        if (os.path.isfile(show_p)):
            continue
        r = requests.get(shows[s], headers=headers, cookies=cookies)
        cleaned = r.content.replace("", "")
        soup = BeautifulSoup(cleaned, "html.parser")
        wb = xlsxwriter.Workbook(show_p)
        ws = wb.add_worksheet()
        rows = soup.find_all("tr")
        r_i = 0
        for row in rows:
            # Column 0 holds the show name ("Show name" in the header row)
            if (r_i == 0):
                ws.write(r_i, 0, "Show name")
            else:
                ws.write(r_i, 0, s)
            cols = row.find_all("td")
            c_i = 1
            for c in cols:
                ws.write(r_i, c_i, c.text)
                c_i += 1
            r_i += 1
        wb.close()
        time.sleep(1)  # To play nice with the BWW servers
    sys.stdout.write("\r\033[K")
    sys.stdout.write("\r.Fetching grosses...done! Fetched " + str(len(shows)) +\
                     " grosses to " + str(path) + "\n")


# Grabs a list of shows from the Broadway World index (no caching)
def fetch_show_list():
    show_url_prefix = "http://www.broadwayworld.com/grossesshowexcel.cfm?show="
    show_url_suffix = "&all=on"
    url_stem = "http://www.broadwayworld.com/grossesbyshow.cfm?letter="
    urls = [url_stem + c for c in string.ascii_lowercase + "1"]
    shows = {}
    for i, url in enumerate(urls, 1):
        sys.stdout.write("\r\033[K")
        sys.stdout.write("\r.Fetching show list...url " + str(i) + " of " +\
                         str(len(urls)) + " (" + url + ")")
        sys.stdout.flush()
        r = requests.get(url)
        page = r.text
        show_url_stem = "http://www.broadwayworld.com/grosses/"
        # NOTE: assumed markup -- this pattern expects show links of the form
        # <a href="http://www.broadwayworld.com/grosses/SLUG">Show Name</a>
        show_p = re.compile(re.escape('<a href="' + show_url_stem) +
                            r'([^"]+)">' +
                            r'((?:[^\s\<]\s*)+)' +
                            re.escape("</a>"))
        m = show_p.findall(page)
        for s in m:
            shows[s[1]] = show_url_prefix + s[0] + show_url_suffix
        time.sleep(1)
        #if (i > 1):
        #    break
    #print(shows)
    sys.stdout.write("\r\033[K")
    sys.stdout.write("\r.Fetching show list...done\n")
    sys.stdout.flush()
    return shows
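
# For illustration (the show name and slug below are hypothetical, not scraped
# values), fetch_show_list() returns a mapping from show name to its Excel
# grosses URL, e.g.:
#   { "Wicked":
#     "http://www.broadwayworld.com/grossesshowexcel.cfm?show=WICKED&all=on" }
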
# Parses the show name from the filename (assumed to be the show name followed
# by 'Grosses').  We then attempt to split the show name into tokens by the
# following rules: 1) capital letters denote a new token, and 2) if a token
# ends with something on the MLA's list of words not to capitalize in a title,
# and (a) the token is not in the NLTK corpus of English words but (b) the
# token without the uncapitalized word (i.e. the stem) is in the corpus, we
# split that token into its stem and the uncapitalized word.
#
# Finally, we rejoin the title with spaces.
def parse_show_name(fn, i):
    p = re.compile(r"(.+)\s*Grosses")
    m = p.search(fn)
    if (m != None and m.group(1) != None):
        flat_name = m.group(1)
    else:
        print "WARNING: Couldn't find name for show in file '" + fn + \
              "', defaulting to 'Unknown show " + str(i) + "'"
        return "Unknown show " + str(i)

    # If the name already has spaces, then just assume it's the show name,
    # convert it to title case and return it
    if (len(flat_name.split()) > 1):
        return flat_name.strip().title()

    tokens = re.findall('[A-Z][^A-Z]*', flat_name)
    words = []
    # Check every token in the title to see if it might have an uncapitalized
    # word lurking on its end
    for t in tokens:
        w = t  # Assume the token is a word until proven otherwise
        for u in uncapped_words:
            if (t.endswith(u)):
                # We've found a potential lurker...
                stem = t.split(u)[0]
                # We only split the token if the stem is a valid word and the
                # original token is not
                if (wordnet.synsets(stem) and not wordnet.synsets(t)):
                    words.append(stem)
                    w = u
                    break  # Just take the first match
        words.append(w)
    # Reconstruct the list of words
    return " ".join(words)


def cached_trends_path(terms):
    if (not isinstance(terms, basestring)):
        terms = ",".join(terms)
    # Replace path separators so the terms are safe to use as a filename
    fname = terms.replace("/", "-")
    fname = fname.replace("\\", "-")
    return os.path.join(trends_path, fname + "_trends.csv")


def is_cached(terms):
    # If we're caching and the file exists, then return true
    return cache_trends and os.path.isfile(cached_trends_path(terms))


# Grab all the Google trends data for a show
def fetch_google_trends(name):
    # Normalize the name before we go, to strip out weird characters
    name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore')
    sys.stdout.write("..Beginning Google trends request for '" + name + "'\n")
    terms = [name + s for s in suffixes]
    # Also query all the terms together, which returns values relative to each
    # other (stored later under "<term> relative")
    terms.append(terms[:])
    trends = {}
    for t in terms:
        sys.stdout.write("...Requesting '" + str(t) + "' - ")
        cached = is_cached(t)  # Need to check before we issue the request
        if (request_timeout > 0 and not cached):
            sys.stdout.write("now sleeping " +\
                             str(request_timeout) + " " +\
                             "seconds before next request...")
            sys.stdout.flush()
            time.sleep(request_timeout)
        elif (cached):
            sys.stdout.write("cached - ")
        report = google_trends_request(t)
        if (not isinstance(t, basestring)):
            q_suff = " relative"
        else:
            q_suff = ""
        data = parse_trends_csv(report, q_suff=q_suff)
        for k in data.keys():
            if (k in trends):
                trends[k] = dict(trends[k].items() + data[k].items())
            else:
                trends[k] = dict(data[k].items())
        sys.stdout.write("done!\n")
        sys.stdout.flush()

    # Fill in gaps in the data with the last-seen value.
    # Also keeps track of the max value seen so far to do de-normalization, but
    # that feature is unused, since we do something similar in R instead
    max_val = {}
    all_terms = {}  # In theory, we already know this, but...
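    # Forward-fill sketch (dates and values below are illustrative only): if
    # "wicked tickets" has a value of 40 for 2010-01-03 and no entry for
    # 2010-01-10, then 2010-01-10 is filled with 40, the last value seen for
    # that term.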
    all_dates = trends.keys()
    all_dates.sort()
    for d in all_dates:
        for t in trends[d].keys():
            all_terms[t] = 1
    last_seen = dict(zip(all_terms.keys(), [0 for t in all_terms.keys()]))
    for d in all_dates:
        for t in trends[d].keys():
            last_seen[t] = trends[d][t]
            if (not t in max_val or max_val[t] < trends[d][t]):
                max_val[t] = trends[d][t]
        # Fill in the gaps with the last-seen value
        for t in all_terms.keys():
            if (not t in trends[d]):
                trends[d][t] = last_seen[t]
    return trends


# Issue a Trends query and cache it (or return the cached value if it's already
# cached)
def google_trends_request(terms):
    url = "http://www.google.com/trends/trendsReport"
    if (not isinstance(terms, basestring)):
        terms = ",".join(terms)
    # If we're caching and the file exists, then use the local copy
    if (cache_trends and is_cached(terms)):
        return open(cached_trends_path(terms)).read()
    params = {
        "hl"      : "en-US",
        "q"       : terms,
        "tz"      : "",
        "content" : 1,
        "export"  : 1
    }
    r = requests.get(url, params=params, cookies=google_cookies)
    if (cache_trends):
        f = open(cached_trends_path(terms), "w")
        f.write(unicodedata.normalize('NFKD', r.text).encode('ascii', 'ignore'))
        f.close()
    return r.text
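
# For reference, parse_trends_csv() below expects the exported report to
# contain a block shaped roughly like this (values and terms are illustrative;
# the real export has other sections, which are skipped):
#
#   Interest over time
#   Week,wicked,wicked tickets,wicked broadway
#   2010-01-03 - 2010-01-09,63,12,8
#   2010-01-10 - 2010-01-16,61,11,7
#   ...
#
# A blank row ends the block.  Monthly reports use a "Month" header row and
# dates like "2010-01" instead.
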
# Takes the Trends CSV from the Google query and parses it into a dict keyed by
# week-ending date
def parse_trends_csv(report, q_suff=""):
    f = StringIO.StringIO(report)
    reader = csv.reader(f)
    in_block = 0
    data = {}
    week_str = r"\d{4}\-\d{2}\-\d{2}"
    week_p = re.compile(week_str + r"\s+\-\s+(" + week_str + r")")
    month_str = r"(\d{4}\-\d{2})"
    month_p = re.compile(month_str)
    for row in reader:
        if (row == None or not row):
            if (in_block):
                break
            else:
                continue
        if (not in_block and row[0] != "Interest over time"):
            continue
        elif (not in_block and row[0] == "Interest over time"):
            in_block = 1
            continue
        elif (in_block and row[0] == "Week"):
            queries = [q.lower() + q_suff for q in row[1:]]
            date_p = week_p
            continue
        elif (in_block and row[0] == "Month"):
            queries = [q.lower() + q_suff for q in row[1:]]
            date_p = month_p
            continue
        elif (in_block and row[0] == ""):
            in_block = 0
            break
        # Now we're in the block
        m = date_p.search(row[0])
        if (m == None or m.group(1) == None):
            print "Couldn't find date in cell '" + row[0] + "'"
            sys.exit()
        else:
            s = m.group(1)
        # Monthly dates only have a year and month, so start them at the first
        # day of the month and push them to the end of the month below
        monthly = not re.compile(week_str).match(s)
        if (monthly):
            s += "-01"
        end_date = datetime.datetime.strptime(s, "%Y-%m-%d").date()
        if (monthly):
            # timedelta has no 'months' argument, so step to the first day of
            # the next month and pull back one day to land on the month's end
            if (end_date.month == 12):
                next_first = end_date.replace(year=end_date.year + 1, month=1)
            else:
                next_first = end_date.replace(month=end_date.month + 1)
            end_date = next_first - datetime.timedelta(days=1)
        if (end_date in data):
            print "Date " + str(end_date) + " already seen!"
            sys.exit()
        out = []
        for v in row[1:]:
            try:
                out.append(int(v))
            except ValueError:
                out.append(v)
        data[end_date] = dict(zip(queries, out))
    return data


# Parse args
if (len(sys.argv) < 2):
    print "usage: " + sys.argv[0] + " <output file> [<grosses .xlsx files>]"
    sys.exit()
elif (len(sys.argv) == 2):
    out_f = sys.argv[1]
    in_f = [os.path.join(grosses_path, f) for f in os.listdir(grosses_path)
            if os.path.isfile(os.path.join(grosses_path, f))
            and os.path.splitext(f)[1] == ".xlsx"]
else:
    out_f = sys.argv[1]
    in_f = sys.argv[2:]

# Either open the output workbook or create a new one, depending on whether
# we're appending
append_mode = os.path.isfile(out_f)
if (append_mode):
    print "File '" + out_f + "' already exists -- appending"
    out_wb = load_workbook(out_f)
    out_ws = out_wb[out_wb.get_sheet_names()[0]]  # Assume first sheet
else:
    out_wb = Workbook()
    out_ws = out_wb.active
    out_ws.title = "Grosses"

shows = {}
trends_by_show = {}
print "Iterating grosses files"

# Copy each input file to the output workbook
for i in range(len(in_f)):
    sys.stdout.write(".Reading file " + str(i + 1) + " of " + str(len(in_f)) +\
                     ", '" + os.path.split(in_f[i])[1] + "'\n")
    in_wb = load_workbook(in_f[i])
    in_ws = in_wb[in_wb.get_sheet_names()[0]]  # Assume first sheet
    if (in_ws.cell(row=1, column=1).value != "Show name"):
        show_name = parse_show_name(in_f[i], i)
    else:
        show_name = unicode(in_ws.cell(row=2, column=1).value)
    # For the first input file, we need to copy the header rows
    if (i == 0 and not append_mode):
        for j in range(in_ws.max_column):
            h = (in_ws.cell(row=r + 1, column=j + 1).value
                 for r in range(header_rows))
            out_ws.cell(row=1, column=j + 1).value = \
                " ".join(c for c in h if c != None)
        j = in_ws.max_column + 1
        out_ws.cell(row=1, column=j).value = "Weeks since open"
        out_r = 2
    elif (i == 0 and append_mode):
        out_r = out_ws.max_row + 1  # Next free row (rows are 1-indexed)
    sys.stdout.flush()
    shows[show_name] = read_grosses_date(in_ws.cell(row=2, column=2).value)
    # Find the last week in the gross data, to determine Trends availability
    max_week = (read_grosses_date(in_ws.cell(row=in_ws.max_row, column=2).value)
                - datetime.timedelta(days=7))
    trends_avail = max_week > trends_begin
    # Fetch trends if available and store them for later
    if (get_trends and trends_avail and in_ws.max_row > min_weeks_for_trends):
        trends = fetch_google_trends(show_name)
    else:
        trends = None
    trends_by_show[show_name] = trends
    # Copy cell by cell -- there may be a more efficient way to do this...
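    # The loop below cleans each value as it copies it: for example (values
    # illustrative), "$1,234,567" becomes the int 1234567, "98.7%" becomes the
    # float 0.987, and anything containing "N/A" is left as stripped text.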
    for r in range(3, in_ws.max_row + 1):
        for c in range(1, in_ws.max_column + 1):
            v = in_ws.cell(row=r, column=c).value
            if (c > 2 and v.find("N/A") == -1):
                # Strip currency formatting and convert to a number
                v = v.replace("$", "")
                v = v.replace(",", "")
                if (v.find("%") > -1):
                    v = v.replace("%", "")
                    w = float(v) / 100
                else:
                    if (v.find(".") > -1):
                        w = float(v)
                    else:
                        w = int(v)
            else:
                w = v.strip()
            out_ws.cell(row=out_r, column=c).value = w
        out_ws.cell(row=out_r, column=in_ws.max_column + 1).value = r - 3
        out_r += 1

# Generic labels for the Trends sheets -- each sheet holds every show for one
# query variant
terms = ["Name" + s for s in suffixes]
rels = [t + " relative" for t in terms]
terms.extend(rels)
trends_ws = [out_wb.create_sheet() for t in terms]

# Title the Trends sheets -- one for each query
for i in range(len(trends_ws)):
    trends_ws[i].title = "Trends - " + terms[i]

# Pull Trends (if available) from 2 years before open to 12 years after
date_range = range(-2 * 52, 12 * 52)

# Write the header row: column headers are week offsets relative to opening
for ws in trends_ws:
    ws.cell(row=1, column=1).value = "Show name"
    c = 2
    for d in date_range:
        ws.cell(row=1, column=c).value = d
        c += 1

# Pull the Trends data from the list and put it in the appropriate sheet, show
# by show
r = 2
for s in shows.keys():
    trends_terms = [s.lower() + suff for suff in suffixes]
    rels = [t + " relative" for t in trends_terms]
    trends_terms.extend(rels)
    t = 0
    opening = shows[s]
    trends = trends_by_show[s]
    for ws in trends_ws:
        c = 1
        ws.cell(row=r, column=c).value = s
        c += 1
        if (trends == None):
            continue
        for d in date_range:
            week = opening + datetime.timedelta(days=7) * d
            trends_weeks = [k for k in trends.keys() if k <= week]
            if (not trends_weeks):
                v = "N/A"
            else:
                latest_week = max(trends_weeks)
                trends_data = trends[latest_week]
                if (trends_terms[t] in trends_data):
                    v = trends_data[trends_terms[t]]
                else:
                    v = "N/A"
            ws.cell(row=r, column=c).value = v
            c += 1
        t += 1
    r += 1

out_wb.save(out_f)  # Save the concatenated file
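
# The resulting workbook layout (show name and values below are illustrative):
# a "Grosses" sheet with one row per show-week plus a "Weeks since open"
# column, and six "Trends - ..." sheets ("Trends - Name", "Trends - Name
# tickets", ...), each with one row per show and one column per week offset in
# date_range, e.g.:
#
#   Show name | -104 | -103 | ... | 0  | 1  | ...
#   Wicked    | N/A  | N/A  | ... | 47 | 45 | ...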