#!/usr/bin/python
"""Look up each show from a CSV list on IBDB and record its category.

Usage: script.py SHOWS_CSV OUTPUT_CSV

SHOWS_CSV is read with the csv module; the first column of every row is
taken as a show name.  For each distinct name the IBDB advanced show search
form is submitted and the returned page is classified by counting category
keywords (see read_response).  The results are written to OUTPUT_CSV with a
"Show", "Category" header and echoed to stdout as they are written.

The code is written to run on both Python 2 (its original target) and
Python 3.
"""
import csv
import sys
import time

SEARCH_URL = "http://www.ibdb.com/advSearchShows.php"

# Pretend to be Firefox.  The pieces are joined with explicit spaces so the
# header reads as a well-formed user-agent string.
USER_AGENT = ("Mozilla/5.0 (X11; U; Linux i686; en-US; "
              "rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 "
              "Firefox/3.0.1")

CATEGORIES = ["musical", "play", "special"]

# Number of occurrences of each keyword that is subtracted from the raw count
# before voting ("base rate occurrences in the page", per the original
# author); same order as CATEGORIES.
BASE_RATES = [1, 8, 0]

# Pause between queries, in seconds, to avoid overloading the IBDB server.
QUERY_DELAY = 1


def read_response(resp):
    """Return the category with the most "votes" in a search result page.

    A vote is one case-insensitive occurrence of a category keyword in
    *resp*, after subtracting that keyword's base rate.  The adjusted counts
    are also written to stderr as a debugging aid.

    Args:
        resp: the page body, as text or as bytes (bytes are decoded as UTF-8
            with undecodable sequences replaced).

    Returns:
        One of "musical", "play" or "special"; ties go to the category listed
        first.  "NA" when no category has a positive adjusted count.
    """
    if isinstance(resp, bytes):
        # Keyword counting below uses text patterns, which would raise on a
        # bytes body under Python 3.
        resp = resp.decode("utf-8", "replace")
    resp = resp.lower()
    votes = [resp.count(cat) - base
             for cat, base in zip(CATEGORIES, BASE_RATES)]
    sys.stderr.write(str(votes) + "\n")
    best = max(votes)
    if best > 0:
        return CATEGORIES[votes.index(best)]
    return "NA"


def _open_csv(path, writing=False):
    """Open *path* in the mode the csv module expects on this interpreter."""
    if sys.version_info[0] >= 3:
        return open(path, "w" if writing else "r", newline="")
    # Python 2: binary mode for writing, universal newlines for reading (as
    # the original script did).
    return open(path, "wb" if writing else "rU")


def _read_show_names(path):
    """Return the distinct first-column values of the CSV at *path*, in
    first-seen order."""
    names = []
    seen = set()
    with _open_csv(path) as handle:
        for row in csv.reader(handle):
            if not row:
                continue  # blank line: there is no first column to read
            name = row[0]
            if name not in seen:
                seen.add(name)
                names.append(name)
    return names


def _make_browser():
    """Create a mechanize browser that sends the Firefox user-agent header."""
    # Imported here rather than at module level so that read_response can be
    # imported (e.g. for testing) without the third-party dependency.
    import mechanize

    br = mechanize.Browser()
    br.addheaders = [("User-agent", USER_AGENT)]
    return br


def _classify_show(br, name):
    """Submit *name* through the IBDB search form and classify the result."""
    br.open(SEARCH_URL)
    br.select_form(nr=0)
    br.form["ShowProperName"] = name
    br.submit()
    return read_response(br.response().read())


def _write_results(path, results):
    """Write (show, category) pairs to the CSV at *path*, echoing each row to
    stdout."""
    with _open_csv(path, writing=True) as handle:
        writer = csv.writer(handle)
        writer.writerow(["Show", "Category"])
        for name, category in results:
            writer.writerow([name, category])
            sys.stdout.write(",".join([name, category]) + "\n")
            sys.stdout.flush()


def main(argv=None):
    """Run the lookup for the input/output paths given on the command line.

    Args:
        argv: argument vector; defaults to sys.argv.

    Returns:
        Process exit status: 0 on success, 2 on a usage error.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        prog = argv[0] if argv else "script"
        sys.stderr.write("usage: %s SHOWS_CSV OUTPUT_CSV\n" % prog)
        return 2
    in_fn = argv[1]   # List of shows
    out_fn = argv[2]  # Output file

    names = _read_show_names(in_fn)
    br = _make_browser()

    results = []
    for name in names:
        sys.stderr.write(name + ": ")
        results.append((name, _classify_show(br, name)))
        time.sleep(QUERY_DELAY)

    _write_results(out_fn, results)
    return 0


if __name__ == "__main__":
    sys.exit(main())