#!/usr/bin/python
# Copyright 2010 Google Inc.
# Licensed under the Apache License, Version 2.0
# http://www.apache.org/licenses/LICENSE-2.0
# Google's Python Class
# http://code.google.com/edu/languages/google-python-class/
import sys
import re
"""Baby Names exercise
Define the extract_names() function below and change main()
to call it.
For writing regex, it's nice to include a copy of the target
text for inspiration.
Here's what the html looks like in the baby.html files:
...
Popularity in 1990
....
1 | Michael | Jessica |
2 | Christopher | Ashley |
3 | Matthew | Brittany |
...
Suggested milestones for incremental development:
-Extract the year and print it
-Extract the names and rank numbers and just print them
-Get the names data into a dict and print it
-Build the [year, 'name rank', ... ] list and print it
-Fix main() to use the extract_names list
"""
def extract_names(filename):
    """Return the year and the alphabetized 'name rank' strings from a file.

    The file's text must contain 'Popularity in YYYY' plus one record per
    rank made of the rank number, a boy's name and a girl's name, e.g.
    ``1 | Michael | Jessica |``.

    Args:
        filename: Path of the baby-names file to read.

    Returns:
        A list starting with the year string, followed by one 'name rank'
        string per distinct name in sorted order, e.g.
        ['2006', 'Aaliyah 91', 'Aaron 57', 'Abagail 895', ...].
        A name that occurs more than once keeps the rank of its first
        occurrence in the file.

    Raises:
        SystemExit: With status 1, after writing a message to stderr, when
            no 'Popularity in YYYY' text is found.
    """
    # +++your code here+++
    # LAB(begin solution)
    # Read the whole file; regex on the full text is simpler than going
    # line by line.  The context manager closes the file even if read()
    # raises.  Plain text mode replaces 'rU', which newer Python 3 releases
    # reject; none of the patterns below depend on newline translation.
    with open(filename) as f:
        text = f.read()

    # Get the year.
    year_match = re.search(r'Popularity\sin\s(\d\d\d\d)', text)
    if not year_match:
        # We didn't find a year, so we'll exit with an error message.
        sys.stderr.write('Couldn\'t find the year!\n')
        sys.exit(1)
    year = year_match.group(1)

    # A cell boundary is a literal '|' (escaped, so it is not treated as
    # regex alternation) or a '</td><td>' pair.
    # NOTE(review): accepting <td> markup assumes the real baby.html files
    # are HTML tables rather than the pipe-separated sample -- confirm.
    cell_sep = r'\s*(?:\||</td>\s*<td>)\s*'
    record_pattern = r'(\d+)' + cell_sep + r'(\w+)' + cell_sep + r'(\w+)'

    # Each tuple found is (rank, boy-name, girl-name).  setdefault() keeps
    # the first rank seen for a name; records are presumably listed in
    # ascending rank order, so that first rank is the best one.
    names_to_rank = {}
    for rank, boyname, girlname in re.findall(record_pattern, text):
        names_to_rank.setdefault(boyname, rank)
        names_to_rank.setdefault(girlname, rank)

    # Build the result: the year, then one 'name rank' entry per name.
    names = [year]
    names.extend(name + ' ' + names_to_rank[name]
                 for name in sorted(names_to_rank))
    return names
    # LAB(replace solution)
    # return
    # LAB(end solution)
def main():
    """Command-line entry point: print or save the name list of each file.

    Usage: [--summaryfile] file [file ...]

    For every file named on the command line, calls extract_names() and
    joins the returned list with newlines.  Without --summaryfile the text
    is printed; with it, the text is written to '<file>.summary' instead.

    Raises:
        SystemExit: With status 1, after printing the usage line, when no
            file arguments are given (including when --summaryfile is the
            only argument).
    """
    # Make a list of command line arguments, omitting the [0] element
    # which is the script itself.
    args = sys.argv[1:]

    # Notice the summary flag and remove it from args if it is present.
    summary = args[:1] == ['--summaryfile']
    if summary:
        del args[0]

    # Checked after the flag is stripped so that a lone '--summaryfile'
    # is reported as a usage error instead of silently doing nothing.
    if not args:
        # Single-argument print(...) behaves the same as the Python 2
        # print statement and is also valid Python 3.
        print('usage: [--summaryfile] file [file ...]')
        sys.exit(1)

    # +++your code here+++
    # For each filename, get the names, then either print the text output
    # or write it to a summary file
    # LAB(begin solution)
    for filename in args:
        names = extract_names(filename)
        # Make text out of the whole list
        text = '\n'.join(names)
        if summary:
            # The context manager closes the file even if write() raises.
            with open(filename + '.summary', 'w') as outf:
                outf.write(text + '\n')
        else:
            print(text)
    # LAB(end solution)
# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()