# L-36 MCS 275 Mon 12 Apr 2010 : htmlrefs.py
# Illustration of HTMLParser: parses an html page and
# makes a list of the links (href targets) this page refers to.
from HTMLParser import HTMLParser
from urllib import urlopen
class HTMLrefs(HTMLParser):
    """
    Makes a list of all html links in a document.

    Feed the text of the document with feed(), in one or several
    pieces, and call close() at the end.  The value of every href
    attribute of an 'a' start tag is appended, in document order,
    to the list self.refs.
    """

    def __init__(self):
        """
        Initializes the parser and the (empty) list of links.
        """
        # Call the base initializer directly so the parser state
        # is set up before the first feed().
        HTMLParser.__init__(self)
        self.refs = []

    def handle_starttag(self, tag, attrs):
        """
        Looks for tags equal to 'a' and
        stores links for href attributes.

        tag is the name of the start tag, attrs is the list of
        (name, value) pairs of its attributes.  An href without
        a value (value None, as in <a href>) is no link and is
        skipped.  Nothing is printed.
        """
        if tag != 'a':
            return
        # extend in place: no copy of all stored links per anchor
        self.refs.extend(value for (name, value) in attrs
                         if name == 'href' and value is not None)

    def ShowRefs(self):
        """
        Prints the HTML refs to screen, one per line.
        """
        for ref in self.refs:
            print(ref)
def main(page='http://www.uic.edu/'):
    """
    Opens a web page, parses it, and prints all links in it.

    page is the URL of the page to open; called without an
    argument, the same page as before is used.
    """
    chunk = 80  # number of bytes requested per read
    print('opening %s ...' % page)
    p = HTMLrefs()
    f = urlopen(page)
    try:
        # feed the page piece by piece to the parser
        while True:
            data = f.read(chunk)
            if not data:
                break
            p.feed(data)
    finally:
        # release the connection, also when reading or parsing fails
        f.close()
    p.close()
    print('all html links :')
    p.ShowRefs()
# Run main() only when this file is executed as a script.
if __name__ == "__main__":
    main()