# L-36 MCS 275 Mon 12 Apr 2010 : htmlrefs.py
# Illustration of HTMLParser to parse an html page,
# makes list of html pages this page refers to.
# Written so that it runs under both Python 2 and Python 3.

import codecs
from contextlib import closing

try:
    # Python 3 locations of the parser and of urlopen
    from html.parser import HTMLParser
    from urllib.request import urlopen
except ImportError:
    # Python 2 locations, as in the original lecture script
    from HTMLParser import HTMLParser
    from urllib import urlopen

# Number of bytes requested per read() call while downloading a page.
CHUNK_SIZE = 4096

# Encoding assumed when the server does not announce a usable one.
DEFAULT_CHARSET = 'utf-8'


class HTMLrefs(HTMLParser):
    """
    Makes a list of all html links.

    The links are collected in the attribute refs, in the order
    in which the anchors occur in the data fed to the parser.
    """

    def __init__(self, verbose=False):
        """
        Initializes the list of links.

        If verbose is True, then the attribute list of every start
        tag is printed as it is parsed (useful to see what the parser
        passes to handle_starttag).
        """
        HTMLParser.__init__(self)
        self.refs = []
        self.verbose = verbose

    def handle_starttag(self, tag, attrs):
        """
        Looks for tags equal to 'a' and
        stores links for href attributes.

        The attrs is a list of (name, value) tuples; the value is
        None for an attribute written without a value, such an href
        does not point anywhere and is skipped.
        """
        if self.verbose:
            print(attrs)
        if tag != 'a':
            return
        # extend in place: no copy of the whole list per anchor
        self.refs.extend(value for (name, value) in attrs
                         if name == 'href' and value is not None)

    def ShowRefs(self):
        """
        Prints the HTML refs to screen.
        """
        for each in self.refs:
            print(each)


def _charset_of(response, default=DEFAULT_CHARSET):
    """
    Returns the name of the encoding announced in the headers
    of response, or default if none is announced, if the headers
    cannot be queried for it, or if Python has no codec for it.
    """
    getter = getattr(response.info(), 'get_content_charset', None)
    name = getter() if getter is not None else None
    if not name:
        return default
    try:
        codecs.lookup(name)
    except LookupError:
        return default
    return name


def feed_stream(parser, stream, charset=DEFAULT_CHARSET,
                chunk_size=CHUNK_SIZE):
    """
    Reads stream in chunks of chunk_size till it is exhausted
    and feeds every chunk to parser.  The parser is not closed.

    Where read() returns bytes that are not str (Python 3), the
    chunks are decoded with charset by an incremental decoder, so
    a character whose bytes straddle two chunks is decoded correctly;
    bytes that cannot be decoded are replaced, not raised as errors.
    Chunks that are already str are fed to the parser unchanged.
    """
    decoder = codecs.getincrementaldecoder(charset)(errors='replace')
    while True:
        data = stream.read(chunk_size)
        if not data:    # '' as well as b'' mark the end of the stream
            break
        if isinstance(data, bytes) and bytes is not str:
            data = decoder.decode(data)
        parser.feed(data)
    # flush what the decoder may still hold of an incomplete character
    tail = decoder.decode(b'', final=True)
    if tail:
        parser.feed(tail)


def main(page='http://www.uic.edu/'):
    """
    Opens a web page and parses it.

    The page is the URL to open, on return all links
    found on the page have been printed to screen.
    """
    print('opening %s ...' % page)
    p = HTMLrefs()
    # closing() releases the connection even if parsing fails
    with closing(urlopen(page)) as f:
        feed_stream(p, f, _charset_of(f))
    p.close()
    print('all html links :')
    p.ShowRefs()


if __name__ == "__main__":
    main()