# L-38 MCS 275 Fri 18 Apr 2008 : htmlrefs.py

# Illustration of HTMLParser: parses an html page and
# makes a list of the html pages this page refers to.

from HTMLParser import HTMLParser
from urllib import urlopen

class HTMLrefs(HTMLParser):
   """
   Makes a list of all html links.

   The values of the href attributes of all 'a' tags
   in the html fed to the parser are accumulated in
   the list self.refs, in the order of their occurrence.
   """
   def __init__(self, verbose=True):
      """
      Initializes the list of links.

      If verbose is True (the default), then the attribute
      list of every start tag is printed while parsing.
      """
      HTMLParser.__init__(self)
      self.refs = []
      self.verbose = verbose

   def handle_starttag(self, tag, attrs):
      """
      Looks for tags equal to 'a' and
      stores links for href attributes.

      On input in attrs is a list of (name, value) tuples
      with the attributes of the start tag.  An href given
      without a value has None as value and is skipped,
      because it does not refer to any page.
      """
      if self.verbose:
         print(attrs)
      if tag == 'a':
         # extend in place, instead of copying the
         # entire list of links for every 'a' tag
         self.refs.extend([value for (name, value) in attrs
                           if name == 'href' and value is not None])

   def ShowRefs(self):
      """
      Prints the HTML refs to screen,
      one link on each line.
      """
      for ref in self.refs:
         print(ref)

def main(page='http://www.uic.edu/', chunk_size=80):
   """
   Opens a web page and parses it.

   The page at the url in page is read in pieces of at
   most chunk_size characters, every piece is fed to an
   HTMLrefs parser, and at the end all html links the
   parser collected are printed.
   """
   print('opening %s ...' % page)
   f = urlopen(page)
   p = HTMLrefs()
   try:
      while True:
         data = f.read(chunk_size)
         if not data: break   # empty read marks the end of the page
         p.feed(data)
   finally:
      # release the connection, also when reading or parsing fails
      f.close()
   p.close()
   print('all html links :')
   p.ShowRefs()

# Run the demonstration only when this file is executed
# as a script, not when it is imported as a module.
if __name__ == "__main__":
   main()
