# L-38 MCS 275 Fri 18 Apr 2008 : pdfclassfiles.py

# Uses HTMLParser to find all .pdf files listed as
# attributes of an 'a' tag on an HTML page.

from HTMLParser import HTMLParser
from urllib import urlopen

class pdfFiles(HTMLParser):
   """
   Scans attributes of 'a' tags for .pdf files.

   Every attribute value of an 'a' tag that ends in '.pdf'
   is appended to the list self.pdfFiles, in the order in
   which the tags are parsed.
   """
   def __init__(self, verbose=False):
      """
      Initializes the list of .pdf files.

      If verbose is True, the attribute list of every start
      tag is printed while parsing (a debugging aid);
      by default nothing is printed during parsing.
      """
      HTMLParser.__init__(self)
      self.pdfFiles = []
      self.verbose = verbose

   def handle_starttag(self, tag, attrs):
      """
      For tags equal to 'a' looks for
      attributes ending in .pdf.

      The parameter attrs is a list of (name, value) tuples,
      as supplied by HTMLParser for each start tag.
      """
      if self.verbose:
         print(attrs)
      if tag == 'a':
         # An attribute written without a value, as in <a name>,
         # is reported with value None: skip it instead of
         # failing on len(None).
         found = [value for (_, value) in attrs
                  if value is not None and value.endswith('.pdf')]
         self.pdfFiles.extend(found)

   def ShowFiles(self):
      """
      Prints the list of files to screen.

      The list self.pdfFiles is sorted in place
      before it is printed, one file per line.
      """
      self.pdfFiles.sort()
      for name in self.pdfFiles:
         # print with one parenthesized argument gives the same
         # output as the original statement 'print name'
         print(name)

def main(page='http://www.math.uic.edu/~jan/mcs275/main.html'):
   """
   Opens a web page and parses it.

   The page at the URL in page is read in chunks and fed
   to a pdfFiles parser; the .pdf files found are printed,
   followed by their number.
   """
   print('opening %s ...' % page)
   f = urlopen(page)
   p = pdfFiles()
   try:
      while True:
         data = f.read(4096)
         # an empty read marks the end of the page
         if not data: break
         p.feed(data)
   finally:
      # release the connection, also when reading or parsing fails
      f.close()
   p.close()
   print('pdf files on ' + page + ' :')
   p.ShowFiles()
   print('number of files : %d' % len(p.pdfFiles))

# Run main() only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
   main()
