'''
This is a skeletal working web spider, with virtually no error-checking.

You need to pass in an URL that points to a directory
(e.g. http://www.foo.com/bar/).

The script runs on Python 2 (urllib / urlparse / htmllib) and on
Python 3 (urllib.request / urllib.parse / html.parser).
'''

import sys
import string  # not used below; kept in case other code relies on it

try:
    # Python 2 module layout, which this script was originally written for.
    # urlparse is imported first: it does not exist on Python 3, so the
    # fallback branch is taken before anything else has been bound.
    import urlparse
    import urllib
    import htmllib
    import formatter
    from cStringIO import StringIO
    _urlopen = urllib.urlopen
except ImportError:
    # Python 3: urlparse/urllib were reorganised and htmllib was removed.
    import urllib.parse as urlparse
    import urllib.request
    from html.parser import HTMLParser
    htmllib = None
    _urlopen = urllib.request.urlopen

    class _AnchorParser(HTMLParser):
        '''Stand-in for htmllib.HTMLParser that only collects <a href> values.

        Exposes the same ``anchorlist`` attribute that Retriever.getLinks()
        reads from the htmllib parser.
        '''

        def __init__(self):
            HTMLParser.__init__(self)
            self.anchorlist = []

        def handle_starttag(self, tag, attrs):
            if tag != 'a':
                return
            href = ''
            for name, value in attrs:
                # value is None for a bare attribute such as <a href>.
                if name == 'href' and value:
                    href = value.strip()
            if href:
                self.anchorlist.append(href)


def _new_parser():
    '''Return a fresh HTML parser object exposing an ``anchorlist`` list.'''
    if htmllib is not None:
        # The formatter output is thrown away; the parser is only used
        # for the anchors it records while parsing.
        writer = formatter.DumbWriter(StringIO())
        return htmllib.HTMLParser(formatter.AbstractFormatter(writer))
    return _AnchorParser()


class Retriever:
    '''Fetch a single URL and extract the links found in its HTML.'''

    def __init__(self, URL):
        self.URL = URL

    def retrieve(self):
        '''Download self.URL into self.body, then parse it for links.

        Errors raised while opening, reading or parsing propagate to the
        caller.
        '''
        self.page = _urlopen(self.URL)
        try:
            self.body = self.page.read()
        finally:
            # Release the connection even when read() fails.
            self.page.close()
        self.parse()

    def getLinks(self):
        '''Return the list of HREF values collected by the last parse().'''
        return self.parser.anchorlist

    def parse(self):
        '''Feed self.body to an HTML parser and keep it as self.parser.'''
        # We're using the parser just to get the HREFs.
        # TODO: the original note here is cut off after "We should also use
        # it to e.g. respect" -- presumably robots exclusion rules; confirm.
        body = self.body
        if htmllib is None and not isinstance(body, str):
            # On Python 3 the response body is bytes but html.parser wants
            # text. Undecodable bytes are replaced rather than raising.
            body = body.decode('utf-8', 'replace')
        self.parser = _new_parser()
        self.parser.feed(body)
        self.parser.close()


class Spider:
    '''Crawl every page reachable from startURL whose URL begins with it.

    Attributes:
        URLs    -- sorted list of all URLs seen; filled in by run().
        queue   -- URLs discovered but not yet retrieved.
        URLdict -- maps every URL seen so far to 1 (used as a set).
        include -- prefix a URL must start with in order to be followed.
    '''

    def __init__(self, startURL):
        self.URLs = []
        self.queue = [startURL]
        self.URLdict = {startURL: 1}
        self.include = startURL

    def checkInclude(self, URL):
        '''Return True when URL starts with the self.include prefix.'''
        return URL.startswith(self.include)

    def run(self):
        '''Retrieve queued pages until none are left, then fill self.URLs.'''
        while self.queue:
            URL = self.queue.pop()
            # Single-argument print: same output on Python 2 and Python 3.
            print("Retrieving: %s" % URL)
            self.getPage(URL)
        self.URLs = sorted(self.URLdict)

    def getPage(self, URL):
        '''Retrieve URL and queue every new in-site link found on it.'''
        ret = Retriever(URL)
        ret.retrieve()
        for link in ret.getLinks():
            # Handle relative links
            link = urlparse.urljoin(URL, link)
            # Drop any "#fragment": it names a position inside a page, so
            # keeping it would fetch and list the same page several times.
            link = urlparse.urldefrag(link)[0]
            print("Checking: %s" % link)
            # Make sure this is a new URL and is within the current site
            if link not in self.URLdict and self.checkInclude(link):
                self.URLdict[link] = 1
                self.queue.append(link)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit("usage: %s URL" % sys.argv[0])
    startURL = sys.argv[1]
    spider = Spider(startURL)
    spider.run()
    for URL in spider.URLs:
        print(URL)