''' This is a skeletal working web spider, with virtually no error-checking. You need to pass in an URL that points to a directory (e.g. http://www.foo.com/bar/). This version adds brute-force threads, spawning and killing one thread per retrieve operation (instead of reusing threads). That means this script can run afoul of the 1.5.2 multi-CPU thread bug. ''' import sys import string import urllib import urlparse import htmllib import formatter from cStringIO import StringIO import threading import time MAX_THREADS = 10 class Retriever(threading.Thread): def __init__(self, URL): self.done = 0 self.URL = URL threading.Thread.__init__(self) def run(self): print "Retrieving:", self.URL self.page = urllib.urlopen(self.URL) self.body = self.page.read() self.page.close() self.parse() self.done = 1 def getLinks(self): return self.parser.anchorlist def parse(self): # We're using the parser just to get the HREFs # We should also use it to e.g. respect w = formatter.DumbWriter(StringIO()) f = formatter.AbstractFormatter(w) self.parser = htmllib.HTMLParser(f) self.parser.feed(self.body) self.parser.close() class Spider: def __init__(self, startURL, maxThreads): self.URLs = [] self.queue = [startURL] self.URLdict = {startURL: 1} self.include = startURL self.maxThreads = maxThreads self.numThreads = 0 self.threadList = [] def checkInclude(self, URL): return string.find(URL, self.include) == 0 def run(self): while self.queue or self.threadList: while self.queue and (self.numThreads < self.maxThreads): URL = self.queue.pop() self.getPage(URL) self.checkThreads() self.URLs = self.URLdict.keys() self.URLs.sort() def checkThreads(self): tmpNumThreads = self.numThreads for ret in self.threadList[:] : if ret.done: self.processPage(ret) self.threadList.remove(ret) self.numThreads = self.numThreads - 1 if tmpNumThreads == self.numThreads: time.sleep(1) def getPage(self, URL): ret = Retriever(URL) ret.start() self.threadList.append(ret) self.numThreads = self.numThreads + 1 def processPage(self, page): for link in page.getLinks(): # Handle relative links link = urlparse.urljoin(page.URL, link) print "Checking:", link # Make sure this is a new URL and is within the current site if ( not self.URLdict.has_key(link) ) and self.checkInclude(link): self.URLdict[link] = 1 self.queue.append(link) if __name__ == '__main__': startURL = sys.argv[1] spider = Spider(startURL, MAX_THREADS) spider.run() print for URL in spider.URLs: print URL