# -*- Mode: Python; tab-width: 4 -*-

"""
A coroutine-based web crawler that uses corodns, which requires a
nameserver to be running at %(NAMESERVER)s.

Usage: python crawler.py <url>
"""

#
# TODO: much better robots.txt comprehension.  Right now, the mere *existence*
# of a robots.txt file will deter this timid crawler.  This was the result of
# a deliberate decision to write a conservative robot.
#
# NOTE: url_split() separates the query string from the path and the crawler
# never re-attaches it, so 'page?id=1' and 'page?id=2' are both fetched as
# 'page'.
#

crawler_debug = 0

USER_AGENT_STRING = 'Python coro-crawler http://www.egroups.com/group/python-coro/'
NAMESERVER = '127.0.0.1'
VERSION_STRING = '$Id: //depot/main/findmail/src/coroutine/crawler.py#4 $'

import string
import coro
import corodns
import coutil
import formatter
import htmllib
import urllib
import urlparse
import re
import socket

# URIs whose path *ends* in one of these extensions are never fetched.
# The trailing '$' matters: without it '/a.psx.html' would count as binary.
binary_files = re.compile (r'.*\.(gif|jpg|jpeg|gz|tar|pdf|ps)$', re.IGNORECASE)


def request_text (uri, host=None):
    """Build the text of an HTTP/1.0 GET request for `uri`.

    `host`, when given, is sent as the Host header so that name-based
    virtual hosts answer for the right site.  Omitting it produces the
    same request the crawler has always sent.
    """
    lines = ['GET %s HTTP/1.0' % uri]
    if host:
        lines.append ('Host: %s' % host)
    lines.append ('User-Agent: %s' % USER_AGENT_STRING)
    lines.append ('Connection: close')
    # two empty strings yield the blank line that terminates the header
    lines.append ('')
    lines.append ('')
    return string.join (lines, '\r\n')


def get_tld (host):
    """Return the last two dot-separated labels of `host`."""
    return string.join (string.split (host, '.')[-2:], '.')


def make_tld_filter (tld):
    """Return a url filter accepting non-binary URIs on hosts under `tld`.

    The filter takes (host, port, uri) and returns a true value to accept.
    """
    # Lower-case once here; the filter compares against a lower-cased host.
    tld = string.lower (tld)

    def tld_filter (host, port, uri, tld=tld):
        if binary_files.match (uri):
            return 0
        return tld == string.lower (get_tld (host))
    return tld_filter


def url_split (url):
    """Split `url` into (urltype, host, port, uri, query, tag).

    `port` is always an int (80 when the URL names none) unless the URL has
    no host part, in which case both `host` and `port` are None.  `uri` is
    never empty: a URL with no path yields '/'.
    """
    urltype, url = urllib.splittype (url)
    host, uri = urllib.splithost (url)
    if host is None:
        # no '//host' part (e.g. 'http:foo.html'); splitport() cannot take None
        port = None
    else:
        host, port = urllib.splitport (host)
        if port is None:
            port = 80
        else:
            # splitport() hands back a string; normalise so that cache keys
            # and '%d' formatting behave the same as for the default port
            port = int (port)
    uri, query = urllib.splitquery (uri)
    uri, tag = urllib.splittag (uri)
    if not uri:
        uri = '/'
    return urltype, host, port, uri, query, tag


# lower-cased host name -> result of corodns.gethostbyname()
cache = {}

def gethostbyname (host):
    """Resolve `host` through corodns, caching the answer per host name."""
    host = string.lower (host)
    if not cache.has_key (host):
        cache[host] = corodns.gethostbyname (host)
    return cache[host]


def get_content_type (header):
    """Return the value of the Content-Type field in `header`, or None.

    `header` is the raw response header with lines separated by CRLF.
    Surrounding whitespace is stripped from the value; any parameters
    (e.g. '; charset=...') are left in place.
    """
    for line in string.split (header, '\r\n'):
        i = string.find (line, ':')
        if i != -1:
            name = string.lower (string.strip (line[:i]))
            if name == 'content-type':
                return string.strip (line[i+1:])
    return None


def _is_html (content_type):
    """Return true if a Content-Type value names the text/html media type."""
    if content_type is None:
        return 0
    media_type = string.split (content_type, ';')[0]
    return string.lower (string.strip (media_type)) == 'text/html'


def _host_header (host, port):
    """Return the Host header value for `host`:`port` (port omitted if 80)."""
    if int (port) == 80:
        return host
    return '%s:%d' % (host, int (port))


def _recv_header (s):
    """Read from socket `s` until the blank line that ends the header.

    Returns (header, rest): the header text without the terminating blank
    line, and whatever body bytes arrived in the same reads.  Returns
    (None, '') if the peer closed the connection before the header ended.
    """
    data = ''
    while 1:
        block = s.recv (8192)
        if not block:
            return None, ''
        data = data + block
        end = string.find (data, '\r\n\r\n')
        if end != -1:
            return data[:end], data[end+4:]


def _status_code (header):
    """Return the numeric status code from the status line, or None."""
    fields = string.split (string.split (header, '\r\n')[0])
    if len (fields) >= 2:
        try:
            return int (fields[1])
        except ValueError:
            pass
    return None


# (host, port) -> true if that server may be crawled
cache_robots_ok = {}

def robots_ok (host, port):
    """Return true if the server at (host, port) may be crawled.

    The answer comes from no_robots_file() and is cached per (host, port).
    """
    addr = host, port
    if not cache_robots_ok.has_key (addr):
        allowed = no_robots_file (host, port)
        cache_robots_ok[addr] = allowed
        if crawler_debug:
            if allowed:
                print ("No robots.txt file ... continuing.")
            else:
                print ("A robots.txt file ... stopping.")
    return cache_robots_ok[addr]


def no_robots_file (host, port):
    """Return true if we can positively verify that 'robots.txt' does not exist.

    Returns 0 when the host does not resolve, when no complete or parsable
    response header arrives, or when the server answers with a 2xx status.
    Any other status is taken to mean there is no robots.txt.
    """
    uri = 'http://%s:%d/robots.txt' % (host, int (port))
    if crawler_debug: print (' grabbing %s....' % uri)
    # lookup IP address
    if crawler_debug: print (' looking up ip..')
    ip = gethostbyname (host)
    if crawler_debug: print (' got it %s' % (ip,))
    if not ip:
        return 0
    s = coro.make_socket (socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.connect ((ip, int (port)))
        # origin-form request target plus Host header, as origin servers expect
        s.send (request_text ('/robots.txt', _host_header (host, port)))
        header, rest = _recv_header (s)
    finally:
        # always release the socket, including when connect/send/recv raise
        s.close ()
    if header is None:
        return 0
    status = _status_code (header)
    if status is None:
        # cannot tell what the server said, so nothing was verified
        return 0
    if 200 <= status < 300:
        # any 'successful' response means a robots.txt is being served
        return 0
    # well, then, it don't exist:
    return 1


pending = coutil.object_queue ()   # queue of (host, port, uri, url_filter)
fetches = {}                       # (host, port, uri) -> 1 once queued or fetched
fetch_count = 0                    # URLs taken off the queue, all workers
working = {}                       # (host, port, uri) -> None while in progress


class NextURL (Exception):
    """Raised while handling one URL to abandon it and move to the next."""


def _enqueue_links (anchors, base_url, url_filter):
    """Queue every not-yet-seen http link in `anchors` that passes `url_filter`.

    Relative links are resolved against `base_url`.  A false `url_filter`
    means no filtering.
    """
    for a in anchors:
        urltype, rest = urllib.splittype (a)
        if urltype is None:
            next_url = urlparse.urljoin (base_url, rest)
        elif urltype == 'http':
            next_url = a
        else:
            next_url = None
        if next_url:
            (urltype, host, port, uri, query, tag) = url_split (next_url)
            if urltype == 'http' and host:
                key = (host, port, uri)
                if not fetches.has_key (key):
                    if (not url_filter) or url_filter (host, port, uri):
                        fetches[key] = 1
                        pending.push ((host, port, uri, url_filter))


def _fetch_one (n, count, host, port, uri, url_filter):
    """Fetch one page and queue the links found in it.

    `n` is the worker number and `count` the number of URLs this worker has
    handled; both are only used for the progress line.  Raises NextURL when
    the URL should be skipped (robots.txt present, host does not resolve,
    incomplete response header, or content that is not text/html).
    """
    # keep a non-default port in the base so relative links resolve to it
    if int (port) == 80:
        base = 'http://%s' % host
    else:
        base = 'http://%s:%d' % (host, int (port))
    url = urlparse.urljoin (base, uri)
    print ('%4d:%4d:%4d:%4d %s %s' % (fetch_count, n, count, len (pending), host, uri))
    if not robots_ok (host, port):
        raise NextURL
    fetches[(host, port, uri)] = 1
    # lookup IP address
    if crawler_debug: print ('looking up ip..')
    ip = gethostbyname (host)
    if crawler_debug: print ('got it %s' % (ip,))
    if not ip:
        print ('No IP for %s' % host)
        # skip this URL only; the worker itself must keep running
        raise NextURL
    p = htmllib.HTMLParser (formatter.NullFormatter ())
    s = coro.make_socket (socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.connect ((ip, int (port)))
        s.send (request_text (uri, _host_header (host, port)))
        header, rest = _recv_header (s)
        if header is None:
            raise NextURL
        if not _is_html (get_content_type (header)):
            raise NextURL
        p.feed (rest)
        while 1:
            block = s.recv (8192)
            if not block:
                break
            p.feed (block)
    finally:
        # close on every path, including NextURL and network errors
        s.close ()
    if crawler_debug: print ("p.anchorlist has length %d" % len (p.anchorlist))
    _enqueue_links (p.anchorlist, url, url_filter)
    p.close ()


def fetcher (n):
    """Worker loop: take URLs off `pending` forever and crawl each one.

    `n` is this worker's number, shown in the progress output.  Any error
    while handling a URL is reported with a traceback and the worker moves
    on; KeyboardInterrupt and SystemExit are allowed to propagate.
    """
    global fetch_count
    count = 0
    while 1:
        if (len (pending) == 0) and (len (working) == 0):
            print ('done. hit ctrl-c, fred')
        (host, port, uri, url_filter) = pending.pop ()
        work_key = (host, port, uri)
        working[work_key] = None
        fetch_count = fetch_count + 1
        count = count + 1
        try:
            try:
                _fetch_one (n, count, host, port, uri, url_filter)
            except NextURL:
                pass
            except (KeyboardInterrupt, SystemExit):
                raise
            except:
                # deliberate best-effort: one bad page must not stop the worker
                import traceback
                traceback.print_exc ()
        finally:
            # always clear the in-progress marker so 'done' can be detected
            del working[work_key]


def test (url):
    """Crawl from `url`, staying inside its two-label domain.

    Starts 20 fetcher coroutines plus a backdoor server and runs the coro
    event loop.  Raises ValueError unless `url` is an http URL with a host.
    """
    import backdoor
    corodns.initialize (NAMESERVER)
    (urltype, host, port, uri, query, tag) = url_split (url)
    if urltype != 'http' or not host:
        raise ValueError ("http only, please")
    pending.push ((host, port, uri, make_tld_filter (get_tld (host))))
    for i in range (20):
        coro.spawn (fetcher, i)
    coro.spawn (backdoor.serve)
    coro.event_loop (30.0)


if __name__ == '__main__':
    import sys
    if len (sys.argv) < 2:
        print (__doc__ % vars ())
    else:
        test (sys.argv[1])