#!/usr/bin/env python
"""
This is a simple program I wrote to check my links on an HTML page. It does
_not_ nest and check the linked pages.

Limitations:
If your URL to be checked is a directory, like foo.com/bar, you must specify
it as foo.com/bar/ - with the trailing '/'.

The script runs unchanged on Python 2.6+ and Python 3.

Kumar
"""

import re  # noqa: F401 -- not used below; kept from the original import list
import socket
import sys
import urllib  # noqa: F401 -- not used below; kept from the original import list

try:  # Python 2 module layout
    import urllib2  # noqa: F401
    import urlparse  # noqa: F401
    from HTMLParser import HTMLParser
    from urllib2 import HTTPError, URLError, urlopen
    from urlparse import urljoin
except ImportError:  # Python 3 module layout
    from html.parser import HTMLParser
    from urllib.error import HTTPError, URLError
    from urllib.parse import urljoin
    from urllib.request import urlopen


class LinkParser(HTMLParser):
    """Collect the URLs referenced by <a>, <link> and <img> tags.

    Every URL found is resolved against ``base`` and appended to
    ``self.files`` in document order (duplicates are kept).
    """

    # Tag name -> name of the attribute that carries the tag's URL.
    _URL_ATTRS = {"a": "href", "link": "href", "img": "src"}

    def __init__(self, base):
        """Create a parser that resolves relative URLs against ``base``."""
        # Explicit base-class call: HTMLParser is an old-style class on
        # Python 2, so super() is not usable there.
        HTMLParser.__init__(self)
        self.base = base
        # Per-instance list; a class-level list would be shared by every
        # parser and keep growing across pages.
        self.files = []

    def handle_starttag(self, tag, attrs):
        """Record the URL attribute of a tag of interest, if it has one."""
        wanted = self._URL_ATTRS.get(tag)
        if wanted is None:
            return
        for name, value in attrs:
            # ``value`` is None for a valueless attribute such as <a href>;
            # there is no URL to check in that case.
            if name == wanted and value is not None:
                self.files.append(urljoin(self.base, value))

    def getlist(self):
        """Finish parsing and return the list of collected absolute URLs."""
        HTMLParser.close(self)
        return self.files


def _reason_text(reason):
    """Return a readable message for a URLError ``reason``.

    ``reason`` is either a plain string or a nested exception; for OS-level
    errors the bare ``strerror`` text is used, without the errno prefix.
    """
    strerror = getattr(reason, "strerror", None)
    if strerror:
        return str(strerror)
    return str(reason)


def get_response(urlstr):
    """Try to fetch ``urlstr`` and describe the outcome.

    Returns an empty string when the URL could be opened and its first byte
    read, otherwise a string of the form ``": <error description>"``.
    """
    try:
        conn = urlopen(urlstr)
        try:
            # One byte is enough to prove the resource is really served.
            conn.read(1)
        finally:
            conn.close()
    except HTTPError as herr:
        # Must precede URLError: HTTPError is a subclass of it.
        return ": " + str(herr)
    except URLError as err:
        return ": " + _reason_text(err.reason)
    except (ValueError, socket.error) as err:
        # Malformed URL, or a socket failure while reading; report it rather
        # than aborting the check of the remaining links.
        return ": " + str(err)
    return ""


def print_all(data, base=None):
    """Check every link found in the HTML text ``data`` and print the result.

    ``base`` is the URL the page was loaded from; relative links are resolved
    against it. One line is printed per link: ``<url>: OK!`` on success or
    ``<url>: <error>`` on failure.
    """
    parser = LinkParser(base)
    parser.feed(data)
    for link in parser.getlist():
        problem = get_response(link)
        if problem:
            print(link + problem)
        else:
            print(link + ": OK!")


def _read_page(url):
    """Download ``url`` and return its body as a native ``str``."""
    conn = urlopen(url)
    try:
        raw = conn.read()
        headers = conn.info()
    finally:
        conn.close()
    if isinstance(raw, str):
        # Python 2: read() already returns the native str type.
        return raw
    # Python 3: read() returns bytes, but HTMLParser.feed() needs text.
    charset = headers.get_content_charset() or "utf-8"
    try:
        return raw.decode(charset, "replace")
    except LookupError:
        # The server announced a charset Python does not know.
        return raw.decode("utf-8", "replace")


def main(argv=None):
    """Command-line entry point; returns the process exit status."""
    if argv is None:
        argv = sys.argv
    if len(argv) != 2:
        print("Usage: %s [httpurl]" % argv[0])
        print("Note: If you want to specify a directory like abc.foo.com/bar, specify it as")
        print("http://abc.foo.com/bar/ - with the trailing '/'")
        return 1
    try:
        data = _read_page(argv[1])
    except (URLError, ValueError, socket.error) as err:
        # The start page itself is unreachable: nothing can be checked.
        print(argv[1] + ": " + str(err))
        return 1
    print_all(data, base=argv[1])
    return 0


if __name__ == "__main__":
    sys.exit(main())