#!/usr/bin/env python
"""Warn about HTML 4 elements whose optional end tag has been left out.

The input is first normalised (attributes stripped, tag names lowercased)
and then handed to the external ``sgrep`` tool once per element type.
``sgrep`` must be installed and on the PATH.
"""
import subprocess as sp
import re
import sys
from getopt import *

# List of all the HTML 4 elements that you can validly leave unclosed if
# you want.
elements = ("colgroup", "dd", "dt", "li", "option", "p", "td", "tfoot",
            "th", "thead", "tr")

# Compiled once at import time instead of on every clean() call.
# The opening pattern cannot match a closing tag: after "<" and optional
# whitespace it requires a letter, and "/" is not one.
_OPEN_TAG = re.compile(r'<\s*([a-z]+).*?>', re.I | re.M | re.S)
_CLOSE_TAG = re.compile(r'<\s*/\s*([a-z]+).*?>', re.I | re.M | re.S)


def cleanOpen(mo):
    """Return the bare, lowercase opening tag for match object *mo*.

    Group 1 of *mo* must hold the tag name.
    """
    return '<%s>' % mo.group(1).lower().strip()


def cleanClose(mo):
    """Return the bare, lowercase closing tag for match object *mo*.

    Group 1 of *mo* must hold the tag name.
    """
    return '</%s>' % mo.group(1).lower().strip()


def clean(data):
    """Get rid of attributes and whitespace in opening and closing tags,
    and make them all lowercase.

    Returns the rewritten string; text outside tags is left untouched.
    Because tags may shrink, character offsets in the result generally
    differ from offsets in *data*.
    """
    data = _OPEN_TAG.sub(cleanOpen, data)
    return _CLOSE_TAG.sub(cleanClose, data)


def _sgrep_unclosed(el, data):
    """Run sgrep over *data* and return the reported positions for *el*.

    The query selects every "<el>" tag that is not inside an innermost
    "<el>" .. "</el>" region. Positions are returned as a list of strings,
    exactly as sgrep printed them.

    Raises OSError if the sgrep executable cannot be started.
    """
    expr = '"<%s>" not in inner("<%s>" .. "</%s>")' % (el, el, el)
    # An argument list (no shell) avoids any quoting issues. "%i " is the
    # sgrep output format: one position followed by a space per hit.
    # universal_newlines=True selects text-mode pipes so that a str can be
    # fed to communicate() on Python 3 as well as Python 2.
    p = sp.Popen(["sgrep", "-o", "%i ", expr],
                 stdin=sp.PIPE, stdout=sp.PIPE, universal_newlines=True)
    # communicate() already waits for the process to exit.
    result = p.communicate(data)[0]
    return result.split() if result else []


def checkData(fname, data, out=sys.stdout):
    """Write one warning line to *out* per unclosed element found in *data*.

    fname -- label used in the messages (a file name, or "stdin").
    data  -- HTML text, expected to have been passed through clean().
    out   -- writable file-like object receiving the warnings.

    Raises OSError if sgrep cannot be executed.
    """
    for el in elements:
        for pos in _sgrep_unclosed(el, data):
            # Things might have moved a bit thanks to our "cleaning" of the
            # input data.  sgrep is giving positions in the cleaned version,
            # which is wrong.  If you get round to fixing it, it's probably
            # more helpful to convert everything to line numbers anyway.
            out.write("%s: Unclosed <%s> near position %s\n"
                      % (fname, el, pos))


def checkUnclosed(fname, out=sys.stdout):
    """Read the file *fname*, clean it and report unclosed elements to *out*.

    Raises IOError/OSError if the file cannot be read or sgrep cannot run.
    """
    # The context manager closes the file even if read() fails.
    with open(fname) as f:
        data = f.read()
    checkData(fname, clean(data), out)


def help():
    """Print the usage message to standard output."""
    sys.stdout.write(
        "checkUnclosed.py [-h|-v] [filename ...]\n"
        "Prints a warning message locating these validly unclosed HTML 4 "
        "elements in files:\n"
        "%s\n"
        "Reads stdin if no files. Requires sgrep is installed and in your "
        "path.\n"
        "-h this, -v version.\n" % ", ".join(elements))


def main():
    """Command-line entry point: parse options, then check files or stdin."""
    try:
        opts, args = getopt(sys.argv[1:], "hv")
    except GetoptError as err:
        # Unknown option: explain and exit with the conventional usage status
        # instead of dumping a traceback.
        sys.stderr.write("checkUnclosed.py: %s\n" % err)
        help()
        sys.exit(2)
    opts = [o[0] for o in opts]
    if "-h" in opts:
        return help()
    if "-v" in opts:
        sys.stdout.write("Version 0.2\n")
        return
    if not args:
        checkData("stdin", clean(sys.stdin.read()))
        return
    for arg in args:
        checkUnclosed(arg)


if __name__ == "__main__":
    main()