|
Version 0.2:
from html.parser import HTMLParser
class HTMLFormatter(HTMLParser):
    """Re-indent an HTML document, writing one tag or text run per line.

    Feed markup with ``feed()`` and collect the result with ``render()``.
    Every start tag, end tag, comment, declaration, processing instruction
    and non-blank text run is written on its own line, preceded by one tab
    per currently open element.

    Attributes:
        tabbed: Current nesting depth, i.e. the number of tabs written
            before the next line.
        formatted: Output fragments produced so far, in document order.
    """

    # Elements that never take content or an end tag. A bare ``<br>`` must
    # not deepen the indentation of everything that follows it.
    VOID_ELEMENTS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))

    def __init__(self):
        # convert_charrefs=False makes the parser report "&lt;" / "&#60;"
        # through handle_entityref/handle_charref instead of decoding them
        # into the text, so references survive formatting unchanged.
        HTMLParser.__init__(self, convert_charrefs=False)
        self.tabbed = 0
        self.formatted = []
        # Text and references seen since the last tag; written as one line.
        self._text = []

    def append(self, data):
        """Append ``str(data)`` to the output fragments."""
        self.formatted.append(str(data))

    def _write_tabs(self):
        """Append the indentation for the current nesting depth."""
        self.append('\t' * self.tabbed)

    def _flush_text(self):
        """Write the buffered text run, if any, as a single indented line."""
        if not self._text:
            return
        text = "".join(self._text).strip()
        self._text = []
        if text:
            self._write_tabs()
            self.append(text + '\n')

    def _format_attrs(self, attrs):
        """Return ``attrs`` rendered as `` name="value"`` pairs.

        Args:
            attrs: Iterable of ``(name, value)`` pairs as passed to
                ``handle_starttag``; ``value`` is ``None`` for an attribute
                written without a value (e.g. ``disabled``).

        Returns:
            The attribute string, with a leading space before each
            attribute, or ``""`` when there are none.
        """
        parts = []
        for name, value in attrs:
            if value is None:
                # Valueless attribute: emit the bare name.
                parts.append(" " + name)
                continue
            # The parser hands values over with references already
            # replaced, so '&' and '"' have to be escaped again.
            escaped = value.replace('&', '&amp;').replace('"', '&quot;')
            parts.append(" " + name + '="' + escaped + '"')
        return "".join(parts)

    def _format_tag(self, tag, ttype='start', ats=None):
        """Return the markup for one tag.

        Args:
            tag: Tag name.
            ttype: ``'start'`` for ``<tag>``, ``'end'`` for ``</tag>`` or
                ``'self'`` for ``<tag />``.
            ats: Optional ``(name, value)`` attribute pairs.
        """
        pieces = ['<']
        if ttype == 'end':
            pieces.append('/')
        pieces.append(tag)
        if ats:
            pieces.append(self._format_attrs(ats))
        if ttype == 'self':
            pieces.append(' /')
        pieces.append('>')
        return "".join(pieces)

    def handle_starttag(self, tag, attrs):
        """Write a start tag and indent what follows, except after void tags."""
        self._flush_text()
        self._write_tabs()
        if tag not in self.VOID_ELEMENTS:
            self.tabbed = self.tabbed + 1
        self.append(self._format_tag(tag, ats=attrs) + '\n')

    def handle_endtag(self, tag):
        """Write an end tag one level shallower than its content."""
        self._flush_text()
        if tag not in self.VOID_ELEMENTS:
            # Clamp at zero so a stray end tag cannot leave the depth
            # negative and under-indent the rest of the document.
            self.tabbed = max(self.tabbed - 1, 0)
        self._write_tabs()
        self.append(self._format_tag(tag, ttype='end') + '\n')

    def handle_startendtag(self, tag, attrs):
        """Write a self-closing tag (``<tag />``) at the current depth."""
        self._flush_text()
        self._write_tabs()
        self.append(self._format_tag(tag, ttype='self', ats=attrs) + '\n')

    def handle_data(self, data):
        """Buffer text; it is stripped and written when the run ends."""
        self._text.append(data)

    def handle_charref(self, name):
        """Buffer a numeric character reference as ``&#name;``."""
        self._text.append('&#' + name + ';')

    def handle_entityref(self, name):
        """Buffer a named entity reference as ``&name;``."""
        self._text.append('&' + name + ';')

    def handle_comment(self, data):
        """Write a comment on its own line."""
        self._flush_text()
        self._write_tabs()
        self.append('<!--' + data + '-->' + '\n')

    def handle_decl(self, decl):
        """Write a declaration such as a doctype on its own line."""
        self._flush_text()
        self._write_tabs()
        self.append('<!' + decl + '>' + '\n')

    def handle_pi(self, data):
        """Write a processing instruction on its own line."""
        self._flush_text()
        self._write_tabs()
        self.append('<?' + data + '>' + '\n')

    def render(self):
        """Return everything formatted so far as a single string."""
        # Text after the last tag is still buffered; write it out first.
        self._flush_text()
        return "".join(self.formatted)
if __name__ == "__main__":
    # Command-line use: python htmlformatter.py INPUT OUTPUT
    import sys

    if len(sys.argv) == 3:
        try:
            n = HTMLFormatter()
            # "with" closes each file even when reading or writing fails.
            with open(sys.argv[1], 'r') as f:
                n.feed(f.read())
            # Make the parser process any input it is still buffering.
            n.close()
            with open(sys.argv[2], 'w') as f:
                f.write(n.render())
        except IOError:
            print("Failed opening or writing to files '{0}', '{1}'".format(sys.argv[1], sys.argv[2]))
    else:
        print("Wrong number of arguments")
Anyway, to run it, save this as htmlformatter.py (or any name you choose), then run:
python htmlformatter.py file1.html file2.html
Or you can use it as a library:
from htmlformatter import HTMLFormatter
n = HTMLFormatter()
n.feed(some_html)
print(n.render())
To Do:
- Fix the way it handles data, comments and entity refs
- Make it auto fix tags that should be self closing but aren't
- Close tags that only have space or small data in them on the same line instead of 2-3 separate lines
I'm open to ideas and suggestions for improvements.
Last Edit: Jun 21, 2009 16:42:57 GMT by Eric
|
|
|