| #!/usr/bin/env python |
| """usage: %prog [options] filename |
| |
| Parse a document to a tree, with optional profiling |
| """ |
| |
| import sys |
| import os |
| import traceback |
| from optparse import OptionParser |
| |
| from html5lib import html5parser, sanitizer |
| from html5lib.tokenizer import HTMLTokenizer |
| from html5lib import treebuilders, serializer, treewalkers |
| from html5lib import constants |
| |
| def parse(): |
| optParser = getOptParser() |
| opts,args = optParser.parse_args() |
| encoding = "utf8" |
| |
| try: |
| f = args[-1] |
| # Try opening from the internet |
| if f.startswith('http://'): |
| try: |
| import urllib.request, urllib.parse, urllib.error, cgi |
| f = urllib.request.urlopen(f) |
| contentType = f.headers.get('content-type') |
| if contentType: |
| (mediaType, params) = cgi.parse_header(contentType) |
| encoding = params.get('charset') |
| except: |
| pass |
| elif f == '-': |
| f = sys.stdin |
| if sys.version_info[0] >= 3: |
| encoding = None |
| else: |
| try: |
| # Try opening from file system |
| f = open(f, "rb") |
| except IOError as e: |
| sys.stderr.write("Unable to open file: %s\n" % e) |
| sys.exit(1) |
| except IndexError: |
| sys.stderr.write("No filename provided. Use -h for help\n") |
| sys.exit(1) |
| |
| treebuilder = treebuilders.getTreeBuilder(opts.treebuilder) |
| |
| if opts.sanitize: |
| tokenizer = sanitizer.HTMLSanitizer |
| else: |
| tokenizer = HTMLTokenizer |
| |
| p = html5parser.HTMLParser(tree=treebuilder, tokenizer=tokenizer, debug=opts.log) |
| |
| if opts.fragment: |
| parseMethod = p.parseFragment |
| else: |
| parseMethod = p.parse |
| |
| if opts.profile: |
| import cProfile |
| import pstats |
| cProfile.runctx("run(parseMethod, f, encoding)", None, |
| {"run": run, |
| "parseMethod": parseMethod, |
| "f": f, |
| "encoding": encoding}, |
| "stats.prof") |
| # XXX - We should use a temp file here |
| stats = pstats.Stats('stats.prof') |
| stats.strip_dirs() |
| stats.sort_stats('time') |
| stats.print_stats() |
| elif opts.time: |
| import time |
| t0 = time.time() |
| document = run(parseMethod, f, encoding) |
| t1 = time.time() |
| if document: |
| printOutput(p, document, opts) |
| t2 = time.time() |
| sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1)) |
| else: |
| sys.stderr.write("\n\nRun took: %fs"%(t1-t0)) |
| else: |
| document = run(parseMethod, f, encoding) |
| if document: |
| printOutput(p, document, opts) |
| |
| def run(parseMethod, f, encoding): |
| try: |
| document = parseMethod(f, encoding=encoding) |
| except: |
| document = None |
| traceback.print_exc() |
| return document |
| |
| def printOutput(parser, document, opts): |
| if opts.encoding: |
| print("Encoding:", parser.tokenizer.stream.charEncoding) |
| |
| for item in parser.log: |
| print(item) |
| |
| if document is not None: |
| if opts.xml: |
| sys.stdout.write(document.toxml("utf-8")) |
| elif opts.tree: |
| if not hasattr(document,'__getitem__'): |
| document = [document] |
| for fragment in document: |
| print(parser.tree.testSerializer(fragment)) |
| elif opts.hilite: |
| sys.stdout.write(document.hilite("utf-8")) |
| elif opts.html: |
| kwargs = {} |
| for opt in serializer.HTMLSerializer.options: |
| try: |
| kwargs[opt] = getattr(opts,opt) |
| except: |
| pass |
| if not kwargs['quote_char']: |
| del kwargs['quote_char'] |
| |
| tokens = treewalkers.getTreeWalker(opts.treebuilder)(document) |
| if sys.version_info[0] >= 3: |
| encoding = None |
| else: |
| encoding = "utf-8" |
| for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding): |
| sys.stdout.write(text) |
| if not text.endswith('\n'): sys.stdout.write('\n') |
| if opts.error: |
| errList=[] |
| for pos, errorcode, datavars in parser.errors: |
| errList.append("Line %i Col %i"%pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars) |
| sys.stdout.write("\nParse errors:\n" + "\n".join(errList)+"\n") |
| |
| def getOptParser(): |
| parser = OptionParser(usage=__doc__) |
| |
| parser.add_option("-p", "--profile", action="store_true", default=False, |
| dest="profile", help="Use the hotshot profiler to " |
| "produce a detailed log of the run") |
| |
| parser.add_option("-t", "--time", |
| action="store_true", default=False, dest="time", |
| help="Time the run using time.time (may not be accurate on all platforms, especially for short runs)") |
| |
| parser.add_option("-b", "--treebuilder", action="store", type="string", |
| dest="treebuilder", default="simpleTree") |
| |
| parser.add_option("-e", "--error", action="store_true", default=False, |
| dest="error", help="Print a list of parse errors") |
| |
| parser.add_option("-f", "--fragment", action="store_true", default=False, |
| dest="fragment", help="Parse as a fragment") |
| |
| parser.add_option("", "--tree", action="store_true", default=False, |
| dest="tree", help="Output as debug tree") |
| |
| parser.add_option("-x", "--xml", action="store_true", default=False, |
| dest="xml", help="Output as xml") |
| |
| parser.add_option("", "--no-html", action="store_false", default=True, |
| dest="html", help="Don't output html") |
| |
| parser.add_option("", "--hilite", action="store_true", default=False, |
| dest="hilite", help="Output as formatted highlighted code.") |
| |
| parser.add_option("-c", "--encoding", action="store_true", default=False, |
| dest="encoding", help="Print character encoding used") |
| |
| parser.add_option("", "--inject-meta-charset", action="store_true", |
| default=False, dest="inject_meta_charset", |
| help="inject <meta charset>") |
| |
| parser.add_option("", "--strip-whitespace", action="store_true", |
| default=False, dest="strip_whitespace", |
| help="strip whitespace") |
| |
| parser.add_option("", "--omit-optional-tags", action="store_true", |
| default=False, dest="omit_optional_tags", |
| help="omit optional tags") |
| |
| parser.add_option("", "--quote-attr-values", action="store_true", |
| default=False, dest="quote_attr_values", |
| help="quote attribute values") |
| |
| parser.add_option("", "--use-best-quote-char", action="store_true", |
| default=False, dest="use_best_quote_char", |
| help="use best quote character") |
| |
| parser.add_option("", "--quote-char", action="store", |
| default=None, dest="quote_char", |
| help="quote character") |
| |
| parser.add_option("", "--no-minimize-boolean-attributes", |
| action="store_false", default=True, |
| dest="minimize_boolean_attributes", |
| help="minimize boolean attributes") |
| |
| parser.add_option("", "--use-trailing-solidus", action="store_true", |
| default=False, dest="use_trailing_solidus", |
| help="use trailing solidus") |
| |
| parser.add_option("", "--space-before-trailing-solidus", |
| action="store_true", default=False, |
| dest="space_before_trailing_solidus", |
| help="add space before trailing solidus") |
| |
| parser.add_option("", "--escape-lt-in-attrs", action="store_true", |
| default=False, dest="escape_lt_in_attrs", |
| help="escape less than signs in attribute values") |
| |
| parser.add_option("", "--escape-rcdata", action="store_true", |
| default=False, dest="escape_rcdata", |
| help="escape rcdata element values") |
| |
| parser.add_option("", "--sanitize", action="store_true", default=False, |
| dest="sanitize", help="sanitize") |
| |
| parser.add_option("-l", "--log", action="store_true", default=False, |
| dest="log", help="log state transitions") |
| |
| return parser |
| |
| if __name__ == "__main__": |
| parse() |