#!/usr/bin/env python
"""usage: %prog [options] filename

Parse a document to a tree, with optional profiling
"""

import sys
import os
import traceback
from optparse import OptionParser

from html5lib import html5parser, sanitizer
from html5lib.tokenizer import HTMLTokenizer
from html5lib import treebuilders, serializer, treewalkers
from html5lib import constants

def _openInput(name):
    """Open the input named on the command line.

    ``name`` is one of:

    * an ``http://`` or ``https://`` URL, fetched with ``urllib.request``;
    * ``-``, meaning standard input;
    * anything else, treated as a filesystem path and opened in binary mode.

    Returns a ``(stream, encoding, ownsStream)`` tuple.  ``encoding`` is the
    value to hand to the parser and ``ownsStream`` is True when the caller is
    responsible for closing ``stream`` (i.e. it is not ``sys.stdin``).

    Writes a message to stderr and exits with status 1 if the URL or file
    cannot be opened.
    """
    encoding = "utf8"

    if name.startswith(('http://', 'https://')):
        import http.client
        import urllib.request
        try:
            stream = urllib.request.urlopen(name)
        except (OSError, ValueError, http.client.HTTPException) as e:
            # urllib.error.URLError is an OSError subclass.  Failing loudly
            # here avoids silently parsing the URL text itself as the document.
            sys.stderr.write("Unable to open URL: %s\n" % e)
            sys.exit(1)
        if stream.headers.get('content-type'):
            # The charset parameter of Content-Type, or None when the header
            # does not carry one.
            encoding = stream.headers.get_content_charset()
        return stream, encoding, True

    if name == '-':
        if sys.version_info[0] >= 3:
            # NOTE(review): presumably because sys.stdin is already a decoded
            # text stream on Python 3 -- confirm against the parser.
            encoding = None
        return sys.stdin, encoding, False

    try:
        stream = open(name, "rb")
    except IOError as e:
        sys.stderr.write("Unable to open file: %s\n" % e)
        sys.exit(1)
    return stream, encoding, True


def parse():
    """Command-line entry point: parse the named input and report on it.

    Reads options from ``sys.argv`` via ``getOptParser()``; the last positional
    argument names the input (URL, ``-`` for stdin, or a file path).  Depending
    on the options the parse is run plainly, timed with ``time.time``, or
    profiled with ``cProfile`` (statistics are written to ``stats.prof`` and
    printed).  Output is produced by ``printOutput`` except in profile mode.

    Exits with status 1 when no input is named or it cannot be opened.
    """
    optParser = getOptParser()
    opts, args = optParser.parse_args()

    if not args:
        sys.stderr.write("No filename provided. Use -h for help\n")
        sys.exit(1)

    f, encoding, ownsStream = _openInput(args[-1])
    try:
        treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)

        if opts.sanitize:
            tokenizer = sanitizer.HTMLSanitizer
        else:
            tokenizer = HTMLTokenizer

        p = html5parser.HTMLParser(tree=treebuilder, tokenizer=tokenizer,
                                   debug=opts.log)

        if opts.fragment:
            parseMethod = p.parseFragment
        else:
            parseMethod = p.parse

        if opts.profile:
            import cProfile
            import pstats
            cProfile.runctx("run(parseMethod, f, encoding)", None,
                            {"run": run,
                             "parseMethod": parseMethod,
                             "f": f,
                             "encoding": encoding},
                            "stats.prof")
            # XXX - We should use a temp file here
            stats = pstats.Stats('stats.prof')
            stats.strip_dirs()
            stats.sort_stats('time')
            stats.print_stats()
        elif opts.time:
            import time
            t0 = time.time()
            document = run(parseMethod, f, encoding)
            t1 = time.time()
            # run() returns None on failure; compare against None explicitly so
            # that a valid tree object which happens to be falsy is still shown.
            if document is not None:
                printOutput(p, document, opts)
                t2 = time.time()
                sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1))
            else:
                sys.stderr.write("\n\nRun took: %fs"%(t1-t0))
        else:
            document = run(parseMethod, f, encoding)
            if document is not None:
                printOutput(p, document, opts)
    finally:
        # Release the file / HTTP response we opened; never close sys.stdin.
        if ownsStream:
            f.close()

def run(parseMethod, f, encoding):
    """Call ``parseMethod(f, encoding=encoding)`` and return its result.

    If the call raises an ordinary exception, the traceback is printed with
    ``traceback.print_exc()`` and ``None`` is returned instead.  Exceptions
    that do not derive from ``Exception`` (``KeyboardInterrupt``,
    ``SystemExit``) are deliberately allowed to propagate so the user can
    still abort a long parse.
    """
    try:
        return parseMethod(f, encoding=encoding)
    except Exception:
        traceback.print_exc()
        return None

def _writeText(data):
    """Write ``data`` to stdout, decoding UTF-8 bytes first on Python 3.

    On Python 2 ``bytes`` is ``str`` so the data is written unchanged.
    """
    if isinstance(data, bytes) and not isinstance(data, str):
        data = data.decode("utf-8")
    sys.stdout.write(data)


def printOutput(parser, document, opts):
    """Print the results of a parse to stdout according to ``opts``.

    In order:

    * if ``opts.encoding`` is set, the character encoding recorded on the
      parser's tokenizer stream;
    * every entry of ``parser.log``;
    * the document itself (when it is not None), in the first enabled format
      of: ``opts.xml`` (``document.toxml``), ``opts.tree``
      (``parser.tree.testSerializer``), ``opts.hilite`` (``document.hilite``),
      ``opts.html`` (``serializer.HTMLSerializer``);
    * if ``opts.error`` is set, one line per entry of ``parser.errors``.
    """
    if opts.encoding:
        print("Encoding:", parser.tokenizer.stream.charEncoding)

    for item in parser.log:
        print(item)

    if document is not None:
        if opts.xml:
            _writeText(document.toxml("utf-8"))
        elif opts.tree:
            # A single node is wrapped so that it is handled the same way as
            # a sequence of fragment nodes.
            if not hasattr(document, '__getitem__'):
                document = [document]
            for fragment in document:
                print(parser.tree.testSerializer(fragment))
        elif opts.hilite:
            _writeText(document.hilite("utf-8"))
        elif opts.html:
            # Forward every serializer option that also exists as a
            # command-line option.
            kwargs = {}
            for opt in serializer.HTMLSerializer.options:
                if hasattr(opts, opt):
                    kwargs[opt] = getattr(opts, opt)
            # An unset quote character must not be passed on at all.
            if not kwargs.get('quote_char'):
                kwargs.pop('quote_char', None)

            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
            if sys.version_info[0] >= 3:
                encoding = None
            else:
                encoding = "utf-8"
            # Remember the last chunk so a final newline can be added; start
            # with an empty one so an empty serialization still ends the line.
            lastChunk = ""
            for lastChunk in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
                sys.stdout.write(lastChunk)
            if not lastChunk.endswith('\n'):
                sys.stdout.write('\n')
    if opts.error:
        errList = [
            "Line %i Col %i" % pos + " " +
            constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars
            for pos, errorcode, datavars in parser.errors
        ]
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList)+"\n")

def getOptParser():
    """Build and return the ``OptionParser`` for the command-line interface.

    The module docstring is used as the usage text.  Every option is either an
    on/off switch or a string value; the ``dest`` names of the serializer
    options match the option names that ``printOutput`` forwards to the
    HTML serializer.
    """
    parser = OptionParser(usage=__doc__)

    # --- How to run the parse -------------------------------------------
    parser.add_option("-p", "--profile", action="store_true", default=False,
                      dest="profile", help="Use the cProfile profiler to "
                      "produce a detailed log of the run")

    parser.add_option("-t", "--time",
                      action="store_true", default=False, dest="time",
                      help="Time the run using time.time (may not be accurate on all platforms, especially for short runs)")

    parser.add_option("-b", "--treebuilder", action="store", type="string",
                      dest="treebuilder", default="simpleTree",
                      help="Treebuilder to use (default: %default)")

    parser.add_option("-e", "--error", action="store_true", default=False,
                      dest="error", help="Print a list of parse errors")

    parser.add_option("-f", "--fragment", action="store_true", default=False,
                      dest="fragment", help="Parse as a fragment")

    # --- Output format ---------------------------------------------------
    parser.add_option("--tree", action="store_true", default=False,
                      dest="tree", help="Output as debug tree")

    parser.add_option("-x", "--xml", action="store_true", default=False,
                      dest="xml", help="Output as xml")

    # HTML output is on by default; this switch turns it off.
    parser.add_option("--no-html", action="store_false", default=True,
                      dest="html", help="Don't output html")

    parser.add_option("--hilite", action="store_true", default=False,
                      dest="hilite", help="Output as formatted highlighted code.")

    parser.add_option("-c", "--encoding", action="store_true", default=False,
                      dest="encoding", help="Print character encoding used")

    # --- Serializer options ----------------------------------------------
    parser.add_option("--inject-meta-charset", action="store_true",
                      default=False, dest="inject_meta_charset",
                      help="inject <meta charset>")

    parser.add_option("--strip-whitespace", action="store_true",
                      default=False, dest="strip_whitespace",
                      help="strip whitespace")

    parser.add_option("--omit-optional-tags", action="store_true",
                      default=False, dest="omit_optional_tags",
                      help="omit optional tags")

    parser.add_option("--quote-attr-values", action="store_true",
                      default=False, dest="quote_attr_values",
                      help="quote attribute values")

    parser.add_option("--use-best-quote-char", action="store_true",
                      default=False, dest="use_best_quote_char",
                      help="use best quote character")

    parser.add_option("--quote-char", action="store",
                      default=None, dest="quote_char",
                      help="quote character")

    # Minimizing is on by default; this switch turns it off.
    parser.add_option("--no-minimize-boolean-attributes",
                      action="store_false", default=True,
                      dest="minimize_boolean_attributes",
                      help="don't minimize boolean attributes")

    parser.add_option("--use-trailing-solidus", action="store_true",
                      default=False, dest="use_trailing_solidus",
                      help="use trailing solidus")

    parser.add_option("--space-before-trailing-solidus",
                      action="store_true", default=False,
                      dest="space_before_trailing_solidus",
                      help="add space before trailing solidus")

    parser.add_option("--escape-lt-in-attrs", action="store_true",
                      default=False, dest="escape_lt_in_attrs",
                      help="escape less than signs in attribute values")

    parser.add_option("--escape-rcdata", action="store_true",
                      default=False, dest="escape_rcdata",
                      help="escape rcdata element values")

    # --- Tokenizer / debugging -------------------------------------------
    parser.add_option("--sanitize", action="store_true", default=False,
                      dest="sanitize", help="sanitize")

    parser.add_option("-l", "--log", action="store_true", default=False,
                      dest="log", help="log state transitions")

    return parser

# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    parse()
