blob: b841c76ce41a06f6aaab3d39bee46376458335b8 [file] [log] [blame]
from __future__ import absolute_import, division, unicode_literals
import sys
import os
import json
import re
import html5lib
from . import support
from . import test_tokenizer
p = html5lib.HTMLParser()
unnamespaceExpected = re.compile(r"^(\|\s*)<html ([^>]+)>", re.M).sub
def main(out_path):
if not os.path.exists(out_path):
sys.stderr.write("Path %s does not exist" % out_path)
sys.exit(1)
for filename in support.get_data_files('tokenizer', '*.test'):
run_file(filename, out_path)
def run_file(filename, out_path):
try:
tests_data = json.load(open(filename, "r"))
except ValueError:
sys.stderr.write("Failed to load %s\n" % filename)
return
name = os.path.splitext(os.path.split(filename)[1])[0]
output_file = open(os.path.join(out_path, "tokenizer_%s.dat" % name), "w")
if 'tests' in tests_data:
for test_data in tests_data['tests']:
if 'initialStates' not in test_data:
test_data["initialStates"] = ["Data state"]
for initial_state in test_data["initialStates"]:
if initial_state != "Data state":
# don't support this yet
continue
test = make_test(test_data)
output_file.write(test)
output_file.close()
def make_test(test_data):
if 'doubleEscaped' in test_data:
test_data = test_tokenizer.unescape_test(test_data)
rv = []
rv.append("#data")
rv.append(test_data["input"].encode("utf8"))
rv.append("#errors")
tree = p.parse(test_data["input"])
output = p.tree.testSerializer(tree)
output = "\n".join(("| " + line[3:]) if line.startswith("| ") else line
for line in output.split("\n"))
output = unnamespaceExpected(r"\1<\2>", output)
rv.append(output.encode("utf8"))
rv.append("")
return "\n".join(rv)
if __name__ == "__main__":
main(sys.argv[1])