third_party/web_platform_tests/tools/html5lib/html5lib/tests/tokenizertotree.py - cobalt - Git at Google

 from __future__ import absolute_import, division, unicode_literals

 import sys
 import os
 import json
 import re

 import html5lib
 from . import support
 from . import test_tokenizer

 p = html5lib.HTMLParser()

 unnamespaceExpected = re.compile(r"^(\|\s*)<html ([^>]+)>", re.M).sub


 def main(out_path):
     if not os.path.exists(out_path):
         sys.stderr.write("Path %s does not exist" % out_path)
         sys.exit(1)

     for filename in support.get_data_files('tokenizer', '*.test'):
         run_file(filename, out_path)


 def run_file(filename, out_path):
     try:
         tests_data = json.load(open(filename, "r"))
     except ValueError:
         sys.stderr.write("Failed to load %s\n" % filename)
         return
     name = os.path.splitext(os.path.split(filename)[1])[0]
     output_file = open(os.path.join(out_path, "tokenizer_%s.dat" % name), "w")

     if 'tests' in tests_data:
         for test_data in tests_data['tests']:
             if 'initialStates' not in test_data:
                 test_data["initialStates"] = ["Data state"]

             for initial_state in test_data["initialStates"]:
                 if initial_state != "Data state":
                     # don't support this yet
                     continue
                 test = make_test(test_data)
                 output_file.write(test)

     output_file.close()


 def make_test(test_data):
     if 'doubleEscaped' in test_data:
         test_data = test_tokenizer.unescape_test(test_data)

     rv = []
     rv.append("#data")
     rv.append(test_data["input"].encode("utf8"))
     rv.append("#errors")
     tree = p.parse(test_data["input"])
     output = p.tree.testSerializer(tree)
     output = "\n".join(("| " + line[3:]) if line.startswith("|  ") else line
                        for line in output.split("\n"))
     output = unnamespaceExpected(r"\1<\2>", output)
     rv.append(output.encode("utf8"))
     rv.append("")
     return "\n".join(rv)

 if __name__ == "__main__":
     main(sys.argv[1])
	from __future__ import absolute_import, division, unicode_literals

	import sys
	import os
	import json
	import re

	import html5lib
	from . import support
	from . import test_tokenizer

	p = html5lib.HTMLParser()

	unnamespaceExpected = re.compile(r"^(\\|\s*)<html ([^>]+)>", re.M).sub


	def main(out_path):
	if not os.path.exists(out_path):
	sys.stderr.write("Path %s does not exist" % out_path)
	sys.exit(1)

	for filename in support.get_data_files('tokenizer', '*.test'):
	run_file(filename, out_path)


	def run_file(filename, out_path):
	try:
	tests_data = json.load(open(filename, "r"))
	except ValueError:
	sys.stderr.write("Failed to load %s\n" % filename)
	return
	name = os.path.splitext(os.path.split(filename)[1])[0]
	output_file = open(os.path.join(out_path, "tokenizer_%s.dat" % name), "w")

	if 'tests' in tests_data:
	for test_data in tests_data['tests']:
	if 'initialStates' not in test_data:
	test_data["initialStates"] = ["Data state"]

	for initial_state in test_data["initialStates"]:
	if initial_state != "Data state":
	# don't support this yet
	continue
	test = make_test(test_data)
	output_file.write(test)

	output_file.close()


	def make_test(test_data):
	if 'doubleEscaped' in test_data:
	test_data = test_tokenizer.unescape_test(test_data)

	rv = []
	rv.append("#data")
	rv.append(test_data["input"].encode("utf8"))
	rv.append("#errors")
	tree = p.parse(test_data["input"])
	output = p.tree.testSerializer(tree)
	output = "\n".join(("\| " + line[3:]) if line.startswith("\| ") else line
	for line in output.split("\n"))
	output = unnamespaceExpected(r"\1<\2>", output)
	rv.append(output.encode("utf8"))
	rv.append("")
	return "\n".join(rv)

	if __name__ == "__main__":
	main(sys.argv[1])