| import json |
| |
| import html5lib |
| |
| def parse(path="html5ents.xml"): |
| return html5lib.parse(open(path), treebuilder="lxml") |
| |
| def entity_table(tree): |
| return dict((entity_name("".join(tr[0].xpath(".//text()"))), |
| entity_characters(tr[1].text)) |
| for tr in tree.xpath("//h:tbody/h:tr", |
| namespaces={"h":"http://www.w3.org/1999/xhtml"})) |
| |
| def entity_name(inp): |
| return inp.strip() |
| |
| def entity_characters(inp): |
| return "".join(codepoint_to_character(item) |
| for item in inp.split() |
| if item) |
| |
| def codepoint_to_character(inp): |
| return ("\U000"+inp[2:]).decode("unicode-escape") |
| |
| def make_tests_json(entities): |
| test_list = make_test_list(entities) |
| tests_json = {"tests": |
| [make_test(*item) for item in test_list] |
| } |
| return tests_json |
| |
| def make_test(name, characters, good): |
| return { |
| "description":test_description(name, good), |
| "input":"&%s"%name, |
| "output":test_expected(name, characters, good) |
| } |
| |
| def test_description(name, good): |
| with_semicolon = name.endswith(";") |
| semicolon_text = {True:"with a semi-colon", |
| False:"without a semi-colon"}[with_semicolon] |
| if good: |
| text = "Named entity: %s %s"%(name, semicolon_text) |
| else: |
| text = "Bad named entity: %s %s"%(name, semicolon_text) |
| return text |
| |
| def test_expected(name, characters, good): |
| rv = [] |
| if not good or not name.endswith(";"): |
| rv.append("ParseError") |
| rv.append(["Character", characters]) |
| return rv |
| |
| def make_test_list(entities): |
| tests = [] |
| for entity_name, characters in entities.items(): |
| if entity_name.endswith(";") and not subentity_exists(entity_name, entities): |
| tests.append((entity_name[:-1], "&" + entity_name[:-1], False)) |
| tests.append((entity_name, characters, True)) |
| return sorted(tests) |
| |
| def subentity_exists(entity_name, entities): |
| for i in range(1, len(entity_name)): |
| if entity_name[:-i] in entities: |
| return True |
| return False |
| |
| def make_entities_code(entities): |
| entities_text = "\n".join(" \"%s\": u\"%s\","%( |
| name, entities[name].encode( |
| "unicode-escape").replace("\"", "\\\"")) |
| for name in sorted(entities.keys())) |
| return """entities = { |
| %s |
| }"""%entities_text |
| |
| def main(): |
| entities = entity_table(parse()) |
| tests_json = make_tests_json(entities) |
| json.dump(tests_json, open("namedEntities.test", "w"), indent=4) |
| code = make_entities_code(entities) |
| open("entities_constants.py", "w").write(code) |
| |
| if __name__ == "__main__": |
| main() |
| |