| from __future__ import absolute_import, division, unicode_literals |
| from six import text_type |
| |
| import gettext |
| _ = gettext.gettext |
| |
| try: |
| from functools import reduce |
| except ImportError: |
| pass |
| |
| from ..constants import voidElements, booleanAttributes, spaceCharacters |
| from ..constants import rcdataElements, entities, xmlEntities |
| from .. import utils |
| from xml.sax.saxutils import escape |
| |
| spaceCharacters = "".join(spaceCharacters) |
| |
| try: |
| from codecs import register_error, xmlcharrefreplace_errors |
| except ImportError: |
| unicode_encode_errors = "strict" |
| else: |
| unicode_encode_errors = "htmlentityreplace" |
| |
| encode_entity_map = {} |
| is_ucs4 = len("\U0010FFFF") == 1 |
| for k, v in list(entities.items()): |
| # skip multi-character entities |
| if ((is_ucs4 and len(v) > 1) or |
| (not is_ucs4 and len(v) > 2)): |
| continue |
| if v != "&": |
| if len(v) == 2: |
| v = utils.surrogatePairToCodepoint(v) |
| else: |
| v = ord(v) |
| if v not in encode_entity_map or k.islower(): |
| # prefer < over < and similarly for &, >, etc. |
| encode_entity_map[v] = k |
| |
| def htmlentityreplace_errors(exc): |
| if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)): |
| res = [] |
| codepoints = [] |
| skip = False |
| for i, c in enumerate(exc.object[exc.start:exc.end]): |
| if skip: |
| skip = False |
| continue |
| index = i + exc.start |
| if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]): |
| codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2]) |
| skip = True |
| else: |
| codepoint = ord(c) |
| codepoints.append(codepoint) |
| for cp in codepoints: |
| e = encode_entity_map.get(cp) |
| if e: |
| res.append("&") |
| res.append(e) |
| if not e.endswith(";"): |
| res.append(";") |
| else: |
| res.append("&#x%s;" % (hex(cp)[2:])) |
| return ("".join(res), exc.end) |
| else: |
| return xmlcharrefreplace_errors(exc) |
| |
| register_error(unicode_encode_errors, htmlentityreplace_errors) |
| |
| del register_error |
| |
| |
| class HTMLSerializer(object): |
| |
| # attribute quoting options |
| quote_attr_values = False |
| quote_char = '"' |
| use_best_quote_char = True |
| |
| # tag syntax options |
| omit_optional_tags = True |
| minimize_boolean_attributes = True |
| use_trailing_solidus = False |
| space_before_trailing_solidus = True |
| |
| # escaping options |
| escape_lt_in_attrs = False |
| escape_rcdata = False |
| resolve_entities = True |
| |
| # miscellaneous options |
| alphabetical_attributes = False |
| inject_meta_charset = True |
| strip_whitespace = False |
| sanitize = False |
| |
| options = ("quote_attr_values", "quote_char", "use_best_quote_char", |
| "omit_optional_tags", "minimize_boolean_attributes", |
| "use_trailing_solidus", "space_before_trailing_solidus", |
| "escape_lt_in_attrs", "escape_rcdata", "resolve_entities", |
| "alphabetical_attributes", "inject_meta_charset", |
| "strip_whitespace", "sanitize") |
| |
| def __init__(self, **kwargs): |
| """Initialize HTMLSerializer. |
| |
| Keyword options (default given first unless specified) include: |
| |
| inject_meta_charset=True|False |
| Whether it insert a meta element to define the character set of the |
| document. |
| quote_attr_values=True|False |
| Whether to quote attribute values that don't require quoting |
| per HTML5 parsing rules. |
| quote_char=u'"'|u"'" |
| Use given quote character for attribute quoting. Default is to |
| use double quote unless attribute value contains a double quote, |
| in which case single quotes are used instead. |
| escape_lt_in_attrs=False|True |
| Whether to escape < in attribute values. |
| escape_rcdata=False|True |
| Whether to escape characters that need to be escaped within normal |
| elements within rcdata elements such as style. |
| resolve_entities=True|False |
| Whether to resolve named character entities that appear in the |
| source tree. The XML predefined entities < > & " ' |
| are unaffected by this setting. |
| strip_whitespace=False|True |
| Whether to remove semantically meaningless whitespace. (This |
| compresses all whitespace to a single space except within pre.) |
| minimize_boolean_attributes=True|False |
| Shortens boolean attributes to give just the attribute value, |
| for example <input disabled="disabled"> becomes <input disabled>. |
| use_trailing_solidus=False|True |
| Includes a close-tag slash at the end of the start tag of void |
| elements (empty elements whose end tag is forbidden). E.g. <hr/>. |
| space_before_trailing_solidus=True|False |
| Places a space immediately before the closing slash in a tag |
| using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus. |
| sanitize=False|True |
| Strip all unsafe or unknown constructs from output. |
| See `html5lib user documentation`_ |
| omit_optional_tags=True|False |
| Omit start/end tags that are optional. |
| alphabetical_attributes=False|True |
| Reorder attributes to be in alphabetical order. |
| |
| .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation |
| """ |
| if 'quote_char' in kwargs: |
| self.use_best_quote_char = False |
| for attr in self.options: |
| setattr(self, attr, kwargs.get(attr, getattr(self, attr))) |
| self.errors = [] |
| self.strict = False |
| |
| def encode(self, string): |
| assert(isinstance(string, text_type)) |
| if self.encoding: |
| return string.encode(self.encoding, unicode_encode_errors) |
| else: |
| return string |
| |
| def encodeStrict(self, string): |
| assert(isinstance(string, text_type)) |
| if self.encoding: |
| return string.encode(self.encoding, "strict") |
| else: |
| return string |
| |
| def serialize(self, treewalker, encoding=None): |
| self.encoding = encoding |
| in_cdata = False |
| self.errors = [] |
| |
| if encoding and self.inject_meta_charset: |
| from ..filters.inject_meta_charset import Filter |
| treewalker = Filter(treewalker, encoding) |
| # WhitespaceFilter should be used before OptionalTagFilter |
| # for maximum efficiently of this latter filter |
| if self.strip_whitespace: |
| from ..filters.whitespace import Filter |
| treewalker = Filter(treewalker) |
| if self.sanitize: |
| from ..filters.sanitizer import Filter |
| treewalker = Filter(treewalker) |
| if self.omit_optional_tags: |
| from ..filters.optionaltags import Filter |
| treewalker = Filter(treewalker) |
| # Alphabetical attributes must be last, as other filters |
| # could add attributes and alter the order |
| if self.alphabetical_attributes: |
| from ..filters.alphabeticalattributes import Filter |
| treewalker = Filter(treewalker) |
| |
| for token in treewalker: |
| type = token["type"] |
| if type == "Doctype": |
| doctype = "<!DOCTYPE %s" % token["name"] |
| |
| if token["publicId"]: |
| doctype += ' PUBLIC "%s"' % token["publicId"] |
| elif token["systemId"]: |
| doctype += " SYSTEM" |
| if token["systemId"]: |
| if token["systemId"].find('"') >= 0: |
| if token["systemId"].find("'") >= 0: |
| self.serializeError(_("System identifer contains both single and double quote characters")) |
| quote_char = "'" |
| else: |
| quote_char = '"' |
| doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char) |
| |
| doctype += ">" |
| yield self.encodeStrict(doctype) |
| |
| elif type in ("Characters", "SpaceCharacters"): |
| if type == "SpaceCharacters" or in_cdata: |
| if in_cdata and token["data"].find("</") >= 0: |
| self.serializeError(_("Unexpected </ in CDATA")) |
| yield self.encode(token["data"]) |
| else: |
| yield self.encode(escape(token["data"])) |
| |
| elif type in ("StartTag", "EmptyTag"): |
| name = token["name"] |
| yield self.encodeStrict("<%s" % name) |
| if name in rcdataElements and not self.escape_rcdata: |
| in_cdata = True |
| elif in_cdata: |
| self.serializeError(_("Unexpected child element of a CDATA element")) |
| for (attr_namespace, attr_name), attr_value in token["data"].items(): |
| # TODO: Add namespace support here |
| k = attr_name |
| v = attr_value |
| yield self.encodeStrict(' ') |
| |
| yield self.encodeStrict(k) |
| if not self.minimize_boolean_attributes or \ |
| (k not in booleanAttributes.get(name, tuple()) |
| and k not in booleanAttributes.get("", tuple())): |
| yield self.encodeStrict("=") |
| if self.quote_attr_values or not v: |
| quote_attr = True |
| else: |
| quote_attr = reduce(lambda x, y: x or (y in v), |
| spaceCharacters + ">\"'=", False) |
| v = v.replace("&", "&") |
| if self.escape_lt_in_attrs: |
| v = v.replace("<", "<") |
| if quote_attr: |
| quote_char = self.quote_char |
| if self.use_best_quote_char: |
| if "'" in v and '"' not in v: |
| quote_char = '"' |
| elif '"' in v and "'" not in v: |
| quote_char = "'" |
| if quote_char == "'": |
| v = v.replace("'", "'") |
| else: |
| v = v.replace('"', """) |
| yield self.encodeStrict(quote_char) |
| yield self.encode(v) |
| yield self.encodeStrict(quote_char) |
| else: |
| yield self.encode(v) |
| if name in voidElements and self.use_trailing_solidus: |
| if self.space_before_trailing_solidus: |
| yield self.encodeStrict(" /") |
| else: |
| yield self.encodeStrict("/") |
| yield self.encode(">") |
| |
| elif type == "EndTag": |
| name = token["name"] |
| if name in rcdataElements: |
| in_cdata = False |
| elif in_cdata: |
| self.serializeError(_("Unexpected child element of a CDATA element")) |
| yield self.encodeStrict("</%s>" % name) |
| |
| elif type == "Comment": |
| data = token["data"] |
| if data.find("--") >= 0: |
| self.serializeError(_("Comment contains --")) |
| yield self.encodeStrict("<!--%s-->" % token["data"]) |
| |
| elif type == "Entity": |
| name = token["name"] |
| key = name + ";" |
| if key not in entities: |
| self.serializeError(_("Entity %s not recognized" % name)) |
| if self.resolve_entities and key not in xmlEntities: |
| data = entities[key] |
| else: |
| data = "&%s;" % name |
| yield self.encodeStrict(data) |
| |
| else: |
| self.serializeError(token["data"]) |
| |
| def render(self, treewalker, encoding=None): |
| if encoding: |
| return b"".join(list(self.serialize(treewalker, encoding))) |
| else: |
| return "".join(list(self.serialize(treewalker))) |
| |
| def serializeError(self, data="XXX ERROR MESSAGE NEEDED"): |
| # XXX The idea is to make data mandatory. |
| self.errors.append(data) |
| if self.strict: |
| raise SerializeError |
| |
| |
| def SerializeError(Exception): |
| """Error in serialized tree""" |
| pass |