blob: 6266b02dfb950b8e4f4acf687633816f28d61a1c [file] [log] [blame]
# Copyright 2019 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Utility functions for parsing XML strings into ElementTree nodes."""
import sys
import xml.etree.ElementTree as ET
import xml.sax
class _FirstTagFoundError(Exception):
"""Raised when the first tag is found in an XML document.
This isn't actually an error. Raising this exception is how we end parsing XML
documents early.
"""
pass
class _FirstTagFinder(xml.sax.ContentHandler):
"""An XML SAX parser that raises as soon as a tag is found.
Call getFirstTagLine to determine which line the tag was found on.
"""
def __init__(self):
xml.sax.ContentHandler.__init__(self)
self.first_tag_line = 0
self.first_tag_column = 0
def GetFirstTagLine(self):
return self.first_tag_line
def GetFirstTagColumn(self):
return self.first_tag_column
def setDocumentLocator(self, locator):
self.location = locator
def startElement(self, tag, attributes):
del tag, attributes # Unused.
# Now that the first tag is found, remember the location of it.
self.first_tag_line = self.location.getLineNumber()
self.first_tag_column = self.location.getColumnNumber()
# End parsing by throwing.
raise _FirstTagFoundError()
class _CommentedXMLParser(ET.XMLParser):
"""A Python 2 compatible ElementTree builder that preserves comments."""
def __init__(self, *args, **kwargs):
super(_CommentedXMLParser, self).__init__(*args, **kwargs)
self._parser.CommentHandler = self.comment
def comment(self, data): # pylint: disable=invalid-name
self._target.start(ET.Comment, {})
self._target.data(data)
self._target.end(ET.Comment)
def GetTopLevelContent(file_content):
"""Returns a string of all the text in the xml file before the first tag."""
handler = _FirstTagFinder()
first_tag_line = 0
first_tag_column = 0
try:
xml.sax.parseString(file_content.encode('utf-8'), handler)
except _FirstTagFoundError:
# This is the expected case, it means a tag was found in the doc.
first_tag_line = handler.GetFirstTagLine()
first_tag_column = handler.GetFirstTagColumn()
if first_tag_line == 0 and first_tag_column == 0:
return ''
char = 0
for _ in range(first_tag_line - 1):
char = file_content.index('\n', char) + 1
char += first_tag_column - 1
# |char| is now pointing at the final character before the opening tag '<'.
top_content = file_content[:char + 1].strip()
if not top_content:
return ''
return top_content + '\n\n'
def ParseXMLString(raw_xml):
"""Parses raw_xml and returns an ElementTree node that includes comments."""
if sys.version_info.major == 2:
return ET.fromstring(raw_xml.encode('utf-8'), _CommentedXMLParser())
else:
if sys.version_info >= (3, 8, 0):
tree_builder = ET.TreeBuilder(insert_comments=True)
else:
tree_builder = ET.TreeBuilder()
return ET.fromstring(raw_xml, ET.XMLParser(target=tree_builder))