blob: a50c295dc24833ac3cf3ff673c9ddb6bdc12ad8c [file] [log] [blame]
/*
* Copyright 2015 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cobalt/dom_parser/libxml_html_parser_wrapper.h"
#include "base/basictypes.h"
#include "base/string_util.h"
namespace cobalt {
namespace dom_parser {
namespace {
// Libxml SAX handler.
// http://www.xmlsoft.org/html/libxml-tree.html#xmlSAXHandler
// NOTE: Please read about XXE attacks before implementing handler fields such
// as resolveEntity and entityDecl.
htmlSAXHandler html_sax_handler = {
NULL, /* internalSubset */
NULL, /* isStandalone */
NULL, /* hasInternalSubset */
NULL, /* hasExternalSubset */
NULL, /* resolveEntity */
NULL, /* getEntity */
NULL, /* entityDecl */
NULL, /* notationDecl */
NULL, /* attributeDecl */
NULL, /* elementDecl */
NULL, /* unparsedEntityDecl */
NULL, /* setDocumentLocator */
&StartDocument, /* startDocument */
&EndDocument, /* endDocument */
&StartElement, /* startElement */
&EndElement, /* endElement */
NULL, /* reference */
&Characters, /* characters */
NULL, /* ignorableWhitespace */
NULL, /* processingInstruction */
&Comment, /* comment */
&ParserWarning, /* xmlParserWarning */
&ParserError, /* xmlParserError */
&ParserFatal, /* xmlParserFatalError */
NULL, /* getParameterEntity */
NULL, /* cdataBlock */
NULL, /* externalSubset */
1, /* initialized */
NULL, /* private */
NULL, /* startElementNsSAX2Func */
NULL, /* endElementNsSAX2Func */
NULL /* xmlStructuredErrorFunc */
};
} // namespace
//////////////////////////////////////////////////////////////////
// LibxmlHTMLParserWrapper
//////////////////////////////////////////////////////////////////
LibxmlHTMLParserWrapper::~LibxmlHTMLParserWrapper() {
if (html_parser_context_) {
htmlFreeParserCtxt(html_parser_context_);
}
}
void LibxmlHTMLParserWrapper::OnStartElement(
const std::string& name, const ParserAttributeVector& attributes) {
// Implied tags generated by Libxml should be ignored when parsing fragment.
if (!IsFullDocument() && (name == "html" || name == "body")) {
return;
}
LibxmlParserWrapper::OnStartElement(name, attributes);
}
void LibxmlHTMLParserWrapper::OnEndElement(const std::string& name) {
// Implied tags generated by Libxml should be ignored when parsing fragment.
if (!IsFullDocument() && (name == "html" || name == "body")) {
return;
}
LibxmlParserWrapper::OnEndElement(name);
}
void LibxmlHTMLParserWrapper::DecodeChunk(const char* data, size_t size) {
if (size == 0) {
return;
}
if (!IsStringUTF8(std::string(data, size))) {
static const char* kWarningNotUTF8 = "Ignoring non-UTF8 HTML input.";
OnParsingIssue(kWarning, kWarningNotUTF8);
return;
}
if (!html_parser_context_) {
// Suppress emitting a <p> element at the root level. This is needed to
// prevent a <p> tag being added to text at the root level, for example
// when used for setting an element's innerHTML.
htmlEmitImpliedRootLevelParagraph(0);
html_parser_context_ = htmlCreatePushParserCtxt(
&html_sax_handler, this, data, static_cast<int>(size),
NULL /*filename*/, XML_CHAR_ENCODING_UTF8);
if (!html_parser_context_) {
static const char* kErrorUnableCreateParser =
"Unable to create the libxml2 parser.";
OnParsingIssue(kFatal, kErrorUnableCreateParser);
} else {
if (IsFullDocument()) {
document()->IncreaseLoadingCounter();
}
}
} else {
DCHECK(html_parser_context_);
htmlParseChunk(html_parser_context_, data, static_cast<int>(size),
0 /*do not terminate*/);
}
}
void LibxmlHTMLParserWrapper::Finish() {
if (!html_parser_context_) {
static const char empty_document[] = "<html><body></body></html>";
DecodeChunk(empty_document, arraysize(empty_document));
}
if (html_parser_context_) {
// TODO: The check on issue level is a workaround for the fact that libxml
// doesn't recover fully from error related to encoding.
if (issue_level() <= kWarning) {
htmlParseChunk(html_parser_context_, NULL, 0,
1 /*terminate*/); // Triggers EndDocument
}
if (IsFullDocument()) {
document()->DecreaseLoadingCounterAndMaybeDispatchLoadEvent();
}
}
}
base::SourceLocation LibxmlHTMLParserWrapper::GetSourceLocation() {
base::SourceLocation source_location(first_chunk_location().file_path,
html_parser_context_->input->line,
html_parser_context_->input->col);
base::AdjustForStartLocation(
first_chunk_location().line_number, first_chunk_location().column_number,
&source_location.line_number, &source_location.column_number);
return source_location;
}
} // namespace dom_parser
} // namespace cobalt