| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ******************************************************************************* |
| * |
| * Copyright (C) 2004-2010, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ******************************************************************************* |
| * file name: xmlparser.cpp |
| * encoding: UTF-8 |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * created on: 2004jul21 |
| * created by: Andy Heninger |
| */ |
| |
| #include <stdio.h> |
| #include "unicode/uchar.h" |
| #include "unicode/ucnv.h" |
| #include "unicode/regex.h" |
| #include "filestrm.h" |
| #include "xmlparser.h" |
| |
| #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION |
| |
// Character constants: code point values of the few characters the
// parser needs to name explicitly.
enum {
    x_QUOT = 0x22,   // "  quotation mark
    x_AMP  = 0x26,   // &  ampersand
    x_APOS = 0x27,   // '  apostrophe
    x_LT   = 0x3c,   // <  less-than sign
    x_GT   = 0x3e,   // >  greater-than sign
    x_l    = 0x6c    // l  lower case letter l (last letter of "<?xml")
};
| |
// Regular expression fragments shared by the parser's patterns.
// Each macro expands to adjacent string literals, which the compiler
// concatenates into one pattern string.

// Character class for XML white space: space, tab, CR and LF.
#define XML_SPACES "[ \\u0009\\u000d\\u000a]"

// XML #4
// Character class for the characters that may start a name.
#define XML_NAMESTARTCHAR \
    "[[A-Z]:_[a-z]" \
    "[\\u00c0-\\u00d6]" \
    "[\\u00d8-\\u00f6]" \
    "[\\u00f8-\\u02ff]" \
    "[\\u0370-\\u037d]" \
    "[\\u037F-\\u1FFF]" \
    "[\\u200C-\\u200D]" \
    "[\\u2070-\\u218F]" \
    "[\\u2C00-\\u2FEF]" \
    "[\\u3001-\\uD7FF]" \
    "[\\uF900-\\uFDCF]" \
    "[\\uFDF0-\\uFFFD]" \
    "[\\U00010000-\\U000EFFFF]]"

// XML #5
// Character class for the characters that may continue a name:
// every name start character plus '-', '.', digits and a few more ranges.
#define XML_NAMECHAR \
    "[" XML_NAMESTARTCHAR \
    "\\-.[0-9]\\u00b7" \
    "[\\u0300-\\u036f]" \
    "[\\u203f-\\u2040]]"

// XML #6
// A complete name: one start character followed by any number of name characters.
#define XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
| |
| U_NAMESPACE_BEGIN |
| |
| UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser) |
| UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement) |
| |
| // |
| // UXMLParser constructor. Mostly just initializes the ICU regexes that are |
| // used for parsing. |
| // |
| UXMLParser::UXMLParser(UErrorCode &status) : |
| // XML Declaration. XML Production #23. |
| // example: "<?xml version=1.0 encoding="utf-16" ?> |
| // This is a sloppy implementation - just look for the leading <?xml and the closing ?> |
| // allow for a possible leading BOM. |
| mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status), |
| |
| // XML Comment production #15 |
| // example: "<!-- whatever --> |
| // note, does not detect an illegal "--" within comments |
| mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status), |
| |
| // XML Spaces |
| // production [3] |
| mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status), |
| |
| // XML Doctype decl production #28 |
| // example "<!DOCTYPE foo SYSTEM "somewhere" > |
| // or "<!DOCTYPE foo [internal dtd]> |
| // TODO: we don't actually parse the DOCTYPE or internal subsets. |
| // Some internal dtd subsets could confuse this simple-minded |
| // attempt at skipping over them, specifically, occcurences |
| // of closeing square brackets. These could appear in comments, |
| // or in parameter entity declarations, for example. |
| mXMLDoctype(UnicodeString( |
| "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV |
| ), 0, status), |
| |
| // XML PI production #16 |
| // example "<?target stuff?> |
| mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status), |
| |
| // XML Element Start Productions #40, #41 |
| // example <foo att1='abc' att2="d e f" > |
| // capture #1: the tag name |
| // |
| mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" |
| "(?:" |
| XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " |
| "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' |
| ")*" // * for zero or more attributes. |
| XML_SPACES "*?>", -1, US_INV), 0, status), // match " >" |
| |
| // XML Element End production #42 |
| // example </foo> |
| mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status), |
| |
| // XML Element Empty production #44 |
| // example <foo att1="abc" att2="d e f" /> |
| mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" |
| "(?:" |
| XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " |
| "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' |
| ")*" // * for zero or more attributes. |
| XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />" |
| |
| |
| // XMLCharData. Everything but '<'. Note that & will be dealt with later. |
| mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status), |
| |
| // Attribute name = "value". XML Productions 10, 40/41 |
| // Capture group 1 is name, |
| // 2 is the attribute value, including the quotes. |
| // |
| // Note that attributes are scanned twice. The first time is with |
| // the regex for an entire element start. There, the attributes |
| // are checked syntactically, but not separted out one by one. |
| // Here, we match a single attribute, and make its name and |
| // attribute value available to the parser code. |
| mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*" |
| "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status), |
| |
| |
| mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status), |
| |
| // Match any of the new-line sequences in content. |
| // All are changed to \u000a. |
| mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status), |
| |
| // & char references |
| // We will figure out what we've got based on which capture group has content. |
| // The last one is a catchall for unrecognized entity references.. |
| // 1 2 3 4 5 6 7 8 |
| mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"), |
| 0, status), |
| |
| fNames(status), |
| fElementStack(status), |
| fOneLF((UChar)0x0a) // Plain new-line string, used in new line normalization. |
| { |
| } |
| |
| UXMLParser * |
| UXMLParser::createParser(UErrorCode &errorCode) { |
| if (U_FAILURE(errorCode)) { |
| return NULL; |
| } else { |
| return new UXMLParser(errorCode); |
| } |
| } |
| |
| UXMLParser::~UXMLParser() {} |
| |
| UXMLElement * |
| UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) { |
| char bytes[4096], charsetBuffer[100]; |
| FileStream *f; |
| const char *charset, *pb; |
| UnicodeString src; |
| UConverter *cnv; |
| UChar *buffer, *pu; |
| int32_t fileLength, bytesLength, length, capacity; |
| UBool flush; |
| |
| if(U_FAILURE(errorCode)) { |
| return NULL; |
| } |
| |
| f=T_FileStream_open(filename, "rb"); |
| if(f==NULL) { |
| errorCode=U_FILE_ACCESS_ERROR; |
| return NULL; |
| } |
| |
| bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); |
| if(bytesLength<(int32_t)sizeof(bytes)) { |
| // we have already read the entire file |
| fileLength=bytesLength; |
| } else { |
| // get the file length |
| fileLength=T_FileStream_size(f); |
| } |
| |
| /* |
| * get the charset: |
| * 1. Unicode signature |
| * 2. treat as ISO-8859-1 and read XML encoding="charser" |
| * 3. default to UTF-8 |
| */ |
| charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode); |
| if(U_SUCCESS(errorCode) && charset!=NULL) { |
| // open converter according to Unicode signature |
| cnv=ucnv_open(charset, &errorCode); |
| } else { |
| // read as Latin-1 and parse the XML declaration and encoding |
| cnv=ucnv_open("ISO-8859-1", &errorCode); |
| if(U_FAILURE(errorCode)) { |
| // unexpected error opening Latin-1 converter |
| goto exit; |
| } |
| |
| buffer=toUCharPtr(src.getBuffer(bytesLength)); |
| if(buffer==NULL) { |
| // unexpected failure to reserve some string capacity |
| errorCode=U_MEMORY_ALLOCATION_ERROR; |
| goto exit; |
| } |
| pb=bytes; |
| pu=buffer; |
| ucnv_toUnicode( |
| cnv, |
| &pu, buffer+src.getCapacity(), |
| &pb, bytes+bytesLength, |
| NULL, TRUE, &errorCode); |
| src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); |
| ucnv_close(cnv); |
| cnv=NULL; |
| if(U_FAILURE(errorCode)) { |
| // unexpected error in conversion from Latin-1 |
| src.remove(); |
| goto exit; |
| } |
| |
| // parse XML declaration |
| if(mXMLDecl.reset(src).lookingAt(0, errorCode)) { |
| int32_t declEnd=mXMLDecl.end(errorCode); |
| // go beyond <?xml |
| int32_t pos=src.indexOf((UChar)x_l)+1; |
| |
| mAttrValue.reset(src); |
| while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) { // loop runs once per attribute on this element. |
| UnicodeString attName = mAttrValue.group(1, errorCode); |
| UnicodeString attValue = mAttrValue.group(2, errorCode); |
| |
| // Trim the quotes from the att value. These are left over from the original regex |
| // that parsed the attribue, which couldn't conveniently strip them. |
| attValue.remove(0,1); // one char from the beginning |
| attValue.truncate(attValue.length()-1); // and one from the end. |
| |
| if(attName==UNICODE_STRING("encoding", 8)) { |
| length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer)); |
| charset=charsetBuffer; |
| break; |
| } |
| pos = mAttrValue.end(2, errorCode); |
| } |
| |
| if(charset==NULL) { |
| // default to UTF-8 |
| charset="UTF-8"; |
| } |
| cnv=ucnv_open(charset, &errorCode); |
| } |
| } |
| |
| if(U_FAILURE(errorCode)) { |
| // unable to open the converter |
| goto exit; |
| } |
| |
| // convert the file contents |
| capacity=fileLength; // estimated capacity |
| src.getBuffer(capacity); |
| src.releaseBuffer(0); // zero length |
| flush=FALSE; |
| for(;;) { |
| // convert contents of bytes[bytesLength] |
| pb=bytes; |
| for(;;) { |
| length=src.length(); |
| buffer=toUCharPtr(src.getBuffer(capacity)); |
| if(buffer==NULL) { |
| // unexpected failure to reserve some string capacity |
| errorCode=U_MEMORY_ALLOCATION_ERROR; |
| goto exit; |
| } |
| |
| pu=buffer+length; |
| ucnv_toUnicode( |
| cnv, &pu, buffer+src.getCapacity(), |
| &pb, bytes+bytesLength, |
| NULL, FALSE, &errorCode); |
| src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); |
| if(errorCode==U_BUFFER_OVERFLOW_ERROR) { |
| errorCode=U_ZERO_ERROR; |
| capacity=(3*src.getCapacity())/2; // increase capacity by 50% |
| } else { |
| break; |
| } |
| } |
| |
| if(U_FAILURE(errorCode)) { |
| break; // conversion error |
| } |
| |
| if(flush) { |
| break; // completely converted the file |
| } |
| |
| // read next block |
| bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); |
| if(bytesLength==0) { |
| // reached end of file, convert once more to flush the converter |
| flush=TRUE; |
| } |
| } |
| |
| exit: |
| ucnv_close(cnv); |
| T_FileStream_close(f); |
| |
| if(U_SUCCESS(errorCode)) { |
| return parse(src, errorCode); |
| } else { |
| return NULL; |
| } |
| } |
| |
| UXMLElement * |
| UXMLParser::parse(const UnicodeString &src, UErrorCode &status) { |
| if(U_FAILURE(status)) { |
| return NULL; |
| } |
| |
| UXMLElement *root = NULL; |
| fPos = 0; // TODO use just a local pos variable and pass it into functions |
| // where necessary? |
| |
| // set all matchers to work on the input string |
| mXMLDecl.reset(src); |
| mXMLComment.reset(src); |
| mXMLSP.reset(src); |
| mXMLDoctype.reset(src); |
| mXMLPI.reset(src); |
| mXMLElemStart.reset(src); |
| mXMLElemEnd.reset(src); |
| mXMLElemEmpty.reset(src); |
| mXMLCharData.reset(src); |
| mAttrValue.reset(src); |
| mAttrNormalizer.reset(src); |
| mNewLineNormalizer.reset(src); |
| mAmps.reset(src); |
| |
| // Consume the XML Declaration, if present. |
| if (mXMLDecl.lookingAt(fPos, status)) { |
| fPos = mXMLDecl.end(status); |
| } |
| |
| // Consume "misc" [XML production 27] appearing before DocType |
| parseMisc(status); |
| |
| // Consume a DocType declaration, if present. |
| if (mXMLDoctype.lookingAt(fPos, status)) { |
| fPos = mXMLDoctype.end(status); |
| } |
| |
| // Consume additional "misc" [XML production 27] appearing after the DocType |
| parseMisc(status); |
| |
| // Get the root element |
| if (mXMLElemEmpty.lookingAt(fPos, status)) { |
| // Root is an empty element (no nested elements or content) |
| root = createElement(mXMLElemEmpty, status); |
| fPos = mXMLElemEmpty.end(status); |
| } else { |
| if (mXMLElemStart.lookingAt(fPos, status) == FALSE) { |
| error("Root Element expected", status); |
| goto errorExit; |
| } |
| root = createElement(mXMLElemStart, status); |
| UXMLElement *el = root; |
| |
| // |
| // This is the loop that consumes the root element of the document, |
| // including all nested content. Nested elements are handled by |
| // explicit pushes/pops of the element stack; there is no recursion |
| // in the control flow of this code. |
| // "el" always refers to the current element, the one to which content |
| // is being added. It is above the top of the element stack. |
| for (;;) { |
| // Nested Element Start |
| if (mXMLElemStart.lookingAt(fPos, status)) { |
| UXMLElement *t = createElement(mXMLElemStart, status); |
| el->fChildren.addElement(t, status); |
| t->fParent = el; |
| fElementStack.push(el, status); |
| el = t; |
| continue; |
| } |
| |
| // Text Content. String is concatenated onto the current node's content, |
| // but only if it contains something other than spaces. |
| UnicodeString s = scanContent(status); |
| if (s.length() > 0) { |
| mXMLSP.reset(s); |
| if (mXMLSP.matches(status) == FALSE) { |
| // This chunk of text contains something other than just |
| // white space. Make a child node for it. |
| replaceCharRefs(s, status); |
| el->fChildren.addElement(s.clone(), status); |
| } |
| mXMLSP.reset(src); // The matchers need to stay set to the main input string. |
| continue; |
| } |
| |
| // Comments. Discard. |
| if (mXMLComment.lookingAt(fPos, status)) { |
| fPos = mXMLComment.end(status); |
| continue; |
| } |
| |
| // PIs. Discard. |
| if (mXMLPI.lookingAt(fPos, status)) { |
| fPos = mXMLPI.end(status); |
| continue; |
| } |
| |
| // Element End |
| if (mXMLElemEnd.lookingAt(fPos, status)) { |
| fPos = mXMLElemEnd.end(0, status); |
| const UnicodeString name = mXMLElemEnd.group(1, status); |
| if (name != *el->fName) { |
| error("Element start / end tag mismatch", status); |
| goto errorExit; |
| } |
| if (fElementStack.empty()) { |
| // Close of the root element. We're done with the doc. |
| el = NULL; |
| break; |
| } |
| el = (UXMLElement *)fElementStack.pop(); |
| continue; |
| } |
| |
| // Empty Element. Stored as a child of the current element, but not stacked. |
| if (mXMLElemEmpty.lookingAt(fPos, status)) { |
| UXMLElement *t = createElement(mXMLElemEmpty, status); |
| el->fChildren.addElement(t, status); |
| continue; |
| } |
| |
| // Hit something within the document that doesn't match anything. |
| // It's an error. |
| error("Unrecognized markup", status); |
| break; |
| } |
| |
| if (el != NULL || !fElementStack.empty()) { |
| // We bailed out early, for some reason. |
| error("Root element not closed.", status); |
| goto errorExit; |
| } |
| } |
| |
| // Root Element parse is complete. |
| // Consume the annoying xml "Misc" that can appear at the end of the doc. |
| parseMisc(status); |
| |
| // We should have reached the end of the input |
| if (fPos != src.length()) { |
| error("Extra content at the end of the document", status); |
| goto errorExit; |
| } |
| |
| // Success! |
| return root; |
| |
| errorExit: |
| delete root; |
| return NULL; |
| } |
| |
| // |
| // createElement |
| // We've just matched an element start tag. Create and fill in a UXMLElement object |
| // for it. |
| // |
| UXMLElement * |
| UXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) { |
| // First capture group is the element's name. |
| UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status); |
| |
| // Scan for attributes. |
| int32_t pos = mEl.end(1, status); // The position after the end of the tag name |
| |
| while (mAttrValue.lookingAt(pos, status)) { // loop runs once per attribute on this element. |
| UnicodeString attName = mAttrValue.group(1, status); |
| UnicodeString attValue = mAttrValue.group(2, status); |
| |
| // Trim the quotes from the att value. These are left over from the original regex |
| // that parsed the attribue, which couldn't conveniently strip them. |
| attValue.remove(0,1); // one char from the beginning |
| attValue.truncate(attValue.length()-1); // and one from the end. |
| |
| // XML Attribue value normalization. |
| // This is one of the really screwy parts of the XML spec. |
| // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize |
| // Note that non-validating parsers must treat all entities as type CDATA |
| // which simplifies things some. |
| |
| // Att normalization step 1: normalize any newlines in the attribute value |
| mNewLineNormalizer.reset(attValue); |
| attValue = mNewLineNormalizer.replaceAll(fOneLF, status); |
| |
| // Next change all xml white space chars to plain \u0020 spaces. |
| mAttrNormalizer.reset(attValue); |
| UnicodeString oneSpace((UChar)0x0020); |
| attValue = mAttrNormalizer.replaceAll(oneSpace, status); |
| |
| // Replace character entities. |
| replaceCharRefs(attValue, status); |
| |
| // Save the attribute name and value in our document structure. |
| el->fAttNames.addElement((void *)intern(attName, status), status); |
| el->fAttValues.addElement(attValue.clone(), status); |
| pos = mAttrValue.end(2, status); |
| } |
| fPos = mEl.end(0, status); |
| return el; |
| } |
| |
| // |
| // parseMisc |
| // Consume XML "Misc" [production #27] |
| // which is any combination of space, PI and comments |
| // Need to watch end-of-input because xml MISC stuff is allowed after |
| // the document element, so we WILL scan off the end in this function |
| // |
| void |
| UXMLParser::parseMisc(UErrorCode &status) { |
| for (;;) { |
| if (fPos >= mXMLPI.input().length()) { |
| break; |
| } |
| if (mXMLPI.lookingAt(fPos, status)) { |
| fPos = mXMLPI.end(status); |
| continue; |
| } |
| if (mXMLSP.lookingAt(fPos, status)) { |
| fPos = mXMLSP.end(status); |
| continue; |
| } |
| if (mXMLComment.lookingAt(fPos, status)) { |
| fPos = mXMLComment.end(status); |
| continue; |
| } |
| break; |
| } |
| } |
| |
| // |
| // Scan for document content. |
| // |
| UnicodeString |
| UXMLParser::scanContent(UErrorCode &status) { |
| UnicodeString result; |
| if (mXMLCharData.lookingAt(fPos, status)) { |
| result = mXMLCharData.group((int32_t)0, status); |
| // Normalize the new-lines. (Before char ref substitution) |
| mNewLineNormalizer.reset(result); |
| result = mNewLineNormalizer.replaceAll(fOneLF, status); |
| |
| // TODO: handle CDATA |
| fPos = mXMLCharData.end(0, status); |
| } |
| |
| return result; |
| } |
| |
| // |
| // replaceCharRefs |
| // |
| // replace the char entities < & { ካ etc. in a string |
| // with the corresponding actual character. |
| // |
| void |
| UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) { |
| UnicodeString result; |
| UnicodeString replacement; |
| int i; |
| |
| mAmps.reset(s); |
| // See the initialization for the regex matcher mAmps. |
| // Which entity we've matched is determined by which capture group has content, |
| // which is flaged by start() of that group not being -1. |
| while (mAmps.find()) { |
| if (mAmps.start(1, status) != -1) { |
| replacement.setTo((UChar)x_AMP); |
| } else if (mAmps.start(2, status) != -1) { |
| replacement.setTo((UChar)x_LT); |
| } else if (mAmps.start(3, status) != -1) { |
| replacement.setTo((UChar)x_GT); |
| } else if (mAmps.start(4, status) != -1) { |
| replacement.setTo((UChar)x_APOS); |
| } else if (mAmps.start(5, status) != -1) { |
| replacement.setTo((UChar)x_QUOT); |
| } else if (mAmps.start(6, status) != -1) { |
| UnicodeString hexString = mAmps.group(6, status); |
| UChar32 val = 0; |
| for (i=0; i<hexString.length(); i++) { |
| val = (val << 4) + u_digit(hexString.charAt(i), 16); |
| } |
| // TODO: some verification that the character is valid |
| replacement.setTo(val); |
| } else if (mAmps.start(7, status) != -1) { |
| UnicodeString decimalString = mAmps.group(7, status); |
| UChar32 val = 0; |
| for (i=0; i<decimalString.length(); i++) { |
| val = val*10 + u_digit(decimalString.charAt(i), 10); |
| } |
| // TODO: some verification that the character is valid |
| replacement.setTo(val); |
| } else { |
| // An unrecognized &entity; Leave it alone. |
| // TODO: check that it really looks like an entity, and is not some |
| // random & in the text. |
| replacement = mAmps.group((int32_t)0, status); |
| } |
| mAmps.appendReplacement(result, replacement, status); |
| } |
| mAmps.appendTail(result); |
| s = result; |
| } |
| |
| void |
| UXMLParser::error(const char *message, UErrorCode &status) { |
| // TODO: something better here... |
| const UnicodeString &src=mXMLDecl.input(); |
| int line = 0; |
| int ci = 0; |
| while (ci < fPos && ci>=0) { |
| ci = src.indexOf((UChar)0x0a, ci+1); |
| line++; |
| } |
| fprintf(stderr, "Error: %s at line %d\n", message, line); |
| if (U_SUCCESS(status)) { |
| status = U_PARSE_ERROR; |
| } |
| } |
| |
| // intern strings like in Java |
| |
| const UnicodeString * |
| UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) { |
| const UHashElement *he=fNames.find(s); |
| if(he!=NULL) { |
| // already a known name, return its hashed key pointer |
| return (const UnicodeString *)he->key.pointer; |
| } else { |
| // add this new name and return its hashed key pointer |
| fNames.puti(s, 0, errorCode); |
| he=fNames.find(s); |
| return (const UnicodeString *)he->key.pointer; |
| } |
| } |
| |
| const UnicodeString * |
| UXMLParser::findName(const UnicodeString &s) const { |
| const UHashElement *he=fNames.find(s); |
| if(he!=NULL) { |
| // a known name, return its hashed key pointer |
| return (const UnicodeString *)he->key.pointer; |
| } else { |
| // unknown name |
| return NULL; |
| } |
| } |
| |
| // UXMLElement ------------------------------------------------------------- *** |
| |
| UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) : |
| fParser(parser), |
| fName(name), |
| fAttNames(errorCode), |
| fAttValues(errorCode), |
| fChildren(errorCode), |
| fParent(NULL) |
| { |
| } |
| |
| UXMLElement::~UXMLElement() { |
| int i; |
| // attribute names are owned by the UXMLParser, don't delete them here |
| for (i=fAttValues.size()-1; i>=0; i--) { |
| delete (UObject *)fAttValues.elementAt(i); |
| } |
| for (i=fChildren.size()-1; i>=0; i--) { |
| delete (UObject *)fChildren.elementAt(i); |
| } |
| } |
| |
| const UnicodeString & |
| UXMLElement::getTagName() const { |
| return *fName; |
| } |
| |
| UnicodeString |
| UXMLElement::getText(UBool recurse) const { |
| UnicodeString text; |
| appendText(text, recurse); |
| return text; |
| } |
| |
| void |
| UXMLElement::appendText(UnicodeString &text, UBool recurse) const { |
| const UObject *node; |
| int32_t i, count=fChildren.size(); |
| for(i=0; i<count; ++i) { |
| node=(const UObject *)fChildren.elementAt(i); |
| const UnicodeString *s=dynamic_cast<const UnicodeString *>(node); |
| if(s!=NULL) { |
| text.append(*s); |
| } else if(recurse) /* must be a UXMLElement */ { |
| ((const UXMLElement *)node)->appendText(text, recurse); |
| } |
| } |
| } |
| |
| int32_t |
| UXMLElement::countAttributes() const { |
| return fAttNames.size(); |
| } |
| |
| const UnicodeString * |
| UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const { |
| if(0<=i && i<fAttNames.size()) { |
| name.setTo(*(const UnicodeString *)fAttNames.elementAt(i)); |
| value.setTo(*(const UnicodeString *)fAttValues.elementAt(i)); |
| return &value; // or return (UnicodeString *)fAttValues.elementAt(i); |
| } else { |
| return NULL; |
| } |
| } |
| |
| const UnicodeString * |
| UXMLElement::getAttribute(const UnicodeString &name) const { |
| // search for the attribute name by comparing the interned pointer, |
| // not the string contents |
| const UnicodeString *p=fParser->findName(name); |
| if(p==NULL) { |
| return NULL; // no such attribute seen by the parser at all |
| } |
| |
| int32_t i, count=fAttNames.size(); |
| for(i=0; i<count; ++i) { |
| if(p==(const UnicodeString *)fAttNames.elementAt(i)) { |
| return (const UnicodeString *)fAttValues.elementAt(i); |
| } |
| } |
| return NULL; |
| } |
| |
| int32_t |
| UXMLElement::countChildren() const { |
| return fChildren.size(); |
| } |
| |
| const UObject * |
| UXMLElement::getChild(int32_t i, UXMLNodeType &type) const { |
| if(0<=i && i<fChildren.size()) { |
| const UObject *node=(const UObject *)fChildren.elementAt(i); |
| if(dynamic_cast<const UXMLElement *>(node)!=NULL) { |
| type=UXML_NODE_TYPE_ELEMENT; |
| } else { |
| type=UXML_NODE_TYPE_STRING; |
| } |
| return node; |
| } else { |
| return NULL; |
| } |
| } |
| |
| const UXMLElement * |
| UXMLElement::nextChildElement(int32_t &i) const { |
| if(i<0) { |
| return NULL; |
| } |
| |
| const UObject *node; |
| int32_t count=fChildren.size(); |
| while(i<count) { |
| node=(const UObject *)fChildren.elementAt(i++); |
| const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); |
| if(elem!=NULL) { |
| return elem; |
| } |
| } |
| return NULL; |
| } |
| |
| const UXMLElement * |
| UXMLElement::getChildElement(const UnicodeString &name) const { |
| // search for the element name by comparing the interned pointer, |
| // not the string contents |
| const UnicodeString *p=fParser->findName(name); |
| if(p==NULL) { |
| return NULL; // no such element seen by the parser at all |
| } |
| |
| const UObject *node; |
| int32_t i, count=fChildren.size(); |
| for(i=0; i<count; ++i) { |
| node=(const UObject *)fChildren.elementAt(i); |
| const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); |
| if(elem!=NULL) { |
| if(p==elem->fName) { |
| return elem; |
| } |
| } |
| } |
| return NULL; |
| } |
| |
| U_NAMESPACE_END |
| |
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */
| |