| /* |
| ******************************************************************************* |
| * |
| * Copyright (C) 1998-2012, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ******************************************************************************* |
| * |
| * File read.c |
| * |
| * Modification History: |
| * |
| * Date Name Description |
| * 05/26/99 stephen Creation. |
| * 5/10/01 Ram removed ustdio dependency |
| ******************************************************************************* |
| */ |
| |
| #include "read.h" |
| #include "errmsg.h" |
| #include "unicode/ustring.h" |
| #include "unicode/utf16.h" |
| |
| #define OPENBRACE 0x007B /* '{' */ |
| #define CLOSEBRACE 0x007D /* '}' */ |
| #define COMMA 0x002C /* ',' */ |
| #define QUOTE 0x0022 /* '"' */ |
| #define ESCAPE 0x005C /* '\' */ |
| #define SLASH 0x002F /* '/' */ |
| #define ASTERISK 0x002A /* '*' */ |
| #define SPACE 0x0020 /* ' ' */ |
| #define COLON 0x003A /* ':' */ |
| #define BADBOM 0xFFFE /* U+FFFE; getNextToken reports TOK_ERROR when it reads this value */ |
| #define CR 0x000D /* carriage return */ |
| #define LF 0x000A /* line feed */ |
| |
| static int32_t lineCount; /* current line number: set to 1 by resetLineNumber, incremented by isWhitespace, isNewline and getStringToken */ |
| |
| /* Prototypes of the file-local helpers defined below */ |
| static enum ETokenType getStringToken(UCHARBUF *buf, |
| UChar32 initialChar, |
| struct UString *token, |
| UErrorCode *status); |
| |
| static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status); |
| static void seekUntilNewline (UCHARBUF *buf, struct UString *token, UErrorCode *status); |
| static void seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status); |
| static UBool isWhitespace (UChar32 c); |
| static UBool isNewline (UChar32 c); |
| |
| /* Set the file-level line counter (lineCount) back to 1. */ |
| U_CFUNC void resetLineNumber() |
| { |
|     lineCount = 1; |
| } |
| |
| /* Read and return the next token from buf. |
| |
| Leading whitespace and comments are skipped by getNextChar; the |
| comment parameter is handed to getNextChar as its token argument. |
| The first significant character decides the result: |
| BADBOM -> TOK_ERROR |
| OPENBRACE -> TOK_OPEN_BRACE |
| CLOSEBRACE -> TOK_CLOSE_BRACE |
| COMMA -> TOK_COMMA |
| COLON -> TOK_COLON |
| U_EOF -> TOK_EOF |
| anything else is the start of a string, which getStringToken reads |
| into the token parameter. |
| |
| TOK_ERROR is also returned when *status indicates failure on entry or |
| after the first character has been read. |
| |
| *linenumber receives lineCount once the first character has been |
| read, and is refreshed after a string token has been consumed. */ |
| U_CFUNC enum ETokenType |
| getNextToken(UCHARBUF* buf, |
| struct UString *token, |
| uint32_t *linenumber, /* out: linenumber of token */ |
| struct UString *comment, |
| UErrorCode *status) { |
|     UChar32 first; |
|     enum ETokenType kind; |
| |
|     if (U_FAILURE(*status)) { |
|         return TOK_ERROR; |
|     } |
| |
|     /* Fetch the first character that is neither whitespace nor comment */ |
|     first = getNextChar(buf, TRUE, comment, status); |
|     if (U_FAILURE(*status)) { |
|         return TOK_ERROR; |
|     } |
| |
|     *linenumber = lineCount; |
| |
|     /* Single-character tokens and end of input */ |
|     if (first == BADBOM) { |
|         return TOK_ERROR; |
|     } |
|     if (first == OPENBRACE) { |
|         return TOK_OPEN_BRACE; |
|     } |
|     if (first == CLOSEBRACE) { |
|         return TOK_CLOSE_BRACE; |
|     } |
|     if (first == COMMA) { |
|         return TOK_COMMA; |
|     } |
|     if (first == U_EOF) { |
|         return TOK_EOF; |
|     } |
|     if (first == COLON) { |
|         return TOK_COLON; |
|     } |
| |
|     /* Everything else starts a string token */ |
|     kind = getStringToken(buf, first, token, status); |
| |
|     /* The string may have advanced lineCount; report the updated value */ |
|     *linenumber = lineCount; |
|     return kind; |
| } |
| |
| /* Copy a string token into the given UString. Upon entry, we |
| have already read the first character of the string token, which is |
| not a whitespace character (but may be a QUOTE or ESCAPE). This |
| function reads all subsequent characters that belong with this |
| string, and appends them to the token parameter. The other |
| important, and slightly convoluted, purpose of this function is to |
| merge adjacent strings. It looks forward a bit, and if the next |
| non-comment, non-whitespace item is a string, it reads it in as |
| well. If two adjacent strings are quoted, they are merged without |
| intervening space. Otherwise a single SPACE character is |
| inserted. |
| |
| Parameters: |
| buf - input buffer, positioned just after initialChar |
| initialChar - first character of the token, already consumed by the caller |
| token - receives the string; its length is reset to 0 before reading |
| status - in/out error code; nothing is read if it already indicates failure |
| |
| Returns TOK_STRING when a string was collected (this is also returned |
| when getNextChar leaves a failure in *status while an unquoted string |
| or the look-ahead is being read), TOK_EOF when the input ends inside a |
| quoted string, and TOK_ERROR otherwise (failed status, failed |
| unescape, or strict mode rejecting a quoted string followed by an |
| unquoted one). */ |
| static enum ETokenType getStringToken(UCHARBUF* buf, |
| UChar32 initialChar, |
| struct UString *token, |
| UErrorCode *status) { |
| UBool lastStringWasQuoted; /* TRUE while the most recently read piece was a quoted string */ |
| UChar32 c; |
| UChar target[3] = { '\0' }; /* scratch buffer written through pTarget by U_APPEND_CHAR32, then appended to token */ |
| UChar *pTarget = target; |
| int len=0; /* number of UChars to append from target; set by U_APPEND_CHAR32 (presumably 1 or 2 - confirm against the macro) */ |
| UBool isFollowingCharEscaped=FALSE; /* TRUE when unescape() just returned an ESCAPE, so the next character is taken literally */ |
| UBool isNLUnescaped = FALSE; /* TRUE when the current CR/LF came out of unescape() rather than being read directly */ |
| UChar32 prevC=0; /* previous character processed inside a quoted string */ |
| |
| /* We are guaranteed on entry that initialChar is not a whitespace |
| character. If we are at the EOF, or have some other problem, it |
| doesn't matter; we still want to validly return the initialChar |
| (if nothing else) as a string token. */ |
| |
| if (U_FAILURE(*status)) { |
| return TOK_ERROR; |
| } |
| |
| /* setup: start with an empty token and treat initialChar as the current character */ |
| lastStringWasQuoted = FALSE; |
| c = initialChar; |
| ustr_setlen(token, 0, status); |
| |
| if (U_FAILURE(*status)) { |
| return TOK_ERROR; |
| } |
| |
| for (;;) { /* one iteration per adjacent string piece (quoted or unquoted) */ |
| if (c == QUOTE) { |
| if (!lastStringWasQuoted && token->fLength > 0) { /* previous piece was unquoted: separate with one SPACE */ |
| ustr_ucat(token, SPACE, status); |
| |
| if (U_FAILURE(*status)) { |
| return TOK_ERROR; |
| } |
| } |
| |
| lastStringWasQuoted = TRUE; |
| |
| for (;;) { /* read the quoted string up to its closing QUOTE */ |
| c = ucbuf_getc(buf,status); |
| |
| /* EOF reached before the closing QUOTE */ |
| if (c == U_EOF) { |
| return TOK_EOF; |
| } |
| |
| /* Unterminated quoted strings */ |
| if (U_FAILURE(*status)) { |
| return TOK_ERROR; |
| } |
| |
| if (c == QUOTE && !isFollowingCharEscaped) { /* closing quote */ |
| break; |
| } |
| |
| if (c == ESCAPE && !isFollowingCharEscaped) { /* escape sequence: replace c with the value returned by unescape() */ |
| pTarget = target; |
| c = unescape(buf, status); |
| |
| if (c == U_ERR) { |
| return TOK_ERROR; |
| } |
| if(c == CR || c == LF){ /* remember that this line break came from an escape sequence */ |
| isNLUnescaped = TRUE; |
| } |
| } |
| |
| if(c==ESCAPE && !isFollowingCharEscaped){ /* c is still ESCAPE after the branch above, i.e. unescape() returned it: append nothing now, take the next character literally */ |
| isFollowingCharEscaped = TRUE; |
| }else{ |
| U_APPEND_CHAR32(c, pTarget,len); |
| pTarget = target; |
| ustr_uscat(token, pTarget,len, status); |
| isFollowingCharEscaped = FALSE; |
| len=0; |
| if(c == CR || c == LF){ |
| if(isNLUnescaped == FALSE && prevC!=CR){ /* count the line only for a literal CR/LF whose predecessor was not CR, so CR LF counts once */ |
| lineCount++; |
| } |
| isNLUnescaped = FALSE; |
| } |
| } |
| |
| if (U_FAILURE(*status)) { |
| return TOK_ERROR; |
| } |
| prevC = c; |
| } |
| } else { |
| if (token->fLength > 0) { /* an unquoted piece is always separated from earlier text by one SPACE */ |
| ustr_ucat(token, SPACE, status); |
| |
| if (U_FAILURE(*status)) { |
| return TOK_ERROR; |
| } |
| } |
| |
| if(lastStringWasQuoted){ |
| if(getShowWarning()){ |
| warning(lineCount, "Mixing quoted and unquoted strings"); |
| } |
| if(isStrict()){ |
| return TOK_ERROR; |
| } |
| |
| } |
| |
| lastStringWasQuoted = FALSE; |
| |
| /* The lastStringWasQuoted check above fires when we are mixing |
| * quoted and unquoted strings: |
| * warn when getShowWarning() is set and return an error when |
| * isStrict() is set |
| */ |
| |
| if (c == ESCAPE) { |
| pTarget = target; |
| c = unescape(buf, status); |
| |
| /* EOF reached. NOTE(review): the other two unescape() call sites in this function test for U_ERR instead - confirm which value is intended here */ |
| if (c == U_EOF) { |
| return TOK_ERROR; |
| } |
| } |
| |
| U_APPEND_CHAR32(c, pTarget,len); |
| pTarget = target; |
| ustr_uscat(token, pTarget,len, status); |
| len=0; |
| |
| if (U_FAILURE(*status)) { |
| return TOK_ERROR; |
| } |
| |
| for (;;) { /* read the rest of the unquoted string */ |
| /* DON'T skip whitespace: it terminates an unquoted string */ |
| c = getNextChar(buf, FALSE, NULL, status); |
| |
| /* EOF reached: the string collected so far is the token */ |
| if (c == U_EOF) { |
| ucbuf_ungetc(c, buf); |
| return TOK_STRING; |
| } |
| |
| if (U_FAILURE(*status)) { |
| return TOK_STRING; |
| } |
| |
| if (c == QUOTE |
| || c == OPENBRACE |
| || c == CLOSEBRACE |
| || c == COMMA |
| || c == COLON) { /* delimiter: push it back for the look-ahead below or for the caller */ |
| ucbuf_ungetc(c, buf); |
| break; |
| } |
| |
| if (isWhitespace(c)) { /* isWhitespace also increments lineCount for LF and U+2029 */ |
| break; |
| } |
| |
| if (c == ESCAPE) { |
| pTarget = target; |
| c = unescape(buf, status); |
| |
| if (c == U_ERR) { |
| return TOK_ERROR; |
| } |
| } |
| |
| U_APPEND_CHAR32(c, pTarget,len); |
| pTarget = target; |
| ustr_uscat(token, pTarget,len, status); |
| len=0; |
| if (U_FAILURE(*status)) { |
| return TOK_ERROR; |
| } |
| } |
| } |
| |
| /* DO skip whitespace: look ahead to see whether another string piece follows. |
| NOTE(review): c is not compared with U_EOF here; if getNextChar returns |
| U_EOF the outer loop runs again with c == U_EOF - confirm this is intended */ |
| c = getNextChar(buf, TRUE, NULL, status); |
| |
| if (U_FAILURE(*status)) { |
| return TOK_STRING; |
| } |
| |
| if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) { /* structural character ends the string; leave it for the next token */ |
| ucbuf_ungetc(c, buf); |
| return TOK_STRING; |
| } |
| } |
| } |
| |
| /* Retrieve the next character that is not part of a comment. If |
| skipwhite is true, whitespace is skipped as well. |
| |
| Comment handling: |
| "//" skips to the end of the line (seekUntilNewline). |
| "/ *" skips to the closing "* /" (seekUntilEndOfComment). |
| "/ * *" does the same, but the comment text is appended to token |
| (when token is non-NULL). |
| "/ * * /" is an empty comment: nothing is appended and scanning |
| continues after it. |
| A SLASH followed by any other character is returned as SLASH and the |
| following character is pushed back. |
| |
| Returns U_EOF when *status indicates failure on entry or the input is |
| exhausted (including a lone SLASH at the very end of the input). */ |
| static UChar32 getNextChar(UCHARBUF* buf, |
| UBool skipwhite, |
| struct UString *token, |
| UErrorCode *status) { |
|     UChar32 c, c2, c3; |
| |
|     if (U_FAILURE(*status)) { |
|         return U_EOF; |
|     } |
| |
|     for (;;) { |
|         c = ucbuf_getc(buf,status); |
| |
|         if (c == U_EOF) { |
|             return U_EOF; |
|         } |
| |
|         /* isWhitespace is only consulted (and lineCount only advanced) when skipping */ |
|         if (skipwhite && isWhitespace(c)) { |
|             continue; |
|         } |
| |
|         /* This also handles the get() failing case */ |
|         if (c != SLASH) { |
|             return c; |
|         } |
| |
|         c = ucbuf_getc(buf,status); /* "/c" */ |
| |
|         if (c == U_EOF) { |
|             return U_EOF; |
|         } |
| |
|         switch (c) { |
|         case SLASH: /* "//" */ |
|             seekUntilNewline(buf, NULL, status); |
|             break; |
| |
|         case ASTERISK: /* " / * " */ |
|             c2 = ucbuf_getc(buf, status); /* "/ * c" */ |
|             if(c2 == ASTERISK){ /* "/ * *" */ |
|                 c3 = ucbuf_getc(buf, status); /* "/ * * c" */ |
|                 if(c3 == SLASH){ |
|                     /* "/ * * /" is a complete, empty comment. Without this check |
|                        the closing SLASH would be consumed as comment text and the |
|                        scan would run on to the next "* /" or to the end of input. */ |
|                     break; |
|                 } |
|                 ucbuf_ungetc(c3, buf); /* c3 belongs to the comment body. Include c3 back in buffer. */ |
|                 /* parse multi-line comment and store it in token*/ |
|                 seekUntilEndOfComment(buf, token, status); |
|             } else { |
|                 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *". Include c2 back in buffer. */ |
|                 seekUntilEndOfComment(buf, NULL, status); |
|             } |
|             break; |
| |
|         default: |
|             ucbuf_ungetc(c, buf); /* "/c" - put back the c */ |
|             /* If get() failed this is a NOP */ |
|             return SLASH; |
|         } |
| |
|     } |
| } |
| |
| /* Consume characters up to and including the first one accepted by |
| isNewline. Reading also stops at U_EOF or as soon as *status is no |
| longer U_ZERO_ERROR. When token is non-NULL, every character read |
| (including the last one) is appended to it. Nothing is read if |
| *status indicates failure on entry. */ |
| static void seekUntilNewline(UCHARBUF* buf, |
| struct UString *token, |
| UErrorCode *status) { |
|     UChar32 ch; |
| |
|     if (U_FAILURE(*status)) { |
|         return; |
|     } |
| |
|     for (;;) { |
|         ch = ucbuf_getc(buf,status); |
| |
|         /* add the char to token */ |
|         if (token != NULL) { |
|             ustr_u32cat(token, ch, status); |
|         } |
| |
|         /* isNewline is evaluated first because it also advances lineCount */ |
|         if (isNewline(ch) || ch == U_EOF || *status != U_ZERO_ERROR) { |
|             break; |
|         } |
|     } |
| } |
| |
| /* Consume characters up to and including the ASTERISK SLASH pair that |
| closes a block comment. When token is non-NULL the characters read |
| before the closing pair are appended to it. isNewline is called on |
| every character so that lineCount keeps advancing inside the comment. |
| If U_EOF is read before the comment is closed, *status is set to |
| U_INVALID_FORMAT_ERROR and an error is reported against the line on |
| which scanning started. Nothing is read if *status indicates failure |
| on entry. */ |
| static void seekUntilEndOfComment(UCHARBUF *buf, |
| struct UString *token, |
| UErrorCode *status) { |
|     UChar32 ch, next; |
|     uint32_t startLine; |
| |
|     if (U_FAILURE(*status)) { |
|         return; |
|     } |
| |
|     /* remember where the scan began for the error message */ |
|     startLine = lineCount; |
| |
|     for (;;) { |
|         ch = ucbuf_getc(buf, status); |
| |
|         if (ch == ASTERISK) { |
|             next = ucbuf_getc(buf, status); |
| |
|             if (next == SLASH) { |
|                 /* end of comment; ch is ASTERISK so the EOF check below cannot fire */ |
|                 break; |
|             } |
|             ucbuf_ungetc(next, buf); |
|         } |
| |
|         /* add the char to token */ |
|         if (token != NULL) { |
|             ustr_u32cat(token, ch, status); |
|         } |
| |
|         /* called only for its side effect of incrementing lineCount */ |
|         isNewline(ch); |
| |
|         if (ch == U_EOF || *status != U_ZERO_ERROR) { |
|             break; |
|         } |
|     } |
| |
|     if (ch == U_EOF) { |
|         *status = U_INVALID_FORMAT_ERROR; |
|         error(startLine, "unterminated comment detected"); |
|     } |
| } |
| |
| /* Decode the escape sequence whose ESCAPE character has just been read |
| from buf. Returns the value produced by ucbuf_getcx32, or U_EOF |
| without touching buf when *status indicates failure on entry. */ |
| U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) { |
|     UChar32 decoded = U_EOF; |
| |
|     if (!U_FAILURE(*status)) { |
|         /* The caller has already consumed the ESCAPE, but ucbuf_getcx32 |
|          * is handed the sequence starting at the ESCAPE, so put it back. */ |
|         ucbuf_ungetc(ESCAPE, buf); |
|         decoded = ucbuf_getcx32(buf, status); |
|     } |
| |
|     return decoded; |
| } |
| |
| /* Return TRUE when c is one of U+000A, U+2029, U+000D, U+0020, U+0009 |
| or U+FEFF, FALSE otherwise. Side effect: lineCount is incremented for |
| U+000A and U+2029 (but not for U+000D). */ |
| static UBool isWhitespace(UChar32 c) { |
|     /* characters that end a line */ |
|     if (c == 0x000A || c == 0x2029) { |
|         lineCount++; |
|         return TRUE; |
|     } |
| |
|     /* '\r', ' ', '\t', 0xFEFF */ |
|     if (c == 0x000D || c == 0x0020 || c == 0x0009 || c == 0xFEFF) { |
|         return TRUE; |
|     } |
| |
|     return FALSE; |
| } |
| |
| /* Return TRUE when c is U+000A, U+2029 or U+000D, FALSE otherwise. |
| Side effect: lineCount is incremented for U+000A and U+2029 (but not |
| for U+000D). */ |
| static UBool isNewline(UChar32 c) { |
|     /* '\n' and 0x2029 advance the line counter */ |
|     if (c == 0x000A || c == 0x2029) { |
|         lineCount++; |
|         return TRUE; |
|     } |
| |
|     /* '\r' is a newline but is not counted */ |
|     if (c == 0x000D) { |
|         return TRUE; |
|     } |
| |
|     return FALSE; |
| } |