| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| |
| #include <stdio.h> |
| #include <string> |
| #include <stdlib.h> |
| #include <errno.h> |
| #include <string.h> |
| #include <iostream> |
| #include <fstream> |
| |
| // We only use U8_* macros, which are entirely inline. |
| #include "unicode/utf8.h" |
| |
| // This contains a codepage and ISO 14882:1998 illegality table. |
| // Use "make gen-table" to rebuild it. |
| #include "cptbl.h" |
| |
| /** |
| * What is this? |
| * |
| * "This" is a preprocessor that makes an attempt to convert fully valid C++11 source code |
| * in utf-8 into something consumable by certain compilers (Solaris, xlC) |
| * which aren't quite standards compliant. |
| * |
| * - u"<unicode>" or u'<unicode>' gets converted to u"\uNNNN" or u'\uNNNN' |
| * - u8"<unicode>" gets converted to "\xAA\xBB\xCC\xDD" etc. |
| * (some compilers do not support the u8 prefix correctly.) |
| * - if the system is EBCDIC-based, that is used to correct the input characters. |
| * |
| * Usage: |
| * escapesrc infile.cpp outfile.cpp |
| * Normally this is invoked by the build stage, with a rule such as: |
| * |
| * _%.cpp: $(srcdir)/%.cpp |
| * @$(BINDIR)/escapesrc$(EXEEXT) $< $@ |
| * %.o: _%.cpp |
| * $(COMPILE.cc) ... $@ $< |
| * |
| * In the Makefiles, SKIP_ESCAPING=YES is used to prevent escapesrc.cpp |
| * from being itself escaped. |
| */ |
| |
| |
// Whitespace characters recognised by skipws(), given as numeric code values.
static const char kSPACE = 0x20;
static const char kTAB = 0x09;
static const char kLF = 0x0A;
static const char kCR = 0x0D;

// Shorthand for a lookup in the cp1047_8859_1 table
// (presumably declared in cptbl.h - confirm against that header).
#define cp1047_to_8859(c) cp1047_8859_1[c]

// Name used as the prefix of diagnostics; assigned in main().
std::string prog;
| |
| /** |
| * Give the usual 1-line documentation and exit |
| */ |
| void usage() { |
| fprintf(stderr, "%s: usage: %s infile.cpp outfile.cpp\n", prog.c_str(), prog.c_str()); |
| } |
| |
| /** |
| * Delete the output file (if any) |
| * We want to delete even if we didn't generate, because it might be stale. |
| */ |
| int cleanup(const std::string &outfile) { |
| const char *outstr = outfile.c_str(); |
| if(outstr && *outstr) { |
| int rc = std::remove(outstr); |
| if(rc == 0) { |
| fprintf(stderr, "%s: deleted %s\n", prog.c_str(), outstr); |
| return 0; |
| } else { |
| if( errno == ENOENT ) { |
| return 0; // File did not exist - no error. |
| } else { |
| perror("std::remove"); |
| return 1; |
| } |
| } |
| } |
| return 0; |
| } |
| |
| /** |
| * Skip across any known whitespace. |
| * @param p startpoint |
| * @param e limit |
| * @return first non-whitespace char |
| */ |
| inline const char *skipws(const char *p, const char *e) { |
| for(;p<e;p++) { |
| switch(*p) { |
| case kSPACE: |
| case kTAB: |
| case kLF: |
| case kCR: |
| break; |
| default: |
| return p; // non ws |
| } |
| } |
| return p; |
| } |
| |
/**
 * Append one byte to a string as a hex escape of the form \xNN
 * (always two upper-case hex digits).
 * @param outstr string to append to
 * @param byte the byte to append
 */
void appendByte(std::string &outstr,
                uint8_t byte) {
  static const char kHexDigits[] = "0123456789ABCDEF";
  outstr += '\\';
  outstr += 'x';
  outstr += kHexDigits[(byte >> 4) & 0x0F];  // high nibble
  outstr += kHexDigits[byte & 0x0F];         // low nibble
}
| |
| /** |
| * Append the bytes from 'linestr' into outstr, with escaping |
| * @param outstr the output buffer |
| * @param linestr the input buffer |
| * @param pos in/out: the current char under consideration |
| * @param chars the number of chars to consider |
| * @return true on failure |
| */ |
| bool appendUtf8(std::string &outstr, |
| const std::string &linestr, |
| size_t &pos, |
| size_t chars) { |
| char tmp[9]; |
| for(size_t i=0;i<chars;i++) { |
| tmp[i] = linestr[++pos]; |
| } |
| tmp[chars] = 0; |
| unsigned int c; |
| sscanf(tmp, "%X", &c); |
| UChar32 ch = c & 0x1FFFFF; |
| |
| // now to append \\x%% etc |
| uint8_t bytesNeeded = U8_LENGTH(ch); |
| if(bytesNeeded == 0) { |
| fprintf(stderr, "Illegal code point U+%X\n", ch); |
| return true; |
| } |
| uint8_t bytes[4]; |
| uint8_t *s = bytes; |
| size_t i = 0; |
| U8_APPEND_UNSAFE(s, i, ch); |
| for(size_t t = 0; t<i; t++) { |
| appendByte(outstr, s[t]); |
| } |
| return false; |
| } |
| |
| /** |
| * Fixup u8"x" |
| * @param linestr string to mutate. Already escaped into \u format. |
| * @param origpos beginning, points to 'u8"' |
| * @param pos end, points to " |
| * @return false for no-problem, true for failure! |
| */ |
| bool fixu8(std::string &linestr, size_t origpos, size_t &endpos) { |
| size_t pos = origpos + 3; |
| std::string outstr; |
| outstr += '\"'; // local encoding |
| for(;pos<endpos;pos++) { |
| char c = linestr[pos]; |
| if(c == '\\') { |
| char c2 = linestr[++pos]; |
| switch(c2) { |
| case '\'': |
| case '"': |
| #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) |
| c2 = cp1047_to_8859(c2); |
| #endif |
| appendByte(outstr, c2); |
| break; |
| case 'u': |
| appendUtf8(outstr, linestr, pos, 4); |
| break; |
| case 'U': |
| appendUtf8(outstr, linestr, pos, 8); |
| break; |
| } |
| } else { |
| #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) |
| c = cp1047_to_8859(c); |
| #endif |
| appendByte(outstr, c); |
| } |
| } |
| outstr += ('\"'); |
| |
| linestr.replace(origpos, (endpos-origpos+1), outstr); |
| |
| return false; // OK |
| } |
| |
| /** |
| * fix the u"x"/u'x'/u8"x" string at the position |
| * u8'x' is not supported, sorry. |
| * @param linestr the input string |
| * @param pos the position |
| * @return false = no err, true = had err |
| */ |
| bool fixAt(std::string &linestr, size_t pos) { |
| size_t origpos = pos; |
| |
| if(linestr[pos] != 'u') { |
| fprintf(stderr, "Not a 'u'?"); |
| return true; |
| } |
| |
| pos++; // past 'u' |
| |
| bool utf8 = false; |
| |
| if(linestr[pos] == '8') { // u8" |
| utf8 = true; |
| pos++; |
| } |
| |
| char quote = linestr[pos]; |
| |
| if(quote != '\'' && quote != '\"') { |
| fprintf(stderr, "Quote is '%c' - not sure what to do.\n", quote); |
| return true; |
| } |
| |
| if(quote == '\'' && utf8) { |
| fprintf(stderr, "Cannot do u8'...'\n"); |
| return true; |
| } |
| |
| pos ++; |
| |
| //printf("u%c…%c\n", quote, quote); |
| |
| for(; pos < linestr.size(); pos++) { |
| if(linestr[pos] == quote) { |
| if(utf8) { |
| return fixu8(linestr, origpos, pos); // fix u8"..." |
| } else { |
| return false; // end of quote |
| } |
| } |
| if(linestr[pos] == '\\') { |
| pos++; |
| if(linestr[pos] == quote) continue; // quoted quote |
| if(linestr[pos] == 'u') continue; // for now ... unicode escape |
| if(linestr[pos] == '\\') continue; |
| // some other escape… ignore |
| } else { |
| size_t old_pos = pos; |
| int32_t i = pos; |
| #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) |
| // mogrify 1-4 bytes from 1047 'back' to utf-8 |
| char old_byte = linestr[pos]; |
| linestr[pos] = cp1047_to_8859(linestr[pos]); |
| // how many more? |
| int32_t trail = U8_COUNT_TRAIL_BYTES(linestr[pos]); |
| for(size_t pos2 = pos+1; trail>0; pos2++,trail--) { |
| linestr[pos2] = cp1047_to_8859(linestr[pos2]); |
| if(linestr[pos2] == 0x0A) { |
| linestr[pos2] = 0x85; // NL is ambiguous here |
| } |
| } |
| #endif |
| |
| // Proceed to decode utf-8 |
| const uint8_t *s = (const uint8_t*) (linestr.c_str()); |
| int32_t length = linestr.size(); |
| UChar32 c; |
| if(U8_IS_SINGLE((uint8_t)s[i]) && oldIllegal[s[i]]) { |
| #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) |
| linestr[pos] = old_byte; // put it back |
| #endif |
| continue; // single code point not previously legal for \u escaping |
| } |
| |
| // otherwise, convert it to \u / \U |
| { |
| U8_NEXT(s, i, length, c); |
| } |
| if(c<0) { |
| fprintf(stderr, "Illegal utf-8 sequence at Column: %d\n", (int)old_pos); |
| fprintf(stderr, "Line: >>%s<<\n", linestr.c_str()); |
| return true; |
| } |
| |
| size_t seqLen = (i-pos); |
| |
| //printf("U+%04X pos %d [len %d]\n", c, pos, seqLen);fflush(stdout); |
| |
| char newSeq[20]; |
| if( c <= 0xFFFF) { |
| sprintf(newSeq, "\\u%04X", c); |
| } else { |
| sprintf(newSeq, "\\U%08X", c); |
| } |
| linestr.replace(pos, seqLen, newSeq); |
| pos += strlen(newSeq) - 1; |
| } |
| } |
| |
| return false; |
| } |
| |
| /** |
| * Fixup an entire line |
| * false = no err |
| * true = had err |
| * @param no the line number (not used) |
| * @param linestr the string to fix |
| * @return true if any err, else false |
| */ |
| bool fixLine(int /*no*/, std::string &linestr) { |
| const char *line = linestr.c_str(); |
| size_t len = linestr.size(); |
| |
| // no u' in the line? |
| if(!strstr(line, "u'") && !strstr(line, "u\"") && !strstr(line, "u8\"")) { |
| return false; // Nothing to do. No u' or u" detected |
| } |
| |
| // start from the end and find all u" cases |
| size_t pos = len = linestr.size(); |
| if(len>INT32_MAX/2) { |
| return true; |
| } |
| while((pos>0) && (pos = linestr.rfind("u\"", pos)) != std::string::npos) { |
| //printf("found doublequote at %d\n", pos); |
| if(fixAt(linestr, pos)) return true; |
| if(pos == 0) break; |
| pos--; |
| } |
| |
| // reset and find all u' cases |
| pos = len = linestr.size(); |
| while((pos>0) && (pos = linestr.rfind("u'", pos)) != std::string::npos) { |
| //printf("found singlequote at %d\n", pos); |
| if(fixAt(linestr, pos)) return true; |
| if(pos == 0) break; |
| pos--; |
| } |
| |
| // reset and find all u8" cases |
| pos = len = linestr.size(); |
| while((pos>0) && (pos = linestr.rfind("u8\"", pos)) != std::string::npos) { |
| if(fixAt(linestr, pos)) return true; |
| if(pos == 0) break; |
| pos--; |
| } |
| |
| //fprintf(stderr, "%d - fixed\n", no); |
| return false; |
| } |
| |
| /** |
| * Convert a whole file |
| * @param infile |
| * @param outfile |
| * @return 1 on err, 0 otherwise |
| */ |
| int convert(const std::string &infile, const std::string &outfile) { |
| fprintf(stderr, "escapesrc: %s -> %s\n", infile.c_str(), outfile.c_str()); |
| |
| std::ifstream inf; |
| |
| inf.open(infile.c_str(), std::ios::in); |
| |
| if(!inf.is_open()) { |
| fprintf(stderr, "%s: could not open input file %s\n", prog.c_str(), infile.c_str()); |
| cleanup(outfile); |
| return 1; |
| } |
| |
| std::ofstream outf; |
| |
| outf.open(outfile.c_str(), std::ios::out); |
| |
| if(!outf.is_open()) { |
| fprintf(stderr, "%s: could not open output file %s\n", prog.c_str(), outfile.c_str()); |
| return 1; |
| } |
| |
| // TODO: any platform variations of #line? |
| outf << "#line 1 \"" << infile << "\"" << '\n'; |
| |
| int no = 0; |
| std::string linestr; |
| while( getline( inf, linestr)) { |
| no++; |
| if(fixLine(no, linestr)) { |
| goto fail; |
| } |
| outf << linestr << '\n'; |
| } |
| |
| if(inf.eof()) { |
| return 0; |
| } |
| fail: |
| outf.close(); |
| fprintf(stderr, "%s:%d: Fixup failed by %s\n", infile.c_str(), no, prog.c_str()); |
| cleanup(outfile); |
| return 1; |
| } |
| |
| /** |
| * Main function |
| */ |
| int main(int argc, const char *argv[]) { |
| prog = argv[0]; |
| |
| if(argc != 3) { |
| usage(); |
| return 1; |
| } |
| |
| std::string infile = argv[1]; |
| std::string outfile = argv[2]; |
| |
| return convert(infile, outfile); |
| } |