| /************************************************************************** |
| * |
| * Copyright (C) 2002-2010, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| *************************************************************************** |
| */ |
| |
| // |
| // ugrep - an ICU sample program illustrating the use of ICU Regular Expressions. |
| // |
| // The use of the ICU Regex API all occurs within the main() |
| // function. The rest of the code deals with opening files, |
| // encoding conversions, printing results, etc. |
| // |
| // This is not a full-featured grep program. The command line options |
| // have been kept to a minimum to avoid complicating the sample code. |
| // |
| |
| |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| |
| #include "unicode/utypes.h" |
| #include "unicode/ustring.h" |
| #include "unicode/regex.h" |
| #include "unicode/ucnv.h" |
| #include "unicode/uclean.h" |
| |
| |
| // |
| // The following variables contain paramters that may be set from the command line. |
| // |
| const char *pattern = NULL; // The regular expression |
| int firstFileNum; // argv index of the first file name |
| UBool displayFileName = FALSE; |
| UBool displayLineNum = FALSE; |
| |
| |
| // |
| // Info regarding the file currently being processed |
| // |
| const char *fileName; |
| int fileLen; // Length, in UTF-16 Code Units. |
| |
| UChar *ucharBuf = 0; // Buffer, holds converted file. (Simple minded program, always reads |
| // the whole file at once. |
| |
| char *charBuf = 0; // Buffer, for original, unconverted file data. |
| |
| |
| // |
| // Info regarding the line currently being processed |
| // |
| int lineStart; // Index of first char of the current line in the file buffer |
| int lineEnd; // Index of char following the new line sequence for the current line |
| int lineNum; |
| |
| // |
| // Converter, used on output to convert Unicode data back to char * |
| // so that it will display in non-Unicode terminal windows. |
| // |
| UConverter *outConverter = 0; |
| |
| // |
| // Function forward declarations |
| // |
| void processOptions(int argc, const char **argv); |
| void nextLine(int start); |
| void printMatch(); |
| void printUsage(); |
| void readFile(const char *name); |
| |
| |
| |
| //------------------------------------------------------------------------------------------ |
| // |
| // main for ugrep |
| // |
| // Structurally, all use of the ICU Regular Expression API is in main(), |
| // and all of the supporting stuff necessary to make a running program, but |
| // not directly related to regular expressions, is factored out into these other |
| // functions. |
| // |
| //------------------------------------------------------------------------------------------ |
| int main(int argc, const char** argv) { |
| UBool matchFound = FALSE; |
| |
| // |
| // Process the commmand line options. |
| // |
| processOptions(argc, argv); |
| |
| // |
| // Create a RegexPattern object from the user supplied pattern string. |
| // |
| UErrorCode status = U_ZERO_ERROR; // All ICU operations report success or failure |
| // in a status variable. |
| |
| UParseError parseErr; // In the event of a syntax error in the regex pattern, |
| // this struct will contain the position of the |
| // error. |
| |
| RegexPattern *rePat = RegexPattern::compile(pattern, parseErr, status); |
| // Note that C++ is doing an automatic conversion |
| // of the (char *) pattern to a temporary |
| // UnicodeString object. |
| if (U_FAILURE(status)) { |
| fprintf(stderr, "ugrep: error in pattern: \"%s\" at position %d\n", |
| u_errorName(status), parseErr.offset); |
| exit(-1); |
| } |
| |
| // |
| // Create a RegexMatcher from the newly created pattern. |
| // |
| UnicodeString empty; |
| RegexMatcher *matcher = rePat->matcher(empty, status); |
| if (U_FAILURE(status)) { |
| fprintf(stderr, "ugrep: error in creating RegexMatcher: \"%s\"\n", |
| u_errorName(status)); |
| exit(-1); |
| } |
| |
| // |
| // Loop, processing each of the input files. |
| // |
| for (int fileNum=firstFileNum; fileNum < argc; fileNum++) { |
| readFile(argv[fileNum]); |
| |
| // |
| // Loop through the lines of a file, trying to match the regex pattern on each. |
| // |
| for (nextLine(0); lineStart<fileLen; nextLine(lineEnd)) { |
| UnicodeString s(FALSE, ucharBuf+lineStart, lineEnd-lineStart); |
| matcher->reset(s); |
| if (matcher->find()) { |
| matchFound = TRUE; |
| printMatch(); |
| } |
| } |
| } |
| |
| // |
| // Clean up |
| // |
| delete matcher; |
| delete rePat; |
| free(ucharBuf); |
| free(charBuf); |
| ucnv_close(outConverter); |
| |
| u_cleanup(); // shut down ICU, release any cached data it owns. |
| |
| return matchFound? 0: 1; |
| } |
| |
| |
| |
| //------------------------------------------------------------------------------------------ |
| // |
| // doOptions Run through the command line options, and set |
| // the global variables accordingly. |
| // |
| // exit without returning if an error occured and |
| // ugrep should not proceed further. |
| // |
| //------------------------------------------------------------------------------------------ |
| void processOptions(int argc, const char **argv) { |
| int optInd; |
| UBool doUsage = FALSE; |
| UBool doVersion = FALSE; |
| const char *arg; |
| |
| |
| for(optInd = 1; optInd < argc; ++optInd) { |
| arg = argv[optInd]; |
| |
| /* version info */ |
| if(strcmp(arg, "-V") == 0 || strcmp(arg, "--version") == 0) { |
| doVersion = TRUE; |
| } |
| /* usage info */ |
| else if(strcmp(arg, "--help") == 0) { |
| doUsage = TRUE; |
| } |
| else if(strcmp(arg, "-n") == 0 || strcmp(arg, "--line-number") == 0) { |
| displayLineNum = TRUE; |
| } |
| /* POSIX.1 says all arguments after -- are not options */ |
| else if(strcmp(arg, "--") == 0) { |
| /* skip the -- */ |
| ++optInd; |
| break; |
| } |
| /* unrecognized option */ |
| else if(strncmp(arg, "-", strlen("-")) == 0) { |
| printf("ugrep: invalid option -- %s\n", arg+1); |
| doUsage = TRUE; |
| } |
| /* done with options */ |
| else { |
| break; |
| } |
| } |
| |
| if (doUsage) { |
| printUsage(); |
| exit(0); |
| } |
| |
| if (doVersion) { |
| printf("ugrep version 0.01\n"); |
| if (optInd == argc) { |
| exit(0); |
| } |
| } |
| |
| int remainingArgs = argc-optInd; // pattern file ... |
| if (remainingArgs < 2) { |
| fprintf(stderr, "ugrep: files or pattern are missing.\n"); |
| printUsage(); |
| exit(1); |
| } |
| |
| if (remainingArgs > 2) { |
| // More than one file to be processed. Display file names with match output. |
| displayFileName = TRUE; |
| } |
| |
| pattern = argv[optInd]; |
| firstFileNum = optInd+1; |
| } |
| |
//------------------------------------------------------------------------------------------
//
//   printUsage    Write the command line summary to stdout.
//
//                 This function only prints; it returns to the caller, which
//                 decides whether and with what status to end the program.
//                 (Exiting here with status 0 would make the caller's own
//                 exit() unreachable and turn usage errors into a "success".)
//
//------------------------------------------------------------------------------------------
void printUsage() {
    printf("ugrep [options] pattern file...\n"
        " -V or --version display version information\n"
        " --help display this help and exit\n"
        " -- stop further option processing\n"
        "-n, --line-number Prefix each line of output with the line number within its input file.\n"
        );
}
| |
| //------------------------------------------------------------------------------------------ |
| // |
| // readFile Read a file into memory, and convert it to Unicode. |
| // |
| // Since this is just a demo program, take the simple minded approach |
| // of always reading the whole file at once. No intelligent buffering |
| // is done. |
| // |
| //------------------------------------------------------------------------------------------ |
| void readFile(const char *name) { |
| |
| // |
| // Initialize global file variables |
| // |
| fileName = name; |
| fileLen = 0; // zero length prevents processing in case of errors. |
| |
| |
| // |
| // Open the file and determine its size. |
| // |
| FILE *file = fopen(name, "rb"); |
| if (file == 0 ) { |
| fprintf(stderr, "ugrep: Could not open file \"%s\"\n", fileName); |
| return; |
| } |
| fseek(file, 0, SEEK_END); |
| int rawFileLen = ftell(file); |
| fseek(file, 0, SEEK_SET); |
| |
| |
| // |
| // Read in the file |
| // |
| charBuf = (char *)realloc(charBuf, rawFileLen+1); // Need error checking... |
| int t = fread(charBuf, 1, rawFileLen, file); |
| if (t != rawFileLen) { |
| fprintf(stderr, "Error reading file \"%s\"\n", fileName); |
| fclose(file); |
| return; |
| } |
| charBuf[rawFileLen]=0; |
| fclose(file); |
| |
| // |
| // Look for a Unicode Signature (BOM) in the data |
| // |
| int32_t signatureLength; |
| const char * charDataStart = charBuf; |
| UErrorCode status = U_ZERO_ERROR; |
| const char* encoding = ucnv_detectUnicodeSignature( |
| charDataStart, rawFileLen, &signatureLength, &status); |
| if (U_FAILURE(status)) { |
| fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_detectUnicodeSignature()\n", |
| u_errorName(status)); |
| return; |
| } |
| if(encoding!=NULL ){ |
| charDataStart += signatureLength; |
| rawFileLen -= signatureLength; |
| } |
| |
| // |
| // Open a converter to take the file to UTF-16 |
| // |
| UConverter* conv; |
| conv = ucnv_open(encoding, &status); |
| if (U_FAILURE(status)) { |
| fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_open()\n", u_errorName(status)); |
| return; |
| } |
| |
| // |
| // Convert the file data to UChar. |
| // Preflight first to determine required buffer size. |
| // |
| uint32_t destCap = ucnv_toUChars(conv, |
| NULL, // dest, |
| 0, // destCapacity, |
| charDataStart, |
| rawFileLen, |
| &status); |
| if (status != U_BUFFER_OVERFLOW_ERROR) { |
| fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); |
| return; |
| }; |
| |
| status = U_ZERO_ERROR; |
| ucharBuf = (UChar *)realloc(ucharBuf, (destCap+1) * sizeof(UChar)); |
| ucnv_toUChars(conv, |
| ucharBuf, // dest, |
| destCap+1, |
| charDataStart, |
| rawFileLen, |
| &status); |
| if (U_FAILURE(status)) { |
| fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); |
| return; |
| }; |
| ucnv_close(conv); |
| |
| // |
| // Successful conversion. Set the global size variables so that |
| // the rest of the processing will proceed for this file. |
| // |
| fileLen = destCap; |
| } |
| |
| |
| |
| |
| |
| //------------------------------------------------------------------------------------------ |
| // |
| // nextLine Advance the line index variables, starting at the |
| // specified position in the input file buffer, by |
| // scanning forwrd until the next end-of-line. |
| // |
| // Need to take into account all of the possible Unicode |
| // line ending sequences. |
| // |
| //------------------------------------------------------------------------------------------ |
| void nextLine(int startPos) { |
| if (startPos == 0) { |
| lineNum = 0; |
| } else { |
| lineNum++; |
| } |
| lineStart = lineEnd = startPos; |
| |
| for (;;) { |
| if (lineEnd >= fileLen) { |
| return; |
| } |
| UChar c = ucharBuf[lineEnd]; |
| lineEnd++; |
| if (c == 0x0a || // Line Feed |
| c == 0x0c || // Form Feed |
| c == 0x0d || // Carriage Return |
| c == 0x85 || // Next Line |
| c == 0x2028 || // Line Separator |
| c == 0x2029) // Paragraph separator |
| { |
| break; |
| } |
| } |
| |
| // Check for CR/LF sequence, and advance over the LF if we're in the middle of one. |
| if (lineEnd < fileLen && |
| ucharBuf[lineEnd-1] == 0x0d && |
| ucharBuf[lineEnd] == 0x0a) |
| { |
| lineEnd++; |
| } |
| } |
| |
| |
| //------------------------------------------------------------------------------------------ |
| // |
| // printMatch Called when a matching line has been located. |
| // Print out the line from the file with the match, after |
| // converting it back to the default code page. |
| // |
| //------------------------------------------------------------------------------------------ |
| void printMatch() { |
| char buf[2000]; |
| UErrorCode status = U_ZERO_ERROR; |
| |
| // If we haven't already created a converter for output, do it now. |
| if (outConverter == 0) { |
| outConverter = ucnv_open(NULL, &status); |
| if (U_FAILURE(status)) { |
| fprintf(stderr, "ugrep: Error opening default converter: \"%s\"\n", |
| u_errorName(status)); |
| exit(-1); |
| } |
| }; |
| |
| // Convert the line to be printed back to the default 8 bit code page. |
| // If the line is too long for our buffer, just truncate it. |
| ucnv_fromUChars(outConverter, |
| buf, // destination buffer for conversion |
| sizeof(buf), // capacity of destination buffer |
| &ucharBuf[lineStart], // Input to conversion |
| lineEnd-lineStart, // number of UChars to convert |
| &status); |
| buf[sizeof(buf)-1] = 0; // Add null for use in case of too long lines. |
| // The converter null-terminates its output unless |
| // the buffer completely fills. |
| |
| if (displayFileName) { |
| printf("%s:", fileName); |
| } |
| if (displayLineNum) { |
| printf("%d:", lineNum); |
| } |
| printf("%s", buf); |
| } |
| |