// src/net/tools/dump_cache/url_to_filename_encoder.cc - cobalt - Git at Google

 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include <stdlib.h>

 #include "base/logging.h"
 #include "base/string_util.h"
 #include "net/base/net_util.h"
 #include "net/tools/dump_cache/url_to_filename_encoder.h"

 using std::string;

 namespace {

 // Reports whether the first |num_digits| characters of |buf| are all
 // hexadecimal digits.  Returns 1 when they are and 0 otherwise.
 // A terminating '\0' is not a hex digit, so a string that ends before
 // |num_digits| characters have been seen yields 0.
 int HexDigitsPrefix(const char* buf, int num_digits) {
   int remaining = num_digits;
   for (const char* cursor = buf; remaining > 0; ++cursor, --remaining) {
     if (!IsHexDigit(*cursor)) {
       return 0;
     }
   }
   return 1;
 }

 // On these platforms strtoull is mapped to _strtoui64, so the strtoull call
 // in ParseLeadingHex64Value below resolves to that function instead.
 #if defined(WIN32) || defined(__LB_XB1__) || defined(__LB_XB360__)
 #define strtoull _strtoui64
 #endif

 // Parses the hexadecimal number at the start of |str| as a 64-bit unsigned
 // value.  When strtoull consumes no characters at all (there is no valid
 // number at the start of |str|), |deflt| is returned instead.
 uint64 ParseLeadingHex64Value(const char *str, uint64 deflt) {
   char* end_of_number = NULL;
   const uint64 parsed = strtoull(str, &end_of_number, 16);
   if (end_of_number == str) {
     return deflt;  // Nothing was consumed, so there was no number to parse.
   }
   return parsed;
 }

 }

 namespace net {

 // The escape character choice is made here -- all code and tests in this
 // directory are based off of this constant.  However, our testdata
 // has tons of dependencies on this, so it cannot be changed without
 // re-running those tests and fixing them.
 const char UrlToFilenameEncoder::kEscapeChar = ',';
 // Written right after kEscapeChar when AppendSegment chops an over-long
 // segment; Decode then expects a separator and drops it from the URL.
 const char UrlToFilenameEncoder::kTruncationChar = '-';
 // Longest segment that AppendSegment writes out without chopping it.
 const size_t UrlToFilenameEncoder::kMaximumSubdirectoryLength = 128;

 // Moves as much of |*segment| as fits into one path component onto the end
 // of |*dest|.  |*segment| must not be empty.
 //
 //  - "." and ".." are written with a leading kEscapeChar and |*segment| is
 //    emptied.
 //  - A segment of at most kMaximumSubdirectoryLength characters is copied
 //    verbatim and |*segment| is emptied.
 //  - A longer segment is chopped: a prefix is written, followed by
 //    kEscapeChar and kTruncationChar, and the unwritten tail is left in
 //    |*segment| for the caller to deal with.
 void UrlToFilenameEncoder::AppendSegment(string* segment, string* dest) {
   CHECK(!segment->empty());

   if ((*segment == ".") || (*segment == "..")) {
     dest->append(1, kEscapeChar);
     dest->append(*segment);
     segment->clear();
     return;
   }

   if (segment->size() <= kMaximumSubdirectoryLength) {
     dest->append(*segment);
     segment->clear();
     return;
   }

   // The ",-" marker that signals an artificial '/' takes two characters, so
   // keep at most kMaximumSubdirectoryLength - 2 characters of the segment.
   size_t keep = kMaximumSubdirectoryLength - 2;

   // Do not cut through an escape sequence: if kEscapeChar is the last or the
   // second-to-last kept character, back up so the whole sequence stays in
   // the tail.
   const string& text = *segment;
   if (text[keep - 1] == kEscapeChar) {
     keep -= 1;
   } else if (text[keep - 2] == kEscapeChar) {
     keep -= 2;
   }

   // E.g. with keep == 3 and a segment of "abcd" this writes "abc,-" and
   // leaves "d" in |*segment|.
   dest->append(text.data(), keep);
   dest->append(1, kEscapeChar);
   dest->append(1, kTruncationChar);
   segment->erase(0, keep);
 }

 // Unescapes |escaped_ending| and encodes it, as a continuation of
 // |filename_prefix|, into a filename that is written to |*encoded_filename|.
 //
 // Everything in |filename_prefix| up to and including its last
 // |dir_separator| replaces |*encoded_filename|; the remainder of the prefix
 // becomes the start of the first segment.  If the prefix contains no
 // separator, |*encoded_filename| is only appended to.
 //
 // Characters other than _ . = + - and ASCII alphanumerics are written as
 // kEscapeChar followed by two uppercase hex digits.  Segments longer than
 // kMaximumSubdirectoryLength are chopped by AppendSegment, and the leaf
 // always gets a trailing kEscapeChar.
 void UrlToFilenameEncoder::EncodeSegment(const string& filename_prefix,
                                          const string& escaped_ending,
                                          char dir_separator,
                                          string* encoded_filename) {
   static const char kHexDigits[] = "0123456789ABCDEF";
   const string filename_ending = UrlUtilities::Unescape(escaped_ending);

   // TODO(jmarantz): This code would be a bit simpler if we disallowed
   // Instaweb allowing filename_prefix to not end in "/".  We could
   // then change the is routine to just take one input string.
   string segment;
   const size_t last_separator = filename_prefix.find_last_of(dir_separator);
   if (last_separator == string::npos) {
     segment = filename_prefix;
   } else {
     // Take the segment first: |encoded_filename| is overwritten next.
     segment = filename_prefix.substr(last_separator + 1);
     *encoded_filename = filename_prefix.substr(0, last_separator + 1);
   }

   size_t pos = 0;
   // A leading separator is emitted directly so that no leading kEscapeChar
   // is produced for it.
   if (!filename_ending.empty() && (filename_ending[0] == dir_separator)) {
     encoded_filename->append(segment);
     segment.clear();
     encoded_filename->append(1, dir_separator);
     pos = 1;
   }

   for (; pos < filename_ending.length(); ++pos) {
     const unsigned char ch = static_cast<unsigned char>(filename_ending[pos]);

     // A separator closes the current segment.  With an empty segment the
     // separator falls through and is escaped below, so that no empty
     // segment is ever written.
     if ((ch == dir_separator) && !segment.empty()) {
       AppendSegment(&segment, encoded_filename);
       encoded_filename->append(1, dir_separator);
       segment.clear();
       continue;
     }

     // After removing unsafe chars the only safe ones are _.=+- and alphanums.
     const bool is_punctuation_ok = (ch == '_') || (ch == '.') || (ch == '=') ||
                                    (ch == '+') || (ch == '-');
     const bool is_digit = ('0' <= ch) && (ch <= '9');
     const bool is_letter = (('A' <= ch) && (ch <= 'Z')) ||
                            (('a' <= ch) && (ch <= 'z'));
     if (is_punctuation_ok || is_digit || is_letter) {
       segment.append(1, static_cast<char>(ch));
     } else {
       segment.append(1, kEscapeChar);
       segment.append(1, kHexDigits[ch / 16]);
       segment.append(1, kHexDigits[ch % 16]);
     }

     // If segment is too big, we must chop it into chunks.
     if (segment.size() > kMaximumSubdirectoryLength) {
       AppendSegment(&segment, encoded_filename);
       encoded_filename->append(1, dir_separator);
     }
   }

   // The leaf gets a trailing kEscapeChar so that a leaf can also be a
   // branch: http://a/b/c and http://a/b/c/d co-exist as "/a/b/c," and
   // "/a/b/c/d,".  If that pushes the leaf over the length limit, the
   // overflow goes into one more component.
   segment += kEscapeChar;
   AppendSegment(&segment, encoded_filename);
   if (!segment.empty()) {
     // The overflow left after chopping is written as is; it is not checked
     // again for size or escaping.
     encoded_filename->append(1, dir_separator);
     encoded_filename->append(segment);
   }
 }

 // Decodes |encoded_filename| back into URL text and appends the result to
 // |*decoded_url|.
 //
 // |dir_separator| is the path separator used in |encoded_filename|; every
 // separator that belongs to the URL is written as '/'.
 //
 // Returns true only when the whole input was consumed and it ended right
 // after a kEscapeChar.  On a false return |*decoded_url| may already hold
 // partially decoded text.
 //
 // Note: this decoder is not the exact inverse of the EncodeSegment above,
 // because it does not take into account a prefix.
 bool UrlToFilenameEncoder::Decode(const string& encoded_filename,
                                   char dir_separator,
                                   string* decoded_url) {
   enum State {
     kStart,       // Ordinary text.
     kEscape,      // Just saw kEscapeChar.
     kFirstDigit,  // Saw kEscapeChar and the first hex digit.
     kTruncate,    // Saw kEscapeChar kTruncationChar; a separator must follow.
     kEscapeDot    // Saw kEscapeChar '.'.
   };
   State state = kStart;
   char hex_buffer[3];
   hex_buffer[2] = '\0';
   for (size_t i = 0; i < encoded_filename.size(); ++i) {
     char ch = encoded_filename[i];
     switch (state) {
       case kStart:
         if (ch == kEscapeChar) {
           state = kEscape;
         } else if (ch == dir_separator) {
           decoded_url->append(1, '/');  // URLs only use '/' not '\\'
         } else {
           decoded_url->append(1, ch);
         }
         break;
       case kEscape:
         if (HexDigitsPrefix(&ch, 1) == 1) {
           hex_buffer[0] = ch;
           state = kFirstDigit;
         } else if (ch == kTruncationChar) {
           state = kTruncate;
         } else if (ch == '.') {
           decoded_url->append(1, '.');
           state = kEscapeDot;  // Look for at most one more dot.
         } else if (ch == dir_separator) {
           // Consider url "//x".  This was once encoded to "/,/x,".
           // This code is what skips the first Escape.
           decoded_url->append(1, '/');  // URLs only use '/' not '\\'
           state = kStart;
         } else {
           return false;
         }
         break;
       case kFirstDigit:
         if (HexDigitsPrefix(&ch, 1) == 1) {
           hex_buffer[1] = ch;
           uint64 hex_value = ParseLeadingHex64Value(hex_buffer, 0);
           decoded_url->append(1, static_cast<char>(hex_value));
           state = kStart;
         } else {
           return false;
         }
         break;
       case kTruncate:
         if (ch == dir_separator) {
           // Skip this separator, it was only put in to break up long
           // path segments, but is not part of the URL.
           state = kStart;
         } else {
           return false;
         }
         break;
       case kEscapeDot:
         // The character after an escaped '.' is copied through.  When it is
         // the separator that ends a "." segment it must become '/', just as
         // in kStart; otherwise a non-'/' |dir_separator| would leak into
         // the decoded URL.
         if (ch == dir_separator) {
           decoded_url->append(1, '/');  // URLs only use '/' not '\\'
         } else {
           decoded_url->append(1, ch);
         }
         state = kStart;
         break;
     }
   }

   // All legal encoded filenames end in kEscapeChar.
   return (state == kEscape);
 }

 // Escapes |path| and returns the result.  Hyphen, backslash, underscore and
 // ASCII alphanumerics are copied unchanged; every other byte is written as
 // 'x' followed by its two-digit uppercase hex value.  In addition, a path
 // separator is inserted whenever more than kMaximumSubdirectoryLength input
 // characters have passed since the last backslash or inserted separator.
 //
 // This legacy version has several issues with aliasing of different URLs,
 // inability to represent both /a/b/c and /a/b/c/d, and inability to decode
 // the filenames back into URLs.
 //
 // But there is a large body of slurped data which depends on this format,
 // so leave it as the default for spdy_in_mem_edsm_server.
 string UrlToFilenameEncoder::LegacyEscape(const string& path) {
   static const char kHexDigits[] = "0123456789ABCDEF";
   string output;

   // Paths are also chopped into medium sized chunks, because long filenames
   // are a problem on Windows filesystems.
   size_t last_slash = 0;
   for (size_t index = 0; index < path.length(); index++) {
     // Use the unsigned byte value: with a signed char, bytes >= 0x80 are
     // negative and "/ 16" and "% 16" would not yield hex digits.
     const unsigned char ch = static_cast<unsigned char>(path[index]);
     if (ch == 0x5C)
       last_slash = index;
     if ((ch == 0x2D) ||                    // hyphen
         (ch == 0x5C) || (ch == 0x5F) ||    // backslash, underscore
         ((0x30 <= ch) && (ch <= 0x39)) ||  // Digits [0-9]
         ((0x41 <= ch) && (ch <= 0x5A)) ||  // Uppercase [A-Z]
         ((0x61 <= ch) && (ch <= 0x7A))) {  // Lowercase [a-z]
       output.append(1, path[index]);
     } else {
       output.append(1, 'x');
       output.append(1, kHexDigits[ch / 16]);
       output.append(1, kHexDigits[ch % 16]);
     }
     if (index - last_slash > kMaximumSubdirectoryLength) {
 #ifdef WIN32
       const char slash = '\\';
 #else
       const char slash = '/';
 #endif
       output.append(1, slash);
       last_slash = index;
     }
   }
   return output;
 }

 }  // namespace net
	// Copyright (c) 2011 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include <stdlib.h>

	#include "base/logging.h"
	#include "base/string_util.h"
	#include "net/base/net_util.h"
	#include "net/tools/dump_cache/url_to_filename_encoder.h"

	using std::string;

	namespace {

	// Reports whether the first |num_digits| characters of |buf| are all
	// hexadecimal digits.  Returns 1 when they are and 0 otherwise.
	// A terminating '\0' is not a hex digit, so a string that ends before
	// |num_digits| characters have been seen yields 0.
	int HexDigitsPrefix(const char* buf, int num_digits) {
	  int remaining = num_digits;
	  for (const char* cursor = buf; remaining > 0; ++cursor, --remaining) {
	    if (!IsHexDigit(*cursor)) {
	      return 0;
	    }
	  }
	  return 1;
	}

	// On these platforms strtoull is mapped to _strtoui64, so the strtoull call
	// in ParseLeadingHex64Value below resolves to that function instead.
	#if defined(WIN32) || defined(__LB_XB1__) || defined(__LB_XB360__)
	#define strtoull _strtoui64
	#endif

	// Parses the hexadecimal number at the start of |str| as a 64-bit unsigned
	// value.  When strtoull consumes no characters at all (there is no valid
	// number at the start of |str|), |deflt| is returned instead.
	uint64 ParseLeadingHex64Value(const char *str, uint64 deflt) {
	  char* end_of_number = NULL;
	  const uint64 parsed = strtoull(str, &end_of_number, 16);
	  if (end_of_number == str) {
	    return deflt;  // Nothing was consumed, so there was no number to parse.
	  }
	  return parsed;
	}

	}

	namespace net {

	// The escape character choice is made here -- all code and tests in this
	// directory are based off of this constant. However, our testdata
	// has tons of dependencies on this, so it cannot be changed without
	// re-running those tests and fixing them.
	const char UrlToFilenameEncoder::kEscapeChar = ',';
	// Written right after kEscapeChar when AppendSegment chops an over-long
	// segment; Decode then expects a separator and drops it from the URL.
	const char UrlToFilenameEncoder::kTruncationChar = '-';
	// Longest segment that AppendSegment writes out without chopping it.
	const size_t UrlToFilenameEncoder::kMaximumSubdirectoryLength = 128;

	// Moves as much of |*segment| as fits into one path component onto the end
	// of |*dest|.  |*segment| must not be empty.
	//
	//  - "." and ".." are written with a leading kEscapeChar and |*segment| is
	//    emptied.
	//  - A segment of at most kMaximumSubdirectoryLength characters is copied
	//    verbatim and |*segment| is emptied.
	//  - A longer segment is chopped: a prefix is written, followed by
	//    kEscapeChar and kTruncationChar, and the unwritten tail is left in
	//    |*segment| for the caller to deal with.
	void UrlToFilenameEncoder::AppendSegment(string* segment, string* dest) {
	  CHECK(!segment->empty());

	  // Compare the string contents, not the pointer, against "." and "..".
	  if ((*segment == ".") || (*segment == "..")) {
	    dest->append(1, kEscapeChar);
	    dest->append(*segment);
	    segment->clear();
	    return;
	  }

	  if (segment->size() <= kMaximumSubdirectoryLength) {
	    dest->append(*segment);
	    segment->clear();
	    return;
	  }

	  // The ",-" marker that signals an artificial '/' takes two characters, so
	  // keep at most kMaximumSubdirectoryLength - 2 characters of the segment.
	  size_t keep = kMaximumSubdirectoryLength - 2;

	  // Do not cut through an escape sequence: if kEscapeChar is the last or the
	  // second-to-last kept character, back up so the whole sequence stays in
	  // the tail.
	  const string& text = *segment;
	  if (text[keep - 1] == kEscapeChar) {
	    keep -= 1;
	  } else if (text[keep - 2] == kEscapeChar) {
	    keep -= 2;
	  }

	  // E.g. with keep == 3 and a segment of "abcd" this writes "abc,-" and
	  // leaves "d" in |*segment|.
	  dest->append(text.data(), keep);
	  dest->append(1, kEscapeChar);
	  dest->append(1, kTruncationChar);
	  segment->erase(0, keep);
	}

	// Unescapes |escaped_ending| and encodes it, as a continuation of
	// |filename_prefix|, into a filename that is written to |*encoded_filename|.
	//
	// Everything in |filename_prefix| up to and including its last
	// |dir_separator| replaces |*encoded_filename|; the remainder of the prefix
	// becomes the start of the first segment.  If the prefix contains no
	// separator, |*encoded_filename| is only appended to.
	//
	// Characters other than _ . = + - and ASCII alphanumerics are written as
	// kEscapeChar followed by two uppercase hex digits.  Segments longer than
	// kMaximumSubdirectoryLength are chopped by AppendSegment, and the leaf
	// always gets a trailing kEscapeChar.
	void UrlToFilenameEncoder::EncodeSegment(const string& filename_prefix,
	                                         const string& escaped_ending,
	                                         char dir_separator,
	                                         string* encoded_filename) {
	  static const char kHexDigits[] = "0123456789ABCDEF";
	  const string filename_ending = UrlUtilities::Unescape(escaped_ending);

	  // TODO(jmarantz): This code would be a bit simpler if we disallowed
	  // Instaweb allowing filename_prefix to not end in "/".  We could
	  // then change the is routine to just take one input string.
	  string segment;
	  const size_t last_separator = filename_prefix.find_last_of(dir_separator);
	  if (last_separator == string::npos) {
	    segment = filename_prefix;
	  } else {
	    // Take the segment first: |encoded_filename| is overwritten next.
	    segment = filename_prefix.substr(last_separator + 1);
	    *encoded_filename = filename_prefix.substr(0, last_separator + 1);
	  }

	  size_t pos = 0;
	  // A leading separator is emitted directly so that no leading kEscapeChar
	  // is produced for it.
	  if (!filename_ending.empty() && (filename_ending[0] == dir_separator)) {
	    encoded_filename->append(segment);
	    segment.clear();
	    encoded_filename->append(1, dir_separator);
	    pos = 1;
	  }

	  for (; pos < filename_ending.length(); ++pos) {
	    const unsigned char ch = static_cast<unsigned char>(filename_ending[pos]);

	    // A separator closes the current segment.  With an empty segment the
	    // separator falls through and is escaped below, so that no empty
	    // segment is ever written.
	    if ((ch == dir_separator) && !segment.empty()) {
	      AppendSegment(&segment, encoded_filename);
	      encoded_filename->append(1, dir_separator);
	      segment.clear();
	      continue;
	    }

	    // After removing unsafe chars the only safe ones are _.=+- and alphanums.
	    const bool is_punctuation_ok = (ch == '_') || (ch == '.') || (ch == '=') ||
	                                   (ch == '+') || (ch == '-');
	    const bool is_digit = ('0' <= ch) && (ch <= '9');
	    const bool is_letter = (('A' <= ch) && (ch <= 'Z')) ||
	                           (('a' <= ch) && (ch <= 'z'));
	    if (is_punctuation_ok || is_digit || is_letter) {
	      segment.append(1, static_cast<char>(ch));
	    } else {
	      segment.append(1, kEscapeChar);
	      segment.append(1, kHexDigits[ch / 16]);
	      segment.append(1, kHexDigits[ch % 16]);
	    }

	    // If segment is too big, we must chop it into chunks.
	    if (segment.size() > kMaximumSubdirectoryLength) {
	      AppendSegment(&segment, encoded_filename);
	      encoded_filename->append(1, dir_separator);
	    }
	  }

	  // The leaf gets a trailing kEscapeChar so that a leaf can also be a
	  // branch: http://a/b/c and http://a/b/c/d co-exist as "/a/b/c," and
	  // "/a/b/c/d,".  If that pushes the leaf over the length limit, the
	  // overflow goes into one more component.
	  segment += kEscapeChar;
	  AppendSegment(&segment, encoded_filename);
	  if (!segment.empty()) {
	    // The overflow left after chopping is written as is; it is not checked
	    // again for size or escaping.
	    encoded_filename->append(1, dir_separator);
	    encoded_filename->append(segment);
	  }
	}

	// Decodes |encoded_filename| back into URL text and appends the result to
	// |*decoded_url|.
	//
	// |dir_separator| is the path separator used in |encoded_filename|; every
	// separator that belongs to the URL is written as '/'.
	//
	// Returns true only when the whole input was consumed and it ended right
	// after a kEscapeChar.  On a false return |*decoded_url| may already hold
	// partially decoded text.
	//
	// Note: this decoder is not the exact inverse of the EncodeSegment above,
	// because it does not take into account a prefix.
	bool UrlToFilenameEncoder::Decode(const string& encoded_filename,
	                                  char dir_separator,
	                                  string* decoded_url) {
	  enum State {
	    kStart,       // Ordinary text.
	    kEscape,      // Just saw kEscapeChar.
	    kFirstDigit,  // Saw kEscapeChar and the first hex digit.
	    kTruncate,    // Saw kEscapeChar kTruncationChar; a separator must follow.
	    kEscapeDot    // Saw kEscapeChar '.'.
	  };
	  State state = kStart;
	  char hex_buffer[3];
	  hex_buffer[2] = '\0';
	  for (size_t i = 0; i < encoded_filename.size(); ++i) {
	    char ch = encoded_filename[i];
	    switch (state) {
	      case kStart:
	        if (ch == kEscapeChar) {
	          state = kEscape;
	        } else if (ch == dir_separator) {
	          decoded_url->append(1, '/');  // URLs only use '/' not '\\'
	        } else {
	          decoded_url->append(1, ch);
	        }
	        break;
	      case kEscape:
	        if (HexDigitsPrefix(&ch, 1) == 1) {
	          hex_buffer[0] = ch;
	          state = kFirstDigit;
	        } else if (ch == kTruncationChar) {
	          state = kTruncate;
	        } else if (ch == '.') {
	          decoded_url->append(1, '.');
	          state = kEscapeDot;  // Look for at most one more dot.
	        } else if (ch == dir_separator) {
	          // Consider url "//x".  This was once encoded to "/,/x,".
	          // This code is what skips the first Escape.
	          decoded_url->append(1, '/');  // URLs only use '/' not '\\'
	          state = kStart;
	        } else {
	          return false;
	        }
	        break;
	      case kFirstDigit:
	        if (HexDigitsPrefix(&ch, 1) == 1) {
	          hex_buffer[1] = ch;
	          uint64 hex_value = ParseLeadingHex64Value(hex_buffer, 0);
	          decoded_url->append(1, static_cast<char>(hex_value));
	          state = kStart;
	        } else {
	          return false;
	        }
	        break;
	      case kTruncate:
	        if (ch == dir_separator) {
	          // Skip this separator, it was only put in to break up long
	          // path segments, but is not part of the URL.
	          state = kStart;
	        } else {
	          return false;
	        }
	        break;
	      case kEscapeDot:
	        // The character after an escaped '.' is copied through.  When it is
	        // the separator that ends a "." segment it must become '/', just as
	        // in kStart; otherwise a non-'/' |dir_separator| would leak into
	        // the decoded URL.
	        if (ch == dir_separator) {
	          decoded_url->append(1, '/');  // URLs only use '/' not '\\'
	        } else {
	          decoded_url->append(1, ch);
	        }
	        state = kStart;
	        break;
	    }
	  }

	  // All legal encoded filenames end in kEscapeChar.
	  return (state == kEscape);
	}

	// Escapes |path| and returns the result.  Hyphen, backslash, underscore and
	// ASCII alphanumerics are copied unchanged; every other byte is written as
	// 'x' followed by its two-digit uppercase hex value.  In addition, a path
	// separator is inserted whenever more than kMaximumSubdirectoryLength input
	// characters have passed since the last backslash or inserted separator.
	//
	// This legacy version has several issues with aliasing of different URLs,
	// inability to represent both /a/b/c and /a/b/c/d, and inability to decode
	// the filenames back into URLs.
	//
	// But there is a large body of slurped data which depends on this format,
	// so leave it as the default for spdy_in_mem_edsm_server.
	string UrlToFilenameEncoder::LegacyEscape(const string& path) {
	  static const char kHexDigits[] = "0123456789ABCDEF";
	  string output;

	  // Paths are also chopped into medium sized chunks, because long filenames
	  // are a problem on Windows filesystems.
	  size_t last_slash = 0;
	  for (size_t index = 0; index < path.length(); index++) {
	    // Use the unsigned byte value: with a signed char, bytes >= 0x80 are
	    // negative and "/ 16" and "% 16" would not yield hex digits.
	    const unsigned char ch = static_cast<unsigned char>(path[index]);
	    if (ch == 0x5C)
	      last_slash = index;
	    if ((ch == 0x2D) ||                    // hyphen
	        (ch == 0x5C) || (ch == 0x5F) ||    // backslash, underscore
	        ((0x30 <= ch) && (ch <= 0x39)) ||  // Digits [0-9]
	        ((0x41 <= ch) && (ch <= 0x5A)) ||  // Uppercase [A-Z]
	        ((0x61 <= ch) && (ch <= 0x7A))) {  // Lowercase [a-z]
	      output.append(1, path[index]);
	    } else {
	      output.append(1, 'x');
	      output.append(1, kHexDigits[ch / 16]);
	      output.append(1, kHexDigits[ch % 16]);
	    }
	    if (index - last_slash > kMaximumSubdirectoryLength) {
	#ifdef WIN32
	      const char slash = '\\';
	#else
	      const char slash = '/';
	#endif
	      output.append(1, slash);
	      last_slash = index;
	    }
	  }
	  return output;
	}

	} // namespace net