Import Cobalt 3.11337
diff --git a/src/net/tools/dump_cache/url_to_filename_encoder.cc b/src/net/tools/dump_cache/url_to_filename_encoder.cc
new file mode 100644
index 0000000..e928ee9
--- /dev/null
+++ b/src/net/tools/dump_cache/url_to_filename_encoder.cc
@@ -0,0 +1,292 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <stdlib.h>
+
+#include "base/logging.h"
+#include "base/string_util.h"
+#include "net/base/net_util.h"
+#include "net/tools/dump_cache/url_to_filename_encoder.h"
+
+using std::string;
+
+namespace {
+
+// Reports whether |buf| starts with |num_digits| hexadecimal digits.
+// Returns 1 when every one of those characters is a hex digit, 0 otherwise.
+// The scan never runs past a terminating '\0', because '\0' fails the check.
+int HexDigitsPrefix(const char* buf, int num_digits) {
+  const char* cursor = buf;
+  for (int remaining = num_digits; remaining > 0; --remaining, ++cursor) {
+    if (!IsHexDigit(*cursor)) return 0;  // Non-hex character, or end of string.
+  }
+  return 1;
+}
+
+#if defined(WIN32) || defined(__LB_XB1__) || defined(__LB_XB360__)
+#define strtoull _strtoui64
+#endif
+
+// Reads the hexadecimal number at the start of |str| with strtoull.
+// Returns that number, or |deflt| when strtoull consumed no characters
+// (its end pointer is then still equal to |str|).
+uint64 ParseLeadingHex64Value(const char *str, uint64 deflt) {
+  char* parse_end = NULL;
+  const uint64 parsed = strtoull(str, &parse_end, 16);
+  return (parse_end != str) ? parsed : deflt;
+}
+
+}
+
+namespace net {
+
+// kEscapeChar introduces every escape sequence; all code and tests in this
+// directory are based off of it, and our testdata has tons of dependencies on
+// it, so it cannot be changed without re-running those tests and fixing them.
+// kEscapeChar followed by kTruncationChar marks where a long segment was split.
+const char UrlToFilenameEncoder::kEscapeChar = ',';
+const char UrlToFilenameEncoder::kTruncationChar = '-';
+const size_t UrlToFilenameEncoder::kMaximumSubdirectoryLength = 128;
+
+void UrlToFilenameEncoder::AppendSegment(string* segment, string* dest) {
+  CHECK(!segment->empty());
+  // "." and ".." are reserved directory names, so they are written with a
+  // leading escape character and consumed whole.
+  const bool is_dot_segment = (*segment == ".") || (*segment == "..");
+  if (is_dot_segment)
+    dest->append(1, kEscapeChar);
+
+  // Anything that fits in one subdirectory name is emitted unchanged.
+  if (is_dot_segment || segment->size() <= kMaximumSubdirectoryLength) {
+    dest->append(*segment);
+    segment->clear();
+    return;
+  }
+
+  // The segment is too long. ",-" is written after the chunk to signify the
+  // artificial '/' that follows, so the chunk itself has to give up at least
+  // two characters to make room for that marker.
+  size_t chunk_size = kMaximumSubdirectoryLength - 2;
+  // Never cut through an escape sequence: if the escape character is one of
+  // the last two characters of the chunk, end the chunk just before it.
+  if ((*segment)[chunk_size - 1] == kEscapeChar) {
+    chunk_size -= 1;
+  } else if ((*segment)[chunk_size - 2] == kEscapeChar) {
+    chunk_size -= 2;
+  }
+
+  // E.g. with chunk_size=3 and segment="abcd" this writes "abc,-" and leaves
+  // segment="d" behind for the caller.
+  dest->append(segment->data(), chunk_size);
+  dest->append(1, kEscapeChar);
+  dest->append(1, kTruncationChar);
+  segment->erase(0, chunk_size);
+}
+
+void UrlToFilenameEncoder::EncodeSegment(const string& filename_prefix,
+ const string& escaped_ending,
+ char dir_separator,
+ string* encoded_filename) {
+ string filename_ending = UrlUtilities::Unescape(escaped_ending);
+
+ char encoded[3]; // Holds one plain char, or kEscapeChar + two hex digits.
+ int encoded_len;
+ string segment;
+
+ // TODO(jmarantz): This code would be a bit simpler if we disallowed
+ // Instaweb allowing filename_prefix to not end in "/". We could
+ // then change this routine to just take one input string.
+ size_t start_of_segment = filename_prefix.find_last_of(dir_separator);
+ if (start_of_segment == string::npos) {
+ segment = filename_prefix;
+ } else {
+ segment = filename_prefix.substr(start_of_segment + 1);
+ *encoded_filename = filename_prefix.substr(0, start_of_segment + 1);
+ }
+
+ size_t index = 0;
+ // Special case the first / to avoid adding a leading kEscapeChar.
+ if (!filename_ending.empty() && (filename_ending[0] == dir_separator)) {
+ encoded_filename->append(segment);
+ segment.clear();
+ encoded_filename->append(1, dir_separator);
+ ++index;
+ }
+
+ for (; index < filename_ending.length(); ++index) {
+ unsigned char ch = static_cast<unsigned char>(filename_ending[index]);
+
+ // Note: instead of outputting an empty segment, we let the second slash
+ // be escaped below.
+ if ((ch == dir_separator) && !segment.empty()) {
+ AppendSegment(&segment, encoded_filename);
+ encoded_filename->append(1, dir_separator);
+ segment.clear();
+ } else {
+ // After removing unsafe chars the only safe ones are _.=+- and alphanums.
+ if ((ch == '_') || (ch == '.') || (ch == '=') || (ch == '+') ||
+ (ch == '-') || (('0' <= ch) && (ch <= '9')) ||
+ (('A' <= ch) && (ch <= 'Z')) || (('a' <= ch) && (ch <= 'z'))) {
+ encoded[0] = ch;
+ encoded_len = 1;
+ } else {
+ encoded[0] = kEscapeChar;
+ encoded[1] = ch / 16;
+ encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0';
+ encoded[2] = ch % 16;
+ encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0';
+ encoded_len = 3;
+ }
+ segment.append(encoded, encoded_len);
+
+ // If segment is too big, we must chop it into chunks.
+ if (segment.size() > kMaximumSubdirectoryLength) {
+ AppendSegment(&segment, encoded_filename);
+ encoded_filename->append(1, dir_separator);
+ }
+ }
+ }
+
+ // Append "," to the leaf filename so the leaf can also be a branch, e.g.
+ // allow http://a/b/c and http://a/b/c/d to co-exist as files "/a/b/c," and
+ // "/a/b/c/d". So we will rename the "d" here to "d,". If doing that pushed
+ // us over the 128 char limit, then we will need to append "/" and the
+ // remaining chars.
+ segment += kEscapeChar;
+ AppendSegment(&segment, encoded_filename);
+ if (!segment.empty()) {
+ // The last overflow segment is special, because we appended in
+ // kEscapeChar above. We won't need to check it again for size
+ // or further escaping.
+ encoded_filename->append(1, dir_separator);
+ encoded_filename->append(segment);
+ }
+}
+
+// Appends the decoded URL to |decoded_url|; returns false on a malformed name.
+// Not the exact inverse of EncodeSegment above: it does not handle a prefix.
+bool UrlToFilenameEncoder::Decode(const string& encoded_filename,
+                                  char dir_separator,
+                                  string* decoded_url) {
+  enum State {
+    kStart,       // Copying plain characters.
+    kEscape,      // Just saw kEscapeChar.
+    kFirstDigit,  // Saw kEscapeChar and the first of two hex digits.
+    kTruncate,    // Saw kEscapeChar kTruncationChar; a separator must follow.
+    kEscapeDot    // Saw kEscapeChar and a '.'.
+  };
+  State state = kStart;
+  char hex_buffer[3] = { '\0', '\0', '\0' };  // Two digits and a terminator.
+  for (size_t i = 0; i < encoded_filename.size(); ++i) {
+    char ch = encoded_filename[i];
+    switch (state) {
+      case kStart:
+        if (ch == kEscapeChar) {
+          state = kEscape;
+        } else if (ch == dir_separator) {
+          decoded_url->append(1, '/');  // URLs only use '/' not '\\'
+        } else {
+          decoded_url->append(1, ch);
+        }
+        break;
+      case kEscape:
+        if (HexDigitsPrefix(&ch, 1) == 1) {
+          hex_buffer[0] = ch;
+          state = kFirstDigit;
+        } else if (ch == kTruncationChar) {
+          state = kTruncate;
+        } else if (ch == '.') {
+          decoded_url->append(1, '.');
+          state = kEscapeDot;  // Look for at most one more dot.
+        } else if (ch == dir_separator) {
+          // Consider url "//x".  This was once encoded to "/,/x,".
+          // This code is what skips the first Escape.
+          decoded_url->append(1, '/');  // URLs only use '/' not '\\'
+          state = kStart;
+        } else {
+          return false;
+        }
+        break;
+      case kFirstDigit:
+        if (HexDigitsPrefix(&ch, 1) == 1) {
+          hex_buffer[1] = ch;
+          uint64 hex_value = ParseLeadingHex64Value(hex_buffer, 0);
+          decoded_url->append(1, static_cast<char>(hex_value));
+          state = kStart;
+        } else {
+          return false;
+        }
+        break;
+      case kTruncate:
+        if (ch == dir_separator) {
+          // Skip this separator, it was only put in to break up long
+          // path segments, but is not part of the URL.
+          state = kStart;
+        } else {
+          return false;
+        }
+        break;
+      case kEscapeDot:  // Next is the second '.' of ",.." or a separator.
+        // A separator must still be reported as '/', exactly as in kStart.
+        decoded_url->append(1, (ch == dir_separator) ? '/' : ch);
+        state = kStart;
+        break;
+    }
+  }
+
+  // All legal encoded filenames end in kEscapeChar.
+  return (state == kEscape);
+}
+
+// Escape the given input |path| and chop any individual components
+// of the path which are greater than kMaximumSubdirectoryLength characters
+// into smaller chunks by inserting the native directory separator.
+//
+// This legacy version has several issues with aliasing of different URLs,
+// inability to represent both /a/b/c and /a/b/c/d, and inability to decode
+// the filenames back into URLs.
+//
+// But there is a large body of slurped data which depends on this format,
+// so leave it as the default for spdy_in_mem_edsm_server.
+string UrlToFilenameEncoder::LegacyEscape(const string& path) {
+#ifdef WIN32
+  const char kNativeSlash = '\\';
+#else
+  const char kNativeSlash = '/';
+#endif
+  static const char kHexDigits[] = "0123456789ABCDEF";
+  string output;
+
+  // Note: We also chop paths into medium sized 'chunks', because the
+  // Windows filesystem has trouble with long filenames. |last_slash| is the
+  // index of the last backslash seen in |path| or of the last forced break.
+  size_t last_slash = 0;
+  for (size_t index = 0; index < path.length(); index++) {
+    // Use the unsigned byte value: where char is signed, a byte >= 0x80 is
+    // negative and dividing it by 16 would not produce hex digits at all.
+    const unsigned char ch = static_cast<unsigned char>(path[index]);
+    if (ch == 0x5C)
+      last_slash = index;
+    if ((ch == 0x2D) ||                    // hyphen
+        (ch == 0x5C) || (ch == 0x5F) ||    // backslash, underscore
+        ((0x30 <= ch) && (ch <= 0x39)) ||  // Digits [0-9]
+        ((0x41 <= ch) && (ch <= 0x5A)) ||  // Uppercase [A-Z]
+        ((0x61 <= ch) && (ch <= 0x7A))) {  // Lowercase [a-z]
+      output.append(1, path[index]);
+    } else {
+      // Everything else becomes 'x' followed by two uppercase hex digits.
+      output.append(1, 'x');
+      output.append(1, kHexDigits[ch / 16]);
+      output.append(1, kHexDigits[ch % 16]);
+    }
+    // Force a directory break once the current component passes the limit.
+    if (index - last_slash > kMaximumSubdirectoryLength) {
+      output.append(1, kNativeSlash);
+      last_slash = index;
+    }
+  }
+  return output;
+}
+
+} // namespace net
diff --git a/src/net/tools/dump_cache/url_to_filename_encoder.h b/src/net/tools/dump_cache/url_to_filename_encoder.h
new file mode 100644
index 0000000..b81a854
--- /dev/null
+++ b/src/net/tools/dump_cache/url_to_filename_encoder.h
@@ -0,0 +1,211 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// URL filename encoder goals:
+//
+// 1. Allow URLs with arbitrary path-segment length, generating filenames
+// with a maximum of 128 characters.
+// 2. Provide somewhat human-readable filenames, for an easy debugging flow.
+// 3. Provide reverse-mapping from filenames back to URLs.
+// 4. Be able to distinguish http://x from http://x/ from http://x/index.html.
+// Those can all be different URLs.
+// 5. Be able to represent http://a/b/c and http://a/b/c/d, a pattern seen
+// with Facebook Connect.
+//
+// We need an escape-character for representing characters that are legal
+// in URL paths, but not in filenames, such as '?'.
+//
+// We can pick any legal character as an escape, as long as we escape it too.
+// But as we have a goal of having filenames that humans can correlate with
+// URLs, we should pick one that doesn't show up frequently in URLs. Candidates
+// are ~`!@#$%^&()-=_+{}[],. but we would prefer to avoid characters that are
+// shell escapes or that various build tools use.
+//
+// .#&%-=_+ occur frequently in URLs.
+// <>:"/\|?* are illegal in Windows
+// See http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx
+// ~`!$^&(){}[]'; are special to Unix shells
+// In addition, build tools do not like ^@#%
+//
+// Josh took a quick look at the frequency of some special characters in
+// Sadeesh's slurped directory from Fall 09 and found the following occurrences:
+//
+// ^ 3 build tool doesn't like ^ in testdata filenames
+// @ 10 build tool doesn't like @ in testdata filenames
+// . 1676 too frequent in URLs
+// , 76 THE WINNER
+// # 0 build tool doesn't like it
+// & 487 Prefer to avoid shell escapes
+// % 374 g4 doesn't like it
+// = 579 very frequent in URLs -- leave unmodified
+// - 464 very frequent in URLs -- leave unmodified
+// _ 798 very frequent in URLs -- leave unmodified
+//
+//
+// The escaping algorithm is:
+// 1) Escape all unfriendly symbols as ,XX where XX is the hex code.
+// 2) Add a ',' at the end (We do not allow ',' at end of any directory name,
+// so this assures that e.g. /a and /a/b can coexist in the filesystem).
+// 3) Go through the path segment by segment (where a segment is one directory
+// or leaf in the path) and
+// 3a) If the segment is empty, escape the second slash. i.e. if it was
+// www.foo.com//a then we escape the second / like www.foo.com/,2Fa,
+// 3b) If it is "." or ".." prepend with ',' (so that we have a non-
+// empty and non-reserved filename).
+// 3c) If it is over 128 characters, break it up into smaller segments by
+// inserting ,-/ (Windows limits paths to 128 chars, other OSes also
+// have limits that would restrict us)
+//
+// For example:
+// URL File
+// / /,
+// /index.html /index.html,
+// /. /.,
+// /a/b /a/b,
+// /a/b/ /a/b/,
+// /a/b/c /a/b/c, Note: no prefix problem
+// /u?foo=bar /u,3Ffoo=bar,
+// // /,2F,
+// /./ /,./,
+// /../ /,../,
+// /, /,2C,
+// /,./ /,2C./,
+// /very...longname/ /very...long,-/name If very...long is about 126 long.
+
+// NOTE: we avoid using some classes here (like FilePath and GURL) because we
+// share this code with other projects externally.
+
+#ifndef NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_
+#define NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_
+
+#include <string>
+
+#include "base/string_util.h"
+#include "net/tools/dump_cache/url_utilities.h"
+
+namespace net {
+
+// Helper class for converting a URL into a filename.
+class UrlToFilenameEncoder {
+ public:
+ // Given a |url| and a |base_path|, returns a filename which represents this
+ // |url|. |base_path| is prepended without escaping. |url| may include URL
+ // escaping such as %21 for '!'. |legacy_escape| selects the old-style
+ // encoding (LegacyEscape) instead of EncodeSegment.
+ // TODO(mbelshe): delete the legacy_escape code.
+ static std::string Encode(const std::string& url, std::string base_path,
+ bool legacy_escape) {
+ std::string filename;
+ if (!legacy_escape) {
+ std::string url_no_scheme = UrlUtilities::GetUrlHostPath(url);
+ EncodeSegment(base_path, url_no_scheme, '/', &filename);
+#ifdef WIN32
+ ReplaceAll(&filename, "/", "\\");
+#endif
+ } else {
+ std::string clean_url(url);
+ if (clean_url.length() && clean_url[clean_url.length()-1] == '/')
+ clean_url.append("index.html"); // A URL ending in '/' maps to index.html.
+
+ std::string host = UrlUtilities::GetUrlHost(clean_url);
+ filename.append(base_path);
+ filename.append(host);
+#ifdef WIN32
+ filename.append("\\");
+#else
+ filename.append("/");
+#endif
+
+ std::string url_filename = UrlUtilities::GetUrlPath(clean_url);
+ // Strip the leading '/'.
+ if (url_filename[0] == '/')
+ url_filename = url_filename.substr(1);
+
+ // Replace '/' with '\'.
+ ConvertToSlashes(&url_filename);
+
+ // Escape double back-slashes ("\\\\") as "%5C%5C".
+ StripDoubleSlashes(&url_filename);
+
+ // Save path as filesystem-safe characters.
+ url_filename = LegacyEscape(url_filename);
+ filename.append(url_filename);
+
+#ifndef WIN32
+ // Last step - convert to native slashes.
+ const std::string slash("/");
+ const std::string backslash("\\");
+ ReplaceAll(&filename, backslash, slash);
+#endif
+ }
+
+ return filename;
+ }
+
+ // Encode a URL into a filename in a form that the SPDY in-memory server
+ // can read.
+ // |filename_prefix| is prepended without escaping.
+ // |escaped_ending| is the URL to be encoded into a filename. It may have URL
+ // escaped characters (like %21 for !).
+ // |dir_separator| is "/" on Unix, "\" on Windows.
+ // |encoded_filename| is the resultant filename.
+ static void EncodeSegment(
+ const std::string& filename_prefix,
+ const std::string& escaped_ending,
+ char dir_separator,
+ std::string* encoded_filename);
+
+ // Decodes a filename that was encoded with EncodeSegment,
+ // yielding back the original URL.
+ static bool Decode(const std::string& encoded_filename,
+ char dir_separator,
+ std::string* decoded_url);
+
+ static const char kEscapeChar;
+ static const char kTruncationChar;
+ static const size_t kMaximumSubdirectoryLength;
+
+ friend class UrlToFilenameEncoderTest;
+
+ private:
+ // Appends a segment of the path, special-casing "." and "..", and
+ // ensuring that the segment does not exceed the path length. If it does,
+ // it chops the end off the segment, writes the segment with a separator of
+ // ",-/", and then rewrites segment to contain just the truncated piece so
+ // it can be used in the next iteration.
+ // |segment| is a read/write parameter containing segment to write
+ // Note: this should not be called with empty segment.
+ static void AppendSegment(std::string* segment, std::string* dest);
+
+ // Allow reading of old slurped files.
+ static std::string LegacyEscape(const std::string& path);
+
+ // Replace all instances of |from| within |str| with |to|. An empty |from|
+ // is a no-op, and text inserted by a replacement is never scanned again.
+ static void ReplaceAll(std::string* str, const std::string& from,
+                        const std::string& to) {
+   for (size_t pos = 0; !from.empty() &&
+        (pos = str->find(from, pos)) != std::string::npos; pos += to.size()) {
+     str->replace(pos, from.size(), to);
+   }
+ }
+
+ // Rewrites |path| in place, turning every forward slash ("/") into a
+ // backslash. Despite the name, no forward slash is left in |path|
+ // afterwards.
+ static void ConvertToSlashes(std::string* path) {
+   ReplaceAll(path, std::string("/"), std::string("\\"));
+ }
+
+ // Rewrites |path| in place, replacing each pair of consecutive backslashes
+ // with the text "%5C%5C". Nothing is removed: the pair is escaped, not
+ // stripped.
+ static void StripDoubleSlashes(std::string* path) {
+   ReplaceAll(path, std::string("\\\\"), std::string("%5C%5C"));
+ }
+};
+
+} // namespace net
+
+#endif // NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_
diff --git a/src/net/tools/dump_cache/url_to_filename_encoder_unittest.cc b/src/net/tools/dump_cache/url_to_filename_encoder_unittest.cc
new file mode 100644
index 0000000..2e09e0b
--- /dev/null
+++ b/src/net/tools/dump_cache/url_to_filename_encoder_unittest.cc
@@ -0,0 +1,341 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "net/tools/dump_cache/url_to_filename_encoder.h"
+
+#include <string>
+#include <vector>
+
+#include "base/string_piece.h"
+#include "base/string_util.h"
+#include "base/stringprintf.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+using base::StringPiece;
+using std::string;
+
+namespace net {
+
+#ifdef WIN32
+const char kDirSeparator = '\\';  // Native path separator.
+const char kOtherDirSeparator = '/';  // Must not appear in encoded names.
+#else
+const char kDirSeparator = '/';  // Native path separator.
+const char kOtherDirSeparator = '\\';  // Must not appear in encoded names.
+#endif
+
+class UrlToFilenameEncoderTest : public ::testing::Test {
+ protected:
+  UrlToFilenameEncoderTest() : escape_(1, UrlToFilenameEncoder::kEscapeChar),
+                               dir_sep_(1, kDirSeparator) {
+  }
+  // Expects every '/'-separated component to fit in one subdirectory name.
+  void CheckSegmentLength(const StringPiece& escaped_word) {
+    std::vector<StringPiece> components;
+    Tokenize(escaped_word, StringPiece("/"), &components);
+    for (size_t i = 0; i < components.size(); ++i) {
+      EXPECT_GE(UrlToFilenameEncoder::kMaximumSubdirectoryLength,
+                components[i].size());
+    }
+  }
+  // Expects |escaped_word| to contain only filename-safe characters.
+  void CheckValidChars(const StringPiece& escaped_word, char invalid_slash) {
+    // These characters are invalid in Windows.  We add in ', as that's pretty
+    // inconvenient in a Unix filename.
+    //
+    // See http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx
+    const string kInvalidChars = "<>:\"|?*'";
+    for (size_t i = 0; i < escaped_word.size(); ++i) {
+      char c = escaped_word[i];
+      EXPECT_EQ(string::npos, kInvalidChars.find(c));
+      EXPECT_NE(invalid_slash, c);
+      EXPECT_NE('\0', c);  // only invalid character in Posix
+      EXPECT_GT(0x7E, c);  // only English printable characters
+    }
+  }
+  // Encodes |in_word|, compares with |gold_word|, then decodes it back.
+  void Validate(const string& in_word, const string& gold_word) {
+    string escaped_word, url;
+    UrlToFilenameEncoder::EncodeSegment("", in_word, '/', &escaped_word);
+    EXPECT_EQ(gold_word, escaped_word);
+    CheckSegmentLength(escaped_word);
+    CheckValidChars(escaped_word, '\\');
+    EXPECT_TRUE(UrlToFilenameEncoder::Decode(escaped_word, '/', &url));
+    EXPECT_EQ(in_word, url);
+  }
+  // Like Validate(), but without comparing against a gold encoding.
+  void ValidateAllSegmentsSmall(const string& in_word) {
+    string escaped_word, url;
+    UrlToFilenameEncoder::EncodeSegment("", in_word, '/', &escaped_word);
+    CheckSegmentLength(escaped_word);
+    CheckValidChars(escaped_word, '\\');
+    EXPECT_TRUE(UrlToFilenameEncoder::Decode(escaped_word, '/', &url));
+    EXPECT_EQ(in_word, url);
+  }
+  // Expects |word| to be encoded as itself plus the trailing escape char.
+  void ValidateNoChange(const string& word) {
+    // The encoder always suffixes its output with kEscapeChar.
+    Validate(word, word + escape_);
+  }
+  // Expects |ch| to be encoded as kEscapeChar plus its two-digit hex code.
+  void ValidateEscaped(unsigned char ch) {
+    // The encoder always suffixes its output with kEscapeChar.
+    char escaped[100];
+    const char escape = UrlToFilenameEncoder::kEscapeChar;
+    base::snprintf(escaped, sizeof(escaped), "%c%02X%c", escape, ch, escape);
+    Validate(string(1, ch), escaped);
+  }
+  // Encodes |url| and compares with |gold_filename|; new-style also decodes.
+  void ValidateUrl(const string& url, const string& base_path,
+                   bool legacy_escape, const string& gold_filename) {
+    string encoded_filename = UrlToFilenameEncoder::Encode(
+        url, base_path, legacy_escape);
+    EXPECT_EQ(gold_filename, encoded_filename);
+    if (!legacy_escape) {
+      CheckSegmentLength(encoded_filename);
+      CheckValidChars(encoded_filename, kOtherDirSeparator);
+      string decoded_url;
+      EXPECT_TRUE(UrlToFilenameEncoder::Decode(encoded_filename, kDirSeparator,
+                                               &decoded_url));
+      if (url != decoded_url) {
+        EXPECT_EQ(url, "http://" + decoded_url);
+      }
+    }
+  }
+  // Checks |url| against both the legacy and the new gold filename.
+  void ValidateUrlOldNew(const string& url, const string& gold_old_filename,
+                         const string& gold_new_filename) {
+    ValidateUrl(url, "", true, gold_old_filename);
+    ValidateUrl(url, "", false, gold_new_filename);
+  }
+  // Expects |url1| and |url2| to map to the same new-style filename.
+  void ValidateEncodeSame(const string& url1, const string& url2) {
+    string filename1 = UrlToFilenameEncoder::Encode(url1, "", false);
+    string filename2 = UrlToFilenameEncoder::Encode(url2, "", false);
+    EXPECT_EQ(filename1, filename2);
+  }
+
+  string escape_;
+  string dir_sep_;
+};
+
+TEST_F(UrlToFilenameEncoderTest, DoesNotEscape) {
+ ValidateNoChange(""); // Only the trailing escape char is added.
+ ValidateNoChange("abcdefg");
+ ValidateNoChange("abcdefghijklmnopqrstuvwxyz");
+ ValidateNoChange("ZYXWVUT");
+ ValidateNoChange("ZYXWVUTSRQPONMLKJIHGFEDCBA");
+ ValidateNoChange("01234567689");
+ ValidateNoChange("_.=+-"); // The punctuation EncodeSegment treats as safe.
+ ValidateNoChange("abcdefghijklmnopqrstuvwxyzZYXWVUTSRQPONMLKJIHGFEDCBA"
+ "01234567689_.=+-");
+ ValidateNoChange("index.html");
+ ValidateNoChange("/");
+ ValidateNoChange("/.");
+ ValidateNoChange(".");
+ ValidateNoChange("..");
+}
+
+TEST_F(UrlToFilenameEncoderTest, Escapes) {
+ const string bad_chars =
+ "<>:\"\\|?*" // Illegal on Windows
+ "~`!$^&(){}[]';" // Bad for Unix shells
+ "^@" // Build tool doesn't like ('^' is also listed above)
+ "#%" // Tool doesn't like
+ ","; // The escape char has to be escaped
+
+ for (size_t i = 0; i < bad_chars.size(); ++i) {
+ ValidateEscaped(bad_chars[i]);
+ }
+
+ // Check non-printable characters: NUL, DEL and every byte above 0x7F.
+ ValidateEscaped('\0');
+ for (size_t i = 127; i < 256; ++i) {
+ ValidateEscaped(static_cast<char>(i));
+ }
+}
+
+TEST_F(UrlToFilenameEncoderTest, DoesEscapeCorrectly) {
+  Validate("mysite.com&x", "mysite.com" + escape_ + "26x" + escape_);
+  Validate("/./", "/" + escape_ + "./" + escape_);
+  Validate("/../", "/" + escape_ + "../" + escape_);
+  Validate("//", "/" + escape_ + "2F" + escape_);
+  Validate("/./leaf", "/" + escape_ + "./leaf" + escape_);
+  Validate("/../leaf", "/" + escape_ + "../leaf" + escape_);
+  Validate("//leaf", "/" + escape_ + "2Fleaf" + escape_);
+  Validate("mysite/u?param1=x&param2=y",  // '?' and '&' become ,3F and ,26.
+           "mysite/u" + escape_ + "3Fparam1=x" + escape_ + "26param2=y" +
+           escape_);
+  Validate("search?q=dogs&go=&form=QBLH&qs=n",  // from Latency Labs bing test.
+           "search" + escape_ + "3Fq=dogs" + escape_ + "26go=" + escape_ +
+           "26form=QBLH" + escape_ + "26qs=n" + escape_);
+  Validate("~joebob/my_neeto-website+with_stuff.asp?id=138&content=true",
+           "" + escape_ + "7Ejoebob/my_neeto-website+with_stuff.asp" + escape_ +
+           "3Fid=138" + escape_ + "26content=true" + escape_);
+}
+
+TEST_F(UrlToFilenameEncoderTest, EncodeUrlCorrectly) {
+ ValidateUrlOldNew("http://www.google.com/index.html",
+ "www.google.com" + dir_sep_ + "indexx2Ehtml", // Legacy format.
+ "www.google.com" + dir_sep_ + "index.html" + escape_); // New format.
+ ValidateUrlOldNew("http://www.google.com/x/search?hl=en&q=dogs&oq=",
+ "www.google.com" + dir_sep_ + "x" + dir_sep_ +
+ "searchx3Fhlx3Denx26qx3Ddogsx26oqx3D",
+
+ "www.google.com" + dir_sep_ + "x" + dir_sep_ + "search" +
+ escape_ + "3Fhl=en" + escape_ + "26q=dogs" + escape_ +
+ "26oq=" + escape_);
+ ValidateUrlOldNew("http://www.foo.com/a//",
+ "www.foo.com" + dir_sep_ + "ax255Cx255Cindexx2Ehtml",
+ "www.foo.com" + dir_sep_ + "a" + dir_sep_ + escape_ + "2F" +
+ escape_);
+
+ // From bug: Double slash preserved.
+ ValidateUrl("http://www.foo.com/u?site=http://www.google.com/index.html",
+ "", false,
+ "www.foo.com" + dir_sep_ + "u" + escape_ + "3Fsite=http" +
+ escape_ + "3A" + dir_sep_ + escape_ + "2Fwww.google.com" +
+ dir_sep_ + "index.html" + escape_);
+ ValidateUrlOldNew(
+ "http://blogutils.net/olct/online.php?"
+ "site=http://thelwordfanfics.blogspot.&interval=600",
+
+ "blogutils.net" + dir_sep_ + "olct" + dir_sep_ + "onlinex2Ephpx3F"
+ "sitex3Dhttpx3Ax255Cx255Cthelwordfanficsx2Eblogspotx2Ex26intervalx3D600",
+
+ "blogutils.net" + dir_sep_ + "olct" + dir_sep_ + "online.php" + escape_ +
+ "3Fsite=http" + escape_ + "3A" + dir_sep_ + escape_ +
+ "2Fthelwordfanfics.blogspot." + escape_ + "26interval=600" + escape_);
+}
+
+// From bug: Escapes treated the same as normal char.
+TEST_F(UrlToFilenameEncoderTest, UnescapeUrlsBeforeEncode) {
+ for (int i = 0; i < 128; ++i) {
+ string unescaped(1, static_cast<char>(i));
+ string escaped = base::StringPrintf("%%%02X", i); // e.g. "%41" for 'A'.
+ ValidateEncodeSame(unescaped, escaped);
+ }
+
+ ValidateEncodeSame(
+ "http://www.blogger.com/navbar.g?bName=God!&Mode=FOO&searchRoot"
+ "=http%3A%2F%2Fsurvivorscanthrive.blogspot.com%2Fsearch",
+
+ "http://www.blogger.com/navbar.g?bName=God%21&Mode=FOO&searchRoot"
+ "=http%3A%2F%2Fsurvivorscanthrive.blogspot.com%2Fsearch");
+}
+
+// From bug: Filename encoding is not prefix-free.
+TEST_F(UrlToFilenameEncoderTest, EscapeSecondSlash) {
+ Validate("/", "/" + escape_);
+ Validate("//", "/" + escape_ + "2F" + escape_); // Second '/' is escaped.
+ Validate("///", "/" + escape_ + "2F" + "/" + escape_);
+}
+
+TEST_F(UrlToFilenameEncoderTest, LongTail) {
+ static char long_word[] =
+ "~joebob/briggs/12345678901234567890123456789012345678901234567890"
+ "1234567890123456789012345678901234567890123456789012345678901234567890"
+ "1234567890123456789012345678901234567890123456789012345678901234567890"
+ "1234567890123456789012345678901234567890123456789012345678901234567890"
+ "1234567890123456789012345678901234567890123456789012345678901234567890"
+ "1234567890123456789012345678901234567890123456789012345678901234567890";
+
+ // The long lines in the string below are 64 characters, so we can see
+ // the slashes every 128.
+ string gold_long_word =
+ escape_ + "7Ejoebob/briggs/"
+ "1234567890123456789012345678901234567890123456789012345678901234"
+ "56789012345678901234567890123456789012345678901234567890123456" +
+ escape_ + "-/"
+ "7890123456789012345678901234567890123456789012345678901234567890"
+ "12345678901234567890123456789012345678901234567890123456789012" +
+ escape_ + "-/"
+ "3456789012345678901234567890123456789012345678901234567890123456"
+ "78901234567890123456789012345678901234567890123456789012345678" +
+ escape_ + "-/"
+ "9012345678901234567890" + escape_;
+ EXPECT_LT(UrlToFilenameEncoder::kMaximumSubdirectoryLength,
+ sizeof(long_word)); // The input must be longer than one segment.
+ Validate(long_word, gold_long_word);
+}
+
+TEST_F(UrlToFilenameEncoderTest, LongTailQuestion) {
+ // Here the '?' in the last path segment expands to escape_ + "3F", making
+ // it hit 128 chars before the input segment gets that big.
+ static char long_word[] =
+ "~joebob/briggs/1234567?1234567?1234567?1234567?1234567?"
+ "1234567?1234567?1234567?1234567?1234567?1234567?1234567?"
+ "1234567?1234567?1234567?1234567?1234567?1234567?1234567?"
+ "1234567?1234567?1234567?1234567?1234567?1234567?1234567?"
+ "1234567?1234567?1234567?1234567?1234567?1234567?1234567?"
+ "1234567?1234567?1234567?1234567?1234567?1234567?1234567?";
+
+ // Notice that at the end of the third segment, we avoid splitting
+ // the (escape_ + "3F") that was generated from the "?", so that segment is
+ // only 127 characters.
+ string pattern = "1234567" + escape_ + "3F"; // 10 characters
+ string gold_long_word =
+ escape_ + "7Ejoebob/briggs/" +
+ pattern + pattern + pattern + pattern + pattern + pattern + "1234"
+ "567" + escape_ + "3F" + pattern + pattern + pattern + pattern + pattern +
+ "123456" + escape_ + "-/"
+ "7" + escape_ + "3F" + pattern + pattern + pattern + pattern + pattern +
+ pattern + pattern + pattern + pattern + pattern + pattern + pattern +
+ "12" +
+ escape_ + "-/"
+ "34567" + escape_ + "3F" + pattern + pattern + pattern + pattern + pattern
+ + "1234567" + escape_ + "3F" + pattern + pattern + pattern + pattern
+ + pattern + "1234567" +
+ escape_ + "-/" +
+ escape_ + "3F" + pattern + pattern + escape_;
+ EXPECT_LT(UrlToFilenameEncoder::kMaximumSubdirectoryLength,
+ sizeof(long_word));
+ Validate(long_word, gold_long_word);
+}
+
+TEST_F(UrlToFilenameEncoderTest, CornerCasesNearMaxLenNoEscape) {
+ // Hit corner cases, +/- 4 characters from kMaximumSubdirectoryLength.
+ for (int i = -4; i <= 4; ++i) {
+ string input;
+ input.append(i + UrlToFilenameEncoder::kMaximumSubdirectoryLength, 'x');
+ ValidateAllSegmentsSmall(input);
+ }
+}
+
+TEST_F(UrlToFilenameEncoderTest, CornerCasesNearMaxLenWithEscape) {
+ // Hit corner cases, +/- 4 characters from kMaximumSubdirectoryLength. This
+ // time we leave off the last 'x' and put in a '.', intended to check that we
+ // are truncating with '/' *after* the expansion.
+ for (int i = -4; i <= 4; ++i) {
+ string input;
+ input.append(i + UrlToFilenameEncoder::kMaximumSubdirectoryLength - 1, 'x');
+ input.append(1, '.'); // NOTE(review): EncodeSegment leaves '.' unescaped.
+ ValidateAllSegmentsSmall(input);
+ }
+}
+
+TEST_F(UrlToFilenameEncoderTest, LeafBranchAlias) {
+ Validate("/a/b/c", "/a/b/c" + escape_); // c is leaf file "c,"
+ Validate("/a/b/c/d", "/a/b/c/d" + escape_); // c is directory "c"
+ Validate("/a/b/c/d/", "/a/b/c/d/" + escape_); // d is directory "d"
+}
+
+
+TEST_F(UrlToFilenameEncoderTest, BackslashSeparator) {
+ string long_word;
+ string escaped_word;
+ long_word.append(UrlToFilenameEncoder::kMaximumSubdirectoryLength + 1, 'x');
+ UrlToFilenameEncoder::EncodeSegment("", long_word, '\\', &escaped_word);
+
+ // Check that one backslash, plus the escape ",-", and the ending , got added.
+ EXPECT_EQ(long_word.size() + 4, escaped_word.size());
+ ASSERT_LT(UrlToFilenameEncoder::kMaximumSubdirectoryLength,
+ escaped_word.size());
+ // Check that the backslash got inserted at the correct spot.
+ EXPECT_EQ('\\', escaped_word[
+ UrlToFilenameEncoder::kMaximumSubdirectoryLength]);
+}
+
+} // namespace net
+
diff --git a/src/net/tools/dump_cache/url_utilities.cc b/src/net/tools/dump_cache/url_utilities.cc
new file mode 100644
index 0000000..fe64bd9
--- /dev/null
+++ b/src/net/tools/dump_cache/url_utilities.cc
@@ -0,0 +1,126 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "net/tools/dump_cache/url_utilities.h"
+
+#include "base/logging.h"
+#include "base/string_number_conversions.h"
+#include "base/string_util.h"
+
+namespace net {
+
+// Returns the host portion of |url| with any port number removed.
+//
+// The host starts just after the first "//" in |url|, or at the beginning of
+// |url| when it contains no "//". It extends up to, but does not include, the
+// first ':' or '/' that follows, or to the end of |url| if neither appears.
+std::string UrlUtilities::GetUrlHost(const std::string& url) {
+  // Skip past the first "//", if there is one.
+  const size_t marker = url.find("//");
+  size_t host_begin = 0;
+  if (marker != std::string::npos)
+    host_begin = marker + 2;
+
+  // Either delimiter terminates the host, so search for whichever of the two
+  // comes first: ':' introduces a port and '/' introduces the path.
+  size_t host_end = url.find_first_of("/:", host_begin);
+
+  // Neither delimiter present: the host runs to the end of the string.
+  if (host_end == std::string::npos)
+    host_end = url.size();
+
+  return url.substr(host_begin, host_end - host_begin);
+}
+
+// Returns everything in |url| that follows the first "//" (host, port, path
+// and beyond), or a copy of the whole |url| when it contains no "//".
+std::string UrlUtilities::GetUrlHostPath(const std::string& url) {
+  const size_t marker = url.find("//");
+  if (marker == std::string::npos)
+    return url;
+  return url.substr(marker + 2);
+}
+
+// Returns the path of |url|: the text from the first '/' found after the
+// first "//" (if any) up to, but excluding, the first '#' beyond it, so a
+// "?query" is kept. Returns "/" when |url| has no such '/'.
+std::string UrlUtilities::GetUrlPath(const std::string& url) {
+  const size_t marker = url.find("//");
+  const size_t host_begin = (marker == std::string::npos) ? 0 : marker + 2;
+
+  const size_t path_begin = url.find('/', host_begin);
+  if (path_begin == std::string::npos)
+    return "/";
+
+  // substr() clamps the count, so a missing '#' (npos) means "to the end".
+  const size_t fragment = url.find('#', path_begin + 1);
+  return url.substr(path_begin, fragment - path_begin);
+}
+
+namespace {
+
+// States of the character-by-character parser in UrlUtilities::Unescape.
+enum UnescapeState {
+ NORMAL, // Not inside an escape; ordinary characters are copied through.
+ ESCAPE1, // Just consumed '%'; the first hex digit is expected next.
+ ESCAPE2 // Just consumed "%X" for a hex digit X; the second is expected.
+};
+
+} // namespace
+
+// Returns |escaped_url| with each "%XX" escape (XX being two hex digits)
+// replaced by the char of value 0xXX; all other text is copied unchanged.
+// A '%' not followed by two hex digits is not an escape: it is emitted
+// as-is and the characters after it are then processed normally.
+std::string UrlUtilities::Unescape(const std::string& escaped_url) {
+  std::string unescaped_url;
+  std::string pending_digits;  // Hex digits read since the last '%'.
+  UnescapeState state = NORMAL;
+  size_t pos = 0;
+  while (pos < escaped_url.size()) {
+    const char c = escaped_url[pos];
+    if (state == NORMAL) {
+      if (c == '%') {
+        pending_digits.clear();
+        state = ESCAPE1;
+      } else {
+        unescaped_url.push_back(c);
+      }
+      ++pos;
+      continue;
+    }
+
+    if (!IsHexDigit(c)) {
+      // Not an escape after all. Emit what was held back and leave |pos|
+      // alone so that |c| is looked at again as an ordinary character.
+      unescaped_url.push_back('%');
+      unescaped_url.append(pending_digits);
+      state = NORMAL;
+      continue;
+    }
+
+    pending_digits.push_back(c);
+    ++pos;
+    if (state == ESCAPE1) {
+      state = ESCAPE2;
+      continue;
+    }
+    // That was the second digit, so the escape is complete: decode it.
+    int escape_value = 0;
+    bool ok = base::HexStringToInt(pending_digits, &escape_value);
+    DCHECK(ok);
+    unescaped_url.push_back(static_cast<unsigned char>(escape_value));
+    state = NORMAL;
+  }
+
+  // The input ended inside an escape; pass the unfinished part through.
+  if (state != NORMAL) {
+    unescaped_url.push_back('%');
+    unescaped_url.append(pending_digits);
+  }
+  return unescaped_url;
+}
+
+} // namespace net
+
diff --git a/src/net/tools/dump_cache/url_utilities.h b/src/net/tools/dump_cache/url_utilities.h
new file mode 100644
index 0000000..c9d8ea5
--- /dev/null
+++ b/src/net/tools/dump_cache/url_utilities.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef NET_TOOLS_DUMP_CACHE_URL_UTILITIES_H_
+#define NET_TOOLS_DUMP_CACHE_URL_UTILITIES_H_
+
+#include <string>
+
+namespace net {
+
+struct UrlUtilities {
+ // Gets the host from a url, stripping the port number as well if the url
+ // has one.
+ // e.g. GetUrlHost("www.foo.com:8080/boo") returns "www.foo.com"
+ static std::string GetUrlHost(const std::string& url);
+
+ // Gets the host + path portion of a url: everything after the first "//".
+ // e.g http://www.foo.com/path
+ // returns www.foo.com/path
+ static std::string GetUrlHostPath(const std::string& url);
+
+ // Gets the path portion of a url, or "/" if the url has no path.
+ // e.g http://www.foo.com/path
+ // returns /path
+ static std::string GetUrlPath(const std::string& url);
+
+ // Unescapes a url, converting every %XX to the actual char 0xXX.
+ // For example, this will convert "foo%21bar" to "foo!bar".
+ static std::string Unescape(const std::string& escaped_url);
+};
+
+} // namespace net
+
+#endif // NET_TOOLS_DUMP_CACHE_URL_UTILITIES_H_
diff --git a/src/net/tools/dump_cache/url_utilities_unittest.cc b/src/net/tools/dump_cache/url_utilities_unittest.cc
new file mode 100644
index 0000000..0f9cb06
--- /dev/null
+++ b/src/net/tools/dump_cache/url_utilities_unittest.cc
@@ -0,0 +1,114 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "net/tools/dump_cache/url_utilities.h"
+
+#include <string>
+
+#include "base/string_util.h"
+#include "base/stringprintf.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace net {
+
+TEST(UrlUtilitiesTest, GetUrlHost) {
+  // Scheme, port, path and query are each optional and must never leak into
+  // the host. A ':' that only occurs after the path began is not a port.
+  static const char* const kUrls[] = {
+    "http://www.foo.com",
+    "http://www.foo.com:80",
+    "http://www.foo.com:80/",
+    "http://www.foo.com/news",
+    "www.foo.com:80/news?q=hello",
+    "www.foo.com/news?q=a:b",
+    "www.foo.com:80",
+  };
+
+  for (size_t i = 0; i < sizeof(kUrls) / sizeof(kUrls[0]); ++i)
+    EXPECT_EQ("www.foo.com", UrlUtilities::GetUrlHost(kUrls[i])) << kUrls[i];
+}
+
+TEST(UrlUtilitiesTest, GetUrlHostPath) {
+  // Only the scheme and its "//" are removed; the port, path and query all
+  // stay. Each entry is {url, expected result}.
+  static const char* const kCases[][2] = {
+    {"http://www.foo.com", "www.foo.com"},
+    {"http://www.foo.com:80", "www.foo.com:80"},
+    {"http://www.foo.com:80/", "www.foo.com:80/"},
+    {"http://www.foo.com/news", "www.foo.com/news"},
+    {"www.foo.com:80/news?q=hello", "www.foo.com:80/news?q=hello"},
+    {"www.foo.com/news?q=a:b", "www.foo.com/news?q=a:b"},
+    {"www.foo.com:80", "www.foo.com:80"},
+  };
+  for (size_t i = 0; i < sizeof(kCases) / sizeof(kCases[0]); ++i)
+    EXPECT_EQ(kCases[i][1], UrlUtilities::GetUrlHostPath(kCases[i][0]))
+        << kCases[i][0];
+}
+
+TEST(UrlUtilitiesTest, GetUrlPath) {
+  // A url without a path yields "/"; otherwise the path keeps its query.
+  // Each entry is {url, expected result}.
+  static const char* const kCases[][2] = {
+    {"http://www.foo.com", "/"},
+    {"http://www.foo.com:80", "/"},
+    {"http://www.foo.com:80/", "/"},
+    {"http://www.foo.com/news", "/news"},
+    {"www.foo.com:80/news?q=hello", "/news?q=hello"},
+    {"www.foo.com/news?q=a:b", "/news?q=a:b"},
+    {"www.foo.com:80", "/"},
+  };
+  for (size_t i = 0; i < sizeof(kCases) / sizeof(kCases[0]); ++i)
+    EXPECT_EQ(kCases[i][1], UrlUtilities::GetUrlPath(kCases[i][0]))
+        << kCases[i][0];
+}
+
+TEST(UrlUtilitiesTest, Unescape) {
+  // Text holding no "%XX" escape must come back untouched: plain URLs,
+  // punctuation other than '%', and any '%' not followed by two hex digits.
+  static const char* const kUnchanged[] = {
+    "http://www.foo.com",
+    "www.foo.com:80/news?q=hello",
+    "~`!@#$^&*()_-+={[}]|\\:;\"'<,>.?/",
+    "%", "%www", "%foo",
+    "%1", "%1x", "%%",
+  };
+  for (size_t i = 0; i < sizeof(kUnchanged) / sizeof(kUnchanged[0]); ++i)
+    EXPECT_EQ(kUnchanged[i], UrlUtilities::Unescape(kUnchanged[i]));
+
+  // Each entry is {escaped input, expected output}. The last two begin with
+  // a '%' that is not an escape and continue with one that is.
+  static const char* const kDecoded[][2] = {
+    {"%7E%60%21%40%23%24%25%5E%26%2A%28%29%5F%2D"
+     "%2B%3D%7B%5B%7D%5D%7C%5C%3A%3B%22%27%3C%2C"
+     "%3E%2E%3F%2F",
+     "~`!@#$%^&*()_-+={[}]|\\:;\"'<,>.?/"},
+    {"Hello%2C world%21", "Hello, world!"},
+    {"%%21", "%!"},
+    {"%2%21", "%2!"},
+  };
+  for (size_t i = 0; i < sizeof(kDecoded) / sizeof(kDecoded[0]); ++i)
+    EXPECT_EQ(kDecoded[i][1], UrlUtilities::Unescape(kDecoded[i][0]));
+
+  // Run through every byte value.
+  for (int c = 0; c < 256; ++c) {
+    const std::string raw_char(1, implicit_cast<unsigned char>(c));
+
+    // On its own, any char other than '%' is left alone.
+    if (c != '%')
+      EXPECT_EQ(raw_char, UrlUtilities::Unescape(raw_char));
+
+    // Its escape decodes back to it, whether spelled with uppercase hex...
+    std::string escaped_char = base::StringPrintf("%%%02X", c);
+    EXPECT_EQ(raw_char, UrlUtilities::Unescape(escaped_char))
+        << "escaped_char = " << escaped_char;
+
+    // ...or with lowercase hex.
+    escaped_char = base::StringPrintf("%%%02x", c);
+    EXPECT_EQ(raw_char, UrlUtilities::Unescape(escaped_char))
+        << "escaped_char = " << escaped_char;
+  }
+}
+
+} // namespace net
+