src/net/tools/dump_cache/url_to_filename_encoder.h - cobalt - Git at Google

 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 // URL filename encoder goals:
 //
 // 1. Allow URLs with arbitrary path-segment length, generating filenames
 //    with a maximum of 128 characters.
 // 2. Provide a somewhat human readable filenames, for easy debugging flow.
 // 3. Provide reverse-mapping from filenames back to URLs.
 // 4. Be able to distinguish http://x from http://x/ from http://x/index.html.
 //    Those can all be different URLs.
 // 5. Be able to represent http://a/b/c and http://a/b/c/d, a pattern seen
 //    with Facebook Connect.
 //
 // We need an escape-character for representing characters that are legal
 // in URL paths, but not in filenames, such as '?'.
 //
 // We can pick any legal character as an escape, as long as we escape it too.
 // But as we have a goal of having filenames that humans can correlate with
 // URLs, we should pick one that doesn't show up frequently in URLs. Candidates
 // are ~`!@#$%^&()-=_+{}[],. but we would prefer to avoid characters that are
 // shell escapes or that various build tools use.
 //
 // .#&%-=_+ occur frequently in URLs.
 // <>:"/\|?* are illegal in Windows
 //   See http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx
 // ~`!$^&(){}[]'; are special to Unix shells
 // In addition, build tools do not like ^@#%
 //
 // Josh took a quick look at the frequency of some special characters in
 // Sadeesh's slurped directory from Fall 09 and found the following occurances:
 //
 //   ^   3               build tool doesn't like ^ in testdata filenames
 //   @   10              build tool doesn't like @ in testdata filenames
 //   .   1676            too frequent in URLs
 //   ,   76              THE WINNER
 //   #   0               build tool doesn't like it
 //   &   487             Prefer to avoid shell escapes
 //   %   374             g4 doesn't like it
 //   =   579             very frequent in URLs -- leave unmodified
 //   -   464             very frequent in URLs -- leave unmodified
 //   _   798             very frequent in URLs -- leave unmodified
 //
 //
 // The escaping algorithm is:
 //  1) Escape all unfriendly symbols as ,XX where XX is the hex code.
 //  2) Add a ',' at the end (We do not allow ',' at end of any directory name,
 //     so this assures that e.g. /a and /a/b can coexist in the filesystem).
 //  3) Go through the path segment by segment (where a segment is one directory
 //     or leaf in the path) and
 //     3a) If the segment is empty, escape the second slash. i.e. if it was
 //         www.foo.com//a then we escape the second / like www.foo.com/,2Fa,
 //     3a) If it is "." or ".." prepend with ',' (so that we have a non-
 //         empty and non-reserved filename).
 //     3b) If it is over 128 characters, break it up into smaller segments by
 //         inserting ,-/ (Windows limits paths to 128 chars, other OSes also
 //         have limits that would restrict us)
 //
 // For example:
 //     URL               File
 //     /                 /,
 //     /index.html       /index.html,
 //     /.                /.,
 //     /a/b              /a/b,
 //     /a/b/             /a/b/,
 //     /a/b/c            /a/b/c,   Note: no prefix problem
 //     /u?foo=bar        /u,3Ffoo=bar,
 //     //                /,2F,
 //     /./               /,./,
 //     /../              /,../,
 //     /,                /,2C,
 //     /,./              /,2C./,
 //     /very...longname/ /very...long,-/name   If very...long is about 126 long.

 // NOTE: we avoid using some classes here (like FilePath and GURL) because we
 //       share this code with other projects externally.

 #ifndef NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_
 #define NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_

 #include <string>

 #include "base/string_util.h"
 #include "net/tools/dump_cache/url_utilities.h"

 namespace net {

 // Helper class for converting a URL into a filename.
 class UrlToFilenameEncoder {
  public:
   // Given a |url| and a |base_path|, returns a filename which represents this
   // |url|. |url| may include URL escaping such as %21 for !
   // |legacy_escape| indicates that this function should use the old-style
   // of encoding.
   // TODO(mbelshe): delete the legacy_escape code.
   static std::string Encode(const std::string& url, std::string base_path,
                             bool legacy_escape) {
     std::string filename;
     if (!legacy_escape) {
       std::string url_no_scheme = UrlUtilities::GetUrlHostPath(url);
       EncodeSegment(base_path, url_no_scheme, '/', &filename);
 #ifdef WIN32
       ReplaceAll(&filename, "/", "\\");
 #endif
     } else {
       std::string clean_url(url);
       if (clean_url.length() && clean_url[clean_url.length()-1] == '/')
         clean_url.append("index.html");

       std::string host = UrlUtilities::GetUrlHost(clean_url);
       filename.append(base_path);
       filename.append(host);
 #ifdef WIN32
       filename.append("\\");
 #else
       filename.append("/");
 #endif

       std::string url_filename = UrlUtilities::GetUrlPath(clean_url);
       // Strip the leading '/'.
       if (url_filename[0] == '/')
         url_filename = url_filename.substr(1);

       // Replace '/' with '\'.
       ConvertToSlashes(&url_filename);

       // Strip double back-slashes ("\\\\").
       StripDoubleSlashes(&url_filename);

       // Save path as filesystem-safe characters.
       url_filename = LegacyEscape(url_filename);
       filename.append(url_filename);

 #ifndef WIN32
       // Last step - convert to native slashes.
       const std::string slash("/");
       const std::string backslash("\\");
       ReplaceAll(&filename, backslash, slash);
 #endif
     }

     return filename;
   }

   // Rewrite HTML in a form that the SPDY in-memory server
   // can read.
   // |filename_prefix| is prepended without escaping.
   // |escaped_ending| is the URL to be encoded into a filename. It may have URL
   // escaped characters (like %21 for !).
   // |dir_separator| is "/" on Unix, "\" on Windows.
   // |encoded_filename| is the resultant filename.
   static void EncodeSegment(
       const std::string& filename_prefix,
       const std::string& escaped_ending,
       char dir_separator,
       std::string* encoded_filename);

   // Decodes a filename that was encoded with EncodeSegment,
   // yielding back the original URL.
   static bool Decode(const std::string& encoded_filename,
                      char dir_separator,
                      std::string* decoded_url);

   static const char kEscapeChar;
   static const char kTruncationChar;
   static const size_t kMaximumSubdirectoryLength;

   friend class UrlToFilenameEncoderTest;

  private:
   // Appends a segment of the path, special-casing "." and "..", and
   // ensuring that the segment does not exceed the path length.  If it does,
   // it chops the end off the segment, writes the segment with a separator of
   // ",-/", and then rewrites segment to contain just the truncated piece so
   // it can be used in the next iteration.
   // |segment| is a read/write parameter containing segment to write
   // Note: this should not be called with empty segment.
   static void AppendSegment(std::string* segment, std::string* dest);

   // Allow reading of old slurped files.
   static std::string LegacyEscape(const std::string& path);

   // Replace all instances of |from| within |str| as |to|.
   static void ReplaceAll(std::string* str, const std::string& from,
                          const std::string& to) {
     std::string::size_type pos(0);
     while ((pos = str->find(from, pos)) != std::string::npos) {
       str->replace(pos, from.size(), to);
       pos += from.size();
     }
   }

   // Replace all instances of "/" with "\" in |path|.
   static void ConvertToSlashes(std::string* path) {
     const std::string slash("/");
     const std::string backslash("\\");
     ReplaceAll(path, slash, backslash);
   }

   // Replace all instances of "\\" with "%5C%5C" in |path|.
   static void StripDoubleSlashes(std::string* path) {
     const std::string doubleslash("\\\\");
     const std::string escaped_doubleslash("%5C%5C");
     ReplaceAll(path, doubleslash, escaped_doubleslash);
   }
 };

 }  // namespace net

 #endif  // NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_
	// Copyright (c) 2010 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	// URL filename encoder goals:
	//
	// 1. Allow URLs with arbitrary path-segment length, generating filenames
	// with a maximum of 128 characters.
	// 2. Provide a somewhat human readable filenames, for easy debugging flow.
	// 3. Provide reverse-mapping from filenames back to URLs.
	// 4. Be able to distinguish http://x from http://x/ from http://x/index.html.
	// Those can all be different URLs.
	// 5. Be able to represent http://a/b/c and http://a/b/c/d, a pattern seen
	// with Facebook Connect.
	//
	// We need an escape-character for representing characters that are legal
	// in URL paths, but not in filenames, such as '?'.
	//
	// We can pick any legal character as an escape, as long as we escape it too.
	// But as we have a goal of having filenames that humans can correlate with
	// URLs, we should pick one that doesn't show up frequently in URLs. Candidates
	// are ~`!@#$%^&()-=_+{}[],. but we would prefer to avoid characters that are
	// shell escapes or that various build tools use.
	//
	// .#&%-=_+ occur frequently in URLs.
	// <>:"/\\|?* are illegal in Windows
	// See http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx
	// ~`!$^&(){}[]'; are special to Unix shells
	// In addition, build tools do not like ^@#%
	//
	// Josh took a quick look at the frequency of some special characters in
	// Sadeesh's slurped directory from Fall 09 and found the following occurances:
	//
	// ^ 3 build tool doesn't like ^ in testdata filenames
	// @ 10 build tool doesn't like @ in testdata filenames
	// . 1676 too frequent in URLs
	// , 76 THE WINNER
	// # 0 build tool doesn't like it
	// & 487 Prefer to avoid shell escapes
	// % 374 g4 doesn't like it
	// = 579 very frequent in URLs -- leave unmodified
	// - 464 very frequent in URLs -- leave unmodified
	// _ 798 very frequent in URLs -- leave unmodified
	//
	//
	// The escaping algorithm is:
	// 1) Escape all unfriendly symbols as ,XX where XX is the hex code.
	// 2) Add a ',' at the end (We do not allow ',' at end of any directory name,
	// so this assures that e.g. /a and /a/b can coexist in the filesystem).
	// 3) Go through the path segment by segment (where a segment is one directory
	// or leaf in the path) and
	// 3a) If the segment is empty, escape the second slash. i.e. if it was
	// www.foo.com//a then we escape the second / like www.foo.com/,2Fa,
	// 3a) If it is "." or ".." prepend with ',' (so that we have a non-
	// empty and non-reserved filename).
	// 3b) If it is over 128 characters, break it up into smaller segments by
	// inserting ,-/ (Windows limits paths to 128 chars, other OSes also
	// have limits that would restrict us)
	//
	// For example:
	// URL File
	// / /,
	// /index.html /index.html,
	// /. /.,
	// /a/b /a/b,
	// /a/b/ /a/b/,
	// /a/b/c /a/b/c, Note: no prefix problem
	// /u?foo=bar /u,3Ffoo=bar,
	// // /,2F,
	// /./ /,./,
	// /../ /,../,
	// /, /,2C,
	// /,./ /,2C./,
	// /very...longname/ /very...long,-/name If very...long is about 126 long.

	// NOTE: we avoid using some classes here (like FilePath and GURL) because we
	// share this code with other projects externally.

	#ifndef NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_
	#define NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_

	#include <string>

	#include "base/string_util.h"
	#include "net/tools/dump_cache/url_utilities.h"

	namespace net {

	// Helper class for converting a URL into a filename.
	class UrlToFilenameEncoder {
	public:
	// Given a \|url\| and a \|base_path\|, returns a filename which represents this
	// \|url\|. \|url\| may include URL escaping such as %21 for !
	// \|legacy_escape\| indicates that this function should use the old-style
	// of encoding.
	// TODO(mbelshe): delete the legacy_escape code.
	static std::string Encode(const std::string& url, std::string base_path,
	bool legacy_escape) {
	std::string filename;
	if (!legacy_escape) {
	std::string url_no_scheme = UrlUtilities::GetUrlHostPath(url);
	EncodeSegment(base_path, url_no_scheme, '/', &filename);
	#ifdef WIN32
	ReplaceAll(&filename, "/", "\\");
	#endif
	} else {
	std::string clean_url(url);
	if (clean_url.length() && clean_url[clean_url.length()-1] == '/')
	clean_url.append("index.html");

	std::string host = UrlUtilities::GetUrlHost(clean_url);
	filename.append(base_path);
	filename.append(host);
	#ifdef WIN32
	filename.append("\\");
	#else
	filename.append("/");
	#endif

	std::string url_filename = UrlUtilities::GetUrlPath(clean_url);
	// Strip the leading '/'.
	if (url_filename[0] == '/')
	url_filename = url_filename.substr(1);

	// Replace '/' with '\'.
	ConvertToSlashes(&url_filename);

	// Strip double back-slashes ("\\\\").
	StripDoubleSlashes(&url_filename);

	// Save path as filesystem-safe characters.
	url_filename = LegacyEscape(url_filename);
	filename.append(url_filename);

	#ifndef WIN32
	// Last step - convert to native slashes.
	const std::string slash("/");
	const std::string backslash("\\");
	ReplaceAll(&filename, backslash, slash);
	#endif
	}

	return filename;
	}

	// Rewrite HTML in a form that the SPDY in-memory server
	// can read.
	// \|filename_prefix\| is prepended without escaping.
	// \|escaped_ending\| is the URL to be encoded into a filename. It may have URL
	// escaped characters (like %21 for !).
	// \|dir_separator\| is "/" on Unix, "\" on Windows.
	// \|encoded_filename\| is the resultant filename.
	static void EncodeSegment(
	const std::string& filename_prefix,
	const std::string& escaped_ending,
	char dir_separator,
	std::string* encoded_filename);

	// Decodes a filename that was encoded with EncodeSegment,
	// yielding back the original URL.
	static bool Decode(const std::string& encoded_filename,
	char dir_separator,
	std::string* decoded_url);

	static const char kEscapeChar;
	static const char kTruncationChar;
	static const size_t kMaximumSubdirectoryLength;

	friend class UrlToFilenameEncoderTest;

	private:
	// Appends a segment of the path, special-casing "." and "..", and
	// ensuring that the segment does not exceed the path length. If it does,
	// it chops the end off the segment, writes the segment with a separator of
	// ",-/", and then rewrites segment to contain just the truncated piece so
	// it can be used in the next iteration.
	// \|segment\| is a read/write parameter containing segment to write
	// Note: this should not be called with empty segment.
	static void AppendSegment(std::string* segment, std::string* dest);

	// Allow reading of old slurped files.
	static std::string LegacyEscape(const std::string& path);

	// Replace all instances of \|from\| within \|str\| as \|to\|.
	static void ReplaceAll(std::string* str, const std::string& from,
	const std::string& to) {
	std::string::size_type pos(0);
	while ((pos = str->find(from, pos)) != std::string::npos) {
	str->replace(pos, from.size(), to);
	pos += from.size();
	}
	}

	// Replace all instances of "/" with "\" in \|path\|.
	static void ConvertToSlashes(std::string* path) {
	const std::string slash("/");
	const std::string backslash("\\");
	ReplaceAll(path, slash, backslash);
	}

	// Replace all instances of "\\" with "%5C%5C" in \|path\|.
	static void StripDoubleSlashes(std::string* path) {
	const std::string doubleslash("\\\\");
	const std::string escaped_doubleslash("%5C%5C");
	ReplaceAll(path, doubleslash, escaped_doubleslash);
	}
	};

	} // namespace net

	#endif // NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_