Blame - src/net/tools/dump_cache/url_to_filename_encoder.h - cobalt

blob: b81a8543be7af60ac677764aa3af748e51d09ea4 [file] [log] [blame]

David Ghandehari	c3f1d40	2016-09-22 02:23:39 -0700	[diff] [blame]	1	// Copyright (c) 2010 The Chromium Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	// URL filename encoder goals:
				6	//
				7	// 1. Allow URLs with arbitrary path-segment length, generating filenames
				8	// with a maximum of 128 characters.
				9	// 2. Provide somewhat human-readable filenames, for easy debugging flow.
				10	// 3. Provide reverse-mapping from filenames back to URLs.
				11	// 4. Be able to distinguish http://x from http://x/ from http://x/index.html.
				12	// Those can all be different URLs.
				13	// 5. Be able to represent http://a/b/c and http://a/b/c/d, a pattern seen
				14	// with Facebook Connect.
				15	//
				16	// We need an escape-character for representing characters that are legal
				17	// in URL paths, but not in filenames, such as '?'.
				18	//
				19	// We can pick any legal character as an escape, as long as we escape it too.
				20	// But as we have a goal of having filenames that humans can correlate with
				21	// URLs, we should pick one that doesn't show up frequently in URLs. Candidates
				22	// are ~`!@#$%^&()-=_+{}[],. but we would prefer to avoid characters that are
				23	// shell escapes or that various build tools use.
				24	//
				25	// .#&%-=_+ occur frequently in URLs.
				26	// <>:"/\\|?* are illegal in Windows
				27	// See http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx
				28	// ~`!$^&(){}[]'; are special to Unix shells
				29	// In addition, build tools do not like ^@#%
				30	//
				31	// Josh took a quick look at the frequency of some special characters in
				32	// Sadeesh's slurped directory from Fall 09 and found the following occurrences:
				33	//
				34	// ^ 3 build tool doesn't like ^ in testdata filenames
				35	// @ 10 build tool doesn't like @ in testdata filenames
				36	// . 1676 too frequent in URLs
				37	// , 76 THE WINNER
				38	// # 0 build tool doesn't like it
				39	// & 487 Prefer to avoid shell escapes
				40	// % 374 g4 doesn't like it
				41	// = 579 very frequent in URLs -- leave unmodified
				42	// - 464 very frequent in URLs -- leave unmodified
				43	// _ 798 very frequent in URLs -- leave unmodified
				44	//
				45	//
				46	// The escaping algorithm is:
				47	// 1) Escape all unfriendly symbols as ,XX where XX is the hex code.
				48	// 2) Add a ',' at the end (We do not allow ',' at end of any directory name,
				49	// so this assures that e.g. /a and /a/b can coexist in the filesystem).
				50	// 3) Go through the path segment by segment (where a segment is one directory
				51	// or leaf in the path) and
				52	// 3a) If the segment is empty, escape the second slash. i.e. if it was
				53	// www.foo.com//a then we escape the second / like www.foo.com/,2Fa,
				54	// 3b) If it is "." or ".." prepend with ',' (so that we have a non-
				55	// empty and non-reserved filename).
				56	// 3c) If it is over 128 characters, break it up into smaller segments by
				57	// inserting ,-/ (Windows limits paths to 128 chars, other OSes also
				58	// have limits that would restrict us)
				59	//
				60	// For example:
				61	// URL File
				62	// / /,
				63	// /index.html /index.html,
				64	// /. /.,
				65	// /a/b /a/b,
				66	// /a/b/ /a/b/,
				67	// /a/b/c /a/b/c, Note: no prefix problem
				68	// /u?foo=bar /u,3Ffoo=bar,
				69	// // /,2F,
				70	// /./ /,./,
				71	// /../ /,../,
				72	// /, /,2C,
				73	// /,./ /,2C./,
				74	// /very...longname/ /very...long,-/name If very...long is about 126 long.
				75
				76	// NOTE: we avoid using some classes here (like FilePath and GURL) because we
				77	// share this code with other projects externally.
				78
				79	#ifndef NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_
				80	#define NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_
				81
				82	#include <string>
				83
				84	#include "base/string_util.h"
				85	#include "net/tools/dump_cache/url_utilities.h"
				86
				87	namespace net {
				88
				89	// Helper class for converting a URL into a filename.
				90	class UrlToFilenameEncoder {
				91	public:
				92	// Given a \|url\| and a \|base_path\|, returns a filename which represents this
				93	// \|url\|. \|url\| may include URL escaping such as %21 for !
				94	// \|legacy_escape\| indicates that this function should use the old-style
				95	// of encoding.
				96	// TODO(mbelshe): delete the legacy_escape code.
				97	static std::string Encode(const std::string& url, std::string base_path,
				98	bool legacy_escape) {
				99	std::string filename;
				100	if (!legacy_escape) {
				101	std::string url_no_scheme = UrlUtilities::GetUrlHostPath(url);
				102	EncodeSegment(base_path, url_no_scheme, '/', &filename);
				103	#ifdef WIN32
				104	ReplaceAll(&filename, "/", "\\");
				105	#endif
				106	} else {
				107	std::string clean_url(url);
				108	if (clean_url.length() && clean_url[clean_url.length()-1] == '/')
				109	clean_url.append("index.html");
				110
				111	std::string host = UrlUtilities::GetUrlHost(clean_url);
				112	filename.append(base_path);
				113	filename.append(host);
				114	#ifdef WIN32
				115	filename.append("\\");
				116	#else
				117	filename.append("/");
				118	#endif
				119
				120	std::string url_filename = UrlUtilities::GetUrlPath(clean_url);
				121	// Strip the leading '/'.
				122	if (url_filename[0] == '/')
				123	url_filename = url_filename.substr(1);
				124
				125	// Replace '/' with '\'.
				126	ConvertToSlashes(&url_filename);
				127
				128	// Strip double back-slashes ("\\\\").
				129	StripDoubleSlashes(&url_filename);
				130
				131	// Save path as filesystem-safe characters.
				132	url_filename = LegacyEscape(url_filename);
				133	filename.append(url_filename);
				134
				135	#ifndef WIN32
				136	// Last step - convert to native slashes.
				137	const std::string slash("/");
				138	const std::string backslash("\\");
				139	ReplaceAll(&filename, backslash, slash);
				140	#endif
				141	}
				142
				143	return filename;
				144	}
				145
				146	// Rewrite HTML in a form that the SPDY in-memory server
				147	// can read.
				148	// \|filename_prefix\| is prepended without escaping.
				149	// \|escaped_ending\| is the URL to be encoded into a filename. It may have URL
				150	// escaped characters (like %21 for !).
				151	// \|dir_separator\| is "/" on Unix, "\" on Windows.
				152	// \|encoded_filename\| is the resultant filename.
				153	static void EncodeSegment(
				154	const std::string& filename_prefix,
				155	const std::string& escaped_ending,
				156	char dir_separator,
				157	std::string* encoded_filename);
				158
				159	// Decodes a filename that was encoded with EncodeSegment,
				160	// yielding back the original URL.
				161	static bool Decode(const std::string& encoded_filename,
				162	char dir_separator,
				163	std::string* decoded_url);
				164
				165	static const char kEscapeChar;
				166	static const char kTruncationChar;
				167	static const size_t kMaximumSubdirectoryLength;
				168
				169	friend class UrlToFilenameEncoderTest;
				170
				171	private:
				172	// Appends a segment of the path, special-casing "." and "..", and
				173	// ensuring that the segment does not exceed the path length. If it does,
				174	// it chops the end off the segment, writes the segment with a separator of
				175	// ",-/", and then rewrites segment to contain just the truncated piece so
				176	// it can be used in the next iteration.
				177	// \|segment\| is a read/write parameter containing segment to write
				178	// Note: this should not be called with empty segment.
				179	static void AppendSegment(std::string* segment, std::string* dest);
				180
				181	// Allow reading of old slurped files.
				182	static std::string LegacyEscape(const std::string& path);
				183
				// Replaces every non-overlapping occurrence of |from| within |str| by |to|,
				// scanning left to right. Text inserted by a replacement is never scanned
				// again, so |to| may safely contain |from|. An empty |from| leaves |str|
				// unchanged.
				static void ReplaceAll(std::string* str, const std::string& from,
				                       const std::string& to) {
				  // find("") succeeds at every position, so an empty pattern would never
				  // let the loop below terminate.
				  if (from.empty())
				    return;

				  std::string::size_type pos = 0;
				  while ((pos = str->find(from, pos)) != std::string::npos) {
				    str->replace(pos, from.size(), to);
				    // Continue immediately after the inserted text. Stepping by the length
				    // of |to| (rather than |from|) neither skips untouched characters when
				    // |to| is shorter nor re-matches the replacement when |to| is longer.
				    pos += to.size();
				  }
				}
				193
				194	// Replace all instances of "/" with "\" in \|path\|.
				195	static void ConvertToSlashes(std::string* path) {
				196	const std::string slash("/");
				197	const std::string backslash("\\");
				198	ReplaceAll(path, slash, backslash);
				199	}
				200
				201	// Replace all instances of "\\" with "%5C%5C" in \|path\|.
				202	static void StripDoubleSlashes(std::string* path) {
				203	const std::string doubleslash("\\\\");
				204	const std::string escaped_doubleslash("%5C%5C");
				205	ReplaceAll(path, doubleslash, escaped_doubleslash);
				206	}
				207	};
				208
				209	} // namespace net
				210
				211	#endif // NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_