David Ghandehari | c3f1d40 | 2016-09-22 02:23:39 -0700 | [diff] [blame] | 1 | // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | // URL filename encoder goals: |
| 6 | // |
| 7 | // 1. Allow URLs with arbitrary path-segment length, generating filenames |
| 8 | // with a maximum of 128 characters. |
| 9 | // 2. Provide somewhat human-readable filenames, for easy debugging flow. |
| 10 | // 3. Provide reverse-mapping from filenames back to URLs. |
| 11 | // 4. Be able to distinguish http://x from http://x/ from http://x/index.html. |
| 12 | // Those can all be different URLs. |
| 13 | // 5. Be able to represent http://a/b/c and http://a/b/c/d, a pattern seen |
| 14 | // with Facebook Connect. |
| 15 | // |
| 16 | // We need an escape-character for representing characters that are legal |
| 17 | // in URL paths, but not in filenames, such as '?'. |
| 18 | // |
| 19 | // We can pick any legal character as an escape, as long as we escape it too. |
| 20 | // But as we have a goal of having filenames that humans can correlate with |
| 21 | // URLs, we should pick one that doesn't show up frequently in URLs. Candidates |
| 22 | // are ~`!@#$%^&()-=_+{}[],. but we would prefer to avoid characters that are |
| 23 | // shell escapes or that various build tools use. |
| 24 | // |
| 25 | // .#&%-=_+ occur frequently in URLs. |
| 26 | // <>:"/\|?* are illegal in Windows |
| 27 | // See http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx |
| 28 | // ~`!$^&(){}[]'; are special to Unix shells |
| 29 | // In addition, build tools do not like ^@#% |
| 30 | // |
| 31 | // Josh took a quick look at the frequency of some special characters in |
| 32 | // Sadeesh's slurped directory from Fall 09 and found the following occurrences: |
| 33 | // |
| 34 | // ^ 3 build tool doesn't like ^ in testdata filenames |
| 35 | // @ 10 build tool doesn't like @ in testdata filenames |
| 36 | // . 1676 too frequent in URLs |
| 37 | // , 76 THE WINNER |
| 38 | // # 0 build tool doesn't like it |
| 39 | // & 487 Prefer to avoid shell escapes |
| 40 | // % 374 g4 doesn't like it |
| 41 | // = 579 very frequent in URLs -- leave unmodified |
| 42 | // - 464 very frequent in URLs -- leave unmodified |
| 43 | // _ 798 very frequent in URLs -- leave unmodified |
| 44 | // |
| 45 | // |
| 46 | // The escaping algorithm is: |
| 47 | // 1) Escape all unfriendly symbols as ,XX where XX is the hex code. |
| 48 | // 2) Add a ',' at the end (We do not allow ',' at end of any directory name, |
| 49 | // so this assures that e.g. /a and /a/b can coexist in the filesystem). |
| 50 | // 3) Go through the path segment by segment (where a segment is one directory |
| 51 | // or leaf in the path) and |
| 52 | // 3a) If the segment is empty, escape the second slash. i.e. if it was |
| 53 | // www.foo.com//a then we escape the second / like www.foo.com/,2Fa, |
| 54 | // 3b) If it is "." or ".." prepend with ',' (so that we have a non- |
| 55 | // empty and non-reserved filename). |
| 56 | // 3c) If it is over 128 characters, break it up into smaller segments by |
| 57 | // inserting ,-/ (Windows limits paths to 128 chars, other OSes also |
| 58 | // have limits that would restrict us) |
| 59 | // |
| 60 | // For example: |
| 61 | // URL File |
| 62 | // / /, |
| 63 | // /index.html /index.html, |
| 64 | // /. /., |
| 65 | // /a/b /a/b, |
| 66 | // /a/b/ /a/b/, |
| 67 | // /a/b/c /a/b/c, Note: no prefix problem |
| 68 | // /u?foo=bar /u,3Ffoo=bar, |
| 69 | // // /,2F, |
| 70 | // /./ /,./, |
| 71 | // /../ /,../, |
| 72 | // /, /,2C, |
| 73 | // /,./ /,2C./, |
| 74 | // /very...longname/ /very...long,-/name If very...long is about 126 long. |
| 75 | |
| 76 | // NOTE: we avoid using some classes here (like FilePath and GURL) because we |
| 77 | // share this code with other projects externally. |
| 78 | |
| 79 | #ifndef NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_ |
| 80 | #define NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_ |
| 81 | |
| 82 | #include <string> |
| 83 | |
| 84 | #include "base/string_util.h" |
| 85 | #include "net/tools/dump_cache/url_utilities.h" |
| 86 | |
| 87 | namespace net { |
| 88 | |
| 89 | // Helper class for converting a URL into a filename. |
| 90 | class UrlToFilenameEncoder { |
| 91 | public: |
| 92 | // Given a |url| and a |base_path|, returns a filename which represents this |
| 93 | // |url|. |url| may include URL escaping such as %21 for ! |
| 94 | // |legacy_escape| indicates that this function should use the old-style |
| 95 | // of encoding. |
| 96 | // TODO(mbelshe): delete the legacy_escape code. |
| 97 | static std::string Encode(const std::string& url, std::string base_path, |
| 98 | bool legacy_escape) { |
| 99 | std::string filename; |
| 100 | if (!legacy_escape) { |
| 101 | std::string url_no_scheme = UrlUtilities::GetUrlHostPath(url); |
| 102 | EncodeSegment(base_path, url_no_scheme, '/', &filename); |
| 103 | #ifdef WIN32 |
| 104 | ReplaceAll(&filename, "/", "\\"); |
| 105 | #endif |
| 106 | } else { |
| 107 | std::string clean_url(url); |
| 108 | if (clean_url.length() && clean_url[clean_url.length()-1] == '/') |
| 109 | clean_url.append("index.html"); |
| 110 | |
| 111 | std::string host = UrlUtilities::GetUrlHost(clean_url); |
| 112 | filename.append(base_path); |
| 113 | filename.append(host); |
| 114 | #ifdef WIN32 |
| 115 | filename.append("\\"); |
| 116 | #else |
| 117 | filename.append("/"); |
| 118 | #endif |
| 119 | |
| 120 | std::string url_filename = UrlUtilities::GetUrlPath(clean_url); |
| 121 | // Strip the leading '/'. |
| 122 | if (url_filename[0] == '/') |
| 123 | url_filename = url_filename.substr(1); |
| 124 | |
| 125 | // Replace '/' with '\'. |
| 126 | ConvertToSlashes(&url_filename); |
| 127 | |
| 128 | // Strip double back-slashes ("\\\\"). |
| 129 | StripDoubleSlashes(&url_filename); |
| 130 | |
| 131 | // Save path as filesystem-safe characters. |
| 132 | url_filename = LegacyEscape(url_filename); |
| 133 | filename.append(url_filename); |
| 134 | |
| 135 | #ifndef WIN32 |
| 136 | // Last step - convert to native slashes. |
| 137 | const std::string slash("/"); |
| 138 | const std::string backslash("\\"); |
| 139 | ReplaceAll(&filename, backslash, slash); |
| 140 | #endif |
| 141 | } |
| 142 | |
| 143 | return filename; |
| 144 | } |
| 145 | |
| 146 | // Rewrite HTML in a form that the SPDY in-memory server |
| 147 | // can read. |
| 148 | // |filename_prefix| is prepended without escaping. |
| 149 | // |escaped_ending| is the URL to be encoded into a filename. It may have URL |
| 150 | // escaped characters (like %21 for !). |
| 151 | // |dir_separator| is "/" on Unix, "\" on Windows. |
| 152 | // |encoded_filename| is the resultant filename. |
| 153 | static void EncodeSegment( |
| 154 | const std::string& filename_prefix, |
| 155 | const std::string& escaped_ending, |
| 156 | char dir_separator, |
| 157 | std::string* encoded_filename); |
| 158 | |
| 159 | // Decodes a filename that was encoded with EncodeSegment, |
| 160 | // yielding back the original URL. |
| 161 | static bool Decode(const std::string& encoded_filename, |
| 162 | char dir_separator, |
| 163 | std::string* decoded_url); |
| 164 | |
| 165 | static const char kEscapeChar; |
| 166 | static const char kTruncationChar; |
| 167 | static const size_t kMaximumSubdirectoryLength; |
| 168 | |
| 169 | friend class UrlToFilenameEncoderTest; |
| 170 | |
| 171 | private: |
| 172 | // Appends a segment of the path, special-casing "." and "..", and |
| 173 | // ensuring that the segment does not exceed the path length. If it does, |
| 174 | // it chops the end off the segment, writes the segment with a separator of |
| 175 | // ",-/", and then rewrites segment to contain just the truncated piece so |
| 176 | // it can be used in the next iteration. |
| 177 | // |segment| is a read/write parameter containing segment to write |
| 178 | // Note: this should not be called with empty segment. |
| 179 | static void AppendSegment(std::string* segment, std::string* dest); |
| 180 | |
| 181 | // Allow reading of old slurped files. |
| 182 | static std::string LegacyEscape(const std::string& path); |
| 183 | |
| 184 | // Replace all instances of |from| within |str| as |to|. |
| 185 | static void ReplaceAll(std::string* str, const std::string& from, |
| 186 | const std::string& to) { |
| 187 | std::string::size_type pos(0); |
| 188 | while ((pos = str->find(from, pos)) != std::string::npos) { |
| 189 | str->replace(pos, from.size(), to); |
| 190 | pos += from.size(); |
| 191 | } |
| 192 | } |
| 193 | |
| 194 | // Replace all instances of "/" with "\" in |path|. |
| 195 | static void ConvertToSlashes(std::string* path) { |
| 196 | const std::string slash("/"); |
| 197 | const std::string backslash("\\"); |
| 198 | ReplaceAll(path, slash, backslash); |
| 199 | } |
| 200 | |
| 201 | // Replace all instances of "\\" with "%5C%5C" in |path|. |
| 202 | static void StripDoubleSlashes(std::string* path) { |
| 203 | const std::string doubleslash("\\\\"); |
| 204 | const std::string escaped_doubleslash("%5C%5C"); |
| 205 | ReplaceAll(path, doubleslash, escaped_doubleslash); |
| 206 | } |
| 207 | }; |
| 208 | |
| 209 | } // namespace net |
| 210 | |
| 211 | #endif // NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_ |