|  | // Copyright 2013 The Chromium Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style license that can be | 
|  | // found in the LICENSE file. | 
|  |  | 
|  | #include "net/tools/tld_cleanup/tld_cleanup_util.h" | 
|  |  | 
|  | #include "base/files/file_util.h" | 
|  | #include "base/logging.h" | 
|  | #include "base/strings/string_number_conversions.h" | 
|  | #include "base/strings/string_util.h" | 
|  | #include "url/gurl.h" | 
|  | #include "url/third_party/mozilla/url_parse.h" | 
|  |  | 
namespace {

// Markers in the input data that delimit the section listing private
// (non-ICANN) domain rules.
const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

// Bit flags combined into the 'type' field of the generated DomainRule
// entries (see the struct emitted by WriteRules below).
const int kExceptionRule = 1;
const int kWildcardRule = 2;
const int kPrivateRule = 4;

}  // namespace
|  |  | 
|  | namespace net { | 
|  | namespace tld_cleanup { | 
|  |  | 
|  | // Writes the list of domain rules contained in the 'rules' set to the | 
|  | // 'outfile', with each rule terminated by a LF.  The file must already have | 
|  | // been created with write access. | 
|  | bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) { | 
|  | std::string data; | 
|  | data.append("%{\n" | 
|  | "// Copyright 2012 The Chromium Authors. All rights reserved.\n" | 
|  | "// Use of this source code is governed by a BSD-style license " | 
|  | "that can be\n" | 
|  | "// found in the LICENSE file.\n\n" | 
|  | "// This file is generated by net/tools/tld_cleanup/.\n" | 
|  | "// DO NOT MANUALLY EDIT!\n" | 
|  | "%}\n" | 
|  | "struct DomainRule {\n" | 
|  | "  int name_offset;\n" | 
|  | "  int type;  // flags: 1: exception, 2: wildcard, 4: private\n" | 
|  | "};\n" | 
|  | "%%\n"); | 
|  |  | 
|  | for (auto i = rules.begin(); i != rules.end(); ++i) { | 
|  | data.append(i->first); | 
|  | data.append(", "); | 
|  | int type = 0; | 
|  | if (i->second.exception) { | 
|  | type = kExceptionRule; | 
|  | } else if (i->second.wildcard) { | 
|  | type = kWildcardRule; | 
|  | } | 
|  | if (i->second.is_private) { | 
|  | type += kPrivateRule; | 
|  | } | 
|  | data.append(base::IntToString(type)); | 
|  | data.append("\n"); | 
|  | } | 
|  |  | 
|  | data.append("%%\n"); | 
|  |  | 
|  | int written = base::WriteFile(outfile, | 
|  | data.data(), | 
|  | static_cast<int>(data.size())); | 
|  |  | 
|  | return written == static_cast<int>(data.size()); | 
|  | } | 
|  |  | 
|  | // Adjusts the rule to a standard form: removes single extraneous dots and | 
|  | // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as | 
|  | // valid; logs a warning and returns kWarning if it is probably invalid; and | 
|  | // logs an error and returns kError if the rule is (almost) certainly invalid. | 
|  | NormalizeResult NormalizeRule(std::string* domain, Rule* rule) { | 
|  | NormalizeResult result = kSuccess; | 
|  |  | 
|  | // Strip single leading and trailing dots. | 
|  | if (domain->at(0) == '.') | 
|  | domain->erase(0, 1); | 
|  | if (domain->empty()) { | 
|  | LOG(WARNING) << "Ignoring empty rule"; | 
|  | return kWarning; | 
|  | } | 
|  | if (domain->at(domain->size() - 1) == '.') | 
|  | domain->erase(domain->size() - 1, 1); | 
|  | if (domain->empty()) { | 
|  | LOG(WARNING) << "Ignoring empty rule"; | 
|  | return kWarning; | 
|  | } | 
|  |  | 
|  | // Allow single leading '*.' or '!', saved here so it's not canonicalized. | 
|  | size_t start_offset = 0; | 
|  | if (domain->at(0) == '!') { | 
|  | domain->erase(0, 1); | 
|  | rule->exception = true; | 
|  | } else if (domain->find("*.") == 0) { | 
|  | domain->erase(0, 2); | 
|  | rule->wildcard = true; | 
|  | } | 
|  | if (domain->empty()) { | 
|  | LOG(WARNING) << "Ignoring empty rule"; | 
|  | return kWarning; | 
|  | } | 
|  |  | 
|  | // Warn about additional '*.' or '!'. | 
|  | if (domain->find("*.", start_offset) != std::string::npos || | 
|  | domain->find('!', start_offset) != std::string::npos) { | 
|  | LOG(WARNING) << "Keeping probably invalid rule: " << *domain; | 
|  | result = kWarning; | 
|  | } | 
|  |  | 
|  | // Make a GURL and normalize it, then get the host back out. | 
|  | std::string url = "http://"; | 
|  | url.append(*domain); | 
|  | GURL gurl(url); | 
|  | const std::string& spec = gurl.possibly_invalid_spec(); | 
|  | url::Component host = gurl.parsed_for_possibly_invalid_spec().host; | 
|  | if (host.len < 0) { | 
|  | LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain; | 
|  | return kError; | 
|  | } | 
|  | if (!gurl.is_valid()) { | 
|  | LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain; | 
|  | result = kWarning; | 
|  | } | 
|  | domain->assign(spec.substr(host.begin, host.len)); | 
|  |  | 
|  | return result; | 
|  | } | 
|  |  | 
|  | NormalizeResult NormalizeDataToRuleMap(const std::string data, | 
|  | RuleMap* rules) { | 
|  | CHECK(rules); | 
|  | // We do a lot of string assignment during parsing, but simplicity is more | 
|  | // important than performance here. | 
|  | std::string domain; | 
|  | NormalizeResult result = kSuccess; | 
|  | size_t line_start = 0; | 
|  | size_t line_end = 0; | 
|  | bool is_private = false; | 
|  | RuleMap extra_rules; | 
|  | int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1; | 
|  | int end_private_length = arraysize(kEndPrivateDomainsComment) - 1; | 
|  | while (line_start < data.size()) { | 
|  | if (line_start + begin_private_length < data.size() && | 
|  | !data.compare(line_start, begin_private_length, | 
|  | kBeginPrivateDomainsComment)) { | 
|  | is_private = true; | 
|  | line_end = line_start + begin_private_length; | 
|  | } else if (line_start + end_private_length < data.size() && | 
|  | !data.compare(line_start, end_private_length, | 
|  | kEndPrivateDomainsComment)) { | 
|  | is_private = false; | 
|  | line_end = line_start + end_private_length; | 
|  | } else if (line_start + 1 < data.size() && | 
|  | data[line_start] == '/' && | 
|  | data[line_start + 1] == '/') { | 
|  | // Skip comments. | 
|  | line_end = data.find_first_of("\r\n", line_start); | 
|  | if (line_end == std::string::npos) | 
|  | line_end = data.size(); | 
|  | } else { | 
|  | // Truncate at first whitespace. | 
|  | line_end = data.find_first_of("\r\n \t", line_start); | 
|  | if (line_end == std::string::npos) | 
|  | line_end = data.size(); | 
|  | domain.assign(data, line_start, line_end - line_start); | 
|  |  | 
|  | Rule rule; | 
|  | rule.wildcard = false; | 
|  | rule.exception = false; | 
|  | rule.is_private = is_private; | 
|  | NormalizeResult new_result = NormalizeRule(&domain, &rule); | 
|  | if (new_result != kError) { | 
|  | // Check the existing rules to make sure we don't have an exception and | 
|  | // wildcard for the same rule, or that the same domain is listed as both | 
|  | // private and not private. If we did, we'd have to update our | 
|  | // parsing code to handle this case. | 
|  | CHECK(rules->find(domain) == rules->end()) | 
|  | << "Duplicate rule found for " << domain; | 
|  |  | 
|  | (*rules)[domain] = rule; | 
|  | // Add true TLD for multi-level rules.  We don't add them right now, in | 
|  | // case there's an exception or wild card that either exists or might be | 
|  | // added in a later iteration.  In those cases, there's no need to add | 
|  | // it and it would just slow down parsing the data. | 
|  | size_t tld_start = domain.find_last_of('.'); | 
|  | if (tld_start != std::string::npos && tld_start + 1 < domain.size()) { | 
|  | std::string extra_rule_domain = domain.substr(tld_start + 1); | 
|  | RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain); | 
|  | Rule extra_rule; | 
|  | extra_rule.exception = false; | 
|  | extra_rule.wildcard = false; | 
|  | if (iter == extra_rules.end()) { | 
|  | extra_rule.is_private = is_private; | 
|  | } else { | 
|  | // A rule already exists, so we ensure that if any of the entries is | 
|  | // not private the result should be that the entry is not private. | 
|  | // An example is .au which is not listed as a real TLD, but only | 
|  | // lists second-level domains such as com.au. Subdomains of .au | 
|  | // (eg. blogspot.com.au) are also listed in the private section, | 
|  | // which is processed later, so this ensures that the real TLD | 
|  | // (eg. .au) is listed as public. | 
|  | extra_rule.is_private = is_private && iter->second.is_private; | 
|  | } | 
|  | extra_rules[extra_rule_domain] = extra_rule; | 
|  | } | 
|  | } | 
|  | result = std::max(result, new_result); | 
|  | } | 
|  |  | 
|  | // Find beginning of next non-empty line. | 
|  | line_start = data.find_first_of("\r\n", line_end); | 
|  | if (line_start == std::string::npos) | 
|  | line_start = data.size(); | 
|  | line_start = data.find_first_not_of("\r\n", line_start); | 
|  | if (line_start == std::string::npos) | 
|  | line_start = data.size(); | 
|  | } | 
|  |  | 
|  | for (RuleMap::const_iterator iter = extra_rules.begin(); | 
|  | iter != extra_rules.end(); | 
|  | ++iter) { | 
|  | if (rules->find(iter->first) == rules->end()) { | 
|  | (*rules)[iter->first] = iter->second; | 
|  | } | 
|  | } | 
|  |  | 
|  | return result; | 
|  | } | 
|  |  | 
|  | NormalizeResult NormalizeFile(const base::FilePath& in_filename, | 
|  | const base::FilePath& out_filename) { | 
|  | RuleMap rules; | 
|  | std::string data; | 
|  | if (!base::ReadFileToString(in_filename, &data)) { | 
|  | LOG(ERROR) << "Unable to read file"; | 
|  | // We return success since we've already reported the error. | 
|  | return kSuccess; | 
|  | } | 
|  |  | 
|  | NormalizeResult result = NormalizeDataToRuleMap(data, &rules); | 
|  |  | 
|  | if (!WriteRules(rules, out_filename)) { | 
|  | LOG(ERROR) << "Error(s) writing output file"; | 
|  | result = kError; | 
|  | } | 
|  |  | 
|  | return result; | 
|  | } | 
|  |  | 
|  |  | 
|  | }  // namespace tld_cleanup | 
|  | }  // namespace net |