| // Copyright 2013 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "net/tools/tld_cleanup/tld_cleanup_util.h" |
| |
| #include "base/files/file_util.h" |
| #include "base/logging.h" |
| #include "base/strings/string_number_conversions.h" |
| #include "base/strings/string_util.h" |
| #include "url/gurl.h" |
| #include "url/third_party/mozilla/url_parse.h" |
| |
namespace {

// Section markers used by the public-suffix input data: rules between the
// BEGIN/END markers are flagged as "private" registries.
const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

// Bit flags composed into the "type" field emitted by WriteRules().
// kPrivateRule may be OR'd with either of the other two; kExceptionRule and
// kWildcardRule are mutually exclusive.
const int kExceptionRule = 1;
const int kWildcardRule = 2;
const int kPrivateRule = 4;
}  // namespace
| |
| namespace net { |
| namespace tld_cleanup { |
| |
| // Writes the list of domain rules contained in the 'rules' set to the |
| // 'outfile', with each rule terminated by a LF. The file must already have |
| // been created with write access. |
| bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) { |
| std::string data; |
| data.append("%{\n" |
| "// Copyright 2012 The Chromium Authors. All rights reserved.\n" |
| "// Use of this source code is governed by a BSD-style license " |
| "that can be\n" |
| "// found in the LICENSE file.\n\n" |
| "// This file is generated by net/tools/tld_cleanup/.\n" |
| "// DO NOT MANUALLY EDIT!\n" |
| "%}\n" |
| "struct DomainRule {\n" |
| " int name_offset;\n" |
| " int type; // flags: 1: exception, 2: wildcard, 4: private\n" |
| "};\n" |
| "%%\n"); |
| |
| for (auto i = rules.begin(); i != rules.end(); ++i) { |
| data.append(i->first); |
| data.append(", "); |
| int type = 0; |
| if (i->second.exception) { |
| type = kExceptionRule; |
| } else if (i->second.wildcard) { |
| type = kWildcardRule; |
| } |
| if (i->second.is_private) { |
| type += kPrivateRule; |
| } |
| data.append(base::IntToString(type)); |
| data.append("\n"); |
| } |
| |
| data.append("%%\n"); |
| |
| int written = base::WriteFile(outfile, |
| data.data(), |
| static_cast<int>(data.size())); |
| |
| return written == static_cast<int>(data.size()); |
| } |
| |
| // Adjusts the rule to a standard form: removes single extraneous dots and |
| // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as |
| // valid; logs a warning and returns kWarning if it is probably invalid; and |
| // logs an error and returns kError if the rule is (almost) certainly invalid. |
| NormalizeResult NormalizeRule(std::string* domain, Rule* rule) { |
| NormalizeResult result = kSuccess; |
| |
| // Strip single leading and trailing dots. |
| if (domain->at(0) == '.') |
| domain->erase(0, 1); |
| if (domain->empty()) { |
| LOG(WARNING) << "Ignoring empty rule"; |
| return kWarning; |
| } |
| if (domain->at(domain->size() - 1) == '.') |
| domain->erase(domain->size() - 1, 1); |
| if (domain->empty()) { |
| LOG(WARNING) << "Ignoring empty rule"; |
| return kWarning; |
| } |
| |
| // Allow single leading '*.' or '!', saved here so it's not canonicalized. |
| size_t start_offset = 0; |
| if (domain->at(0) == '!') { |
| domain->erase(0, 1); |
| rule->exception = true; |
| } else if (domain->find("*.") == 0) { |
| domain->erase(0, 2); |
| rule->wildcard = true; |
| } |
| if (domain->empty()) { |
| LOG(WARNING) << "Ignoring empty rule"; |
| return kWarning; |
| } |
| |
| // Warn about additional '*.' or '!'. |
| if (domain->find("*.", start_offset) != std::string::npos || |
| domain->find('!', start_offset) != std::string::npos) { |
| LOG(WARNING) << "Keeping probably invalid rule: " << *domain; |
| result = kWarning; |
| } |
| |
| // Make a GURL and normalize it, then get the host back out. |
| std::string url = "http://"; |
| url.append(*domain); |
| GURL gurl(url); |
| const std::string& spec = gurl.possibly_invalid_spec(); |
| url::Component host = gurl.parsed_for_possibly_invalid_spec().host; |
| if (host.len < 0) { |
| LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain; |
| return kError; |
| } |
| if (!gurl.is_valid()) { |
| LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain; |
| result = kWarning; |
| } |
| domain->assign(spec.substr(host.begin, host.len)); |
| |
| return result; |
| } |
| |
// Parses |data| (the raw text of a public-suffix rules file) into |rules|,
// normalizing each domain via NormalizeRule() and synthesizing a rule for the
// true TLD of every multi-level rule. Returns the worst NormalizeResult
// encountered while parsing (kSuccess < kWarning < kError).
// NOTE(review): |data| is taken by const value, which copies the whole file
// contents; the declaration in the header would also need updating to pass
// by const reference — confirm before changing.
NormalizeResult NormalizeDataToRuleMap(const std::string data,
                                       RuleMap* rules) {
  CHECK(rules);
  // We do a lot of string assignment during parsing, but simplicity is more
  // important than performance here.
  std::string domain;
  NormalizeResult result = kSuccess;
  size_t line_start = 0;
  size_t line_end = 0;
  // Tracks whether the current line lies between the BEGIN/END PRIVATE
  // DOMAINS markers.
  bool is_private = false;
  // TLD rules synthesized from multi-level rules; merged into |rules| after
  // the main pass so they never shadow an explicitly listed rule.
  RuleMap extra_rules;
  // arraysize() includes the trailing NUL of the string literal, hence -1.
  int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;
  int end_private_length = arraysize(kEndPrivateDomainsComment) - 1;
  while (line_start < data.size()) {
    // NOTE(review): the '<' comparisons below mean a marker that ends exactly
    // at EOF (no trailing newline) is not recognized — confirm intended.
    if (line_start + begin_private_length < data.size() &&
        !data.compare(line_start, begin_private_length,
                      kBeginPrivateDomainsComment)) {
      is_private = true;
      line_end = line_start + begin_private_length;
    } else if (line_start + end_private_length < data.size() &&
               !data.compare(line_start, end_private_length,
                             kEndPrivateDomainsComment)) {
      is_private = false;
      line_end = line_start + end_private_length;
    } else if (line_start + 1 < data.size() &&
               data[line_start] == '/' &&
               data[line_start + 1] == '/') {
      // Skip comments.
      line_end = data.find_first_of("\r\n", line_start);
      if (line_end == std::string::npos)
        line_end = data.size();
    } else {
      // Truncate at first whitespace.
      line_end = data.find_first_of("\r\n \t", line_start);
      if (line_end == std::string::npos)
        line_end = data.size();
      domain.assign(data, line_start, line_end - line_start);

      Rule rule;
      rule.wildcard = false;
      rule.exception = false;
      rule.is_private = is_private;
      NormalizeResult new_result = NormalizeRule(&domain, &rule);
      if (new_result != kError) {
        // Check the existing rules to make sure we don't have an exception and
        // wildcard for the same rule, or that the same domain is listed as both
        // private and not private. If we did, we'd have to update our
        // parsing code to handle this case.
        CHECK(rules->find(domain) == rules->end())
            << "Duplicate rule found for " << domain;

        (*rules)[domain] = rule;
        // Add true TLD for multi-level rules. We don't add them right now, in
        // case there's an exception or wild card that either exists or might be
        // added in a later iteration. In those cases, there's no need to add
        // it and it would just slow down parsing the data.
        size_t tld_start = domain.find_last_of('.');
        if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {
          std::string extra_rule_domain = domain.substr(tld_start + 1);
          RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);
          Rule extra_rule;
          extra_rule.exception = false;
          extra_rule.wildcard = false;
          if (iter == extra_rules.end()) {
            extra_rule.is_private = is_private;
          } else {
            // A rule already exists, so we ensure that if any of the entries is
            // not private the result should be that the entry is not private.
            // An example is .au which is not listed as a real TLD, but only
            // lists second-level domains such as com.au. Subdomains of .au
            // (eg. blogspot.com.au) are also listed in the private section,
            // which is processed later, so this ensures that the real TLD
            // (eg. .au) is listed as public.
            extra_rule.is_private = is_private && iter->second.is_private;
          }
          extra_rules[extra_rule_domain] = extra_rule;
        }
      }
      // Keep the most severe result seen so far.
      result = std::max(result, new_result);
    }

    // Find beginning of next non-empty line.
    line_start = data.find_first_of("\r\n", line_end);
    if (line_start == std::string::npos)
      line_start = data.size();
    line_start = data.find_first_not_of("\r\n", line_start);
    if (line_start == std::string::npos)
      line_start = data.size();
  }

  // Merge the synthesized TLD rules; explicit rules always win.
  for (RuleMap::const_iterator iter = extra_rules.begin();
       iter != extra_rules.end();
       ++iter) {
    if (rules->find(iter->first) == rules->end()) {
      (*rules)[iter->first] = iter->second;
    }
  }

  return result;
}
| |
| NormalizeResult NormalizeFile(const base::FilePath& in_filename, |
| const base::FilePath& out_filename) { |
| RuleMap rules; |
| std::string data; |
| if (!base::ReadFileToString(in_filename, &data)) { |
| LOG(ERROR) << "Unable to read file"; |
| // We return success since we've already reported the error. |
| return kSuccess; |
| } |
| |
| NormalizeResult result = NormalizeDataToRuleMap(data, &rules); |
| |
| if (!WriteRules(rules, out_filename)) { |
| LOG(ERROR) << "Error(s) writing output file"; |
| result = kError; |
| } |
| |
| return result; |
| } |
| |
| |
| } // namespace tld_cleanup |
| } // namespace net |