|  | // Copyright 2013 The Chromium Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style license that can be | 
|  | // found in the LICENSE file. | 
|  |  | 
|  | #include "net/tools/tld_cleanup/tld_cleanup_util.h" | 
|  |  | 
|  | #include "base/files/file_util.h" | 
|  | #include "base/logging.h" | 
|  | #include "base/strings/string_number_conversions.h" | 
|  | #include "base/strings/string_util.h" | 
|  | #include "url/gurl.h" | 
|  | #include "url/third_party/mozilla/url_parse.h" | 
|  |  | 
namespace {

// Markers in the input data that delimit the section listing private
// (non-ICANN) domain rules.
const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

// Bit flags combined into the 'type' field of the generated DomainRule
// entries (see the struct emitted by WriteRules below).
const int kExceptionRule = 1;
const int kWildcardRule = 2;
const int kPrivateRule = 4;

}  // namespace
|  |  | 
|  | namespace net { | 
|  | namespace tld_cleanup { | 
|  |  | 
|  | // Writes the list of domain rules contained in the 'rules' set to the | 
|  | // 'outfile', with each rule terminated by a LF.  The file must already have | 
|  | // been created with write access. | 
|  | bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) { | 
|  | std::string data; | 
|  | data.append("%{\n" | 
|  | "// Copyright 2012 The Chromium Authors. All rights reserved.\n" | 
|  | "// Use of this source code is governed by a BSD-style license " | 
|  | "that can be\n" | 
|  | "// found in the LICENSE file.\n\n" | 
|  | "// This file is generated by net/tools/tld_cleanup/.\n" | 
|  | "// DO NOT MANUALLY EDIT!\n" | 
|  | "%}\n" | 
|  | "struct DomainRule {\n" | 
|  | "  int name_offset;\n" | 
|  | "  int type;  // flags: 1: exception, 2: wildcard, 4: private\n" | 
|  | "};\n" | 
|  | "%%\n"); | 
|  |  | 
|  | for (auto i = rules.begin(); i != rules.end(); ++i) { | 
|  | data.append(i->first); | 
|  | data.append(", "); | 
|  | int type = 0; | 
|  | if (i->second.exception) { | 
|  | type = kExceptionRule; | 
|  | } else if (i->second.wildcard) { | 
|  | type = kWildcardRule; | 
|  | } | 
|  | if (i->second.is_private) { | 
|  | type += kPrivateRule; | 
|  | } | 
|  | data.append(base::IntToString(type)); | 
|  | data.append("\n"); | 
|  | } | 
|  |  | 
|  | data.append("%%\n"); | 
|  |  | 
|  | int written = base::WriteFile(outfile, | 
|  | data.data(), | 
|  | static_cast<int>(data.size())); | 
|  |  | 
|  | return written == static_cast<int>(data.size()); | 
|  | } | 
|  |  | 
|  | // Adjusts the rule to a standard form: removes single extraneous dots and | 
|  | // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as | 
|  | // valid; logs a warning and returns kWarning if it is probably invalid; and | 
|  | // logs an error and returns kError if the rule is (almost) certainly invalid. | 
|  | NormalizeResult NormalizeRule(std::string* domain, Rule* rule) { | 
|  | NormalizeResult result = kSuccess; | 
|  |  | 
|  | // Strip single leading and trailing dots. | 
|  | if (domain->at(0) == '.') | 
|  | domain->erase(0, 1); | 
|  | if (domain->empty()) { | 
|  | LOG(WARNING) << "Ignoring empty rule"; | 
|  | return kWarning; | 
|  | } | 
|  | if (domain->at(domain->size() - 1) == '.') | 
|  | domain->erase(domain->size() - 1, 1); | 
|  | if (domain->empty()) { | 
|  | LOG(WARNING) << "Ignoring empty rule"; | 
|  | return kWarning; | 
|  | } | 
|  |  | 
|  | // Allow single leading '*.' or '!', saved here so it's not canonicalized. | 
|  | size_t start_offset = 0; | 
|  | if (domain->at(0) == '!') { | 
|  | domain->erase(0, 1); | 
|  | rule->exception = true; | 
|  | } else if (domain->find("*.") == 0) { | 
|  | domain->erase(0, 2); | 
|  | rule->wildcard = true; | 
|  | } | 
|  | if (domain->empty()) { | 
|  | LOG(WARNING) << "Ignoring empty rule"; | 
|  | return kWarning; | 
|  | } | 
|  |  | 
|  | // Warn about additional '*.' or '!'. | 
|  | if (domain->find("*.", start_offset) != std::string::npos || | 
|  | domain->find('!', start_offset) != std::string::npos) { | 
|  | LOG(WARNING) << "Keeping probably invalid rule: " << *domain; | 
|  | result = kWarning; | 
|  | } | 
|  |  | 
|  | // Make a GURL and normalize it, then get the host back out. | 
|  | std::string url = "http://"; | 
|  | url.append(*domain); | 
|  | GURL gurl(url); | 
|  | const std::string& spec = gurl.possibly_invalid_spec(); | 
|  | url::Component host = gurl.parsed_for_possibly_invalid_spec().host; | 
|  | if (host.len < 0) { | 
|  | LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain; | 
|  | return kError; | 
|  | } | 
|  | if (!gurl.is_valid()) { | 
|  | LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain; | 
|  | result = kWarning; | 
|  | } | 
|  | domain->assign(spec.substr(host.begin, host.len)); | 
|  |  | 
|  | return result; | 
|  | } | 
|  |  | 
|  | NormalizeResult NormalizeDataToRuleMap(const std::string data, | 
|  | RuleMap* rules) { | 
|  | CHECK(rules); | 
|  | // We do a lot of string assignment during parsing, but simplicity is more | 
|  | // important than performance here. | 
|  | std::string domain; | 
|  | NormalizeResult result = kSuccess; | 
|  | size_t line_start = 0; | 
|  | size_t line_end = 0; | 
|  | bool is_private = false; | 
|  | RuleMap extra_rules; | 
|  | int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1; | 
|  | int end_private_length = arraysize(kEndPrivateDomainsComment) - 1; | 
|  | while (line_start < data.size()) { | 
|  | if (line_start + begin_private_length < data.size() && | 
|  | !data.compare(line_start, begin_private_length, | 
|  | kBeginPrivateDomainsComment)) { | 
|  | is_private = true; | 
|  | line_end = line_start + begin_private_length; | 
|  | } else if (line_start + end_private_length < data.size() && | 
|  | !data.compare(line_start, end_private_length, | 
|  | kEndPrivateDomainsComment)) { | 
|  | is_private = false; | 
|  | line_end = line_start + end_private_length; | 
|  | } else if (line_start + 1 < data.size() && | 
|  | data[line_start] == '/' && | 
|  | data[line_start + 1] == '/') { | 
|  | // Skip comments. | 
|  | line_end = data.find_first_of("\r\n", line_start); | 
|  | if (line_end == std::string::npos) | 
|  | line_end = data.size(); | 
|  | } else { | 
|  | // Truncate at first whitespace. | 
|  | line_end = data.find_first_of("\r\n \t", line_start); | 
|  | if (line_end == std::string::npos) | 
|  | line_end = data.size(); | 
|  | domain.assign(data, line_start, line_end - line_start); | 
|  |  | 
|  | Rule rule; | 
|  | rule.wildcard = false; | 
|  | rule.exception = false; | 
|  | rule.is_private = is_private; | 
|  | NormalizeResult new_result = NormalizeRule(&domain, &rule); | 
|  | if (new_result != kError) { | 
|  | // Check the existing rules to make sure we don't have an exception and | 
|  | // wildcard for the same rule, or that the same domain is listed as both | 
|  | // private and not private. If we did, we'd have to update our | 
|  | // parsing code to handle this case. | 
|  | CHECK(rules->find(domain) == rules->end()) | 
|  | << "Duplicate rule found for " << domain; | 
|  |  | 
|  | (*rules)[domain] = rule; | 
|  | // Add true TLD for multi-level rules.  We don't add them right now, in | 
|  | // case there's an exception or wild card that either exists or might be | 
|  | // added in a later iteration.  In those cases, there's no need to add | 
|  | // it and it would just slow down parsing the data. | 
|  | size_t tld_start = domain.find_last_of('.'); | 
|  | if (tld_start != std::string::npos && tld_start + 1 < domain.size()) { | 
|  | std::string extra_rule_domain = domain.substr(tld_start + 1); | 
|  | RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain); | 
|  | Rule extra_rule; | 
|  | extra_rule.exception = false; | 
|  | extra_rule.wildcard = false; | 
|  | if (iter == extra_rules.end()) { | 
|  | extra_rule.is_private = is_private; | 
|  | } else { | 
|  | // A rule already exists, so we ensure that if any of the entries is | 
|  | // not private the result should be that the entry is not private. | 
|  | // An example is .au which is not listed as a real TLD, but only | 
|  | // lists second-level domains such as com.au. Subdomains of .au | 
|  | // (eg. blogspot.com.au) are also listed in the private section, | 
|  | // which is processed later, so this ensures that the real TLD | 
|  | // (eg. .au) is listed as public. | 
|  | extra_rule.is_private = is_private && iter->second.is_private; | 
|  | } | 
|  | extra_rules[extra_rule_domain] = extra_rule; | 
|  | } | 
|  | } | 
|  | result = std::max(result, new_result); | 
|  | } | 
|  |  | 
|  | // Find beginning of next non-empty line. | 
|  | line_start = data.find_first_of("\r\n", line_end); | 
|  | if (line_start == std::string::npos) | 
|  | line_start = data.size(); | 
|  | line_start = data.find_first_not_of("\r\n", line_start); | 
|  | if (line_start == std::string::npos) | 
|  | line_start = data.size(); | 
|  | } | 
|  |  | 
|  | for (RuleMap::const_iterator iter = extra_rules.begin(); | 
|  | iter != extra_rules.end(); | 
|  | ++iter) { | 
|  | if (rules->find(iter->first) == rules->end()) { | 
|  | (*rules)[iter->first] = iter->second; | 
|  | } | 
|  | } | 
|  |  | 
|  | return result; | 
|  | } | 
|  |  | 
|  | NormalizeResult NormalizeFile(const base::FilePath& in_filename, | 
|  | const base::FilePath& out_filename) { | 
|  | RuleMap rules; | 
|  | std::string data; | 
|  | if (!base::ReadFileToString(in_filename, &data)) { | 
|  | LOG(ERROR) << "Unable to read file"; | 
|  | // We return success since we've already reported the error. | 
|  | return kSuccess; | 
|  | } | 
|  |  | 
|  | NormalizeResult result = NormalizeDataToRuleMap(data, &rules); | 
|  |  | 
|  | if (!WriteRules(rules, out_filename)) { | 
|  | LOG(ERROR) << "Error(s) writing output file"; | 
|  | result = kError; | 
|  | } | 
|  |  | 
|  | return result; | 
|  | } | 
|  |  | 
|  |  | 
|  | }  // namespace tld_cleanup | 
|  | }  // namespace net |