Speed up generation of input files list.

The list of all input files used by a project is very large
for projects like Chromium and especially Fuchsia.

This CL refactors the InputFileManager::GetAllPhysicalInputFiles()
method in order to avoid un-necessary memory allocations and
std::set<> operations.

This is done by introducing a new VectorSetSorter class in
"gn/vector_utils.h" that can be used to efficiently merge
and sort sets of items with minimal memory usage.

Change-Id: I2e4b22c7294e93ffaec28dda765d371b3f0cc301
Reviewed-on: https://gn-review.googlesource.com/c/gn/+/7941
Commit-Queue: David Turner <digit@google.com>
Reviewed-by: Scott Graham <scottmg@chromium.org>
diff --git a/build/gen.py b/build/gen.py
index 582562c..8083aac 100755
--- a/build/gen.py
+++ b/build/gen.py
@@ -675,6 +675,7 @@
         'src/gn/tokenizer_unittest.cc',
         'src/gn/unique_vector_unittest.cc',
         'src/gn/value_unittest.cc',
+        'src/gn/vector_utils_unittest.cc',
         'src/gn/visibility_unittest.cc',
         'src/gn/visual_studio_utils_unittest.cc',
         'src/gn/visual_studio_writer_unittest.cc',
diff --git a/src/gn/input_file_manager.cc b/src/gn/input_file_manager.cc
index 1c1fe01..7e0d7ee 100644
--- a/src/gn/input_file_manager.cc
+++ b/src/gn/input_file_manager.cc
@@ -14,6 +14,7 @@
 #include "gn/scope_per_file_provider.h"
 #include "gn/tokenizer.h"
 #include "gn/trace.h"
+#include "gn/vector_utils.h"
 
 namespace {
 
@@ -263,13 +264,13 @@
   return static_cast<int>(input_files_.size());
 }
 
-void InputFileManager::GetAllPhysicalInputFileNames(
-    std::vector<base::FilePath>* result) const {
+void InputFileManager::AddAllPhysicalInputFileNamesToVectorSetSorter(
+    VectorSetSorter<base::FilePath>* sorter) const {
   std::lock_guard<std::mutex> lock(lock_);
-  result->reserve(input_files_.size());
+
   for (const auto& file : input_files_) {
     if (!file.second->file.physical_name().empty())
-      result->push_back(file.second->file.physical_name());
+      sorter->Add(file.second->file.physical_name());
   }
 }
 
diff --git a/src/gn/input_file_manager.h b/src/gn/input_file_manager.h
index f5ac565..d5e5553 100644
--- a/src/gn/input_file_manager.h
+++ b/src/gn/input_file_manager.h
@@ -18,6 +18,7 @@
 #include "gn/input_file.h"
 #include "gn/parse_tree.h"
 #include "gn/settings.h"
+#include "gn/vector_utils.h"
 #include "util/auto_reset_event.h"
 
 class BuildSettings;
@@ -91,8 +92,16 @@
   // Does not count dynamic input.
   int GetInputFileCount() const;
 
-  // Fills the vector with all input files.
-  void GetAllPhysicalInputFileNames(std::vector<base::FilePath>* result) const;
+  // Add all physical input files to a VectorSetSorter instance.
+  // This allows fast merging and sorting with other file paths sets.
+  //
+  // This is more memory efficient than returning a vector of base::FilePath
+  // instance, especially with projects with a very large number of input files,
+  // but note that the VectorSetSorter only holds pointers to the
+  // items recorded in this InputFileManager instance, and it is up to the
+  // caller to ensure these will not change until the sorter is destroyed.
+  void AddAllPhysicalInputFileNamesToVectorSetSorter(
+      VectorSetSorter<base::FilePath>* sorter) const;
 
   void set_load_file_callback(SyncLoadFileCallback load_file_callback) {
     load_file_callback_ = load_file_callback;
diff --git a/src/gn/json_project_writer.cc b/src/gn/json_project_writer.cc
index 7c04ea1..c478379 100644
--- a/src/gn/json_project_writer.cc
+++ b/src/gn/json_project_writer.cc
@@ -209,26 +209,34 @@
       "default_toolchain",
       base::Value(default_toolchain_label.GetUserVisibleName(false)));
 
-  std::vector<base::FilePath> input_files;
-  g_scheduler->input_file_manager()->GetAllPhysicalInputFileNames(&input_files);
-
   // Other files read by the build.
   std::vector<base::FilePath> other_files = g_scheduler->GetGenDependencies();
 
-  // Sort the input files to order them deterministically.
-  // Additionally, remove duplicate filepaths that seem to creep in.
-  std::set<base::FilePath> fileset(input_files.begin(), input_files.end());
-  fileset.insert(other_files.begin(), other_files.end());
+  const InputFileManager* input_file_manager =
+      g_scheduler->input_file_manager();
 
   base::ListValue inputs;
-  const auto &build_path = build_settings->root_path();
-  for (const auto& other_file : fileset) {
-    std::string file;
-    if (MakeAbsolutePathRelativeIfPossible(FilePathToUTF8(build_path),
-                                           FilePathToUTF8(other_file), &file)) {
-      inputs.Append(std::make_unique<base::Value>(std::move(file)));
-    }
+  {
+    VectorSetSorter<base::FilePath> sorter(
+        input_file_manager->GetInputFileCount() + other_files.size());
+
+    input_file_manager->AddAllPhysicalInputFileNamesToVectorSetSorter(&sorter);
+
+    sorter.Add(other_files.begin(), other_files.end());
+
+    std::string build_path = FilePathToUTF8(build_settings->root_path());
+    auto item_callback = [&inputs,
+                          &build_path](const base::FilePath& input_file) {
+      std::string file;
+      if (MakeAbsolutePathRelativeIfPossible(
+              build_path, FilePathToUTF8(input_file), &file)) {
+        inputs.Append(std::make_unique<base::Value>(std::move(file)));
+      }
+    };
+
+    sorter.IterateOver(item_callback);
   }
+
   settings->SetKey("gen_input_files", std::move(inputs));
 
   auto output = std::make_unique<base::DictionaryValue>();
diff --git a/src/gn/ninja_build_writer.cc b/src/gn/ninja_build_writer.cc
index 12ccaa7..b5b491b 100644
--- a/src/gn/ninja_build_writer.cc
+++ b/src/gn/ninja_build_writer.cc
@@ -292,30 +292,35 @@
   // exist and will error if they don't. When files are listed in a depfile,
   // missing files are ignored.
   dep_out_ << "build.ninja:";
-  std::vector<base::FilePath> input_files;
-  g_scheduler->input_file_manager()->GetAllPhysicalInputFileNames(&input_files);
 
   // Other files read by the build.
   std::vector<base::FilePath> other_files = g_scheduler->GetGenDependencies();
 
-  // Sort the input files to order them deterministically.
-  // Additionally, remove duplicate filepaths that seem to creep in.
-  std::set<base::FilePath> fileset(input_files.begin(), input_files.end());
-  fileset.insert(other_files.begin(), other_files.end());
+  const InputFileManager* input_file_manager =
+      g_scheduler->input_file_manager();
+
+  VectorSetSorter<base::FilePath> sorter(
+      input_file_manager->GetInputFileCount() + other_files.size());
+
+  input_file_manager->AddAllPhysicalInputFileNamesToVectorSetSorter(&sorter);
+  sorter.Add(other_files.begin(), other_files.end());
 
   const base::FilePath build_path =
       build_settings_->build_dir().Resolve(build_settings_->root_path());
 
   EscapeOptions depfile_escape;
   depfile_escape.mode = ESCAPE_DEPFILE;
-  for (const auto& other_file : fileset) {
+  auto item_callback = [this, &depfile_escape,
+                        &build_path](const base::FilePath& input_file) {
     const base::FilePath file =
-        MakeAbsoluteFilePathRelativeIfPossible(build_path, other_file);
+        MakeAbsoluteFilePathRelativeIfPossible(build_path, input_file);
     dep_out_ << " ";
     EscapeStringToStream(dep_out_,
                          FilePathToUTF8(file.NormalizePathSeparatorsTo('/')),
                          depfile_escape);
-  }
+  };
+
+  sorter.IterateOver(item_callback);
 
   out_ << std::endl;
 }
diff --git a/src/gn/vector_utils.h b/src/gn/vector_utils.h
new file mode 100644
index 0000000..8366dda
--- /dev/null
+++ b/src/gn/vector_utils.h
@@ -0,0 +1,103 @@
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TOOLS_GN_VECTOR_UTILS_H_
+#define TOOLS_GN_VECTOR_UTILS_H_
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+// A VectorSetSorter is a convenience class used to efficiently sort and
+// de-duplicate one or more sets of items of type T, then iterate over the
+// result, or get it as a simple vector. Usage is the following:
+//
+// For performance reasons, this implementation only stores pointers to the
+// input items in order to minimize memory usage. Callers should ensure the
+// items added to this sorter do not change until the instance is destroyed.
+//
+//    1) Create instance, passing an optional initial capacity.
+//
+//    2) Add items using one of the Add() methods, as many times as
+//       necessary. Note that this records only pointers to said items
+//       so their content should not change until the instance is destroyed.
+//
+//    3) Call IteratorOver() to iterate over all sorted and de-duplicated
+//       items.
+//
+//    4) Alternatively, call AsVector() to return a new vector that contains
+//       copies of the original sorted / deduplicated items.
+//
+template <typename T>
+class VectorSetSorter {
+ public:
+  // Constructor. |initial_capacity| might be provided to minimize the number
+  // of allocations performed by this instance, if the maximum number of
+  // input items is known in advance.
+  VectorSetSorter(size_t initial_capacity = 0) {
+    ptrs_.reserve(initial_capacity);
+  }
+
+  // Add one single item to the sorter.
+  void Add(const T& item) {
+    ptrs_.push_back(&item);
+    sorted_ = false;
+  }
+
+  // Add one range of items to the sorter.
+  void Add(typename std::vector<T>::const_iterator begin,
+           typename std::vector<T>::const_iterator end) {
+    for (; begin != end; ++begin)
+      ptrs_.push_back(&(*begin));
+    sorted_ = false;
+  }
+
+  // Add one range of items to the sorter.
+  void Add(const T* start, const T* end) {
+    for (; start != end; ++start)
+      ptrs_.push_back(start);
+    sorted_ = false;
+  }
+
+  // Iterate over all sorted items, removing duplicates from the loop.
+  // |item_callback| is a callable that will be invoked for each item in the
+  // result.
+  template <typename ITEM_CALLBACK>
+  void IterateOver(ITEM_CALLBACK item_callback) {
+    if (!sorted_) {
+      Sort();
+    }
+    const T* prev_item = nullptr;
+    for (const T* item : ptrs_) {
+      if (!prev_item || *prev_item != *item) {
+        item_callback(*item);
+        prev_item = item;
+      }
+    }
+  }
+
+  // Return the sorted and de-duplicated resulting set as a vector of items.
+  // Note that this copies the input items.
+  std::vector<T> AsVector() {
+    std::vector<T> result;
+    result.reserve(ptrs_.size());
+    IterateOver([&result](const T& item) { result.push_back(item); });
+    return result;
+  }
+
+ private:
+  // Sort all items previously added to this instance.
+  // Must be called after adding all desired items, and before
+  // calling IterateOver() or AsVector().
+  void Sort() {
+    std::sort(ptrs_.begin(), ptrs_.end(),
+              [](const T* a, const T* b) { return *a < *b; });
+    sorted_ = true;
+  }
+
+  std::vector<const T*> ptrs_;
+  bool sorted_ = false;
+};
+
+#endif  // TOOLS_GN_VECTOR_UTILS_H_
diff --git a/src/gn/vector_utils_unittest.cc b/src/gn/vector_utils_unittest.cc
new file mode 100644
index 0000000..c7cee01
--- /dev/null
+++ b/src/gn/vector_utils_unittest.cc
@@ -0,0 +1,47 @@
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "gn/vector_utils.h"
+
+#include "util/test/test.h"
+
+#include <string>
+
+TEST(VectorSetSorter, AsVectorWithStrings) {
+  VectorSetSorter<std::string> sorter;
+
+  std::vector<std::string> input = {
+      "World!", "Hello", "bonjour", "Hello", "monde!", "World!",
+  };
+
+  sorter.Add(input.begin(), input.end());
+  auto result = sorter.AsVector();
+
+  ASSERT_EQ(result.size(), 4u) << result.size();
+  EXPECT_STREQ(result[0].c_str(), "Hello");
+  EXPECT_STREQ(result[1].c_str(), "World!");
+  EXPECT_STREQ(result[2].c_str(), "bonjour");
+  EXPECT_STREQ(result[3].c_str(), "monde!");
+}
+
+TEST(VectorSetSorter, IterateOverWithStrings) {
+  VectorSetSorter<std::string> sorter;
+
+  std::vector<std::string> input = {
+      "World!", "Hello", "bonjour", "Hello", "monde!", "World!",
+  };
+
+  sorter.Add(input.begin(), input.end());
+
+  std::vector<std::string> result;
+
+  sorter.IterateOver(
+      [&result](const std::string& str) { result.push_back(str); });
+
+  ASSERT_EQ(result.size(), 4u) << result.size();
+  EXPECT_STREQ(result[0].c_str(), "Hello");
+  EXPECT_STREQ(result[1].c_str(), "World!");
+  EXPECT_STREQ(result[2].c_str(), "bonjour");
+  EXPECT_STREQ(result[3].c_str(), "monde!");
+}