Speed up generation of input files list.
The list of all input files used by a project is very large
for projects like Chromium and especially Fuchsia.
This CL refactors the InputFileManager::GetAllPhysicalInputFiles()
method in order to avoid un-necessary memory allocations and
std::set<> operations.
This is done by introducing a new VectorSetSorter class in
"gn/vector_utils.h" that can be used to efficiently merge
and sort sets of items with minimal memory usage.
Change-Id: I2e4b22c7294e93ffaec28dda765d371b3f0cc301
Reviewed-on: https://gn-review.googlesource.com/c/gn/+/7941
Commit-Queue: David Turner <digit@google.com>
Reviewed-by: Scott Graham <scottmg@chromium.org>
diff --git a/build/gen.py b/build/gen.py
index 582562c..8083aac 100755
--- a/build/gen.py
+++ b/build/gen.py
@@ -675,6 +675,7 @@
'src/gn/tokenizer_unittest.cc',
'src/gn/unique_vector_unittest.cc',
'src/gn/value_unittest.cc',
+ 'src/gn/vector_utils_unittest.cc',
'src/gn/visibility_unittest.cc',
'src/gn/visual_studio_utils_unittest.cc',
'src/gn/visual_studio_writer_unittest.cc',
diff --git a/src/gn/input_file_manager.cc b/src/gn/input_file_manager.cc
index 1c1fe01..7e0d7ee 100644
--- a/src/gn/input_file_manager.cc
+++ b/src/gn/input_file_manager.cc
@@ -14,6 +14,7 @@
#include "gn/scope_per_file_provider.h"
#include "gn/tokenizer.h"
#include "gn/trace.h"
+#include "gn/vector_utils.h"
namespace {
@@ -263,13 +264,13 @@
return static_cast<int>(input_files_.size());
}
-void InputFileManager::GetAllPhysicalInputFileNames(
- std::vector<base::FilePath>* result) const {
+void InputFileManager::AddAllPhysicalInputFileNamesToVectorSetSorter(
+ VectorSetSorter<base::FilePath>* sorter) const {
std::lock_guard<std::mutex> lock(lock_);
- result->reserve(input_files_.size());
+
for (const auto& file : input_files_) {
if (!file.second->file.physical_name().empty())
- result->push_back(file.second->file.physical_name());
+ sorter->Add(file.second->file.physical_name());
}
}
diff --git a/src/gn/input_file_manager.h b/src/gn/input_file_manager.h
index f5ac565..d5e5553 100644
--- a/src/gn/input_file_manager.h
+++ b/src/gn/input_file_manager.h
@@ -18,6 +18,7 @@
#include "gn/input_file.h"
#include "gn/parse_tree.h"
#include "gn/settings.h"
+#include "gn/vector_utils.h"
#include "util/auto_reset_event.h"
class BuildSettings;
@@ -91,8 +92,16 @@
// Does not count dynamic input.
int GetInputFileCount() const;
- // Fills the vector with all input files.
- void GetAllPhysicalInputFileNames(std::vector<base::FilePath>* result) const;
+ // Add all physical input files to a VectorSetSorter instance.
+ // This allows fast merging and sorting with other file paths sets.
+ //
+ // This is more memory efficient than returning a vector of base::FilePath
+ // instance, especially with projects with a very large number of input files,
+ // but note that the VectorSetSorter only holds pointers to the
+ // items recorded in this InputFileManager instance, and it is up to the
+ // caller to ensure these will not change until the sorter is destroyed.
+ void AddAllPhysicalInputFileNamesToVectorSetSorter(
+ VectorSetSorter<base::FilePath>* sorter) const;
void set_load_file_callback(SyncLoadFileCallback load_file_callback) {
load_file_callback_ = load_file_callback;
diff --git a/src/gn/json_project_writer.cc b/src/gn/json_project_writer.cc
index 7c04ea1..c478379 100644
--- a/src/gn/json_project_writer.cc
+++ b/src/gn/json_project_writer.cc
@@ -209,26 +209,34 @@
"default_toolchain",
base::Value(default_toolchain_label.GetUserVisibleName(false)));
- std::vector<base::FilePath> input_files;
- g_scheduler->input_file_manager()->GetAllPhysicalInputFileNames(&input_files);
-
// Other files read by the build.
std::vector<base::FilePath> other_files = g_scheduler->GetGenDependencies();
- // Sort the input files to order them deterministically.
- // Additionally, remove duplicate filepaths that seem to creep in.
- std::set<base::FilePath> fileset(input_files.begin(), input_files.end());
- fileset.insert(other_files.begin(), other_files.end());
+ const InputFileManager* input_file_manager =
+ g_scheduler->input_file_manager();
base::ListValue inputs;
- const auto &build_path = build_settings->root_path();
- for (const auto& other_file : fileset) {
- std::string file;
- if (MakeAbsolutePathRelativeIfPossible(FilePathToUTF8(build_path),
- FilePathToUTF8(other_file), &file)) {
- inputs.Append(std::make_unique<base::Value>(std::move(file)));
- }
+ {
+ VectorSetSorter<base::FilePath> sorter(
+ input_file_manager->GetInputFileCount() + other_files.size());
+
+ input_file_manager->AddAllPhysicalInputFileNamesToVectorSetSorter(&sorter);
+
+ sorter.Add(other_files.begin(), other_files.end());
+
+ std::string build_path = FilePathToUTF8(build_settings->root_path());
+ auto item_callback = [&inputs,
+ &build_path](const base::FilePath& input_file) {
+ std::string file;
+ if (MakeAbsolutePathRelativeIfPossible(
+ build_path, FilePathToUTF8(input_file), &file)) {
+ inputs.Append(std::make_unique<base::Value>(std::move(file)));
+ }
+ };
+
+ sorter.IterateOver(item_callback);
}
+
settings->SetKey("gen_input_files", std::move(inputs));
auto output = std::make_unique<base::DictionaryValue>();
diff --git a/src/gn/ninja_build_writer.cc b/src/gn/ninja_build_writer.cc
index 12ccaa7..b5b491b 100644
--- a/src/gn/ninja_build_writer.cc
+++ b/src/gn/ninja_build_writer.cc
@@ -292,30 +292,35 @@
// exist and will error if they don't. When files are listed in a depfile,
// missing files are ignored.
dep_out_ << "build.ninja:";
- std::vector<base::FilePath> input_files;
- g_scheduler->input_file_manager()->GetAllPhysicalInputFileNames(&input_files);
// Other files read by the build.
std::vector<base::FilePath> other_files = g_scheduler->GetGenDependencies();
- // Sort the input files to order them deterministically.
- // Additionally, remove duplicate filepaths that seem to creep in.
- std::set<base::FilePath> fileset(input_files.begin(), input_files.end());
- fileset.insert(other_files.begin(), other_files.end());
+ const InputFileManager* input_file_manager =
+ g_scheduler->input_file_manager();
+
+ VectorSetSorter<base::FilePath> sorter(
+ input_file_manager->GetInputFileCount() + other_files.size());
+
+ input_file_manager->AddAllPhysicalInputFileNamesToVectorSetSorter(&sorter);
+ sorter.Add(other_files.begin(), other_files.end());
const base::FilePath build_path =
build_settings_->build_dir().Resolve(build_settings_->root_path());
EscapeOptions depfile_escape;
depfile_escape.mode = ESCAPE_DEPFILE;
- for (const auto& other_file : fileset) {
+ auto item_callback = [this, &depfile_escape,
+ &build_path](const base::FilePath& input_file) {
const base::FilePath file =
- MakeAbsoluteFilePathRelativeIfPossible(build_path, other_file);
+ MakeAbsoluteFilePathRelativeIfPossible(build_path, input_file);
dep_out_ << " ";
EscapeStringToStream(dep_out_,
FilePathToUTF8(file.NormalizePathSeparatorsTo('/')),
depfile_escape);
- }
+ };
+
+ sorter.IterateOver(item_callback);
out_ << std::endl;
}
diff --git a/src/gn/vector_utils.h b/src/gn/vector_utils.h
new file mode 100644
index 0000000..8366dda
--- /dev/null
+++ b/src/gn/vector_utils.h
@@ -0,0 +1,103 @@
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TOOLS_GN_VECTOR_UTILS_H_
+#define TOOLS_GN_VECTOR_UTILS_H_
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+// A VectorSetSorter is a convenience class used to efficiently sort and
+// de-duplicate one or more sets of items of type T, then iterate over the
+// result, or get it as a simple vector. Usage is the following:
+//
+// For performance reasons, this implementation only stores pointers to the
+// input items in order to minimize memory usage. Callers should ensure the
+// items added to this sorter do not change until the instance is destroyed.
+//
+// 1) Create instance, passing an optional initial capacity.
+//
+// 2) Add items using one of the Add() methods, as many times as
+// necessary. Note that this records only pointers to said items
+// so their content should not change until the instance is destroyed.
+//
+// 3) Call IteratorOver() to iterate over all sorted and de-duplicated
+// items.
+//
+// 4) Alternatively, call AsVector() to return a new vector that contains
+// copies of the original sorted / deduplicated items.
+//
+template <typename T>
+class VectorSetSorter {
+ public:
+ // Constructor. |initial_capacity| might be provided to minimize the number
+ // of allocations performed by this instance, if the maximum number of
+ // input items is known in advance.
+ VectorSetSorter(size_t initial_capacity = 0) {
+ ptrs_.reserve(initial_capacity);
+ }
+
+ // Add one single item to the sorter.
+ void Add(const T& item) {
+ ptrs_.push_back(&item);
+ sorted_ = false;
+ }
+
+ // Add one range of items to the sorter.
+ void Add(typename std::vector<T>::const_iterator begin,
+ typename std::vector<T>::const_iterator end) {
+ for (; begin != end; ++begin)
+ ptrs_.push_back(&(*begin));
+ sorted_ = false;
+ }
+
+ // Add one range of items to the sorter.
+ void Add(const T* start, const T* end) {
+ for (; start != end; ++start)
+ ptrs_.push_back(start);
+ sorted_ = false;
+ }
+
+ // Iterate over all sorted items, removing duplicates from the loop.
+ // |item_callback| is a callable that will be invoked for each item in the
+ // result.
+ template <typename ITEM_CALLBACK>
+ void IterateOver(ITEM_CALLBACK item_callback) {
+ if (!sorted_) {
+ Sort();
+ }
+ const T* prev_item = nullptr;
+ for (const T* item : ptrs_) {
+ if (!prev_item || *prev_item != *item) {
+ item_callback(*item);
+ prev_item = item;
+ }
+ }
+ }
+
+ // Return the sorted and de-duplicated resulting set as a vector of items.
+ // Note that this copies the input items.
+ std::vector<T> AsVector() {
+ std::vector<T> result;
+ result.reserve(ptrs_.size());
+ IterateOver([&result](const T& item) { result.push_back(item); });
+ return result;
+ }
+
+ private:
+ // Sort all items previously added to this instance.
+ // Must be called after adding all desired items, and before
+ // calling IterateOver() or AsVector().
+ void Sort() {
+ std::sort(ptrs_.begin(), ptrs_.end(),
+ [](const T* a, const T* b) { return *a < *b; });
+ sorted_ = true;
+ }
+
+ std::vector<const T*> ptrs_;
+ bool sorted_ = false;
+};
+
+#endif // TOOLS_GN_VECTOR_UTILS_H_
diff --git a/src/gn/vector_utils_unittest.cc b/src/gn/vector_utils_unittest.cc
new file mode 100644
index 0000000..c7cee01
--- /dev/null
+++ b/src/gn/vector_utils_unittest.cc
@@ -0,0 +1,47 @@
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "gn/vector_utils.h"
+
+#include "util/test/test.h"
+
+#include <string>
+
+TEST(VectorSetSorter, AsVectorWithStrings) {
+ VectorSetSorter<std::string> sorter;
+
+ std::vector<std::string> input = {
+ "World!", "Hello", "bonjour", "Hello", "monde!", "World!",
+ };
+
+ sorter.Add(input.begin(), input.end());
+ auto result = sorter.AsVector();
+
+ ASSERT_EQ(result.size(), 4u) << result.size();
+ EXPECT_STREQ(result[0].c_str(), "Hello");
+ EXPECT_STREQ(result[1].c_str(), "World!");
+ EXPECT_STREQ(result[2].c_str(), "bonjour");
+ EXPECT_STREQ(result[3].c_str(), "monde!");
+}
+
+TEST(VectorSetSorter, IterateOverWithStrings) {
+ VectorSetSorter<std::string> sorter;
+
+ std::vector<std::string> input = {
+ "World!", "Hello", "bonjour", "Hello", "monde!", "World!",
+ };
+
+ sorter.Add(input.begin(), input.end());
+
+ std::vector<std::string> result;
+
+ sorter.IterateOver(
+ [&result](const std::string& str) { result.push_back(str); });
+
+ ASSERT_EQ(result.size(), 4u) << result.size();
+ EXPECT_STREQ(result[0].c_str(), "Hello");
+ EXPECT_STREQ(result[1].c_str(), "World!");
+ EXPECT_STREQ(result[2].c_str(), "bonjour");
+ EXPECT_STREQ(result[3].c_str(), "monde!");
+}