[format] Convert invalid whitespace to space when loading for format

I've been forced to edit a lot of Go code recently where tabs are much
loved, and my Vim occasionally gets horked and sneaks tabs into my .gn
files.

After that, running `gn format` on the BUILD.gn complains about

    ERROR at //.../BUILD.gn:5:1: Invalid token.
            sources = [
    ^
    You got a tab character in here. Tabs are evil. Convert to spaces.

It just seems petty to make me manually replace them! So only when
parsing for format purposes, silently transform tabs (and since we're
here vertical tabs and form feeds too) to spaces outside of quoted
strings. Maintain the error in normal operation though.

Change-Id: Ia4099be3bbd9621e75479d4bfa734862f6c5bccf
Reviewed-on: https://gn-review.googlesource.com/c/gn/+/6500
Reviewed-by: Brett Wilson <brettw@chromium.org>
Commit-Queue: Brett Wilson <brettw@chromium.org>
diff --git a/src/gn/command_format.cc b/src/gn/command_format.cc
index 15a0dc5..99e7a62 100644
--- a/src/gn/command_format.cc
+++ b/src/gn/command_format.cc
@@ -1084,22 +1084,6 @@
 
 }  // namespace
 
-bool FormatFileToString(Setup* setup,
-                        const SourceFile& file,
-                        TreeDumpMode dump_tree,
-                        std::string* output) {
-  Err err;
-  const ParseNode* parse_node =
-      setup->scheduler().input_file_manager()->SyncLoadFile(
-          LocationRange(), &setup->build_settings(), file, &err);
-  if (err.has_error()) {
-    err.PrintToStdout();
-    return false;
-  }
-  DoFormat(parse_node, dump_tree, output);
-  return true;
-}
-
 bool FormatStringToString(const std::string& input,
                           TreeDumpMode dump_tree,
                           std::string* output) {
@@ -1108,7 +1092,8 @@
   file.SetContents(input);
   Err err;
   // Tokenize.
-  std::vector<Token> tokens = Tokenizer::Tokenize(&file, &err);
+  std::vector<Token> tokens =
+      Tokenizer::Tokenize(&file, &err, WhitespaceTransform::kInvalidToSpace);
   if (err.has_error()) {
     err.PrintToStdout();
     return false;
@@ -1192,34 +1177,35 @@
       return 1;
     }
 
+    base::FilePath to_format = setup.build_settings().GetFullPath(file);
+    std::string original_contents;
+    if (!base::ReadFileToString(to_format, &original_contents)) {
+      Err(Location(),
+          std::string("Couldn't read \"") + FilePathToUTF8(to_format))
+          .PrintToStdout();
+      return 1;
+    }
+
     std::string output_string;
-    if (FormatFileToString(&setup, file, dump_tree, &output_string)) {
-      if (dump_tree == TreeDumpMode::kInactive) {
-        // Update the file in-place.
-        base::FilePath to_write = setup.build_settings().GetFullPath(file);
-        std::string original_contents;
-        if (!base::ReadFileToString(to_write, &original_contents)) {
-          Err(Location(), std::string("Couldn't read \"") +
-                              FilePathToUTF8(to_write) +
-                              std::string("\" for comparison."))
+    if (!FormatStringToString(original_contents, dump_tree, &output_string)) {
+      return 1;
+    }
+    if (dump_tree == TreeDumpMode::kInactive) {
+      // Update the file in-place.
+      if (dry_run)
+        return original_contents == output_string ? 0 : 2;
+      if (original_contents != output_string) {
+        if (base::WriteFile(to_format, output_string.data(),
+                            static_cast<int>(output_string.size())) == -1) {
+          Err(Location(),
+              std::string("Failed to write formatted output back to \"") +
+                  FilePathToUTF8(to_format) + std::string("\"."))
               .PrintToStdout();
           return 1;
         }
-        if (dry_run)
-          return original_contents == output_string ? 0 : 2;
-        if (original_contents != output_string) {
-          if (base::WriteFile(to_write, output_string.data(),
-                              static_cast<int>(output_string.size())) == -1) {
-            Err(Location(),
-                std::string("Failed to write formatted output back to \"") +
-                    FilePathToUTF8(to_write) + std::string("\"."))
-                .PrintToStdout();
-            return 1;
-          }
-          if (!quiet) {
-            printf("Wrote formatted to '%s'.\n",
-                   FilePathToUTF8(to_write).c_str());
-          }
+        if (!quiet) {
+          printf("Wrote formatted to '%s'.\n",
+                 FilePathToUTF8(to_format).c_str());
         }
       }
     }
diff --git a/src/gn/command_format_unittest.cc b/src/gn/command_format_unittest.cc
index 0420269..f334b70 100644
--- a/src/gn/command_format_unittest.cc
+++ b/src/gn/command_format_unittest.cc
@@ -17,24 +17,28 @@
 #define FORMAT_TEST(n)                                                      \
   TEST_F(FormatTest, n) {                                                   \
     ::Setup setup;                                                          \
+    std::string input;                                                      \
     std::string out;                                                        \
     std::string expected;                                                   \
     base::FilePath src_dir =                                                \
         GetExePath().DirName().Append(FILE_PATH_LITERAL(".."));             \
     base::SetCurrentDirectory(src_dir);                                     \
-    EXPECT_TRUE(commands::FormatFileToString(                               \
-        &setup, SourceFile("//src/gn/format_test_data/" #n ".gn"),          \
-        commands::TreeDumpMode::kInactive, &out));                          \
+    ASSERT_TRUE(base::ReadFileToString(                                     \
+        base::FilePath(FILE_PATH_LITERAL("src/gn/format_test_data/")        \
+                           FILE_PATH_LITERAL(#n) FILE_PATH_LITERAL(".gn")), \
+        &input));                                                           \
     ASSERT_TRUE(base::ReadFileToString(                                     \
         base::FilePath(FILE_PATH_LITERAL("src/gn/format_test_data/")        \
                            FILE_PATH_LITERAL(#n)                            \
                                FILE_PATH_LITERAL(".golden")),               \
         &expected));                                                        \
+    EXPECT_TRUE(commands::FormatStringToString(                             \
+        input, commands::TreeDumpMode::kInactive, &out));                   \
     EXPECT_EQ(expected, out);                                               \
     /* Make sure formatting the output doesn't cause further changes. */    \
     std::string out_again;                                                  \
-    EXPECT_TRUE(commands::FormatStringToString(out,                         \
-        commands::TreeDumpMode::kInactive, &out_again));                    \
+    EXPECT_TRUE(commands::FormatStringToString(                             \
+        out, commands::TreeDumpMode::kInactive, &out_again));               \
     ASSERT_EQ(out, out_again);                                              \
   }
 
@@ -116,3 +120,4 @@
 FORMAT_TEST(073)
 FORMAT_TEST(074)
 FORMAT_TEST(075)
+FORMAT_TEST(076)
diff --git a/src/gn/format_test_data/076.gn b/src/gn/format_test_data/076.gn
new file mode 100644
index 0000000..ea617dc
--- /dev/null
+++ b/src/gn/format_test_data/076.gn
@@ -0,0 +1,12 @@
+# Intentionally contains tabs which the formatter should convert to spaces, but
+# only where it's not relevant to the parse.
+
+if (true) {
+	sources = [
+	  "a.c", "a.h", "main.c"
+	]
+}
+
+if (false) {
+  embedded = "a tab	to leave	alone"
+}
diff --git a/src/gn/format_test_data/076.golden b/src/gn/format_test_data/076.golden
new file mode 100644
index 0000000..b682482
--- /dev/null
+++ b/src/gn/format_test_data/076.golden
@@ -0,0 +1,14 @@
+# Intentionally contains tabs which the formatter should convert to spaces, but
+# only where it's not relevant to the parse.
+
+if (true) {
+  sources = [
+    "a.c",
+    "a.h",
+    "main.c",
+  ]
+}
+
+if (false) {
+  embedded = "a tab	to leave	alone"
+}
diff --git a/src/gn/tokenizer.cc b/src/gn/tokenizer.cc
index 67c21d5..eec2349 100644
--- a/src/gn/tokenizer.cc
+++ b/src/gn/tokenizer.cc
@@ -68,14 +68,22 @@
 
 }  // namespace
 
-Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
-    : input_file_(input_file), input_(input_file->contents()), err_(err) {}
+Tokenizer::Tokenizer(const InputFile* input_file,
+                     Err* err,
+                     WhitespaceTransform whitespace_transform)
+    : input_file_(input_file),
+      input_(input_file->contents()),
+      err_(err),
+      whitespace_transform_(whitespace_transform) {}
 
 Tokenizer::~Tokenizer() = default;
 
 // static
-std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
-  Tokenizer t(input_file, err);
+std::vector<Token> Tokenizer::Tokenize(
+    const InputFile* input_file,
+    Err* err,
+    WhitespaceTransform whitespace_transform) {
+  Tokenizer t(input_file, err, whitespace_transform);
   return t.Run();
 }
 
@@ -339,7 +347,9 @@
   DCHECK(!at_end());
   char c = input_[cur_];
   // Note that tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are illegal.
-  return c == 0x0A || c == 0x0D || c == 0x20;
+  return c == 0x0A || c == 0x0D || c == 0x20 ||
+         (whitespace_transform_ == WhitespaceTransform::kInvalidToSpace &&
+          (c == 0x09 || c == 0x0B || c == 0x0C));
 }
 
 bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
diff --git a/src/gn/tokenizer.h b/src/gn/tokenizer.h
index a2153ca..fb74d2a 100644
--- a/src/gn/tokenizer.h
+++ b/src/gn/tokenizer.h
@@ -16,9 +16,22 @@
 
 class InputFile;
 
+// Tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are illegal in GN files.
+// Almost always these are errors. However, in the case of running the formatter
+// it's nice to convert these to spaces when encountered so that the input can
+// still be parsed and rewritten correctly by the formatter.
+enum class WhitespaceTransform {
+  kMaintainOriginalInput,
+  kInvalidToSpace,
+};
+
 class Tokenizer {
  public:
-  static std::vector<Token> Tokenize(const InputFile* input_file, Err* err);
+  static std::vector<Token> Tokenize(
+      const InputFile* input_file,
+      Err* err,
+      WhitespaceTransform whitespace_transform =
+          WhitespaceTransform::kMaintainOriginalInput);
 
   // Counts lines in the given buffer (the first line is "1") and returns
   // the byte offset of the beginning of that line, or (size_t)-1 if there
@@ -39,7 +52,9 @@
 
  private:
   // InputFile must outlive the tokenizer and all generated tokens.
-  Tokenizer(const InputFile* input_file, Err* err);
+  Tokenizer(const InputFile* input_file,
+            Err* err,
+            WhitespaceTransform whitespace_transform);
   ~Tokenizer();
 
   std::vector<Token> Run();
@@ -79,6 +94,7 @@
   const InputFile* input_file_;
   const std::string_view input_;
   Err* err_;
+  WhitespaceTransform whitespace_transform_;
   size_t cur_ = 0;  // Byte offset into input buffer.
 
   int line_number_ = 1;
diff --git a/src/gn/tokenizer_unittest.cc b/src/gn/tokenizer_unittest.cc
index b7423a8..d740d86 100644
--- a/src/gn/tokenizer_unittest.cc
+++ b/src/gn/tokenizer_unittest.cc
@@ -203,3 +203,35 @@
       "}",
       fn2));
 }
+
+TEST(Tokenizer, WhitespaceTransformMaintain) {
+  InputFile input(SourceFile("/test"));
+  input.SetContents("a\t2\v\"st\tuff\"\f{");
+
+  Err err;
+  std::vector<Token> results = Tokenizer::Tokenize(
+      &input, &err, WhitespaceTransform::kMaintainOriginalInput);
+  EXPECT_TRUE(err.has_error());
+  EXPECT_EQ(err.location().column_number(), 2);
+}
+
+TEST(Tokenizer, WhitespaceTransformToSpace) {
+  InputFile input(SourceFile("/test"));
+  input.SetContents("a\t2\v\"st\tuff\"\f{");
+
+  Err err;
+  std::vector<Token> results =
+      Tokenizer::Tokenize(&input, &err, WhitespaceTransform::kInvalidToSpace);
+  EXPECT_FALSE(err.has_error());
+  ASSERT_EQ(results.size(), 4u);
+  EXPECT_EQ(results[0].type(), Token::IDENTIFIER);
+  EXPECT_EQ(results[0].value(), "a");
+  EXPECT_EQ(results[1].type(), Token::INTEGER);
+  EXPECT_EQ(results[1].value(), "2");
+  EXPECT_EQ(results[2].type(), Token::STRING);
+  EXPECT_EQ(results[2].value(),
+            "\"st\tuff\"");  // Note, embedded \t not transformed.
+  EXPECT_EQ(results[3].type(), Token::LEFT_BRACE);
+  EXPECT_EQ(results[3].value(), "{");
+}
+