Import cobalt 25.master.0.1034729
diff --git a/third_party/google_benchmark/.travis-libcxx-setup.sh b/third_party/google_benchmark/.travis-libcxx-setup.sh
deleted file mode 100644
index a591743..0000000
--- a/third_party/google_benchmark/.travis-libcxx-setup.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env bash
-
-# Install a newer CMake version
-curl -sSL https://cmake.org/files/v3.6/cmake-3.6.1-Linux-x86_64.sh -o install-cmake.sh
-chmod +x install-cmake.sh
-sudo ./install-cmake.sh --prefix=/usr/local --skip-license
-
-# Checkout LLVM sources
-git clone --depth=1 https://github.com/llvm-mirror/llvm.git llvm-source
-git clone --depth=1 https://github.com/llvm-mirror/libcxx.git llvm-source/projects/libcxx
-git clone --depth=1 https://github.com/llvm-mirror/libcxxabi.git llvm-source/projects/libcxxabi
-
-# Setup libc++ options
-if [ -z "$BUILD_32_BITS" ]; then
-  export BUILD_32_BITS=OFF && echo disabling 32 bit build
-fi
-
-# Build and install libc++ (Use unstable ABI for better sanitizer coverage)
-mkdir llvm-build && cd llvm-build
-cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} \
-      -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=/usr \
-      -DLIBCXX_ABI_UNSTABLE=ON \
-      -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER} \
-      -DLLVM_BUILD_32_BITS=${BUILD_32_BITS} \
-      ../llvm-source
-make cxx -j2
-sudo make install-cxxabi install-cxx
-cd ../
diff --git a/third_party/google_benchmark/BUILD.bazel b/third_party/google_benchmark/BUILD.bazel
deleted file mode 100644
index d97a019..0000000
--- a/third_party/google_benchmark/BUILD.bazel
+++ /dev/null
@@ -1,44 +0,0 @@
-licenses(["notice"])
-
-config_setting(
-    name = "windows",
-    values = {
-        "cpu": "x64_windows",
-    },
-    visibility = [":__subpackages__"],
-)
-
-load("@rules_cc//cc:defs.bzl", "cc_library")
-
-cc_library(
-    name = "benchmark",
-    srcs = glob(
-        [
-            "src/*.cc",
-            "src/*.h",
-        ],
-        exclude = ["src/benchmark_main.cc"],
-    ),
-    hdrs = ["include/benchmark/benchmark.h"],
-    linkopts = select({
-        ":windows": ["-DEFAULTLIB:shlwapi.lib"],
-        "//conditions:default": ["-pthread"],
-    }),
-    strip_include_prefix = "include",
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "benchmark_main",
-    srcs = ["src/benchmark_main.cc"],
-    hdrs = ["include/benchmark/benchmark.h"],
-    strip_include_prefix = "include",
-    visibility = ["//visibility:public"],
-    deps = [":benchmark"],
-)
-
-cc_library(
-    name = "benchmark_internal_headers",
-    hdrs = glob(["src/*.h"]),
-    visibility = ["//test:__pkg__"],
-)
diff --git a/third_party/google_benchmark/BUILD.gn b/third_party/google_benchmark/BUILD.gn
index c274b73..2f26cfe 100644
--- a/third_party/google_benchmark/BUILD.gn
+++ b/third_party/google_benchmark/BUILD.gn
@@ -1,42 +1,89 @@
-# Copyright 2021 The Cobalt Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Copyright 2019 The Chromium Authors
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
 
-static_library("google_benchmark") {
+config("benchmark_config") {
+  include_dirs = [ "src/include" ]
+
+  if (!is_component_build) {
+    defines = [ "BENCHMARK_STATIC_DEFINE" ]
+  }
+}
+
+component("google_benchmark") {
   testonly = true
 
-  sources = [
-    "src/benchmark.cc",
-    "src/benchmark_api_internal.cc",
-    "src/benchmark_name.cc",
-    "src/benchmark_register.cc",
-    "src/benchmark_runner.cc",
-    "src/colorprint_starboard.cc",
-    "src/commandlineflags.cc",
-    "src/complexity.cc",
-    "src/console_reporter.cc",
-    "src/counter.cc",
-    "src/csv_reporter.cc",
-    "src/json_reporter.cc",
-    "src/reporter.cc",
-    "src/sleep.cc",
-    "src/statistics.cc",
-    "src/string_util.cc",
-    "src/sysinfo.cc",
-    "src/timers.cc",
+  public = [
+    "src/include/benchmark/benchmark.h",
+    "src/include/benchmark/export.h",
   ]
 
-  include_dirs = [ "include" ]
+  sources = [
+    "src/src/arraysize.h",
+    "src/src/benchmark.cc",
+    "src/src/benchmark_api_internal.cc",
+    "src/src/benchmark_api_internal.h",
+    "src/src/benchmark_name.cc",
+    "src/src/benchmark_register.cc",
+    "src/src/benchmark_register.h",
+    "src/src/benchmark_runner.cc",
+    "src/src/benchmark_runner.h",
+    "src/src/check.cc",
+    "src/src/check.h",
+    "src/src/colorprint.cc",
+    "src/src/colorprint.h",
+    "src/src/commandlineflags.cc",
+    "src/src/commandlineflags.h",
+    "src/src/complexity.cc",
+    "src/src/complexity.h",
+    "src/src/console_reporter.cc",
+    "src/src/counter.cc",
+    "src/src/counter.h",
+    "src/src/csv_reporter.cc",
+    "src/src/cycleclock.h",
+    "src/src/internal_macros.h",
+    "src/src/json_reporter.cc",
+    "src/src/log.h",
+    "src/src/mutex.h",
+    "src/src/perf_counters.cc",
+    "src/src/perf_counters.h",
+    "src/src/re.h",
+    "src/src/reporter.cc",
+    "src/src/statistics.cc",
+    "src/src/statistics.h",
+    "src/src/string_util.cc",
+    "src/src/string_util.h",
+    "src/src/sysinfo.cc",
+    "src/src/thread_manager.h",
+    "src/src/thread_timer.h",
+    "src/src/timers.cc",
+    "src/src/timers.h",
+  ]
 
-  public_deps = [ "//starboard/common" ]
+  all_dependent_configs = [ ":benchmark_config" ]
+
+  configs -= [ "//build/config/compiler:chromium_code" ]
+  configs += [
+    "//build/config/compiler:no_chromium_code",
+  ]
+
+  if (is_win) {
+    configs -= [ "//build/config/win:nominmax" ]
+  }
+
+  defines = [
+    "benchmark_EXPORTS=1",
+
+    # Tell benchmark to always use standard (std::regex) regular expressions.
+    "HAVE_GNU_POSIX_REGEX=0",
+    "HAVE_POSIX_REGEX=0",
+    "HAVE_STD_REGEX=1",
+  ]
+}
+
+component("benchmark_main") {
+  testonly = true
+  sources = [ "src/src/benchmark_main.cc" ]
+  defines = [ "benchmark_EXPORTS=1" ]
+  deps = [ ":google_benchmark" ]
 }
diff --git a/third_party/google_benchmark/CMakeLists.txt b/third_party/google_benchmark/CMakeLists.txt
deleted file mode 100644
index 8cfe125..0000000
--- a/third_party/google_benchmark/CMakeLists.txt
+++ /dev/null
@@ -1,277 +0,0 @@
-cmake_minimum_required (VERSION 3.5.1)
-
-foreach(p
-    CMP0048 # OK to clear PROJECT_VERSION on project()
-    CMP0054 # CMake 3.1
-    CMP0056 # export EXE_LINKER_FLAGS to try_run
-    CMP0057 # Support no if() IN_LIST operator
-    CMP0063 # Honor visibility properties for all targets
-    )
-  if(POLICY ${p})
-    cmake_policy(SET ${p} NEW)
-  endif()
-endforeach()
-
-project (benchmark CXX)
-
-option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
-option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
-option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF)
-option(BENCHMARK_USE_LIBCXX "Build and test using libc++ as the standard library." OFF)
-if(NOT MSVC)
-  option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library." OFF)
-else()
-  set(BENCHMARK_BUILD_32_BITS OFF CACHE BOOL "Build a 32 bit version of the library - unsupported when using MSVC)" FORCE)
-endif()
-option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark. (Projects embedding benchmark may want to turn this OFF.)" ON)
-
-# Allow unmet dependencies to be met using CMake's ExternalProject mechanics, which
-# may require downloading the source code.
-option(BENCHMARK_DOWNLOAD_DEPENDENCIES "Allow the downloading and in-tree building of unmet dependencies" OFF)
-
-# This option can be used to disable building and running unit tests which depend on gtest
-# in cases where it is not possible to build or find a valid version of gtest.
-option(BENCHMARK_ENABLE_GTEST_TESTS "Enable building the unit tests which depend on gtest" ON)
-
-set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-set(ENABLE_ASSEMBLY_TESTS_DEFAULT OFF)
-function(should_enable_assembly_tests)
-  if(CMAKE_BUILD_TYPE)
-    string(TOLOWER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_LOWER)
-    if (${CMAKE_BUILD_TYPE_LOWER} MATCHES "coverage")
-      # FIXME: The --coverage flag needs to be removed when building assembly
-      # tests for this to work.
-      return()
-    endif()
-  endif()
-  if (MSVC)
-    return()
-  elseif(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
-    return()
-  elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
-    # FIXME: Make these work on 32 bit builds
-    return()
-  elseif(BENCHMARK_BUILD_32_BITS)
-     # FIXME: Make these work on 32 bit builds
-    return()
-  endif()
-  find_program(LLVM_FILECHECK_EXE FileCheck)
-  if (LLVM_FILECHECK_EXE)
-    set(LLVM_FILECHECK_EXE "${LLVM_FILECHECK_EXE}" CACHE PATH "llvm filecheck" FORCE)
-    message(STATUS "LLVM FileCheck Found: ${LLVM_FILECHECK_EXE}")
-  else()
-    message(STATUS "Failed to find LLVM FileCheck")
-    return()
-  endif()
-  set(ENABLE_ASSEMBLY_TESTS_DEFAULT ON PARENT_SCOPE)
-endfunction()
-should_enable_assembly_tests()
-
-# This option disables the building and running of the assembly verification tests
-option(BENCHMARK_ENABLE_ASSEMBLY_TESTS "Enable building and running the assembly tests"
-    ${ENABLE_ASSEMBLY_TESTS_DEFAULT})
-
-# Make sure we can import out CMake functions
-list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")
-list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
-
-
-# Read the git tags to determine the project version
-include(GetGitVersion)
-get_git_version(GIT_VERSION)
-
-# Tell the user what versions we are using
-string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" VERSION ${GIT_VERSION})
-message(STATUS "Version: ${VERSION}")
-
-# The version of the libraries
-set(GENERIC_LIB_VERSION ${VERSION})
-string(SUBSTRING ${VERSION} 0 1 GENERIC_LIB_SOVERSION)
-
-# Import our CMake modules
-include(CheckCXXCompilerFlag)
-include(AddCXXCompilerFlag)
-include(CXXFeatureCheck)
-
-if (BENCHMARK_BUILD_32_BITS)
-  add_required_cxx_compiler_flag(-m32)
-endif()
-
-if (MSVC)
-  # Turn compiler warnings up to 11
-  string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4")
-  add_definitions(-D_CRT_SECURE_NO_WARNINGS)
-
-  if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
-    add_cxx_compiler_flag(-EHs-)
-    add_cxx_compiler_flag(-EHa-)
-    add_definitions(-D_HAS_EXCEPTIONS=0)
-  endif()
-  # Link time optimisation
-  if (BENCHMARK_ENABLE_LTO)
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /GL")
-    set(CMAKE_STATIC_LINKER_FLAGS_RELEASE "${CMAKE_STATIC_LINKER_FLAGS_RELEASE} /LTCG")
-    set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /LTCG")
-    set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LTCG")
-
-    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /GL")
-    string(REGEX REPLACE "[-/]INCREMENTAL" "/INCREMENTAL:NO" CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO}")
-    set(CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO} /LTCG")
-    string(REGEX REPLACE "[-/]INCREMENTAL" "/INCREMENTAL:NO" CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO}")
-    set(CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO} /LTCG")
-    string(REGEX REPLACE "[-/]INCREMENTAL" "/INCREMENTAL:NO" CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO}")
-    set(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO} /LTCG")
-
-    set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /GL")
-    set(CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL "${CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL} /LTCG")
-    set(CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL "${CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL} /LTCG")
-    set(CMAKE_EXE_LINKER_FLAGS_MINSIZEREL "${CMAKE_EXE_LINKER_FLAGS_MINSIZEREL} /LTCG")
-  endif()
-else()
-  # Try and enable C++11. Don't use C++14 because it doesn't work in some
-  # configurations.
-  add_cxx_compiler_flag(-std=c++11)
-  if (NOT HAVE_CXX_FLAG_STD_CXX11)
-    add_cxx_compiler_flag(-std=c++0x)
-  endif()
-
-  # Turn compiler warnings up to 11
-  add_cxx_compiler_flag(-Wall)
-  add_cxx_compiler_flag(-Wextra)
-  add_cxx_compiler_flag(-Wshadow)
-  add_cxx_compiler_flag(-Werror RELEASE)
-  add_cxx_compiler_flag(-Werror RELWITHDEBINFO)
-  add_cxx_compiler_flag(-Werror MINSIZEREL)
-  add_cxx_compiler_flag(-pedantic)
-  add_cxx_compiler_flag(-pedantic-errors)
-  add_cxx_compiler_flag(-Wshorten-64-to-32)
-  add_cxx_compiler_flag(-fstrict-aliasing)
-  # Disable warnings regarding deprecated parts of the library while building
-  # and testing those parts of the library.
-  add_cxx_compiler_flag(-Wno-deprecated-declarations)
-  if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
-    # Intel silently ignores '-Wno-deprecated-declarations',
-    # warning no. 1786 must be explicitly disabled.
-    # See #631 for rationale.
-    add_cxx_compiler_flag(-wd1786)
-  endif()
-  # Disable deprecation warnings for release builds (when -Werror is enabled).
-  add_cxx_compiler_flag(-Wno-deprecated RELEASE)
-  add_cxx_compiler_flag(-Wno-deprecated RELWITHDEBINFO)
-  add_cxx_compiler_flag(-Wno-deprecated MINSIZEREL)
-  if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
-    add_cxx_compiler_flag(-fno-exceptions)
-  endif()
-
-  if (HAVE_CXX_FLAG_FSTRICT_ALIASING)
-    if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel") #ICC17u2: Many false positives for Wstrict-aliasing
-      add_cxx_compiler_flag(-Wstrict-aliasing)
-    endif()
-  endif()
-  # ICC17u2: overloaded virtual function "benchmark::Fixture::SetUp" is only partially overridden
-  # (because of deprecated overload)
-  add_cxx_compiler_flag(-wd654)
-  add_cxx_compiler_flag(-Wthread-safety)
-  if (HAVE_CXX_FLAG_WTHREAD_SAFETY)
-    cxx_feature_check(THREAD_SAFETY_ATTRIBUTES)
-  endif()
-
-  # On most UNIX like platforms g++ and clang++ define _GNU_SOURCE as a
-  # predefined macro, which turns on all of the wonderful libc extensions.
-  # However g++ doesn't do this in Cygwin so we have to define it ourselfs
-  # since we depend on GNU/POSIX/BSD extensions.
-  if (CYGWIN)
-    add_definitions(-D_GNU_SOURCE=1)
-  endif()
-
-  if (QNXNTO)
-    add_definitions(-D_QNX_SOURCE)
-  endif()
-
-  # Link time optimisation
-  if (BENCHMARK_ENABLE_LTO)
-    add_cxx_compiler_flag(-flto)
-    if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
-      find_program(GCC_AR gcc-ar)
-      if (GCC_AR)
-        set(CMAKE_AR ${GCC_AR})
-      endif()
-      find_program(GCC_RANLIB gcc-ranlib)
-      if (GCC_RANLIB)
-        set(CMAKE_RANLIB ${GCC_RANLIB})
-      endif()
-    elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
-      include(llvm-toolchain)
-    endif()
-  endif()
-
-  # Coverage build type
-  set(BENCHMARK_CXX_FLAGS_COVERAGE "${CMAKE_CXX_FLAGS_DEBUG}"
-    CACHE STRING "Flags used by the C++ compiler during coverage builds."
-    FORCE)
-  set(BENCHMARK_EXE_LINKER_FLAGS_COVERAGE "${CMAKE_EXE_LINKER_FLAGS_DEBUG}"
-    CACHE STRING "Flags used for linking binaries during coverage builds."
-    FORCE)
-  set(BENCHMARK_SHARED_LINKER_FLAGS_COVERAGE "${CMAKE_SHARED_LINKER_FLAGS_DEBUG}"
-    CACHE STRING "Flags used by the shared libraries linker during coverage builds."
-    FORCE)
-  mark_as_advanced(
-    BENCHMARK_CXX_FLAGS_COVERAGE
-    BENCHMARK_EXE_LINKER_FLAGS_COVERAGE
-    BENCHMARK_SHARED_LINKER_FLAGS_COVERAGE)
-  set(CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" CACHE STRING
-    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel Coverage.")
-  add_cxx_compiler_flag(--coverage COVERAGE)
-endif()
-
-if (BENCHMARK_USE_LIBCXX)
-  if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
-    add_cxx_compiler_flag(-stdlib=libc++)
-  elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR
-          "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
-    add_cxx_compiler_flag(-nostdinc++)
-    message(WARNING "libc++ header path must be manually specified using CMAKE_CXX_FLAGS")
-    # Adding -nodefaultlibs directly to CMAKE_<TYPE>_LINKER_FLAGS will break
-    # configuration checks such as 'find_package(Threads)'
-    list(APPEND BENCHMARK_CXX_LINKER_FLAGS -nodefaultlibs)
-    # -lc++ cannot be added directly to CMAKE_<TYPE>_LINKER_FLAGS because
-    # linker flags appear before all linker inputs and -lc++ must appear after.
-    list(APPEND BENCHMARK_CXX_LIBRARIES c++)
-  else()
-    message(FATAL_ERROR "-DBENCHMARK_USE_LIBCXX:BOOL=ON is not supported for compiler")
-  endif()
-endif(BENCHMARK_USE_LIBCXX)
-
-# C++ feature checks
-# Determine the correct regular expression engine to use
-cxx_feature_check(STD_REGEX)
-cxx_feature_check(GNU_POSIX_REGEX)
-cxx_feature_check(POSIX_REGEX)
-if(NOT HAVE_STD_REGEX AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
-  message(FATAL_ERROR "Failed to determine the source files for the regular expression backend")
-endif()
-if (NOT BENCHMARK_ENABLE_EXCEPTIONS AND HAVE_STD_REGEX
-        AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
-  message(WARNING "Using std::regex with exceptions disabled is not fully supported")
-endif()
-cxx_feature_check(STEADY_CLOCK)
-# Ensure we have pthreads
-set(THREADS_PREFER_PTHREAD_FLAG ON)
-find_package(Threads REQUIRED)
-
-# Set up directories
-include_directories(${PROJECT_SOURCE_DIR}/include)
-
-# Build the targets
-add_subdirectory(src)
-
-if (BENCHMARK_ENABLE_TESTING)
-  enable_testing()
-  if (BENCHMARK_ENABLE_GTEST_TESTS AND
-      NOT (TARGET gtest AND TARGET gtest_main AND
-           TARGET gmock AND TARGET gmock_main))
-    include(GoogleTest)
-  endif()
-  add_subdirectory(test)
-endif()
diff --git a/third_party/google_benchmark/DIR_METADATA b/third_party/google_benchmark/DIR_METADATA
new file mode 100644
index 0000000..643d2ba
--- /dev/null
+++ b/third_party/google_benchmark/DIR_METADATA
@@ -0,0 +1,3 @@
+monorail: {
+  component: "Internals>Core"
+}
diff --git a/third_party/google_benchmark/METADATA b/third_party/google_benchmark/METADATA
index 5b23eeb..cfeb98c 100644
--- a/third_party/google_benchmark/METADATA
+++ b/third_party/google_benchmark/METADATA
@@ -1,24 +1,17 @@
-name: "google_benchmark"
-description:
-  "Benchmark is a Google library that benchmarks code snippets, similar to "
-  "unit tests. Cobalt currently enables Benchmark on Linux and Android to "
-  "benchmark memory and thread usage."
+name: "Google Benchmark"
 
 third_party {
   identifier {
     type: "ChromiumVersion"
-    value: "87.0.4280.142"  # from https://chromereleases.googleblog.com/2021/01/stable-channel-update-for-chrome-os.html
+    value: "114.0.5735.358"  # from https://chromereleases.googleblog.com/2024/03/long-term-support-channel-update-for_26.html
   }
   identifier {
     type: "Git"
-    value: "https://chromium.googlesource.com/external/github.com/google/benchmark.git"
-    version: "367119482ff4abc3d73e4a109b410090fc281337"
-    # from https://chromium.googlesource.com/chromium/src/+/87.0.4280.142/DEPS#821
+    value: "https://chromium.googlesource.com/chromium/src.git"
+    version: "1759c6ae9316996b9f150c0ce9d0ca78a3d15c02"
   }
-  last_upgrade_date {
-    year: 2019
-    month: 12
-    day: 2
+  identifier {
+    type: "UpstreamSubdir"
+    value: "third_party/google_benchmark"
   }
-  license_type: NOTICE
 }
diff --git a/third_party/google_benchmark/OWNERS b/third_party/google_benchmark/OWNERS
new file mode 100644
index 0000000..0dc4bec
--- /dev/null
+++ b/third_party/google_benchmark/OWNERS
@@ -0,0 +1,7 @@
+# Primary for bugs, reviews:
+asully@chromium.org
+pkasting@chromium.org
+
+# Secondary:
+ayui@chromium.org
+jsbell@chromium.org
diff --git a/third_party/google_benchmark/README.chromium b/third_party/google_benchmark/README.chromium
new file mode 100644
index 0000000..34ce0eb
--- /dev/null
+++ b/third_party/google_benchmark/README.chromium
@@ -0,0 +1,13 @@
+Name: Google Benchmark
+Short Name: benchmark
+URL: https://github.com/google/benchmark
+Version: efc89f0b524780b1994d5dddd83a92718e5be492
+License: Apache 2.0
+License File: NOT_SHIPPED
+Security Critical: no
+
+Description:
+A microbenchmark support library.
+
+Local Additions:
+* gn file for building in chromium
diff --git a/third_party/google_benchmark/WORKSPACE b/third_party/google_benchmark/WORKSPACE
deleted file mode 100644
index 8df248a..0000000
--- a/third_party/google_benchmark/WORKSPACE
+++ /dev/null
@@ -1,15 +0,0 @@
-workspace(name = "com_github_google_benchmark")
-
-load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
-
-http_archive(
-    name = "rules_cc",
-    strip_prefix = "rules_cc-a508235df92e71d537fcbae0c7c952ea6957a912",
-    urls = ["https://github.com/bazelbuild/rules_cc/archive/a508235df92e71d537fcbae0c7c952ea6957a912.zip"],
-)
-
-http_archive(
-    name = "com_google_googletest",
-    strip_prefix = "googletest-3f0cf6b62ad1eb50d8736538363d3580dd640c3e",
-    urls = ["https://github.com/google/googletest/archive/3f0cf6b62ad1eb50d8736538363d3580dd640c3e.zip"],
-)
diff --git a/third_party/google_benchmark/cmake/Config.cmake.in b/third_party/google_benchmark/cmake/Config.cmake.in
deleted file mode 100644
index 6e9256e..0000000
--- a/third_party/google_benchmark/cmake/Config.cmake.in
+++ /dev/null
@@ -1 +0,0 @@
-include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake")
diff --git a/third_party/google_benchmark/conan/CMakeLists.txt b/third_party/google_benchmark/conan/CMakeLists.txt
deleted file mode 100644
index 15b92ca..0000000
--- a/third_party/google_benchmark/conan/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-cmake_minimum_required(VERSION 2.8.11)
-project(cmake_wrapper)
-
-include(conanbuildinfo.cmake)
-conan_basic_setup()
-
-include(${CMAKE_SOURCE_DIR}/CMakeListsOriginal.txt)
diff --git a/third_party/google_benchmark/conan/test_package/CMakeLists.txt b/third_party/google_benchmark/conan/test_package/CMakeLists.txt
deleted file mode 100644
index 089a6c7..0000000
--- a/third_party/google_benchmark/conan/test_package/CMakeLists.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-cmake_minimum_required(VERSION 2.8.11)
-project(test_package)
-
-set(CMAKE_VERBOSE_MAKEFILE TRUE)
-
-include(${CMAKE_BINARY_DIR}/conanbuildinfo.cmake)
-conan_basic_setup()
-
-add_executable(${PROJECT_NAME} test_package.cpp)
-target_link_libraries(${PROJECT_NAME} ${CONAN_LIBS})
diff --git a/third_party/google_benchmark/conan/test_package/conanfile.py b/third_party/google_benchmark/conan/test_package/conanfile.py
deleted file mode 100644
index d63f408..0000000
--- a/third_party/google_benchmark/conan/test_package/conanfile.py
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from conans import ConanFile, CMake
-import os
-
-
-class TestPackageConan(ConanFile):
-    settings = "os", "compiler", "build_type", "arch"
-    generators = "cmake"
-
-    def build(self):
-        cmake = CMake(self)
-        cmake.configure()
-        cmake.build()
-
-    def test(self):
-        bin_path = os.path.join("bin", "test_package")
-        self.run(bin_path, run_environment=True)
diff --git a/third_party/google_benchmark/conan/test_package/test_package.cpp b/third_party/google_benchmark/conan/test_package/test_package.cpp
deleted file mode 100644
index 4fa7ec0..0000000
--- a/third_party/google_benchmark/conan/test_package/test_package.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "benchmark/benchmark.h"
-
-void BM_StringCreation(benchmark::State& state) {
-    while (state.KeepRunning())
-        std::string empty_string;
-}
-
-BENCHMARK(BM_StringCreation);
-
-void BM_StringCopy(benchmark::State& state) {
-    std::string x = "hello";
-    while (state.KeepRunning())
-        std::string copy(x);
-}
-
-BENCHMARK(BM_StringCopy);
-
-BENCHMARK_MAIN();
diff --git a/third_party/google_benchmark/conanfile.py b/third_party/google_benchmark/conanfile.py
deleted file mode 100644
index e31fc52..0000000
--- a/third_party/google_benchmark/conanfile.py
+++ /dev/null
@@ -1,79 +0,0 @@
-from conans import ConanFile, CMake, tools
-from conans.errors import ConanInvalidConfiguration
-import shutil
-import os
-
-
-class GoogleBenchmarkConan(ConanFile):
-    name = "benchmark"
-    description = "A microbenchmark support library."
-    topics = ("conan", "benchmark", "google", "microbenchmark")
-    url = "https://github.com/google/benchmark"
-    homepage = "https://github.com/google/benchmark"
-    author = "Google Inc."
-    license = "Apache-2.0"
-    exports_sources = ["*"]
-    generators = "cmake"
-
-    settings = "arch", "build_type", "compiler", "os"
-    options = {
-        "shared": [True, False],
-        "fPIC": [True, False],
-        "enable_lto": [True, False],
-        "enable_exceptions": [True, False]
-    }
-    default_options = {"shared": False, "fPIC": True, "enable_lto": False, "enable_exceptions": True}
-
-    _build_subfolder = "."
-
-    def source(self):
-        # Wrap the original CMake file to call conan_basic_setup
-        shutil.move("CMakeLists.txt", "CMakeListsOriginal.txt")
-        shutil.move(os.path.join("conan", "CMakeLists.txt"), "CMakeLists.txt")
-
-    def config_options(self):
-        if self.settings.os == "Windows":
-            if self.settings.compiler == "Visual Studio" and float(self.settings.compiler.version.value) <= 12:
-                raise ConanInvalidConfiguration("{} {} does not support Visual Studio <= 12".format(self.name, self.version))
-            del self.options.fPIC
-
-    def configure(self):
-        if self.settings.os == "Windows" and self.options.shared:
-            raise ConanInvalidConfiguration("Windows shared builds are not supported right now, see issue #639")
-
-    def _configure_cmake(self):
-        cmake = CMake(self)
-
-        cmake.definitions["BENCHMARK_ENABLE_TESTING"] = "OFF"
-        cmake.definitions["BENCHMARK_ENABLE_GTEST_TESTS"] = "OFF"
-        cmake.definitions["BENCHMARK_ENABLE_LTO"] = "ON" if self.options.enable_lto else "OFF"
-        cmake.definitions["BENCHMARK_ENABLE_EXCEPTIONS"] = "ON" if self.options.enable_exceptions else "OFF"
-
-        # See https://github.com/google/benchmark/pull/638 for Windows 32 build explanation
-        if self.settings.os != "Windows":
-            cmake.definitions["BENCHMARK_BUILD_32_BITS"] = "ON" if "64" not in str(self.settings.arch) else "OFF"
-            cmake.definitions["BENCHMARK_USE_LIBCXX"] = "ON" if (str(self.settings.compiler.libcxx) == "libc++") else "OFF"
-        else:
-            cmake.definitions["BENCHMARK_USE_LIBCXX"] = "OFF"
-
-        cmake.configure(build_folder=self._build_subfolder)
-        return cmake
-
-    def build(self):
-        cmake = self._configure_cmake()
-        cmake.build()
-
-    def package(self):
-        cmake = self._configure_cmake()
-        cmake.install()
-
-        self.copy(pattern="LICENSE", dst="licenses")
-
-    def package_info(self):
-        self.cpp_info.libs = tools.collect_libs(self)
-        if self.settings.os == "Linux":
-            self.cpp_info.libs.extend(["pthread", "rt"])
-        elif self.settings.os == "Windows":
-            self.cpp_info.libs.append("shlwapi")
-        elif self.settings.os == "SunOS":
-            self.cpp_info.libs.append("kstat")
diff --git a/third_party/google_benchmark/dependencies.md b/third_party/google_benchmark/dependencies.md
deleted file mode 100644
index 6289b4e..0000000
--- a/third_party/google_benchmark/dependencies.md
+++ /dev/null
@@ -1,18 +0,0 @@
-# Build tool dependency policy
-
-To ensure the broadest compatibility when building the benchmark library, but
-still allow forward progress, we require any build tooling to be available for:
-
-* Debian stable AND
-* The last two Ubuntu LTS releases AND
-
-Currently, this means using build tool versions that are available for Ubuntu
-16.04 (Xenial), Ubuntu 18.04 (Bionic), and Debian stretch.
-
-_Note, [travis](.travis.yml) runs under Ubuntu 14.04 (Trusty) for linux builds._
-
-## cmake
-The current supported version is cmake 3.5.1 as of 2018-06-06.
-
-_Note, this version is also available for Ubuntu 14.04, the previous Ubuntu LTS
-release, as `cmake3`._
diff --git a/third_party/google_benchmark/docs/_config.yml b/third_party/google_benchmark/docs/_config.yml
deleted file mode 100644
index 1885487..0000000
--- a/third_party/google_benchmark/docs/_config.yml
+++ /dev/null
@@ -1 +0,0 @@
-theme: jekyll-theme-midnight
\ No newline at end of file
diff --git a/third_party/google_benchmark/mingw.py b/third_party/google_benchmark/mingw.py
deleted file mode 100644
index 65cf4b8..0000000
--- a/third_party/google_benchmark/mingw.py
+++ /dev/null
@@ -1,320 +0,0 @@
-#! /usr/bin/env python
-# encoding: utf-8
-
-import argparse
-import errno
-import logging
-import os
-import platform
-import re
-import sys
-import subprocess
-import tempfile
-
-try:
-    import winreg
-except ImportError:
-    import _winreg as winreg
-try:
-    import urllib.request as request
-except ImportError:
-    import urllib as request
-try:
-    import urllib.parse as parse
-except ImportError:
-    import urlparse as parse
-
-class EmptyLogger(object):
-    '''
-    Provides an implementation that performs no logging
-    '''
-    def debug(self, *k, **kw):
-        pass
-    def info(self, *k, **kw):
-        pass
-    def warn(self, *k, **kw):
-        pass
-    def error(self, *k, **kw):
-        pass
-    def critical(self, *k, **kw):
-        pass
-    def setLevel(self, *k, **kw):
-        pass
-
-urls = (
-    'http://downloads.sourceforge.net/project/mingw-w64/Toolchains%20'
-        'targetting%20Win32/Personal%20Builds/mingw-builds/installer/'
-        'repository.txt',
-    'http://downloads.sourceforge.net/project/mingwbuilds/host-windows/'
-        'repository.txt'
-)
-'''
-A list of mingw-build repositories
-'''
-
-def repository(urls = urls, log = EmptyLogger()):
-    '''
-    Downloads and parse mingw-build repository files and parses them
-    '''
-    log.info('getting mingw-builds repository')
-    versions = {}
-    re_sourceforge = re.compile(r'http://sourceforge.net/projects/([^/]+)/files')
-    re_sub = r'http://downloads.sourceforge.net/project/\1'
-    for url in urls:
-        log.debug(' - requesting: %s', url)
-        socket = request.urlopen(url)
-        repo = socket.read()
-        if not isinstance(repo, str):
-            repo = repo.decode();
-        socket.close()
-        for entry in repo.split('\n')[:-1]:
-            value = entry.split('|')
-            version = tuple([int(n) for n in value[0].strip().split('.')])
-            version = versions.setdefault(version, {})
-            arch = value[1].strip()
-            if arch == 'x32':
-                arch = 'i686'
-            elif arch == 'x64':
-                arch = 'x86_64'
-            arch = version.setdefault(arch, {})
-            threading = arch.setdefault(value[2].strip(), {})
-            exceptions = threading.setdefault(value[3].strip(), {})
-            revision = exceptions.setdefault(int(value[4].strip()[3:]),
-                re_sourceforge.sub(re_sub, value[5].strip()))
-    return versions
-
-def find_in_path(file, path=None):
-    '''
-    Attempts to find an executable in the path
-    '''
-    if platform.system() == 'Windows':
-        file += '.exe'
-    if path is None:
-        path = os.environ.get('PATH', '')
-    if type(path) is type(''):
-        path = path.split(os.pathsep)
-    return list(filter(os.path.exists,
-        map(lambda dir, file=file: os.path.join(dir, file), path)))
-
-def find_7zip(log = EmptyLogger()):
-    '''
-    Attempts to find 7zip for unpacking the mingw-build archives
-    '''
-    log.info('finding 7zip')
-    path = find_in_path('7z')
-    if not path:
-        key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r'SOFTWARE\7-Zip')
-        path, _ = winreg.QueryValueEx(key, 'Path')
-        path = [os.path.join(path, '7z.exe')]
-    log.debug('found \'%s\'', path[0])
-    return path[0]
-
-find_7zip()
-
-def unpack(archive, location, log = EmptyLogger()):
-    '''
-    Unpacks a mingw-builds archive
-    '''
-    sevenzip = find_7zip(log)
-    log.info('unpacking %s', os.path.basename(archive))
-    cmd = [sevenzip, 'x', archive, '-o' + location, '-y']
-    log.debug(' - %r', cmd)
-    with open(os.devnull, 'w') as devnull:
-        subprocess.check_call(cmd, stdout = devnull)
-
-def download(url, location, log = EmptyLogger()):
-    '''
-    Downloads and unpacks a mingw-builds archive
-    '''
-    log.info('downloading MinGW')
-    log.debug(' - url: %s', url)
-    log.debug(' - location: %s', location)
-
-    re_content = re.compile(r'attachment;[ \t]*filename=(")?([^"]*)(")?[\r\n]*')
-
-    stream = request.urlopen(url)
-    try:
-        content = stream.getheader('Content-Disposition') or ''
-    except AttributeError:
-        content = stream.headers.getheader('Content-Disposition') or ''
-    matches = re_content.match(content)
-    if matches:
-        filename = matches.group(2)
-    else:
-        parsed = parse.urlparse(stream.geturl())
-        filename = os.path.basename(parsed.path)
-
-    try:
-        os.makedirs(location)
-    except OSError as e:
-        if e.errno == errno.EEXIST and os.path.isdir(location):
-            pass
-        else:
-            raise
-
-    archive = os.path.join(location, filename)
-    with open(archive, 'wb') as out:
-        while True:
-            buf = stream.read(1024)
-            if not buf:
-                break
-            out.write(buf)
-    unpack(archive, location, log = log)
-    os.remove(archive)
-
-    possible = os.path.join(location, 'mingw64')
-    if not os.path.exists(possible):
-        possible = os.path.join(location, 'mingw32')
-        if not os.path.exists(possible):
-            raise ValueError('Failed to find unpacked MinGW: ' + possible)
-    return possible
-
-def root(location = None, arch = None, version = None, threading = None,
-        exceptions = None, revision = None, log = EmptyLogger()):
-    '''
-    Returns the root folder of a specific version of the mingw-builds variant
-    of gcc. Will download the compiler if needed
-    '''
-
-    # Get the repository if we don't have all the information
-    if not (arch and version and threading and exceptions and revision):
-        versions = repository(log = log)
-
-    # Determine some defaults
-    version = version or max(versions.keys())
-    if not arch:
-        arch = platform.machine().lower()
-        if arch == 'x86':
-            arch = 'i686'
-        elif arch == 'amd64':
-            arch = 'x86_64'
-    if not threading:
-        keys = versions[version][arch].keys()
-        if 'posix' in keys:
-            threading = 'posix'
-        elif 'win32' in keys:
-            threading = 'win32'
-        else:
-            threading = keys[0]
-    if not exceptions:
-        keys = versions[version][arch][threading].keys()
-        if 'seh' in keys:
-            exceptions = 'seh'
-        elif 'sjlj' in keys:
-            exceptions = 'sjlj'
-        else:
-            exceptions = keys[0]
-    if revision is None:
-        revision = max(versions[version][arch][threading][exceptions].keys())
-    if not location:
-        location = os.path.join(tempfile.gettempdir(), 'mingw-builds')
-
-    # Get the download url
-    url = versions[version][arch][threading][exceptions][revision]
-
-    # Tell the user whatzzup
-    log.info('finding MinGW %s', '.'.join(str(v) for v in version))
-    log.debug(' - arch: %s', arch)
-    log.debug(' - threading: %s', threading)
-    log.debug(' - exceptions: %s', exceptions)
-    log.debug(' - revision: %s', revision)
-    log.debug(' - url: %s', url)
-
-    # Store each specific revision differently
-    slug = '{version}-{arch}-{threading}-{exceptions}-rev{revision}'
-    slug = slug.format(
-        version = '.'.join(str(v) for v in version),
-        arch = arch,
-        threading = threading,
-        exceptions = exceptions,
-        revision = revision
-    )
-    if arch == 'x86_64':
-        root_dir = os.path.join(location, slug, 'mingw64')
-    elif arch == 'i686':
-        root_dir = os.path.join(location, slug, 'mingw32')
-    else:
-        raise ValueError('Unknown MinGW arch: ' + arch)
-
-    # Download if needed
-    if not os.path.exists(root_dir):
-        downloaded = download(url, os.path.join(location, slug), log = log)
-        if downloaded != root_dir:
-            raise ValueError('The location of mingw did not match\n%s\n%s'
-                % (downloaded, root_dir))
-
-    return root_dir
-
-def str2ver(string):
-    '''
-    Converts a version string into a tuple
-    '''
-    try:
-        version = tuple(int(v) for v in string.split('.'))
-        if len(version) is not 3:
-            raise ValueError()
-    except ValueError:
-        raise argparse.ArgumentTypeError(
-            'please provide a three digit version string')
-    return version
-
-def main():
-    '''
-    Invoked when the script is run directly by the python interpreter
-    '''
-    parser = argparse.ArgumentParser(
-        description = 'Downloads a specific version of MinGW',
-        formatter_class = argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument('--location',
-        help = 'the location to download the compiler to',
-        default = os.path.join(tempfile.gettempdir(), 'mingw-builds'))
-    parser.add_argument('--arch', required = True, choices = ['i686', 'x86_64'],
-        help = 'the target MinGW architecture string')
-    parser.add_argument('--version', type = str2ver,
-        help = 'the version of GCC to download')
-    parser.add_argument('--threading', choices = ['posix', 'win32'],
-        help = 'the threading type of the compiler')
-    parser.add_argument('--exceptions', choices = ['sjlj', 'seh', 'dwarf'],
-        help = 'the method to throw exceptions')
-    parser.add_argument('--revision', type=int,
-        help = 'the revision of the MinGW release')
-    group = parser.add_mutually_exclusive_group()
-    group.add_argument('-v', '--verbose', action='store_true',
-        help='increase the script output verbosity')
-    group.add_argument('-q', '--quiet', action='store_true',
-        help='only print errors and warning')
-    args = parser.parse_args()
-
-    # Create the logger
-    logger = logging.getLogger('mingw')
-    handler = logging.StreamHandler()
-    formatter = logging.Formatter('%(message)s')
-    handler.setFormatter(formatter)
-    logger.addHandler(handler)
-    logger.setLevel(logging.INFO)
-    if args.quiet:
-        logger.setLevel(logging.WARN)
-    if args.verbose:
-        logger.setLevel(logging.DEBUG)
-
-    # Get MinGW
-    root_dir = root(location = args.location, arch = args.arch,
-        version = args.version, threading = args.threading,
-        exceptions = args.exceptions, revision = args.revision,
-        log = logger)
-
-    sys.stdout.write('%s\n' % os.path.join(root_dir, 'bin'))
-
-if __name__ == '__main__':
-    try:
-        main()
-    except IOError as e:
-        sys.stderr.write('IO error: %s\n' % e)
-        sys.exit(1)
-    except OSError as e:
-        sys.stderr.write('OS error: %s\n' % e)
-        sys.exit(1)
-    except KeyboardInterrupt as e:
-        sys.stderr.write('Killed\n')
-        sys.exit(1)
diff --git a/third_party/google_benchmark/releasing.md b/third_party/google_benchmark/releasing.md
deleted file mode 100644
index f0cd701..0000000
--- a/third_party/google_benchmark/releasing.md
+++ /dev/null
@@ -1,16 +0,0 @@
-# How to release
-
-* Make sure you're on master and synced to HEAD
-* Ensure the project builds and tests run (sanity check only, obviously)
-    * `parallel -j0 exec ::: test/*_test` can help ensure everything at least
-      passes
-* Prepare release notes
-    * `git log $(git describe --abbrev=0 --tags)..HEAD` gives you the list of
-      commits between the last annotated tag and HEAD
-    * Pick the most interesting.
-* Create a release through github's interface
-    * Note this will create a lightweight tag.
-    * Update this to an annotated tag:
-      * `git pull --tags`
-      * `git tag -a -f <tag> <tag>`
-      * `git push --force origin`
diff --git a/third_party/google_benchmark/.clang-format b/third_party/google_benchmark/src/.clang-format
similarity index 100%
rename from third_party/google_benchmark/.clang-format
rename to third_party/google_benchmark/src/.clang-format
diff --git a/third_party/google_benchmark/src/.clang-tidy b/third_party/google_benchmark/src/.clang-tidy
new file mode 100644
index 0000000..56938a5
--- /dev/null
+++ b/third_party/google_benchmark/src/.clang-tidy
@@ -0,0 +1,7 @@
+---
+Checks:          'clang-analyzer-*,readability-redundant-*,performance-*'
+WarningsAsErrors: 'clang-analyzer-*,readability-redundant-*,performance-*'
+HeaderFilterRegex: '.*'
+AnalyzeTemporaryDtors: false
+FormatStyle:     none
+User:            user
diff --git a/third_party/google_benchmark/src/.github/ISSUE_TEMPLATE/bug_report.md b/third_party/google_benchmark/src/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 0000000..6c2ced9
--- /dev/null
+++ b/third_party/google_benchmark/src/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,32 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: "[BUG]"
+labels: ''
+assignees: ''
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**System**
+Which OS, compiler, and compiler version are you using:
+  - OS: 
+  - Compiler and version: 
+
+**To reproduce**
+Steps to reproduce the behavior:
+1. sync to commit ...
+2. cmake/bazel...
+3. make ...
+4. See error
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
+
+**Additional context**
+Add any other context about the problem here.
diff --git a/third_party/google_benchmark/src/.github/ISSUE_TEMPLATE/feature_request.md b/third_party/google_benchmark/src/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 0000000..9e8ab6a
--- /dev/null
+++ b/third_party/google_benchmark/src/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: "[FR]"
+labels: ''
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
diff --git a/third_party/google_benchmark/src/.github/install_bazel.sh b/third_party/google_benchmark/src/.github/install_bazel.sh
new file mode 100644
index 0000000..bb910d8
--- /dev/null
+++ b/third_party/google_benchmark/src/.github/install_bazel.sh
@@ -0,0 +1,13 @@
+if ! bazel version; then
+  arch=$(uname -m)
+  if [ "$arch" == "aarch64" ]; then
+    arch="arm64"
+  fi
+  echo "Installing wget and downloading $arch Bazel binary from GitHub releases."
+  yum install -y wget
+  wget "https://github.com/bazelbuild/bazel/releases/download/6.0.0/bazel-6.0.0-linux-$arch" -O /usr/local/bin/bazel
+  chmod +x /usr/local/bin/bazel
+else
+  # bazel is installed for the correct architecture
+  exit 0
+fi
diff --git a/third_party/google_benchmark/src/.github/libcxx-setup.sh b/third_party/google_benchmark/src/.github/libcxx-setup.sh
new file mode 100755
index 0000000..e39e310
--- /dev/null
+++ b/third_party/google_benchmark/src/.github/libcxx-setup.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+
+# Checkout LLVM sources
+#git clone --depth=1 https://github.com/llvm/llvm-project.git llvm-project
+#
+## Setup libc++ options
+#if [ -z "$BUILD_32_BITS" ]; then
+#  export BUILD_32_BITS=OFF && echo disabling 32 bit build
+#fi
+#
+## Build and install libc++ (Use unstable ABI for better sanitizer coverage)
+#cd ./llvm-project
+#cmake -DCMAKE_C_COMPILER=${CC}                  \
+#      -DCMAKE_CXX_COMPILER=${CXX}               \
+#      -DCMAKE_BUILD_TYPE=RelWithDebInfo         \
+#      -DCMAKE_INSTALL_PREFIX=/usr               \
+#      -DLIBCXX_ABI_UNSTABLE=OFF                 \
+#      -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER}  \
+#      -DLLVM_BUILD_32_BITS=${BUILD_32_BITS}     \
+#      -DLLVM_ENABLE_RUNTIMES='libcxx;libcxxabi' \
+#      -S llvm -B llvm-build -G "Unix Makefiles"
+#make -C llvm-build -j3 cxx cxxabi
+#sudo make -C llvm-build install-cxx install-cxxabi
+#cd ..
+
+sudo apt update
+sudo apt -y install libc++-dev libc++abi-dev
diff --git a/third_party/google_benchmark/src/.github/workflows/bazel.yml b/third_party/google_benchmark/src/.github/workflows/bazel.yml
new file mode 100644
index 0000000..9e31c90
--- /dev/null
+++ b/third_party/google_benchmark/src/.github/workflows/bazel.yml
@@ -0,0 +1,35 @@
+name: bazel
+
+on:
+  push: {}
+  pull_request: {}
+
+jobs:
+  job:
+    name: bazel.${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-2022]
+
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: mount bazel cache
+      uses: actions/cache@v3
+      env:
+        cache-name: bazel-cache
+      with:
+        path: "~/.cache/bazel"
+        key: ${{ env.cache-name }}-${{ matrix.os }}-${{ github.ref }}
+        restore-keys: |
+          ${{ env.cache-name }}-${{ matrix.os }}-main
+
+    - name: build
+      run: |
+        bazel build //:benchmark //:benchmark_main //test/...
+
+    - name: test
+      run: |
+        bazel test --test_output=all //test/...
diff --git a/third_party/google_benchmark/src/.github/workflows/build-and-test-min-cmake.yml b/third_party/google_benchmark/src/.github/workflows/build-and-test-min-cmake.yml
new file mode 100644
index 0000000..e3e3217
--- /dev/null
+++ b/third_party/google_benchmark/src/.github/workflows/build-and-test-min-cmake.yml
@@ -0,0 +1,46 @@
+name: build-and-test-min-cmake
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  job:
+    name: ${{ matrix.os }}.min-cmake
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, macos-latest]
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - uses: lukka/get-cmake@latest
+        with:
+          cmakeVersion: 3.10.0
+
+      - name: create build environment
+        run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+      - name: setup cmake initial cache
+        run: touch compiler-cache.cmake
+
+      - name: configure cmake
+        env:
+          CXX: ${{ matrix.compiler }}
+        shell: bash
+        working-directory: ${{ runner.workspace }}/_build
+        run: >
+          cmake -C ${{ github.workspace }}/compiler-cache.cmake
+          $GITHUB_WORKSPACE
+          -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+          -DCMAKE_CXX_VISIBILITY_PRESET=hidden
+          -DCMAKE_VISIBILITY_INLINES_HIDDEN=ON
+
+      - name: build
+        shell: bash
+        working-directory: ${{ runner.workspace }}/_build
+        run: cmake --build .
diff --git a/third_party/google_benchmark/src/.github/workflows/build-and-test-perfcounters.yml b/third_party/google_benchmark/src/.github/workflows/build-and-test-perfcounters.yml
new file mode 100644
index 0000000..97e4d8e
--- /dev/null
+++ b/third_party/google_benchmark/src/.github/workflows/build-and-test-perfcounters.yml
@@ -0,0 +1,51 @@
+name: build-and-test-perfcounters
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  job:
+    # TODO(dominic): Extend this to include compiler and set through env: CC/CXX.
+    name: ${{ matrix.os }}.${{ matrix.build_type }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-22.04, ubuntu-20.04]
+        build_type: ['Release', 'Debug']
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: install libpfm
+      run: |
+        sudo apt update
+        sudo apt -y install libpfm4-dev
+
+    - name: create build environment
+      run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+    - name: configure cmake
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: >
+        cmake $GITHUB_WORKSPACE
+        -DBENCHMARK_ENABLE_LIBPFM=1
+        -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+
+    - name: build
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: cmake --build . --config ${{ matrix.build_type }}
+
+    # Skip testing, for now. It seems perf_event_open does not succeed on the
+    # hosting machine, very likely a permissions issue.
+    # TODO(mtrofin): Enable test.
+    # - name: test
+    #   shell: bash
+    #   working-directory: ${{ runner.workspace }}/_build
+    #   run: ctest -C ${{ matrix.build_type }} --rerun-failed --output-on-failure
+
diff --git a/third_party/google_benchmark/src/.github/workflows/build-and-test.yml b/third_party/google_benchmark/src/.github/workflows/build-and-test.yml
new file mode 100644
index 0000000..b35200a
--- /dev/null
+++ b/third_party/google_benchmark/src/.github/workflows/build-and-test.yml
@@ -0,0 +1,114 @@
+name: build-and-test
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  # TODO: add 32-bit builds (g++ and clang++) for ubuntu
+  #   (requires g++-multilib and libc6:i386)
+  # TODO: add coverage build (requires lcov)
+  # TODO: add clang + libc++ builds for ubuntu
+  job:
+    name: ${{ matrix.os }}.${{ matrix.build_type }}.${{ matrix.lib }}.${{ matrix.compiler }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-22.04, ubuntu-20.04, macos-latest]
+        build_type: ['Release', 'Debug']
+        compiler: ['g++', 'clang++']
+        lib: ['shared', 'static']
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - uses: lukka/get-cmake@latest
+
+      - name: create build environment
+        run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+      - name: setup cmake initial cache
+        run: touch compiler-cache.cmake
+
+      - name: configure cmake
+        env:
+          CXX: ${{ matrix.compiler }}
+        shell: bash
+        working-directory: ${{ runner.workspace }}/_build
+        run: >
+          cmake -C ${{ github.workspace }}/compiler-cache.cmake
+          $GITHUB_WORKSPACE
+          -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+          -DBUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}
+          -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+          -DCMAKE_CXX_COMPILER=${{ env.CXX }}
+          -DCMAKE_CXX_VISIBILITY_PRESET=hidden
+          -DCMAKE_VISIBILITY_INLINES_HIDDEN=ON
+
+      - name: build
+        shell: bash
+        working-directory: ${{ runner.workspace }}/_build
+        run: cmake --build . --config ${{ matrix.build_type }}
+
+      - name: test
+        shell: bash
+        working-directory: ${{ runner.workspace }}/_build
+        run: ctest -C ${{ matrix.build_type }} -VV
+
+  msvc:
+    name: ${{ matrix.os }}.${{ matrix.build_type }}.${{ matrix.lib }}.${{ matrix.msvc }}
+    runs-on: ${{ matrix.os }}
+    defaults:
+        run:
+            shell: powershell
+    strategy:
+      fail-fast: false
+      matrix:
+        msvc:
+          - VS-16-2019
+          - VS-17-2022
+        arch:
+          - x64
+        build_type:
+          - Debug
+          - Release
+        lib:
+          - shared
+          - static
+        include:
+          - msvc: VS-16-2019
+            os: windows-2019
+            generator: 'Visual Studio 16 2019'
+          - msvc: VS-17-2022
+            os: windows-2022
+            generator: 'Visual Studio 17 2022'
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - uses: lukka/get-cmake@latest
+
+      - name: configure cmake
+        run: >
+          cmake -S . -B _build/
+          -A ${{ matrix.arch }}
+          -G "${{ matrix.generator }}"
+          -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+          -DBUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}
+
+      - name: build
+        run: cmake --build _build/ --config ${{ matrix.build_type }}
+
+      - name: setup test environment
+        # Make sure gmock and benchmark DLLs can be found
+        run: >
+            echo "$((Get-Item .).FullName)/_build/bin/${{ matrix.build_type }}" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append;
+            echo "$((Get-Item .).FullName)/_build/src/${{ matrix.build_type }}" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append;
+
+      - name: test
+        run: ctest --test-dir _build/ -C ${{ matrix.build_type }} -VV
+
+
diff --git a/third_party/google_benchmark/src/.github/workflows/clang-format-lint.yml b/third_party/google_benchmark/src/.github/workflows/clang-format-lint.yml
new file mode 100644
index 0000000..77ce1f8
--- /dev/null
+++ b/third_party/google_benchmark/src/.github/workflows/clang-format-lint.yml
@@ -0,0 +1,17 @@
+name: clang-format-lint
+on:
+  push: {}
+  pull_request: {}
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    - uses: DoozyX/clang-format-lint-action@v0.13
+      with:
+        source: './include/benchmark ./src ./test'
+        extensions: 'h,cc'
+        clangFormatVersion: 12
+        style: Google
diff --git a/third_party/google_benchmark/src/.github/workflows/clang-tidy.yml b/third_party/google_benchmark/src/.github/workflows/clang-tidy.yml
new file mode 100644
index 0000000..2eaab9c
--- /dev/null
+++ b/third_party/google_benchmark/src/.github/workflows/clang-tidy.yml
@@ -0,0 +1,38 @@
+name: clang-tidy
+
+on:
+  push: {}
+  pull_request: {}
+
+jobs:
+  job:
+    name: run-clang-tidy
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: install clang-tidy
+      run: sudo apt update && sudo apt -y install clang-tidy
+
+    - name: create build environment
+      run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+    - name: configure cmake
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: >
+        cmake $GITHUB_WORKSPACE
+        -DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF
+        -DBENCHMARK_ENABLE_LIBPFM=OFF
+        -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+        -DCMAKE_C_COMPILER=clang
+        -DCMAKE_CXX_COMPILER=clang++
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+        -DGTEST_COMPILE_COMMANDS=OFF
+
+    - name: run
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: run-clang-tidy
diff --git a/third_party/google_benchmark/src/.github/workflows/doxygen.yml b/third_party/google_benchmark/src/.github/workflows/doxygen.yml
new file mode 100644
index 0000000..da92c46
--- /dev/null
+++ b/third_party/google_benchmark/src/.github/workflows/doxygen.yml
@@ -0,0 +1,28 @@
+name: doxygen
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+jobs:
+  build-and-deploy:
+    name: Build HTML documentation
+    runs-on: ubuntu-latest
+    steps:
+    - name: Fetching sources
+      uses: actions/checkout@v3
+
+    - name: Installing build dependencies
+      run: |
+        sudo apt update
+        sudo apt install doxygen gcc git
+
+    - name: Creating build directory
+      run: mkdir build
+
+    - name: Building HTML documentation with Doxygen
+      run: |
+        cmake -S . -B build -DBENCHMARK_ENABLE_TESTING:BOOL=OFF -DBENCHMARK_ENABLE_DOXYGEN:BOOL=ON -DBENCHMARK_INSTALL_DOCS:BOOL=ON
+        cmake --build build --target benchmark_doxygen
diff --git a/third_party/google_benchmark/src/.github/workflows/pylint.yml b/third_party/google_benchmark/src/.github/workflows/pylint.yml
new file mode 100644
index 0000000..c6939b5
--- /dev/null
+++ b/third_party/google_benchmark/src/.github/workflows/pylint.yml
@@ -0,0 +1,28 @@
+name: pylint
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  pylint:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python 3.8
+      uses: actions/setup-python@v1
+      with:
+        python-version: 3.8
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install pylint pylint-exit conan
+
+    - name: Run pylint
+      run: |
+        pylint `find . -name '*.py'|xargs` || pylint-exit $?
diff --git a/third_party/google_benchmark/src/.github/workflows/sanitizer.yml b/third_party/google_benchmark/src/.github/workflows/sanitizer.yml
new file mode 100644
index 0000000..4cb93f4
--- /dev/null
+++ b/third_party/google_benchmark/src/.github/workflows/sanitizer.yml
@@ -0,0 +1,106 @@
+name: sanitizer
+
+on:
+  push: {}
+  pull_request: {}
+
+env:
+  UBSAN_OPTIONS: "print_stacktrace=1"
+
+jobs:
+  job:
+    name: ${{ matrix.sanitizer }}.${{ matrix.build_type }}.${{ matrix.compiler }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: ['Debug', 'RelWithDebInfo']
+        sanitizer: ['asan', 'ubsan', 'tsan']
+        compiler: ['clang', 'gcc']
+        # TODO: add 'msan' above. currently failing and needs investigation.
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: configure msan env
+      if: matrix.sanitizer == 'msan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=MemoryWithOrigins" >> $GITHUB_ENV
+
+    - name: configure ubsan env
+      if: matrix.sanitizer == 'ubsan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=all" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=Undefined" >> $GITHUB_ENV
+
+    - name: configure asan env
+      if: matrix.sanitizer == 'asan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=address -fno-sanitize-recover=all" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=Address" >> $GITHUB_ENV
+
+    - name: configure tsan env
+      if: matrix.sanitizer == 'tsan'
+      run: |
+        echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all" >> $GITHUB_ENV
+        echo "LIBCXX_SANITIZER=Thread" >> $GITHUB_ENV
+
+    - name: fine-tune asan options
+      # in clang+asan we get an error from std::regex. ignore it.
+      if: matrix.sanitizer == 'asan' && matrix.compiler == 'clang'
+      run: |
+        echo "ASAN_OPTIONS=alloc_dealloc_mismatch=0" >> $GITHUB_ENV
+
+    - name: setup clang
+      if: matrix.compiler == 'clang'
+      uses: egor-tensin/setup-clang@v1
+      with:
+        version: latest
+        platform: x64
+
+    - name: configure clang
+      if: matrix.compiler == 'clang'
+      run: |
+        echo "CC=cc" >> $GITHUB_ENV
+        echo "CXX=c++" >> $GITHUB_ENV
+
+    - name: configure gcc
+      if: matrix.compiler == 'gcc'
+      run: |
+        sudo apt update && sudo apt -y install gcc-10 g++-10
+        echo "CC=gcc-10" >> $GITHUB_ENV
+        echo "CXX=g++-10" >> $GITHUB_ENV
+
+    - name: install llvm stuff
+      if: matrix.compiler == 'clang'
+      run: |
+        "${GITHUB_WORKSPACE}/.github/libcxx-setup.sh"
+        echo "EXTRA_CXX_FLAGS=\"-stdlib=libc++\"" >> $GITHUB_ENV
+
+    - name: create build environment
+      run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+    - name: configure cmake
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: >
+        VERBOSE=1
+        cmake $GITHUB_WORKSPACE
+        -DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF
+        -DBENCHMARK_ENABLE_LIBPFM=OFF
+        -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+        -DCMAKE_C_COMPILER=${{ env.CC }}
+        -DCMAKE_CXX_COMPILER=${{ env.CXX }}
+        -DCMAKE_C_FLAGS="${{ env.EXTRA_FLAGS }}"
+        -DCMAKE_CXX_FLAGS="${{ env.EXTRA_FLAGS }} ${{ env.EXTRA_CXX_FLAGS }}"
+        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+
+    - name: build
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: cmake --build . --config ${{ matrix.build_type }}
+
+    - name: test
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: ctest -C ${{ matrix.build_type }} -VV
diff --git a/third_party/google_benchmark/src/.github/workflows/test_bindings.yml b/third_party/google_benchmark/src/.github/workflows/test_bindings.yml
new file mode 100644
index 0000000..98fa7e1
--- /dev/null
+++ b/third_party/google_benchmark/src/.github/workflows/test_bindings.yml
@@ -0,0 +1,29 @@
+name: test-bindings
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+jobs:
+  python_bindings:
+    name: Test GBM Python bindings on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ ubuntu-latest, macos-latest, windows-latest ]
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.11
+      - name: Install GBM Python bindings on ${{ matrix.os}}
+        run:
+          python -m pip install wheel .
+      - name: Run bindings example on ${{ matrix.os }}
+        run:
+          python bindings/python/google_benchmark/example.py
diff --git a/third_party/google_benchmark/src/.github/workflows/wheels.yml b/third_party/google_benchmark/src/.github/workflows/wheels.yml
new file mode 100644
index 0000000..d3c4630
--- /dev/null
+++ b/third_party/google_benchmark/src/.github/workflows/wheels.yml
@@ -0,0 +1,79 @@
+name: Build and upload Python wheels
+
+on:
+  workflow_dispatch:
+  release:
+    types:
+      - published
+
+jobs:
+  build_sdist:
+    name: Build source distribution
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v3
+
+      - name: Install Python 3.11
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.11
+
+      - name: Build and check sdist
+        run: |
+          python setup.py sdist
+      - name: Upload sdist
+        uses: actions/upload-artifact@v3
+        with:
+          name: dist
+          path: dist/*.tar.gz
+
+  build_wheels:
+    name: Build Google Benchmark wheels on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+
+    steps:
+      - name: Check out Google Benchmark
+        uses: actions/checkout@v3
+
+      - name: Set up QEMU
+        if: runner.os == 'Linux'
+        uses: docker/setup-qemu-action@v2
+        with:
+          platforms: all
+
+      - name: Build wheels on ${{ matrix.os }} using cibuildwheel
+        uses: pypa/cibuildwheel@v2.12.0
+        env:
+          CIBW_BUILD: 'cp38-* cp39-* cp310-* cp311-*'
+          CIBW_SKIP: "*-musllinux_*"
+          CIBW_TEST_SKIP: "*-macosx_arm64"
+          CIBW_ARCHS_LINUX: x86_64 aarch64
+          CIBW_ARCHS_MACOS: x86_64 arm64
+          CIBW_ARCHS_WINDOWS: AMD64
+          CIBW_BEFORE_ALL_LINUX: bash .github/install_bazel.sh
+          CIBW_TEST_COMMAND: python {project}/bindings/python/google_benchmark/example.py
+
+      - name: Upload Google Benchmark ${{ matrix.os }} wheels
+        uses: actions/upload-artifact@v3
+        with:
+          name: dist
+          path: ./wheelhouse/*.whl
+
+  pypi_upload:
+    name: Publish google-benchmark wheels to PyPI
+    needs: [build_sdist, build_wheels]
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/download-artifact@v3
+      with:
+        name: dist
+        path: dist
+
+    - uses: pypa/gh-action-pypi-publish@v1.6.4
+      with:
+        user: __token__
+        password: ${{ secrets.PYPI_PASSWORD }}
diff --git a/third_party/google_benchmark/.gitignore b/third_party/google_benchmark/src/.gitignore
similarity index 93%
rename from third_party/google_benchmark/.gitignore
rename to third_party/google_benchmark/src/.gitignore
index a7716e3..704f56c 100644
--- a/third_party/google_benchmark/.gitignore
+++ b/third_party/google_benchmark/src/.gitignore
@@ -11,6 +11,7 @@
 *.swp
 *.pyc
 __pycache__
+.DS_Store
 
 # lcov
 *.lcov
@@ -60,3 +61,7 @@
 
 # Visual Studio Code cache/options directory
 .vscode/
+
+# Python build stuff
+dist/
+*.egg-info*
diff --git a/third_party/google_benchmark/.travis.yml b/third_party/google_benchmark/src/.travis.yml
similarity index 87%
rename from third_party/google_benchmark/.travis.yml
rename to third_party/google_benchmark/src/.travis.yml
index 6b6cfc7..8cfed3d 100644
--- a/third_party/google_benchmark/.travis.yml
+++ b/third_party/google_benchmark/src/.travis.yml
@@ -2,10 +2,6 @@
 dist: trusty
 language: cpp
 
-env:
-  global:
-    - /usr/local/bin:$PATH
-
 matrix:
   include:
     - compiler: gcc
@@ -15,10 +11,6 @@
             - lcov
       env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Coverage
     - compiler: gcc
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Debug
-    - compiler: gcc
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Release
-    - compiler: gcc
       addons:
         apt:
           packages:
@@ -48,10 +40,6 @@
         - COMPILER=g++-6 C_COMPILER=gcc-6  BUILD_TYPE=Debug
         - ENABLE_SANITIZER=1
         - EXTRA_FLAGS="-fno-omit-frame-pointer -g -O2 -fsanitize=undefined,address -fuse-ld=gold"
-    - compiler: clang
-      env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Debug
-    - compiler: clang
-      env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Release
     # Clang w/ libc++
     - compiler: clang
       dist: xenial
@@ -154,29 +142,14 @@
       osx_image: xcode8.3
       compiler: clang
       env:
-        - COMPILER=clang++ BUILD_TYPE=Debug
-    - os: osx
-      osx_image: xcode8.3
-      compiler: clang
-      env:
-        - COMPILER=clang++ BUILD_TYPE=Release
-    - os: osx
-      osx_image: xcode8.3
-      compiler: clang
-      env:
         - COMPILER=clang++
         - BUILD_TYPE=Release
         - BUILD_32_BITS=ON
         - EXTRA_FLAGS="-m32"
-    - os: osx
-      osx_image: xcode8.3
-      compiler: gcc
-      env:
-        - COMPILER=g++-7 C_COMPILER=gcc-7  BUILD_TYPE=Debug
 
 before_script:
   - if [ -n "${LIBCXX_BUILD}" ]; then
-      source .travis-libcxx-setup.sh;
+      source .libcxx-setup.sh;
     fi
   - if [ -n "${ENABLE_SANITIZER}" ]; then
       export EXTRA_OPTIONS="-DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF";
@@ -215,11 +188,11 @@
   - if [ "${TRAVIS_OS_NAME}" == "linux" ]; then
       sudo apt-get update -qq;
       sudo apt-get install -qq unzip cmake3;
-      wget https://github.com/bazelbuild/bazel/releases/download/0.10.1/bazel-0.10.1-installer-linux-x86_64.sh --output-document bazel-installer.sh;
+      wget https://github.com/bazelbuild/bazel/releases/download/3.2.0/bazel-3.2.0-installer-linux-x86_64.sh --output-document bazel-installer.sh;
       travis_wait sudo bash bazel-installer.sh;
     fi
   - if [ "${TRAVIS_OS_NAME}" == "osx" ]; then
-      curl -L -o bazel-installer.sh https://github.com/bazelbuild/bazel/releases/download/0.10.1/bazel-0.10.1-installer-darwin-x86_64.sh;
+      curl -L -o bazel-installer.sh https://github.com/bazelbuild/bazel/releases/download/3.2.0/bazel-3.2.0-installer-darwin-x86_64.sh;
       travis_wait sudo bash bazel-installer.sh;
     fi
 
diff --git a/third_party/google_benchmark/.ycm_extra_conf.py b/third_party/google_benchmark/src/.ycm_extra_conf.py
similarity index 100%
rename from third_party/google_benchmark/.ycm_extra_conf.py
rename to third_party/google_benchmark/src/.ycm_extra_conf.py
diff --git a/third_party/google_benchmark/AUTHORS b/third_party/google_benchmark/src/AUTHORS
similarity index 75%
rename from third_party/google_benchmark/AUTHORS
rename to third_party/google_benchmark/src/AUTHORS
index 35c4c8c..bafecad 100644
--- a/third_party/google_benchmark/AUTHORS
+++ b/third_party/google_benchmark/src/AUTHORS
@@ -13,6 +13,8 @@
 Andriy Berestovskyy <berestovskyy@gmail.com>
 Arne Beer <arne@twobeer.de>
 Carto
+Cezary Skrzyński <czars1988@gmail.com>
+Christian Wassermann <christian_wassermann@web.de>
 Christopher Seymour <chris.j.seymour@hotmail.com>
 Colin Braley <braley.colin@gmail.com>
 Daniel Harvey <danielharvey458@gmail.com>
@@ -20,35 +22,49 @@
 Deniz Evrenci <denizevrenci@gmail.com>
 Dirac Research 
 Dominik Czarnota <dominik.b.czarnota@gmail.com>
+Dominik Korman <kormandominik@gmail.com>
+Donald Aingworth <donalds_junk_mail@yahoo.com>
 Eric Backus <eric_backus@alum.mit.edu>
 Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
 Evgeny Safronov <division494@gmail.com>
 Federico Ficarelli <federico.ficarelli@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
+Gergő Szitár <szitar.gergo@gmail.com>
 Google Inc.
+Henrique Bucher <hbucher@gmail.com>
 International Business Machines Corporation
 Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
 Jern-Kuan Leong <jernkuan@gmail.com>
 JianXiong Zhou <zhoujianxiong2@gmail.com>
 Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
+Jordan Williams <jwillikers@protonmail.com>
 Jussi Knuuttila <jussi.knuuttila@gmail.com>
 Kaito Udagawa <umireon@gmail.com>
 Kishan Kumar <kumar.kishan@outlook.com>
 Lei Xu <eddyxu@gmail.com>
+Marcel Jacobse <mjacobse@uni-bremen.de>
 Matt Clarkson <mattyclarkson@gmail.com>
 Maxim Vafin <maxvafin@gmail.com>
+Mike Apodaca <gatorfax@gmail.com>
 MongoDB Inc.
 Nick Hutchinson <nshutchinson@gmail.com>
+Norman Heino <norman.heino@gmail.com>
 Oleksandr Sochka <sasha.sochka@gmail.com>
 Ori Livneh <ori.livneh@gmail.com>
 Paul Redmond <paul.redmond@gmail.com>
+Raghu Raja <raghu@enfabrica.net>
 Radoslav Yovchev <radoslav.tm@gmail.com>
+Rainer Orth <ro@cebitec.uni-bielefeld.de>
 Roman Lebedev <lebedev.ri@gmail.com>
 Sayan Bhattacharjee <aero.sayan@gmail.com>
+Shapr3D <google-contributors@shapr3d.com>
 Shuo Chen <chenshuo@chenshuo.com>
+Staffan Tjernstrom <staffantj@gmail.com>
 Steinar H. Gunderson <sgunderson@bigfoot.com>
 Stripe, Inc.
+Tobias Schmidt <tobias.schmidt@in.tum.de>
 Yixuan Qiu <yixuanq@gmail.com>
 Yusuke Suzuki <utatane.tea@gmail.com>
 Zbigniew Skowron <zbychs@gmail.com>
+Min-Yih Hsu <yihshyng223@gmail.com>
diff --git a/third_party/google_benchmark/src/BUILD.bazel b/third_party/google_benchmark/src/BUILD.bazel
new file mode 100644
index 0000000..9961616
--- /dev/null
+++ b/third_party/google_benchmark/src/BUILD.bazel
@@ -0,0 +1,83 @@
+licenses(["notice"])
+
+config_setting(
+    name = "qnx",
+    constraint_values = ["@platforms//os:qnx"],
+    values = {
+        "cpu": "x64_qnx",
+    },
+    visibility = [":__subpackages__"],
+)
+
+config_setting(
+    name = "windows",
+    constraint_values = ["@platforms//os:windows"],
+    values = {
+        "cpu": "x64_windows",
+    },
+    visibility = [":__subpackages__"],
+)
+
+config_setting(
+    name = "macos",
+    constraint_values = ["@platforms//os:macos"],
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "perfcounters",
+    define_values = {
+        "pfm": "1",
+    },
+    visibility = [":__subpackages__"],
+)
+
+cc_library(
+    name = "benchmark",
+    srcs = glob(
+        [
+            "src/*.cc",
+            "src/*.h",
+        ],
+        exclude = ["src/benchmark_main.cc"],
+    ),
+    hdrs = [
+        "include/benchmark/benchmark.h",
+        "include/benchmark/export.h",
+    ],
+    linkopts = select({
+        ":windows": ["-DEFAULTLIB:shlwapi.lib"],
+        "//conditions:default": ["-pthread"],
+    }),
+    strip_include_prefix = "include",
+    visibility = ["//visibility:public"],
+    # Only static linking is allowed; no .so will be produced.
+    # Using `defines` (i.e. not `local_defines`) means that no
+    # dependent rules need to bother about defining the macro.
+    linkstatic = True,
+    defines = [
+        "BENCHMARK_STATIC_DEFINE",
+    ] + select({
+        ":perfcounters": ["HAVE_LIBPFM"],
+        "//conditions:default": [],
+    }),
+    deps = select({
+        ":perfcounters": ["@libpfm//:libpfm"],
+        "//conditions:default": [],
+    }),
+)
+
+cc_library(
+    name = "benchmark_main",
+    srcs = ["src/benchmark_main.cc"],
+    hdrs = ["include/benchmark/benchmark.h", "include/benchmark/export.h"],
+    strip_include_prefix = "include",
+    visibility = ["//visibility:public"],
+    deps = [":benchmark"],
+)
+
+cc_library(
+    name = "benchmark_internal_headers",
+    hdrs = glob(["src/*.h"]),
+    visibility = ["//test:__pkg__"],
+)
diff --git a/third_party/google_benchmark/src/CMakeLists.txt b/third_party/google_benchmark/src/CMakeLists.txt
index eab1428..f884fae 100644
--- a/third_party/google_benchmark/src/CMakeLists.txt
+++ b/third_party/google_benchmark/src/CMakeLists.txt
@@ -1,112 +1,337 @@
-# Allow the source files to find headers in src/
-include(GNUInstallDirs)
-include_directories(${PROJECT_SOURCE_DIR}/src)
+# Require CMake 3.10. If available, use the policies up to CMake 3.22.
+cmake_minimum_required (VERSION 3.10...3.22)
 
-if (DEFINED BENCHMARK_CXX_LINKER_FLAGS)
-  list(APPEND CMAKE_SHARED_LINKER_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS})
-  list(APPEND CMAKE_MODULE_LINKER_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS})
+project (benchmark VERSION 1.7.1 LANGUAGES CXX)
+
+option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
+option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
+option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF)
+option(BENCHMARK_USE_LIBCXX "Build and test using libc++ as the standard library." OFF)
+option(BENCHMARK_ENABLE_WERROR "Build Release candidates with -Werror." ON)
+option(BENCHMARK_FORCE_WERROR "Build Release candidates with -Werror regardless of compiler issues." OFF)
+
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "PGI")
+  # PGC++ may be reporting false positives.
+  set(BENCHMARK_ENABLE_WERROR OFF)
+endif()
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "NVHPC")
+  set(BENCHMARK_ENABLE_WERROR OFF)
+endif()
+if(BENCHMARK_FORCE_WERROR)
+  set(BENCHMARK_ENABLE_WERROR ON)
+endif(BENCHMARK_FORCE_WERROR)
+
+if(NOT MSVC)
+  option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library." OFF)
+else()
+  set(BENCHMARK_BUILD_32_BITS OFF CACHE BOOL "Build a 32 bit version of the library - unsupported when using MSVC)" FORCE)
+endif()
+option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark. (Projects embedding benchmark may want to turn this OFF.)" ON)
+option(BENCHMARK_ENABLE_DOXYGEN "Build documentation with Doxygen." OFF)
+option(BENCHMARK_INSTALL_DOCS "Enable installation of documentation." ON)
+
+# Allow unmet dependencies to be met using CMake's ExternalProject mechanics, which
+# may require downloading the source code.
+option(BENCHMARK_DOWNLOAD_DEPENDENCIES "Allow the downloading and in-tree building of unmet dependencies" OFF)
+
+# This option can be used to disable building and running unit tests which depend on gtest
+# in cases where it is not possible to build or find a valid version of gtest.
+option(BENCHMARK_ENABLE_GTEST_TESTS "Enable building the unit tests which depend on gtest" ON)
+option(BENCHMARK_USE_BUNDLED_GTEST "Use bundled GoogleTest. If disabled, the find_package(GTest) will be used." ON)
+
+option(BENCHMARK_ENABLE_LIBPFM "Enable performance counters provided by libpfm" OFF)
+
+# Export only public symbols
+set(CMAKE_CXX_VISIBILITY_PRESET hidden)
+set(CMAKE_VISIBILITY_INLINES_HIDDEN ON)
+
+if(MSVC)
+    # As of CMake 3.18, CMAKE_SYSTEM_PROCESSOR is not set properly for MSVC and
+    # cross-compilation (e.g. Host=x86_64, target=aarch64) requires using the
+    # undocumented, but working variable.
+    # See https://gitlab.kitware.com/cmake/cmake/-/issues/15170
+    set(CMAKE_SYSTEM_PROCESSOR ${MSVC_CXX_ARCHITECTURE_ID})
+    if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ARM")
+      set(CMAKE_CROSSCOMPILING TRUE)
+    endif()
 endif()
 
-file(GLOB
-  SOURCE_FILES
-    *.cc
-    ${PROJECT_SOURCE_DIR}/include/benchmark/*.h
-    ${CMAKE_CURRENT_SOURCE_DIR}/*.h)
-file(GLOB BENCHMARK_MAIN "benchmark_main.cc")
-foreach(item ${BENCHMARK_MAIN})
-  list(REMOVE_ITEM SOURCE_FILES "${item}")
-endforeach()
+set(ENABLE_ASSEMBLY_TESTS_DEFAULT OFF)
+function(should_enable_assembly_tests)
+  if(CMAKE_BUILD_TYPE)
+    string(TOLOWER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_LOWER)
+    if (${CMAKE_BUILD_TYPE_LOWER} MATCHES "coverage")
+      # FIXME: The --coverage flag needs to be removed when building assembly
+      # tests for this to work.
+      return()
+    endif()
+  endif()
+  if (MSVC)
+    return()
+  elseif(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
+    return()
+  elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
+    # FIXME: Make these work on 32 bit builds
+    return()
+  elseif(BENCHMARK_BUILD_32_BITS)
+     # FIXME: Make these work on 32 bit builds
+    return()
+  endif()
+  find_program(LLVM_FILECHECK_EXE FileCheck)
+  if (LLVM_FILECHECK_EXE)
+    set(LLVM_FILECHECK_EXE "${LLVM_FILECHECK_EXE}" CACHE PATH "llvm filecheck" FORCE)
+    message(STATUS "LLVM FileCheck Found: ${LLVM_FILECHECK_EXE}")
+  else()
+    message(STATUS "Failed to find LLVM FileCheck")
+    return()
+  endif()
+  set(ENABLE_ASSEMBLY_TESTS_DEFAULT ON PARENT_SCOPE)
+endfunction()
+should_enable_assembly_tests()
 
-add_library(benchmark ${SOURCE_FILES})
-set_target_properties(benchmark PROPERTIES
-  OUTPUT_NAME "benchmark"
-  VERSION ${GENERIC_LIB_VERSION}
-  SOVERSION ${GENERIC_LIB_SOVERSION}
-)
-target_include_directories(benchmark PUBLIC
-    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
-    )
+# This option disables the building and running of the assembly verification tests
+option(BENCHMARK_ENABLE_ASSEMBLY_TESTS "Enable building and running the assembly tests"
+    ${ENABLE_ASSEMBLY_TESTS_DEFAULT})
 
-# Link threads.
-target_link_libraries(benchmark  ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-find_library(LIBRT rt)
-if(LIBRT)
-  target_link_libraries(benchmark ${LIBRT})
+# Make sure we can import our CMake functions
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+
+
+# Read the git tags to determine the project version
+include(GetGitVersion)
+get_git_version(GIT_VERSION)
+
+# If no git version can be determined, use the version
+# from the project() command
+if ("${GIT_VERSION}" STREQUAL "0.0.0")
+  set(VERSION "${benchmark_VERSION}")
+else()
+  set(VERSION "${GIT_VERSION}")
+endif()
+# Tell the user what versions we are using
+message(STATUS "Version: ${VERSION}")
+
+# The version of the libraries
+set(GENERIC_LIB_VERSION ${VERSION})
+string(SUBSTRING ${VERSION} 0 1 GENERIC_LIB_SOVERSION)
+
+# Import our CMake modules
+include(AddCXXCompilerFlag)
+include(CheckCXXCompilerFlag)
+include(CheckLibraryExists)
+include(CXXFeatureCheck)
+
+check_library_exists(rt shm_open "" HAVE_LIB_RT)
+
+if (BENCHMARK_BUILD_32_BITS)
+  add_required_cxx_compiler_flag(-m32)
 endif()
 
-if(CMAKE_BUILD_TYPE)
-  string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UPPER)
-endif()
-if(NOT CMAKE_THREAD_LIBS_INIT AND "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}}" MATCHES ".*-fsanitize=[^ ]*address.*")
-  message(WARNING "CMake's FindThreads.cmake did not fail, but CMAKE_THREAD_LIBS_INIT ended up being empty. This was fixed in https://github.com/Kitware/CMake/commit/d53317130e84898c5328c237186dbd995aaf1c12 Let's guess that -pthread is sufficient.")
-  target_link_libraries(benchmark -pthread)
+if (MSVC)
+  set(BENCHMARK_CXX_STANDARD 14)
+else()
+  set(BENCHMARK_CXX_STANDARD 11)
 endif()
 
-# We need extra libraries on Windows
-if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
-  target_link_libraries(benchmark shlwapi)
+set(CMAKE_CXX_STANDARD ${BENCHMARK_CXX_STANDARD})
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+if (MSVC)
+  # Turn compiler warnings up to 11
+  string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4")
+  add_definitions(-D_CRT_SECURE_NO_WARNINGS)
+
+  if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
+    add_cxx_compiler_flag(-EHs-)
+    add_cxx_compiler_flag(-EHa-)
+    add_definitions(-D_HAS_EXCEPTIONS=0)
+  endif()
+  # Link time optimisation
+  if (BENCHMARK_ENABLE_LTO)
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /GL")
+    set(CMAKE_STATIC_LINKER_FLAGS_RELEASE "${CMAKE_STATIC_LINKER_FLAGS_RELEASE} /LTCG")
+    set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /LTCG")
+    set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LTCG")
+
+    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /GL")
+    string(REGEX REPLACE "[-/]INCREMENTAL" "/INCREMENTAL:NO" CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO}")
+    set(CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO} /LTCG")
+    string(REGEX REPLACE "[-/]INCREMENTAL" "/INCREMENTAL:NO" CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO}")
+    set(CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO} /LTCG")
+    string(REGEX REPLACE "[-/]INCREMENTAL" "/INCREMENTAL:NO" CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO}")
+    set(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO} /LTCG")
+
+    set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /GL")
+    set(CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL "${CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL} /LTCG")
+    set(CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL "${CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL} /LTCG")
+    set(CMAKE_EXE_LINKER_FLAGS_MINSIZEREL "${CMAKE_EXE_LINKER_FLAGS_MINSIZEREL} /LTCG")
+  endif()
+else()
+  # Turn compiler warnings up to 11
+  add_cxx_compiler_flag(-Wall)
+  add_cxx_compiler_flag(-Wextra)
+  add_cxx_compiler_flag(-Wshadow)
+  add_cxx_compiler_flag(-Wfloat-equal)
+  if(BENCHMARK_ENABLE_WERROR)
+      add_cxx_compiler_flag(-Werror)
+  endif()
+  if (NOT BENCHMARK_ENABLE_TESTING)
+    # Disable warning when compiling tests as gtest does not use 'override'.
+    add_cxx_compiler_flag(-Wsuggest-override)
+  endif()
+  add_cxx_compiler_flag(-pedantic)
+  add_cxx_compiler_flag(-pedantic-errors)
+  add_cxx_compiler_flag(-Wshorten-64-to-32)
+  add_cxx_compiler_flag(-fstrict-aliasing)
+  # Disable warnings regarding deprecated parts of the library while building
+  # and testing those parts of the library.
+  add_cxx_compiler_flag(-Wno-deprecated-declarations)
+  if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+    # Intel silently ignores '-Wno-deprecated-declarations',
+    # warning no. 1786 must be explicitly disabled.
+    # See #631 for rationale.
+    add_cxx_compiler_flag(-wd1786)
+  endif()
+  # Disable deprecation warnings for release builds (when -Werror is enabled).
+  if(BENCHMARK_ENABLE_WERROR)
+      add_cxx_compiler_flag(-Wno-deprecated)
+  endif()
+  if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
+    add_cxx_compiler_flag(-fno-exceptions)
+  endif()
+
+  if (HAVE_CXX_FLAG_FSTRICT_ALIASING)
+    if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel") #ICC17u2: Many false positives for Wstrict-aliasing
+      add_cxx_compiler_flag(-Wstrict-aliasing)
+    endif()
+  endif()
+  # ICC17u2: overloaded virtual function "benchmark::Fixture::SetUp" is only partially overridden
+  # (because of deprecated overload)
+  add_cxx_compiler_flag(-wd654)
+  add_cxx_compiler_flag(-Wthread-safety)
+  if (HAVE_CXX_FLAG_WTHREAD_SAFETY)
+    cxx_feature_check(THREAD_SAFETY_ATTRIBUTES "-DINCLUDE_DIRECTORIES=${PROJECT_SOURCE_DIR}/include")
+  endif()
+
+  # On most UNIX like platforms g++ and clang++ define _GNU_SOURCE as a
+  # predefined macro, which turns on all of the wonderful libc extensions.
+  # However g++ doesn't do this in Cygwin so we have to define it ourselves
+  # since we depend on GNU/POSIX/BSD extensions.
+  if (CYGWIN)
+    add_definitions(-D_GNU_SOURCE=1)
+  endif()
+
+  if (QNXNTO)
+    add_definitions(-D_QNX_SOURCE)
+  endif()
+
+  # Link time optimisation
+  if (BENCHMARK_ENABLE_LTO)
+    add_cxx_compiler_flag(-flto)
+    add_cxx_compiler_flag(-Wno-lto-type-mismatch)
+    if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
+      find_program(GCC_AR gcc-ar)
+      if (GCC_AR)
+        set(CMAKE_AR ${GCC_AR})
+      endif()
+      find_program(GCC_RANLIB gcc-ranlib)
+      if (GCC_RANLIB)
+        set(CMAKE_RANLIB ${GCC_RANLIB})
+      endif()
+    elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+      include(llvm-toolchain)
+    endif()
+  endif()
+
+  # Coverage build type
+  set(BENCHMARK_CXX_FLAGS_COVERAGE "${CMAKE_CXX_FLAGS_DEBUG}"
+    CACHE STRING "Flags used by the C++ compiler during coverage builds."
+    FORCE)
+  set(BENCHMARK_EXE_LINKER_FLAGS_COVERAGE "${CMAKE_EXE_LINKER_FLAGS_DEBUG}"
+    CACHE STRING "Flags used for linking binaries during coverage builds."
+    FORCE)
+  set(BENCHMARK_SHARED_LINKER_FLAGS_COVERAGE "${CMAKE_SHARED_LINKER_FLAGS_DEBUG}"
+    CACHE STRING "Flags used by the shared libraries linker during coverage builds."
+    FORCE)
+  mark_as_advanced(
+    BENCHMARK_CXX_FLAGS_COVERAGE
+    BENCHMARK_EXE_LINKER_FLAGS_COVERAGE
+    BENCHMARK_SHARED_LINKER_FLAGS_COVERAGE)
+  set(CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" CACHE STRING
+    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel Coverage.")
+  add_cxx_compiler_flag(--coverage COVERAGE)
 endif()
 
-# We need extra libraries on Solaris
-if(${CMAKE_SYSTEM_NAME} MATCHES "SunOS")
-  target_link_libraries(benchmark kstat)
+if (BENCHMARK_USE_LIBCXX)
+  if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+    add_cxx_compiler_flag(-stdlib=libc++)
+  elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR
+          "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
+    add_cxx_compiler_flag(-nostdinc++)
+    message(WARNING "libc++ header path must be manually specified using CMAKE_CXX_FLAGS")
+    # Adding -nodefaultlibs directly to CMAKE_<TYPE>_LINKER_FLAGS will break
+    # configuration checks such as 'find_package(Threads)'
+    list(APPEND BENCHMARK_CXX_LINKER_FLAGS -nodefaultlibs)
+    # -lc++ cannot be added directly to CMAKE_<TYPE>_LINKER_FLAGS because
+    # linker flags appear before all linker inputs and -lc++ must appear after.
+    list(APPEND BENCHMARK_CXX_LIBRARIES c++)
+  else()
+    message(FATAL_ERROR "-DBENCHMARK_USE_LIBCXX:BOOL=ON is not supported for compiler")
+  endif()
+endif(BENCHMARK_USE_LIBCXX)
+
+set(EXTRA_CXX_FLAGS "")
+if (WIN32 AND "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+  # Clang on Windows fails to compile the regex feature check under C++11
+  set(EXTRA_CXX_FLAGS "-DCMAKE_CXX_STANDARD=14")
 endif()
 
-# Benchmark main library
-add_library(benchmark_main "benchmark_main.cc")
-set_target_properties(benchmark_main PROPERTIES
-  OUTPUT_NAME "benchmark_main"
-  VERSION ${GENERIC_LIB_VERSION}
-  SOVERSION ${GENERIC_LIB_SOVERSION}
-)
-target_include_directories(benchmark PUBLIC
-    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
-    )
-target_link_libraries(benchmark_main benchmark)
+# C++ feature checks
+# Determine the correct regular expression engine to use
+cxx_feature_check(STD_REGEX ${EXTRA_CXX_FLAGS})
+cxx_feature_check(GNU_POSIX_REGEX ${EXTRA_CXX_FLAGS})
+cxx_feature_check(POSIX_REGEX ${EXTRA_CXX_FLAGS})
+if(NOT HAVE_STD_REGEX AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
+  message(FATAL_ERROR "Failed to determine the source files for the regular expression backend")
+endif()
+if (NOT BENCHMARK_ENABLE_EXCEPTIONS AND HAVE_STD_REGEX
+        AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
+  message(WARNING "Using std::regex with exceptions disabled is not fully supported")
+endif()
 
+cxx_feature_check(STEADY_CLOCK)
+# Ensure we have pthreads
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+find_package(Threads REQUIRED)
+cxx_feature_check(PTHREAD_AFFINITY)
 
-set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated")
+if (BENCHMARK_ENABLE_LIBPFM)
+  find_package(PFM)
+endif()
 
-set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
-set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake")
-set(pkg_config "${generated_dir}/${PROJECT_NAME}.pc")
-set(targets_export_name "${PROJECT_NAME}Targets")
+# Set up directories
+include_directories(${PROJECT_SOURCE_DIR}/include)
 
-set(namespace "${PROJECT_NAME}::")
+# Build the targets
+add_subdirectory(src)
 
-include(CMakePackageConfigHelpers)
-write_basic_package_version_file(
-  "${version_config}" VERSION ${GENERIC_LIB_VERSION} COMPATIBILITY SameMajorVersion
-)
-
-configure_file("${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in" "${project_config}" @ONLY)
-configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark.pc.in" "${pkg_config}" @ONLY)
-
-if (BENCHMARK_ENABLE_INSTALL)
-  # Install target (will install the library to specified CMAKE_INSTALL_PREFIX variable)
-  install(
-    TARGETS benchmark benchmark_main
-    EXPORT ${targets_export_name}
-    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
-    INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
-
-  install(
-    DIRECTORY "${PROJECT_SOURCE_DIR}/include/benchmark"
-    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
-    FILES_MATCHING PATTERN "*.*h")
-
-  install(
-      FILES "${project_config}" "${version_config}"
-      DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
-
-  install(
-      FILES "${pkg_config}"
-      DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
-
-  install(
-      EXPORT "${targets_export_name}"
-      NAMESPACE "${namespace}"
-      DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
+if (BENCHMARK_ENABLE_TESTING)
+  enable_testing()
+  if (BENCHMARK_ENABLE_GTEST_TESTS AND
+      NOT (TARGET gtest AND TARGET gtest_main AND
+           TARGET gmock AND TARGET gmock_main))
+    if (BENCHMARK_USE_BUNDLED_GTEST)
+      include(GoogleTest)
+    else()
+      find_package(GTest CONFIG REQUIRED)
+      add_library(gtest ALIAS GTest::gtest)
+      add_library(gtest_main ALIAS GTest::gtest_main)
+      add_library(gmock ALIAS GTest::gmock)
+      add_library(gmock_main ALIAS GTest::gmock_main)
+    endif()
+  endif()
+  add_subdirectory(test)
 endif()
diff --git a/third_party/google_benchmark/CONTRIBUTING.md b/third_party/google_benchmark/src/CONTRIBUTING.md
similarity index 100%
rename from third_party/google_benchmark/CONTRIBUTING.md
rename to third_party/google_benchmark/src/CONTRIBUTING.md
diff --git a/third_party/google_benchmark/CONTRIBUTORS b/third_party/google_benchmark/src/CONTRIBUTORS
similarity index 80%
rename from third_party/google_benchmark/CONTRIBUTORS
rename to third_party/google_benchmark/src/CONTRIBUTORS
index 6b64a00..56f03e2 100644
--- a/third_party/google_benchmark/CONTRIBUTORS
+++ b/third_party/google_benchmark/src/CONTRIBUTORS
@@ -22,12 +22,16 @@
 #
 # Please keep the list sorted.
 
+Abhina Sreeskantharajan <abhina.sreeskantharajan@ibm.com>
 Albert Pretorius <pretoalb@gmail.com>
 Alex Steele <steelal123@gmail.com>
 Andriy Berestovskyy <berestovskyy@gmail.com>
 Arne Beer <arne@twobeer.de>
+Bátor Tallér <bator.taller@shapr3d.com>
 Billy Robert O'Neal III <billy.oneal@gmail.com> <bion@microsoft.com>
+Cezary Skrzyński <czars1988@gmail.com>
 Chris Kennelly <ckennelly@google.com> <ckennelly@ckennelly.com>
+Christian Wassermann <christian_wassermann@web.de>
 Christopher Seymour <chris.j.seymour@hotmail.com>
 Colin Braley <braley.colin@gmail.com>
 Cyrille Faucheux <cyrille.faucheux@gmail.com>
@@ -36,41 +40,55 @@
 Deniz Evrenci <denizevrenci@gmail.com>
 Dominic Hamon <dma@stripysock.com> <dominic@google.com>
 Dominik Czarnota <dominik.b.czarnota@gmail.com>
+Dominik Korman <kormandominik@gmail.com>
+Donald Aingworth <donalds_junk_mail@yahoo.com>
 Eric Backus <eric_backus@alum.mit.edu>
 Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
 Evgeny Safronov <division494@gmail.com>
+Fanbo Meng <fanbo.meng@ibm.com>
 Federico Ficarelli <federico.ficarelli@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
 Geoffrey Martin-Noble <gcmn@google.com> <gmngeoffrey@gmail.com>
+Gergő Szitár <szitar.gergo@gmail.com>
 Hannes Hauswedell <h2@fsfe.org>
+Henrique Bucher <hbucher@gmail.com>
 Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
 Jern-Kuan Leong <jernkuan@gmail.com>
 JianXiong Zhou <zhoujianxiong2@gmail.com>
 Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
 John Millikin <jmillikin@stripe.com>
+Jordan Williams <jwillikers@protonmail.com>
 Jussi Knuuttila <jussi.knuuttila@gmail.com>
 Kai Wolf <kai.wolf@gmail.com>
 Kaito Udagawa <umireon@gmail.com>
 Kishan Kumar <kumar.kishan@outlook.com>
 Lei Xu <eddyxu@gmail.com>
+Marcel Jacobse <mjacobse@uni-bremen.de>
 Matt Clarkson <mattyclarkson@gmail.com>
 Maxim Vafin <maxvafin@gmail.com>
+Mike Apodaca <gatorfax@gmail.com>
 Nick Hutchinson <nshutchinson@gmail.com>
+Norman Heino <norman.heino@gmail.com>
 Oleksandr Sochka <sasha.sochka@gmail.com>
 Ori Livneh <ori.livneh@gmail.com>
 Pascal Leroy <phl@google.com>
 Paul Redmond <paul.redmond@gmail.com>
 Pierre Phaneuf <pphaneuf@google.com>
 Radoslav Yovchev <radoslav.tm@gmail.com>
+Rainer Orth <ro@cebitec.uni-bielefeld.de>
+Raghu Raja <raghu@enfabrica.net>
 Raul Marin <rmrodriguez@cartodb.com>
 Ray Glover <ray.glover@uk.ibm.com>
 Robert Guo <robert.guo@mongodb.com>
 Roman Lebedev <lebedev.ri@gmail.com>
 Sayan Bhattacharjee <aero.sayan@gmail.com>
 Shuo Chen <chenshuo@chenshuo.com>
+Steven Wan <wan.yu@ibm.com>
+Tobias Schmidt <tobias.schmidt@in.tum.de>
 Tobias Ulvgård <tobias.ulvgard@dirac.se>
 Tom Madams <tom.ej.madams@gmail.com> <tmadams@google.com>
 Yixuan Qiu <yixuanq@gmail.com>
 Yusuke Suzuki <utatane.tea@gmail.com>
 Zbigniew Skowron <zbychs@gmail.com>
+Min-Yih Hsu <yihshyng223@gmail.com>
diff --git a/third_party/google_benchmark/LICENSE b/third_party/google_benchmark/src/LICENSE
similarity index 100%
rename from third_party/google_benchmark/LICENSE
rename to third_party/google_benchmark/src/LICENSE
diff --git a/third_party/google_benchmark/src/METADATA b/third_party/google_benchmark/src/METADATA
new file mode 100644
index 0000000..0434052
--- /dev/null
+++ b/third_party/google_benchmark/src/METADATA
@@ -0,0 +1,14 @@
+name: "google_benchmark-source"
+
+third_party {
+  identifier {
+    type: "ChromiumVersion"
+    value: "114.0.5735.358"  # from https://chromereleases.googleblog.com/2024/03/long-term-support-channel-update-for_26.html
+  }
+  identifier {
+    type: "Git"
+    value: "https://chromium.googlesource.com/external/github.com/google/benchmark.git"
+    version: "b177433f3ee2513b1075140c723d73ab8901790f"
+    # from https://chromium.googlesource.com/chromium/src/+/114.0.5735.358/DEPS#1125
+  }
+}
diff --git a/third_party/google_benchmark/src/README.md b/third_party/google_benchmark/src/README.md
new file mode 100644
index 0000000..b64048b
--- /dev/null
+++ b/third_party/google_benchmark/src/README.md
@@ -0,0 +1,224 @@
+# Benchmark
+
+[![build-and-test](https://github.com/google/benchmark/workflows/build-and-test/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Abuild-and-test)
+[![bazel](https://github.com/google/benchmark/actions/workflows/bazel.yml/badge.svg)](https://github.com/google/benchmark/actions/workflows/bazel.yml)
+[![pylint](https://github.com/google/benchmark/workflows/pylint/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Apylint)
+[![test-bindings](https://github.com/google/benchmark/workflows/test-bindings/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Atest-bindings)
+
+[![Build Status](https://travis-ci.org/google/benchmark.svg?branch=main)](https://travis-ci.org/google/benchmark)
+[![Coverage Status](https://coveralls.io/repos/google/benchmark/badge.svg)](https://coveralls.io/r/google/benchmark)
+
+
+A library to benchmark code snippets, similar to unit tests. Example:
+
+```c++
+#include <benchmark/benchmark.h>
+
+static void BM_SomeFunction(benchmark::State& state) {
+  // Perform setup here
+  for (auto _ : state) {
+    // This code gets timed
+    SomeFunction();
+  }
+}
+// Register the function as a benchmark
+BENCHMARK(BM_SomeFunction);
+// Run the benchmark
+BENCHMARK_MAIN();
+```
+
+## Getting Started
+
+To get started, see [Requirements](#requirements) and
+[Installation](#installation). See [Usage](#usage) for a full example and the
+[User Guide](docs/user_guide.md) for a more comprehensive feature overview.
+
+It may also help to read the [Google Test documentation](https://github.com/google/googletest/blob/main/docs/primer.md)
+as some of the structural aspects of the APIs are similar.
+
+## Resources
+
+[Discussion group](https://groups.google.com/d/forum/benchmark-discuss)
+
+IRC channels:
+* [libera](https://libera.chat) #benchmark
+
+[Additional Tooling Documentation](docs/tools.md)
+
+[Assembly Testing Documentation](docs/AssemblyTests.md)
+
+[Building and installing Python bindings](docs/python_bindings.md)
+
+## Requirements
+
+The library can be used with C++03. However, it requires C++11 to build,
+including compiler and standard library support.
+
+The following minimum versions are required to build the library:
+
+* GCC 4.8
+* Clang 3.4
+* Visual Studio 14 2015
+* Intel 2015 Update 1
+
+See [Platform-Specific Build Instructions](docs/platform_specific_build_instructions.md).
+
+## Installation
+
+This describes the installation process using cmake. As pre-requisites, you'll
+need git and cmake installed.
+
+_See [dependencies.md](docs/dependencies.md) for more details regarding supported
+versions of build tools._
+
+```bash
+# Check out the library.
+$ git clone https://github.com/google/benchmark.git
+# Go to the library root directory
+$ cd benchmark
+# Make a build directory to place the build output.
+$ cmake -E make_directory "build"
+# Generate build system files with cmake, and download any dependencies.
+$ cmake -E chdir "build" cmake -DBENCHMARK_DOWNLOAD_DEPENDENCIES=on -DCMAKE_BUILD_TYPE=Release ../
+# or, starting with CMake 3.13, use a simpler form:
+# cmake -DCMAKE_BUILD_TYPE=Release -S . -B "build"
+# Build the library.
+$ cmake --build "build" --config Release
+```
+This builds the `benchmark` and `benchmark_main` libraries and tests.
+On a unix system, the build directory should now look something like this:
+
+```
+/benchmark
+  /build
+    /src
+      /libbenchmark.a
+      /libbenchmark_main.a
+    /test
+      ...
+```
+
+Next, you can run the tests to check the build.
+
+```bash
+$ cmake -E chdir "build" ctest --build-config Release
+```
+
+If you want to install the library globally, also run:
+
+```
+sudo cmake --build "build" --config Release --target install
+```
+
+Note that Google Benchmark requires Google Test to build and run the tests. This
+dependency can be provided two ways:
+
+* Checkout the Google Test sources into `benchmark/googletest`.
+* Otherwise, if `-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON` is specified during
+  configuration as above, the library will automatically download and build
+  any required dependencies.
+
+If you do not wish to build and run the tests, add `-DBENCHMARK_ENABLE_GTEST_TESTS=OFF`
+to `CMAKE_ARGS`.
+
+### Debug vs Release
+
+By default, benchmark builds as a debug library. You will see a warning in the
+output when this is the case. To build it as a release library instead, add
+`-DCMAKE_BUILD_TYPE=Release` when generating the build system files, as shown
+above. The use of `--config Release` in build commands is needed to properly
+support multi-configuration tools (like Visual Studio for example) and can be
+skipped for other build systems (like Makefile).
+
+To enable link-time optimisation, also add `-DBENCHMARK_ENABLE_LTO=true` when
+generating the build system files.
+
+If you are using gcc, you might need to set `GCC_AR` and `GCC_RANLIB` cmake
+cache variables, if autodetection fails.
+
+If you are using clang, you may need to set `LLVMAR_EXECUTABLE`,
+`LLVMNM_EXECUTABLE` and `LLVMRANLIB_EXECUTABLE` cmake cache variables.
+
+To enable sanitizer checks (e.g., `asan`), add the flags below; for `tsan`, use `-fsanitize=thread` in place of `-fsanitize=address` (the two cannot be combined in one build):
+```
+ -DCMAKE_C_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=address -fno-sanitize-recover=all"
+ -DCMAKE_CXX_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=address -fno-sanitize-recover=all"
+```
+
+### Stable and Experimental Library Versions
+
+The main branch contains the latest stable version of the benchmarking library;
+the API of which can be considered largely stable, with source breaking changes
+being made only upon the release of a new major version.
+
+Newer, experimental, features are implemented and tested on the
+[`v2` branch](https://github.com/google/benchmark/tree/v2). Users who wish
+to use, test, and provide feedback on the new features are encouraged to try
+this branch. However, this branch provides no stability guarantees and reserves
+the right to change and break the API at any time.
+
+## Usage
+
+### Basic usage
+
+Define a function that executes the code to measure, register it as a benchmark
+function using the `BENCHMARK` macro, and ensure an appropriate `main` function
+is available:
+
+```c++
+#include <benchmark/benchmark.h>
+
+static void BM_StringCreation(benchmark::State& state) {
+  for (auto _ : state)
+    std::string empty_string;
+}
+// Register the function as a benchmark
+BENCHMARK(BM_StringCreation);
+
+// Define another benchmark
+static void BM_StringCopy(benchmark::State& state) {
+  std::string x = "hello";
+  for (auto _ : state)
+    std::string copy(x);
+}
+BENCHMARK(BM_StringCopy);
+
+BENCHMARK_MAIN();
+```
+
+To run the benchmark, compile and link against the `benchmark` library
+(libbenchmark.a/.so). If you followed the build steps above, this library will 
+be under the build directory you created.
+
+```bash
+# Example on linux after running the build steps above. Assumes the
+# `benchmark` and `build` directories are under the current directory.
+$ g++ mybenchmark.cc -std=c++11 -isystem benchmark/include \
+  -Lbenchmark/build/src -lbenchmark -lpthread -o mybenchmark
+```
+
+Alternatively, link against the `benchmark_main` library and remove
+`BENCHMARK_MAIN();` above to get the same behavior.
+
+The compiled executable will run all benchmarks by default. Pass the `--help`
+flag for option information or see the [User Guide](docs/user_guide.md).
+
+### Usage with CMake
+
+If using CMake, it is recommended to link against the project-provided
+`benchmark::benchmark` and `benchmark::benchmark_main` targets using
+`target_link_libraries`.
+It is possible to use ```find_package``` to import an installed version of the
+library.
+```cmake
+find_package(benchmark REQUIRED)
+```
+Alternatively, `add_subdirectory` will incorporate the library directly
+into one's CMake project.
+```cmake
+add_subdirectory(benchmark)
+```
+Either way, link to the library as follows.
+```cmake
+target_link_libraries(MyTarget benchmark::benchmark)
+```
diff --git a/third_party/google_benchmark/src/WORKSPACE b/third_party/google_benchmark/src/WORKSPACE
new file mode 100644
index 0000000..74e7ebc
--- /dev/null
+++ b/third_party/google_benchmark/src/WORKSPACE
@@ -0,0 +1,22 @@
+workspace(name = "com_github_google_benchmark")
+
+load("//:bazel/benchmark_deps.bzl", "benchmark_deps")
+
+benchmark_deps()
+
+load("@rules_foreign_cc//foreign_cc:repositories.bzl", "rules_foreign_cc_dependencies")
+
+rules_foreign_cc_dependencies()
+
+load("@rules_python//python:pip.bzl", pip3_install="pip_install")
+
+pip3_install(
+   name = "py_deps",
+   requirements = "//:requirements.txt",
+)
+
+new_local_repository(
+    name = "python_headers",
+    build_file = "@//bindings/python:python_headers.BUILD",
+    path = "<PYTHON_INCLUDE_PATH>",  # May be overwritten by setup.py.
+)
diff --git a/third_party/google_benchmark/_config.yml b/third_party/google_benchmark/src/_config.yml
similarity index 65%
rename from third_party/google_benchmark/_config.yml
rename to third_party/google_benchmark/src/_config.yml
index 1885487..1fa5ff8 100644
--- a/third_party/google_benchmark/_config.yml
+++ b/third_party/google_benchmark/src/_config.yml
@@ -1 +1,2 @@
-theme: jekyll-theme-midnight
\ No newline at end of file
+theme: jekyll-theme-midnight
+markdown: GFM
diff --git a/third_party/google_benchmark/appveyor.yml b/third_party/google_benchmark/src/appveyor.yml
similarity index 100%
rename from third_party/google_benchmark/appveyor.yml
rename to third_party/google_benchmark/src/appveyor.yml
diff --git a/third_party/google_benchmark/src/bazel/benchmark_deps.bzl b/third_party/google_benchmark/src/bazel/benchmark_deps.bzl
new file mode 100644
index 0000000..e9ca2ce
--- /dev/null
+++ b/third_party/google_benchmark/src/bazel/benchmark_deps.bzl
@@ -0,0 +1,66 @@
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+load("@bazel_tools//tools/build_defs/repo:git.bzl", "new_git_repository")
+
+def benchmark_deps():
+    """Loads dependencies required to build Google Benchmark."""
+
+    if "bazel_skylib" not in native.existing_rules():
+        http_archive(
+            name = "bazel_skylib",
+            sha256 = "f7be3474d42aae265405a592bb7da8e171919d74c16f082a5457840f06054728",
+            urls = [
+                "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.2.1/bazel-skylib-1.2.1.tar.gz",
+                "https://github.com/bazelbuild/bazel-skylib/releases/download/1.2.1/bazel-skylib-1.2.1.tar.gz",
+            ],
+        )
+
+    if "rules_foreign_cc" not in native.existing_rules():
+        http_archive(
+            name = "rules_foreign_cc",
+            sha256 = "bcd0c5f46a49b85b384906daae41d277b3dc0ff27c7c752cc51e43048a58ec83",
+            strip_prefix = "rules_foreign_cc-0.7.1",
+            url = "https://github.com/bazelbuild/rules_foreign_cc/archive/0.7.1.tar.gz",
+        )
+
+    if "rules_python" not in native.existing_rules():
+        http_archive(
+            name = "rules_python",
+            url = "https://github.com/bazelbuild/rules_python/releases/download/0.1.0/rules_python-0.1.0.tar.gz",
+            sha256 = "b6d46438523a3ec0f3cead544190ee13223a52f6a6765a29eae7b7cc24cc83a0",
+        )
+
+    if "com_google_absl" not in native.existing_rules():
+        http_archive(
+            name = "com_google_absl",
+            sha256 = "f41868f7a938605c92936230081175d1eae87f6ea2c248f41077c8f88316f111",
+            strip_prefix = "abseil-cpp-20200225.2",
+            urls = ["https://github.com/abseil/abseil-cpp/archive/20200225.2.tar.gz"],
+        )
+
+    if "com_google_googletest" not in native.existing_rules():
+        new_git_repository(
+            name = "com_google_googletest",
+            remote = "https://github.com/google/googletest.git",
+            tag = "release-1.11.0",
+        )
+
+    if "nanobind" not in native.existing_rules():
+        new_git_repository(
+            name = "nanobind",
+            remote = "https://github.com/wjakob/nanobind.git",
+            commit = "1ffbfe836c9dac599496a170274ee0075094a607", # v0.2.0
+            shallow_since = "1677873085 +0100",
+            build_file = "@//bindings/python:nanobind.BUILD",
+            recursive_init_submodules = True,
+        )
+
+    if "libpfm" not in native.existing_rules():
+        # Downloaded from v4.11.0 tag at https://sourceforge.net/p/perfmon2/libpfm4/ref/master/tags/
+        http_archive(
+            name = "libpfm",
+            build_file = str(Label("//tools:libpfm.BUILD.bazel")),
+            sha256 = "5da5f8872bde14b3634c9688d980f68bda28b510268723cc12973eedbab9fecc",
+            type = "tar.gz",
+            strip_prefix = "libpfm-4.11.0",
+            urls = ["https://sourceforge.net/projects/perfmon2/files/libpfm4/libpfm-4.11.0.tar.gz/download"],
+        )
diff --git a/third_party/google_benchmark/src/benchmark.cc b/third_party/google_benchmark/src/benchmark.cc
deleted file mode 100644
index b751b9c..0000000
--- a/third_party/google_benchmark/src/benchmark.cc
+++ /dev/null
@@ -1,499 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "benchmark/benchmark.h"
-#include "benchmark_api_internal.h"
-#include "benchmark_runner.h"
-#include "internal_macros.h"
-
-#ifndef BENCHMARK_OS_WINDOWS
-#ifndef BENCHMARK_OS_FUCHSIA
-#include <sys/resource.h>
-#endif
-#include <sys/time.h>
-#include <unistd.h>
-#endif
-
-#include <algorithm>
-#include <atomic>
-#include <condition_variable>
-#include <cstdio>
-#include <cstdlib>
-#include <fstream>
-#include <iostream>
-#include <memory>
-#include <string>
-#include <thread>
-#include <utility>
-
-#include "check.h"
-#include "colorprint.h"
-#include "commandlineflags.h"
-#include "complexity.h"
-#include "counter.h"
-#include "internal_macros.h"
-#include "log.h"
-#include "mutex.h"
-#include "re.h"
-#include "statistics.h"
-#include "string_util.h"
-#include "thread_manager.h"
-#include "thread_timer.h"
-
-// Print a list of benchmarks. This option overrides all other options.
-DEFINE_bool(benchmark_list_tests, false);
-
-// A regular expression that specifies the set of benchmarks to execute.  If
-// this flag is empty, or if this flag is the string \"all\", all benchmarks
-// linked into the binary are run.
-DEFINE_string(benchmark_filter, ".");
-
-// Minimum number of seconds we should run benchmark before results are
-// considered significant.  For cpu-time based tests, this is the lower bound
-// on the total cpu time used by all threads that make up the test.  For
-// real-time based tests, this is the lower bound on the elapsed time of the
-// benchmark execution, regardless of number of threads.
-DEFINE_double(benchmark_min_time, 0.5);
-
-// The number of runs of each benchmark. If greater than 1, the mean and
-// standard deviation of the runs will be reported.
-DEFINE_int32(benchmark_repetitions, 1);
-
-// Report the result of each benchmark repetitions. When 'true' is specified
-// only the mean, standard deviation, and other statistics are reported for
-// repeated benchmarks. Affects all reporters.
-DEFINE_bool(benchmark_report_aggregates_only, false);
-
-// Display the result of each benchmark repetitions. When 'true' is specified
-// only the mean, standard deviation, and other statistics are displayed for
-// repeated benchmarks. Unlike benchmark_report_aggregates_only, only affects
-// the display reporter, but  *NOT* file reporter, which will still contain
-// all the output.
-DEFINE_bool(benchmark_display_aggregates_only, false);
-
-// The format to use for console output.
-// Valid values are 'console', 'json', or 'csv'.
-DEFINE_string(benchmark_format, "console");
-
-// The format to use for file output.
-// Valid values are 'console', 'json', or 'csv'.
-DEFINE_string(benchmark_out_format, "json");
-
-// The file to write additional output to.
-DEFINE_string(benchmark_out, "");
-
-// Whether to use colors in the output.  Valid values:
-// 'true'/'yes'/1, 'false'/'no'/0, and 'auto'. 'auto' means to use colors if
-// the output is being sent to a terminal and the TERM environment variable is
-// set to a terminal type that supports colors.
-DEFINE_string(benchmark_color, "auto");
-
-// Whether to use tabular format when printing user counters to the console.
-// Valid values: 'true'/'yes'/1, 'false'/'no'/0.  Defaults to false.
-DEFINE_bool(benchmark_counters_tabular, false);
-
-// The level of verbose logging to output
-DEFINE_int32(v, 0);
-
-namespace benchmark {
-
-namespace internal {
-
-// FIXME: wouldn't LTO mess this up?
-void UseCharPointer(char const volatile*) {}
-
-}  // namespace internal
-
-State::State(IterationCount max_iters, const std::vector<int64_t>& ranges,
-             int thread_i, int n_threads, internal::ThreadTimer* timer,
-             internal::ThreadManager* manager)
-    : total_iterations_(0),
-      batch_leftover_(0),
-      max_iterations(max_iters),
-      started_(false),
-      finished_(false),
-      error_occurred_(false),
-      range_(ranges),
-      complexity_n_(0),
-      counters(),
-      thread_index(thread_i),
-      threads(n_threads),
-      timer_(timer),
-      manager_(manager) {
-  CHECK(max_iterations != 0) << "At least one iteration must be run";
-  CHECK_LT(thread_index, threads) << "thread_index must be less than threads";
-
-  // Note: The use of offsetof below is technically undefined until C++17
-  // because State is not a standard layout type. However, all compilers
-  // currently provide well-defined behavior as an extension (which is
-  // demonstrated since constexpr evaluation must diagnose all undefined
-  // behavior). However, GCC and Clang also warn about this use of offsetof,
-  // which must be suppressed.
-#if defined(__INTEL_COMPILER)
-#pragma warning push
-#pragma warning(disable : 1875)
-#elif defined(__GNUC__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Winvalid-offsetof"
-#endif
-  // Offset tests to ensure commonly accessed data is on the first cache line.
-  const int cache_line_size = 64;
-  static_assert(offsetof(State, error_occurred_) <=
-                    (cache_line_size - sizeof(error_occurred_)),
-                "");
-#if defined(__INTEL_COMPILER)
-#pragma warning pop
-#elif defined(__GNUC__)
-#pragma GCC diagnostic pop
-#endif
-}
-
-void State::PauseTiming() {
-  // Add in time accumulated so far
-  CHECK(started_ && !finished_ && !error_occurred_);
-  timer_->StopTimer();
-}
-
-void State::ResumeTiming() {
-  CHECK(started_ && !finished_ && !error_occurred_);
-  timer_->StartTimer();
-}
-
-void State::SkipWithError(const char* msg) {
-  CHECK(msg);
-  error_occurred_ = true;
-  {
-    MutexLock l(manager_->GetBenchmarkMutex());
-    if (manager_->results.has_error_ == false) {
-      manager_->results.error_message_ = msg;
-      manager_->results.has_error_ = true;
-    }
-  }
-  total_iterations_ = 0;
-  if (timer_->running()) timer_->StopTimer();
-}
-
-void State::SetIterationTime(double seconds) {
-  timer_->SetIterationTime(seconds);
-}
-
-void State::SetLabel(const char* label) {
-  MutexLock l(manager_->GetBenchmarkMutex());
-  manager_->results.report_label_ = label;
-}
-
-void State::StartKeepRunning() {
-  CHECK(!started_ && !finished_);
-  started_ = true;
-  total_iterations_ = error_occurred_ ? 0 : max_iterations;
-  manager_->StartStopBarrier();
-  if (!error_occurred_) ResumeTiming();
-}
-
-void State::FinishKeepRunning() {
-  CHECK(started_ && (!finished_ || error_occurred_));
-  if (!error_occurred_) {
-    PauseTiming();
-  }
-  // Total iterations has now wrapped around past 0. Fix this.
-  total_iterations_ = 0;
-  finished_ = true;
-  manager_->StartStopBarrier();
-}
-
-namespace internal {
-namespace {
-
-void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
-                   BenchmarkReporter* display_reporter,
-                   BenchmarkReporter* file_reporter) {
-  // Note the file_reporter can be null.
-  CHECK(display_reporter != nullptr);
-
-  // Determine the width of the name field using a minimum width of 10.
-  bool might_have_aggregates = FLAGS_benchmark_repetitions > 1;
-  size_t name_field_width = 10;
-  size_t stat_field_width = 0;
-  for (const BenchmarkInstance& benchmark : benchmarks) {
-    name_field_width =
-        std::max<size_t>(name_field_width, benchmark.name.str().size());
-    might_have_aggregates |= benchmark.repetitions > 1;
-
-    for (const auto& Stat : *benchmark.statistics)
-      stat_field_width = std::max<size_t>(stat_field_width, Stat.name_.size());
-  }
-  if (might_have_aggregates) name_field_width += 1 + stat_field_width;
-
-  // Print header here
-  BenchmarkReporter::Context context;
-  context.name_field_width = name_field_width;
-
-  // Keep track of running times of all instances of current benchmark
-  std::vector<BenchmarkReporter::Run> complexity_reports;
-
-  // We flush streams after invoking reporter methods that write to them. This
-  // ensures users get timely updates even when streams are not line-buffered.
-  auto flushStreams = [](BenchmarkReporter* reporter) {
-    if (!reporter) return;
-    std::flush(reporter->GetOutputStream());
-    std::flush(reporter->GetErrorStream());
-  };
-
-  if (display_reporter->ReportContext(context) &&
-      (!file_reporter || file_reporter->ReportContext(context))) {
-    flushStreams(display_reporter);
-    flushStreams(file_reporter);
-
-    for (const auto& benchmark : benchmarks) {
-      RunResults run_results = RunBenchmark(benchmark, &complexity_reports);
-
-      auto report = [&run_results](BenchmarkReporter* reporter,
-                                   bool report_aggregates_only) {
-        assert(reporter);
-        // If there are no aggregates, do output non-aggregates.
-        report_aggregates_only &= !run_results.aggregates_only.empty();
-        if (!report_aggregates_only)
-          reporter->ReportRuns(run_results.non_aggregates);
-        if (!run_results.aggregates_only.empty())
-          reporter->ReportRuns(run_results.aggregates_only);
-      };
-
-      report(display_reporter, run_results.display_report_aggregates_only);
-      if (file_reporter)
-        report(file_reporter, run_results.file_report_aggregates_only);
-
-      flushStreams(display_reporter);
-      flushStreams(file_reporter);
-    }
-  }
-  display_reporter->Finalize();
-  if (file_reporter) file_reporter->Finalize();
-  flushStreams(display_reporter);
-  flushStreams(file_reporter);
-}
-
-// Disable deprecated warnings temporarily because we need to reference
-// CSVReporter but don't want to trigger -Werror=-Wdeprecated
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated"
-#endif
-
-std::unique_ptr<BenchmarkReporter> CreateReporter(
-    std::string const& name, ConsoleReporter::OutputOptions output_opts) {
-  typedef std::unique_ptr<BenchmarkReporter> PtrType;
-  if (name == "console") {
-    return PtrType(new ConsoleReporter(output_opts));
-  } else if (name == "json") {
-    return PtrType(new JSONReporter);
-  } else if (name == "csv") {
-    return PtrType(new CSVReporter);
-  } else {
-    std::cerr << "Unexpected format: '" << name << "'\n";
-    std::exit(1);
-  }
-}
-
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
-
-}  // end namespace
-
-bool IsZero(double n) {
-  return std::abs(n) < std::numeric_limits<double>::epsilon();
-}
-
-ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color) {
-  int output_opts = ConsoleReporter::OO_Defaults;
-  auto is_benchmark_color = [force_no_color]() -> bool {
-    if (force_no_color) {
-      return false;
-    }
-    if (FLAGS_benchmark_color == "auto") {
-      return IsColorTerminal();
-    }
-    return IsTruthyFlagValue(FLAGS_benchmark_color);
-  };
-  if (is_benchmark_color()) {
-    output_opts |= ConsoleReporter::OO_Color;
-  } else {
-    output_opts &= ~ConsoleReporter::OO_Color;
-  }
-  if (FLAGS_benchmark_counters_tabular) {
-    output_opts |= ConsoleReporter::OO_Tabular;
-  } else {
-    output_opts &= ~ConsoleReporter::OO_Tabular;
-  }
-  return static_cast<ConsoleReporter::OutputOptions>(output_opts);
-}
-
-}  // end namespace internal
-
-size_t RunSpecifiedBenchmarks() {
-  return RunSpecifiedBenchmarks(nullptr, nullptr);
-}
-
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter) {
-  return RunSpecifiedBenchmarks(display_reporter, nullptr);
-}
-
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
-                              BenchmarkReporter* file_reporter) {
-  std::string spec = FLAGS_benchmark_filter;
-  if (spec.empty() || spec == "all")
-    spec = ".";  // Regexp that matches all benchmarks
-
-  // Setup the reporters
-  std::ofstream output_file;
-  std::unique_ptr<BenchmarkReporter> default_display_reporter;
-  std::unique_ptr<BenchmarkReporter> default_file_reporter;
-  if (!display_reporter) {
-    default_display_reporter = internal::CreateReporter(
-        FLAGS_benchmark_format, internal::GetOutputOptions());
-    display_reporter = default_display_reporter.get();
-  }
-  auto& Out = display_reporter->GetOutputStream();
-  auto& Err = display_reporter->GetErrorStream();
-
-  std::string const& fname = FLAGS_benchmark_out;
-  if (fname.empty() && file_reporter) {
-    Err << "A custom file reporter was provided but "
-           "--benchmark_out=<file> was not specified."
-        << std::endl;
-    std::exit(1);
-  }
-  if (!fname.empty()) {
-    output_file.open(fname);
-    if (!output_file.is_open()) {
-      Err << "invalid file name: '" << fname << std::endl;
-      std::exit(1);
-    }
-    if (!file_reporter) {
-      default_file_reporter = internal::CreateReporter(
-          FLAGS_benchmark_out_format, ConsoleReporter::OO_None);
-      file_reporter = default_file_reporter.get();
-    }
-    file_reporter->SetOutputStream(&output_file);
-    file_reporter->SetErrorStream(&output_file);
-  }
-
-  std::vector<internal::BenchmarkInstance> benchmarks;
-  if (!FindBenchmarksInternal(spec, &benchmarks, &Err)) return 0;
-
-  if (benchmarks.empty()) {
-    Err << "Failed to match any benchmarks against regex: " << spec << "\n";
-    return 0;
-  }
-
-  if (FLAGS_benchmark_list_tests) {
-    for (auto const& benchmark : benchmarks)
-      Out << benchmark.name.str() << "\n";
-  } else {
-    internal::RunBenchmarks(benchmarks, display_reporter, file_reporter);
-  }
-
-  return benchmarks.size();
-}
-
-void RegisterMemoryManager(MemoryManager* manager) {
-  internal::memory_manager = manager;
-}
-
-namespace internal {
-
-void PrintUsageAndExit() {
-  fprintf(stdout,
-          "benchmark"
-          " [--benchmark_list_tests={true|false}]\n"
-          "          [--benchmark_filter=<regex>]\n"
-          "          [--benchmark_min_time=<min_time>]\n"
-          "          [--benchmark_repetitions=<num_repetitions>]\n"
-          "          [--benchmark_report_aggregates_only={true|false}]\n"
-          "          [--benchmark_display_aggregates_only={true|false}]\n"
-          "          [--benchmark_format=<console|json|csv>]\n"
-          "          [--benchmark_out=<filename>]\n"
-          "          [--benchmark_out_format=<json|console|csv>]\n"
-          "          [--benchmark_color={auto|true|false}]\n"
-          "          [--benchmark_counters_tabular={true|false}]\n"
-          "          [--v=<verbosity>]\n");
-  exit(0);
-}
-
-void ParseCommandLineFlags(int* argc, char** argv) {
-  using namespace benchmark;
-  BenchmarkReporter::Context::executable_name =
-      (argc && *argc > 0) ? argv[0] : "unknown";
-  for (int i = 1; argc && i < *argc; ++i) {
-    if (ParseBoolFlag(argv[i], "benchmark_list_tests",
-                      &FLAGS_benchmark_list_tests) ||
-        ParseStringFlag(argv[i], "benchmark_filter", &FLAGS_benchmark_filter) ||
-        ParseDoubleFlag(argv[i], "benchmark_min_time",
-                        &FLAGS_benchmark_min_time) ||
-        ParseInt32Flag(argv[i], "benchmark_repetitions",
-                       &FLAGS_benchmark_repetitions) ||
-        ParseBoolFlag(argv[i], "benchmark_report_aggregates_only",
-                      &FLAGS_benchmark_report_aggregates_only) ||
-        ParseBoolFlag(argv[i], "benchmark_display_aggregates_only",
-                      &FLAGS_benchmark_display_aggregates_only) ||
-        ParseStringFlag(argv[i], "benchmark_format", &FLAGS_benchmark_format) ||
-        ParseStringFlag(argv[i], "benchmark_out", &FLAGS_benchmark_out) ||
-        ParseStringFlag(argv[i], "benchmark_out_format",
-                        &FLAGS_benchmark_out_format) ||
-        ParseStringFlag(argv[i], "benchmark_color", &FLAGS_benchmark_color) ||
-        // "color_print" is the deprecated name for "benchmark_color".
-        // TODO: Remove this.
-        ParseStringFlag(argv[i], "color_print", &FLAGS_benchmark_color) ||
-        ParseBoolFlag(argv[i], "benchmark_counters_tabular",
-                      &FLAGS_benchmark_counters_tabular) ||
-        ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
-      for (int j = i; j != *argc - 1; ++j) argv[j] = argv[j + 1];
-
-      --(*argc);
-      --i;
-    } else if (IsFlag(argv[i], "help")) {
-      PrintUsageAndExit();
-    }
-  }
-  for (auto const* flag :
-       {&FLAGS_benchmark_format, &FLAGS_benchmark_out_format})
-    if (*flag != "console" && *flag != "json" && *flag != "csv") {
-      PrintUsageAndExit();
-    }
-  if (FLAGS_benchmark_color.empty()) {
-    PrintUsageAndExit();
-  }
-}
-
-int InitializeStreams() {
-  static std::ios_base::Init init;
-  return 0;
-}
-
-}  // end namespace internal
-
-void Initialize(int* argc, char** argv) {
-  internal::ParseCommandLineFlags(argc, argv);
-  internal::LogLevel() = FLAGS_v;
-}
-
-bool ReportUnrecognizedArguments(int argc, char** argv) {
-  for (int i = 1; i < argc; ++i) {
-    fprintf(stderr, "%s: error: unrecognized command-line flag: %s\n", argv[0],
-            argv[i]);
-  }
-  return argc > 1;
-}
-
-}  // end namespace benchmark
diff --git a/third_party/google_benchmark/src/benchmark_api_internal.cc b/third_party/google_benchmark/src/benchmark_api_internal.cc
deleted file mode 100644
index d468a25..0000000
--- a/third_party/google_benchmark/src/benchmark_api_internal.cc
+++ /dev/null
@@ -1,15 +0,0 @@
-#include "benchmark_api_internal.h"
-
-namespace benchmark {
-namespace internal {
-
-State BenchmarkInstance::Run(IterationCount iters, int thread_id,
-                             internal::ThreadTimer* timer,
-                             internal::ThreadManager* manager) const {
-  State st(iters, arg, thread_id, threads, timer, manager);
-  benchmark->Run(st);
-  return st;
-}
-
-}  // internal
-}  // benchmark
diff --git a/third_party/google_benchmark/src/benchmark_api_internal.h b/third_party/google_benchmark/src/benchmark_api_internal.h
deleted file mode 100644
index 264eff9..0000000
--- a/third_party/google_benchmark/src/benchmark_api_internal.h
+++ /dev/null
@@ -1,53 +0,0 @@
-#ifndef BENCHMARK_API_INTERNAL_H
-#define BENCHMARK_API_INTERNAL_H
-
-#include "benchmark/benchmark.h"
-#include "commandlineflags.h"
-
-#include <cmath>
-#include <iosfwd>
-#include <limits>
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace benchmark {
-namespace internal {
-
-// Information kept per benchmark we may want to run
-struct BenchmarkInstance {
-  BenchmarkName name;
-  Benchmark* benchmark;
-  AggregationReportMode aggregation_report_mode;
-  std::vector<int64_t> arg;
-  TimeUnit time_unit;
-  int range_multiplier;
-  bool measure_process_cpu_time;
-  bool use_real_time;
-  bool use_manual_time;
-  BigO complexity;
-  BigOFunc* complexity_lambda;
-  UserCounters counters;
-  const std::vector<Statistics>* statistics;
-  bool last_benchmark_instance;
-  int repetitions;
-  double min_time;
-  IterationCount iterations;
-  int threads;  // Number of concurrent threads to us
-
-  State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
-            internal::ThreadManager* manager) const;
-};
-
-bool FindBenchmarksInternal(const std::string& re,
-                            std::vector<BenchmarkInstance>* benchmarks,
-                            std::ostream* Err);
-
-bool IsZero(double n);
-
-ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false);
-
-}  // end namespace internal
-}  // end namespace benchmark
-
-#endif  // BENCHMARK_API_INTERNAL_H
diff --git a/third_party/google_benchmark/src/benchmark_runner.cc b/third_party/google_benchmark/src/benchmark_runner.cc
deleted file mode 100644
index 337fac1..0000000
--- a/third_party/google_benchmark/src/benchmark_runner.cc
+++ /dev/null
@@ -1,361 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "benchmark_runner.h"
-#include "benchmark/benchmark.h"
-#include "benchmark_api_internal.h"
-#include "internal_macros.h"
-
-#ifndef BENCHMARK_OS_WINDOWS
-#ifndef BENCHMARK_OS_FUCHSIA
-#include <sys/resource.h>
-#endif
-#include <sys/time.h>
-#include <unistd.h>
-#endif
-
-#include <algorithm>
-#include <atomic>
-#include <condition_variable>
-#include <cstdio>
-#include <cstdlib>
-#include <fstream>
-#include <iostream>
-#include <memory>
-#include <string>
-#include <thread>
-#include <utility>
-
-#include "check.h"
-#include "colorprint.h"
-#include "commandlineflags.h"
-#include "complexity.h"
-#include "counter.h"
-#include "internal_macros.h"
-#include "log.h"
-#include "mutex.h"
-#include "re.h"
-#include "statistics.h"
-#include "string_util.h"
-#include "thread_manager.h"
-#include "thread_timer.h"
-
-namespace benchmark {
-
-namespace internal {
-
-MemoryManager* memory_manager = nullptr;
-
-namespace {
-
-static constexpr IterationCount kMaxIterations = 1000000000;
-
-BenchmarkReporter::Run CreateRunReport(
-    const benchmark::internal::BenchmarkInstance& b,
-    const internal::ThreadManager::Result& results,
-    IterationCount memory_iterations,
-    const MemoryManager::Result& memory_result, double seconds,
-    int64_t repetition_index) {
-  // Create report about this benchmark run.
-  BenchmarkReporter::Run report;
-
-  report.run_name = b.name;
-  report.error_occurred = results.has_error_;
-  report.error_message = results.error_message_;
-  report.report_label = results.report_label_;
-  // This is the total iterations across all threads.
-  report.iterations = results.iterations;
-  report.time_unit = b.time_unit;
-  report.threads = b.threads;
-  report.repetition_index = repetition_index;
-  report.repetitions = b.repetitions;
-
-  if (!report.error_occurred) {
-    if (b.use_manual_time) {
-      report.real_accumulated_time = results.manual_time_used;
-    } else {
-      report.real_accumulated_time = results.real_time_used;
-    }
-    report.cpu_accumulated_time = results.cpu_time_used;
-    report.complexity_n = results.complexity_n;
-    report.complexity = b.complexity;
-    report.complexity_lambda = b.complexity_lambda;
-    report.statistics = b.statistics;
-    report.counters = results.counters;
-
-    if (memory_iterations > 0) {
-      report.has_memory_result = true;
-      report.allocs_per_iter =
-          memory_iterations ? static_cast<double>(memory_result.num_allocs) /
-                                  memory_iterations
-                            : 0;
-      report.max_bytes_used = memory_result.max_bytes_used;
-    }
-
-    internal::Finish(&report.counters, results.iterations, seconds, b.threads);
-  }
-  return report;
-}
-
-// Execute one thread of benchmark b for the specified number of iterations.
-// Adds the stats collected for the thread into *total.
-void RunInThread(const BenchmarkInstance* b, IterationCount iters,
-                 int thread_id, ThreadManager* manager) {
-  internal::ThreadTimer timer(
-      b->measure_process_cpu_time
-          ? internal::ThreadTimer::CreateProcessCpuTime()
-          : internal::ThreadTimer::Create());
-  State st = b->Run(iters, thread_id, &timer, manager);
-  CHECK(st.iterations() >= st.max_iterations)
-      << "Benchmark returned before State::KeepRunning() returned false!";
-  {
-    MutexLock l(manager->GetBenchmarkMutex());
-    internal::ThreadManager::Result& results = manager->results;
-    results.iterations += st.iterations();
-    results.cpu_time_used += timer.cpu_time_used();
-    results.real_time_used += timer.real_time_used();
-    results.manual_time_used += timer.manual_time_used();
-    results.complexity_n += st.complexity_length_n();
-    internal::Increment(&results.counters, st.counters);
-  }
-  manager->NotifyThreadComplete();
-}
-
-class BenchmarkRunner {
- public:
-  BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
-                  std::vector<BenchmarkReporter::Run>* complexity_reports_)
-      : b(b_),
-        complexity_reports(*complexity_reports_),
-        min_time(!IsZero(b.min_time) ? b.min_time : FLAGS_benchmark_min_time),
-        repeats(b.repetitions != 0 ? b.repetitions
-                                   : FLAGS_benchmark_repetitions),
-        has_explicit_iteration_count(b.iterations != 0),
-        pool(b.threads - 1),
-        iters(has_explicit_iteration_count ? b.iterations : 1) {
-    run_results.display_report_aggregates_only =
-        (FLAGS_benchmark_report_aggregates_only ||
-         FLAGS_benchmark_display_aggregates_only);
-    run_results.file_report_aggregates_only =
-        FLAGS_benchmark_report_aggregates_only;
-    if (b.aggregation_report_mode != internal::ARM_Unspecified) {
-      run_results.display_report_aggregates_only =
-          (b.aggregation_report_mode &
-           internal::ARM_DisplayReportAggregatesOnly);
-      run_results.file_report_aggregates_only =
-          (b.aggregation_report_mode & internal::ARM_FileReportAggregatesOnly);
-    }
-
-    for (int repetition_num = 0; repetition_num < repeats; repetition_num++) {
-      DoOneRepetition(repetition_num);
-    }
-
-    // Calculate additional statistics
-    run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
-
-    // Maybe calculate complexity report
-    if ((b.complexity != oNone) && b.last_benchmark_instance) {
-      auto additional_run_stats = ComputeBigO(complexity_reports);
-      run_results.aggregates_only.insert(run_results.aggregates_only.end(),
-                                         additional_run_stats.begin(),
-                                         additional_run_stats.end());
-      complexity_reports.clear();
-    }
-  }
-
-  RunResults&& get_results() { return std::move(run_results); }
-
- private:
-  RunResults run_results;
-
-  const benchmark::internal::BenchmarkInstance& b;
-  std::vector<BenchmarkReporter::Run>& complexity_reports;
-
-  const double min_time;
-  const int repeats;
-  const bool has_explicit_iteration_count;
-
-  std::vector<std::thread> pool;
-
-  IterationCount iters;  // preserved between repetitions!
-  // So only the first repetition has to find/calculate it,
-  // the other repetitions will just use that precomputed iteration count.
-
-  struct IterationResults {
-    internal::ThreadManager::Result results;
-    IterationCount iters;
-    double seconds;
-  };
-  IterationResults DoNIterations() {
-    VLOG(2) << "Running " << b.name.str() << " for " << iters << "\n";
-
-    std::unique_ptr<internal::ThreadManager> manager;
-    manager.reset(new internal::ThreadManager(b.threads));
-
-    // Run all but one thread in separate threads
-    for (std::size_t ti = 0; ti < pool.size(); ++ti) {
-      pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
-                             manager.get());
-    }
-    // And run one thread here directly.
-    // (If we were asked to run just one thread, we don't create new threads.)
-    // Yes, we need to do this here *after* we start the separate threads.
-    RunInThread(&b, iters, 0, manager.get());
-
-    // The main thread has finished. Now let's wait for the other threads.
-    manager->WaitForAllThreads();
-    for (std::thread& thread : pool) thread.join();
-
-    IterationResults i;
-    // Acquire the measurements/counters from the manager, UNDER THE LOCK!
-    {
-      MutexLock l(manager->GetBenchmarkMutex());
-      i.results = manager->results;
-    }
-
-    // And get rid of the manager.
-    manager.reset();
-
-    // Adjust real/manual time stats since they were reported per thread.
-    i.results.real_time_used /= b.threads;
-    i.results.manual_time_used /= b.threads;
-    // If we were measuring whole-process CPU usage, adjust the CPU time too.
-    if (b.measure_process_cpu_time) i.results.cpu_time_used /= b.threads;
-
-    VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
-            << i.results.real_time_used << "\n";
-
-    // So for how long were we running?
-    i.iters = iters;
-    // Base decisions off of real time if requested by this benchmark.
-    i.seconds = i.results.cpu_time_used;
-    if (b.use_manual_time) {
-      i.seconds = i.results.manual_time_used;
-    } else if (b.use_real_time) {
-      i.seconds = i.results.real_time_used;
-    }
-
-    return i;
-  }
-
-  IterationCount PredictNumItersNeeded(const IterationResults& i) const {
-    // See how much iterations should be increased by.
-    // Note: Avoid division by zero with max(seconds, 1ns).
-    double multiplier = min_time * 1.4 / std::max(i.seconds, 1e-9);
-    // If our last run was at least 10% of FLAGS_benchmark_min_time then we
-    // use the multiplier directly.
-    // Otherwise we use at most 10 times expansion.
-    // NOTE: When the last run was at least 10% of the min time the max
-    // expansion should be 14x.
-    bool is_significant = (i.seconds / min_time) > 0.1;
-    multiplier = is_significant ? multiplier : std::min(10.0, multiplier);
-    if (multiplier <= 1.0) multiplier = 2.0;
-
-    // So what seems to be the sufficiently-large iteration count? Round up.
-    const IterationCount max_next_iters =
-        std::lround(std::max(multiplier * i.iters, i.iters + 1.0));
-    // But we do have *some* sanity limits though..
-    const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);
-
-    VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
-    return next_iters;  // round up before conversion to integer.
-  }
-
-  bool ShouldReportIterationResults(const IterationResults& i) const {
-    // Determine if this run should be reported;
-    // Either it has run for a sufficient amount of time
-    // or because an error was reported.
-    return i.results.has_error_ ||
-           i.iters >= kMaxIterations ||  // Too many iterations already.
-           i.seconds >= min_time ||      // The elapsed time is large enough.
-           // CPU time is specified but the elapsed real time greatly exceeds
-           // the minimum time.
-           // Note that user provided timers are except from this sanity check.
-           ((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time);
-  }
-
-  void DoOneRepetition(int64_t repetition_index) {
-    const bool is_the_first_repetition = repetition_index == 0;
-    IterationResults i;
-
-    // We *may* be gradually increasing the length (iteration count)
-    // of the benchmark until we decide the results are significant.
-    // And once we do, we report those last results and exit.
-    // Please do note that the if there are repetitions, the iteration count
-    // is *only* calculated for the *first* repetition, and other repetitions
-    // simply use that precomputed iteration count.
-    for (;;) {
-      i = DoNIterations();
-
-      // Do we consider the results to be significant?
-      // If we are doing repetitions, and the first repetition was already done,
-      // it has calculated the correct iteration time, so we have run that very
-      // iteration count just now. No need to calculate anything. Just report.
-      // Else, the normal rules apply.
-      const bool results_are_significant = !is_the_first_repetition ||
-                                           has_explicit_iteration_count ||
-                                           ShouldReportIterationResults(i);
-
-      if (results_are_significant) break;  // Good, let's report them!
-
-      // Nope, bad iteration. Let's re-estimate the hopefully-sufficient
-      // iteration count, and run the benchmark again...
-
-      iters = PredictNumItersNeeded(i);
-      assert(iters > i.iters &&
-             "if we did more iterations than we want to do the next time, "
-             "then we should have accepted the current iteration run.");
-    }
-
-    // Oh, one last thing, we need to also produce the 'memory measurements'..
-    MemoryManager::Result memory_result;
-    IterationCount memory_iterations = 0;
-    if (memory_manager != nullptr) {
-      // Only run a few iterations to reduce the impact of one-time
-      // allocations in benchmarks that are not properly managed.
-      memory_iterations = std::min<IterationCount>(16, iters);
-      memory_manager->Start();
-      std::unique_ptr<internal::ThreadManager> manager;
-      manager.reset(new internal::ThreadManager(1));
-      RunInThread(&b, memory_iterations, 0, manager.get());
-      manager->WaitForAllThreads();
-      manager.reset();
-
-      memory_manager->Stop(&memory_result);
-    }
-
-    // Ok, now actualy report.
-    BenchmarkReporter::Run report =
-        CreateRunReport(b, i.results, memory_iterations, memory_result,
-                        i.seconds, repetition_index);
-
-    if (!report.error_occurred && b.complexity != oNone)
-      complexity_reports.push_back(report);
-
-    run_results.non_aggregates.push_back(report);
-  }
-};
-
-}  // end namespace
-
-RunResults RunBenchmark(
-    const benchmark::internal::BenchmarkInstance& b,
-    std::vector<BenchmarkReporter::Run>* complexity_reports) {
-  internal::BenchmarkRunner r(b, complexity_reports);
-  return r.get_results();
-}
-
-}  // end namespace internal
-
-}  // end namespace benchmark
diff --git a/third_party/google_benchmark/src/benchmark_runner.h b/third_party/google_benchmark/src/benchmark_runner.h
deleted file mode 100644
index 96e8282..0000000
--- a/third_party/google_benchmark/src/benchmark_runner.h
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef BENCHMARK_RUNNER_H_
-#define BENCHMARK_RUNNER_H_
-
-#include "benchmark_api_internal.h"
-#include "internal_macros.h"
-
-DECLARE_double(benchmark_min_time);
-
-DECLARE_int32(benchmark_repetitions);
-
-DECLARE_bool(benchmark_report_aggregates_only);
-
-DECLARE_bool(benchmark_display_aggregates_only);
-
-namespace benchmark {
-
-namespace internal {
-
-extern MemoryManager* memory_manager;
-
-struct RunResults {
-  std::vector<BenchmarkReporter::Run> non_aggregates;
-  std::vector<BenchmarkReporter::Run> aggregates_only;
-
-  bool display_report_aggregates_only = false;
-  bool file_report_aggregates_only = false;
-};
-
-RunResults RunBenchmark(
-    const benchmark::internal::BenchmarkInstance& b,
-    std::vector<BenchmarkReporter::Run>* complexity_reports);
-
-}  // namespace internal
-
-}  // end namespace benchmark
-
-#endif  // BENCHMARK_RUNNER_H_
diff --git a/third_party/google_benchmark/src/bindings/python/BUILD b/third_party/google_benchmark/src/bindings/python/BUILD
new file mode 100644
index 0000000..9559a76
--- /dev/null
+++ b/third_party/google_benchmark/src/bindings/python/BUILD
@@ -0,0 +1,3 @@
+exports_files(glob(["*.BUILD"]))
+exports_files(["build_defs.bzl"])
+
diff --git a/third_party/google_benchmark/src/bindings/python/build_defs.bzl b/third_party/google_benchmark/src/bindings/python/build_defs.bzl
new file mode 100644
index 0000000..009820a
--- /dev/null
+++ b/third_party/google_benchmark/src/bindings/python/build_defs.bzl
@@ -0,0 +1,25 @@
+_SHARED_LIB_SUFFIX = {
+    "//conditions:default": ".so",
+    "//:windows": ".dll",
+}
+
+def py_extension(name, srcs, hdrs = [], copts = [], features = [], deps = []):
+    for shared_lib_suffix in _SHARED_LIB_SUFFIX.values():
+        shared_lib_name = name + shared_lib_suffix
+        native.cc_binary(
+            name = shared_lib_name,
+            linkshared = True,
+            linkstatic = True,
+            srcs = srcs + hdrs,
+            copts = copts,
+            features = features,
+            deps = deps,
+        )
+
+    return native.py_library(
+        name = name,
+        data = select({
+            platform: [name + shared_lib_suffix]
+            for platform, shared_lib_suffix in _SHARED_LIB_SUFFIX.items()
+        }),
+    )
diff --git a/third_party/google_benchmark/src/bindings/python/google_benchmark/BUILD b/third_party/google_benchmark/src/bindings/python/google_benchmark/BUILD
new file mode 100644
index 0000000..89ec76e
--- /dev/null
+++ b/third_party/google_benchmark/src/bindings/python/google_benchmark/BUILD
@@ -0,0 +1,40 @@
+load("//bindings/python:build_defs.bzl", "py_extension")
+
+py_library(
+    name = "google_benchmark",
+    srcs = ["__init__.py"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":_benchmark",
+    ],
+)
+
+py_extension(
+    name = "_benchmark",
+    srcs = ["benchmark.cc"],
+    copts = [
+        "-fexceptions",
+        "-fno-strict-aliasing",
+    ],
+    features = [
+        "-use_header_modules",
+        "-parse_headers",
+    ],
+    deps = [
+        "//:benchmark",
+        "@nanobind",
+        "@python_headers",
+    ],
+)
+
+py_test(
+    name = "example",
+    srcs = ["example.py"],
+    python_version = "PY3",
+    srcs_version = "PY3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":google_benchmark",
+    ],
+)
+
diff --git a/third_party/google_benchmark/src/bindings/python/google_benchmark/__init__.py b/third_party/google_benchmark/src/bindings/python/google_benchmark/__init__.py
new file mode 100644
index 0000000..e6ef8e7
--- /dev/null
+++ b/third_party/google_benchmark/src/bindings/python/google_benchmark/__init__.py
@@ -0,0 +1,162 @@
+# Copyright 2020 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Python benchmarking utilities.
+
+Example usage:
+  import google_benchmark as benchmark
+
+  @benchmark.register
+  def my_benchmark(state):
+      ...  # Code executed outside `while` loop is not timed.
+
+      while state:
+        ...  # Code executed within `while` loop is timed.
+
+  if __name__ == '__main__':
+    benchmark.main()
+"""
+import atexit
+
+from absl import app
+from google_benchmark import _benchmark
+from google_benchmark._benchmark import (
+    Counter,
+    kNanosecond,
+    kMicrosecond,
+    kMillisecond,
+    kSecond,
+    oNone,
+    o1,
+    oN,
+    oNSquared,
+    oNCubed,
+    oLogN,
+    oNLogN,
+    oAuto,
+    oLambda,
+    State,
+)
+
+
+__all__ = [
+    "register",
+    "main",
+    "Counter",
+    "kNanosecond",
+    "kMicrosecond",
+    "kMillisecond",
+    "kSecond",
+    "oNone",
+    "o1",
+    "oN",
+    "oNSquared",
+    "oNCubed",
+    "oLogN",
+    "oNLogN",
+    "oAuto",
+    "oLambda",
+    "State",
+]
+
+__version__ = "1.7.1"
+
+
+class __OptionMaker:
+    """A stateless class to collect benchmark options.
+
+    Collect all decorator calls like @option.range(start=0, limit=1<<5).
+    """
+
+    class Options:
+        """Pure data class to store options calls, along with the benchmarked function."""
+
+        def __init__(self, func):
+            self.func = func
+            self.builder_calls = []
+
+    @classmethod
+    def make(cls, func_or_options):
+        """Make Options from Options or the benchmarked function."""
+        if isinstance(func_or_options, cls.Options):
+            return func_or_options
+        return cls.Options(func_or_options)
+
+    def __getattr__(self, builder_name):
+        """Append option call in the Options."""
+
+        # The function that gets returned by @option.range(start=0, limit=1<<5).
+        def __builder_method(*args, **kwargs):
+
+            # The decorator that gets called, either with the benchmarked function
+            # or with the previous Options
+            def __decorator(func_or_options):
+                options = self.make(func_or_options)
+                options.builder_calls.append((builder_name, args, kwargs))
+                # The decorator returns Options so it is not technically a decorator
+                # and needs a final call to @register
+                return options
+
+            return __decorator
+
+        return __builder_method
+
+
+# Alias for nicer API.
+# We have to instantiate an object, even if stateless, to be able to use __getattr__
+# on option.range
+option = __OptionMaker()
+
+
+def register(undefined=None, *, name=None):
+    """Register function for benchmarking."""
+    if undefined is None:
+        # Called with parentheses, e.g. @register(name="x"), so return the real decorator
+        return lambda f: register(f, name=name)
+
+    # We have either the function to benchmark (simple case) or an instance of Options
+    # (@option._ case).
+    options = __OptionMaker.make(undefined)
+
+    if name is None:
+        name = options.func.__name__
+
+    # We register the benchmark and reproduce all the @option._ calls onto the
+    # benchmark builder pattern
+    benchmark = _benchmark.RegisterBenchmark(name, options.func)
+    for name, args, kwargs in options.builder_calls[::-1]:
+        getattr(benchmark, name)(*args, **kwargs)
+
+    # return the benchmarked function because the decorator does not modify it
+    return options.func
+
+
+def _flags_parser(argv):
+    argv = _benchmark.Initialize(argv)
+    return app.parse_flags_with_usage(argv)
+
+
+def _run_benchmarks(argv):
+    if len(argv) > 1:
+        raise app.UsageError("Too many command-line arguments.")
+    return _benchmark.RunSpecifiedBenchmarks()
+
+
+def main(argv=None):
+    return app.run(_run_benchmarks, argv=argv, flags_parser=_flags_parser)
+
+
+# Methods for use with custom main function.
+initialize = _benchmark.Initialize
+run_benchmarks = _benchmark.RunSpecifiedBenchmarks
+atexit.register(_benchmark.ClearRegisteredBenchmarks)
diff --git a/third_party/google_benchmark/src/bindings/python/google_benchmark/benchmark.cc b/third_party/google_benchmark/src/bindings/python/google_benchmark/benchmark.cc
new file mode 100644
index 0000000..f444769
--- /dev/null
+++ b/third_party/google_benchmark/src/bindings/python/google_benchmark/benchmark.cc
@@ -0,0 +1,184 @@
+// Benchmark for Python.
+
+#include "benchmark/benchmark.h"
+
+#include "nanobind/nanobind.h"
+#include "nanobind/operators.h"
+#include "nanobind/stl/bind_map.h"
+#include "nanobind/stl/string.h"
+#include "nanobind/stl/vector.h"
+
+NB_MAKE_OPAQUE(benchmark::UserCounters);
+
+namespace {
+namespace nb = nanobind;
+
+std::vector<std::string> Initialize(const std::vector<std::string>& argv) {
+  // The `argv` pointers here become invalid when this function returns, but
+  // benchmark holds the pointer to `argv[0]`. We create a static copy of it
+  // so it persists, and replace the pointer below.
+  static std::string executable_name(argv[0]);
+  std::vector<char*> ptrs;
+  ptrs.reserve(argv.size());
+  for (auto& arg : argv) {
+    ptrs.push_back(const_cast<char*>(arg.c_str()));
+  }
+  ptrs[0] = const_cast<char*>(executable_name.c_str());
+  int argc = static_cast<int>(argv.size());
+  benchmark::Initialize(&argc, ptrs.data());
+  std::vector<std::string> remaining_argv;
+  remaining_argv.reserve(argc);
+  for (int i = 0; i < argc; ++i) {
+    remaining_argv.emplace_back(ptrs[i]);
+  }
+  return remaining_argv;
+}
+
+benchmark::internal::Benchmark* RegisterBenchmark(const std::string& name,
+                                                  nb::callable f) {
+  return benchmark::RegisterBenchmark(
+      name, [f](benchmark::State& state) { f(&state); });
+}
+
+NB_MODULE(_benchmark, m) {
+
+  using benchmark::TimeUnit;
+  nb::enum_<TimeUnit>(m, "TimeUnit")
+      .value("kNanosecond", TimeUnit::kNanosecond)
+      .value("kMicrosecond", TimeUnit::kMicrosecond)
+      .value("kMillisecond", TimeUnit::kMillisecond)
+      .value("kSecond", TimeUnit::kSecond)
+      .export_values();
+
+  using benchmark::BigO;
+  nb::enum_<BigO>(m, "BigO")
+      .value("oNone", BigO::oNone)
+      .value("o1", BigO::o1)
+      .value("oN", BigO::oN)
+      .value("oNSquared", BigO::oNSquared)
+      .value("oNCubed", BigO::oNCubed)
+      .value("oLogN", BigO::oLogN)
+      .value("oNLogN", BigO::oNLogN)
+      .value("oAuto", BigO::oAuto)
+      .value("oLambda", BigO::oLambda)
+      .export_values();
+
+  using benchmark::internal::Benchmark;
+  nb::class_<Benchmark>(m, "Benchmark")
+      // For methods returning a pointer to the current object, reference
+      // return policy is used to ask nanobind not to take ownership of the
+      // returned object and avoid calling delete on it.
+      // https://pybind11.readthedocs.io/en/stable/advanced/functions.html#return-value-policies
+      //
+      // For methods taking a const std::vector<...>&, a copy is created
+      // because it is bound to a Python list.
+      // https://pybind11.readthedocs.io/en/stable/advanced/cast/stl.html
+      .def("unit", &Benchmark::Unit, nb::rv_policy::reference)
+      .def("arg", &Benchmark::Arg, nb::rv_policy::reference)
+      .def("args", &Benchmark::Args, nb::rv_policy::reference)
+      .def("range", &Benchmark::Range, nb::rv_policy::reference,
+           nb::arg("start"), nb::arg("limit"))
+      .def("dense_range", &Benchmark::DenseRange,
+           nb::rv_policy::reference, nb::arg("start"),
+           nb::arg("limit"), nb::arg("step") = 1)
+      .def("ranges", &Benchmark::Ranges, nb::rv_policy::reference)
+      .def("args_product", &Benchmark::ArgsProduct,
+           nb::rv_policy::reference)
+      .def("arg_name", &Benchmark::ArgName, nb::rv_policy::reference)
+      .def("arg_names", &Benchmark::ArgNames,
+           nb::rv_policy::reference)
+      .def("range_pair", &Benchmark::RangePair,
+           nb::rv_policy::reference, nb::arg("lo1"), nb::arg("hi1"),
+           nb::arg("lo2"), nb::arg("hi2"))
+      .def("range_multiplier", &Benchmark::RangeMultiplier,
+           nb::rv_policy::reference)
+      .def("min_time", &Benchmark::MinTime, nb::rv_policy::reference)
+      .def("min_warmup_time", &Benchmark::MinWarmUpTime,
+           nb::rv_policy::reference)
+      .def("iterations", &Benchmark::Iterations,
+           nb::rv_policy::reference)
+      .def("repetitions", &Benchmark::Repetitions,
+           nb::rv_policy::reference)
+      .def("report_aggregates_only", &Benchmark::ReportAggregatesOnly,
+           nb::rv_policy::reference, nb::arg("value") = true)
+      .def("display_aggregates_only", &Benchmark::DisplayAggregatesOnly,
+           nb::rv_policy::reference, nb::arg("value") = true)
+      .def("measure_process_cpu_time", &Benchmark::MeasureProcessCPUTime,
+           nb::rv_policy::reference)
+      .def("use_real_time", &Benchmark::UseRealTime,
+           nb::rv_policy::reference)
+      .def("use_manual_time", &Benchmark::UseManualTime,
+           nb::rv_policy::reference)
+      .def(
+          "complexity",
+          (Benchmark * (Benchmark::*)(benchmark::BigO)) & Benchmark::Complexity,
+          nb::rv_policy::reference,
+          nb::arg("complexity") = benchmark::oAuto);
+
+  using benchmark::Counter;
+  nb::class_<Counter> py_counter(m, "Counter");
+
+  nb::enum_<Counter::Flags>(py_counter, "Flags")
+      .value("kDefaults", Counter::Flags::kDefaults)
+      .value("kIsRate", Counter::Flags::kIsRate)
+      .value("kAvgThreads", Counter::Flags::kAvgThreads)
+      .value("kAvgThreadsRate", Counter::Flags::kAvgThreadsRate)
+      .value("kIsIterationInvariant", Counter::Flags::kIsIterationInvariant)
+      .value("kIsIterationInvariantRate",
+             Counter::Flags::kIsIterationInvariantRate)
+      .value("kAvgIterations", Counter::Flags::kAvgIterations)
+      .value("kAvgIterationsRate", Counter::Flags::kAvgIterationsRate)
+      .value("kInvert", Counter::Flags::kInvert)
+      .export_values()
+      .def(nb::self | nb::self);
+
+  nb::enum_<Counter::OneK>(py_counter, "OneK")
+      .value("kIs1000", Counter::OneK::kIs1000)
+      .value("kIs1024", Counter::OneK::kIs1024)
+      .export_values();
+
+  py_counter
+      .def(nb::init<double, Counter::Flags, Counter::OneK>(),
+           nb::arg("value") = 0., nb::arg("flags") = Counter::kDefaults,
+           nb::arg("k") = Counter::kIs1000)
+      .def("__init__", ([](Counter *c, double value) { new (c) Counter(value); }))
+      .def_rw("value", &Counter::value)
+      .def_rw("flags", &Counter::flags)
+      .def_rw("oneK", &Counter::oneK)
+      .def(nb::init_implicit<double>());
+
+  nb::implicitly_convertible<nb::int_, Counter>();
+
+  nb::bind_map<benchmark::UserCounters>(m, "UserCounters");
+
+  using benchmark::State;
+  nb::class_<State>(m, "State")
+      .def("__bool__", &State::KeepRunning)
+      .def_prop_ro("keep_running", &State::KeepRunning)
+      .def("pause_timing", &State::PauseTiming)
+      .def("resume_timing", &State::ResumeTiming)
+      .def("skip_with_error", &State::SkipWithError)
+      .def_prop_ro("error_occurred", &State::error_occurred)
+      .def("set_iteration_time", &State::SetIterationTime)
+      .def_prop_rw("bytes_processed", &State::bytes_processed,
+                    &State::SetBytesProcessed)
+      .def_prop_rw("complexity_n", &State::complexity_length_n,
+                    &State::SetComplexityN)
+      .def_prop_rw("items_processed", &State::items_processed,
+                   &State::SetItemsProcessed)
+      .def("set_label", &State::SetLabel)
+      .def("range", &State::range, nb::arg("pos") = 0)
+      .def_prop_ro("iterations", &State::iterations)
+      .def_prop_ro("name", &State::name)
+      .def_rw("counters", &State::counters)
+      .def_prop_ro("thread_index", &State::thread_index)
+      .def_prop_ro("threads", &State::threads);
+
+  m.def("Initialize", Initialize);
+  m.def("RegisterBenchmark", RegisterBenchmark,
+        nb::rv_policy::reference);
+  m.def("RunSpecifiedBenchmarks",
+        []() { benchmark::RunSpecifiedBenchmarks(); });
+  m.def("ClearRegisteredBenchmarks", benchmark::ClearRegisteredBenchmarks);
+};
+}  // namespace
diff --git a/third_party/google_benchmark/src/bindings/python/google_benchmark/example.py b/third_party/google_benchmark/src/bindings/python/google_benchmark/example.py
new file mode 100644
index 0000000..d95a043
--- /dev/null
+++ b/third_party/google_benchmark/src/bindings/python/google_benchmark/example.py
@@ -0,0 +1,136 @@
+# Copyright 2020 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Example of Python using C++ benchmark framework.
+
+To run this example, you must first install the `google_benchmark` Python package.
+
+To install using `setup.py`, download and extract the `google_benchmark` source.
+In the extracted directory, execute:
+  python setup.py install
+"""
+
+import random
+import time
+
+import google_benchmark as benchmark
+from google_benchmark import Counter
+
+
+@benchmark.register
+def empty(state):
+    while state:
+        pass
+
+
+@benchmark.register
+def sum_million(state):
+    while state:
+        sum(range(1_000_000))
+
+@benchmark.register
+def pause_timing(state):
+    """Pause timing every iteration."""
+    while state:
+        # Construct a list of random ints every iteration without timing it
+        state.pause_timing()
+        random_list = [random.randint(0, 100) for _ in range(100)]
+        state.resume_timing()
+        # Time the in place sorting algorithm
+        random_list.sort()
+
+
+@benchmark.register
+def skipped(state):
+    if True:  # Test some predicate here.
+        state.skip_with_error("some error")
+        return  # NOTE: You must explicitly return, or benchmark will continue.
+
+    ...  # Benchmark code would be here.
+
+
+@benchmark.register
+def manual_timing(state):
+    while state:
+        # Manually measure elapsed wall-clock time (perf_counter is not CPU time)
+        start = time.perf_counter()  # perf_counter_ns() in Python 3.7+
+        # Something to benchmark
+        time.sleep(0.01)
+        end = time.perf_counter()
+        state.set_iteration_time(end - start)
+
+
+@benchmark.register
+def custom_counters(state):
+    """Collect custom metric using benchmark.Counter."""
+    num_foo = 0.0
+    while state:
+        # Benchmark some code here
+        pass
+        # Collect some custom metric named foo
+        num_foo += 0.13
+
+    # Automatic Counter from numbers.
+    state.counters["foo"] = num_foo
+    # Set a counter as a rate.
+    state.counters["foo_rate"] = Counter(num_foo, Counter.kIsRate)
+    # Set a counter as the inverse of a rate.
+    state.counters["foo_inv_rate"] = Counter(num_foo, Counter.kIsRate | Counter.kInvert)
+    # Set a counter as a thread-average quantity.
+    state.counters["foo_avg"] = Counter(num_foo, Counter.kAvgThreads)
+    # There's also a combined flag:
+    state.counters["foo_avg_rate"] = Counter(num_foo, Counter.kAvgThreadsRate)
+
+
+@benchmark.register
+@benchmark.option.measure_process_cpu_time()
+@benchmark.option.use_real_time()
+def with_options(state):
+    while state:
+        sum(range(1_000_000))
+
+
+@benchmark.register(name="sum_million_microseconds")
+@benchmark.option.unit(benchmark.kMicrosecond)
+def with_options2(state):
+    while state:
+        sum(range(1_000_000))
+
+
+@benchmark.register
+@benchmark.option.arg(100)
+@benchmark.option.arg(1000)
+def passing_argument(state):
+    while state:
+        sum(range(state.range(0)))
+
+
+@benchmark.register
+@benchmark.option.range(8, limit=8 << 10)
+def using_range(state):
+    while state:
+        sum(range(state.range(0)))
+
+
+@benchmark.register
+@benchmark.option.range_multiplier(2)
+@benchmark.option.range(1 << 10, 1 << 18)
+@benchmark.option.complexity(benchmark.oN)
+def computing_complexity(state):
+    while state:
+        sum(range(state.range(0)))
+    state.complexity_n = state.range(0)
+
+
+if __name__ == "__main__":
+    benchmark.main()
diff --git a/third_party/google_benchmark/src/bindings/python/nanobind.BUILD b/third_party/google_benchmark/src/bindings/python/nanobind.BUILD
new file mode 100644
index 0000000..35536bb
--- /dev/null
+++ b/third_party/google_benchmark/src/bindings/python/nanobind.BUILD
@@ -0,0 +1,56 @@
+
+config_setting(
+    name = "msvc_compiler",
+    flag_values = {"@bazel_tools//tools/cpp:compiler": "msvc-cl"},
+)
+
+cc_library(
+    name = "nanobind",
+    hdrs = glob(
+        include = [
+            "include/nanobind/*.h",
+            "include/nanobind/stl/*.h",
+            "include/nanobind/detail/*.h",
+        ],
+        exclude = [],
+    ),
+    srcs = [
+        "include/nanobind/stl/detail/nb_dict.h",
+        "include/nanobind/stl/detail/nb_list.h",
+        "include/nanobind/stl/detail/traits.h",
+        "ext/robin_map/include/tsl/robin_map.h",
+        "ext/robin_map/include/tsl/robin_hash.h",
+        "ext/robin_map/include/tsl/robin_growth_policy.h",
+        "ext/robin_map/include/tsl/robin_set.h",
+        "src/buffer.h",
+        "src/common.cpp",
+        "src/error.cpp",
+        "src/implicit.cpp",
+        "src/nb_enum.cpp",
+        "src/nb_func.cpp",
+        "src/nb_internals.cpp",
+        "src/nb_internals.h",
+        "src/nb_ndarray.cpp",
+        "src/nb_type.cpp",
+        "src/trampoline.cpp",
+    ],
+    copts = select({
+        ":msvc_compiler": [],
+        "//conditions:default": [
+        "-fexceptions",
+        "-Os",  # size optimization
+        "-flto", # enable LTO
+        ],
+    }),
+    linkopts = select({
+        "@com_github_google_benchmark//:macos": [
+        "-undefined dynamic_lookup",
+        "-Wl,-no_fixup_chains",
+        "-Wl,-dead_strip",
+        ],
+        "//conditions:default": [],
+    }),
+    includes = ["include", "ext/robin_map/include"],
+    deps = ["@python_headers"],
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/google_benchmark/src/bindings/python/python_headers.BUILD b/third_party/google_benchmark/src/bindings/python/python_headers.BUILD
new file mode 100644
index 0000000..9c34cf6
--- /dev/null
+++ b/third_party/google_benchmark/src/bindings/python/python_headers.BUILD
@@ -0,0 +1,6 @@
+cc_library(
+    name = "python_headers",
+    hdrs = glob(["**/*.h"]),
+    includes = ["."],
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/google_benchmark/src/bindings/python/requirements.txt b/third_party/google_benchmark/src/bindings/python/requirements.txt
new file mode 100644
index 0000000..f5bbe7e
--- /dev/null
+++ b/third_party/google_benchmark/src/bindings/python/requirements.txt
@@ -0,0 +1,2 @@
+absl-py>=0.7.1
+
diff --git a/third_party/google_benchmark/src/check.h b/third_party/google_benchmark/src/check.h
deleted file mode 100644
index f5f8253..0000000
--- a/third_party/google_benchmark/src/check.h
+++ /dev/null
@@ -1,82 +0,0 @@
-#ifndef CHECK_H_
-#define CHECK_H_
-
-#include <cmath>
-#include <cstdlib>
-#include <ostream>
-
-#include "internal_macros.h"
-#include "log.h"
-
-namespace benchmark {
-namespace internal {
-
-typedef void(AbortHandlerT)();
-
-inline AbortHandlerT*& GetAbortHandler() {
-  static AbortHandlerT* handler = &std::abort;
-  return handler;
-}
-
-BENCHMARK_NORETURN inline void CallAbortHandler() {
-  GetAbortHandler()();
-  std::abort();  // fallback to enforce noreturn
-}
-
-// CheckHandler is the class constructed by failing CHECK macros. CheckHandler
-// will log information about the failures and abort when it is destructed.
-class CheckHandler {
- public:
-  CheckHandler(const char* check, const char* file, const char* func, int line)
-      : log_(GetErrorLogInstance()) {
-    log_ << file << ":" << line << ": " << func << ": Check `" << check
-         << "' failed. ";
-  }
-
-  LogType& GetLog() { return log_; }
-
-  BENCHMARK_NORETURN ~CheckHandler() BENCHMARK_NOEXCEPT_OP(false) {
-    log_ << std::endl;
-    CallAbortHandler();
-  }
-
-  CheckHandler& operator=(const CheckHandler&) = delete;
-  CheckHandler(const CheckHandler&) = delete;
-  CheckHandler() = delete;
-
- private:
-  LogType& log_;
-};
-
-}  // end namespace internal
-}  // end namespace benchmark
-
-// The CHECK macro returns a std::ostream object that can have extra information
-// written to it.
-#ifndef NDEBUG
-#define CHECK(b)                                                             \
-  (b ? ::benchmark::internal::GetNullLogInstance()                           \
-     : ::benchmark::internal::CheckHandler(#b, __FILE__, __func__, __LINE__) \
-           .GetLog())
-#else
-#define CHECK(b) ::benchmark::internal::GetNullLogInstance()
-#endif
-
-// clang-format off
-// preserve whitespacing between operators for alignment
-#define CHECK_EQ(a, b) CHECK((a) == (b))
-#define CHECK_NE(a, b) CHECK((a) != (b))
-#define CHECK_GE(a, b) CHECK((a) >= (b))
-#define CHECK_LE(a, b) CHECK((a) <= (b))
-#define CHECK_GT(a, b) CHECK((a) > (b))
-#define CHECK_LT(a, b) CHECK((a) < (b))
-
-#define CHECK_FLOAT_EQ(a, b, eps) CHECK(std::fabs((a) - (b)) <  (eps))
-#define CHECK_FLOAT_NE(a, b, eps) CHECK(std::fabs((a) - (b)) >= (eps))
-#define CHECK_FLOAT_GE(a, b, eps) CHECK((a) - (b) > -(eps))
-#define CHECK_FLOAT_LE(a, b, eps) CHECK((b) - (a) > -(eps))
-#define CHECK_FLOAT_GT(a, b, eps) CHECK((a) - (b) >  (eps))
-#define CHECK_FLOAT_LT(a, b, eps) CHECK((b) - (a) >  (eps))
-//clang-format on
-
-#endif  // CHECK_H_
diff --git a/third_party/google_benchmark/cmake/AddCXXCompilerFlag.cmake b/third_party/google_benchmark/src/cmake/AddCXXCompilerFlag.cmake
similarity index 94%
rename from third_party/google_benchmark/cmake/AddCXXCompilerFlag.cmake
rename to third_party/google_benchmark/src/cmake/AddCXXCompilerFlag.cmake
index d0d2099..858589e 100644
--- a/third_party/google_benchmark/cmake/AddCXXCompilerFlag.cmake
+++ b/third_party/google_benchmark/src/cmake/AddCXXCompilerFlag.cmake
@@ -34,9 +34,11 @@
   check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
   set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
   if(${MANGLED_FLAG})
-    set(VARIANT ${ARGV1})
-    if(ARGV1)
+    if(ARGC GREATER 1)
+      set(VARIANT ${ARGV1})
       string(TOUPPER "_${VARIANT}" VARIANT)
+    else()
+      set(VARIANT "")
     endif()
     set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${BENCHMARK_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
   endif()
@@ -49,9 +51,11 @@
   check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
   set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
   if(${MANGLED_FLAG})
-    set(VARIANT ${ARGV1})
-    if(ARGV1)
+    if(ARGC GREATER 1)
+      set(VARIANT ${ARGV1})
       string(TOUPPER "_${VARIANT}" VARIANT)
+    else()
+      set(VARIANT "")
     endif()
     set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
     set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE)
diff --git a/third_party/google_benchmark/cmake/CXXFeatureCheck.cmake b/third_party/google_benchmark/src/cmake/CXXFeatureCheck.cmake
similarity index 60%
rename from third_party/google_benchmark/cmake/CXXFeatureCheck.cmake
rename to third_party/google_benchmark/src/cmake/CXXFeatureCheck.cmake
index 059d510..e514826 100644
--- a/third_party/google_benchmark/cmake/CXXFeatureCheck.cmake
+++ b/third_party/google_benchmark/src/cmake/CXXFeatureCheck.cmake
@@ -17,6 +17,8 @@
 endif()
 set(__cxx_feature_check INCLUDED)
 
+option(CXXFEATURECHECK_DEBUG "Print the compiler output when a C++ feature check fails to compile" OFF)
+
 function(cxx_feature_check FILE)
   string(TOLOWER ${FILE} FILE)
   string(TOUPPER ${FILE} VAR)
@@ -27,13 +29,22 @@
     return()
   endif()
 
+  set(FEATURE_CHECK_CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS})
+  if (ARGC GREATER 1)
+    message(STATUS "Enabling additional flags: ${ARGV1}")
+    list(APPEND FEATURE_CHECK_CMAKE_FLAGS ${ARGV1})
+  endif()
+
   if (NOT DEFINED COMPILE_${FEATURE})
-    message(STATUS "Performing Test ${FEATURE}")
     if(CMAKE_CROSSCOMPILING)
+      message(STATUS "Cross-compiling to test ${FEATURE}")
       try_compile(COMPILE_${FEATURE}
               ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
-              CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
-              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
+              CXX_STANDARD 11
+              CXX_STANDARD_REQUIRED ON
+              CMAKE_FLAGS ${FEATURE_CHECK_CMAKE_FLAGS}
+              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}
+              OUTPUT_VARIABLE COMPILE_OUTPUT_VAR)
       if(COMPILE_${FEATURE})
         message(WARNING
               "If you see build failures due to cross compilation, try setting HAVE_${VAR} to 0")
@@ -42,11 +53,14 @@
         set(RUN_${FEATURE} 1 CACHE INTERNAL "")
       endif()
     else()
-      message(STATUS "Performing Test ${FEATURE}")
+      message(STATUS "Compiling and running to test ${FEATURE}")
       try_run(RUN_${FEATURE} COMPILE_${FEATURE}
               ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
-              CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
-              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
+              CXX_STANDARD 11
+              CXX_STANDARD_REQUIRED ON
+              CMAKE_FLAGS ${FEATURE_CHECK_CMAKE_FLAGS}
+              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}
+              COMPILE_OUTPUT_VARIABLE COMPILE_OUTPUT_VAR)
     endif()
   endif()
 
@@ -56,7 +70,11 @@
     add_definitions(-DHAVE_${VAR})
   else()
     if(NOT COMPILE_${FEATURE})
-      message(STATUS "Performing Test ${FEATURE} -- failed to compile")
+      if(CXXFEATURECHECK_DEBUG)
+        message(STATUS "Performing Test ${FEATURE} -- failed to compile: ${COMPILE_OUTPUT_VAR}")
+      else()
+        message(STATUS "Performing Test ${FEATURE} -- failed to compile")
+      endif()
     else()
       message(STATUS "Performing Test ${FEATURE} -- compiled but failed to run")
     endif()
diff --git a/third_party/google_benchmark/src/cmake/Config.cmake.in b/third_party/google_benchmark/src/cmake/Config.cmake.in
new file mode 100644
index 0000000..2e15f0c
--- /dev/null
+++ b/third_party/google_benchmark/src/cmake/Config.cmake.in
@@ -0,0 +1,7 @@
+@PACKAGE_INIT@
+
+include (CMakeFindDependencyMacro)
+
+find_dependency (Threads)
+
+include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake")
diff --git a/third_party/google_benchmark/cmake/GetGitVersion.cmake b/third_party/google_benchmark/src/cmake/GetGitVersion.cmake
similarity index 61%
rename from third_party/google_benchmark/cmake/GetGitVersion.cmake
rename to third_party/google_benchmark/src/cmake/GetGitVersion.cmake
index 4f10f22..04a1f9b 100644
--- a/third_party/google_benchmark/cmake/GetGitVersion.cmake
+++ b/third_party/google_benchmark/src/cmake/GetGitVersion.cmake
@@ -20,16 +20,20 @@
 
 function(get_git_version var)
   if(GIT_EXECUTABLE)
-      execute_process(COMMAND ${GIT_EXECUTABLE} describe --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
+      execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
           WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
           RESULT_VARIABLE status
-          OUTPUT_VARIABLE GIT_VERSION
+          OUTPUT_VARIABLE GIT_DESCRIBE_VERSION
           ERROR_QUIET)
-      if(${status})
-          set(GIT_VERSION "v0.0.0")
+      if(status)
+          set(GIT_DESCRIBE_VERSION "v0.0.0")
+      endif()
+      
+      string(STRIP "${GIT_DESCRIBE_VERSION}" GIT_DESCRIBE_VERSION)
+      if(GIT_DESCRIBE_VERSION MATCHES "v[^-]*-")
+         string(REGEX REPLACE "v([^-]*)-([0-9]+)-.*" "\\1.\\2" GIT_VERSION "${GIT_DESCRIBE_VERSION}")
       else()
-          string(STRIP ${GIT_VERSION} GIT_VERSION)
-          string(REGEX REPLACE "-[0-9]+-g" "-" GIT_VERSION ${GIT_VERSION})
+         string(REGEX REPLACE "v(.*)" "\\1" GIT_VERSION "${GIT_DESCRIBE_VERSION}")
       endif()
 
       # Work out if the repository is dirty
@@ -43,12 +47,12 @@
           ERROR_QUIET)
       string(COMPARE NOTEQUAL "${GIT_DIFF_INDEX}" "" GIT_DIRTY)
       if (${GIT_DIRTY})
-          set(GIT_VERSION "${GIT_VERSION}-dirty")
+          set(GIT_DESCRIBE_VERSION "${GIT_DESCRIBE_VERSION}-dirty")
       endif()
+      message(STATUS "git version: ${GIT_DESCRIBE_VERSION} normalized to ${GIT_VERSION}")
   else()
-      set(GIT_VERSION "v0.0.0")
+      set(GIT_VERSION "0.0.0")
   endif()
 
-  message(STATUS "git Version: ${GIT_VERSION}")
   set(${var} ${GIT_VERSION} PARENT_SCOPE)
 endfunction()
diff --git a/third_party/google_benchmark/cmake/GoogleTest.cmake b/third_party/google_benchmark/src/cmake/GoogleTest.cmake
similarity index 77%
rename from third_party/google_benchmark/cmake/GoogleTest.cmake
rename to third_party/google_benchmark/src/cmake/GoogleTest.cmake
index dd611fc..44adbfb 100644
--- a/third_party/google_benchmark/cmake/GoogleTest.cmake
+++ b/third_party/google_benchmark/src/cmake/GoogleTest.cmake
@@ -29,13 +29,24 @@
 
 include(${GOOGLETEST_PREFIX}/googletest-paths.cmake)
 
+# googletest doesn't seem to want to stay build warning clean so let's not hurt ourselves.
+if (MSVC)
+  add_compile_options(/wd4244 /wd4722)
+else()
+  add_compile_options(-w)
+endif()
+
 # Add googletest directly to our build. This defines
 # the gtest and gtest_main targets.
 add_subdirectory(${GOOGLETEST_SOURCE_DIR}
                  ${GOOGLETEST_BINARY_DIR}
                  EXCLUDE_FROM_ALL)
 
-set_target_properties(gtest PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest,INTERFACE_INCLUDE_DIRECTORIES>)
-set_target_properties(gtest_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest_main,INTERFACE_INCLUDE_DIRECTORIES>)
-set_target_properties(gmock PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock,INTERFACE_INCLUDE_DIRECTORIES>)
-set_target_properties(gmock_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock_main,INTERFACE_INCLUDE_DIRECTORIES>)
+if(NOT DEFINED GTEST_COMPILE_COMMANDS)
+    set(GTEST_COMPILE_COMMANDS ON)
+endif()
+
+set_target_properties(gtest PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
+set_target_properties(gtest_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest_main,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
+set_target_properties(gmock PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
+set_target_properties(gmock_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock_main,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
diff --git a/third_party/google_benchmark/cmake/GoogleTest.cmake.in b/third_party/google_benchmark/src/cmake/GoogleTest.cmake.in
similarity index 90%
rename from third_party/google_benchmark/cmake/GoogleTest.cmake.in
rename to third_party/google_benchmark/src/cmake/GoogleTest.cmake.in
index 28818ee..ce653ac 100644
--- a/third_party/google_benchmark/cmake/GoogleTest.cmake.in
+++ b/third_party/google_benchmark/src/cmake/GoogleTest.cmake.in
@@ -31,13 +31,14 @@
   )
 else()
   if(NOT ALLOW_DOWNLOADING_GOOGLETEST)
-    message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable ALLOW_DOWNLOADING_GOOGLETEST, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.")
+    message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable BENCHMARK_DOWNLOAD_DEPENDENCIES, or disable BENCHMARK_USE_BUNDLED_GTEST, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.")
+    return()
   else()
     message(WARNING "Did not find Google Test sources! Fetching from web...")
     ExternalProject_Add(
       googletest
       GIT_REPOSITORY    https://github.com/google/googletest.git
-      GIT_TAG           master
+      GIT_TAG           "release-1.11.0"
       PREFIX            "${CMAKE_BINARY_DIR}"
       STAMP_DIR         "${CMAKE_BINARY_DIR}/stamp"
       DOWNLOAD_DIR      "${CMAKE_BINARY_DIR}/download"
diff --git a/third_party/google_benchmark/cmake/benchmark.pc.in b/third_party/google_benchmark/src/cmake/benchmark.pc.in
similarity index 73%
rename from third_party/google_benchmark/cmake/benchmark.pc.in
rename to third_party/google_benchmark/src/cmake/benchmark.pc.in
index 43ca8f9..9dae881 100644
--- a/third_party/google_benchmark/cmake/benchmark.pc.in
+++ b/third_party/google_benchmark/src/cmake/benchmark.pc.in
@@ -1,7 +1,7 @@
 prefix=@CMAKE_INSTALL_PREFIX@
 exec_prefix=${prefix}
-libdir=${prefix}/lib
-includedir=${prefix}/include
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
 
 Name: @PROJECT_NAME@
 Description: Google microbenchmark framework
diff --git a/third_party/google_benchmark/cmake/gnu_posix_regex.cpp b/third_party/google_benchmark/src/cmake/gnu_posix_regex.cpp
similarity index 100%
rename from third_party/google_benchmark/cmake/gnu_posix_regex.cpp
rename to third_party/google_benchmark/src/cmake/gnu_posix_regex.cpp
diff --git a/third_party/google_benchmark/cmake/llvm-toolchain.cmake b/third_party/google_benchmark/src/cmake/llvm-toolchain.cmake
similarity index 100%
rename from third_party/google_benchmark/cmake/llvm-toolchain.cmake
rename to third_party/google_benchmark/src/cmake/llvm-toolchain.cmake
diff --git a/third_party/google_benchmark/cmake/posix_regex.cpp b/third_party/google_benchmark/src/cmake/posix_regex.cpp
similarity index 100%
rename from third_party/google_benchmark/cmake/posix_regex.cpp
rename to third_party/google_benchmark/src/cmake/posix_regex.cpp
diff --git a/third_party/google_benchmark/src/cmake/pthread_affinity.cpp b/third_party/google_benchmark/src/cmake/pthread_affinity.cpp
new file mode 100644
index 0000000..7b143bc
--- /dev/null
+++ b/third_party/google_benchmark/src/cmake/pthread_affinity.cpp
@@ -0,0 +1,16 @@
+#include <pthread.h>
+int main() {
+  cpu_set_t set;
+  CPU_ZERO(&set);
+  for (int i = 0; i < CPU_SETSIZE; ++i) {
+    CPU_SET(i, &set);
+    CPU_CLR(i, &set);
+  }
+  pthread_t self = pthread_self();
+  int ret;
+  ret = pthread_getaffinity_np(self, sizeof(set), &set);
+  if (ret != 0) return ret;
+  ret = pthread_setaffinity_np(self, sizeof(set), &set);
+  if (ret != 0) return ret;
+  return 0;
+}
diff --git a/third_party/google_benchmark/cmake/split_list.cmake b/third_party/google_benchmark/src/cmake/split_list.cmake
similarity index 100%
rename from third_party/google_benchmark/cmake/split_list.cmake
rename to third_party/google_benchmark/src/cmake/split_list.cmake
diff --git a/third_party/google_benchmark/cmake/std_regex.cpp b/third_party/google_benchmark/src/cmake/std_regex.cpp
similarity index 100%
rename from third_party/google_benchmark/cmake/std_regex.cpp
rename to third_party/google_benchmark/src/cmake/std_regex.cpp
diff --git a/third_party/google_benchmark/cmake/steady_clock.cpp b/third_party/google_benchmark/src/cmake/steady_clock.cpp
similarity index 100%
rename from third_party/google_benchmark/cmake/steady_clock.cpp
rename to third_party/google_benchmark/src/cmake/steady_clock.cpp
diff --git a/third_party/google_benchmark/cmake/thread_safety_attributes.cpp b/third_party/google_benchmark/src/cmake/thread_safety_attributes.cpp
similarity index 100%
rename from third_party/google_benchmark/cmake/thread_safety_attributes.cpp
rename to third_party/google_benchmark/src/cmake/thread_safety_attributes.cpp
diff --git a/third_party/google_benchmark/src/colorprint_starboard.cc b/third_party/google_benchmark/src/colorprint_starboard.cc
deleted file mode 100644
index e2d83e4..0000000
--- a/third_party/google_benchmark/src/colorprint_starboard.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright 2019 The Cobalt Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "colorprint.h"
-
-#include <stdio.h>
-#include <vector>
-
-#include "starboard/string.h"
-
-namespace benchmark {
-
-std::string FormatString(const char* msg, va_list args) {
-  va_list args_copy;
-  va_copy(args_copy, args);
-
-  int expected_size = ::vsnprintf(NULL, 0, msg, args_copy);
-
-  va_end(args_copy);
-
-  if (expected_size <= 0) {
-    return std::string();
-  }
-
-  std::vector<char> buffer(expected_size + 1);
-  ::vsnprintf(buffer.data(), buffer.size(), msg, args);
-  return std::string(buffer.data(), expected_size);
-}
-
-std::string FormatString(const char* msg, ...) {
-  va_list args;
-  va_start(args, msg);
-  auto tmp = FormatString(msg, args);
-  va_end(args);
-  return tmp;
-}
-
-void ColorPrintf(std::ostream& out, LogColor color, const char* fmt, ...) {
-  va_list args;
-  va_start(args, fmt);
-  ColorPrintf(out, color, fmt, args);
-  va_end(args);
-}
-
-void ColorPrintf(std::ostream& out, LogColor color, const char* fmt,
-                 va_list args) {
-  out << FormatString(fmt, args);
-}
-
-bool IsColorTerminal() { return false; }
-
-}  // end namespace benchmark
diff --git a/third_party/google_benchmark/src/commandlineflags.h b/third_party/google_benchmark/src/commandlineflags.h
deleted file mode 100644
index 3a1f6a8..0000000
--- a/third_party/google_benchmark/src/commandlineflags.h
+++ /dev/null
@@ -1,103 +0,0 @@
-#ifndef BENCHMARK_COMMANDLINEFLAGS_H_
-#define BENCHMARK_COMMANDLINEFLAGS_H_
-
-#include <cstdint>
-#include <string>
-
-// Macro for referencing flags.
-#define FLAG(name) FLAGS_##name
-
-// Macros for declaring flags.
-#define DECLARE_bool(name) extern bool FLAG(name)
-#define DECLARE_int32(name) extern int32_t FLAG(name)
-#define DECLARE_double(name) extern double FLAG(name)
-#define DECLARE_string(name) extern std::string FLAG(name)
-
-// Macros for defining flags.
-#define DEFINE_bool(name, default_val)            \
-  bool FLAG(name) =                               \
-    benchmark::BoolFromEnv(#name, default_val)
-#define DEFINE_int32(name, default_val)           \
-  int32_t FLAG(name) =                            \
-    benchmark::Int32FromEnv(#name, default_val)
-#define DEFINE_double(name, default_val)          \
-  double FLAG(name) =                             \
-    benchmark::DoubleFromEnv(#name, default_val)
-#define DEFINE_string(name, default_val)          \
-  std::string FLAG(name) =                        \
-    benchmark::StringFromEnv(#name, default_val)
-
-namespace benchmark {
-
-// Parses a bool from the environment variable
-// corresponding to the given flag.
-//
-// If the variable exists, returns IsTruthyFlagValue() value;  if not,
-// returns the given default value.
-bool BoolFromEnv(const char* flag, bool default_val);
-
-// Parses an Int32 from the environment variable
-// corresponding to the given flag.
-//
-// If the variable exists, returns ParseInt32() value;  if not, returns
-// the given default value.
-int32_t Int32FromEnv(const char* flag, int32_t default_val);
-
-// Parses an Double from the environment variable
-// corresponding to the given flag.
-//
-// If the variable exists, returns ParseDouble();  if not, returns
-// the given default value.
-double DoubleFromEnv(const char* flag, double default_val);
-
-// Parses a string from the environment variable
-// corresponding to the given flag.
-//
-// If variable exists, returns its value;  if not, returns
-// the given default value.
-const char* StringFromEnv(const char* flag, const char* default_val);
-
-// Parses a string for a bool flag, in the form of either
-// "--flag=value" or "--flag".
-//
-// In the former case, the value is taken as true if it passes IsTruthyValue().
-//
-// In the latter case, the value is taken as true.
-//
-// On success, stores the value of the flag in *value, and returns
-// true.  On failure, returns false without changing *value.
-bool ParseBoolFlag(const char* str, const char* flag, bool* value);
-
-// Parses a string for an Int32 flag, in the form of
-// "--flag=value".
-//
-// On success, stores the value of the flag in *value, and returns
-// true.  On failure, returns false without changing *value.
-bool ParseInt32Flag(const char* str, const char* flag, int32_t* value);
-
-// Parses a string for a Double flag, in the form of
-// "--flag=value".
-//
-// On success, stores the value of the flag in *value, and returns
-// true.  On failure, returns false without changing *value.
-bool ParseDoubleFlag(const char* str, const char* flag, double* value);
-
-// Parses a string for a string flag, in the form of
-// "--flag=value".
-//
-// On success, stores the value of the flag in *value, and returns
-// true.  On failure, returns false without changing *value.
-bool ParseStringFlag(const char* str, const char* flag, std::string* value);
-
-// Returns true if the string matches the flag.
-bool IsFlag(const char* str, const char* flag);
-
-// Returns true unless value starts with one of: '0', 'f', 'F', 'n' or 'N', or
-// some non-alphanumeric character. Also returns false if the value matches
-// one of 'no', 'false', 'off' (case-insensitive). As a special case, also
-// returns true if value is the empty string.
-bool IsTruthyFlagValue(const std::string& value);
-
-}  // end namespace benchmark
-
-#endif  // BENCHMARK_COMMANDLINEFLAGS_H_
diff --git a/third_party/google_benchmark/docs/AssemblyTests.md b/third_party/google_benchmark/src/docs/AssemblyTests.md
similarity index 98%
rename from third_party/google_benchmark/docs/AssemblyTests.md
rename to third_party/google_benchmark/src/docs/AssemblyTests.md
index 1fbdc26..89df7ca 100644
--- a/third_party/google_benchmark/docs/AssemblyTests.md
+++ b/third_party/google_benchmark/src/docs/AssemblyTests.md
@@ -111,6 +111,7 @@
 is matching stack frame addresses. In this case regular expressions
 can be used to match the differing bits of output. For example:
 
+<!-- {% raw %} -->
 ```c++
 int ExternInt;
 struct Point { int x, y, z; };
@@ -127,6 +128,7 @@
     // CHECK: ret
 }
 ```
+<!-- {% endraw %} -->
 
 ## Current Requirements and Limitations
 
diff --git a/third_party/google_benchmark/src/docs/_config.yml b/third_party/google_benchmark/src/docs/_config.yml
new file mode 100644
index 0000000..fff4ab9
--- /dev/null
+++ b/third_party/google_benchmark/src/docs/_config.yml
@@ -0,0 +1 @@
+theme: jekyll-theme-minimal
diff --git a/third_party/google_benchmark/src/docs/dependencies.md b/third_party/google_benchmark/src/docs/dependencies.md
new file mode 100644
index 0000000..07760e1
--- /dev/null
+++ b/third_party/google_benchmark/src/docs/dependencies.md
@@ -0,0 +1,13 @@
+# Build tool dependency policy
+
+We follow the [Foundational C++ support policy](https://opensource.google/documentation/policies/cplusplus-support) for our build tools, in
+particular the ["Build Systems" section](https://opensource.google/documentation/policies/cplusplus-support#build-systems).
+
+## CMake
+
+The minimum supported version is CMake 3.10 as of 2023-08-10. Most modern
+distributions include newer versions, for example:
+
+* Ubuntu 20.04 provides CMake 3.16.3
+* Debian 11.4 provides CMake 3.18.4
+* Ubuntu 22.04 provides CMake 3.22.1
diff --git a/third_party/google_benchmark/src/docs/index.md b/third_party/google_benchmark/src/docs/index.md
new file mode 100644
index 0000000..9cada96
--- /dev/null
+++ b/third_party/google_benchmark/src/docs/index.md
@@ -0,0 +1,12 @@
+# Benchmark
+
+* [Assembly Tests](AssemblyTests.md)
+* [Dependencies](dependencies.md)
+* [Perf Counters](perf_counters.md)
+* [Platform Specific Build Instructions](platform_specific_build_instructions.md)
+* [Python Bindings](python_bindings.md)
+* [Random Interleaving](random_interleaving.md)
+* [Reducing Variance](reducing_variance.md)
+* [Releasing](releasing.md)
+* [Tools](tools.md)
+* [User Guide](user_guide.md)
diff --git a/third_party/google_benchmark/src/docs/perf_counters.md b/third_party/google_benchmark/src/docs/perf_counters.md
new file mode 100644
index 0000000..f342092
--- /dev/null
+++ b/third_party/google_benchmark/src/docs/perf_counters.md
@@ -0,0 +1,35 @@
+<a name="perf-counters" />
+
+# User-Requested Performance Counters
+
+When running benchmarks, the user may choose to request collection of
+performance counters. This may be useful in investigation scenarios - narrowing
+down the cause of a regression; or verifying that the underlying cause of a
+performance improvement matches expectations.
+
+This feature is available if:
+
+* The benchmark is run on an architecture featuring a Performance Monitoring
+  Unit (PMU),
+* The benchmark is compiled with support for collecting counters. Currently,
+  this requires [libpfm](http://perfmon2.sourceforge.net/), which is built as a
+  dependency via Bazel.
+
+The feature does not require modifying benchmark code. Counter collection is
+handled at the boundaries where timer collection is also handled. 
+
+To opt-in:
+* If using a Bazel build, add `--define pfm=1` to your build flags
+* If using CMake:
+  * Install `libpfm4-dev`, e.g. `apt-get install libpfm4-dev`.
+  * Enable the CMake flag `BENCHMARK_ENABLE_LIBPFM` in `CMakeLists.txt`.
+
+To use, pass a comma-separated list of counter names through the
+`--benchmark_perf_counters` flag. The names are decoded through libpfm - meaning,
+they are platform specific, but some (e.g. `CYCLES` or `INSTRUCTIONS`) are
+mapped by libpfm to platform-specifics - see libpfm
+[documentation](http://perfmon2.sourceforge.net/docs.html) for more details.
+
+The counter values are reported back through the [User Counters](../README.md#custom-counters)
+mechanism, meaning, they are available in all the formats (e.g. JSON) supported
+by User Counters.
diff --git a/third_party/google_benchmark/src/docs/platform_specific_build_instructions.md b/third_party/google_benchmark/src/docs/platform_specific_build_instructions.md
new file mode 100644
index 0000000..2d5d6c4
--- /dev/null
+++ b/third_party/google_benchmark/src/docs/platform_specific_build_instructions.md
@@ -0,0 +1,48 @@
+# Platform Specific Build Instructions
+
+## Building with GCC
+
+When the library is built using GCC it is necessary to link with the pthread
+library due to how GCC implements `std::thread`. Failing to link to pthread will
+lead to runtime exceptions (unless you're using libc++), not linker errors. See
+[issue #67](https://github.com/google/benchmark/issues/67) for more details. You
+can link to pthread by adding `-pthread` to your linker command. Note, you can
+also use `-lpthread`, but there are potential issues with ordering of command
+line parameters if you use that.
+
+On QNX, the pthread library is part of libc and usually included automatically
+(see
+[`pthread_create()`](https://www.qnx.com/developers/docs/7.1/index.html#com.qnx.doc.neutrino.lib_ref/topic/p/pthread_create.html)).
+There's no separate pthread library to link.
+
+## Building with Visual Studio 2015 or 2017
+
+The `shlwapi` library (`-lshlwapi`) is required to support a call to `CPUInfo` which reads the registry. Either add `shlwapi.lib` under `[ Configuration Properties > Linker > Input ]`, or use the following:
+
+```
+// Alternatively, can add libraries using linker options.
+#ifdef _WIN32
+#pragma comment ( lib, "Shlwapi.lib" )
+#ifdef _DEBUG
+#pragma comment ( lib, "benchmarkd.lib" )
+#else
+#pragma comment ( lib, "benchmark.lib" )
+#endif
+#endif
+```
+
+Can also use the graphical version of CMake:
+* Open `CMake GUI`.
+* Under `Where to build the binaries`, same path as source plus `build`.
+* Under `CMAKE_INSTALL_PREFIX`, same path as source plus `install`.
+* Click `Configure`, `Generate`, `Open Project`.
+* If build fails, try deleting entire directory and starting again, or unticking options to build less.
+
+## Building with Intel 2015 Update 1 or Intel System Studio Update 4
+
+See instructions for building with Visual Studio. Once built, right click on the solution and change the build to Intel.
+
+## Building on Solaris
+
+If you're running benchmarks on Solaris, you'll want the kstat library linked in
+too (`-lkstat`).
\ No newline at end of file
diff --git a/third_party/google_benchmark/src/docs/python_bindings.md b/third_party/google_benchmark/src/docs/python_bindings.md
new file mode 100644
index 0000000..6a7aab0
--- /dev/null
+++ b/third_party/google_benchmark/src/docs/python_bindings.md
@@ -0,0 +1,34 @@
+# Building and installing Python bindings
+
+Python bindings are available as wheels on [PyPI](https://pypi.org/project/google-benchmark/) for importing and 
+using Google Benchmark directly in Python. 
+Currently, pre-built wheels exist for macOS (both ARM64 and Intel x86), Linux x86-64 and 64-bit Windows.
+Supported Python versions are Python 3.7 - 3.10.
+
+To install Google Benchmark's Python bindings, run:
+
+```bash
+python -m pip install --upgrade pip  # for manylinux2014 support
+python -m pip install google-benchmark
+```
+
+In order to keep your system Python interpreter clean, it is advisable to run these commands in a virtual
+environment. See the [official Python documentation](https://docs.python.org/3/library/venv.html) 
+on how to create virtual environments.
+
+To build a wheel directly from source, you can follow these steps:
+```bash
+git clone https://github.com/google/benchmark.git
+cd benchmark
+# create a virtual environment and activate it
+python3 -m venv venv --system-site-packages
+source venv/bin/activate  # .\venv\Scripts\Activate.ps1 on Windows
+
+# upgrade Python's system-wide packages
+python -m pip install --upgrade pip setuptools wheel
+# builds the wheel and stores it in the directory "wheelhouse".
+python -m pip wheel . -w wheelhouse
+```
+
+NB: Building wheels from source requires Bazel. For platform-specific instructions on how to install Bazel,
+refer to the [Bazel installation docs](https://bazel.build/install).
diff --git a/third_party/google_benchmark/src/docs/random_interleaving.md b/third_party/google_benchmark/src/docs/random_interleaving.md
new file mode 100644
index 0000000..c083036
--- /dev/null
+++ b/third_party/google_benchmark/src/docs/random_interleaving.md
@@ -0,0 +1,13 @@
+<a name="interleaving" />
+
+# Random Interleaving
+
+[Random Interleaving](https://github.com/google/benchmark/issues/1051) is a
+technique to lower run-to-run variance. It randomly interleaves repetitions of a
+microbenchmark with repetitions from other microbenchmarks in the same benchmark
+test. Data shows it is able to lower run-to-run variance by
+[40%](https://github.com/google/benchmark/issues/1051) on average.
+
+To use, you mainly need to set `--benchmark_enable_random_interleaving=true`,
+and optionally specify non-zero repetition count `--benchmark_repetitions=9`
+and optionally decrease the per-repetition time `--benchmark_min_time=0.1`.
diff --git a/third_party/google_benchmark/src/docs/reducing_variance.md b/third_party/google_benchmark/src/docs/reducing_variance.md
new file mode 100644
index 0000000..e566ab9
--- /dev/null
+++ b/third_party/google_benchmark/src/docs/reducing_variance.md
@@ -0,0 +1,100 @@
+# Reducing Variance
+
+<a name="disabling-cpu-frequency-scaling" />
+
+## Disabling CPU Frequency Scaling
+
+If you see this warning:
+
+```
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+```
+
+you might want to disable the CPU frequency scaling while running the
+benchmark, as well as consider other ways to stabilize the performance of
+your system while benchmarking.
+
+See [Reducing Variance in Benchmarks](#reducing-variance) below for more information.
+
+Exactly how to do this depends on the Linux distribution,
+desktop environment, and installed programs.  Specific details are a moving
+target, so we will not attempt to exhaustively document them here.
+
+One simple option is to use the `cpupower` program to change the
+performance governor to "performance".  This tool is maintained along with
+the Linux kernel and provided by your distribution.
+
+It must be run as root, like this:
+
+```bash
+sudo cpupower frequency-set --governor performance
+```
+
+After this you can verify that all CPUs are using the performance governor
+by running this command:
+
+```bash
+cpupower frequency-info -o proc
+```
+
+The benchmarks you subsequently run will have less variance.
+
+<a name="reducing-variance" />
+
+## Reducing Variance in Benchmarks
+
+The Linux CPU frequency governor [discussed
+above](user_guide#disabling-cpu-frequency-scaling) is not the only source
+of noise in benchmarks.  Some, but not all, of the sources of variance
+include:
+
+1. On multi-core machines not all CPUs/CPU cores/CPU threads run the same
+   speed, so running a benchmark one time and then again may give a
+   different result depending on which CPU it ran on.
+2. CPU scaling features that run on the CPU, like Intel's Turbo Boost and
+   AMD Turbo Core and Precision Boost, can temporarily change the CPU
+   frequency even when using the "performance" governor on Linux.
+3. Context switching between CPUs, or scheduling competition on the CPU the
+   benchmark is running on.
+4. Intel Hyperthreading or AMD SMT causing the same issue as above.
+5. Cache effects caused by code running on other CPUs.
+6. Non-uniform memory architectures (NUMA).
+
+These can cause variance in benchmark results within a single run
+(`--benchmark_repetitions=N`) or across multiple runs of the benchmark
+program.
+
+Reducing sources of variance is OS and architecture dependent, which is one
+reason some companies maintain machines dedicated to performance testing.
+
+Some of the easier and effective ways of reducing variance on a typical
+Linux workstation are:
+
+1. Use the performance governor as [discussed
+above](user_guide#disabling-cpu-frequency-scaling).
+1. Disable processor boosting by:
+   ```sh
+   echo 0 | sudo tee /sys/devices/system/cpu/cpufreq/boost
+   ```
+   See the Linux kernel's
+   [boost.txt](https://www.kernel.org/doc/Documentation/cpu-freq/boost.txt)
+   for more information.
+2. Set the benchmark program's task affinity to a fixed cpu.  For example:
+   ```sh
+   taskset -c 0 ./mybenchmark
+   ```
+3. Disabling Hyperthreading/SMT.  This can be done in the BIOS or using the
+   `/sys` file system (see the LLVM project's [Benchmarking
+   tips](https://llvm.org/docs/Benchmarking.html)).
+4. Close other programs that do non-trivial things based on timers, such as
+   your web browser, desktop environment, etc.
+5. Reduce the working set of your benchmark to fit within the L1 cache, but
+   do be aware that this may lead you to optimize for an unrealistic
+   situation.
+
+Further resources on this topic:
+
+1. The LLVM project's [Benchmarking
+   tips](https://llvm.org/docs/Benchmarking.html).
+1. The Arch Wiki [Cpu frequency
+scaling](https://wiki.archlinux.org/title/CPU_frequency_scaling) page.
diff --git a/third_party/google_benchmark/src/docs/releasing.md b/third_party/google_benchmark/src/docs/releasing.md
new file mode 100644
index 0000000..6d3b613
--- /dev/null
+++ b/third_party/google_benchmark/src/docs/releasing.md
@@ -0,0 +1,37 @@
+# How to release
+
+* Make sure you're on main and synced to HEAD
+* Ensure the project builds and tests run
+    * `parallel -j0 exec ::: test/*_test` can help ensure everything at least
+      passes
+* Prepare release notes
+    * `git log $(git describe --abbrev=0 --tags)..HEAD` gives you the list of
+      commits between the last annotated tag and HEAD
+    * Pick the most interesting.
+* Create one last commit that updates the version saved in `CMakeLists.txt` and the
+  `__version__` variable in `bindings/python/google_benchmark/__init__.py` to the release
+  version you're creating. (This version will be used if benchmark is installed from the
+  archive you'll be creating in the next step.)
+
+```
+project (benchmark VERSION 1.6.0 LANGUAGES CXX)
+```
+
+```python
+# bindings/python/google_benchmark/__init__.py
+
+# ...
+
+__version__ = "1.6.0"  # <-- change this to the release version you are creating
+
+# ...
+```
+
+* Create a release through github's interface
+    * Note this will create a lightweight tag.
+    * Update this to an annotated tag:
+      * `git pull --tags`
+      * `git tag -a -f <tag> <tag>`
+      * `git push --force --tags origin`
+* Confirm that the "Build and upload Python wheels" action runs to completion
+    * run it manually if it hasn't run
diff --git a/third_party/google_benchmark/docs/tools.md b/third_party/google_benchmark/src/docs/tools.md
similarity index 98%
rename from third_party/google_benchmark/docs/tools.md
rename to third_party/google_benchmark/src/docs/tools.md
index 4a3b2e9..f2d0c49 100644
--- a/third_party/google_benchmark/docs/tools.md
+++ b/third_party/google_benchmark/src/docs/tools.md
@@ -4,7 +4,11 @@
 
 The `compare.py` can be used to compare the result of benchmarks.
 
-**NOTE**: the utility relies on the scipy package which can be installed using [these instructions](https://www.scipy.org/install.html).
+### Dependencies
+The utility relies on the [scipy](https://www.scipy.org) package which can be installed using pip:
+```bash
+pip3 install -r requirements.txt
+```
 
 ### Displaying aggregates only
 
diff --git a/third_party/google_benchmark/README.md b/third_party/google_benchmark/src/docs/user_guide.md
similarity index 75%
rename from third_party/google_benchmark/README.md
rename to third_party/google_benchmark/src/docs/user_guide.md
index d972ab0..133fca5 100644
--- a/third_party/google_benchmark/README.md
+++ b/third_party/google_benchmark/src/docs/user_guide.md
@@ -1,246 +1,6 @@
-# Benchmark
+# User Guide
 
-[![Build Status](https://travis-ci.org/google/benchmark.svg?branch=master)](https://travis-ci.org/google/benchmark)
-[![Build status](https://ci.appveyor.com/api/projects/status/u0qsyp7t1tk7cpxs/branch/master?svg=true)](https://ci.appveyor.com/project/google/benchmark/branch/master)
-[![Coverage Status](https://coveralls.io/repos/google/benchmark/badge.svg)](https://coveralls.io/r/google/benchmark)
-[![slackin](https://slackin-iqtfqnpzxd.now.sh/badge.svg)](https://slackin-iqtfqnpzxd.now.sh/)
-
-A library to benchmark code snippets, similar to unit tests. Example:
-
-```c++
-#include <benchmark/benchmark.h>
-
-static void BM_SomeFunction(benchmark::State& state) {
-  // Perform setup here
-  for (auto _ : state) {
-    // This code gets timed
-    SomeFunction();
-  }
-}
-// Register the function as a benchmark
-BENCHMARK(BM_SomeFunction);
-// Run the benchmark
-BENCHMARK_MAIN();
-```
-
-To get started, see [Requirements](#requirements) and
-[Installation](#installation). See [Usage](#usage) for a full example and the
-[User Guide](#user-guide) for a more comprehensive feature overview.
-
-It may also help to read the [Google Test documentation](https://github.com/google/googletest/blob/master/googletest/docs/primer.md)
-as some of the structural aspects of the APIs are similar.
-
-### Resources
-
-[Discussion group](https://groups.google.com/d/forum/benchmark-discuss)
-
-IRC channel: [freenode](https://freenode.net) #googlebenchmark
-
-[Additional Tooling Documentation](docs/tools.md)
-
-[Assembly Testing Documentation](docs/AssemblyTests.md)
-
-## Requirements
-
-The library can be used with C++03. However, it requires C++11 to build,
-including compiler and standard library support.
-
-The following minimum versions are required to build the library:
-
-* GCC 4.8
-* Clang 3.4
-* Visual Studio 14 2015
-* Intel 2015 Update 1
-
-See [Platform-Specific Build Instructions](#platform-specific-build-instructions).
-
-## Installation
-
-This describes the installation process using cmake. As pre-requisites, you'll
-need git and cmake installed.
-
-_See [dependencies.md](dependencies.md) for more details regarding supported
-versions of build tools._
-
-```bash
-# Check out the library.
-$ git clone https://github.com/google/benchmark.git
-# Benchmark requires Google Test as a dependency. Add the source tree as a subdirectory.
-$ git clone https://github.com/google/googletest.git benchmark/googletest
-# Go to the library root directory
-$ cd benchmark
-# Make a build directory to place the build output.
-$ mkdir build && cd build
-# Generate a Makefile with cmake.
-# Use cmake -G <generator> to generate a different file type.
-$ cmake ../
-# Build the library.
-# Use make -j<number_of_parallel_jobs> to speed up the build process, e.g. make -j8 .
-$ make
-```
-This builds the `benchmark` and `benchmark_main` libraries and tests.
-On a unix system, the build directory should now look something like this:
-
-```
-/benchmark
-  /build
-    /src
-      /libbenchmark.a
-      /libbenchmark_main.a
-    /test
-      ...
-```
-
-Next, you can run the tests to check the build.
-
-```bash
-$ make test
-```
-
-If you want to install the library globally, also run:
-
-```
-sudo make install
-```
-
-Note that Google Benchmark requires Google Test to build and run the tests. This
-dependency can be provided two ways:
-
-* Checkout the Google Test sources into `benchmark/googletest` as above.
-* Otherwise, if `-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON` is specified during
-  configuration, the library will automatically download and build any required
-  dependencies.
-
-If you do not wish to build and run the tests, add `-DBENCHMARK_ENABLE_GTEST_TESTS=OFF`
-to `CMAKE_ARGS`.
-
-### Debug vs Release
-
-By default, benchmark builds as a debug library. You will see a warning in the
-output when this is the case. To build it as a release library instead, use:
-
-```
-cmake -DCMAKE_BUILD_TYPE=Release
-```
-
-To enable link-time optimisation, use
-
-```
-cmake -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_LTO=true
-```
-
-If you are using gcc, you might need to set `GCC_AR` and `GCC_RANLIB` cmake
-cache variables, if autodetection fails.
-
-If you are using clang, you may need to set `LLVMAR_EXECUTABLE`,
-`LLVMNM_EXECUTABLE` and `LLVMRANLIB_EXECUTABLE` cmake cache variables.
-
-
-### Stable and Experimental Library Versions
-
-The main branch contains the latest stable version of the benchmarking library;
-the API of which can be considered largely stable, with source breaking changes
-being made only upon the release of a new major version.
-
-Newer, experimental, features are implemented and tested on the
-[`v2` branch](https://github.com/google/benchmark/tree/v2). Users who wish
-to use, test, and provide feedback on the new features are encouraged to try
-this branch. However, this branch provides no stability guarantees and reserves
-the right to change and break the API at any time.
-
-## Usage
-
-### Basic usage
-
-Define a function that executes the code to measure, register it as a benchmark
-function using the `BENCHMARK` macro, and ensure an appropriate `main` function
-is available:
-
-```c++
-#include <benchmark/benchmark.h>
-
-static void BM_StringCreation(benchmark::State& state) {
-  for (auto _ : state)
-    std::string empty_string;
-}
-// Register the function as a benchmark
-BENCHMARK(BM_StringCreation);
-
-// Define another benchmark
-static void BM_StringCopy(benchmark::State& state) {
-  std::string x = "hello";
-  for (auto _ : state)
-    std::string copy(x);
-}
-BENCHMARK(BM_StringCopy);
-
-BENCHMARK_MAIN();
-```
-
-To run the benchmark, compile and link against the `benchmark` library
-(libbenchmark.a/.so). If you followed the build steps above, this
-library will be under the build directory you created.
-
-```bash
-# Example on linux after running the build steps above. Assumes the
-# `benchmark` and `build` directories are under the current directory.
-$ g++ mybenchmark.cc -std=c++11 -isystem benchmark/include \
-  -Lbenchmark/build/src -lbenchmark -lpthread -o mybenchmark
-```
-
-Alternatively, link against the `benchmark_main` library and remove
-`BENCHMARK_MAIN();` above to get the same behavior.
-
-The compiled executable will run all benchmarks by default. Pass the `--help`
-flag for option information or see the guide below.
-
-## Platform Specific Build Instructions
-
-### Building with GCC
-
-When the library is built using GCC it is necessary to link with the pthread
-library due to how GCC implements `std::thread`. Failing to link to pthread will
-lead to runtime exceptions (unless you're using libc++), not linker errors. See
-[issue #67](https://github.com/google/benchmark/issues/67) for more details. You
-can link to pthread by adding `-pthread` to your linker command. Note, you can
-also use `-lpthread`, but there are potential issues with ordering of command
-line parameters if you use that.
-
-### Building with Visual Studio 2015 or 2017
-
-The `shlwapi` library (`-lshlwapi`) is required to support a call to `CPUInfo` which reads the registry. Either add `shlwapi.lib` under `[ Configuration Properties > Linker > Input ]`, or use the following:
-
-```
-// Alternatively, can add libraries using linker options.
-#ifdef _WIN32
-#pragma comment ( lib, "Shlwapi.lib" )
-#ifdef _DEBUG
-#pragma comment ( lib, "benchmarkd.lib" )
-#else
-#pragma comment ( lib, "benchmark.lib" )
-#endif
-#endif
-```
-
-Can also use the graphical version of CMake:
-* Open `CMake GUI`.
-* Under `Where to build the binaries`, same path as source plus `build`.
-* Under `CMAKE_INSTALL_PREFIX`, same path as source plus `install`.
-* Click `Configure`, `Generate`, `Open Project`.
-* If build fails, try deleting entire directory and starting again, or unticking options to build less.
-
-### Building with Intel 2015 Update 1 or Intel System Studio Update 4
-
-See instructions for building with Visual Studio. Once built, right click on the solution and change the build to Intel.
-
-### Building on Solaris
-
-If you're running benchmarks on solaris, you'll want the kstat library linked in
-too (`-lkstat`).
-
-## User Guide
-
-### Command Line
+## Command Line
 
 [Output Formats](#output-formats)
 
@@ -252,12 +12,18 @@
 
 [Result Comparison](#result-comparison)
 
-### Library
+[Extra Context](#extra-context)
+
+## Library
 
 [Runtime and Reporting Considerations](#runtime-and-reporting-considerations)
 
+[Setup/Teardown](#setupteardown)
+
 [Passing Arguments](#passing-arguments)
 
+[Custom Benchmark Name](#custom-benchmark-name)
+
 [Calculating Asymptotic Complexity](#asymptotic-complexity)
 
 [Templated Benchmarks](#templated-benchmarks)
@@ -274,24 +40,33 @@
 
 [Setting the Time Unit](#setting-the-time-unit)
 
+[Random Interleaving](random_interleaving.md)
+
+[User-Requested Performance Counters](perf_counters.md)
+
 [Preventing Optimization](#preventing-optimization)
 
 [Reporting Statistics](#reporting-statistics)
 
 [Custom Statistics](#custom-statistics)
 
+[Memory Usage](#memory-usage)
+
 [Using RegisterBenchmark](#using-register-benchmark)
 
 [Exiting with an Error](#exiting-with-an-error)
 
-[A Faster KeepRunning Loop](#a-faster-keep-running-loop)
+[A Faster `KeepRunning` Loop](#a-faster-keep-running-loop)
+
+## Benchmarking Tips
 
 [Disabling CPU Frequency Scaling](#disabling-cpu-frequency-scaling)
 
+[Reducing Variance in Benchmarks](reducing_variance.md)
 
 <a name="output-formats" />
 
-### Output Formats
+## Output Formats
 
 The library supports multiple output formats. Use the
 `--benchmark_format=<console|json|csv>` flag (or set the
@@ -366,17 +141,21 @@
 
 <a name="output-files" />
 
-### Output Files
+## Output Files
 
 Write benchmark results to a file with the `--benchmark_out=<filename>` option
 (or set `BENCHMARK_OUT`). Specify the output format with
 `--benchmark_out_format={json|console|csv}` (or set
-`BENCHMARK_OUT_FORMAT={json|console|csv}`). Note that specifying
-`--benchmark_out` does not suppress the console output.
+`BENCHMARK_OUT_FORMAT={json|console|csv}`). Note that the 'csv' reporter is
+deprecated and the saved `.csv` file
+[is not parsable](https://github.com/google/benchmark/issues/794) by csv
+parsers.
+
+Specifying `--benchmark_out` does not suppress the console output.
 
 <a name="running-benchmarks" />
 
-### Running Benchmarks
+## Running Benchmarks
 
 Benchmarks are executed by running the produced binaries. Benchmarks binaries,
 by default, accept options that may be specified either through their command
@@ -388,7 +167,7 @@
 
 <a name="running-a-subset-of-benchmarks" />
 
-### Running a Subset of Benchmarks
+## Running a Subset of Benchmarks
 
 The `--benchmark_filter=<regex>` option (or `BENCHMARK_FILTER=<regex>`
 environment variable) can be used to only run the benchmarks that match
@@ -406,16 +185,50 @@
 BM_memcpy/32k       1834 ns       1837 ns     357143
 ```
 
+## Disabling Benchmarks
+
+It is possible to temporarily disable benchmarks by renaming the benchmark
+function to have the prefix "DISABLED_". This will cause the benchmark to
+be skipped at runtime.
+
 <a name="result-comparison" />
 
-### Result comparison
+## Result comparison
 
 It is possible to compare the benchmarking results.
-See [Additional Tooling Documentation](docs/tools.md)
+See [Additional Tooling Documentation](tools.md)
+
+<a name="extra-context" />
+
+## Extra Context
+
+Sometimes it's useful to add extra context to the content printed before the
+results. By default this section includes information about the CPU on which
+the benchmarks are running. If you do want to add more context, you can use
+the `benchmark_context` command line flag:
+
+```bash
+$ ./run_benchmarks --benchmark_context=pwd=`pwd`
+Run on (1 x 2300 MHz CPU)
+pwd: /home/user/benchmark/
+Benchmark              Time           CPU Iterations
+----------------------------------------------------
+BM_memcpy/32          11 ns         11 ns   79545455
+BM_memcpy/32k       2181 ns       2185 ns     324074
+```
+
+You can get the same effect with the API:
+
+```c++
+  benchmark::AddCustomContext("foo", "bar");
+```
+
+Note that attempts to add a second value with the same key will fail with an
+error message.
 
 <a name="runtime-and-reporting-considerations" />
 
-### Runtime and Reporting Considerations
+## Runtime and Reporting Considerations
 
 When the benchmark binary is executed, each benchmark function is run serially.
 The number of iterations to run is determined dynamically by running the
@@ -430,6 +243,19 @@
 the minimum time, or the wallclock time is 5x minimum time. The minimum time is
 set per benchmark by calling `MinTime` on the registered benchmark object.
 
+Furthermore, warming up a benchmark might be necessary in order to get
+stable results because of e.g. caching effects of the code under benchmark.
+Warming up means running the benchmark a given amount of time, before
+results are actually taken into account. The amount of time for which
+the warmup should be run can be set per benchmark by calling
+`MinWarmUpTime` on the registered benchmark object or for all benchmarks
+using the `--benchmark_min_warmup_time` command-line option. Note that
+`MinWarmUpTime` will overwrite the value of `--benchmark_min_warmup_time`
+for the single benchmark. How many iterations the warmup run of each
+benchmark takes is determined the same way as described in the paragraph
+above. By default the warmup phase is set to 0 seconds and is therefore
+disabled.
+
 Average timings are then reported over the iterations run. If multiple
 repetitions are requested using the `--benchmark_repetitions` command-line
 option, or at registration time, the benchmark function will be run several
@@ -438,9 +264,43 @@
 As well as the per-benchmark entries, a preamble in the report will include
 information about the machine on which the benchmarks are run.
 
+<a name="setup-teardown" />
+
+## Setup/Teardown
+
+Global setup/teardown specific to each benchmark can be done by
+passing a callback to Setup/Teardown:
+
+The setup/teardown callbacks will be invoked once for each benchmark. If the
+benchmark is multi-threaded (will run in k threads), they will be invoked
+exactly once before each run with k threads.
+
+If the benchmark uses different size groups of threads, the above will be true
+for each size group.
+
+E.g.,
+
+```c++
+static void DoSetup(const benchmark::State& state) {
+}
+
+static void DoTeardown(const benchmark::State& state) {
+}
+
+static void BM_func(benchmark::State& state) {...}
+
+BENCHMARK(BM_func)->Arg(1)->Arg(3)->Threads(16)->Threads(32)->Setup(DoSetup)->Teardown(DoTeardown);
+
+```
+
+In this example, `DoSetup` and `DoTeardown` will be invoked 4 times each,
+specifically, once for each of this family:
+ - BM_func_Arg_1_Threads_16, BM_func_Arg_1_Threads_32
+ - BM_func_Arg_3_Threads_16, BM_func_Arg_3_Threads_32
+
 <a name="passing-arguments" />
 
-### Passing Arguments
+## Passing Arguments
 
 Sometimes a family of benchmarks can be implemented with just one routine that
 takes an extra argument to specify which one of the family of benchmarks to
@@ -459,7 +319,7 @@
   delete[] src;
   delete[] dst;
 }
-BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10);
+BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(8<<10);
 ```
 
 The preceding code is quite repetitive, and can be replaced with the following
@@ -488,7 +348,8 @@
 static void BM_DenseRange(benchmark::State& state) {
   for(auto _ : state) {
     std::vector<int> v(state.range(0), state.range(0));
-    benchmark::DoNotOptimize(v.data());
+    auto data = v.data();
+    benchmark::DoNotOptimize(data);
     benchmark::ClobberMemory();
   }
 }
@@ -528,9 +389,53 @@
 product of the two specified ranges and will generate a benchmark for each such
 pair.
 
+<!-- {% raw %} -->
 ```c++
 BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {128, 512}});
 ```
+<!-- {% endraw %} -->
+
+Some benchmarks may require specific argument values that cannot be expressed
+with `Ranges`. In this case, `ArgsProduct` offers the ability to generate a
+benchmark input for each combination in the product of the supplied vectors.
+
+<!-- {% raw %} -->
+```c++
+BENCHMARK(BM_SetInsert)
+    ->ArgsProduct({{1<<10, 3<<10, 8<<10}, {20, 40, 60, 80}})
+// would generate the same benchmark arguments as
+BENCHMARK(BM_SetInsert)
+    ->Args({1<<10, 20})
+    ->Args({3<<10, 20})
+    ->Args({8<<10, 20})
+    ->Args({3<<10, 40})
+    ->Args({8<<10, 40})
+    ->Args({1<<10, 40})
+    ->Args({1<<10, 60})
+    ->Args({3<<10, 60})
+    ->Args({8<<10, 60})
+    ->Args({1<<10, 80})
+    ->Args({3<<10, 80})
+    ->Args({8<<10, 80});
+```
+<!-- {% endraw %} -->
+
+For the most common scenarios, helper methods for creating a list of
+integers for a given sparse or dense range are provided.
+
+```c++
+BENCHMARK(BM_SetInsert)
+    ->ArgsProduct({
+      benchmark::CreateRange(8, 128, /*multi=*/2),
+      benchmark::CreateDenseRange(1, 4, /*step=*/1)
+    })
+// would generate the same benchmark arguments as
+BENCHMARK(BM_SetInsert)
+    ->ArgsProduct({
+      {8, 16, 32, 64, 128},
+      {1, 2, 3, 4}
+    });
+```
 
 For more complex patterns of inputs, passing a custom function to `Apply` allows
 programmatic specification of an arbitrary set of arguments on which to run the
@@ -546,7 +451,7 @@
 BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
 ```
 
-#### Passing Arbitrary Arguments to a Benchmark
+### Passing Arbitrary Arguments to a Benchmark
 
 In C++11 it is possible to define a benchmark that takes an arbitrary number
 of extra arguments. The `BENCHMARK_CAPTURE(func, test_case_name, ...args)`
@@ -556,13 +461,22 @@
 should describe the values passed.
 
 ```c++
-template <class ...ExtraArgs>
-void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) {
-  [...]
+template <class ...Args>
+void BM_takes_args(benchmark::State& state, Args&&... args) {
+  auto args_tuple = std::make_tuple(std::move(args)...);
+  for (auto _ : state) {
+    std::cout << std::get<0>(args_tuple) << ": " << std::get<1>(args_tuple)
+              << '\n';
+    [...]
+  }
 }
 // Registers a benchmark named "BM_takes_args/int_string_test" that passes
-// the specified values to `extra_args`.
+// the specified values to `args`.
 BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));
+
+// Registers the same function as a second benchmark, "BM_takes_args/int_test",
+// which passes the specified values to `args`.
+BENCHMARK_CAPTURE(BM_takes_args, int_test, 42, 43);
 ```
 
 Note that elements of `...args` may refer to global variables. Users should
@@ -570,7 +484,7 @@
 
 <a name="asymptotic-complexity" />
 
-### Calculating Asymptotic Complexity (Big O)
+## Calculating Asymptotic Complexity (Big O)
 
 Asymptotic complexity might be calculated for a family of benchmarks. The
 following code will calculate the coefficient for the high-order term in the
@@ -581,7 +495,8 @@
   std::string s1(state.range(0), '-');
   std::string s2(state.range(0), '-');
   for (auto _ : state) {
-    benchmark::DoNotOptimize(s1.compare(s2));
+    auto comparison_result = s1.compare(s2);
+    benchmark::DoNotOptimize(comparison_result);
   }
   state.SetComplexityN(state.range(0));
 }
@@ -602,12 +517,25 @@
 
 ```c++
 BENCHMARK(BM_StringCompare)->RangeMultiplier(2)
-    ->Range(1<<10, 1<<18)->Complexity([](int64_t n)->double{return n; });
+    ->Range(1<<10, 1<<18)->Complexity([](benchmark::IterationCount n)->double{return n; });
 ```
 
+<a name="custom-benchmark-name" />
+
+## Custom Benchmark Name
+
+You can change the benchmark's name as follows:
+
+```c++
+BENCHMARK(BM_memcpy)->Name("memcpy")->RangeMultiplier(2)->Range(8, 8<<10);
+```
+
+The invocation will execute the benchmark as before using `BM_memcpy` but changes
+the prefix in the report to `memcpy`.
+
 <a name="templated-benchmarks" />
 
-### Templated Benchmarks
+## Templated Benchmarks
 
 This example produces and consumes messages of size `sizeof(v)` `range_x`
 times. It also outputs throughput in the absence of multiprogramming.
@@ -626,14 +554,19 @@
   state.SetBytesProcessed(
       static_cast<int64_t>(state.iterations())*state.range(0));
 }
+// C++03
 BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
+
+// C++11 or newer, you can use the BENCHMARK macro with template parameters:
+BENCHMARK(BM_Sequential<WaitQueue<int>>)->Range(1<<0, 1<<10);
+
 ```
 
 Three macros are provided for adding benchmark templates.
 
 ```c++
 #ifdef BENCHMARK_HAS_CXX11
-#define BENCHMARK_TEMPLATE(func, ...) // Takes any number of parameters.
+#define BENCHMARK(func<...>) // Takes any number of parameters.
 #else // C++ < C++11
 #define BENCHMARK_TEMPLATE(func, arg1)
 #endif
@@ -643,7 +576,7 @@
 
 <a name="fixtures" />
 
-### Fixtures
+## Fixtures
 
 Fixture tests are created by first defining a type that derives from
 `::benchmark::Fixture` and then creating/registering the tests using the
@@ -681,7 +614,7 @@
 /* BarTest is now registered */
 ```
 
-#### Templated Fixtures
+### Templated Fixtures
 
 Also you can create templated fixture by using the following macros:
 
@@ -711,7 +644,7 @@
 
 <a name="custom-counters" />
 
-### Custom Counters
+## Custom Counters
 
 You can add your own counters with user-defined names. The example below
 will add columns "Foo", "Bar" and "Baz" in its output:
@@ -772,6 +705,7 @@
 When you're compiling in C++11 mode or later you can use `insert()` with
 `std::initializer_list`:
 
+<!-- {% raw %} -->
 ```c++
   // With C++11, this can be done:
   state.counters.insert({{"Foo", numFoos}, {"Bar", numBars}, {"Baz", numBazs}});
@@ -780,8 +714,9 @@
   state.counters["Bar"] = numBars;
   state.counters["Baz"] = numBazs;
 ```
+<!-- {% endraw %} -->
 
-#### Counter Reporting
+### Counter Reporting
 
 When using the console reporter, by default, user counters are printed at
 the end after the table, the same way as ``bytes_processed`` and
@@ -851,7 +786,7 @@
 
 <a name="multithreaded-benchmarks"/>
 
-### Multithreaded Benchmarks
+## Multithreaded Benchmarks
 
 In a multithreaded test (benchmark invoked by multiple threads simultaneously),
 it is guaranteed that none of the threads will start until all have reached
@@ -862,19 +797,29 @@
 
 ```c++
 static void BM_MultiThreaded(benchmark::State& state) {
-  if (state.thread_index == 0) {
+  if (state.thread_index() == 0) {
     // Setup code here.
   }
   for (auto _ : state) {
     // Run the test as normal.
   }
-  if (state.thread_index == 0) {
+  if (state.thread_index() == 0) {
     // Teardown code here.
   }
 }
 BENCHMARK(BM_MultiThreaded)->Threads(2);
 ```
 
+To run the benchmark across a range of thread counts, instead of `Threads`, use
+`ThreadRange`. This takes two parameters (`min_threads` and `max_threads`) and
+runs the benchmark once per thread count, doubling the count across the inclusive range. For example:
+
+```c++
+BENCHMARK(BM_MultiThreaded)->ThreadRange(1, 8);
+```
+
+will run `BM_MultiThreaded` with thread counts 1, 2, 4, and 8.
+
 If the benchmarked code itself uses threads and you want to compare it to
 single-threaded code, you may want to use real-time ("wallclock") measurements
 for latency comparisons:
@@ -887,7 +832,7 @@
 
 <a name="cpu-timers" />
 
-### CPU Timers
+## CPU Timers
 
 By default, the CPU timer only measures the time spent by the main thread.
 If the benchmark itself uses threads internally, this measurement may not
@@ -931,13 +876,14 @@
 BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->MeasureProcessCPUTime()->UseRealTime();
 ```
 
-#### Controlling Timers
+### Controlling Timers
 
 Normally, the entire duration of the work loop (`for (auto _ : state) {}`)
 is measured. But sometimes, it is necessary to do some work inside of
 that loop, every iteration, but without counting that time to the benchmark time.
 That is possible, although it is not recommended, since it has high overhead.
 
+<!-- {% raw %} -->
 ```c++
 static void BM_SetInsert_With_Timer_Control(benchmark::State& state) {
   std::set<int> data;
@@ -952,10 +898,11 @@
 }
 BENCHMARK(BM_SetInsert_With_Timer_Control)->Ranges({{1<<10, 8<<10}, {128, 512}});
 ```
+<!-- {% endraw %} -->
 
 <a name="manual-timing" />
 
-### Manual Timing
+## Manual Timing
 
 For benchmarking something for which neither CPU time nor real-time are
 correct or accurate enough, completely manual timing is supported using
@@ -996,7 +943,7 @@
 
 <a name="setting-the-time-unit" />
 
-### Setting the Time Unit
+## Setting the Time Unit
 
 If a benchmark runs a few milliseconds it may be hard to visually compare the
 measured times, since the output data is given in nanoseconds per default. In
@@ -1006,9 +953,13 @@
 BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
 ```
 
+Additionally the default time unit can be set globally with the
+`--benchmark_time_unit={ns|us|ms|s}` command line argument. The argument only
+affects benchmarks where the time unit is not set explicitly.
+
 <a name="preventing-optimization" />
 
-### Preventing Optimization
+## Preventing Optimization
 
 To prevent a value or expression from being optimized away by the compiler
 the `benchmark::DoNotOptimize(...)` and `benchmark::ClobberMemory()`
@@ -1058,7 +1009,8 @@
   for (auto _ : state) {
     std::vector<int> v;
     v.reserve(1);
-    benchmark::DoNotOptimize(v.data()); // Allow v.data() to be clobbered.
+    auto data = v.data();           // Allow v.data() to be clobbered. Pass as non-const
+    benchmark::DoNotOptimize(data); // lvalue to avoid undesired compiler optimizations
     v.push_back(42);
     benchmark::ClobberMemory(); // Force 42 to be written to memory.
   }
@@ -1069,7 +1021,7 @@
 
 <a name="reporting-statistics" />
 
-### Statistics: Reporting the Mean, Median and Standard Deviation of Repeated Benchmarks
+## Statistics: Reporting the Mean, Median and Standard Deviation / Coefficient of variation of Repeated Benchmarks
 
 By default each benchmark is run once and that single result is reported.
 However benchmarks are often noisy and a single result may not be representative
@@ -1079,16 +1031,17 @@
 The number of runs of each benchmark is specified globally by the
 `--benchmark_repetitions` flag or on a per benchmark basis by calling
 `Repetitions` on the registered benchmark object. When a benchmark is run more
-than once the mean, median and standard deviation of the runs will be reported.
+than once the mean, median, standard deviation and coefficient of variation
+of the runs will be reported.
 
 Additionally the `--benchmark_report_aggregates_only={true|false}`,
 `--benchmark_display_aggregates_only={true|false}` flags or
 `ReportAggregatesOnly(bool)`, `DisplayAggregatesOnly(bool)` functions can be
 used to change how repeated tests are reported. By default the result of each
 repeated run is reported. When `report aggregates only` option is `true`,
-only the aggregates (i.e. mean, median and standard deviation, maybe complexity
-measurements if they were requested) of the runs is reported, to both the
-reporters - standard output (console), and the file.
+only the aggregates (i.e. mean, median, standard deviation and coefficient
+of variation, maybe complexity measurements if they were requested) of the runs
+is reported, to both the reporters - standard output (console), and the file.
 However when only the `display aggregates only` option is `true`,
 only the aggregates are displayed in the standard output, while the file
 output still contains everything.
@@ -1098,13 +1051,12 @@
 
 <a name="custom-statistics" />
 
-### Custom Statistics
+## Custom Statistics
 
-While having mean, median and standard deviation is nice, this may not be
-enough for everyone. For example you may want to know what the largest
-observation is, e.g. because you have some real-time constraints. This is easy.
-The following code will specify a custom statistic to be calculated, defined
-by a lambda function.
+While having these aggregates is nice, this may not be enough for everyone.
+For example you may want to know what the largest observation is, e.g. because
+you have some real-time constraints. This is easy. The following code will
+specify a custom statistic to be calculated, defined by a lambda function.
 
 ```c++
 void BM_spin_empty(benchmark::State& state) {
@@ -1122,9 +1074,43 @@
   ->Arg(512);
 ```
 
+While usually the statistics produce values in time units,
+you can also produce percentages:
+
+```c++
+void BM_spin_empty(benchmark::State& state) {
+  for (auto _ : state) {
+    for (int x = 0; x < state.range(0); ++x) {
+      benchmark::DoNotOptimize(x);
+    }
+  }
+}
+
+BENCHMARK(BM_spin_empty)
+  ->ComputeStatistics("ratio", [](const std::vector<double>& v) -> double {
+    return v.front() / v.back();
+  }, benchmark::StatisticUnit::kPercentage)
+  ->Arg(512);
+```
+
+<a name="memory-usage" />
+
+## Memory Usage
+
+It's often useful to also track memory usage for benchmarks, alongside CPU
+performance. For this reason, benchmark offers the `RegisterMemoryManager`
+method that allows a custom `MemoryManager` to be injected.
+
+If set, the `MemoryManager::Start` and `MemoryManager::Stop` methods will be
+called at the start and end of benchmark runs to allow user code to fill out
+a report on the number of allocations, bytes used, etc.
+
+This data will then be reported alongside other performance data, currently
+only when using JSON output.
+
 <a name="using-register-benchmark" />
 
-### Using RegisterBenchmark(name, fn, args...)
+## Using RegisterBenchmark(name, fn, args...)
 
 The `RegisterBenchmark(name, func, args...)` function provides an alternative
 way to create and register benchmarks.
@@ -1148,23 +1134,26 @@
       benchmark::RegisterBenchmark(test_input.name(), BM_test, test_input);
   benchmark::Initialize(&argc, argv);
   benchmark::RunSpecifiedBenchmarks();
+  benchmark::Shutdown();
 }
 ```
 
 <a name="exiting-with-an-error" />
 
-### Exiting with an Error
+## Exiting with an Error
 
 When errors caused by external influences, such as file I/O and network
 communication, occur within a benchmark the
-`State::SkipWithError(const char* msg)` function can be used to skip that run
+`State::SkipWithError(const std::string& msg)` function can be used to skip that run
 of benchmark and report the error. Note that only future iterations of the
 `KeepRunning()` are skipped. For the ranged-for version of the benchmark loop
 Users must explicitly exit the loop, otherwise all iterations will be performed.
 Users may explicitly return to exit the benchmark immediately.
 
 The `SkipWithError(...)` function may be used at any point within the benchmark,
-including before and after the benchmark loop.
+including before and after the benchmark loop. Moreover, if `SkipWithError(...)`
+has been used, it is not required to reach the benchmark loop and one may return
+from the benchmark function early.
 
 For example:
 
@@ -1172,30 +1161,38 @@
 static void BM_test(benchmark::State& state) {
   auto resource = GetResource();
   if (!resource.good()) {
-      state.SkipWithError("Resource is not good!");
-      // KeepRunning() loop will not be entered.
+    state.SkipWithError("Resource is not good!");
+    // KeepRunning() loop will not be entered.
   }
   while (state.KeepRunning()) {
-      auto data = resource.read_data();
-      if (!resource.good()) {
-        state.SkipWithError("Failed to read data!");
-        break; // Needed to skip the rest of the iteration.
-     }
-     do_stuff(data);
+    auto data = resource.read_data();
+    if (!resource.good()) {
+      state.SkipWithError("Failed to read data!");
+      break; // Needed to skip the rest of the iteration.
+    }
+    do_stuff(data);
   }
 }
 
 static void BM_test_ranged_fo(benchmark::State & state) {
-  state.SkipWithError("test will not be entered");
+  auto resource = GetResource();
+  if (!resource.good()) {
+    state.SkipWithError("Resource is not good!");
+    return; // Early return is allowed when SkipWithError() has been used.
+  }
   for (auto _ : state) {
-    state.SkipWithError("Failed!");
-    break; // REQUIRED to prevent all further iterations.
+    auto data = resource.read_data();
+    if (!resource.good()) {
+      state.SkipWithError("Failed to read data!");
+      break; // REQUIRED to prevent all further iterations.
+    }
+    do_stuff(data);
   }
 }
 ```
 <a name="a-faster-keep-running-loop" />
 
-### A Faster KeepRunning Loop
+## A Faster KeepRunning Loop
 
 In C++11 mode, a ranged-based for loop should be used in preference to
 the `KeepRunning` loop for running the benchmarks. For example:
@@ -1253,18 +1250,17 @@
 
 <a name="disabling-cpu-frequency-scaling" />
 
-### Disabling CPU Frequency Scaling
+## Disabling CPU Frequency Scaling
 
 If you see this error:
 
 ```
-***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may
+be noisy and will incur extra overhead.
 ```
 
-you might want to disable the CPU frequency scaling while running the benchmark:
+you might want to disable the CPU frequency scaling while running the
+benchmark, as well as consider other ways to stabilize the performance of
+your system while benchmarking.
 
-```bash
-sudo cpupower frequency-set --governor performance
-./mybench
-sudo cpupower frequency-set --governor powersave
-```
+See [Reducing Variance](reducing_variance.md) for more information.
diff --git a/third_party/google_benchmark/include/benchmark/benchmark.h b/third_party/google_benchmark/src/include/benchmark/benchmark.h
similarity index 69%
rename from third_party/google_benchmark/include/benchmark/benchmark.h
rename to third_party/google_benchmark/src/include/benchmark/benchmark.h
index 144e212..e44d534 100644
--- a/third_party/google_benchmark/include/benchmark/benchmark.h
+++ b/third_party/google_benchmark/src/include/benchmark/benchmark.h
@@ -34,7 +34,7 @@
 BENCHMARK(BM_StringCopy);
 
 // Augment the main() program to invoke benchmarks if specified
-// via the --benchmarks command line flag.  E.g.,
+// via the --benchmark_filter command line flag.  E.g.,
 //       my_unittest --benchmark_filter=all
 //       my_unittest --benchmark_filter=BM_StringCreation
 //       my_unittest --benchmark_filter=String
@@ -42,6 +42,7 @@
 int main(int argc, char** argv) {
   benchmark::Initialize(&argc, argv);
   benchmark::RunSpecifiedBenchmarks();
+  benchmark::Shutdown();
   return 0;
 }
 
@@ -139,13 +140,13 @@
 do can be wrapped in a check against the thread index:
 
 static void BM_MultiThreaded(benchmark::State& state) {
-  if (state.thread_index == 0) {
+  if (state.thread_index() == 0) {
     // Setup code here.
   }
   for (auto _ : state) {
     // Run the test as normal.
   }
-  if (state.thread_index == 0) {
+  if (state.thread_index() == 0) {
     // Teardown code here.
   }
 }
@@ -167,18 +168,29 @@
 #define BENCHMARK_HAS_CXX11
 #endif
 
+// This _MSC_VER check should detect VS 2017 v15.3 and newer.
+#if __cplusplus >= 201703L || \
+    (defined(_MSC_VER) && _MSC_VER >= 1911 && _MSVC_LANG >= 201703L)
+#define BENCHMARK_HAS_CXX17
+#endif
+
 #include <stdint.h>
 
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
 #include <iosfwd>
+#include <limits>
 #include <map>
 #include <set>
 #include <string>
+#include <utility>
 #include <vector>
 
+#include "benchmark/export.h"
+
 #if defined(BENCHMARK_HAS_CXX11)
+#include <atomic>
 #include <initializer_list>
 #include <type_traits>
 #include <utility>
@@ -198,42 +210,63 @@
   TypeName& operator=(const TypeName&) = delete
 #endif
 
-#if defined(__GNUC__)
+#ifdef BENCHMARK_HAS_CXX17
+#define BENCHMARK_UNUSED [[maybe_unused]]
+#elif defined(__GNUC__) || defined(__clang__)
 #define BENCHMARK_UNUSED __attribute__((unused))
-#define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
-#define BENCHMARK_NOEXCEPT noexcept
-#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
-#elif defined(_MSC_VER) && !defined(__clang__)
-#define BENCHMARK_UNUSED
-#define BENCHMARK_ALWAYS_INLINE __forceinline
-#if _MSC_VER >= 1900
-#define BENCHMARK_NOEXCEPT noexcept
-#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
 #else
-#define BENCHMARK_NOEXCEPT
-#define BENCHMARK_NOEXCEPT_OP(x)
+#define BENCHMARK_UNUSED
 #endif
+
+// Used to annotate functions, methods and classes so they
+// are not optimized by the compiler. Useful for tests
+// where you expect loops to stay in place churning cycles
+#if defined(__clang__)
+#define BENCHMARK_DONT_OPTIMIZE __attribute__((optnone))
+#elif defined(__GNUC__) || defined(__GNUG__)
+#define BENCHMARK_DONT_OPTIMIZE __attribute__((optimize(0)))
+#else
+// MSVC & Intel do not have a no-optimize attribute, only line pragmas
+#define BENCHMARK_DONT_OPTIMIZE
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
+#elif defined(_MSC_VER) && !defined(__clang__)
+#define BENCHMARK_ALWAYS_INLINE __forceinline
 #define __func__ __FUNCTION__
 #else
-#define BENCHMARK_UNUSED
 #define BENCHMARK_ALWAYS_INLINE
-#define BENCHMARK_NOEXCEPT
-#define BENCHMARK_NOEXCEPT_OP(x)
 #endif
 
 #define BENCHMARK_INTERNAL_TOSTRING2(x) #x
 #define BENCHMARK_INTERNAL_TOSTRING(x) BENCHMARK_INTERNAL_TOSTRING2(x)
 
-#if defined(__GNUC__) || defined(__clang__)
+// clang-format off
+#if (defined(__GNUC__) && !defined(__NVCC__) && !defined(__NVCOMPILER)) || defined(__clang__)
 #define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y)
 #define BENCHMARK_DEPRECATED_MSG(msg) __attribute__((deprecated(msg)))
+#define BENCHMARK_DISABLE_DEPRECATED_WARNING \
+  _Pragma("GCC diagnostic push")             \
+  _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
+#define BENCHMARK_RESTORE_DEPRECATED_WARNING _Pragma("GCC diagnostic pop")
+#elif defined(__NVCOMPILER)
+#define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y)
+#define BENCHMARK_DEPRECATED_MSG(msg) __attribute__((deprecated(msg)))
+#define BENCHMARK_DISABLE_DEPRECATED_WARNING \
+  _Pragma("diagnostic push") \
+  _Pragma("diag_suppress deprecated_entity_with_custom_message")
+#define BENCHMARK_RESTORE_DEPRECATED_WARNING _Pragma("diagnostic pop")
 #else
 #define BENCHMARK_BUILTIN_EXPECT(x, y) x
 #define BENCHMARK_DEPRECATED_MSG(msg)
 #define BENCHMARK_WARNING_MSG(msg)                           \
   __pragma(message(__FILE__ "(" BENCHMARK_INTERNAL_TOSTRING( \
       __LINE__) ") : warning note: " msg))
+#define BENCHMARK_DISABLE_DEPRECATED_WARNING
+#define BENCHMARK_RESTORE_DEPRECATED_WARNING
 #endif
+// clang-format on
 
 #if defined(__GNUC__) && !defined(__clang__)
 #define BENCHMARK_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
@@ -251,21 +284,60 @@
 #define BENCHMARK_UNREACHABLE() ((void)0)
 #endif
 
+#ifdef BENCHMARK_HAS_CXX11
+#define BENCHMARK_OVERRIDE override
+#else
+#define BENCHMARK_OVERRIDE
+#endif
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+// C4251: <symbol> needs to have dll-interface to be used by clients of class
+#pragma warning(disable : 4251)
+#endif
+
 namespace benchmark {
 class BenchmarkReporter;
-class MemoryManager;
 
-void Initialize(int* argc, char** argv);
+// Default minimum benchmark running time, as a string with a unit suffix.
+const char kDefaultMinTimeStr[] = "0.5s";
+
+BENCHMARK_EXPORT void PrintDefaultHelp();
+
+BENCHMARK_EXPORT void Initialize(int* argc, char** argv,
+                                 void (*HelperPrinterf)() = PrintDefaultHelp);
+BENCHMARK_EXPORT void Shutdown();
 
 // Report to stdout all arguments in 'argv' as unrecognized except the first.
 // Returns true there is at least on unrecognized argument (i.e. 'argc' > 1).
-bool ReportUnrecognizedArguments(int argc, char** argv);
+BENCHMARK_EXPORT bool ReportUnrecognizedArguments(int argc, char** argv);
+
+// Returns the current value of --benchmark_filter.
+BENCHMARK_EXPORT std::string GetBenchmarkFilter();
+
+// Sets a new value to --benchmark_filter. (This will override this flag's
+// current value).
+// Should be called after `benchmark::Initialize()`, as
+// `benchmark::Initialize()` will override the flag's value.
+BENCHMARK_EXPORT void SetBenchmarkFilter(std::string value);
+
+// Returns the current value of --v (command line value for verbosity).
+BENCHMARK_EXPORT int32_t GetBenchmarkVerbosity();
+
+// Creates a default display reporter. Used by the library when no display
+// reporter is provided, but also made available for external use in case a
+// custom reporter should respect the `--benchmark_format` flag as a fallback
+BENCHMARK_EXPORT BenchmarkReporter* CreateDefaultDisplayReporter();
 
 // Generate a list of benchmarks matching the specified --benchmark_filter flag
 // and if --benchmark_list_tests is specified return after printing the name
 // of each matching benchmark. Otherwise run each matching benchmark and
 // report the results.
 //
+// spec : Specify the benchmarks to run. If users do not specify this arg,
+//        then the value of FLAGS_benchmark_filter
+//        will be used.
+//
 // The second and third overload use the specified 'display_reporter' and
 //  'file_reporter' respectively. 'file_reporter' will write to the file
 //  specified
@@ -273,28 +345,94 @@
 //  'file_reporter' is ignored.
 //
 // RETURNS: The number of matching benchmarks.
-size_t RunSpecifiedBenchmarks();
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter);
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
-                              BenchmarkReporter* file_reporter);
+BENCHMARK_EXPORT size_t RunSpecifiedBenchmarks();
+BENCHMARK_EXPORT size_t RunSpecifiedBenchmarks(std::string spec);
+
+BENCHMARK_EXPORT size_t
+RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter);
+BENCHMARK_EXPORT size_t
+RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter, std::string spec);
+
+BENCHMARK_EXPORT size_t RunSpecifiedBenchmarks(
+    BenchmarkReporter* display_reporter, BenchmarkReporter* file_reporter);
+BENCHMARK_EXPORT size_t
+RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
+                       BenchmarkReporter* file_reporter, std::string spec);
+
+// TimeUnit is passed to a benchmark in order to specify the order of magnitude
+// for the measured time.
+enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond, kSecond };
+
+BENCHMARK_EXPORT TimeUnit GetDefaultTimeUnit();
+
+// Sets the default time unit the benchmarks use
+// Has to be called before the benchmark loop to take effect
+BENCHMARK_EXPORT void SetDefaultTimeUnit(TimeUnit unit);
+
+// If a MemoryManager is registered (via RegisterMemoryManager()),
+// it can be used to collect and report allocation metrics for a run of the
+// benchmark.
+class MemoryManager {
+ public:
+  static const int64_t TombstoneValue;
+
+  struct Result {
+    Result()
+        : num_allocs(0),
+          max_bytes_used(0),
+          total_allocated_bytes(TombstoneValue),
+          net_heap_growth(TombstoneValue) {}
+
+    // The number of allocations made in total between Start and Stop.
+    int64_t num_allocs;
+
+    // The peak memory use between Start and Stop.
+    int64_t max_bytes_used;
+
+    // The total memory allocated, in bytes, between Start and Stop.
+    // Init'ed to TombstoneValue if metric not available.
+    int64_t total_allocated_bytes;
+
+    // The net changes in memory, in bytes, between Start and Stop.
+    // ie., total_allocated_bytes - total_deallocated_bytes.
+    // Init'ed to TombstoneValue if metric not available.
+    int64_t net_heap_growth;
+  };
+
+  virtual ~MemoryManager() {}
+
+  // Implement this to start recording allocation information.
+  virtual void Start() = 0;
+
+  // Implement this to stop recording and fill out the given Result structure.
+  virtual void Stop(Result& result) = 0;
+};
 
 // Register a MemoryManager instance that will be used to collect and report
 // allocation measurements for benchmark runs.
+BENCHMARK_EXPORT
 void RegisterMemoryManager(MemoryManager* memory_manager);
 
+// Add a key-value pair to output as part of the context stanza in the report.
+BENCHMARK_EXPORT
+void AddCustomContext(const std::string& key, const std::string& value);
+
 namespace internal {
 class Benchmark;
 class BenchmarkImp;
 class BenchmarkFamilies;
 
+BENCHMARK_EXPORT std::map<std::string, std::string>*& GetGlobalContext();
+
+BENCHMARK_EXPORT
 void UseCharPointer(char const volatile*);
 
 // Take ownership of the pointer and register the benchmark. Return the
 // registered benchmark.
-Benchmark* RegisterBenchmarkInternal(Benchmark*);
+BENCHMARK_EXPORT Benchmark* RegisterBenchmarkInternal(Benchmark*);
 
 // Ensure that the standard streams are properly initialized in every TU.
-int InitializeStreams();
+BENCHMARK_EXPORT int InitializeStreams();
 BENCHMARK_UNUSED static int stream_init_anchor = InitializeStreams();
 
 }  // namespace internal
@@ -304,12 +442,24 @@
 #define BENCHMARK_HAS_NO_INLINE_ASSEMBLY
 #endif
 
+// Force the compiler to flush pending writes to global memory. Acts as an
+// effective read/write barrier
+#ifdef BENCHMARK_HAS_CXX11
+inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
+  std::atomic_signal_fence(std::memory_order_acq_rel);
+}
+#endif
+
 // The DoNotOptimize(...) function can be used to prevent a value or
 // expression from being optimized away by the compiler. This function is
 // intended to add little to no overhead.
 // See: https://youtu.be/nXaxk27zwlk?t=2441
 #ifndef BENCHMARK_HAS_NO_INLINE_ASSEMBLY
+#if !defined(__GNUC__) || defined(__llvm__) || defined(__INTEL_COMPILER)
 template <class Tp>
+BENCHMARK_DEPRECATED_MSG(
+    "The const-ref version of this method can permit "
+    "undesired compiler optimizations in benchmarks")
 inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
   asm volatile("" : : "r,m"(value) : "memory");
 }
@@ -322,26 +472,92 @@
   asm volatile("" : "+m,r"(value) : : "memory");
 #endif
 }
+#elif defined(BENCHMARK_HAS_CXX11) && (__GNUC__ >= 5)
+// Workaround for a bug with full argument copy overhead with GCC.
+// See: #1340 and https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105519
+template <class Tp>
+BENCHMARK_DEPRECATED_MSG(
+    "The const-ref version of this method can permit "
+    "undesired compiler optimizations in benchmarks")
+inline BENCHMARK_ALWAYS_INLINE
+    typename std::enable_if<std::is_trivially_copyable<Tp>::value &&
+                            (sizeof(Tp) <= sizeof(Tp*))>::type
+    DoNotOptimize(Tp const& value) {
+  asm volatile("" : : "r,m"(value) : "memory");
+}
 
-// Force the compiler to flush pending writes to global memory. Acts as an
-// effective read/write barrier
+template <class Tp>
+BENCHMARK_DEPRECATED_MSG(
+    "The const-ref version of this method can permit "
+    "undesired compiler optimizations in benchmarks")
+inline BENCHMARK_ALWAYS_INLINE
+    typename std::enable_if<!std::is_trivially_copyable<Tp>::value ||
+                            (sizeof(Tp) > sizeof(Tp*))>::type
+    DoNotOptimize(Tp const& value) {
+  asm volatile("" : : "m"(value) : "memory");
+}
+
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE
+    typename std::enable_if<std::is_trivially_copyable<Tp>::value &&
+                            (sizeof(Tp) <= sizeof(Tp*))>::type
+    DoNotOptimize(Tp& value) {
+  asm volatile("" : "+m,r"(value) : : "memory");
+}
+
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE
+    typename std::enable_if<!std::is_trivially_copyable<Tp>::value ||
+                            (sizeof(Tp) > sizeof(Tp*))>::type
+    DoNotOptimize(Tp& value) {
+  asm volatile("" : "+m"(value) : : "memory");
+}
+
+#else
+// Fallback for GCC < 5. Can add some overhead because the compiler is forced
+// to use memory operations instead of operations with registers.
+// TODO: Remove once GCC < 5 is no longer supported.
+template <class Tp>
+BENCHMARK_DEPRECATED_MSG(
+    "The const-ref version of this method can permit "
+    "undesired compiler optimizations in benchmarks")
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
+  asm volatile("" : : "m"(value) : "memory");
+}
+
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) {
+  asm volatile("" : "+m"(value) : : "memory");
+}
+#endif
+
+#ifndef BENCHMARK_HAS_CXX11
 inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
   asm volatile("" : : : "memory");
 }
+#endif
 #elif defined(_MSC_VER)
 template <class Tp>
+BENCHMARK_DEPRECATED_MSG(
+    "The const-ref version of this method can permit "
+    "undesired compiler optimizations in benchmarks")
 inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
   internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
   _ReadWriteBarrier();
 }
 
+#ifndef BENCHMARK_HAS_CXX11
 inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() { _ReadWriteBarrier(); }
+#endif
 #else
 template <class Tp>
+BENCHMARK_DEPRECATED_MSG(
+    "The const-ref version of this method can permit "
+    "undesired compiler optimizations in benchmarks")
 inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
   internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
 }
-// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers
+// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers, before C++11.
 #endif
 
 // This class is used for user-defined counters.
@@ -351,27 +567,27 @@
     kDefaults = 0,
     // Mark the counter as a rate. It will be presented divided
     // by the duration of the benchmark.
-    kIsRate = 1U << 0U,
+    kIsRate = 1 << 0,
     // Mark the counter as a thread-average quantity. It will be
     // presented divided by the number of threads.
-    kAvgThreads = 1U << 1U,
+    kAvgThreads = 1 << 1,
     // Mark the counter as a thread-average rate. See above.
     kAvgThreadsRate = kIsRate | kAvgThreads,
     // Mark the counter as a constant value, valid/same for *every* iteration.
     // When reporting, it will be *multiplied* by the iteration count.
-    kIsIterationInvariant = 1U << 2U,
+    kIsIterationInvariant = 1 << 2,
     // Mark the counter as a constant rate.
     // When reporting, it will be *multiplied* by the iteration count
     // and then divided by the duration of the benchmark.
     kIsIterationInvariantRate = kIsRate | kIsIterationInvariant,
     // Mark the counter as a iteration-average quantity.
     // It will be presented divided by the number of iterations.
-    kAvgIterations = 1U << 3U,
+    kAvgIterations = 1 << 3,
     // Mark the counter as a iteration-average rate. See above.
     kAvgIterationsRate = kIsRate | kAvgIterations,
 
     // In the end, invert the result. This is always done last!
-    kInvert = 1U << 31U
+    kInvert = 1 << 31
   };
 
   enum OneK {
@@ -389,7 +605,7 @@
   Counter(double v = 0., Flags f = kDefaults, OneK k = kIs1000)
       : value(v), flags(f), oneK(k) {}
 
-  BENCHMARK_ALWAYS_INLINE operator double const&() const { return value; }
+  BENCHMARK_ALWAYS_INLINE operator double const &() const { return value; }
   BENCHMARK_ALWAYS_INLINE operator double&() { return value; }
 };
 
@@ -404,17 +620,15 @@
 // This is the container for the user-defined counters.
 typedef std::map<std::string, Counter> UserCounters;
 
-// TimeUnit is passed to a benchmark in order to specify the order of magnitude
-// for the measured time.
-enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond };
-
 // BigO is passed to a benchmark in order to specify the asymptotic
 // computational
 // complexity for the benchmark. In case oAuto is selected, complexity will be
 // calculated automatically to the best fit.
 enum BigO { oNone, o1, oN, oNSquared, oNCubed, oLogN, oNLogN, oAuto, oLambda };
 
-typedef uint64_t IterationCount;
+typedef int64_t IterationCount;
+
+enum StatisticUnit { kTime, kPercentage };
 
 // BigOFunc is passed to a benchmark in order to specify the asymptotic
 // computational complexity for the benchmark.
@@ -428,14 +642,17 @@
 struct Statistics {
   std::string name_;
   StatisticsFunc* compute_;
+  StatisticUnit unit_;
 
-  Statistics(const std::string& name, StatisticsFunc* compute)
-      : name_(name), compute_(compute) {}
+  Statistics(const std::string& name, StatisticsFunc* compute,
+             StatisticUnit unit = kTime)
+      : name_(name), compute_(compute), unit_(unit) {}
 };
 
-struct BenchmarkInstance;
+class BenchmarkInstance;
 class ThreadTimer;
 class ThreadManager;
+class PerfCountersMeasurement;
 
 enum AggregationReportMode
 #if defined(BENCHMARK_HAS_CXX11)
@@ -457,11 +674,21 @@
       ARM_FileReportAggregatesOnly | ARM_DisplayReportAggregatesOnly
 };
 
+enum Skipped
+#if defined(BENCHMARK_HAS_CXX11)
+    : unsigned
+#endif
+{
+  NotSkipped = 0,
+  SkippedWithMessage,
+  SkippedWithError
+};
+
 }  // namespace internal
 
 // State is passed to a running Benchmark and contains state for the
 // benchmark to use.
-class State {
+class BENCHMARK_EXPORT State {
  public:
   struct StateIterator;
   friend struct StateIterator;
@@ -493,8 +720,8 @@
   //   }
   bool KeepRunningBatch(IterationCount n);
 
-  // REQUIRES: timer is running and 'SkipWithError(...)' has not been called
-  //           by the current thread.
+  // REQUIRES: timer is running and 'SkipWithMessage(...)' or
+  //   'SkipWithError(...)' has not been called by the current thread.
   // Stop the benchmark timer.  If not called, the timer will be
   // automatically stopped after the last iteration of the benchmark loop.
   //
@@ -509,8 +736,8 @@
   // within each benchmark iteration, if possible.
   void PauseTiming();
 
-  // REQUIRES: timer is not running and 'SkipWithError(...)' has not been called
-  //           by the current thread.
+  // REQUIRES: timer is not running and 'SkipWithMessage(...)' or
+  //   'SkipWithError(...)' has not been called by the current thread.
   // Start the benchmark timer.  The timer is NOT running on entrance to the
   // benchmark function. It begins running after control flow enters the
   // benchmark loop.
@@ -520,8 +747,30 @@
   // within each benchmark iteration, if possible.
   void ResumeTiming();
 
-  // REQUIRES: 'SkipWithError(...)' has not been called previously by the
-  //            current thread.
+  // REQUIRES: 'SkipWithMessage(...)' or 'SkipWithError(...)' has not been
+  //            called previously by the current thread.
+  // Report the benchmark as resulting in being skipped with the specified
+  // 'msg'.
+  // After this call the user may explicitly 'return' from the benchmark.
+  //
+  // If the ranged-for style of benchmark loop is used, the user must explicitly
+  // break from the loop, otherwise all future iterations will be run.
+  // If the 'KeepRunning()' loop is used the current thread will automatically
+  // exit the loop at the end of the current iteration.
+  //
+  // For threaded benchmarks only the current thread stops executing and future
+  // calls to `KeepRunning()` will block until all threads have completed
+  // the `KeepRunning()` loop. If multiple threads report being skipped only the
+  // first skip message is used.
+  //
+  // NOTE: Calling 'SkipWithMessage(...)' does not cause the benchmark to exit
+  // the current scope immediately. If the function is called from within
+  // the 'KeepRunning()' loop the current iteration will finish. It is the users
+  // responsibility to exit the scope as needed.
+  void SkipWithMessage(const std::string& msg);
+
+  // REQUIRES: 'SkipWithMessage(...)' or 'SkipWithError(...)' has not been
+  //            called previously by the current thread.
   // Report the benchmark as resulting in an error with the specified 'msg'.
   // After this call the user may explicitly 'return' from the benchmark.
   //
@@ -539,7 +788,13 @@
   // the current scope immediately. If the function is called from within
   // the 'KeepRunning()' loop the current iteration will finish. It is the users
   // responsibility to exit the scope as needed.
-  void SkipWithError(const char* msg);
+  void SkipWithError(const std::string& msg);
+
+  // Returns true if 'SkipWithMessage(...)' or 'SkipWithError(...)' was called.
+  bool skipped() const { return internal::NotSkipped != skipped_; }
+
+  // Returns true if an error has been reported with 'SkipWithError(...)'.
+  bool error_occurred() const { return internal::SkippedWithError == skipped_; }
 
   // REQUIRES: called exactly once per iteration of the benchmarking loop.
   // Set the manually measured time for this benchmark iteration, which
@@ -610,11 +865,7 @@
   //  BM_Compress   50         50   14115038  compress:27.3%
   //
   // REQUIRES: a benchmark has exited its benchmarking loop.
-  void SetLabel(const char* label);
-
-  void BENCHMARK_ALWAYS_INLINE SetLabel(const std::string& str) {
-    this->SetLabel(str.c_str());
-  }
+  void SetLabel(const std::string& label);
 
   // Range arguments for this run. CHECKs if the argument has been set.
   BENCHMARK_ALWAYS_INLINE
@@ -629,6 +880,14 @@
   BENCHMARK_DEPRECATED_MSG("use 'range(1)' instead")
   int64_t range_y() const { return range(1); }
 
+  // Number of threads concurrently executing the benchmark.
+  BENCHMARK_ALWAYS_INLINE
+  int threads() const { return threads_; }
+
+  // Index of the executing thread. Values from [0, threads).
+  BENCHMARK_ALWAYS_INLINE
+  int thread_index() const { return thread_index_; }
+
   BENCHMARK_ALWAYS_INLINE
   IterationCount iterations() const {
     if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
@@ -637,8 +896,11 @@
     return max_iterations - total_iterations_ + batch_leftover_;
   }
 
- private
-     :  // items we expect on the first cache line (ie 64 bytes of the struct)
+  BENCHMARK_ALWAYS_INLINE
+  std::string name() const { return name_; }
+
+ private:
+  // items we expect on the first cache line (ie 64 bytes of the struct)
   // When total_iterations_ is 0, KeepRunning() and friends will return false.
   // May be larger than max_iterations.
   IterationCount total_iterations_;
@@ -654,9 +916,9 @@
  private:
   bool started_;
   bool finished_;
-  bool error_occurred_;
+  internal::Skipped skipped_;
 
- private:  // items we don't need on the first cache line
+  // items we don't need on the first cache line
   std::vector<int64_t> range_;
 
   int64_t complexity_n_;
@@ -664,25 +926,28 @@
  public:
   // Container for user-defined counters.
   UserCounters counters;
-  // Index of the executing thread. Values from [0, threads).
-  const int thread_index;
-  // Number of threads concurrently executing the benchmark.
-  const int threads;
 
  private:
-  State(IterationCount max_iters, const std::vector<int64_t>& ranges,
-        int thread_i, int n_threads, internal::ThreadTimer* timer,
-        internal::ThreadManager* manager);
+  State(std::string name, IterationCount max_iters,
+        const std::vector<int64_t>& ranges, int thread_i, int n_threads,
+        internal::ThreadTimer* timer, internal::ThreadManager* manager,
+        internal::PerfCountersMeasurement* perf_counters_measurement);
 
   void StartKeepRunning();
   // Implementation of KeepRunning() and KeepRunningBatch().
   // is_batch must be true unless n is 1.
   bool KeepRunningInternal(IterationCount n, bool is_batch);
   void FinishKeepRunning();
-  internal::ThreadTimer* timer_;
-  internal::ThreadManager* manager_;
 
-  friend struct internal::BenchmarkInstance;
+  const std::string name_;
+  const int thread_index_;
+  const int threads_;
+
+  internal::ThreadTimer* const timer_;
+  internal::ThreadManager* const manager_;
+  internal::PerfCountersMeasurement* const perf_counters_measurement_;
+
+  friend class internal::BenchmarkInstance;
 };
 
 inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() {
@@ -706,7 +971,7 @@
   }
   if (!started_) {
     StartKeepRunning();
-    if (!error_occurred_ && total_iterations_ >= n) {
+    if (!skipped() && total_iterations_ >= n) {
       total_iterations_ -= n;
       return true;
     }
@@ -736,7 +1001,7 @@
 
   BENCHMARK_ALWAYS_INLINE
   explicit StateIterator(State* st)
-      : cached_(st->error_occurred_ ? 0 : st->max_iterations), parent_(st) {}
+      : cached_(st->skipped() ? 0 : st->max_iterations), parent_(st) {}
 
  public:
   BENCHMARK_ALWAYS_INLINE
@@ -779,13 +1044,16 @@
 // be called on this object to change the properties of the benchmark.
 // Each method returns "this" so that multiple method calls can
 // chained into one expression.
-class Benchmark {
+class BENCHMARK_EXPORT Benchmark {
  public:
   virtual ~Benchmark();
 
   // Note: the following methods all return "this" so that multiple
   // method calls can be chained together in one expression.
 
+  // Specify the name of the benchmark
+  Benchmark* Name(const std::string& name);
+
   // Run this benchmark once with "x" as the extra argument passed
   // to the function.
   // REQUIRES: The function passed to the constructor must accept an arg1.
@@ -824,6 +1092,11 @@
   // REQUIRES: The function passed to the constructor must accept arg1, arg2 ...
   Benchmark* Ranges(const std::vector<std::pair<int64_t, int64_t> >& ranges);
 
+  // Run this benchmark once for each combination of values in the (cartesian)
+  // product of the supplied argument lists.
+  // REQUIRES: The function passed to the constructor must accept arg1, arg2 ...
+  Benchmark* ArgsProduct(const std::vector<std::vector<int64_t> >& arglists);
+
   // Equivalent to ArgNames({name})
   Benchmark* ArgName(const std::string& name);
 
@@ -841,6 +1114,23 @@
     return Ranges(ranges);
   }
 
+  // Have "setup" and/or "teardown" invoked once for every benchmark run.
+  // If the benchmark is multi-threaded (will run in k threads concurrently),
+  // the setup callback will be invoked exactly once (not k times) before
+  // each run with k threads. Time allowing (e.g. for a short benchmark), there
+  // may be multiple such runs per benchmark, each run with its own
+  // "setup"/"teardown".
+  //
+  // If the benchmark uses different size groups of threads (e.g. via
+  // ThreadRange), the above will be true for each size group.
+  //
+  // The callback will be passed a State object, which includes the number
+  // of threads, thread-index, benchmark arguments, etc.
+  //
+  // The callback must not be NULL or self-deleting.
+  Benchmark* Setup(void (*setup)(const benchmark::State&));
+  Benchmark* Teardown(void (*teardown)(const benchmark::State&));
+
   // Pass this benchmark object to *func, which can customize
   // the benchmark by calling various methods like Arg, Args,
   // Threads, etc.
@@ -855,12 +1145,19 @@
   // REQUIRES: `t > 0` and `Iterations` has not been called on this benchmark.
   Benchmark* MinTime(double t);
 
+  // Set the minimum amount of time to run the benchmark before taking runtimes
+  // of this benchmark into account. This
+  // option overrides the `benchmark_min_warmup_time` flag.
+  // REQUIRES: `t >= 0` and `Iterations` has not been called on this benchmark.
+  Benchmark* MinWarmUpTime(double t);
+
   // Specify the amount of iterations that should be run by this benchmark.
+  // This option overrides the `benchmark_min_time` flag.
   // REQUIRES: 'n > 0' and `MinTime` has not been called on this benchmark.
   //
   // NOTE: This function should only be used when *exact* iteration control is
   //   needed and never to control or limit how long a benchmark runs, where
-  // `--benchmark_min_time=N` or `MinTime(...)` should be used instead.
+  // `--benchmark_min_time=<N>s` or `MinTime(...)` should be used instead.
   Benchmark* Iterations(IterationCount n);
 
   // Specify the amount of times to repeat this benchmark. This option overrides
@@ -880,7 +1177,7 @@
   // By default, the CPU time is measured only for the main thread, which may
   // be unrepresentative if the benchmark uses threads internally. If called,
   // the total CPU time spent by all the threads will be measured instead.
-  // By default, the only the main thread CPU time will be measured.
+  // By default, only the main thread CPU time will be measured.
   Benchmark* MeasureProcessCPUTime();
 
   // If a particular benchmark should use the Wall clock instead of the CPU time
@@ -909,7 +1206,9 @@
   Benchmark* Complexity(BigOFunc* complexity);
 
   // Add this statistics to be computed over all the values of benchmark run
-  Benchmark* ComputeStatistics(std::string name, StatisticsFunc* statistics);
+  Benchmark* ComputeStatistics(const std::string& name,
+                               StatisticsFunc* statistics,
+                               StatisticUnit unit = kTime);
 
   // Support for running multiple copies of the same benchmark concurrently
   // in multiple threads.  This may be useful when measuring the scaling
@@ -943,23 +1242,32 @@
 
   virtual void Run(State& state) = 0;
 
- protected:
-  explicit Benchmark(const char* name);
-  Benchmark(Benchmark const&);
-  void SetName(const char* name);
+  TimeUnit GetTimeUnit() const;
 
+ protected:
+  explicit Benchmark(const std::string& name);
+  void SetName(const std::string& name);
+
+ public:
+  const char* GetName() const;
   int ArgsCnt() const;
+  const char* GetArgName(int arg) const;
 
  private:
   friend class BenchmarkFamilies;
+  friend class BenchmarkInstance;
 
   std::string name_;
   AggregationReportMode aggregation_report_mode_;
   std::vector<std::string> arg_names_;       // Args for all benchmark runs
   std::vector<std::vector<int64_t> > args_;  // Args for all benchmark runs
+
   TimeUnit time_unit_;
+  bool use_default_time_unit_;
+
   int range_multiplier_;
   double min_time_;
+  double min_warmup_time_;
   IterationCount iterations_;
   int repetitions_;
   bool measure_process_cpu_time_;
@@ -970,7 +1278,21 @@
   std::vector<Statistics> statistics_;
   std::vector<int> thread_counts_;
 
-  Benchmark& operator=(Benchmark const&);
+  typedef void (*callback_function)(const benchmark::State&);
+  callback_function setup_;
+  callback_function teardown_;
+
+  Benchmark(Benchmark const&)
+#if defined(BENCHMARK_HAS_CXX11)
+      = delete
+#endif
+      ;
+
+  Benchmark& operator=(Benchmark const&)
+#if defined(BENCHMARK_HAS_CXX11)
+      = delete
+#endif
+      ;
 };
 
 }  // namespace internal
@@ -979,27 +1301,27 @@
 // the specified functor 'fn'.
 //
 // RETURNS: A pointer to the registered benchmark.
-internal::Benchmark* RegisterBenchmark(const char* name,
+internal::Benchmark* RegisterBenchmark(const std::string& name,
                                        internal::Function* fn);
 
 #if defined(BENCHMARK_HAS_CXX11)
 template <class Lambda>
-internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn);
+internal::Benchmark* RegisterBenchmark(const std::string& name, Lambda&& fn);
 #endif
 
 // Remove all registered benchmarks. All pointers to previously registered
 // benchmarks are invalidated.
-void ClearRegisteredBenchmarks();
+BENCHMARK_EXPORT void ClearRegisteredBenchmarks();
 
 namespace internal {
 // The class used to hold all Benchmarks created from static function.
 // (ie those created using the BENCHMARK(...) macros.
-class FunctionBenchmark : public Benchmark {
+class BENCHMARK_EXPORT FunctionBenchmark : public Benchmark {
  public:
-  FunctionBenchmark(const char* name, Function* func)
+  FunctionBenchmark(const std::string& name, Function* func)
       : Benchmark(name), func_(func) {}
 
-  virtual void Run(State& st);
+  void Run(State& st) BENCHMARK_OVERRIDE;
 
  private:
   Function* func_;
@@ -1009,26 +1331,24 @@
 template <class Lambda>
 class LambdaBenchmark : public Benchmark {
  public:
-  virtual void Run(State& st) { lambda_(st); }
+  void Run(State& st) BENCHMARK_OVERRIDE { lambda_(st); }
 
  private:
   template <class OLambda>
-  LambdaBenchmark(const char* name, OLambda&& lam)
+  LambdaBenchmark(const std::string& name, OLambda&& lam)
       : Benchmark(name), lambda_(std::forward<OLambda>(lam)) {}
 
   LambdaBenchmark(LambdaBenchmark const&) = delete;
 
- private:
-  template <class Lam>
-  friend Benchmark* ::benchmark::RegisterBenchmark(const char*, Lam&&);
+  template <class Lam>  // NOLINTNEXTLINE(readability-redundant-declaration)
+  friend Benchmark* ::benchmark::RegisterBenchmark(const std::string&, Lam&&);
 
   Lambda lambda_;
 };
 #endif
-
 }  // namespace internal
 
-inline internal::Benchmark* RegisterBenchmark(const char* name,
+inline internal::Benchmark* RegisterBenchmark(const std::string& name,
                                               internal::Function* fn) {
   return internal::RegisterBenchmarkInternal(
       ::new internal::FunctionBenchmark(name, fn));
@@ -1036,7 +1356,7 @@
 
 #ifdef BENCHMARK_HAS_CXX11
 template <class Lambda>
-internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn) {
+internal::Benchmark* RegisterBenchmark(const std::string& name, Lambda&& fn) {
   using BenchType =
       internal::LambdaBenchmark<typename std::decay<Lambda>::type>;
   return internal::RegisterBenchmarkInternal(
@@ -1047,7 +1367,7 @@
 #if defined(BENCHMARK_HAS_CXX11) && \
     (!defined(BENCHMARK_GCC_VERSION) || BENCHMARK_GCC_VERSION >= 409)
 template <class Lambda, class... Args>
-internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn,
+internal::Benchmark* RegisterBenchmark(const std::string& name, Lambda&& fn,
                                        Args&&... args) {
   return benchmark::RegisterBenchmark(
       name, [=](benchmark::State& st) { fn(st, args...); });
@@ -1061,7 +1381,7 @@
  public:
   Fixture() : internal::Benchmark("") {}
 
-  virtual void Run(State& st) {
+  void Run(State& st) BENCHMARK_OVERRIDE {
     this->SetUp(st);
     this->BenchmarkCase(st);
     this->TearDown(st);
@@ -1077,7 +1397,6 @@
  protected:
   virtual void BenchmarkCase(State&) = 0;
 };
-
 }  // namespace benchmark
 
 // ------------------------------------------------------
@@ -1093,19 +1412,37 @@
 #endif
 
 // Helpers for generating unique variable names
+#ifdef BENCHMARK_HAS_CXX11
+#define BENCHMARK_PRIVATE_NAME(...)                                      \
+  BENCHMARK_PRIVATE_CONCAT(benchmark_uniq_, BENCHMARK_PRIVATE_UNIQUE_ID, \
+                           __VA_ARGS__)
+#else
 #define BENCHMARK_PRIVATE_NAME(n) \
-  BENCHMARK_PRIVATE_CONCAT(_benchmark_, BENCHMARK_PRIVATE_UNIQUE_ID, n)
+  BENCHMARK_PRIVATE_CONCAT(benchmark_uniq_, BENCHMARK_PRIVATE_UNIQUE_ID, n)
+#endif  // BENCHMARK_HAS_CXX11
+
 #define BENCHMARK_PRIVATE_CONCAT(a, b, c) BENCHMARK_PRIVATE_CONCAT2(a, b, c)
 #define BENCHMARK_PRIVATE_CONCAT2(a, b, c) a##b##c
+// Helper for concatenation with macro name expansion
+#define BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method) \
+  BaseClass##_##Method##_Benchmark
 
 #define BENCHMARK_PRIVATE_DECLARE(n)                                 \
   static ::benchmark::internal::Benchmark* BENCHMARK_PRIVATE_NAME(n) \
       BENCHMARK_UNUSED
 
+#ifdef BENCHMARK_HAS_CXX11
+#define BENCHMARK(...)                                               \
+  BENCHMARK_PRIVATE_DECLARE(_benchmark_) =                           \
+      (::benchmark::internal::RegisterBenchmarkInternal(             \
+          new ::benchmark::internal::FunctionBenchmark(#__VA_ARGS__, \
+                                                       __VA_ARGS__)))
+#else
 #define BENCHMARK(n)                                     \
   BENCHMARK_PRIVATE_DECLARE(n) =                         \
       (::benchmark::internal::RegisterBenchmarkInternal( \
           new ::benchmark::internal::FunctionBenchmark(#n, n)))
+#endif  // BENCHMARK_HAS_CXX11
 
 // Old-style macros
 #define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a))
@@ -1166,49 +1503,49 @@
 #define BENCHMARK_TEMPLATE(n, a) BENCHMARK_TEMPLATE1(n, a)
 #endif
 
-#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method)        \
-  class BaseClass##_##Method##_Benchmark : public BaseClass { \
-   public:                                                    \
-    BaseClass##_##Method##_Benchmark() : BaseClass() {        \
-      this->SetName(#BaseClass "/" #Method);                  \
-    }                                                         \
-                                                              \
-   protected:                                                 \
-    virtual void BenchmarkCase(::benchmark::State&);          \
+#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method)          \
+  class BaseClass##_##Method##_Benchmark : public BaseClass {   \
+   public:                                                      \
+    BaseClass##_##Method##_Benchmark() {                        \
+      this->SetName(#BaseClass "/" #Method);                    \
+    }                                                           \
+                                                                \
+   protected:                                                   \
+    void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \
   };
 
 #define BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
   class BaseClass##_##Method##_Benchmark : public BaseClass<a> {    \
    public:                                                          \
-    BaseClass##_##Method##_Benchmark() : BaseClass<a>() {           \
+    BaseClass##_##Method##_Benchmark() {                            \
       this->SetName(#BaseClass "<" #a ">/" #Method);                \
     }                                                               \
                                                                     \
    protected:                                                       \
-    virtual void BenchmarkCase(::benchmark::State&);                \
+    void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE;     \
   };
 
 #define BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
   class BaseClass##_##Method##_Benchmark : public BaseClass<a, b> {    \
    public:                                                             \
-    BaseClass##_##Method##_Benchmark() : BaseClass<a, b>() {           \
+    BaseClass##_##Method##_Benchmark() {                               \
       this->SetName(#BaseClass "<" #a "," #b ">/" #Method);            \
     }                                                                  \
                                                                        \
    protected:                                                          \
-    virtual void BenchmarkCase(::benchmark::State&);                   \
+    void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE;        \
   };
 
 #ifdef BENCHMARK_HAS_CXX11
 #define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, ...)       \
   class BaseClass##_##Method##_Benchmark : public BaseClass<__VA_ARGS__> { \
    public:                                                                 \
-    BaseClass##_##Method##_Benchmark() : BaseClass<__VA_ARGS__>() {        \
+    BaseClass##_##Method##_Benchmark() {                                   \
       this->SetName(#BaseClass "<" #__VA_ARGS__ ">/" #Method);             \
     }                                                                      \
                                                                            \
    protected:                                                              \
-    virtual void BenchmarkCase(::benchmark::State&);                       \
+    void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE;            \
   };
 #else
 #define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(n, a) \
@@ -1217,27 +1554,27 @@
 
 #define BENCHMARK_DEFINE_F(BaseClass, Method)    \
   BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
-  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
 
 #define BENCHMARK_TEMPLATE1_DEFINE_F(BaseClass, Method, a)    \
   BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
-  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
 
 #define BENCHMARK_TEMPLATE2_DEFINE_F(BaseClass, Method, a, b)    \
   BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
-  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
 
 #ifdef BENCHMARK_HAS_CXX11
 #define BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, ...)            \
   BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, __VA_ARGS__) \
-  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
 #else
 #define BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, a) \
   BENCHMARK_TEMPLATE1_DEFINE_F(BaseClass, Method, a)
 #endif
 
 #define BENCHMARK_REGISTER_F(BaseClass, Method) \
-  BENCHMARK_PRIVATE_REGISTER_F(BaseClass##_##Method##_Benchmark)
+  BENCHMARK_PRIVATE_REGISTER_F(BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method))
 
 #define BENCHMARK_PRIVATE_REGISTER_F(TestName) \
   BENCHMARK_PRIVATE_DECLARE(TestName) =        \
@@ -1247,34 +1584,43 @@
 #define BENCHMARK_F(BaseClass, Method)           \
   BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
   BENCHMARK_REGISTER_F(BaseClass, Method);       \
-  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
 
 #define BENCHMARK_TEMPLATE1_F(BaseClass, Method, a)           \
   BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
   BENCHMARK_REGISTER_F(BaseClass, Method);                    \
-  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
 
 #define BENCHMARK_TEMPLATE2_F(BaseClass, Method, a, b)           \
   BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
   BENCHMARK_REGISTER_F(BaseClass, Method);                       \
-  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
 
 #ifdef BENCHMARK_HAS_CXX11
 #define BENCHMARK_TEMPLATE_F(BaseClass, Method, ...)                   \
   BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, __VA_ARGS__) \
   BENCHMARK_REGISTER_F(BaseClass, Method);                             \
-  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
 #else
 #define BENCHMARK_TEMPLATE_F(BaseClass, Method, a) \
   BENCHMARK_TEMPLATE1_F(BaseClass, Method, a)
 #endif
 
 // Helper macro to create a main routine in a test that runs the benchmarks
+// Note the workaround for Hexagon simulator passing argc != 0, argv = NULL.
 #define BENCHMARK_MAIN()                                                \
   int main(int argc, char** argv) {                                     \
+    char arg0_default[] = "benchmark";                                  \
+    char* args_default = arg0_default;                                  \
+    if (!argv) {                                                        \
+      argc = 1;                                                         \
+      argv = &args_default;                                             \
+    }                                                                   \
     ::benchmark::Initialize(&argc, argv);                               \
     if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; \
     ::benchmark::RunSpecifiedBenchmarks();                              \
+    ::benchmark::Shutdown();                                            \
+    return 0;                                                           \
   }                                                                     \
   int main(int, char**)
 
@@ -1283,7 +1629,7 @@
 
 namespace benchmark {
 
-struct CPUInfo {
+struct BENCHMARK_EXPORT CPUInfo {
   struct CacheInfo {
     std::string type;
     int level;
@@ -1291,10 +1637,12 @@
     int num_sharing;
   };
 
+  enum Scaling { UNKNOWN, ENABLED, DISABLED };
+
   int num_cpus;
+  Scaling scaling;
   double cycles_per_second;
   std::vector<CacheInfo> caches;
-  bool scaling_enabled;
   std::vector<double> load_avg;
 
   static const CPUInfo& Get();
@@ -1305,7 +1653,7 @@
 };
 
 // Adding Struct for System Information
-struct SystemInfo {
+struct BENCHMARK_EXPORT SystemInfo {
   std::string name;
   static const SystemInfo& Get();
 
@@ -1317,10 +1665,11 @@
 // BenchmarkName contains the components of the Benchmark's name
 // which allows individual fields to be modified or cleared before
 // building the final name using 'str()'.
-struct BenchmarkName {
+struct BENCHMARK_EXPORT BenchmarkName {
   std::string function_name;
   std::string args;
   std::string min_time;
+  std::string min_warmup_time;
   std::string iterations;
   std::string repetitions;
   std::string time_type;
@@ -1336,7 +1685,7 @@
 // can control the destination of the reports by calling
 // RunSpecifiedBenchmarks and passing it a custom reporter object.
 // The reporter object must implement the following interface.
-class BenchmarkReporter {
+class BENCHMARK_EXPORT BenchmarkReporter {
  public:
   struct Context {
     CPUInfo const& cpu_info;
@@ -1347,16 +1696,17 @@
     Context();
   };
 
-  struct Run {
+  struct BENCHMARK_EXPORT Run {
     static const int64_t no_repetition_index = -1;
     enum RunType { RT_Iteration, RT_Aggregate };
 
     Run()
         : run_type(RT_Iteration),
-          error_occurred(false),
+          aggregate_unit(kTime),
+          skipped(internal::NotSkipped),
           iterations(1),
           threads(1),
-          time_unit(kNanosecond),
+          time_unit(GetDefaultTimeUnit()),
           real_accumulated_time(0),
           cpu_accumulated_time(0),
           max_heapbytes_used(0),
@@ -1365,18 +1715,19 @@
           complexity_n(0),
           report_big_o(false),
           report_rms(false),
-          counters(),
-          has_memory_result(false),
-          allocs_per_iter(0.0),
-          max_bytes_used(0) {}
+          memory_result(NULL),
+          allocs_per_iter(0.0) {}
 
     std::string benchmark_name() const;
     BenchmarkName run_name;
+    int64_t family_index;
+    int64_t per_family_instance_index;
     RunType run_type;
     std::string aggregate_name;
+    StatisticUnit aggregate_unit;
     std::string report_label;  // Empty if not set by benchmark.
-    bool error_occurred;
-    std::string error_message;
+    internal::Skipped skipped;
+    std::string skip_message;
 
     IterationCount iterations;
     int64_t threads;
@@ -1416,9 +1767,21 @@
     UserCounters counters;
 
     // Memory metrics.
-    bool has_memory_result;
+    const MemoryManager::Result* memory_result;
     double allocs_per_iter;
-    int64_t max_bytes_used;
+  };
+
+  struct PerFamilyRunReports {
+    PerFamilyRunReports() : num_runs_total(0), num_runs_done(0) {}
+
+    // How many runs will all instances of this benchmark perform?
+    int num_runs_total;
+
+    // How many runs have happened already?
+    int num_runs_done;
+
+    // The reports about (non-erroneous!) runs of this family.
+    std::vector<BenchmarkReporter::Run> Runs;
   };
 
   // Construct a BenchmarkReporter with the output stream set to 'std::cout'
@@ -1434,6 +1797,12 @@
   virtual bool ReportContext(const Context& context) = 0;
 
   // Called once for each group of benchmark runs, gives information about
+  // the configurations of the runs.
+  virtual void ReportRunsConfig(double /*min_time*/,
+                                bool /*has_explicit_iters*/,
+                                IterationCount /*iters*/) {}
+
+  // Called once for each group of benchmark runs, gives information about
   // cpu-time and heap memory usage during the benchmark run. If the group
   // of runs contained more than two entries then 'report' contains additional
   // elements representing the mean and standard deviation of those runs.
@@ -1478,7 +1847,7 @@
 
 // Simple reporter that outputs benchmark data to the console. This is the
 // default reporter used by RunSpecifiedBenchmarks().
-class ConsoleReporter : public BenchmarkReporter {
+class BENCHMARK_EXPORT ConsoleReporter : public BenchmarkReporter {
  public:
   enum OutputOptions {
     OO_None = 0,
@@ -1488,13 +1857,10 @@
     OO_Defaults = OO_ColorTabular
   };
   explicit ConsoleReporter(OutputOptions opts_ = OO_Defaults)
-      : output_options_(opts_),
-        name_field_width_(0),
-        prev_counters_(),
-        printed_header_(false) {}
+      : output_options_(opts_), name_field_width_(0), printed_header_(false) {}
 
-  virtual bool ReportContext(const Context& context);
-  virtual void ReportRuns(const std::vector<Run>& reports);
+  bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
+  void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
 
  protected:
   virtual void PrintRunData(const Run& report);
@@ -1506,12 +1872,12 @@
   bool printed_header_;
 };
 
-class JSONReporter : public BenchmarkReporter {
+class BENCHMARK_EXPORT JSONReporter : public BenchmarkReporter {
  public:
   JSONReporter() : first_report_(true) {}
-  virtual bool ReportContext(const Context& context);
-  virtual void ReportRuns(const std::vector<Run>& reports);
-  virtual void Finalize();
+  bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
+  void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
+  void Finalize() BENCHMARK_OVERRIDE;
 
  private:
   void PrintRunData(const Run& report);
@@ -1519,13 +1885,13 @@
   bool first_report_;
 };
 
-class BENCHMARK_DEPRECATED_MSG(
+class BENCHMARK_EXPORT BENCHMARK_DEPRECATED_MSG(
     "The CSV Reporter will be removed in a future release") CSVReporter
     : public BenchmarkReporter {
  public:
   CSVReporter() : printed_header_(false) {}
-  virtual bool ReportContext(const Context& context);
-  virtual void ReportRuns(const std::vector<Run>& reports);
+  bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
+  void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
 
  private:
   void PrintRunData(const Run& report);
@@ -1534,31 +1900,10 @@
   std::set<std::string> user_counter_names_;
 };
 
-// If a MemoryManager is registered, it can be used to collect and report
-// allocation metrics for a run of the benchmark.
-class MemoryManager {
- public:
-  struct Result {
-    Result() : num_allocs(0), max_bytes_used(0) {}
-
-    // The number of allocations made in total between Start and Stop.
-    int64_t num_allocs;
-
-    // The peak memory use between Start and Stop.
-    int64_t max_bytes_used;
-  };
-
-  virtual ~MemoryManager() {}
-
-  // Implement this to start recording allocation information.
-  virtual void Start() = 0;
-
-  // Implement this to stop recording and fill out the given Result structure.
-  virtual void Stop(Result* result) = 0;
-};
-
 inline const char* GetTimeUnitString(TimeUnit unit) {
   switch (unit) {
+    case kSecond:
+      return "s";
     case kMillisecond:
       return "ms";
     case kMicrosecond:
@@ -1571,6 +1916,8 @@
 
 inline double GetTimeUnitMultiplier(TimeUnit unit) {
   switch (unit) {
+    case kSecond:
+      return 1;
     case kMillisecond:
       return 1e3;
     case kMicrosecond:
@@ -1581,6 +1928,26 @@
   BENCHMARK_UNREACHABLE();
 }
 
+// Creates a list of integer values for the given range and multiplier.
+// This can be used together with ArgsProduct() to allow multiple ranges
+// with different multipliers.
+// Example:
+// ArgsProduct({
+//   CreateRange(0, 1024, /*multi=*/32),
+//   CreateRange(0, 100, /*multi=*/4),
+//   CreateDenseRange(0, 4, /*step=*/1),
+// });
+BENCHMARK_EXPORT
+std::vector<int64_t> CreateRange(int64_t lo, int64_t hi, int multi);
+
+// Creates a list of integer values for the given range and step.
+BENCHMARK_EXPORT
+std::vector<int64_t> CreateDenseRange(int64_t start, int64_t limit, int step);
+
 }  // namespace benchmark
 
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
 #endif  // BENCHMARK_BENCHMARK_H_
diff --git a/third_party/google_benchmark/src/include/benchmark/export.h b/third_party/google_benchmark/src/include/benchmark/export.h
new file mode 100644
index 0000000..f96f859
--- /dev/null
+++ b/third_party/google_benchmark/src/include/benchmark/export.h
@@ -0,0 +1,47 @@
+#ifndef BENCHMARK_EXPORT_H
+#define BENCHMARK_EXPORT_H
+
+#if defined(_WIN32)
+#define EXPORT_ATTR __declspec(dllexport)
+#define IMPORT_ATTR __declspec(dllimport)
+#define NO_EXPORT_ATTR
+#define DEPRECATE_ATTR __declspec(deprecated)  // same name as the non-Windows branch
+#else  // _WIN32
+#define EXPORT_ATTR __attribute__((visibility("default")))
+#define IMPORT_ATTR __attribute__((visibility("default")))
+#define NO_EXPORT_ATTR __attribute__((visibility("hidden")))
+#define DEPRECATE_ATTR __attribute__((__deprecated__))
+#endif  // _WIN32
+
+#ifdef BENCHMARK_STATIC_DEFINE
+#define BENCHMARK_EXPORT
+#define BENCHMARK_NO_EXPORT
+#else  // BENCHMARK_STATIC_DEFINE
+#ifndef BENCHMARK_EXPORT
+#ifdef benchmark_EXPORTS
+/* We are building this library */
+#define BENCHMARK_EXPORT EXPORT_ATTR
+#else  // benchmark_EXPORTS
+/* We are using this library */
+#define BENCHMARK_EXPORT IMPORT_ATTR
+#endif  // benchmark_EXPORTS
+#endif  // !BENCHMARK_EXPORT
+
+#ifndef BENCHMARK_NO_EXPORT
+#define BENCHMARK_NO_EXPORT NO_EXPORT_ATTR
+#endif  // !BENCHMARK_NO_EXPORT
+#endif  // BENCHMARK_STATIC_DEFINE
+
+#ifndef BENCHMARK_DEPRECATED
+#define BENCHMARK_DEPRECATED DEPRECATE_ATTR
+#endif  // BENCHMARK_DEPRECATED
+
+#ifndef BENCHMARK_DEPRECATED_EXPORT
+#define BENCHMARK_DEPRECATED_EXPORT BENCHMARK_EXPORT BENCHMARK_DEPRECATED
+#endif  // BENCHMARK_DEPRECATED_EXPORT
+
+#ifndef BENCHMARK_DEPRECATED_NO_EXPORT
+#define BENCHMARK_DEPRECATED_NO_EXPORT BENCHMARK_NO_EXPORT BENCHMARK_DEPRECATED
+#endif  // BENCHMARK_DEPRECATED_NO_EXPORT
+
+#endif /* BENCHMARK_EXPORT_H */
diff --git a/third_party/google_benchmark/src/requirements.txt b/third_party/google_benchmark/src/requirements.txt
new file mode 100644
index 0000000..1c8a4bd
--- /dev/null
+++ b/third_party/google_benchmark/src/requirements.txt
@@ -0,0 +1,2 @@
+numpy == 1.22
+scipy == 1.5.4
diff --git a/third_party/google_benchmark/src/setup.py b/third_party/google_benchmark/src/setup.py
new file mode 100644
index 0000000..2388f59
--- /dev/null
+++ b/third_party/google_benchmark/src/setup.py
@@ -0,0 +1,166 @@
+import contextlib
+import os
+import platform
+import shutil
+import sysconfig
+from pathlib import Path
+from typing import List
+
+import setuptools
+from setuptools.command import build_ext
+
+
+PYTHON_INCLUDE_PATH_PLACEHOLDER = "<PYTHON_INCLUDE_PATH>"
+
+IS_WINDOWS = platform.system() == "Windows"
+IS_MAC = platform.system() == "Darwin"
+
+
+def _get_long_description(fp: str) -> str:
+    with open(fp, "r", encoding="utf-8") as f:
+        return f.read()
+
+
+def _get_version(fp: str) -> str:
+    """Parse a version string from a file."""
+    with open(fp, "r") as f:
+        for line in f:
+            if "__version__" in line:
+                delim = '"'
+                return line.split(delim)[1]
+    raise RuntimeError(f"could not find a version string in file {fp!r}.")
+
+
+def _parse_requirements(fp: str) -> List[str]:
+    with open(fp) as requirements:
+        return [
+            line.rstrip()
+            for line in requirements
+            if not (line.isspace() or line.startswith("#"))
+        ]
+
+
+@contextlib.contextmanager
+def temp_fill_include_path(fp: str):
+    """Temporarily set the Python include path in a file."""
+    with open(fp, "r+") as f:
+        try:
+            content = f.read()
+            replaced = content.replace(
+                PYTHON_INCLUDE_PATH_PLACEHOLDER,
+                Path(sysconfig.get_paths()['include']).as_posix(),
+            )
+            f.seek(0)
+            f.write(replaced)
+            f.truncate()
+            yield
+        finally:
+            # revert to the original content after exit
+            f.seek(0)
+            f.write(content)
+            f.truncate()
+
+
+class BazelExtension(setuptools.Extension):
+    """A C/C++ extension that is defined as a Bazel BUILD target."""
+
+    def __init__(self, name: str, bazel_target: str):
+        super().__init__(name=name, sources=[])
+
+        self.bazel_target = bazel_target
+        stripped_target = bazel_target.split("//")[-1]
+        self.relpath, self.target_name = stripped_target.split(":")
+
+
+class BuildBazelExtension(build_ext.build_ext):
+    """A command that runs Bazel to build a C/C++ extension."""
+
+    def run(self):
+        for ext in self.extensions:
+            self.bazel_build(ext)
+        build_ext.build_ext.run(self)
+
+    def bazel_build(self, ext: BazelExtension):
+        """Runs the bazel build to create the package."""
+        with temp_fill_include_path("WORKSPACE"):
+            temp_path = Path(self.build_temp)
+
+            bazel_argv = [
+                "bazel",
+                "build",
+                ext.bazel_target,
+                f"--symlink_prefix={temp_path / 'bazel-'}",
+                f"--compilation_mode={'dbg' if self.debug else 'opt'}",
+                # C++17 is required by nanobind
+                f"--cxxopt={'/std:c++17' if IS_WINDOWS else '-std=c++17'}",
+            ]
+
+            if IS_WINDOWS:
+                # Link with python*.lib.
+                for library_dir in self.library_dirs:
+                    bazel_argv.append("--linkopt=/LIBPATH:" + library_dir)
+            elif IS_MAC:
+                if platform.machine() == "x86_64":
+                    # C++17 needs macOS 10.14 at minimum
+                    bazel_argv.append("--macos_minimum_os=10.14")
+
+                    # cross-compilation for Mac ARM64 on GitHub Mac x86 runners.
+                    # ARCHFLAGS is set by cibuildwheel before macOS wheel builds.
+                    archflags = os.getenv("ARCHFLAGS", "")
+                    if "arm64" in archflags:
+                        bazel_argv.append("--cpu=darwin_arm64")
+                        bazel_argv.append("--macos_cpus=arm64")
+
+                elif platform.machine() == "arm64":
+                    bazel_argv.append("--macos_minimum_os=11.0")
+
+            self.spawn(bazel_argv)
+
+            shared_lib_suffix = '.dll' if IS_WINDOWS else '.so'
+            ext_name = ext.target_name + shared_lib_suffix
+            ext_bazel_bin_path = temp_path / 'bazel-bin' / ext.relpath / ext_name
+
+            ext_dest_path = Path(self.get_ext_fullpath(ext.name))
+            shutil.copyfile(ext_bazel_bin_path, ext_dest_path)
+
+            # explicitly call `bazel shutdown` for graceful exit
+            self.spawn(["bazel", "shutdown"])
+
+
+setuptools.setup(
+    name="google_benchmark",
+    version=_get_version("bindings/python/google_benchmark/__init__.py"),
+    url="https://github.com/google/benchmark",
+    description="A library to benchmark code snippets.",
+    long_description=_get_long_description("README.md"),
+    long_description_content_type="text/markdown",
+    author="Google",
+    author_email="benchmark-py@google.com",
+    # Contained modules and scripts.
+    package_dir={"": "bindings/python"},
+    packages=setuptools.find_packages("bindings/python"),
+    install_requires=_parse_requirements("bindings/python/requirements.txt"),
+    cmdclass=dict(build_ext=BuildBazelExtension),
+    ext_modules=[
+        BazelExtension(
+            "google_benchmark._benchmark",
+            "//bindings/python/google_benchmark:_benchmark",
+        )
+    ],
+    zip_safe=False,
+    # PyPI package information.
+    classifiers=[
+        "Development Status :: 4 - Beta",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: Apache Software License",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+        "Topic :: Software Development :: Testing",
+        "Topic :: System :: Benchmark",
+    ],
+    license="Apache 2.0",
+    keywords="benchmark",
+)
diff --git a/third_party/google_benchmark/src/sleep.cc b/third_party/google_benchmark/src/sleep.cc
deleted file mode 100644
index 1512ac9..0000000
--- a/third_party/google_benchmark/src/sleep.cc
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "sleep.h"
-
-#include <cerrno>
-#include <cstdlib>
-#include <ctime>
-
-#include "internal_macros.h"
-
-#ifdef BENCHMARK_OS_WINDOWS
-#include <windows.h>
-#endif
-
-namespace benchmark {
-#ifdef BENCHMARK_OS_WINDOWS
-// Window's Sleep takes milliseconds argument.
-void SleepForMilliseconds(int milliseconds) { Sleep(milliseconds); }
-void SleepForSeconds(double seconds) {
-  SleepForMilliseconds(static_cast<int>(kNumMillisPerSecond * seconds));
-}
-#else   // BENCHMARK_OS_WINDOWS
-void SleepForMicroseconds(int microseconds) {
-  struct timespec sleep_time;
-  sleep_time.tv_sec = microseconds / kNumMicrosPerSecond;
-  sleep_time.tv_nsec = (microseconds % kNumMicrosPerSecond) * kNumNanosPerMicro;
-  while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR)
-    ;  // Ignore signals and wait for the full interval to elapse.
-}
-
-void SleepForMilliseconds(int milliseconds) {
-  SleepForMicroseconds(milliseconds * kNumMicrosPerMilli);
-}
-
-void SleepForSeconds(double seconds) {
-  SleepForMicroseconds(static_cast<int>(seconds * kNumMicrosPerSecond));
-}
-#endif  // BENCHMARK_OS_WINDOWS
-}  // end namespace benchmark
diff --git a/third_party/google_benchmark/src/sleep.h b/third_party/google_benchmark/src/sleep.h
deleted file mode 100644
index f98551a..0000000
--- a/third_party/google_benchmark/src/sleep.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef BENCHMARK_SLEEP_H_
-#define BENCHMARK_SLEEP_H_
-
-namespace benchmark {
-const int kNumMillisPerSecond = 1000;
-const int kNumMicrosPerMilli = 1000;
-const int kNumMicrosPerSecond = kNumMillisPerSecond * 1000;
-const int kNumNanosPerMicro = 1000;
-const int kNumNanosPerSecond = kNumNanosPerMicro * kNumMicrosPerSecond;
-
-void SleepForMilliseconds(int milliseconds);
-void SleepForSeconds(double seconds);
-}  // end namespace benchmark
-
-#endif  // BENCHMARK_SLEEP_H_
diff --git a/third_party/google_benchmark/src/src/CMakeLists.txt b/third_party/google_benchmark/src/src/CMakeLists.txt
new file mode 100644
index 0000000..91ea5f4
--- /dev/null
+++ b/third_party/google_benchmark/src/src/CMakeLists.txt
@@ -0,0 +1,170 @@
+# Allow the source files to find headers in src/
+include(GNUInstallDirs)
+include_directories(${PROJECT_SOURCE_DIR}/src)
+
+if (DEFINED BENCHMARK_CXX_LINKER_FLAGS)
+  list(APPEND CMAKE_SHARED_LINKER_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS})
+  list(APPEND CMAKE_MODULE_LINKER_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS})
+endif()
+
+file(GLOB
+  SOURCE_FILES
+    *.cc
+    ${PROJECT_SOURCE_DIR}/include/benchmark/*.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/*.h)
+file(GLOB BENCHMARK_MAIN "benchmark_main.cc")
+foreach(item ${BENCHMARK_MAIN})
+  list(REMOVE_ITEM SOURCE_FILES "${item}")
+endforeach()
+
+add_library(benchmark ${SOURCE_FILES})
+add_library(benchmark::benchmark ALIAS benchmark)
+set_target_properties(benchmark PROPERTIES
+  OUTPUT_NAME "benchmark"
+  VERSION ${GENERIC_LIB_VERSION}
+  SOVERSION ${GENERIC_LIB_SOVERSION}
+)
+target_include_directories(benchmark PUBLIC
+  $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
+)
+
+# libpfm, if available
+if (HAVE_LIBPFM)
+  target_link_libraries(benchmark PRIVATE pfm)
+  target_compile_definitions(benchmark PRIVATE -DHAVE_LIBPFM)
+endif()
+
+# pthread affinity, if available
+if(HAVE_PTHREAD_AFFINITY)
+  target_compile_definitions(benchmark PRIVATE -DBENCHMARK_HAS_PTHREAD_AFFINITY)
+endif()
+
+# Link threads.
+target_link_libraries(benchmark PRIVATE Threads::Threads)
+
+target_link_libraries(benchmark PRIVATE ${BENCHMARK_CXX_LIBRARIES})
+
+if(HAVE_LIB_RT)
+  target_link_libraries(benchmark PRIVATE rt)
+endif(HAVE_LIB_RT)
+
+
+# We need extra libraries on Windows
+if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
+  target_link_libraries(benchmark PRIVATE shlwapi)
+endif()
+
+# We need extra libraries on Solaris
+if(${CMAKE_SYSTEM_NAME} MATCHES "SunOS")
+  target_link_libraries(benchmark PRIVATE kstat)
+endif()
+
+if (NOT BUILD_SHARED_LIBS)
+  target_compile_definitions(benchmark PUBLIC -DBENCHMARK_STATIC_DEFINE)
+endif()
+
+# Benchmark main library
+add_library(benchmark_main "benchmark_main.cc")
+add_library(benchmark::benchmark_main ALIAS benchmark_main)
+set_target_properties(benchmark_main PROPERTIES
+  OUTPUT_NAME "benchmark_main"
+  VERSION ${GENERIC_LIB_VERSION}
+  SOVERSION ${GENERIC_LIB_SOVERSION}
+  DEFINE_SYMBOL benchmark_EXPORTS
+)
+target_link_libraries(benchmark_main PUBLIC benchmark::benchmark)
+
+set(generated_dir "${PROJECT_BINARY_DIR}")
+
+set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
+set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake")
+set(pkg_config "${generated_dir}/${PROJECT_NAME}.pc")
+set(targets_to_export benchmark benchmark_main)
+set(targets_export_name "${PROJECT_NAME}Targets")
+
+set(namespace "${PROJECT_NAME}::")
+
+include(CMakePackageConfigHelpers)
+
+configure_package_config_file (
+  ${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in
+  ${project_config}
+  INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
+  NO_SET_AND_CHECK_MACRO
+  NO_CHECK_REQUIRED_COMPONENTS_MACRO
+)
+write_basic_package_version_file(
+  "${version_config}" VERSION ${GENERIC_LIB_VERSION} COMPATIBILITY SameMajorVersion
+)
+
+configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark.pc.in" "${pkg_config}" @ONLY)
+
+export (
+  TARGETS ${targets_to_export}
+  NAMESPACE "${namespace}"
+  FILE ${generated_dir}/${targets_export_name}.cmake
+)
+
+if (BENCHMARK_ENABLE_INSTALL)
+  # Install target (will install the library to specified CMAKE_INSTALL_PREFIX variable)
+  install(
+    TARGETS ${targets_to_export}
+    EXPORT ${targets_export_name}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+    INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+
+  install(
+    DIRECTORY "${PROJECT_SOURCE_DIR}/include/benchmark"
+              "${PROJECT_BINARY_DIR}/include/benchmark"
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+    FILES_MATCHING PATTERN "*.*h")
+
+  install(
+      FILES "${project_config}" "${version_config}"
+      DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
+
+  install(
+      FILES "${pkg_config}"
+      DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+
+  install(
+      EXPORT "${targets_export_name}"
+      NAMESPACE "${namespace}"
+      DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
+endif()
+
+if (BENCHMARK_ENABLE_DOXYGEN)
+  find_package(Doxygen REQUIRED)
+  set(DOXYGEN_QUIET YES)
+  set(DOXYGEN_RECURSIVE YES)
+  set(DOXYGEN_GENERATE_HTML YES)
+  set(DOXYGEN_GENERATE_MAN NO)
+  set(DOXYGEN_MARKDOWN_SUPPORT YES)
+  set(DOXYGEN_BUILTIN_STL_SUPPORT YES)
+  set(DOXYGEN_EXTRACT_PACKAGE YES)
+  set(DOXYGEN_EXTRACT_STATIC YES)
+  set(DOXYGEN_SHOW_INCLUDE_FILES YES)
+  set(DOXYGEN_BINARY_TOC YES)
+  set(DOXYGEN_TOC_EXPAND YES)
+  set(DOXYGEN_USE_MDFILE_AS_MAINPAGE "index.md")
+  doxygen_add_docs(benchmark_doxygen
+    docs
+    include
+    src
+    ALL
+    WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
+    COMMENT "Building documentation with Doxygen.")
+  if (BENCHMARK_ENABLE_INSTALL AND BENCHMARK_INSTALL_DOCS)
+    install(
+      DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/html/"
+      DESTINATION ${CMAKE_INSTALL_DOCDIR})
+  endif()
+else()
+  if (BENCHMARK_ENABLE_INSTALL AND BENCHMARK_INSTALL_DOCS)
+    install(
+      DIRECTORY "${PROJECT_SOURCE_DIR}/docs/"
+      DESTINATION ${CMAKE_INSTALL_DOCDIR})
+  endif()
+endif()
diff --git a/third_party/google_benchmark/src/arraysize.h b/third_party/google_benchmark/src/src/arraysize.h
similarity index 100%
rename from third_party/google_benchmark/src/arraysize.h
rename to third_party/google_benchmark/src/src/arraysize.h
diff --git a/third_party/google_benchmark/src/src/benchmark.cc b/third_party/google_benchmark/src/src/benchmark.cc
new file mode 100644
index 0000000..f1633b7
--- /dev/null
+++ b/third_party/google_benchmark/src/src/benchmark.cc
@@ -0,0 +1,765 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/benchmark.h"
+
+#include "benchmark_api_internal.h"
+#include "benchmark_runner.h"
+#include "internal_macros.h"
+
+#ifndef BENCHMARK_OS_WINDOWS
+#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
+#include <sys/resource.h>
+#endif
+#include <sys/time.h>
+#include <unistd.h>
+#endif
+
+#include <algorithm>
+#include <atomic>
+#include <condition_variable>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <limits>
+#include <map>
+#include <memory>
+#include <random>
+#include <string>
+#include <thread>
+#include <utility>
+
+#include "check.h"
+#include "colorprint.h"
+#include "commandlineflags.h"
+#include "complexity.h"
+#include "counter.h"
+#include "internal_macros.h"
+#include "log.h"
+#include "mutex.h"
+#include "perf_counters.h"
+#include "re.h"
+#include "statistics.h"
+#include "string_util.h"
+#include "thread_manager.h"
+#include "thread_timer.h"
+
+namespace benchmark {
+// Print a list of benchmarks. This option overrides all other options.
+BM_DEFINE_bool(benchmark_list_tests, false);
+
+// A regular expression that specifies the set of benchmarks to execute.  If
+// this flag is empty, or if this flag is the string "all", all benchmarks
+// linked into the binary are run.
+BM_DEFINE_string(benchmark_filter, "");
+
+// Specification of how long to run the benchmark.
+//
+// It can be either an exact number of iterations (specified as `<integer>x`),
+// or a minimum number of seconds (specified as `<float>s`). If the latter
+// format (ie., min seconds) is used, the system may run the benchmark longer
+// until the results are considered significant.
+//
+// For backward compatibility, the `s` suffix may be omitted, in which case,
+// the specified number is interpreted as the number of seconds.
+//
+// For cpu-time based tests, this is the lower bound
+// on the total cpu time used by all threads that make up the test.  For
+// real-time based tests, this is the lower bound on the elapsed time of the
+// benchmark execution, regardless of number of threads.
+BM_DEFINE_string(benchmark_min_time, kDefaultMinTimeStr);
+
+// Minimum number of seconds a benchmark should be run before results should be
+// taken into account. This can be necessary, e.g., for benchmarks of code which
+// needs to fill some form of cache before performance is of interest.
+// Note: results gathered within this period are discarded and not used for
+// reported result.
+BM_DEFINE_double(benchmark_min_warmup_time, 0.0);
+
+// The number of runs of each benchmark. If greater than 1, the mean and
+// standard deviation of the runs will be reported.
+BM_DEFINE_int32(benchmark_repetitions, 1);
+
+// If set, enable random interleaving of repetitions of all benchmarks.
+// See http://github.com/google/benchmark/issues/1051 for details.
+BM_DEFINE_bool(benchmark_enable_random_interleaving, false);
+
+// Report the result of each benchmark repetition. When 'true' is specified,
+// only the mean, standard deviation, and other statistics are reported for
+// repeated benchmarks. Affects all reporters.
+BM_DEFINE_bool(benchmark_report_aggregates_only, false);
+
+// Display the result of each benchmark repetition. When 'true' is specified,
+// only the mean, standard deviation, and other statistics are displayed for
+// repeated benchmarks. Unlike benchmark_report_aggregates_only, only affects
+// the display reporter, but *NOT* the file reporter, which will still contain
+// all the output.
+BM_DEFINE_bool(benchmark_display_aggregates_only, false);
+
+// The format to use for console output.
+// Valid values are 'console', 'json', or 'csv'.
+BM_DEFINE_string(benchmark_format, "console");
+
+// The format to use for file output.
+// Valid values are 'console', 'json', or 'csv'.
+BM_DEFINE_string(benchmark_out_format, "json");
+
+// The file to write additional output to.
+BM_DEFINE_string(benchmark_out, "");
+
+// Whether to use colors in the output.  Valid values:
+// 'true'/'yes'/1, 'false'/'no'/0, and 'auto'. 'auto' means to use colors if
+// the output is being sent to a terminal and the TERM environment variable is
+// set to a terminal type that supports colors.
+BM_DEFINE_string(benchmark_color, "auto");
+
+// Whether to use tabular format when printing user counters to the console.
+// Valid values: 'true'/'yes'/1, 'false'/'no'/0.  Defaults to false.
+BM_DEFINE_bool(benchmark_counters_tabular, false);
+
+// List of additional perf counters to collect, in libpfm format. For more
+// information about libpfm: https://man7.org/linux/man-pages/man3/libpfm.3.html
+BM_DEFINE_string(benchmark_perf_counters, "");
+
+// Extra context to include in the output formatted as comma-separated key-value
+// pairs. Kept internal as it's only used for parsing from env/command line.
+BM_DEFINE_kvpairs(benchmark_context, {});
+
+// Set the default time unit to use for reports
+// Valid values are 'ns', 'us', 'ms' or 's'
+BM_DEFINE_string(benchmark_time_unit, "");
+
+// The level of verbose logging to output
+BM_DEFINE_int32(v, 0);
+
+namespace internal {
+
+std::map<std::string, std::string>* global_context = nullptr;
+
+BENCHMARK_EXPORT std::map<std::string, std::string>*& GetGlobalContext() {
+  return global_context;
+}
+
+// FIXME: wouldn't LTO mess this up?
+void UseCharPointer(char const volatile*) {}
+
+}  // namespace internal
+
+State::State(std::string name, IterationCount max_iters,
+             const std::vector<int64_t>& ranges, int thread_i, int n_threads,
+             internal::ThreadTimer* timer, internal::ThreadManager* manager,
+             internal::PerfCountersMeasurement* perf_counters_measurement)
+    : total_iterations_(0),
+      batch_leftover_(0),
+      max_iterations(max_iters),
+      started_(false),
+      finished_(false),
+      skipped_(internal::NotSkipped),
+      range_(ranges),
+      complexity_n_(0),
+      name_(std::move(name)),
+      thread_index_(thread_i),
+      threads_(n_threads),
+      timer_(timer),
+      manager_(manager),
+      perf_counters_measurement_(perf_counters_measurement) {
+  BM_CHECK(max_iterations != 0) << "At least one iteration must be run";
+  BM_CHECK_LT(thread_index_, threads_)
+      << "thread_index must be less than threads";
+
+  // Note: The use of offsetof below is technically undefined until C++17
+  // because State is not a standard layout type. However, all compilers
+  // currently provide well-defined behavior as an extension (which is
+  // demonstrated since constexpr evaluation must diagnose all undefined
+  // behavior). However, GCC and Clang also warn about this use of offsetof,
+  // which must be suppressed.
+#if defined(__INTEL_COMPILER)
+#pragma warning push
+#pragma warning(disable : 1875)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winvalid-offsetof"
+#endif
+#if defined(__NVCC__)
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 1427
+#endif
+#if defined(__NVCOMPILER)
+#pragma diagnostic push
+#pragma diag_suppress offset_in_non_POD_nonstandard
+#endif
+  // Offset tests to ensure commonly accessed data is on the first cache line.
+  const int cache_line_size = 64;
+  static_assert(
+      offsetof(State, skipped_) <= (cache_line_size - sizeof(skipped_)), "");
+#if defined(__INTEL_COMPILER)
+#pragma warning pop
+#elif defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
+#if defined(__NVCC__)
+#pragma nv_diagnostic pop
+#endif
+#if defined(__NVCOMPILER)
+#pragma diagnostic pop
+#endif
+}
+
+void State::PauseTiming() {
+  // Add in time accumulated so far
+  BM_CHECK(started_ && !finished_ && !skipped());
+  timer_->StopTimer();
+  if (perf_counters_measurement_) {
+    std::vector<std::pair<std::string, double>> measurements;
+    if (!perf_counters_measurement_->Stop(measurements)) {
+      BM_CHECK(false) << "Perf counters read the value failed.";
+    }
+    for (const auto& name_and_measurement : measurements) {
+      auto name = name_and_measurement.first;
+      auto measurement = name_and_measurement.second;
+      BM_CHECK_EQ(std::fpclassify((double)counters[name]), FP_ZERO);
+      counters[name] = Counter(measurement, Counter::kAvgIterations);
+    }
+  }
+}
+
+void State::ResumeTiming() {  // restarts the timer and, if set, the counters
+  BM_CHECK(started_ && !finished_ && !skipped());
+  timer_->StartTimer();
+  if (perf_counters_measurement_) {  // perf counters are optional
+    perf_counters_measurement_->Start();
+  }
+}
+
+void State::SkipWithMessage(const std::string& msg) {
+  skipped_ = internal::SkippedWithMessage;  // this State's own skip status
+  {
+    MutexLock l(manager_->GetBenchmarkMutex());
+    if (internal::NotSkipped == manager_->results.skipped_) {  // first wins
+      manager_->results.skip_message_ = msg;
+      manager_->results.skipped_ = skipped_;
+    }
+  }
+  total_iterations_ = 0;  // leave no iterations to run
+  if (timer_->running()) timer_->StopTimer();
+}
+
+void State::SkipWithError(const std::string& msg) {
+  skipped_ = internal::SkippedWithError;  // this State's own skip status
+  {
+    MutexLock l(manager_->GetBenchmarkMutex());
+    if (internal::NotSkipped == manager_->results.skipped_) {  // first wins
+      manager_->results.skip_message_ = msg;
+      manager_->results.skipped_ = skipped_;
+    }
+  }
+  total_iterations_ = 0;  // leave no iterations to run
+  if (timer_->running()) timer_->StopTimer();
+}
+
+void State::SetIterationTime(double seconds) {  // forwarded to the timer
+  timer_->SetIterationTime(seconds);
+}
+
+void State::SetLabel(const std::string& label) {
+  MutexLock l(manager_->GetBenchmarkMutex());  // guards manager_->results
+  manager_->results.report_label_ = label;
+}
+
+void State::StartKeepRunning() {
+  BM_CHECK(!started_ && !finished_);  // may be entered once per State
+  started_ = true;
+  total_iterations_ = skipped() ? 0 : max_iterations;  // skipped: run nothing
+  manager_->StartStopBarrier();
+  if (!skipped()) ResumeTiming();  // ResumeTiming() asserts !skipped()
+}
+
+void State::FinishKeepRunning() {
+  BM_CHECK(started_ && (!finished_ || skipped()));
+  if (!skipped()) {  // PauseTiming() asserts !skipped(), so guard the call
+    PauseTiming();
+  }
+  // Total iterations has now wrapped around past 0. Fix this.
+  total_iterations_ = 0;
+  finished_ = true;
+  manager_->StartStopBarrier();
+}
+
+namespace internal {
+namespace {
+
+// Flushes streams after invoking reporter methods that write to them. This
+// ensures users get timely updates even when streams are not line-buffered.
+void FlushStreams(BenchmarkReporter* reporter) {
+  if (!reporter) return;
+  std::flush(reporter->GetOutputStream());
+  std::flush(reporter->GetErrorStream());
+}
+
+// Reports in both display and file reporters, then flushes their streams.
+void Report(BenchmarkReporter* display_reporter,
+            BenchmarkReporter* file_reporter, const RunResults& run_results) {
+  const auto emit = [](BenchmarkReporter* sink, bool aggregates_only,
+                       const RunResults& results) {
+    assert(sink);
+    const bool have_aggregates = !results.aggregates_only.empty();
+    // Non-aggregate rows are dropped only if aggregates exist to replace
+    // them; otherwise they are reported even in aggregates-only mode.
+    if (!(aggregates_only && have_aggregates)) {
+      sink->ReportRuns(results.non_aggregates);
+    }
+    if (have_aggregates) sink->ReportRuns(results.aggregates_only);
+  };
+
+  emit(display_reporter, run_results.display_report_aggregates_only,
+       run_results);
+  if (file_reporter != nullptr)
+    emit(file_reporter, run_results.file_report_aggregates_only, run_results);
+  FlushStreams(display_reporter);
+  FlushStreams(file_reporter);
+}
+
+void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
+                   BenchmarkReporter* display_reporter,
+                   BenchmarkReporter* file_reporter) {
+  // Runs every instance and reports it. Only file_reporter may be null.
+  BM_CHECK(display_reporter != nullptr);
+
+  // Determine the width of the name field using a minimum width of 10.
+  bool might_have_aggregates = FLAGS_benchmark_repetitions > 1;
+  size_t name_field_width = 10;
+  size_t stat_field_width = 0;
+  for (const BenchmarkInstance& benchmark : benchmarks) {
+    name_field_width =
+        std::max<size_t>(name_field_width, benchmark.name().str().size());
+    might_have_aggregates |= benchmark.repetitions() > 1;
+
+    for (const auto& Stat : benchmark.statistics())
+      stat_field_width = std::max<size_t>(stat_field_width, Stat.name_.size());
+  }
+  if (might_have_aggregates) name_field_width += 1 + stat_field_width;
+
+  // Print header here
+  BenchmarkReporter::Context context;
+  context.name_field_width = name_field_width;
+
+  // Keep track of running times of all instances of each benchmark family.
+  std::map<int /*family_index*/, BenchmarkReporter::PerFamilyRunReports>
+      per_family_reports;
+
+  if (display_reporter->ReportContext(context) &&
+      (!file_reporter || file_reporter->ReportContext(context))) {
+    FlushStreams(display_reporter);
+    FlushStreams(file_reporter);
+
+    size_t num_repetitions_total = 0;
+
+    // This perfcounters object needs to be created before the runners vector
+    // below so it outlasts their lifetime.
+    PerfCountersMeasurement perfcounters(
+        StrSplit(FLAGS_benchmark_perf_counters, ','));
+
+    // Vector of benchmarks to run
+    std::vector<internal::BenchmarkRunner> runners;
+    runners.reserve(benchmarks.size());
+
+    // Count the benchmarks that use more than one thread, to warn the user in
+    // case performance counters are used.
+    int benchmarks_with_threads = 0;
+
+    // Loop through all benchmarks
+    for (const BenchmarkInstance& benchmark : benchmarks) {
+      BenchmarkReporter::PerFamilyRunReports* reports_for_family = nullptr;
+      if (benchmark.complexity() != oNone)
+        reports_for_family = &per_family_reports[benchmark.family_index()];
+      benchmarks_with_threads += (benchmark.threads() > 1);  // 1 == unthreaded
+      runners.emplace_back(benchmark, &perfcounters, reports_for_family);
+      int num_repeats_of_this_instance = runners.back().GetNumRepeats();
+      num_repetitions_total += num_repeats_of_this_instance;
+      if (reports_for_family)
+        reports_for_family->num_runs_total += num_repeats_of_this_instance;
+    }
+    assert(runners.size() == benchmarks.size() && "Unexpected runner count.");
+
+    // The use of performance counters with threads would be unintuitive for
+    // the average user so we need to warn them about this case
+    if ((benchmarks_with_threads > 0) && (perfcounters.num_counters() > 0)) {
+      GetErrorLogInstance()
+          << "***WARNING*** There are " << benchmarks_with_threads
+          << " benchmarks with threads and " << perfcounters.num_counters()
+          << " performance counters were requested. Beware counters will "
+             "reflect the combined usage across all "
+             "threads.\n";
+    }
+
+    // One entry per repetition, holding the index of the runner to execute.
+    std::vector<size_t> repetition_indices;
+    repetition_indices.reserve(num_repetitions_total);
+    for (size_t runner_index = 0, num_runners = runners.size();
+         runner_index != num_runners; ++runner_index) {
+      const internal::BenchmarkRunner& runner = runners[runner_index];
+      std::fill_n(std::back_inserter(repetition_indices),
+                  runner.GetNumRepeats(), runner_index);
+    }
+    assert(repetition_indices.size() == num_repetitions_total &&
+           "Unexpected number of repetition indexes.");
+
+    if (FLAGS_benchmark_enable_random_interleaving) {
+      std::random_device rd;
+      std::mt19937 g(rd());
+      std::shuffle(repetition_indices.begin(), repetition_indices.end(), g);
+    }
+
+    for (size_t repetition_index : repetition_indices) {
+      internal::BenchmarkRunner& runner = runners[repetition_index];
+      runner.DoOneRepetition();
+      if (runner.HasRepeatsRemaining()) continue;
+      // FIXME: report each repetition separately, not all of them in bulk.
+
+      display_reporter->ReportRunsConfig(
+          runner.GetMinTime(), runner.HasExplicitIters(), runner.GetIters());
+      if (file_reporter)
+        file_reporter->ReportRunsConfig(
+            runner.GetMinTime(), runner.HasExplicitIters(), runner.GetIters());
+
+      RunResults run_results = runner.GetResults();
+
+      // Maybe calculate complexity report
+      if (const auto* reports_for_family = runner.GetReportsForFamily()) {
+        if (reports_for_family->num_runs_done ==
+            reports_for_family->num_runs_total) {
+          auto additional_run_stats = ComputeBigO(reports_for_family->Runs);
+          run_results.aggregates_only.insert(run_results.aggregates_only.end(),
+                                             additional_run_stats.begin(),
+                                             additional_run_stats.end());
+          per_family_reports.erase(
+              static_cast<int>(reports_for_family->Runs.front().family_index));
+        }
+      }
+
+      Report(display_reporter, file_reporter, run_results);
+    }
+  }
+  display_reporter->Finalize();
+  if (file_reporter) file_reporter->Finalize();
+  FlushStreams(display_reporter);
+  FlushStreams(file_reporter);
+}
+
+// Disable deprecated warnings temporarily because we need to reference
+// CSVReporter but don't want to trigger -Werror=-Wdeprecated-declarations
+BENCHMARK_DISABLE_DEPRECATED_WARNING
+
+// Builds the reporter selected by `name`:
+//   "console" -> ConsoleReporter(output_opts)
+//   "json"    -> JSONReporter
+//   "csv"     -> CSVReporter
+// Any other name is written to std::cerr and the process exits with
+// status 1, so the function never returns in that case.
+std::unique_ptr<BenchmarkReporter> CreateReporter(
+    std::string const& name, ConsoleReporter::OutputOptions output_opts) {
+  using ReporterPtr = std::unique_ptr<BenchmarkReporter>;
+  if (name == "console") return ReporterPtr(new ConsoleReporter(output_opts));
+  if (name == "json") return ReporterPtr(new JSONReporter());
+  if (name == "csv") return ReporterPtr(new CSVReporter());
+  std::cerr << "Unexpected format: '" << name << "'\n";
+  std::exit(1);
+}
+
+BENCHMARK_RESTORE_DEPRECATED_WARNING
+
+}  // end namespace
+
+bool IsZero(double n) {  // true when |n| is below double's machine epsilon
+  return std::abs(n) < std::numeric_limits<double>::epsilon();
+}
+
+// Derives the console output options from the command-line flags.
+// Colour is on only when `force_no_color` is false and --benchmark_color is
+// either truthy or "auto" with IsColorTerminal() returning true. Tabular
+// counter output follows --benchmark_counters_tabular.
+ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color) {
+  bool use_color = false;  // stays false when colour is vetoed by the caller
+  if (!force_no_color) {
+    if (FLAGS_benchmark_color == "auto") {
+      use_color = IsColorTerminal();
+    } else {
+      use_color = IsTruthyFlagValue(FLAGS_benchmark_color);
+    }
+  }
+  const bool use_tabular = FLAGS_benchmark_counters_tabular;
+
+  // Start from the defaults, then force each bit to match the choices above.
+  int output_opts = ConsoleReporter::OO_Defaults;
+  output_opts = use_color ? (output_opts | ConsoleReporter::OO_Color)
+                          : (output_opts & ~ConsoleReporter::OO_Color);
+  output_opts = use_tabular ? (output_opts | ConsoleReporter::OO_Tabular)
+                            : (output_opts & ~ConsoleReporter::OO_Tabular);
+  return static_cast<ConsoleReporter::OutputOptions>(output_opts);
+}
+
+}  // end namespace internal
+
+BenchmarkReporter* CreateDefaultDisplayReporter() {
+  static auto default_display_reporter =  // initialised once, on first call
+      internal::CreateReporter(FLAGS_benchmark_format,
+                               internal::GetOutputOptions())
+          .release();  // the unique_ptr gives up ownership; raw pointer kept
+  return default_display_reporter;  // every call returns this same pointer
+}
+
+size_t RunSpecifiedBenchmarks() {  // no reporters given; flag filter
+  return RunSpecifiedBenchmarks(nullptr, nullptr, FLAGS_benchmark_filter);
+}
+
+size_t RunSpecifiedBenchmarks(std::string spec) {  // no reporters given
+  return RunSpecifiedBenchmarks(nullptr, nullptr, std::move(spec));
+}
+
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter) {
+  return RunSpecifiedBenchmarks(display_reporter, nullptr,  // no file reporter
+                                FLAGS_benchmark_filter);
+}
+
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
+                              std::string spec) {  // no file reporter
+  return RunSpecifiedBenchmarks(display_reporter, nullptr, std::move(spec));
+}
+
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
+                              BenchmarkReporter* file_reporter) {
+  return RunSpecifiedBenchmarks(display_reporter, file_reporter,
+                                FLAGS_benchmark_filter);  // filter from flag
+}
+
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
+                              BenchmarkReporter* file_reporter,
+                              std::string spec) {  // returns #matched, or 0
+  if (spec.empty() || spec == "all")
+    spec = ".";  // Regexp that matches all benchmarks
+
+  // Setup the reporters
+  std::ofstream output_file;
+  std::unique_ptr<BenchmarkReporter> default_file_reporter;
+  if (!display_reporter) {
+    // The default display reporter is a function-local static that outlives
+    // this call, so borrow it: owning it here would free it for later calls.
+    display_reporter = CreateDefaultDisplayReporter();
+  }
+  auto& Out = display_reporter->GetOutputStream();
+  auto& Err = display_reporter->GetErrorStream();
+
+  std::string const& fname = FLAGS_benchmark_out;
+  if (fname.empty() && file_reporter) {  // a file reporter needs a file
+    Err << "A custom file reporter was provided but "
+           "--benchmark_out=<file> was not specified."
+        << std::endl;
+    std::exit(1);
+  }
+  if (!fname.empty()) {
+    output_file.open(fname);
+    if (!output_file.is_open()) {
+      Err << "invalid file name: '" << fname << "'" << std::endl;
+      std::exit(1);
+    }
+    if (!file_reporter) {  // build one from --benchmark_out_format
+      default_file_reporter = internal::CreateReporter(
+          FLAGS_benchmark_out_format, ConsoleReporter::OO_None);
+      file_reporter = default_file_reporter.get();
+    }
+    file_reporter->SetOutputStream(&output_file);
+    file_reporter->SetErrorStream(&output_file);
+  }
+
+  std::vector<internal::BenchmarkInstance> benchmarks;
+  if (!FindBenchmarksInternal(spec, &benchmarks, &Err)) return 0;
+
+  if (benchmarks.empty()) {
+    Err << "Failed to match any benchmarks against regex: " << spec << "\n";
+    return 0;
+  }
+
+  if (FLAGS_benchmark_list_tests) {  // list names only; nothing is run
+    for (auto const& benchmark : benchmarks)
+      Out << benchmark.name().str() << "\n";
+  } else {
+    internal::RunBenchmarks(benchmarks, display_reporter, file_reporter);
+  }
+
+  return benchmarks.size();
+}
+
+namespace {
+// Time unit benchmarks use by default; kNanosecond until SetDefaultTimeUnit().
+TimeUnit default_time_unit = kNanosecond;
+}  // namespace
+
+TimeUnit GetDefaultTimeUnit() { return default_time_unit; }
+
+void SetDefaultTimeUnit(TimeUnit unit) { default_time_unit = unit; }
+
+std::string GetBenchmarkFilter() { return FLAGS_benchmark_filter; }  // a copy
+
+void SetBenchmarkFilter(std::string value) {
+  FLAGS_benchmark_filter = std::move(value);  // replaces --benchmark_filter
+}
+
+int32_t GetBenchmarkVerbosity() { return FLAGS_v; }  // value of the --v flag
+
+void RegisterMemoryManager(MemoryManager* manager) {
+  internal::memory_manager = manager;  // stores the raw pointer as given
+}
+
+void AddCustomContext(const std::string& key, const std::string& value) {
+  if (internal::global_context == nullptr)  // map is created on first use
+    internal::global_context = new std::map<std::string, std::string>();
+  const auto inserted = internal::global_context->emplace(key, value);
+  if (!inserted.second) {  // key already present: show the value that is kept
+    std::cerr << "Failed to add custom context \"" << key << "\" as it already "
+              << "exists with value \"" << inserted.first->second << "\"\n";
+  }
+}
+
+namespace internal {
+
+void (*HelperPrintf)();  // usage printer; assigned by Initialize()
+
+void PrintUsageAndExit() {
+  HelperPrintf();  // NOTE(review): unset until Initialize() assigns it
+  exit(0);
+}
+
+// Applies the value of --benchmark_time_unit as the default time unit.
+// Accepted spellings are "s", "ms", "us" and "ns". An empty string leaves
+// the current default untouched; any other value goes to
+// PrintUsageAndExit().
+void SetDefaultTimeUnitFromFlag(const std::string& time_unit_flag) {
+  if (time_unit_flag == "s") {
+    SetDefaultTimeUnit(kSecond);
+  } else if (time_unit_flag == "ms") {
+    SetDefaultTimeUnit(kMillisecond);
+  } else if (time_unit_flag == "us") {
+    SetDefaultTimeUnit(kMicrosecond);
+  } else if (time_unit_flag == "ns") {
+    SetDefaultTimeUnit(kNanosecond);
+  } else if (!time_unit_flag.empty()) {
+    PrintUsageAndExit();  // unrecognised unit
+  }
+}
+
+void ParseCommandLineFlags(int* argc, char** argv) {  // consumes known flags
+  using namespace benchmark;
+  BenchmarkReporter::Context::executable_name =
+      (argc && *argc > 0) ? argv[0] : "unknown";
+  for (int i = 1; argc && i < *argc; ++i) {  // argv[0] is not a flag
+    if (ParseBoolFlag(argv[i], "benchmark_list_tests",
+                      &FLAGS_benchmark_list_tests) ||
+        ParseStringFlag(argv[i], "benchmark_filter", &FLAGS_benchmark_filter) ||
+        ParseStringFlag(argv[i], "benchmark_min_time",
+                        &FLAGS_benchmark_min_time) ||
+        ParseDoubleFlag(argv[i], "benchmark_min_warmup_time",
+                        &FLAGS_benchmark_min_warmup_time) ||
+        ParseInt32Flag(argv[i], "benchmark_repetitions",
+                       &FLAGS_benchmark_repetitions) ||
+        ParseBoolFlag(argv[i], "benchmark_enable_random_interleaving",
+                      &FLAGS_benchmark_enable_random_interleaving) ||
+        ParseBoolFlag(argv[i], "benchmark_report_aggregates_only",
+                      &FLAGS_benchmark_report_aggregates_only) ||
+        ParseBoolFlag(argv[i], "benchmark_display_aggregates_only",
+                      &FLAGS_benchmark_display_aggregates_only) ||
+        ParseStringFlag(argv[i], "benchmark_format", &FLAGS_benchmark_format) ||
+        ParseStringFlag(argv[i], "benchmark_out", &FLAGS_benchmark_out) ||
+        ParseStringFlag(argv[i], "benchmark_out_format",
+                        &FLAGS_benchmark_out_format) ||
+        ParseStringFlag(argv[i], "benchmark_color", &FLAGS_benchmark_color) ||
+        ParseBoolFlag(argv[i], "benchmark_counters_tabular",
+                      &FLAGS_benchmark_counters_tabular) ||
+        ParseStringFlag(argv[i], "benchmark_perf_counters",
+                        &FLAGS_benchmark_perf_counters) ||
+        ParseKeyValueFlag(argv[i], "benchmark_context",
+                          &FLAGS_benchmark_context) ||
+        ParseStringFlag(argv[i], "benchmark_time_unit",
+                        &FLAGS_benchmark_time_unit) ||
+        ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
+      for (int j = i; j != *argc - 1; ++j) argv[j] = argv[j + 1];
+      // The flag was shifted out of argv: shrink argc and revisit index i.
+      --(*argc);
+      --i;
+    } else if (IsFlag(argv[i], "help")) {
+      PrintUsageAndExit();
+    }
+  }
+  for (auto const* flag :  // both format flags must name a known reporter
+       {&FLAGS_benchmark_format, &FLAGS_benchmark_out_format}) {
+    if (*flag != "console" && *flag != "json" && *flag != "csv") {
+      PrintUsageAndExit();
+    }
+  }
+  SetDefaultTimeUnitFromFlag(FLAGS_benchmark_time_unit);
+  if (FLAGS_benchmark_color.empty()) {  // an empty colour value is rejected
+    PrintUsageAndExit();
+  }
+  for (const auto& kv : FLAGS_benchmark_context) {
+    AddCustomContext(kv.first, kv.second);
+  }
+}
+
+int InitializeStreams() {
+  static std::ios_base::Init init;  // constructed once, on the first call
+  return 0;
+}
+
+}  // end namespace internal
+
+void PrintDefaultHelp() {  // writes the flag synopsis below to stdout
+  fprintf(stdout,
+          "benchmark"
+          " [--benchmark_list_tests={true|false}]\n"
+          "          [--benchmark_filter=<regex>]\n"
+          "          [--benchmark_min_time=`<integer>x` OR `<float>s` ]\n"
+          "          [--benchmark_min_warmup_time=<min_warmup_time>]\n"
+          "          [--benchmark_repetitions=<num_repetitions>]\n"
+          "          [--benchmark_enable_random_interleaving={true|false}]\n"
+          "          [--benchmark_report_aggregates_only={true|false}]\n"
+          "          [--benchmark_display_aggregates_only={true|false}]\n"
+          "          [--benchmark_format=<console|json|csv>]\n"
+          "          [--benchmark_out=<filename>]\n"
+          "          [--benchmark_out_format=<json|console|csv>]\n"
+          "          [--benchmark_color={auto|true|false}]\n"
+          "          [--benchmark_counters_tabular={true|false}]\n"
+#if defined HAVE_LIBPFM  // line is printed only when this macro is defined
+          "          [--benchmark_perf_counters=<counter>,...]\n"
+#endif
+          "          [--benchmark_context=<key>=<value>,...]\n"
+          "          [--benchmark_time_unit={ns|us|ms|s}]\n"
+          "          [--v=<verbosity>]\n");
+}
+
+void Initialize(int* argc, char** argv, void (*HelperPrintf)()) {
+  internal::HelperPrintf = HelperPrintf;  // set first: parsing may print usage
+  internal::ParseCommandLineFlags(argc, argv);
+  internal::LogLevel() = FLAGS_v;  // log level follows the --v flag
+}
+
+void Shutdown() { delete internal::global_context; internal::global_context = nullptr; }
+
+bool ReportUnrecognizedArguments(int argc, char** argv) {
+  for (int i = 1; i < argc; ++i) {  // everything after argv[0] is reported
+    fprintf(stderr, "%s: error: unrecognized command-line flag: %s\n", argv[0],
+            argv[i]);
+  }
+  return argc > 1;  // true when at least one argument was reported
+}
+
+}  // end namespace benchmark
diff --git a/third_party/google_benchmark/src/src/benchmark_api_internal.cc b/third_party/google_benchmark/src/src/benchmark_api_internal.cc
new file mode 100644
index 0000000..286f986
--- /dev/null
+++ b/third_party/google_benchmark/src/src/benchmark_api_internal.cc
@@ -0,0 +1,118 @@
+#include "benchmark_api_internal.h"
+
+#include <cinttypes>
+
+#include "string_util.h"
+
+namespace benchmark {
+namespace internal {
+
+BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx,
+                                     int per_family_instance_idx,
+                                     const std::vector<int64_t>& args,
+                                     int thread_count)
+    : benchmark_(*benchmark),  // reference member: `benchmark` is not copied
+      family_index_(family_idx),
+      per_family_instance_index_(per_family_instance_idx),
+      aggregation_report_mode_(benchmark_.aggregation_report_mode_),
+      args_(args),  // reference member: `args` must outlive this instance
+      time_unit_(benchmark_.GetTimeUnit()),
+      measure_process_cpu_time_(benchmark_.measure_process_cpu_time_),
+      use_real_time_(benchmark_.use_real_time_),
+      use_manual_time_(benchmark_.use_manual_time_),
+      complexity_(benchmark_.complexity_),
+      complexity_lambda_(benchmark_.complexity_lambda_),
+      statistics_(benchmark_.statistics_),  // reference into benchmark_
+      repetitions_(benchmark_.repetitions_),
+      min_time_(benchmark_.min_time_),
+      min_warmup_time_(benchmark_.min_warmup_time_),
+      iterations_(benchmark_.iterations_),
+      threads_(thread_count) {
+  name_.function_name = benchmark_.name_;
+
+  size_t arg_i = 0;  // position of `arg` within `args`
+  for (const auto& arg : args) {
+    if (!name_.args.empty()) {  // '/' separates successive arguments
+      name_.args += '/';
+    }
+
+    if (arg_i < benchmark->arg_names_.size()) {  // prefix "<name>:" if named
+      const auto& arg_name = benchmark_.arg_names_[arg_i];
+      if (!arg_name.empty()) {
+        name_.args += StrFormat("%s:", arg_name.c_str());
+      }
+    }
+
+    name_.args += StrFormat("%" PRId64, arg);
+    ++arg_i;
+  }
+
+  if (!IsZero(benchmark->min_time_)) {  // left empty when min_time_ is zero
+    name_.min_time = StrFormat("min_time:%0.3f", benchmark_.min_time_);
+  }
+
+  if (!IsZero(benchmark->min_warmup_time_)) {  // likewise for warmup time
+    name_.min_warmup_time =
+        StrFormat("min_warmup_time:%0.3f", benchmark_.min_warmup_time_);
+  }
+
+  if (benchmark_.iterations_ != 0) {  // NOTE(review): %lu is 32-bit on LLP64
+    name_.iterations = StrFormat(
+        "iterations:%lu", static_cast<unsigned long>(benchmark_.iterations_));
+  }
+
+  if (benchmark_.repetitions_ != 0) {
+    name_.repetitions = StrFormat("repeats:%d", benchmark_.repetitions_);
+  }
+
+  if (benchmark_.measure_process_cpu_time_) {
+    name_.time_type = "process_time";
+  }
+
+  if (benchmark_.use_manual_time_) {  // manual_time wins over real_time
+    if (!name_.time_type.empty()) {
+      name_.time_type += '/';
+    }
+    name_.time_type += "manual_time";
+  } else if (benchmark_.use_real_time_) {
+    if (!name_.time_type.empty()) {
+      name_.time_type += '/';
+    }
+    name_.time_type += "real_time";
+  }
+
+  if (!benchmark_.thread_counts_.empty()) {  // only if counts were given
+    name_.threads = StrFormat("threads:%d", threads_);
+  }
+
+  setup_ = benchmark_.setup_;
+  teardown_ = benchmark_.teardown_;
+}
+
+State BenchmarkInstance::Run(
+    IterationCount iters, int thread_id, internal::ThreadTimer* timer,
+    internal::ThreadManager* manager,
+    internal::PerfCountersMeasurement* perf_counters_measurement) const {
+  State st(name_.function_name, iters, args_, thread_id, threads_, timer,
+           manager, perf_counters_measurement);
+  benchmark_.Run(st);  // runs the benchmark against the fresh State
+  return st;  // the State is handed back to the caller by value
+}
+
+void BenchmarkInstance::Setup() const {
+  if (setup_) {  // no-op when no setup callback was registered
+    State st(name_.function_name, /*iters*/ 1, args_, /*thread_id*/ 0, threads_,
+             nullptr, nullptr, nullptr);  // no timer, manager or counters
+    setup_(st);
+  }
+}
+
+void BenchmarkInstance::Teardown() const {
+  if (teardown_) {  // no-op when no teardown callback was registered
+    State st(name_.function_name, /*iters*/ 1, args_, /*thread_id*/ 0, threads_,
+             nullptr, nullptr, nullptr);  // no timer, manager or counters
+    teardown_(st);
+  }
+}
+}  // namespace internal
+}  // namespace benchmark
diff --git a/third_party/google_benchmark/src/src/benchmark_api_internal.h b/third_party/google_benchmark/src/src/benchmark_api_internal.h
new file mode 100644
index 0000000..94f5165
--- /dev/null
+++ b/third_party/google_benchmark/src/src/benchmark_api_internal.h
@@ -0,0 +1,87 @@
+#ifndef BENCHMARK_API_INTERNAL_H
+#define BENCHMARK_API_INTERNAL_H
+
+#include <cmath>
+#include <iosfwd>
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+#include "commandlineflags.h"
+
+namespace benchmark {
+namespace internal {
+
+// Information kept per benchmark we may want to run
+class BenchmarkInstance {
+ public:
+  BenchmarkInstance(Benchmark* benchmark, int family_index,
+                    int per_family_instance_index,
+                    const std::vector<int64_t>& args, int threads);
+
+  const BenchmarkName& name() const { return name_; }
+  int family_index() const { return family_index_; }
+  int per_family_instance_index() const { return per_family_instance_index_; }
+  AggregationReportMode aggregation_report_mode() const {
+    return aggregation_report_mode_;
+  }
+  TimeUnit time_unit() const { return time_unit_; }
+  bool measure_process_cpu_time() const { return measure_process_cpu_time_; }
+  bool use_real_time() const { return use_real_time_; }
+  bool use_manual_time() const { return use_manual_time_; }
+  BigO complexity() const { return complexity_; }
+  BigOFunc* complexity_lambda() const { return complexity_lambda_; }
+  const std::vector<Statistics>& statistics() const { return statistics_; }
+  int repetitions() const { return repetitions_; }
+  double min_time() const { return min_time_; }
+  double min_warmup_time() const { return min_warmup_time_; }
+  IterationCount iterations() const { return iterations_; }
+  int threads() const { return threads_; }
+  void Setup() const;     // invokes setup_ when one is set
+  void Teardown() const;  // invokes teardown_ when one is set
+
+  State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
+            internal::ThreadManager* manager,
+            internal::PerfCountersMeasurement* perf_counters_measurement) const;
+
+ private:
+  BenchmarkName name_;
+  Benchmark& benchmark_;  // reference, not a copy
+  const int family_index_;
+  const int per_family_instance_index_;
+  AggregationReportMode aggregation_report_mode_;
+  const std::vector<int64_t>& args_;  // reference to the constructor's `args`
+  TimeUnit time_unit_;
+  bool measure_process_cpu_time_;
+  bool use_real_time_;
+  bool use_manual_time_;
+  BigO complexity_;
+  BigOFunc* complexity_lambda_;
+  UserCounters counters_;
+  const std::vector<Statistics>& statistics_;  // reference, not a copy
+  int repetitions_;
+  double min_time_;
+  double min_warmup_time_;
+  IterationCount iterations_;
+  int threads_;  // Number of concurrent threads to use
+
+  typedef void (*callback_function)(const benchmark::State&);
+  callback_function setup_ = nullptr;     // optional; null means "none"
+  callback_function teardown_ = nullptr;  // optional; null means "none"
+};
+
+bool FindBenchmarksInternal(const std::string& re,
+                            std::vector<BenchmarkInstance>* benchmarks,
+                            std::ostream* Err);
+
+bool IsZero(double n);
+
+BENCHMARK_EXPORT
+ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false);
+
+}  // end namespace internal
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_API_INTERNAL_H
diff --git a/third_party/google_benchmark/src/benchmark_main.cc b/third_party/google_benchmark/src/src/benchmark_main.cc
similarity index 94%
rename from third_party/google_benchmark/src/benchmark_main.cc
rename to third_party/google_benchmark/src/src/benchmark_main.cc
index b3b2478..cd61cd2 100644
--- a/third_party/google_benchmark/src/benchmark_main.cc
+++ b/third_party/google_benchmark/src/src/benchmark_main.cc
@@ -14,4 +14,5 @@
 
 #include "benchmark/benchmark.h"
 
+BENCHMARK_EXPORT int main(int, char**);
 BENCHMARK_MAIN();
diff --git a/third_party/google_benchmark/src/benchmark_name.cc b/third_party/google_benchmark/src/src/benchmark_name.cc
similarity index 91%
rename from third_party/google_benchmark/src/benchmark_name.cc
rename to third_party/google_benchmark/src/src/benchmark_name.cc
index 2a17ebc..01676bb 100644
--- a/third_party/google_benchmark/src/benchmark_name.cc
+++ b/third_party/google_benchmark/src/src/benchmark_name.cc
@@ -51,8 +51,9 @@
 }
 }  // namespace
 
+BENCHMARK_EXPORT
 std::string BenchmarkName::str() const {
-  return join('/', function_name, args, min_time, iterations, repetitions,
-              time_type, threads);
+  return join('/', function_name, args, min_time, min_warmup_time, iterations,
+              repetitions, time_type, threads);
 }
 }  // namespace benchmark
diff --git a/third_party/google_benchmark/src/benchmark_register.cc b/third_party/google_benchmark/src/src/benchmark_register.cc
similarity index 68%
rename from third_party/google_benchmark/src/benchmark_register.cc
rename to third_party/google_benchmark/src/src/benchmark_register.cc
index cca39b2..e447c9a 100644
--- a/third_party/google_benchmark/src/benchmark_register.cc
+++ b/third_party/google_benchmark/src/src/benchmark_register.cc
@@ -15,7 +15,7 @@
 #include "benchmark_register.h"
 
 #ifndef BENCHMARK_OS_WINDOWS
-#ifndef BENCHMARK_OS_FUCHSIA
+#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
 #include <sys/resource.h>
 #endif
 #include <sys/time.h>
@@ -24,6 +24,7 @@
 
 #include <algorithm>
 #include <atomic>
+#include <cinttypes>
 #include <condition_variable>
 #include <cstdio>
 #include <cstdlib>
@@ -31,14 +32,10 @@
 #include <fstream>
 #include <iostream>
 #include <memory>
+#include <numeric>
 #include <sstream>
 #include <thread>
 
-#ifndef __STDC_FORMAT_MACROS
-#define __STDC_FORMAT_MACROS
-#endif
-#include <inttypes.h>
-
 #include "benchmark/benchmark.h"
 #include "benchmark_api_internal.h"
 #include "check.h"
@@ -56,10 +53,13 @@
 
 namespace {
 // For non-dense Range, intermediate values are powers of kRangeMultiplier.
-static const int kRangeMultiplier = 8;
+static constexpr int kRangeMultiplier = 8;
+
 // The size of a benchmark family determines is the number of inputs to repeat
 // the benchmark on. If this is "large" then warn the user during configuration.
-static const size_t kMaxFamilySize = 100;
+static constexpr size_t kMaxFamilySize = 100;
+
+static constexpr char kDisabledPrefix[] = "DISABLED_";
 }  // end namespace
 
 namespace internal {
@@ -114,15 +114,15 @@
 bool BenchmarkFamilies::FindBenchmarks(
     std::string spec, std::vector<BenchmarkInstance>* benchmarks,
     std::ostream* ErrStream) {
-  CHECK(ErrStream);
+  BM_CHECK(ErrStream);
   auto& Err = *ErrStream;
   // Make regular expression out of command-line flag
   std::string error_msg;
   Regex re;
-  bool isNegativeFilter = false;
+  bool is_negative_filter = false;
   if (spec[0] == '-') {
     spec.replace(0, 1, "");
-    isNegativeFilter = true;
+    is_negative_filter = true;
   }
   if (!re.Init(spec, &error_msg)) {
     Err << "Could not compile benchmark re: " << error_msg << std::endl;
@@ -132,8 +132,13 @@
   // Special list of thread counts to use when none are specified
   const std::vector<int> one_thread = {1};
 
+  int next_family_index = 0;
+
   MutexLock l(mutex_);
   for (std::unique_ptr<Benchmark>& family : families_) {
+    int family_index = next_family_index;
+    int per_family_instance_index = 0;
+
     // Family was deleted or benchmark doesn't match
     if (!family) continue;
 
@@ -152,85 +157,27 @@
           << " will be repeated at least " << family_size << " times.\n";
     }
     // reserve in the special case the regex ".", since we know the final
-    // family size.
-    if (spec == ".") benchmarks->reserve(family_size);
+    // family size. This doesn't take into account any disabled benchmarks,
+    // so in the worst case we reserve more than we need.
+    if (spec == ".") benchmarks->reserve(benchmarks->size() + family_size);
 
     for (auto const& args : family->args_) {
       for (int num_threads : *thread_counts) {
-        BenchmarkInstance instance;
-        instance.name.function_name = family->name_;
-        instance.benchmark = family.get();
-        instance.aggregation_report_mode = family->aggregation_report_mode_;
-        instance.arg = args;
-        instance.time_unit = family->time_unit_;
-        instance.range_multiplier = family->range_multiplier_;
-        instance.min_time = family->min_time_;
-        instance.iterations = family->iterations_;
-        instance.repetitions = family->repetitions_;
-        instance.measure_process_cpu_time = family->measure_process_cpu_time_;
-        instance.use_real_time = family->use_real_time_;
-        instance.use_manual_time = family->use_manual_time_;
-        instance.complexity = family->complexity_;
-        instance.complexity_lambda = family->complexity_lambda_;
-        instance.statistics = &family->statistics_;
-        instance.threads = num_threads;
+        BenchmarkInstance instance(family.get(), family_index,
+                                   per_family_instance_index, args,
+                                   num_threads);
 
-        // Add arguments to instance name
-        size_t arg_i = 0;
-        for (auto const& arg : args) {
-          if (!instance.name.args.empty()) {
-            instance.name.args += '/';
-          }
-
-          if (arg_i < family->arg_names_.size()) {
-            const auto& arg_name = family->arg_names_[arg_i];
-            if (!arg_name.empty()) {
-              instance.name.args += StrFormat("%s:", arg_name.c_str());
-            }
-          }
-
-          instance.name.args += StrFormat("%" PRId64, arg);
-          ++arg_i;
-        }
-
-        if (!IsZero(family->min_time_))
-          instance.name.min_time =
-              StrFormat("min_time:%0.3f", family->min_time_);
-        if (family->iterations_ != 0) {
-          instance.name.iterations =
-              StrFormat("iterations:%lu",
-                        static_cast<unsigned long>(family->iterations_));
-        }
-        if (family->repetitions_ != 0)
-          instance.name.repetitions =
-              StrFormat("repeats:%d", family->repetitions_);
-
-        if (family->measure_process_cpu_time_) {
-          instance.name.time_type = "process_time";
-        }
-
-        if (family->use_manual_time_) {
-          if (!instance.name.time_type.empty()) {
-            instance.name.time_type += '/';
-          }
-          instance.name.time_type += "manual_time";
-        } else if (family->use_real_time_) {
-          if (!instance.name.time_type.empty()) {
-            instance.name.time_type += '/';
-          }
-          instance.name.time_type += "real_time";
-        }
-
-        // Add the number of threads used to the name
-        if (!family->thread_counts_.empty()) {
-          instance.name.threads = StrFormat("threads:%d", instance.threads);
-        }
-
-        const auto full_name = instance.name.str();
-        if ((re.Match(full_name) && !isNegativeFilter) ||
-            (!re.Match(full_name) && isNegativeFilter)) {
-          instance.last_benchmark_instance = (&args == &family->args_.back());
+        const auto full_name = instance.name().str();
+        if (full_name.rfind(kDisabledPrefix, 0) != 0 &&
+            ((re.Match(full_name) && !is_negative_filter) ||
+             (!re.Match(full_name) && is_negative_filter))) {
           benchmarks->push_back(std::move(instance));
+
+          ++per_family_instance_index;
+
+          // Only bump the next family index once we've established that
+          // at least one instance of this family will be run.
+          if (next_family_index == family_index) ++next_family_index;
         }
       }
     }
@@ -257,39 +204,50 @@
 //                               Benchmark
 //=============================================================================//
 
-Benchmark::Benchmark(const char* name)
+Benchmark::Benchmark(const std::string& name)
     : name_(name),
       aggregation_report_mode_(ARM_Unspecified),
-      time_unit_(kNanosecond),
+      time_unit_(GetDefaultTimeUnit()),
+      use_default_time_unit_(true),
       range_multiplier_(kRangeMultiplier),
       min_time_(0),
+      min_warmup_time_(0),
       iterations_(0),
       repetitions_(0),
       measure_process_cpu_time_(false),
       use_real_time_(false),
       use_manual_time_(false),
       complexity_(oNone),
-      complexity_lambda_(nullptr) {
+      complexity_lambda_(nullptr),
+      setup_(nullptr),
+      teardown_(nullptr) {
   ComputeStatistics("mean", StatisticsMean);
   ComputeStatistics("median", StatisticsMedian);
   ComputeStatistics("stddev", StatisticsStdDev);
+  ComputeStatistics("cv", StatisticsCV, kPercentage);
 }
 
 Benchmark::~Benchmark() {}
 
+Benchmark* Benchmark::Name(const std::string& name) {
+  SetName(name);
+  return this;
+}
+
 Benchmark* Benchmark::Arg(int64_t x) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
   args_.push_back({x});
   return this;
 }
 
 Benchmark* Benchmark::Unit(TimeUnit unit) {
   time_unit_ = unit;
+  use_default_time_unit_ = false;
   return this;
 }
 
 Benchmark* Benchmark::Range(int64_t start, int64_t limit) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
   std::vector<int64_t> arglist;
   AddRange(&arglist, start, limit, range_multiplier_);
 
@@ -301,53 +259,61 @@
 
 Benchmark* Benchmark::Ranges(
     const std::vector<std::pair<int64_t, int64_t>>& ranges) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(ranges.size()));
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(ranges.size()));
   std::vector<std::vector<int64_t>> arglists(ranges.size());
-  std::size_t total = 1;
   for (std::size_t i = 0; i < ranges.size(); i++) {
     AddRange(&arglists[i], ranges[i].first, ranges[i].second,
              range_multiplier_);
-    total *= arglists[i].size();
   }
 
-  std::vector<std::size_t> ctr(arglists.size(), 0);
+  ArgsProduct(arglists);
 
+  return this;
+}
+
+Benchmark* Benchmark::ArgsProduct(
+    const std::vector<std::vector<int64_t>>& arglists) {
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(arglists.size()));
+
+  std::vector<std::size_t> indices(arglists.size());
+  const std::size_t total = std::accumulate(
+      std::begin(arglists), std::end(arglists), std::size_t{1},
+      [](const std::size_t res, const std::vector<int64_t>& arglist) {
+        return res * arglist.size();
+      });
+  std::vector<int64_t> args;
+  args.reserve(arglists.size());
   for (std::size_t i = 0; i < total; i++) {
-    std::vector<int64_t> tmp;
-    tmp.reserve(arglists.size());
-
-    for (std::size_t j = 0; j < arglists.size(); j++) {
-      tmp.push_back(arglists[j].at(ctr[j]));
+    for (std::size_t arg = 0; arg < arglists.size(); arg++) {
+      args.push_back(arglists[arg][indices[arg]]);
     }
+    args_.push_back(args);
+    args.clear();
 
-    args_.push_back(std::move(tmp));
-
-    for (std::size_t j = 0; j < arglists.size(); j++) {
-      if (ctr[j] + 1 < arglists[j].size()) {
-        ++ctr[j];
-        break;
-      }
-      ctr[j] = 0;
-    }
+    std::size_t arg = 0;
+    do {
+      indices[arg] = (indices[arg] + 1) % arglists[arg].size();
+    } while (indices[arg++] == 0 && arg < arglists.size());
   }
+
   return this;
 }
 
 Benchmark* Benchmark::ArgName(const std::string& name) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
   arg_names_ = {name};
   return this;
 }
 
 Benchmark* Benchmark::ArgNames(const std::vector<std::string>& names) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(names.size()));
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(names.size()));
   arg_names_ = names;
   return this;
 }
 
 Benchmark* Benchmark::DenseRange(int64_t start, int64_t limit, int step) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
-  CHECK_LE(start, limit);
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
+  BM_CHECK_LE(start, limit);
   for (int64_t arg = start; arg <= limit; arg += step) {
     args_.push_back({arg});
   }
@@ -355,7 +321,7 @@
 }
 
 Benchmark* Benchmark::Args(const std::vector<int64_t>& args) {
-  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(args.size()));
+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(args.size()));
   args_.push_back(args);
   return this;
 }
@@ -365,28 +331,48 @@
   return this;
 }
 
+Benchmark* Benchmark::Setup(void (*setup)(const benchmark::State&)) {
+  BM_CHECK(setup != nullptr);
+  setup_ = setup;
+  return this;
+}
+
+Benchmark* Benchmark::Teardown(void (*teardown)(const benchmark::State&)) {
+  BM_CHECK(teardown != nullptr);
+  teardown_ = teardown;
+  return this;
+}
+
 Benchmark* Benchmark::RangeMultiplier(int multiplier) {
-  CHECK(multiplier > 1);
+  BM_CHECK(multiplier > 1);
   range_multiplier_ = multiplier;
   return this;
 }
 
 Benchmark* Benchmark::MinTime(double t) {
-  CHECK(t > 0.0);
-  CHECK(iterations_ == 0);
+  BM_CHECK(t > 0.0);
+  BM_CHECK(iterations_ == 0);
   min_time_ = t;
   return this;
 }
 
+Benchmark* Benchmark::MinWarmUpTime(double t) {
+  BM_CHECK(t >= 0.0);
+  BM_CHECK(iterations_ == 0);
+  min_warmup_time_ = t;
+  return this;
+}
+
 Benchmark* Benchmark::Iterations(IterationCount n) {
-  CHECK(n > 0);
-  CHECK(IsZero(min_time_));
+  BM_CHECK(n > 0);
+  BM_CHECK(IsZero(min_time_));
+  BM_CHECK(IsZero(min_warmup_time_));
   iterations_ = n;
   return this;
 }
 
 Benchmark* Benchmark::Repetitions(int n) {
-  CHECK(n > 0);
+  BM_CHECK(n > 0);
   repetitions_ = n;
   return this;
 }
@@ -419,14 +405,14 @@
 }
 
 Benchmark* Benchmark::UseRealTime() {
-  CHECK(!use_manual_time_)
+  BM_CHECK(!use_manual_time_)
       << "Cannot set UseRealTime and UseManualTime simultaneously.";
   use_real_time_ = true;
   return this;
 }
 
 Benchmark* Benchmark::UseManualTime() {
-  CHECK(!use_real_time_)
+  BM_CHECK(!use_real_time_)
       << "Cannot set UseRealTime and UseManualTime simultaneously.";
   use_manual_time_ = true;
   return this;
@@ -443,21 +429,22 @@
   return this;
 }
 
-Benchmark* Benchmark::ComputeStatistics(std::string name,
-                                        StatisticsFunc* statistics) {
-  statistics_.emplace_back(name, statistics);
+Benchmark* Benchmark::ComputeStatistics(const std::string& name,
+                                        StatisticsFunc* statistics,
+                                        StatisticUnit unit) {
+  statistics_.emplace_back(name, statistics, unit);
   return this;
 }
 
 Benchmark* Benchmark::Threads(int t) {
-  CHECK_GT(t, 0);
+  BM_CHECK_GT(t, 0);
   thread_counts_.push_back(t);
   return this;
 }
 
 Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
-  CHECK_GT(min_threads, 0);
-  CHECK_GE(max_threads, min_threads);
+  BM_CHECK_GT(min_threads, 0);
+  BM_CHECK_GE(max_threads, min_threads);
 
   AddRange(&thread_counts_, min_threads, max_threads, 2);
   return this;
@@ -465,9 +452,9 @@
 
 Benchmark* Benchmark::DenseThreadRange(int min_threads, int max_threads,
                                        int stride) {
-  CHECK_GT(min_threads, 0);
-  CHECK_GE(max_threads, min_threads);
-  CHECK_GE(stride, 1);
+  BM_CHECK_GT(min_threads, 0);
+  BM_CHECK_GE(max_threads, min_threads);
+  BM_CHECK_GE(stride, 1);
 
   for (auto i = min_threads; i < max_threads; i += stride) {
     thread_counts_.push_back(i);
@@ -481,7 +468,9 @@
   return this;
 }
 
-void Benchmark::SetName(const char* name) { name_ = name; }
+void Benchmark::SetName(const std::string& name) { name_ = name; }
+
+const char* Benchmark::GetName() const { return name_.c_str(); }
 
 int Benchmark::ArgsCnt() const {
   if (args_.empty()) {
@@ -491,6 +480,16 @@
   return static_cast<int>(args_.front().size());
 }
 
+const char* Benchmark::GetArgName(int arg) const {
+  BM_CHECK_GE(arg, 0);
+  BM_CHECK_LT(arg, static_cast<int>(arg_names_.size()));
+  return arg_names_[arg].c_str();
+}
+
+TimeUnit Benchmark::GetTimeUnit() const {
+  return use_default_time_unit_ ? GetDefaultTimeUnit() : time_unit_;
+}
+
 //=============================================================================//
 //                            FunctionBenchmark
 //=============================================================================//
@@ -503,4 +502,19 @@
   internal::BenchmarkFamilies::GetInstance()->ClearBenchmarks();
 }
 
+std::vector<int64_t> CreateRange(int64_t lo, int64_t hi, int multi) {
+  std::vector<int64_t> args;
+  internal::AddRange(&args, lo, hi, multi);
+  return args;
+}
+
+std::vector<int64_t> CreateDenseRange(int64_t start, int64_t limit, int step) {
+  BM_CHECK_LE(start, limit);
+  std::vector<int64_t> args;
+  for (int64_t arg = start; arg <= limit; arg += step) {
+    args.push_back(arg);
+  }
+  return args;
+}
+
 }  // end namespace benchmark
diff --git a/third_party/google_benchmark/src/benchmark_register.h b/third_party/google_benchmark/src/src/benchmark_register.h
similarity index 81%
rename from third_party/google_benchmark/src/benchmark_register.h
rename to third_party/google_benchmark/src/src/benchmark_register.h
index 204bf1d..53367c7 100644
--- a/third_party/google_benchmark/src/benchmark_register.h
+++ b/third_party/google_benchmark/src/src/benchmark_register.h
@@ -1,6 +1,7 @@
 #ifndef BENCHMARK_REGISTER_H
 #define BENCHMARK_REGISTER_H
 
+#include <algorithm>
 #include <limits>
 #include <vector>
 
@@ -12,18 +13,18 @@
 // Append the powers of 'mult' in the closed interval [lo, hi].
 // Returns iterator to the start of the inserted range.
 template <typename T>
-typename std::vector<T>::iterator
-AddPowers(std::vector<T>* dst, T lo, T hi, int mult) {
-  CHECK_GE(lo, 0);
-  CHECK_GE(hi, lo);
-  CHECK_GE(mult, 2);
+typename std::vector<T>::iterator AddPowers(std::vector<T>* dst, T lo, T hi,
+                                            int mult) {
+  BM_CHECK_GE(lo, 0);
+  BM_CHECK_GE(hi, lo);
+  BM_CHECK_GE(mult, 2);
 
   const size_t start_offset = dst->size();
 
   static const T kmax = std::numeric_limits<T>::max();
 
   // Space out the values in multiples of "mult"
-  for (T i = 1; i <= hi; i *= mult) {
+  for (T i = static_cast<T>(1); i <= hi; i *= static_cast<T>(mult)) {
     if (i >= lo) {
       dst->push_back(i);
     }
@@ -32,16 +33,16 @@
     if (i > kmax / mult) break;
   }
 
-  return dst->begin() + start_offset;
+  return dst->begin() + static_cast<int>(start_offset);
 }
 
 template <typename T>
 void AddNegatedPowers(std::vector<T>* dst, T lo, T hi, int mult) {
   // We negate lo and hi so we require that they cannot be equal to 'min'.
-  CHECK_GT(lo, std::numeric_limits<T>::min());
-  CHECK_GT(hi, std::numeric_limits<T>::min());
-  CHECK_GE(hi, lo);
-  CHECK_LE(hi, 0);
+  BM_CHECK_GT(lo, std::numeric_limits<T>::min());
+  BM_CHECK_GT(hi, std::numeric_limits<T>::min());
+  BM_CHECK_GE(hi, lo);
+  BM_CHECK_LE(hi, 0);
 
   // Add positive powers, then negate and reverse.
   // Casts necessary since small integers get promoted
@@ -60,8 +61,8 @@
   static_assert(std::is_integral<T>::value && std::is_signed<T>::value,
                 "Args type must be a signed integer");
 
-  CHECK_GE(hi, lo);
-  CHECK_GE(mult, 2);
+  BM_CHECK_GE(hi, lo);
+  BM_CHECK_GE(mult, 2);
 
   // Add "lo"
   dst->push_back(lo);
@@ -87,7 +88,7 @@
   }
 
   // Treat 0 as a special case (see discussion on #762).
-  if (lo <= 0 && hi >= 0) {
+  if (lo < 0 && hi >= 0) {
     dst->push_back(0);
   }
 
diff --git a/third_party/google_benchmark/src/src/benchmark_runner.cc b/third_party/google_benchmark/src/src/benchmark_runner.cc
new file mode 100644
index 0000000..f7ae424
--- /dev/null
+++ b/third_party/google_benchmark/src/src/benchmark_runner.cc
@@ -0,0 +1,497 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark_runner.h"
+
+#include "benchmark/benchmark.h"
+#include "benchmark_api_internal.h"
+#include "internal_macros.h"
+
+#ifndef BENCHMARK_OS_WINDOWS
+#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
+#include <sys/resource.h>
+#endif
+#include <sys/time.h>
+#include <unistd.h>
+#endif
+
+#include <algorithm>
+#include <atomic>
+#include <climits>
+#include <cmath>
+#include <condition_variable>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <string>
+#include <thread>
+#include <utility>
+
+#include "check.h"
+#include "colorprint.h"
+#include "commandlineflags.h"
+#include "complexity.h"
+#include "counter.h"
+#include "internal_macros.h"
+#include "log.h"
+#include "mutex.h"
+#include "perf_counters.h"
+#include "re.h"
+#include "statistics.h"
+#include "string_util.h"
+#include "thread_manager.h"
+#include "thread_timer.h"
+
+namespace benchmark {
+
+namespace internal {
+
+MemoryManager* memory_manager = nullptr;
+
+namespace {
+
+static constexpr IterationCount kMaxIterations = 1000000000;
+const double kDefaultMinTime =
+    std::strtod(::benchmark::kDefaultMinTimeStr, /*p_end*/ nullptr);
+
+BenchmarkReporter::Run CreateRunReport(
+    const benchmark::internal::BenchmarkInstance& b,
+    const internal::ThreadManager::Result& results,
+    IterationCount memory_iterations,
+    const MemoryManager::Result* memory_result, double seconds,
+    int64_t repetition_index, int64_t repeats) {
+  // Create report about this benchmark run.
+  BenchmarkReporter::Run report;
+
+  report.run_name = b.name();
+  report.family_index = b.family_index();
+  report.per_family_instance_index = b.per_family_instance_index();
+  report.skipped = results.skipped_;
+  report.skip_message = results.skip_message_;
+  report.report_label = results.report_label_;
+  // This is the total iterations across all threads.
+  report.iterations = results.iterations;
+  report.time_unit = b.time_unit();
+  report.threads = b.threads();
+  report.repetition_index = repetition_index;
+  report.repetitions = repeats;
+
+  if (!report.skipped) {
+    if (b.use_manual_time()) {
+      report.real_accumulated_time = results.manual_time_used;
+    } else {
+      report.real_accumulated_time = results.real_time_used;
+    }
+    report.cpu_accumulated_time = results.cpu_time_used;
+    report.complexity_n = results.complexity_n;
+    report.complexity = b.complexity();
+    report.complexity_lambda = b.complexity_lambda();
+    report.statistics = &b.statistics();
+    report.counters = results.counters;
+
+    if (memory_iterations > 0) {
+      assert(memory_result != nullptr);
+      report.memory_result = memory_result;
+      report.allocs_per_iter =
+          memory_iterations ? static_cast<double>(memory_result->num_allocs) /
+                                  memory_iterations
+                            : 0;
+    }
+
+    internal::Finish(&report.counters, results.iterations, seconds,
+                     b.threads());
+  }
+  return report;
+}
+
+// Execute one thread of benchmark b for the specified number of iterations.
+// Adds the stats collected for the thread into manager->results.
+void RunInThread(const BenchmarkInstance* b, IterationCount iters,
+                 int thread_id, ThreadManager* manager,
+                 PerfCountersMeasurement* perf_counters_measurement) {
+  internal::ThreadTimer timer(
+      b->measure_process_cpu_time()
+          ? internal::ThreadTimer::CreateProcessCpuTime()
+          : internal::ThreadTimer::Create());
+
+  State st =
+      b->Run(iters, thread_id, &timer, manager, perf_counters_measurement);
+  BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations)
+      << "Benchmark returned before State::KeepRunning() returned false!";
+  {
+    MutexLock l(manager->GetBenchmarkMutex());
+    internal::ThreadManager::Result& results = manager->results;
+    results.iterations += st.iterations();
+    results.cpu_time_used += timer.cpu_time_used();
+    results.real_time_used += timer.real_time_used();
+    results.manual_time_used += timer.manual_time_used();
+    results.complexity_n += st.complexity_length_n();
+    internal::Increment(&results.counters, st.counters);
+  }
+  manager->NotifyThreadComplete();
+}
+
+double ComputeMinTime(const benchmark::internal::BenchmarkInstance& b,
+                      const BenchTimeType& iters_or_time) {
+  if (!IsZero(b.min_time())) return b.min_time();
+  // If the flag was used to specify number of iters, then return the default
+  // min_time.
+  if (iters_or_time.tag == BenchTimeType::ITERS) return kDefaultMinTime;
+
+  return iters_or_time.time;
+}
+
+IterationCount ComputeIters(const benchmark::internal::BenchmarkInstance& b,
+                            const BenchTimeType& iters_or_time) {
+  if (b.iterations() != 0) return b.iterations();
+
+  // We've already concluded that this flag is currently used to pass
+  // iters but do a check here again anyway.
+  BM_CHECK(iters_or_time.tag == BenchTimeType::ITERS);
+  return iters_or_time.iters;
+}
+
+}  // end namespace
+
+BenchTimeType ParseBenchMinTime(const std::string& value) {
+  BenchTimeType ret;
+  // An empty value yields a TIME result of 0.0 seconds.
+  if (value.empty()) {
+    ret.tag = BenchTimeType::TIME;
+    ret.time = 0.0;
+    return ret;
+  }
+  // A trailing 'x' marks an explicit iteration count (<integer>x).
+  if (value.back() == 'x') {
+    char* p_end;
+    // Reset errno before it's changed by strtol.
+    errno = 0;
+    IterationCount num_iters = std::strtol(value.c_str(), &p_end, 10);
+
+    // After a valid parse, p_end should have been set to
+    // point to the 'x' suffix.
+    BM_CHECK(errno == 0 && p_end != nullptr && *p_end == 'x')
+        << "Malformed iters value passed to --benchmark_min_time: `" << value
+        << "`. Expected --benchmark_min_time=<integer>x.";
+
+    ret.tag = BenchTimeType::ITERS;
+    ret.iters = num_iters;
+    return ret;
+  }
+  // Otherwise the value is a number of seconds, optionally suffixed by 's'.
+  bool has_suffix = value.back() == 's';
+  if (!has_suffix) {
+    BM_VLOG(0) << "Value passed to --benchmark_min_time should have a suffix. "
+                  "Eg., `30s` for 30-seconds.";
+  }
+
+  char* p_end;
+  // Reset errno before it's changed by strtod.
+  errno = 0;
+  double min_time = std::strtod(value.c_str(), &p_end);
+
+  // After a successful parse, p_end should point to the suffix 's',
+  // or the end of the string if the suffix was omitted.
+  BM_CHECK(errno == 0 && p_end != nullptr &&
+           ((has_suffix && *p_end == 's') || *p_end == '\0'))
+      << "Malformed seconds value passed to --benchmark_min_time: `" << value
+      << "`. Expected --benchmark_min_time=<float>s.";
+
+  ret.tag = BenchTimeType::TIME;
+  ret.time = min_time;
+
+  return ret;
+}
+
+BenchmarkRunner::BenchmarkRunner(
+    const benchmark::internal::BenchmarkInstance& b_,
+    PerfCountersMeasurement* pcm_,
+    BenchmarkReporter::PerFamilyRunReports* reports_for_family_)
+    : b(b_),
+      reports_for_family(reports_for_family_),
+      parsed_benchtime_flag(ParseBenchMinTime(FLAGS_benchmark_min_time)),
+      min_time(ComputeMinTime(b_, parsed_benchtime_flag)),
+      min_warmup_time((!IsZero(b.min_time()) && b.min_warmup_time() > 0.0)
+                          ? b.min_warmup_time()
+                          : FLAGS_benchmark_min_warmup_time),
+      warmup_done(!(min_warmup_time > 0.0)),
+      repeats(b.repetitions() != 0 ? b.repetitions()
+                                   : FLAGS_benchmark_repetitions),
+      has_explicit_iteration_count(b.iterations() != 0 ||
+                                   parsed_benchtime_flag.tag ==
+                                       BenchTimeType::ITERS),
+      pool(b.threads() - 1),
+      iters(has_explicit_iteration_count
+                ? ComputeIters(b_, parsed_benchtime_flag)
+                : 1),
+      perf_counters_measurement_ptr(pcm_) {
+  run_results.display_report_aggregates_only =
+      (FLAGS_benchmark_report_aggregates_only ||
+       FLAGS_benchmark_display_aggregates_only);
+  run_results.file_report_aggregates_only =
+      FLAGS_benchmark_report_aggregates_only;
+  if (b.aggregation_report_mode() != internal::ARM_Unspecified) {
+    run_results.display_report_aggregates_only =
+        (b.aggregation_report_mode() &
+         internal::ARM_DisplayReportAggregatesOnly);
+    run_results.file_report_aggregates_only =
+        (b.aggregation_report_mode() & internal::ARM_FileReportAggregatesOnly);
+    BM_CHECK(FLAGS_benchmark_perf_counters.empty() ||
+             (perf_counters_measurement_ptr->num_counters() == 0))
+        << "Perf counters were requested but could not be set up.";
+  }
+}
+
+BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
+  BM_VLOG(2) << "Running " << b.name().str() << " for " << iters << "\n";
+
+  std::unique_ptr<internal::ThreadManager> manager;
+  manager.reset(new internal::ThreadManager(b.threads()));
+
+  // Run all but one thread in separate threads
+  for (std::size_t ti = 0; ti < pool.size(); ++ti) {
+    pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
+                           manager.get(), perf_counters_measurement_ptr);
+  }
+  // And run one thread here directly.
+  // (If we were asked to run just one thread, we don't create new threads.)
+  // Yes, we need to do this here *after* we start the separate threads.
+  RunInThread(&b, iters, 0, manager.get(), perf_counters_measurement_ptr);
+
+  // The main thread has finished. Now let's wait for the other threads.
+  manager->WaitForAllThreads();
+  for (std::thread& thread : pool) thread.join();
+
+  IterationResults i;
+  // Acquire the measurements/counters from the manager, UNDER THE LOCK!
+  {
+    MutexLock l(manager->GetBenchmarkMutex());
+    i.results = manager->results;
+  }
+
+  // And get rid of the manager.
+  manager.reset();
+
+  // Adjust real/manual time stats since they were reported per thread.
+  i.results.real_time_used /= b.threads();
+  i.results.manual_time_used /= b.threads();
+  // If we were measuring whole-process CPU usage, adjust the CPU time too.
+  if (b.measure_process_cpu_time()) i.results.cpu_time_used /= b.threads();
+
+  BM_VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
+             << i.results.real_time_used << "\n";
+
+  // By using KeepRunningBatch a benchmark can iterate more times than
+  // requested, so take the iteration count from i.results.
+  i.iters = i.results.iterations / b.threads();
+
+  // Base decisions off of real time if requested by this benchmark.
+  i.seconds = i.results.cpu_time_used;
+  if (b.use_manual_time()) {
+    i.seconds = i.results.manual_time_used;
+  } else if (b.use_real_time()) {
+    i.seconds = i.results.real_time_used;
+  }
+
+  return i;
+}
+
+IterationCount BenchmarkRunner::PredictNumItersNeeded(
+    const IterationResults& i) const {
+  // See how much iterations should be increased by.
+  // Note: Avoid division by zero with max(seconds, 1ns).
+  double multiplier = GetMinTimeToApply() * 1.4 / std::max(i.seconds, 1e-9);
+  // If our last run was at least 10% of FLAGS_benchmark_min_time then we
+  // use the multiplier directly.
+  // Otherwise we use at most 10 times expansion.
+  // NOTE: When the last run was at least 10% of the min time the max
+  // expansion should be 14x.
+  const bool is_significant = (i.seconds / GetMinTimeToApply()) > 0.1;
+  multiplier = is_significant ? multiplier : 10.0;
+
+  // So what seems to be the sufficiently-large iteration count? Round up.
+  const IterationCount max_next_iters = static_cast<IterationCount>(
+      std::lround(std::max(multiplier * static_cast<double>(i.iters),
+                           static_cast<double>(i.iters) + 1.0)));
+  // But we do have *some* limits though..
+  const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);
+
+  BM_VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
+  return next_iters;  // round up before conversion to integer.
+}
+
+bool BenchmarkRunner::ShouldReportIterationResults(
+    const IterationResults& i) const {
+  // Determine if this run should be reported;
+  // Either it has run for a sufficient amount of time
+  // or because an error was reported.
+  return i.results.skipped_ ||
+         i.iters >= kMaxIterations ||  // Too many iterations already.
+         i.seconds >=
+             GetMinTimeToApply() ||  // The elapsed time is large enough.
+         // CPU time is specified but the elapsed real time greatly exceeds
+         // the minimum time.
+         // Note that user provided timers are exempt from this test.
+         ((i.results.real_time_used >= 5 * GetMinTimeToApply()) &&
+          !b.use_manual_time());
+}
+
+double BenchmarkRunner::GetMinTimeToApply() const {
+  // In order to re-use functionality to run and measure benchmarks for running
+  // a warmup phase of the benchmark, we need a way of telling whether to apply
+  // min_time or min_warmup_time. This function will figure out if we are in the
+  // warmup phase and therefore need to apply min_warmup_time, or if we are
+  // already in the benchmarking phase and min_time needs to be applied.
+  return warmup_done ? min_time : min_warmup_time;
+}
+
+void BenchmarkRunner::FinishWarmUp(const IterationCount& i) {
+  warmup_done = true;
+  iters = i;
+}
+
+void BenchmarkRunner::RunWarmUp() {
+  // Use the same mechanisms for warming up the benchmark as used for actually
+  // running and measuring the benchmark.
+  IterationResults i_warmup;
+  // Don't use the iterations determined in the warmup phase for the actual
+  // measured benchmark phase. While this may be a good starting point for the
+  // benchmark and it would therefore get rid of the need to figure out how many
+  // iterations are needed if min_time is set again, this may also be a
+  // completely wrong guess since the warmup loops might be considerably slower
+  // (e.g. because of caching effects).
+  const IterationCount i_backup = iters;
+
+  for (;;) {
+    b.Setup();
+    i_warmup = DoNIterations();
+    b.Teardown();
+
+    const bool finish = ShouldReportIterationResults(i_warmup);
+
+    if (finish) {
+      FinishWarmUp(i_backup);
+      break;
+    }
+
+    // Although we are running "only" a warmup phase where running enough
+    // iterations at once without measuring time isn't as important as it is for
+    // the benchmarking phase, we still do it the same way as otherwise it is
+    // very confusing for the user to know how to choose a proper value for
+    // min_warmup_time if a different approach on running it is used.
+    iters = PredictNumItersNeeded(i_warmup);
+    assert(iters > i_warmup.iters &&
+           "if we did more iterations than we want to do the next time, "
+           "then we should have accepted the current iteration run.");
+  }
+}
+
+void BenchmarkRunner::DoOneRepetition() {
+  assert(HasRepeatsRemaining() && "Already done all repetitions?");
+
+  const bool is_the_first_repetition = num_repetitions_done == 0;
+
+  // In case a warmup phase is requested by the benchmark, run it now.
+  // After running the warmup phase the BenchmarkRunner should be in a state as
+  // if the warmup never happened, except that warmup_done is set. Every
+  // other manipulation of the BenchmarkRunner instance would be a bug! Please
+  // fix it.
+  if (!warmup_done) RunWarmUp();
+
+  IterationResults i;
+  // We *may* be gradually increasing the length (iteration count)
+  // of the benchmark until we decide the results are significant.
+  // And once we do, we report those last results and exit.
+  // Please do note that if there are repetitions, the iteration count
+  // is *only* calculated for the *first* repetition, and other repetitions
+  // simply use that precomputed iteration count.
+  for (;;) {
+    b.Setup();
+    i = DoNIterations();
+    b.Teardown();
+
+    // Do we consider the results to be significant?
+    // If we are doing repetitions, and the first repetition was already done,
+    // it has calculated the correct iteration time, so we have run that very
+    // iteration count just now. No need to calculate anything. Just report.
+    // Else, the normal rules apply.
+    const bool results_are_significant = !is_the_first_repetition ||
+                                         has_explicit_iteration_count ||
+                                         ShouldReportIterationResults(i);
+
+    if (results_are_significant) break;  // Good, let's report them!
+
+    // Nope, bad iteration. Let's re-estimate the hopefully-sufficient
+    // iteration count, and run the benchmark again...
+
+    iters = PredictNumItersNeeded(i);
+    assert(iters > i.iters &&
+           "if we did more iterations than we want to do the next time, "
+           "then we should have accepted the current iteration run.");
+  }
+
+  // Oh, one last thing, we need to also produce the 'memory measurements'..
+  MemoryManager::Result* memory_result = nullptr;
+  IterationCount memory_iterations = 0;
+  if (memory_manager != nullptr) {
+    // TODO(vyng): Consider making BenchmarkReporter::Run::memory_result an
+    // optional so we don't have to own the Result here.
+    // Can't do it now due to cxx03.
+    memory_results.push_back(MemoryManager::Result());
+    memory_result = &memory_results.back();
+    // Only run a few iterations to reduce the impact of one-time
+    // allocations in benchmarks that are not properly managed.
+    memory_iterations = std::min<IterationCount>(16, iters);
+    memory_manager->Start();
+    std::unique_ptr<internal::ThreadManager> manager;
+    manager.reset(new internal::ThreadManager(1));
+    b.Setup();
+    RunInThread(&b, memory_iterations, 0, manager.get(),
+                perf_counters_measurement_ptr);
+    manager->WaitForAllThreads();
+    manager.reset();
+    b.Teardown();
+    memory_manager->Stop(*memory_result);
+  }
+
+  // Ok, now actually report.
+  BenchmarkReporter::Run report =
+      CreateRunReport(b, i.results, memory_iterations, memory_result, i.seconds,
+                      num_repetitions_done, repeats);
+
+  if (reports_for_family) {
+    ++reports_for_family->num_runs_done;
+    if (!report.skipped) reports_for_family->Runs.push_back(report);
+  }
+
+  run_results.non_aggregates.push_back(report);
+
+  ++num_repetitions_done;
+}
+
+RunResults&& BenchmarkRunner::GetResults() {
+  assert(!HasRepeatsRemaining() && "Did not run all repetitions yet?");
+
+  // Calculate additional statistics over the repetitions of this instance.
+  run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
+
+  return std::move(run_results);
+}
+
+}  // end namespace internal
+
+}  // end namespace benchmark
diff --git a/third_party/google_benchmark/src/src/benchmark_runner.h b/third_party/google_benchmark/src/src/benchmark_runner.h
new file mode 100644
index 0000000..db2fa04
--- /dev/null
+++ b/third_party/google_benchmark/src/src/benchmark_runner.h
@@ -0,0 +1,131 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef BENCHMARK_RUNNER_H_
+#define BENCHMARK_RUNNER_H_
+
+#include <thread>
+#include <vector>
+
+#include "benchmark_api_internal.h"
+#include "internal_macros.h"
+#include "perf_counters.h"
+#include "thread_manager.h"
+
+namespace benchmark {
+
+BM_DECLARE_string(benchmark_min_time);
+BM_DECLARE_double(benchmark_min_warmup_time);
+BM_DECLARE_int32(benchmark_repetitions);
+BM_DECLARE_bool(benchmark_report_aggregates_only);
+BM_DECLARE_bool(benchmark_display_aggregates_only);
+BM_DECLARE_string(benchmark_perf_counters);
+
+namespace internal {
+
+extern MemoryManager* memory_manager;
+
+struct RunResults {
+  std::vector<BenchmarkReporter::Run> non_aggregates;
+  std::vector<BenchmarkReporter::Run> aggregates_only;
+
+  bool display_report_aggregates_only = false;
+  bool file_report_aggregates_only = false;
+};
+
+struct BENCHMARK_EXPORT BenchTimeType {
+  enum { ITERS, TIME } tag;
+  union {
+    IterationCount iters;
+    double time;
+  };
+};
+
+BENCHMARK_EXPORT
+BenchTimeType ParseBenchMinTime(const std::string& value);
+
+class BenchmarkRunner {
+ public:
+  BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
+                  benchmark::internal::PerfCountersMeasurement* pmc_,
+                  BenchmarkReporter::PerFamilyRunReports* reports_for_family);
+
+  int GetNumRepeats() const { return repeats; }
+
+  bool HasRepeatsRemaining() const {
+    return GetNumRepeats() != num_repetitions_done;
+  }
+
+  void DoOneRepetition();
+
+  RunResults&& GetResults();
+
+  BenchmarkReporter::PerFamilyRunReports* GetReportsForFamily() const {
+    return reports_for_family;
+  }
+
+  double GetMinTime() const { return min_time; }
+
+  bool HasExplicitIters() const { return has_explicit_iteration_count; }
+
+  IterationCount GetIters() const { return iters; }
+
+ private:
+  RunResults run_results;
+
+  const benchmark::internal::BenchmarkInstance& b;
+  BenchmarkReporter::PerFamilyRunReports* reports_for_family;
+
+  BenchTimeType parsed_benchtime_flag;
+  const double min_time;
+  const double min_warmup_time;
+  bool warmup_done;
+  const int repeats;
+  const bool has_explicit_iteration_count;
+
+  int num_repetitions_done = 0;
+
+  std::vector<std::thread> pool;
+
+  std::vector<MemoryManager::Result> memory_results;
+
+  IterationCount iters;  // preserved between repetitions!
+  // So only the first repetition has to find/calculate it,
+  // the other repetitions will just use that precomputed iteration count.
+
+  PerfCountersMeasurement* const perf_counters_measurement_ptr = nullptr;
+
+  struct IterationResults {
+    internal::ThreadManager::Result results;
+    IterationCount iters;
+    double seconds;
+  };
+  IterationResults DoNIterations();
+
+  IterationCount PredictNumItersNeeded(const IterationResults& i) const;
+
+  bool ShouldReportIterationResults(const IterationResults& i) const;
+
+  double GetMinTimeToApply() const;
+
+  void FinishWarmUp(const IterationCount& i);
+
+  void RunWarmUp();
+};
+
+}  // namespace internal
+
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_RUNNER_H_
diff --git a/third_party/google_benchmark/src/src/check.cc b/third_party/google_benchmark/src/src/check.cc
new file mode 100644
index 0000000..5f7526e
--- /dev/null
+++ b/third_party/google_benchmark/src/src/check.cc
@@ -0,0 +1,11 @@
+#include "check.h"
+
+namespace benchmark {
+namespace internal {
+
+static AbortHandlerT* handler = &std::abort;
+
+BENCHMARK_EXPORT AbortHandlerT*& GetAbortHandler() { return handler; }
+
+}  // namespace internal
+}  // namespace benchmark
diff --git a/third_party/google_benchmark/src/src/check.h b/third_party/google_benchmark/src/src/check.h
new file mode 100644
index 0000000..c1cd5e8
--- /dev/null
+++ b/third_party/google_benchmark/src/src/check.h
@@ -0,0 +1,106 @@
+#ifndef CHECK_H_
+#define CHECK_H_
+
+#include <cmath>
+#include <cstdlib>
+#include <ostream>
+
+#include "benchmark/export.h"
+#include "internal_macros.h"
+#include "log.h"
+
+#if defined(__GNUC__) || defined(__clang__)
+#define BENCHMARK_NOEXCEPT noexcept
+#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
+#elif defined(_MSC_VER) && !defined(__clang__)
+#if _MSC_VER >= 1900
+#define BENCHMARK_NOEXCEPT noexcept
+#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
+#else
+#define BENCHMARK_NOEXCEPT
+#define BENCHMARK_NOEXCEPT_OP(x)
+#endif
+#define __func__ __FUNCTION__
+#else
+#define BENCHMARK_NOEXCEPT
+#define BENCHMARK_NOEXCEPT_OP(x)
+#endif
+
+namespace benchmark {
+namespace internal {
+
+typedef void(AbortHandlerT)();
+
+BENCHMARK_EXPORT
+AbortHandlerT*& GetAbortHandler();
+
+BENCHMARK_NORETURN inline void CallAbortHandler() {
+  GetAbortHandler()();
+  std::abort();  // fallback to enforce noreturn
+}
+
+// CheckHandler is the class constructed by failing BM_CHECK macros.
+// CheckHandler will log information about the failures and abort when it is
+// destructed.
+class CheckHandler {
+ public:
+  CheckHandler(const char* check, const char* file, const char* func, int line)
+      : log_(GetErrorLogInstance()) {
+    log_ << file << ":" << line << ": " << func << ": Check `" << check
+         << "' failed. ";
+  }
+
+  LogType& GetLog() { return log_; }
+
+#if defined(COMPILER_MSVC)
+#pragma warning(push)
+#pragma warning(disable : 4722)
+#endif
+  BENCHMARK_NORETURN ~CheckHandler() BENCHMARK_NOEXCEPT_OP(false) {
+    log_ << std::endl;
+    CallAbortHandler();
+  }
+#if defined(COMPILER_MSVC)
+#pragma warning(pop)
+#endif
+
+  CheckHandler& operator=(const CheckHandler&) = delete;
+  CheckHandler(const CheckHandler&) = delete;
+  CheckHandler() = delete;
+
+ private:
+  LogType& log_;
+};
+
+}  // end namespace internal
+}  // end namespace benchmark
+
+// The BM_CHECK macro returns a std::ostream object that can have extra
+// information written to it. The condition is parenthesized for `?:` safety.
+#ifndef NDEBUG
+#define BM_CHECK(b)                                                          \
+  ((b) ? ::benchmark::internal::GetNullLogInstance()                         \
+     : ::benchmark::internal::CheckHandler(#b, __FILE__, __func__, __LINE__) \
+           .GetLog())
+#else
+#define BM_CHECK(b) ::benchmark::internal::GetNullLogInstance()
+#endif
+
+// clang-format off
+// preserve whitespacing between operators for alignment
+#define BM_CHECK_EQ(a, b) BM_CHECK((a) == (b))
+#define BM_CHECK_NE(a, b) BM_CHECK((a) != (b))
+#define BM_CHECK_GE(a, b) BM_CHECK((a) >= (b))
+#define BM_CHECK_LE(a, b) BM_CHECK((a) <= (b))
+#define BM_CHECK_GT(a, b) BM_CHECK((a) > (b))
+#define BM_CHECK_LT(a, b) BM_CHECK((a) < (b))
+
+#define BM_CHECK_FLOAT_EQ(a, b, eps) BM_CHECK(std::fabs((a) - (b)) <  (eps))
+#define BM_CHECK_FLOAT_NE(a, b, eps) BM_CHECK(std::fabs((a) - (b)) >= (eps))
+#define BM_CHECK_FLOAT_GE(a, b, eps) BM_CHECK((a) - (b) > -(eps))
+#define BM_CHECK_FLOAT_LE(a, b, eps) BM_CHECK((b) - (a) > -(eps))
+#define BM_CHECK_FLOAT_GT(a, b, eps) BM_CHECK((a) - (b) >  (eps))
+#define BM_CHECK_FLOAT_LT(a, b, eps) BM_CHECK((b) - (a) >  (eps))
+// clang-format on
+
+#endif  // CHECK_H_
diff --git a/third_party/google_benchmark/src/colorprint.cc b/third_party/google_benchmark/src/src/colorprint.cc
similarity index 90%
rename from third_party/google_benchmark/src/colorprint.cc
rename to third_party/google_benchmark/src/src/colorprint.cc
index fff6a98..9a653c5 100644
--- a/third_party/google_benchmark/src/colorprint.cc
+++ b/third_party/google_benchmark/src/src/colorprint.cc
@@ -25,8 +25,8 @@
 #include "internal_macros.h"
 
 #ifdef BENCHMARK_OS_WINDOWS
-#include <windows.h>
 #include <io.h>
+#include <windows.h>
 #else
 #include <unistd.h>
 #endif  // BENCHMARK_OS_WINDOWS
@@ -94,20 +94,20 @@
   va_end(args_cp);
 
   // currently there is no error handling for failure, so this is hack.
-  CHECK(ret >= 0);
+  BM_CHECK(ret >= 0);
 
-  if (ret == 0)  // handle empty expansion
+  if (ret == 0) {  // handle empty expansion
     return {};
-  else if (static_cast<size_t>(ret) < size)
-    return local_buff;
-  else {
-    // we did not provide a long enough buffer on our first attempt.
-    size = (size_t)ret + 1;  // + 1 for the null byte
-    std::unique_ptr<char[]> buff(new char[size]);
-    ret = vsnprintf(buff.get(), size, msg, args);
-    CHECK(ret > 0 && ((size_t)ret) < size);
-    return buff.get();
   }
+  if (static_cast<size_t>(ret) < size) {
+    return local_buff;
+  }
+  // we did not provide a long enough buffer on our first attempt.
+  size = static_cast<size_t>(ret) + 1;  // + 1 for the null byte
+  std::unique_ptr<char[]> buff(new char[size]);
+  ret = vsnprintf(buff.get(), size, msg, args);
+  BM_CHECK(ret > 0 && (static_cast<size_t>(ret)) < size);
+  return buff.get();
 }
 
 std::string FormatString(const char* msg, ...) {
@@ -163,7 +163,7 @@
 #else
   // On non-Windows platforms, we rely on the TERM variable. This list of
   // supported TERM values is copied from Google Test:
-  // <https://github.com/google/googletest/blob/master/googletest/src/gtest.cc#L2925>.
+  // <https://github.com/google/googletest/blob/main/googletest/src/gtest.cc#L2925>.
   const char* const SUPPORTED_TERM_VALUES[] = {
       "xterm",         "xterm-color",     "xterm-256color",
       "screen",        "screen-256color", "tmux",
diff --git a/third_party/google_benchmark/src/colorprint.h b/third_party/google_benchmark/src/src/colorprint.h
similarity index 100%
rename from third_party/google_benchmark/src/colorprint.h
rename to third_party/google_benchmark/src/src/colorprint.h
diff --git a/third_party/google_benchmark/src/commandlineflags.cc b/third_party/google_benchmark/src/src/commandlineflags.cc
similarity index 76%
rename from third_party/google_benchmark/src/commandlineflags.cc
rename to third_party/google_benchmark/src/src/commandlineflags.cc
index 4e60f0b..dcb4149 100644
--- a/third_party/google_benchmark/src/commandlineflags.cc
+++ b/third_party/google_benchmark/src/src/commandlineflags.cc
@@ -20,6 +20,10 @@
 #include <cstring>
 #include <iostream>
 #include <limits>
+#include <map>
+#include <utility>
+
+#include "../src/string_util.h"
 
 namespace benchmark {
 namespace {
@@ -78,6 +82,30 @@
   return true;
 }
 
+// Parses 'str' into KV pairs. If successful, writes the result to *value and
+// returns true; otherwise leaves *value unchanged and returns false.
+bool ParseKvPairs(const std::string& src_text, const char* str,
+                  std::map<std::string, std::string>* value) {
+  std::map<std::string, std::string> kvs;
+  for (const auto& kvpair : StrSplit(str, ',')) {
+    const auto kv = StrSplit(kvpair, '=');
+    if (kv.size() != 2) {
+      std::cerr << src_text << " is expected to be a comma-separated list of "
+                << "<key>=<value> strings, but actually has value \"" << str
+                << "\".\n";
+      return false;
+    }
+    if (!kvs.emplace(kv[0], kv[1]).second) {
+      std::cerr << src_text << " is expected to contain unique keys but key \""
+                << kv[0] << "\" was repeated.\n";
+      return false;
+    }
+  }
+
+  *value = kvs;
+  return true;
+}
+
 // Returns the name of the environment variable corresponding to the
 // given flag.  For example, FlagToEnvVar("foo") will return
 // "BENCHMARK_FOO" in the open-source version.
@@ -88,17 +116,19 @@
   for (size_t i = 0; i != flag_str.length(); ++i)
     env_var += static_cast<char>(::toupper(flag_str.c_str()[i]));
 
-  return "BENCHMARK_" + env_var;
+  return env_var;
 }
 
 }  // namespace
 
+BENCHMARK_EXPORT
 bool BoolFromEnv(const char* flag, bool default_val) {
   const std::string env_var = FlagToEnvVar(flag);
   const char* const value_str = getenv(env_var.c_str());
   return value_str == nullptr ? default_val : IsTruthyFlagValue(value_str);
 }
 
+BENCHMARK_EXPORT
 int32_t Int32FromEnv(const char* flag, int32_t default_val) {
   const std::string env_var = FlagToEnvVar(flag);
   const char* const value_str = getenv(env_var.c_str());
@@ -111,6 +141,7 @@
   return value;
 }
 
+BENCHMARK_EXPORT
 double DoubleFromEnv(const char* flag, double default_val) {
   const std::string env_var = FlagToEnvVar(flag);
   const char* const value_str = getenv(env_var.c_str());
@@ -123,12 +154,28 @@
   return value;
 }
 
+BENCHMARK_EXPORT
 const char* StringFromEnv(const char* flag, const char* default_val) {
   const std::string env_var = FlagToEnvVar(flag);
   const char* const value = getenv(env_var.c_str());
   return value == nullptr ? default_val : value;
 }
 
+BENCHMARK_EXPORT
+std::map<std::string, std::string> KvPairsFromEnv(
+    const char* flag, std::map<std::string, std::string> default_val) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const value_str = getenv(env_var.c_str());
+
+  if (value_str == nullptr) return default_val;
+
+  std::map<std::string, std::string> value;
+  if (!ParseKvPairs("Environment variable " + env_var, value_str, &value)) {
+    return default_val;
+  }
+  return value;
+}
+
 // Parses a string as a command line flag.  The string should have
 // the format "--flag=value".  When def_optional is true, the "=value"
 // part can be omitted.
@@ -159,6 +206,7 @@
   return flag_end + 1;
 }
 
+BENCHMARK_EXPORT
 bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
   // Gets the value of the flag as a string.
   const char* const value_str = ParseFlagValue(str, flag, true);
@@ -171,6 +219,7 @@
   return true;
 }
 
+BENCHMARK_EXPORT
 bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) {
   // Gets the value of the flag as a string.
   const char* const value_str = ParseFlagValue(str, flag, false);
@@ -183,6 +232,7 @@
                     value);
 }
 
+BENCHMARK_EXPORT
 bool ParseDoubleFlag(const char* str, const char* flag, double* value) {
   // Gets the value of the flag as a string.
   const char* const value_str = ParseFlagValue(str, flag, false);
@@ -195,6 +245,7 @@
                      value);
 }
 
+BENCHMARK_EXPORT
 bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
   // Gets the value of the flag as a string.
   const char* const value_str = ParseFlagValue(str, flag, false);
@@ -206,23 +257,42 @@
   return true;
 }
 
+BENCHMARK_EXPORT
+bool ParseKeyValueFlag(const char* str, const char* flag,
+                       std::map<std::string, std::string>* value) {
+  const char* const value_str = ParseFlagValue(str, flag, false);
+
+  if (value_str == nullptr) return false;
+
+  for (const auto& kvpair : StrSplit(value_str, ',')) {
+    const auto kv = StrSplit(kvpair, '=');
+    if (kv.size() != 2) return false;
+    value->emplace(kv[0], kv[1]);
+  }
+
+  return true;
+}
+
+BENCHMARK_EXPORT
 bool IsFlag(const char* str, const char* flag) {
   return (ParseFlagValue(str, flag, true) != nullptr);
 }
 
+BENCHMARK_EXPORT
 bool IsTruthyFlagValue(const std::string& value) {
   if (value.size() == 1) {
     char v = value[0];
     return isalnum(v) &&
            !(v == '0' || v == 'f' || v == 'F' || v == 'n' || v == 'N');
-  } else if (!value.empty()) {
+  }
+  if (!value.empty()) {
     std::string value_lower(value);
-    std::transform(value_lower.begin(), value_lower.end(),
-                   value_lower.begin(), ::tolower);
+    std::transform(value_lower.begin(), value_lower.end(), value_lower.begin(),
+                   [](char c) { return static_cast<char>(::tolower(c)); });
     return !(value_lower == "false" || value_lower == "no" ||
              value_lower == "off");
-  } else
-    return true;
+  }
+  return true;
 }
 
 }  // end namespace benchmark
diff --git a/third_party/google_benchmark/src/src/commandlineflags.h b/third_party/google_benchmark/src/src/commandlineflags.h
new file mode 100644
index 0000000..7882628
--- /dev/null
+++ b/third_party/google_benchmark/src/src/commandlineflags.h
@@ -0,0 +1,133 @@
+#ifndef BENCHMARK_COMMANDLINEFLAGS_H_
+#define BENCHMARK_COMMANDLINEFLAGS_H_
+
+#include <cstdint>
+#include <map>
+#include <string>
+
+#include "benchmark/export.h"
+
+// Macro for referencing flags.
+#define FLAG(name) FLAGS_##name
+
+// Macros for declaring flags.
+#define BM_DECLARE_bool(name) BENCHMARK_EXPORT extern bool FLAG(name)
+#define BM_DECLARE_int32(name) BENCHMARK_EXPORT extern int32_t FLAG(name)
+#define BM_DECLARE_double(name) BENCHMARK_EXPORT extern double FLAG(name)
+#define BM_DECLARE_string(name) BENCHMARK_EXPORT extern std::string FLAG(name)
+#define BM_DECLARE_kvpairs(name) \
+  BENCHMARK_EXPORT extern std::map<std::string, std::string> FLAG(name)
+
+// Macros for defining flags.
+#define BM_DEFINE_bool(name, default_val) \
+  BENCHMARK_EXPORT bool FLAG(name) = benchmark::BoolFromEnv(#name, default_val)
+#define BM_DEFINE_int32(name, default_val) \
+  BENCHMARK_EXPORT int32_t FLAG(name) =    \
+      benchmark::Int32FromEnv(#name, default_val)
+#define BM_DEFINE_double(name, default_val) \
+  BENCHMARK_EXPORT double FLAG(name) =      \
+      benchmark::DoubleFromEnv(#name, default_val)
+#define BM_DEFINE_string(name, default_val) \
+  BENCHMARK_EXPORT std::string FLAG(name) = \
+      benchmark::StringFromEnv(#name, default_val)
+#define BM_DEFINE_kvpairs(name, default_val)                       \
+  BENCHMARK_EXPORT std::map<std::string, std::string> FLAG(name) = \
+      benchmark::KvPairsFromEnv(#name, default_val)
+
+namespace benchmark {
+
+// Parses a bool from the environment variable corresponding to the given flag.
+//
+// If the variable exists, returns IsTruthyFlagValue() value;  if not,
+// returns the given default value.
+BENCHMARK_EXPORT
+bool BoolFromEnv(const char* flag, bool default_val);
+
+// Parses an Int32 from the environment variable corresponding to the given
+// flag.
+//
+// If the variable exists, returns ParseInt32() value;  if not, returns
+// the given default value.
+BENCHMARK_EXPORT
+int32_t Int32FromEnv(const char* flag, int32_t default_val);
+
+// Parses a Double from the environment variable corresponding to the given
+// flag.
+//
+// If the variable exists, returns ParseDouble();  if not, returns
+// the given default value.
+BENCHMARK_EXPORT
+double DoubleFromEnv(const char* flag, double default_val);
+
+// Parses a string from the environment variable corresponding to the given
+// flag.
+//
+// If variable exists, returns its value;  if not, returns
+// the given default value.
+BENCHMARK_EXPORT
+const char* StringFromEnv(const char* flag, const char* default_val);
+
+// Parses a set of kvpairs from the environment variable corresponding to the
+// given flag.
+//
+// If the variable exists and parses as kvpairs, returns the parsed map;
+// otherwise returns the given default value.
+BENCHMARK_EXPORT
+std::map<std::string, std::string> KvPairsFromEnv(
+    const char* flag, std::map<std::string, std::string> default_val);
+
+// Parses a string for a bool flag, in the form of either
+// "--flag=value" or "--flag".
+//
+// In the former case, the value is true if it passes IsTruthyFlagValue().
+//
+// In the latter case, the value is taken as true.
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+BENCHMARK_EXPORT
+bool ParseBoolFlag(const char* str, const char* flag, bool* value);
+
+// Parses a string for an Int32 flag, in the form of "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+BENCHMARK_EXPORT
+bool ParseInt32Flag(const char* str, const char* flag, int32_t* value);
+
+// Parses a string for a Double flag, in the form of "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+BENCHMARK_EXPORT
+bool ParseDoubleFlag(const char* str, const char* flag, double* value);
+
+// Parses a string for a string flag, in the form of "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+BENCHMARK_EXPORT
+bool ParseStringFlag(const char* str, const char* flag, std::string* value);
+
+// Parses a string for a kvpairs flag in the form "--flag=key=value,key=value"
+//
+// On success, stores the value of the flag in *value and returns true. On
+// failure returns false, though *value may have been mutated.
+BENCHMARK_EXPORT
+bool ParseKeyValueFlag(const char* str, const char* flag,
+                       std::map<std::string, std::string>* value);
+
+// Returns true if the string matches the flag.
+BENCHMARK_EXPORT
+bool IsFlag(const char* str, const char* flag);
+
+// Returns false if value is a single character that is '0', 'f', 'F', 'n',
+// 'N' or non-alphanumeric, or if value matches one of 'no', 'false', 'off'
+// (case-insensitive). Returns true for every other value, including the
+// empty string.
+BENCHMARK_EXPORT
+bool IsTruthyFlagValue(const std::string& value);
+
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_COMMANDLINEFLAGS_H_
diff --git a/third_party/google_benchmark/src/complexity.cc b/third_party/google_benchmark/src/src/complexity.cc
similarity index 92%
rename from third_party/google_benchmark/src/complexity.cc
rename to third_party/google_benchmark/src/src/complexity.cc
index aeed67f..825c573 100644
--- a/third_party/google_benchmark/src/complexity.cc
+++ b/third_party/google_benchmark/src/src/complexity.cc
@@ -15,12 +15,13 @@
 // Source project : https://github.com/ismaelJimenez/cpp.leastsq
 // Adapted to be used with google benchmark
 
-#include "benchmark/benchmark.h"
+#include "complexity.h"
 
 #include <algorithm>
 #include <cmath>
+
+#include "benchmark/benchmark.h"
 #include "check.h"
-#include "complexity.h"
 
 namespace benchmark {
 
@@ -82,7 +83,6 @@
 LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
                        const std::vector<double>& time,
                        BigOFunc* fitting_curve) {
-  double sigma_gn = 0.0;
   double sigma_gn_squared = 0.0;
   double sigma_time = 0.0;
   double sigma_time_gn = 0.0;
@@ -90,7 +90,6 @@
   // Calculate least square fitting parameter
   for (size_t i = 0; i < n.size(); ++i) {
     double gn_i = fitting_curve(n[i]);
-    sigma_gn += gn_i;
     sigma_gn_squared += gn_i * gn_i;
     sigma_time += time[i];
     sigma_time_gn += time[i] * gn_i;
@@ -125,10 +124,10 @@
 //                  fitting curve.
 LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
                        const std::vector<double>& time, const BigO complexity) {
-  CHECK_EQ(n.size(), time.size());
-  CHECK_GE(n.size(), 2);  // Do not compute fitting curve is less than two
-                          // benchmark runs are given
-  CHECK_NE(complexity, oNone);
+  BM_CHECK_EQ(n.size(), time.size());
+  BM_CHECK_GE(n.size(), 2);  // Do not compute fitting curve if fewer than two
+                             // benchmark runs are given
+  BM_CHECK_NE(complexity, oNone);
 
   LeastSq best_fit;
 
@@ -169,7 +168,8 @@
 
   // Populate the accumulators.
   for (const Run& run : reports) {
-    CHECK_GT(run.complexity_n, 0) << "Did you forget to call SetComplexityN?";
+    BM_CHECK_GT(run.complexity_n, 0)
+        << "Did you forget to call SetComplexityN?";
     n.push_back(run.complexity_n);
     real_time.push_back(run.real_accumulated_time / run.iterations);
     cpu_time.push_back(run.cpu_accumulated_time / run.iterations);
@@ -193,11 +193,14 @@
   // Get the data from the accumulator to BenchmarkReporter::Run's.
   Run big_o;
   big_o.run_name = run_name;
+  big_o.family_index = reports[0].family_index;
+  big_o.per_family_instance_index = reports[0].per_family_instance_index;
   big_o.run_type = BenchmarkReporter::Run::RT_Aggregate;
   big_o.repetitions = reports[0].repetitions;
   big_o.repetition_index = Run::no_repetition_index;
   big_o.threads = reports[0].threads;
   big_o.aggregate_name = "BigO";
+  big_o.aggregate_unit = StatisticUnit::kTime;
   big_o.report_label = reports[0].report_label;
   big_o.iterations = 0;
   big_o.real_accumulated_time = result_real.coef;
@@ -215,8 +218,11 @@
   // Only add label to mean/stddev if it is same for all runs
   Run rms;
   rms.run_name = run_name;
+  rms.family_index = reports[0].family_index;
+  rms.per_family_instance_index = reports[0].per_family_instance_index;
   rms.run_type = BenchmarkReporter::Run::RT_Aggregate;
   rms.aggregate_name = "RMS";
+  rms.aggregate_unit = StatisticUnit::kPercentage;
   rms.report_label = big_o.report_label;
   rms.iterations = 0;
   rms.repetition_index = Run::no_repetition_index;
diff --git a/third_party/google_benchmark/src/complexity.h b/third_party/google_benchmark/src/src/complexity.h
similarity index 96%
rename from third_party/google_benchmark/src/complexity.h
rename to third_party/google_benchmark/src/src/complexity.h
index df29b48..0a0679b 100644
--- a/third_party/google_benchmark/src/complexity.h
+++ b/third_party/google_benchmark/src/src/complexity.h
@@ -31,7 +31,7 @@
     const std::vector<BenchmarkReporter::Run>& reports);
 
 // This data structure will contain the result returned by MinimalLeastSq
-//   - coef        : Estimated coeficient for the high-order term as
+//   - coef        : Estimated coefficient for the high-order term as
 //                   interpolated from data.
 //   - rms         : Normalized Root Mean Squared Error.
 //   - complexity  : Scalability form (e.g. oN, oNLogN). In case a scalability
diff --git a/third_party/google_benchmark/src/console_reporter.cc b/third_party/google_benchmark/src/src/console_reporter.cc
similarity index 68%
rename from third_party/google_benchmark/src/console_reporter.cc
rename to third_party/google_benchmark/src/src/console_reporter.cc
index 6fd7645..10e05e1 100644
--- a/third_party/google_benchmark/src/console_reporter.cc
+++ b/third_party/google_benchmark/src/src/console_reporter.cc
@@ -33,6 +33,7 @@
 
 namespace benchmark {
 
+BENCHMARK_EXPORT
 bool ConsoleReporter::ReportContext(const Context& context) {
   name_field_width_ = context.name_field_width;
   printed_header_ = false;
@@ -45,19 +46,21 @@
     GetErrorStream()
         << "Color printing is only supported for stdout on windows."
            " Disabling color printing\n";
-    output_options_ = static_cast< OutputOptions >(output_options_ & ~OO_Color);
+    output_options_ = static_cast<OutputOptions>(output_options_ & ~OO_Color);
   }
 #endif
 
   return true;
 }
 
+BENCHMARK_EXPORT
 void ConsoleReporter::PrintHeader(const Run& run) {
-  std::string str = FormatString("%-*s %13s %15s %12s", static_cast<int>(name_field_width_),
-                                 "Benchmark", "Time", "CPU", "Iterations");
-  if(!run.counters.empty()) {
-    if(output_options_ & OO_Tabular) {
-      for(auto const& c : run.counters) {
+  std::string str =
+      FormatString("%-*s %13s %15s %12s", static_cast<int>(name_field_width_),
+                   "Benchmark", "Time", "CPU", "Iterations");
+  if (!run.counters.empty()) {
+    if (output_options_ & OO_Tabular) {
+      for (auto const& c : run.counters) {
         str += FormatString(" %10s", c.first.c_str());
       }
     } else {
@@ -68,6 +71,7 @@
   GetOutputStream() << line << "\n" << str << "\n" << line << "\n";
 }
 
+BENCHMARK_EXPORT
 void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
   for (const auto& run : reports) {
     // print the header:
@@ -97,8 +101,10 @@
   va_end(args);
 }
 
-
 static std::string FormatTime(double time) {
+  // For the time columns of the console printer 13 digits are reserved. One of
+  // them is a space and max two of them are the time unit (e.g ns). That puts
+  // us at 10 digits usable for the number.
   // Align decimal places...
   if (time < 1.0) {
     return FormatString("%10.3f", time);
@@ -109,22 +115,33 @@
   if (time < 100.0) {
     return FormatString("%10.1f", time);
   }
+  // Assuming the time is at max 9.9999e+99 and we have 10 digits for the
+  // number, we get 10-1(.)-1(e)-1(sign)-2(exponent) = 5 digits to print.
+  if (time > 9999999999 /*max 10 digit number*/) {
+    return FormatString("%1.4e", time);
+  }
   return FormatString("%10.0f", time);
 }
 
+BENCHMARK_EXPORT
 void ConsoleReporter::PrintRunData(const Run& result) {
   typedef void(PrinterFn)(std::ostream&, LogColor, const char*, ...);
   auto& Out = GetOutputStream();
-  PrinterFn* printer = (output_options_ & OO_Color) ?
-                         (PrinterFn*)ColorPrintf : IgnoreColorPrint;
+  PrinterFn* printer = (output_options_ & OO_Color)
+                           ? static_cast<PrinterFn*>(ColorPrintf)
+                           : IgnoreColorPrint;
   auto name_color =
       (result.report_big_o || result.report_rms) ? COLOR_BLUE : COLOR_GREEN;
   printer(Out, name_color, "%-*s ", name_field_width_,
           result.benchmark_name().c_str());
 
-  if (result.error_occurred) {
+  if (internal::SkippedWithError == result.skipped) {
     printer(Out, COLOR_RED, "ERROR OCCURRED: \'%s\'",
-            result.error_message.c_str());
+            result.skip_message.c_str());
+    printer(Out, COLOR_DEFAULT, "\n");
+    return;
+  } else if (internal::SkippedWithMessage == result.skipped) {
+    printer(Out, COLOR_WHITE, "SKIPPED: \'%s\'", result.skip_message.c_str());
     printer(Out, COLOR_DEFAULT, "\n");
     return;
   }
@@ -134,18 +151,23 @@
   const std::string real_time_str = FormatTime(real_time);
   const std::string cpu_time_str = FormatTime(cpu_time);
 
-
   if (result.report_big_o) {
     std::string big_o = GetBigOString(result.complexity);
-    printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ", real_time, big_o.c_str(),
-            cpu_time, big_o.c_str());
+    printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ", real_time,
+            big_o.c_str(), cpu_time, big_o.c_str());
   } else if (result.report_rms) {
     printer(Out, COLOR_YELLOW, "%10.0f %-4s %10.0f %-4s ", real_time * 100, "%",
             cpu_time * 100, "%");
-  } else {
+  } else if (result.run_type != Run::RT_Aggregate ||
+             result.aggregate_unit == StatisticUnit::kTime) {
     const char* timeLabel = GetTimeUnitString(result.time_unit);
-    printer(Out, COLOR_YELLOW, "%s %-4s %s %-4s ", real_time_str.c_str(), timeLabel,
-            cpu_time_str.c_str(), timeLabel);
+    printer(Out, COLOR_YELLOW, "%s %-4s %s %-4s ", real_time_str.c_str(),
+            timeLabel, cpu_time_str.c_str(), timeLabel);
+  } else {
+    assert(result.aggregate_unit == StatisticUnit::kPercentage);
+    printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ",
+            (100. * result.real_accumulated_time), "%",
+            (100. * result.cpu_accumulated_time), "%");
   }
 
   if (!result.report_big_o && !result.report_rms) {
@@ -153,12 +175,19 @@
   }
 
   for (auto& c : result.counters) {
-    const std::size_t cNameLen = std::max(std::string::size_type(10),
-                                          c.first.length());
-    auto const& s = HumanReadableNumber(c.second.value, c.second.oneK);
+    const std::size_t cNameLen =
+        std::max(std::string::size_type(10), c.first.length());
+    std::string s;
     const char* unit = "";
-    if (c.second.flags & Counter::kIsRate)
-      unit = (c.second.flags & Counter::kInvert) ? "s" : "/s";
+    if (result.run_type == Run::RT_Aggregate &&
+        result.aggregate_unit == StatisticUnit::kPercentage) {
+      s = StrFormat("%.2f", 100. * c.second.value);
+      unit = "%";
+    } else {
+      s = HumanReadableNumber(c.second.value, c.second.oneK);
+      if (c.second.flags & Counter::kIsRate)
+        unit = (c.second.flags & Counter::kInvert) ? "s" : "/s";
+    }
     if (output_options_ & OO_Tabular) {
       printer(Out, COLOR_DEFAULT, " %*s%s", cNameLen - strlen(unit), s.c_str(),
               unit);
diff --git a/third_party/google_benchmark/src/counter.cc b/third_party/google_benchmark/src/src/counter.cc
similarity index 100%
rename from third_party/google_benchmark/src/counter.cc
rename to third_party/google_benchmark/src/src/counter.cc
diff --git a/third_party/google_benchmark/src/counter.h b/third_party/google_benchmark/src/src/counter.h
similarity index 100%
rename from third_party/google_benchmark/src/counter.h
rename to third_party/google_benchmark/src/src/counter.h
diff --git a/third_party/google_benchmark/src/csv_reporter.cc b/third_party/google_benchmark/src/src/csv_reporter.cc
similarity index 89%
rename from third_party/google_benchmark/src/csv_reporter.cc
rename to third_party/google_benchmark/src/src/csv_reporter.cc
index af2c18f..7b56da1 100644
--- a/third_party/google_benchmark/src/csv_reporter.cc
+++ b/third_party/google_benchmark/src/src/csv_reporter.cc
@@ -12,9 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "benchmark/benchmark.h"
-#include "complexity.h"
-
 #include <algorithm>
 #include <cstdint>
 #include <iostream>
@@ -22,7 +19,9 @@
 #include <tuple>
 #include <vector>
 
+#include "benchmark/benchmark.h"
 #include "check.h"
+#include "complexity.h"
 #include "string_util.h"
 #include "timers.h"
 
@@ -37,23 +36,29 @@
     "error_occurred", "error_message"};
 }  // namespace
 
-std::string CsvEscape(const std::string & s) {
+std::string CsvEscape(const std::string& s) {
   std::string tmp;
   tmp.reserve(s.size() + 2);
   for (char c : s) {
     switch (c) {
-    case '"' : tmp += "\"\""; break;
-    default  : tmp += c; break;
+      case '"':
+        tmp += "\"\"";
+        break;
+      default:
+        tmp += c;
+        break;
     }
   }
   return '"' + tmp + '"';
 }
 
+BENCHMARK_EXPORT
 bool CSVReporter::ReportContext(const Context& context) {
   PrintBasicContext(&GetErrorStream(), context);
   return true;
 }
 
+BENCHMARK_EXPORT
 void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
   std::ostream& Out = GetOutputStream();
 
@@ -85,7 +90,8 @@
       for (const auto& cnt : run.counters) {
         if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second")
           continue;
-        CHECK(user_counter_names_.find(cnt.first) != user_counter_names_.end())
+        BM_CHECK(user_counter_names_.find(cnt.first) !=
+                 user_counter_names_.end())
             << "All counters must be present in each run. "
             << "Counter named \"" << cnt.first
             << "\" was not in a run after being added to the header";
@@ -99,13 +105,14 @@
   }
 }
 
+BENCHMARK_EXPORT
 void CSVReporter::PrintRunData(const Run& run) {
   std::ostream& Out = GetOutputStream();
   Out << CsvEscape(run.benchmark_name()) << ",";
-  if (run.error_occurred) {
+  if (run.skipped) {
     Out << std::string(elements.size() - 3, ',');
-    Out << "true,";
-    Out << CsvEscape(run.error_message) << "\n";
+    Out << std::boolalpha << (internal::SkippedWithError == run.skipped) << ",";
+    Out << CsvEscape(run.skip_message) << "\n";
     return;
   }
 
diff --git a/third_party/google_benchmark/src/cycleclock.h b/third_party/google_benchmark/src/src/cycleclock.h
similarity index 76%
rename from third_party/google_benchmark/src/cycleclock.h
rename to third_party/google_benchmark/src/src/cycleclock.h
index d5d62c4..ae1ef2d 100644
--- a/third_party/google_benchmark/src/cycleclock.h
+++ b/third_party/google_benchmark/src/src/cycleclock.h
@@ -36,7 +36,8 @@
 // declarations of some other intrinsics, breaking compilation.
 // Therefore, we simply declare __rdtsc ourselves. See also
 // http://connect.microsoft.com/VisualStudio/feedback/details/262047
-#if defined(COMPILER_MSVC) && !defined(_M_IX86)
+#if defined(COMPILER_MSVC) && !defined(_M_IX86) && !defined(_M_ARM64) && \
+    !defined(_M_ARM64EC)
 extern "C" uint64_t __rdtsc();
 #pragma intrinsic(__rdtsc)
 #endif
@@ -84,13 +85,21 @@
   return (high << 32) | low;
 #elif defined(__powerpc__) || defined(__ppc__)
   // This returns a time-base, which is not always precisely a cycle-count.
-  int64_t tbl, tbu0, tbu1;
-  asm("mftbu %0" : "=r"(tbu0));
-  asm("mftb  %0" : "=r"(tbl));
-  asm("mftbu %0" : "=r"(tbu1));
-  tbl &= -static_cast<int64_t>(tbu0 == tbu1);
-  // high 32 bits in tbu1; low 32 bits in tbl  (tbu0 is garbage)
-  return (tbu1 << 32) | tbl;
+#if defined(__powerpc64__) || defined(__ppc64__)
+  int64_t tb;
+  asm volatile("mfspr %0, 268" : "=r"(tb));
+  return tb;
+#else
+  uint32_t tbl, tbu0, tbu1;
+  asm volatile(
+      "mftbu %0\n"
+      "mftb %1\n"
+      "mftbu %2"
+      : "=r"(tbu0), "=r"(tbl), "=r"(tbu1));
+  tbl &= -static_cast<int32_t>(tbu0 == tbu1);
+  // high 32 bits in tbu1; low 32 bits in tbl  (tbu0 is no longer needed)
+  return (static_cast<uint64_t>(tbu1) << 32) | tbl;
+#endif
 #elif defined(__sparc__)
   int64_t tick;
   asm(".byte 0x83, 0x41, 0x00, 0x00");
@@ -106,6 +115,12 @@
   // when I know it will work.  Otherwise, I'll use __rdtsc and hope
   // the code is being compiled with a non-ancient compiler.
   _asm rdtsc
+#elif defined(COMPILER_MSVC) && (defined(_M_ARM64) || defined(_M_ARM64EC))
+  // See https://docs.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics
+  // and https://reviews.llvm.org/D53115
+  int64_t virtual_timer_value;
+  virtual_timer_value = _ReadStatusReg(ARM64_CNTVCT);
+  return virtual_timer_value;
 #elif defined(COMPILER_MSVC)
   return __rdtsc();
 #elif defined(BENCHMARK_OS_NACL)
@@ -118,7 +133,7 @@
 
   // Native Client does not provide any API to access cycle counter.
   // Use clock_gettime(CLOCK_MONOTONIC, ...) instead of gettimeofday
-  // because is provides nanosecond resolution (which is noticable at
+  // because it provides nanosecond resolution (which is noticeable at
   // least for PNaCl modules running on x86 Mac & Linux).
   // Initialize to always return 0 if clock_gettime fails.
   struct timespec ts = {0, 0};
@@ -153,32 +168,55 @@
   struct timeval tv;
   gettimeofday(&tv, nullptr);
   return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
-#elif defined(__mips__)
+#elif defined(__mips__) || defined(__m68k__)
   // mips apparently only allows rdtsc for superusers, so we fall
   // back to gettimeofday.  It's possible clock_gettime would be better.
   struct timeval tv;
   gettimeofday(&tv, nullptr);
   return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+#elif defined(__loongarch__) || defined(__csky__)
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
 #elif defined(__s390__)  // Covers both s390 and s390x.
   // Return the CPU clock.
   uint64_t tsc;
+#if defined(BENCHMARK_OS_ZOS) && defined(COMPILER_IBMXL)
+  // z/OS XL compiler HLASM syntax.
+  asm(" stck %0" : "=m"(tsc) : : "cc");
+#else
   asm("stck %0" : "=Q"(tsc) : : "cc");
+#endif
   return tsc;
-#elif defined(__riscv) // RISC-V
+#elif defined(__riscv)  // RISC-V
   // Use RDCYCLE (and RDCYCLEH on riscv32)
 #if __riscv_xlen == 32
-  uint64_t cycles_low, cycles_hi0, cycles_hi1;
-  asm("rdcycleh %0" : "=r"(cycles_hi0));
-  asm("rdcycle %0" : "=r"(cycles_lo));
-  asm("rdcycleh %0" : "=r"(cycles_hi1));
-  // This matches the PowerPC overflow detection, above
-  cycles_lo &= -static_cast<int64_t>(cycles_hi0 == cycles_hi1);
-  return (cycles_hi1 << 32) | cycles_lo;
+  uint32_t cycles_lo, cycles_hi0, cycles_hi1;
+  // This asm also includes the PowerPC overflow handling strategy, as above.
+  // Implemented in assembly because Clang insisted on branching.
+  asm volatile(
+      "rdcycleh %0\n"
+      "rdcycle %1\n"
+      "rdcycleh %2\n"
+      "sub %0, %0, %2\n"
+      "seqz %0, %0\n"
+      "sub %0, zero, %0\n"
+      "and %1, %1, %0\n"
+      : "=r"(cycles_hi0), "=r"(cycles_lo), "=r"(cycles_hi1));
+  return (static_cast<uint64_t>(cycles_hi1) << 32) | cycles_lo;
 #else
   uint64_t cycles;
-  asm("rdcycle %0" : "=r"(cycles));
+  asm volatile("rdcycle %0" : "=r"(cycles));
   return cycles;
 #endif
+#elif defined(__e2k__) || defined(__elbrus__)
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+#elif defined(__hexagon__)
+  uint64_t pcycle;
+  asm volatile("%0 = C15:14" : "=r"(pcycle));
+  return static_cast<double>(pcycle);
 #else
 // The soft failover to a generic implementation is automatic only for ARM.
 // For other platforms the developer is expected to make an attempt to create
diff --git a/third_party/google_benchmark/src/internal_macros.h b/third_party/google_benchmark/src/src/internal_macros.h
similarity index 74%
rename from third_party/google_benchmark/src/internal_macros.h
rename to third_party/google_benchmark/src/src/internal_macros.h
index 6adf00d..8dd7d0c 100644
--- a/third_party/google_benchmark/src/internal_macros.h
+++ b/third_party/google_benchmark/src/src/internal_macros.h
@@ -1,8 +1,6 @@
 #ifndef BENCHMARK_INTERNAL_MACROS_H_
 #define BENCHMARK_INTERNAL_MACROS_H_
 
-#include "benchmark/benchmark.h"
-
 /* Needed to detect STL */
 #include <cstdlib>
 
@@ -13,7 +11,11 @@
 #endif
 
 #if defined(__clang__)
-  #if !defined(COMPILER_CLANG)
+  #if defined(__ibmxl__)
+    #if !defined(COMPILER_IBMXL)
+      #define COMPILER_IBMXL
+    #endif
+  #elif !defined(COMPILER_CLANG)
     #define COMPILER_CLANG
   #endif
 #elif defined(_MSC_VER)
@@ -40,6 +42,19 @@
   #define BENCHMARK_OS_CYGWIN 1
 #elif defined(_WIN32)
   #define BENCHMARK_OS_WINDOWS 1
+  // WINAPI_FAMILY_PARTITION is defined in winapifamily.h.
+  // We include windows.h which implicitly includes winapifamily.h for compatibility.
+  #ifndef NOMINMAX
+    #define NOMINMAX
+  #endif
+  #include <windows.h>
+  #if defined(WINAPI_FAMILY_PARTITION)
+    #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+      #define BENCHMARK_OS_WINDOWS_WIN32 1
+    #elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
+      #define BENCHMARK_OS_WINDOWS_RT 1
+    #endif
+  #endif
   #if defined(__MINGW32__)
     #define BENCHMARK_OS_MINGW 1
   #endif
@@ -58,6 +73,8 @@
   #define BENCHMARK_OS_NETBSD 1
 #elif defined(__OpenBSD__)
   #define BENCHMARK_OS_OPENBSD 1
+#elif defined(__DragonFly__)
+  #define BENCHMARK_OS_DRAGONFLY 1
 #elif defined(__linux__)
   #define BENCHMARK_OS_LINUX 1
 #elif defined(__native_client__)
@@ -72,6 +89,10 @@
 #define BENCHMARK_OS_SOLARIS 1
 #elif defined(__QNX__)
 #define BENCHMARK_OS_QNX 1
+#elif defined(__MVS__)
+#define BENCHMARK_OS_ZOS 1
+#elif defined(__hexagon__)
+#define BENCHMARK_OS_QURT 1
 #endif
 
 #if defined(__ANDROID__) && defined(__GLIBCXX__)
diff --git a/third_party/google_benchmark/src/json_reporter.cc b/third_party/google_benchmark/src/src/json_reporter.cc
similarity index 65%
rename from third_party/google_benchmark/src/json_reporter.cc
rename to third_party/google_benchmark/src/src/json_reporter.cc
index e5f3c35..6559dfd 100644
--- a/third_party/google_benchmark/src/json_reporter.cc
+++ b/third_party/google_benchmark/src/src/json_reporter.cc
@@ -12,9 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "benchmark/benchmark.h"
-#include "complexity.h"
-
 #include <algorithm>
 #include <cmath>
 #include <cstdint>
@@ -25,41 +22,61 @@
 #include <tuple>
 #include <vector>
 
+#include "benchmark/benchmark.h"
+#include "complexity.h"
 #include "string_util.h"
 #include "timers.h"
 
 namespace benchmark {
-
 namespace {
 
-std::string StrEscape(const std::string & s) {
+std::string StrEscape(const std::string& s) {
   std::string tmp;
   tmp.reserve(s.size());
   for (char c : s) {
     switch (c) {
-    case '\b': tmp += "\\b"; break;
-    case '\f': tmp += "\\f"; break;
-    case '\n': tmp += "\\n"; break;
-    case '\r': tmp += "\\r"; break;
-    case '\t': tmp += "\\t"; break;
-    case '\\': tmp += "\\\\"; break;
-    case '"' : tmp += "\\\""; break;
-    default  : tmp += c; break;
+      case '\b':
+        tmp += "\\b";
+        break;
+      case '\f':
+        tmp += "\\f";
+        break;
+      case '\n':
+        tmp += "\\n";
+        break;
+      case '\r':
+        tmp += "\\r";
+        break;
+      case '\t':
+        tmp += "\\t";
+        break;
+      case '\\':
+        tmp += "\\\\";
+        break;
+      case '"':
+        tmp += "\\\"";
+        break;
+      default:
+        tmp += c;
+        break;
     }
   }
   return tmp;
 }
 
 std::string FormatKV(std::string const& key, std::string const& value) {
-  return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), StrEscape(value).c_str());
+  return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(),
+                   StrEscape(value).c_str());
 }
 
 std::string FormatKV(std::string const& key, const char* value) {
-  return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), StrEscape(value).c_str());
+  return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(),
+                   StrEscape(value).c_str());
 }
 
 std::string FormatKV(std::string const& key, bool value) {
-  return StrFormat("\"%s\": %s", StrEscape(key).c_str(), value ? "true" : "false");
+  return StrFormat("\"%s\": %s", StrEscape(key).c_str(),
+                   value ? "true" : "false");
 }
 
 std::string FormatKV(std::string const& key, int64_t value) {
@@ -68,12 +85,6 @@
   return ss.str();
 }
 
-std::string FormatKV(std::string const& key, IterationCount value) {
-  std::stringstream ss;
-  ss << '"' << StrEscape(key) << "\": " << value;
-  return ss.str();
-}
-
 std::string FormatKV(std::string const& key, double value) {
   std::stringstream ss;
   ss << '"' << StrEscape(key) << "\": ";
@@ -122,8 +133,12 @@
       << FormatKV("mhz_per_cpu",
                   RoundDouble(info.cycles_per_second / 1000000.0))
       << ",\n";
-  out << indent << FormatKV("cpu_scaling_enabled", info.scaling_enabled)
-      << ",\n";
+  if (CPUInfo::Scaling::UNKNOWN != info.scaling) {
+    out << indent
+        << FormatKV("cpu_scaling_enabled",
+                    info.scaling == CPUInfo::Scaling::ENABLED ? true : false)
+        << ",\n";
+  }
 
   out << indent << "\"caches\": [\n";
   indent = std::string(6, ' ');
@@ -134,8 +149,8 @@
     out << cache_indent << FormatKV("type", CI.type) << ",\n";
     out << cache_indent << FormatKV("level", static_cast<int64_t>(CI.level))
         << ",\n";
-    out << cache_indent
-        << FormatKV("size", static_cast<int64_t>(CI.size)) << ",\n";
+    out << cache_indent << FormatKV("size", static_cast<int64_t>(CI.size))
+        << ",\n";
     out << cache_indent
         << FormatKV("num_sharing", static_cast<int64_t>(CI.num_sharing))
         << "\n";
@@ -157,7 +172,19 @@
 #else
   const char build_type[] = "debug";
 #endif
-  out << indent << FormatKV("library_build_type", build_type) << "\n";
+  out << indent << FormatKV("library_build_type", build_type);
+
+  std::map<std::string, std::string>* global_context =
+      internal::GetGlobalContext();
+
+  if (global_context != nullptr) {
+    for (const auto& kv : *global_context) {
+      out << ",\n";
+      out << indent << FormatKV(kv.first, kv.second);
+    }
+  }
+  out << "\n";
+
   // Close context block and open the list of benchmarks.
   out << inner_indent << "},\n";
   out << inner_indent << "\"benchmarks\": [\n";
@@ -195,6 +222,10 @@
   std::string indent(6, ' ');
   std::ostream& out = GetOutputStream();
   out << indent << FormatKV("name", run.benchmark_name()) << ",\n";
+  out << indent << FormatKV("family_index", run.family_index) << ",\n";
+  out << indent
+      << FormatKV("per_family_instance_index", run.per_family_instance_index)
+      << ",\n";
   out << indent << FormatKV("run_name", run.run_name.str()) << ",\n";
   out << indent << FormatKV("run_type", [&run]() -> const char* {
     switch (run.run_type) {
@@ -213,15 +244,36 @@
   out << indent << FormatKV("threads", run.threads) << ",\n";
   if (run.run_type == BenchmarkReporter::Run::RT_Aggregate) {
     out << indent << FormatKV("aggregate_name", run.aggregate_name) << ",\n";
+    out << indent << FormatKV("aggregate_unit", [&run]() -> const char* {
+      switch (run.aggregate_unit) {
+        case StatisticUnit::kTime:
+          return "time";
+        case StatisticUnit::kPercentage:
+          return "percentage";
+      }
+      BENCHMARK_UNREACHABLE();
+    }()) << ",\n";
   }
-  if (run.error_occurred) {
-    out << indent << FormatKV("error_occurred", run.error_occurred) << ",\n";
-    out << indent << FormatKV("error_message", run.error_message) << ",\n";
+  if (internal::SkippedWithError == run.skipped) {
+    out << indent << FormatKV("error_occurred", true) << ",\n";
+    out << indent << FormatKV("error_message", run.skip_message) << ",\n";
+  } else if (internal::SkippedWithMessage == run.skipped) {
+    out << indent << FormatKV("skipped", true) << ",\n";
+    out << indent << FormatKV("skip_message", run.skip_message) << ",\n";
   }
   if (!run.report_big_o && !run.report_rms) {
     out << indent << FormatKV("iterations", run.iterations) << ",\n";
-    out << indent << FormatKV("real_time", run.GetAdjustedRealTime()) << ",\n";
-    out << indent << FormatKV("cpu_time", run.GetAdjustedCPUTime());
+    if (run.run_type != Run::RT_Aggregate ||
+        run.aggregate_unit == StatisticUnit::kTime) {
+      out << indent << FormatKV("real_time", run.GetAdjustedRealTime())
+          << ",\n";
+      out << indent << FormatKV("cpu_time", run.GetAdjustedCPUTime());
+    } else {
+      assert(run.aggregate_unit == StatisticUnit::kPercentage);
+      out << indent << FormatKV("real_time", run.real_accumulated_time)
+          << ",\n";
+      out << indent << FormatKV("cpu_time", run.cpu_accumulated_time);
+    }
     out << ",\n"
         << indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
   } else if (run.report_big_o) {
@@ -239,9 +291,21 @@
     out << ",\n" << indent << FormatKV(c.first, c.second);
   }
 
-  if (run.has_memory_result) {
+  if (run.memory_result) {
+    const MemoryManager::Result memory_result = *run.memory_result;
     out << ",\n" << indent << FormatKV("allocs_per_iter", run.allocs_per_iter);
-    out << ",\n" << indent << FormatKV("max_bytes_used", run.max_bytes_used);
+    out << ",\n"
+        << indent << FormatKV("max_bytes_used", memory_result.max_bytes_used);
+
+    auto report_if_present = [&out, &indent](const std::string& label,
+                                             int64_t val) {
+      if (val != MemoryManager::TombstoneValue)
+        out << ",\n" << indent << FormatKV(label, val);
+    };
+
+    report_if_present("total_allocated_bytes",
+                      memory_result.total_allocated_bytes);
+    report_if_present("net_heap_growth", memory_result.net_heap_growth);
   }
 
   if (!run.report_label.empty()) {
@@ -250,4 +314,7 @@
   out << '\n';
 }
 
+const int64_t MemoryManager::TombstoneValue =
+    std::numeric_limits<int64_t>::max();
+
 }  // end namespace benchmark
diff --git a/third_party/google_benchmark/src/log.h b/third_party/google_benchmark/src/src/log.h
similarity index 63%
rename from third_party/google_benchmark/src/log.h
rename to third_party/google_benchmark/src/src/log.h
index 47d0c35..4570166 100644
--- a/third_party/google_benchmark/src/log.h
+++ b/third_party/google_benchmark/src/src/log.h
@@ -4,7 +4,12 @@
 #include <iostream>
 #include <ostream>
 
-#include "benchmark/benchmark.h"
+// NOTE: this is also defined in benchmark.h but we're trying to avoid a
+// dependency.
+// The _MSVC_LANG check should detect Visual Studio 2015 Update 3 and newer.
+#if __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L)
+#define BENCHMARK_HAS_CXX11
+#endif
 
 namespace benchmark {
 namespace internal {
@@ -23,7 +28,16 @@
  private:
   LogType(std::ostream* out) : out_(out) {}
   std::ostream* out_;
-  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(LogType);
+
+  // NOTE: we could use BENCHMARK_DISALLOW_COPY_AND_ASSIGN but we shouldn't have
+  // a dependency on benchmark.h from here.
+#ifndef BENCHMARK_HAS_CXX11
+  LogType(const LogType&);
+  LogType& operator=(const LogType&);
+#else
+  LogType(const LogType&) = delete;
+  LogType& operator=(const LogType&) = delete;
+#endif
 };
 
 template <class Tp>
@@ -47,13 +61,13 @@
 }
 
 inline LogType& GetNullLogInstance() {
-  static LogType log(nullptr);
-  return log;
+  static LogType null_log((std::ostream*)nullptr);
+  return null_log;
 }
 
 inline LogType& GetErrorLogInstance() {
-  static LogType log(&std::clog);
-  return log;
+  static LogType error_log(&std::clog);
+  return error_log;
 }
 
 inline LogType& GetLogInstanceForLevel(int level) {
@@ -67,7 +81,7 @@
 }  // end namespace benchmark
 
 // clang-format off
-#define VLOG(x)                                                               \
+#define BM_VLOG(x)                                                               \
   (::benchmark::internal::GetLogInstanceForLevel(x) << "-- LOG(" << x << "):" \
                                                                          " ")
 // clang-format on
diff --git a/third_party/google_benchmark/src/mutex.h b/third_party/google_benchmark/src/src/mutex.h
similarity index 66%
rename from third_party/google_benchmark/src/mutex.h
rename to third_party/google_benchmark/src/src/mutex.h
index 5f461d0..bec78d9 100644
--- a/third_party/google_benchmark/src/mutex.h
+++ b/third_party/google_benchmark/src/src/mutex.h
@@ -9,60 +9,60 @@
 // Enable thread safety attributes only with clang.
 // The attributes can be safely erased when compiling with other compilers.
 #if defined(HAVE_THREAD_SAFETY_ATTRIBUTES)
-#define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x))
+#define THREAD_ANNOTATION_ATTRIBUTE_(x) __attribute__((x))
 #else
-#define THREAD_ANNOTATION_ATTRIBUTE__(x)  // no-op
+#define THREAD_ANNOTATION_ATTRIBUTE_(x)  // no-op
 #endif
 
-#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(capability(x))
+#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(capability(x))
 
-#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
+#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE_(scoped_lockable)
 
-#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
+#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE_(guarded_by(x))
 
-#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x))
+#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE_(pt_guarded_by(x))
 
 #define ACQUIRED_BEFORE(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(acquired_before(__VA_ARGS__))
 
 #define ACQUIRED_AFTER(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(acquired_after(__VA_ARGS__))
 
 #define REQUIRES(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(requires_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(requires_capability(__VA_ARGS__))
 
 #define REQUIRES_SHARED(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(requires_shared_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(requires_shared_capability(__VA_ARGS__))
 
 #define ACQUIRE(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(acquire_capability(__VA_ARGS__))
 
 #define ACQUIRE_SHARED(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(acquire_shared_capability(__VA_ARGS__))
 
 #define RELEASE(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(release_capability(__VA_ARGS__))
 
 #define RELEASE_SHARED(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(release_shared_capability(__VA_ARGS__))
 
 #define TRY_ACQUIRE(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(try_acquire_capability(__VA_ARGS__))
 
 #define TRY_ACQUIRE_SHARED(...) \
-  THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__))
+  THREAD_ANNOTATION_ATTRIBUTE_(try_acquire_shared_capability(__VA_ARGS__))
 
-#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__))
+#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE_(locks_excluded(__VA_ARGS__))
 
-#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x))
+#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(assert_capability(x))
 
 #define ASSERT_SHARED_CAPABILITY(x) \
-  THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(x))
+  THREAD_ANNOTATION_ATTRIBUTE_(assert_shared_capability(x))
 
-#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x))
+#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(lock_returned(x))
 
 #define NO_THREAD_SAFETY_ANALYSIS \
-  THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis)
+  THREAD_ANNOTATION_ATTRIBUTE_(no_thread_safety_analysis)
 
 namespace benchmark {
 
@@ -71,7 +71,7 @@
 // NOTE: Wrappers for std::mutex and std::unique_lock are provided so that
 // we can annotate them with thread safety attributes and use the
 // -Wthread-safety warning with clang. The standard library types cannot be
-// used directly because they do not provided the required annotations.
+// used directly because they do not provide the required annotations.
 class CAPABILITY("mutex") Mutex {
  public:
   Mutex() {}
@@ -130,7 +130,7 @@
   // entered the barrier.  Returns iff this is the last thread to
   // enter the barrier.
   bool createBarrier(MutexLock& ml) REQUIRES(lock_) {
-    CHECK_LT(entered_, running_threads_);
+    BM_CHECK_LT(entered_, running_threads_);
     entered_++;
     if (entered_ < running_threads_) {
       // Wait for all threads to enter
diff --git a/third_party/google_benchmark/src/src/perf_counters.cc b/third_party/google_benchmark/src/src/perf_counters.cc
new file mode 100644
index 0000000..3980ea0
--- /dev/null
+++ b/third_party/google_benchmark/src/src/perf_counters.cc
@@ -0,0 +1,269 @@
+// Copyright 2021 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "perf_counters.h"
+
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#if defined HAVE_LIBPFM
+#include "perfmon/pfmlib.h"
+#include "perfmon/pfmlib_perf_event.h"
+#endif
+
+namespace benchmark {
+namespace internal {
+
+constexpr size_t PerfCounterValues::kMaxCounters;
+
+#if defined HAVE_LIBPFM
+
+size_t PerfCounterValues::Read(const std::vector<int>& leaders) {
+  // Read every leader into values_; returns values stored, or 0 on failure.
+  const size_t bufsize = values_.size() * sizeof(values_[0]);
+  char* ptr = reinterpret_cast<char*>(values_.data());
+  size_t size = bufsize;
+  for (int lead : leaders) {
+    auto read_bytes = ::read(lead, ptr, size);
+    if (read_bytes >= ssize_t(sizeof(uint64_t))) {
+      // Payload is everything after the leading uint64_t (treated as padding)
+      std::size_t data_bytes = read_bytes - sizeof(uint64_t);
+      // Shift the payload over that leading word; cheap, data is hot in cache
+      std::memmove(ptr, ptr + sizeof(uint64_t), data_bytes);
+      // Advance the write cursor and shrink the space left for later reads
+      ptr += data_bytes;
+      size -= data_bytes;
+    } else {
+      int err = errno;
+      GetErrorLogInstance() << "Error reading lead " << lead << " errno:" << err
+                            << " " << ::strerror(err) << "\n";
+      return 0;  // any failed or too-short read reports zero values
+    }
+  }
+  return (bufsize - size) / sizeof(uint64_t);  // number of values stored
+}
+
+const bool PerfCounters::kSupported = true;
+
+bool PerfCounters::Initialize() { return pfm_initialize() == PFM_SUCCESS; }
+
+bool PerfCounters::IsCounterSupported(const std::string& name) {
+  perf_event_attr_t attr;  // handed to libpfm through arg.attr below
+  std::memset(&attr, 0, sizeof(attr));
+  pfm_perf_encode_arg_t arg;
+  std::memset(&arg, 0, sizeof(arg));
+  arg.attr = &attr;
+  const int mode = PFM_PLM3;  // user mode only
+  int ret = pfm_get_os_event_encoding(name.c_str(), mode, PFM_OS_PERF_EVENT_EXT,
+                                      &arg);
+  return (ret == PFM_SUCCESS);  // supported iff libpfm can encode the name
+}
+
+PerfCounters PerfCounters::Create(
+    const std::vector<std::string>& counter_names) {
+  // Only counters that open successfully land in these; all start empty
+  std::vector<std::string> valid_names;
+  std::vector<int> counter_ids;
+  std::vector<int> leader_ids;
+
+  // Reserve room for the maximum possible
+  valid_names.reserve(counter_names.size());
+  counter_ids.reserve(counter_names.size());
+
+  const int kCounterMode = PFM_PLM3;  // user mode only
+
+  // Group leads will be assigned on demand. The idea is that once we cannot
+  // create a counter descriptor, the reason is that this group has maxed out
+  // so we set the group_id again to -1 and retry - giving the algorithm a
+  // chance to create a new group leader to hold the next set of counters.
+  int group_id = -1;
+
+  // Loop through all performance counters
+  for (size_t i = 0; i < counter_names.size(); ++i) {
+    // we are about to push into the valid names vector
+    // check if we did not reach the maximum
+    if (valid_names.size() == PerfCounterValues::kMaxCounters) {
+      // Log a message if we maxed out and stop adding
+      GetErrorLogInstance()
+          << counter_names.size() << " counters were requested. The maximum is "
+          << PerfCounterValues::kMaxCounters << " and " << valid_names.size()
+          << " were already added. All remaining counters will be ignored\n";
+      // stop the loop and return what we have already
+      break;
+    }
+
+    // Check if this name is empty
+    const auto& name = counter_names[i];
+    if (name.empty()) {
+      GetErrorLogInstance()
+          << "A performance counter name was the empty string\n";
+      continue;
+    }
+
+    // Here first means first in group, ie the group leader
+    const bool is_first = (group_id < 0);
+
+    // This struct will be populated by libpfm from the counter string
+    // and then fed into the syscall perf_event_open
+    struct perf_event_attr attr {};
+    attr.size = sizeof(attr);
+
+    // This is the input struct to libpfm.
+    pfm_perf_encode_arg_t arg{};
+    arg.attr = &attr;
+    const int pfm_get = pfm_get_os_event_encoding(name.c_str(), kCounterMode,
+                                                  PFM_OS_PERF_EVENT, &arg);
+    if (pfm_get != PFM_SUCCESS) {
+      GetErrorLogInstance()
+          << "Unknown performance counter name: " << name << "\n";
+      continue;
+    }
+
+    // We then proceed to populate the remaining fields in our attribute struct
+    // Note: the man page for perf_event_open suggests inherit = true and
+    // read_format = PERF_FORMAT_GROUP don't work together, but that's not the
+    // case.
+    attr.disabled = is_first;
+    attr.inherit = true;
+    attr.pinned = is_first;
+    attr.exclude_kernel = true;
+    attr.exclude_user = false;
+    attr.exclude_hv = true;
+
+    // Read all counters in a group in one read.
+    attr.read_format = PERF_FORMAT_GROUP;
+
+    int id = -1;
+    while (id < 0) {
+      static constexpr size_t kNrOfSyscallRetries = 5;
+      // Retry syscall as it was interrupted often (b/64774091).
+      for (size_t num_retries = 0; num_retries < kNrOfSyscallRetries;
+           ++num_retries) {
+        id = perf_event_open(&attr, 0, -1, group_id, 0);
+        if (id >= 0 || errno != EINTR) {
+          break;
+        }
+      }
+      if (id < 0) {
+        // If the file descriptor is negative we might have reached a limit
+        // in the current group. Set the group_id to -1 and retry
+        if (group_id >= 0) {
+          // Create a new group
+          group_id = -1;
+        } else {
+          // At this point we have already retried to set a new group id and
+          // failed. We then give up.
+          break;
+        }
+      }
+    }
+
+    // We failed to get a new file descriptor. We might have reached a hard
+    // hardware limit that cannot be resolved even with group multiplexing
+    if (id < 0) {
+      GetErrorLogInstance() << "***WARNING*** Failed to get a file descriptor "
+                               "for performance counter "
+                            << name << ". Ignoring\n";
+
+      // We give up on this counter but try to keep going
+      // as the others would be fine
+      continue;
+    }
+    if (group_id < 0) {
+      // This is a leader, store and assign it to the current file descriptor
+      leader_ids.push_back(id);
+      group_id = id;
+    }
+    // This is a valid counter, add it to our descriptor's list
+    counter_ids.push_back(id);
+    valid_names.push_back(name);
+  }
+
+  // Loop through all group leaders activating them
+  // There is another option of starting ALL counters in a process but
+  // that would be far reaching an intrusion. If the user is using PMCs
+  // by themselves then this would have a side effect on them. It is
+  // friendlier to loop through all groups individually.
+  for (int lead : leader_ids) {
+    if (ioctl(lead, PERF_EVENT_IOC_ENABLE) != 0) {
+      // This should never happen but if it does, we give up on the
+      // entire batch as recovery would be a mess.
+      GetErrorLogInstance() << "***WARNING*** Failed to start counters. "
+                               "Clearing out all counters.\n";
+
+      // Close all performance counters
+      for (int id : counter_ids) {
+        ::close(id);
+      }
+
+      // Return an empty object so our internal state is still good and
+      // the process can continue normally without impact
+      return NoCounters();
+    }
+  }
+
+  return PerfCounters(std::move(valid_names), std::move(counter_ids),
+                      std::move(leader_ids));
+}
+
+void PerfCounters::CloseCounters() const {
+  if (counter_ids_.empty()) {
+    return;  // no descriptors are held, so there is nothing to release
+  }
+  for (int lead : leader_ids_) {
+    ioctl(lead, PERF_EVENT_IOC_DISABLE);  // best effort: result is not checked
+  }
+  for (int fd : counter_ids_) {
+    close(fd);  // Create() stores leaders in counter_ids_ too, so all close
+  }
+}
+#else   // defined HAVE_LIBPFM
+size_t PerfCounterValues::Read(const std::vector<int>&) { return 0; }
+
+const bool PerfCounters::kSupported = false;
+
+bool PerfCounters::Initialize() { return false; }
+
+bool PerfCounters::IsCounterSupported(const std::string&) { return false; }
+
+PerfCounters PerfCounters::Create(
+    const std::vector<std::string>& counter_names) {
+  if (!counter_names.empty()) {  // only complain when counters were requested
+    GetErrorLogInstance() << "Performance counters not supported.\n";
+  }
+  return NoCounters();  // stub build: always an empty, valid object
+}
+
+void PerfCounters::CloseCounters() const {}
+#endif  // defined HAVE_LIBPFM
+
+PerfCountersMeasurement::PerfCountersMeasurement(
+    const std::vector<std::string>& counter_names)
+    : start_values_(counter_names.size()), end_values_(counter_names.size()) {
+  counters_ = PerfCounters::Create(counter_names);  // may open fewer than asked
+}
+
+PerfCounters& PerfCounters::operator=(PerfCounters&& other) noexcept {
+  if (this != &other) {  // self-move must not close our own descriptors
+    CloseCounters();  // release whatever this object currently holds
+    // NOTE(review): assumes moved-from vectors end up empty - confirm
+    counter_ids_ = std::move(other.counter_ids_);
+    leader_ids_ = std::move(other.leader_ids_);
+    counter_names_ = std::move(other.counter_names_);
+  }
+  return *this;
+}
+}  // namespace internal
+}  // namespace benchmark
diff --git a/third_party/google_benchmark/src/src/perf_counters.h b/third_party/google_benchmark/src/src/perf_counters.h
new file mode 100644
index 0000000..152a6f2
--- /dev/null
+++ b/third_party/google_benchmark/src/src/perf_counters.h
@@ -0,0 +1,202 @@
+// Copyright 2021 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef BENCHMARK_PERF_COUNTERS_H
+#define BENCHMARK_PERF_COUNTERS_H
+
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+#include "check.h"
+#include "log.h"
+#include "mutex.h"
+
+#ifndef BENCHMARK_OS_WINDOWS
+#include <unistd.h>
+#endif
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+// C4251: <symbol> needs to have dll-interface to be used by clients of class
+#pragma warning(disable : 4251)
+#endif
+
+namespace benchmark {
+namespace internal {
+
+// Typically, we can only read a small number of counters. There is also a
+// padding preceding counter values, when reading multiple counters with one
+// syscall (which is desirable). PerfCounterValues abstracts these details.
+// The implementation ensures the storage is inlined, and allows 0-based
+// indexing into the counter values.
+// The object is used in conjunction with a PerfCounters object, by passing it
+// to Snapshot(). The Read() method relocates individual reads, discarding
+// the initial padding from each group leader in the values buffer such that
+// all user accesses through the [] operator are correct.
+class BENCHMARK_EXPORT PerfCounterValues {
+ public:
+  explicit PerfCounterValues(size_t nr_counters) : nr_counters_(nr_counters) {
+    BM_CHECK_LE(nr_counters_, kMaxCounters);  // storage below is fixed-size
+  }
+
+  // Unchecked access; Read() already discarded the padding, so pos is 0-based
+  uint64_t operator[](size_t pos) const { return values_[pos]; }
+
+  // Upper bound on counters per object; values_ is a fixed-size
+  // std::array<> sized from this constant
+  static constexpr size_t kMaxCounters = 32;
+
+ private:
+  friend class PerfCounters;
+  // Get the byte buffer in which perf counters can be captured.
+  // NOTE(review): no caller is visible in this header - confirm it is used
+  std::pair<char*, size_t> get_data_buffer() {
+    return {reinterpret_cast<char*>(values_.data()),
+            sizeof(uint64_t) * (kPadding + nr_counters_)};
+  }
+
+  // Reading is intricate, and since the goal of this class is to
+  // abstract away the intricacies of the reading process, the logic
+  // lives here rather than in PerfCounters
+  size_t Read(const std::vector<int>& leaders);
+
+  // Two spare slots for the reading algorithm (the 1st padding plus the
+  // padding of the read currently in progress)
+  static constexpr size_t kPadding = 2;
+  std::array<uint64_t, kPadding + kMaxCounters> values_;
+  const size_t nr_counters_;
+};
+
+// Collect PMU counters. The object, once constructed, is ready to be used by
+// calling Snapshot(). PMU counter collection is enabled from the time Create()
+// is called, to obtain the object, until the object's destructor is called.
+class BENCHMARK_EXPORT PerfCounters final {
+ public:
+  // True iff this platform supports performance counters.
+  static const bool kSupported;
+
+  // Returns an empty object (no names, no descriptors)
+  static PerfCounters NoCounters() { return PerfCounters(); }
+
+  ~PerfCounters() { CloseCounters(); }
+  PerfCounters() = default;
+  PerfCounters(PerfCounters&&) = default;
+  PerfCounters(const PerfCounters&) = delete;
+  PerfCounters& operator=(PerfCounters&&) noexcept;
+  PerfCounters& operator=(const PerfCounters&) = delete;
+
+  // Platform-specific implementations may choose to do some library
+  // initialization here.
+  static bool Initialize();
+
+  // Check if the given counter is supported, in case the app wants to
+  // check before passing the name to Create()
+  static bool IsCounterSupported(const std::string& name);
+
+  // Return a PerfCounters object ready to read the counters with the names
+  // specified. The values are user-mode only. The counter name format is
+  // implementation and OS specific.
+  // In case of failure, this method will in the worst case return an
+  // empty object whose state will still be valid.
+  static PerfCounters Create(const std::vector<std::string>& counter_names);
+
+  // Take a snapshot of the current value of the counters into the provided
+  // valid PerfCounterValues storage. The values are populated such that:
+  // names()[i]'s value is (*values)[i]
+  BENCHMARK_ALWAYS_INLINE bool Snapshot(PerfCounterValues* values) const {
+#ifndef BENCHMARK_OS_WINDOWS
+    assert(values != nullptr);
+    return values->Read(leader_ids_) == counter_ids_.size();  // all or nothing
+#else
+    (void)values;
+    return false;
+#endif
+  }
+
+  const std::vector<std::string>& names() const { return counter_names_; }
+  size_t num_counters() const { return counter_names_.size(); }
+
+ private:
+  PerfCounters(const std::vector<std::string>& counter_names,
+               std::vector<int>&& counter_ids, std::vector<int>&& leader_ids)
+      : counter_ids_(std::move(counter_ids)),
+        leader_ids_(std::move(leader_ids)),
+        counter_names_(counter_names) {}  // names are copied, ids are moved
+
+  void CloseCounters() const;
+
+  std::vector<int> counter_ids_;
+  std::vector<int> leader_ids_;
+  std::vector<std::string> counter_names_;
+};
+
+// Typical usage of the above primitives: snapshot in Start(), diff in Stop().
+class BENCHMARK_EXPORT PerfCountersMeasurement final {
+ public:
+  PerfCountersMeasurement(const std::vector<std::string>& counter_names);
+
+  size_t num_counters() const { return counters_.num_counters(); }
+
+  std::vector<std::string> names() const { return counters_.names(); }
+
+  BENCHMARK_ALWAYS_INLINE bool Start() {
+    if (num_counters() == 0) return true;  // nothing to measure
+    // Tell the compiler to not move instructions above/below where we take
+    // the snapshot.
+    ClobberMemory();
+    valid_read_ &= counters_.Snapshot(&start_values_);  // sticky failure flag
+    ClobberMemory();
+
+    return valid_read_;
+  }
+
+  BENCHMARK_ALWAYS_INLINE bool Stop(
+      std::vector<std::pair<std::string, double>>& measurements) {
+    if (num_counters() == 0) return true;  // nothing to measure
+    // Tell the compiler to not move instructions above/below where we take
+    // the snapshot.
+    ClobberMemory();
+    valid_read_ &= counters_.Snapshot(&end_values_);  // sticky failure flag
+    ClobberMemory();
+    // Append one (name, end - start) pair per counter, even if a read failed.
+    for (size_t i = 0; i < counters_.names().size(); ++i) {
+      double measurement = static_cast<double>(end_values_[i]) -
+                           static_cast<double>(start_values_[i]);
+      measurements.push_back({counters_.names()[i], measurement});
+    }
+
+    return valid_read_;
+  }
+
+ private:
+  PerfCounters counters_;
+  bool valid_read_ = true;  // cleared by the first failed Snapshot()
+  PerfCounterValues start_values_;
+  PerfCounterValues end_values_;
+};
+
+BENCHMARK_UNUSED static bool perf_init_anchor = PerfCounters::Initialize();
+
+}  // namespace internal
+}  // namespace benchmark
+
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+#endif  // BENCHMARK_PERF_COUNTERS_H
diff --git a/third_party/google_benchmark/src/re.h b/third_party/google_benchmark/src/src/re.h
similarity index 98%
rename from third_party/google_benchmark/src/re.h
rename to third_party/google_benchmark/src/src/re.h
index fbe2503..6300467 100644
--- a/third_party/google_benchmark/src/re.h
+++ b/third_party/google_benchmark/src/src/re.h
@@ -126,7 +126,7 @@
 
       // regerror returns the number of bytes necessary to null terminate
       // the string, so we move that when assigning to error.
-      CHECK_NE(needed, 0);
+      BM_CHECK_NE(needed, 0);
       error->assign(errbuf, needed - 1);
 
       delete[] errbuf;
diff --git a/third_party/google_benchmark/src/reporter.cc b/third_party/google_benchmark/src/src/reporter.cc
similarity index 76%
rename from third_party/google_benchmark/src/reporter.cc
rename to third_party/google_benchmark/src/src/reporter.cc
index cca6a11..076bc31 100644
--- a/third_party/google_benchmark/src/reporter.cc
+++ b/third_party/google_benchmark/src/src/reporter.cc
@@ -12,66 +12,35 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "benchmark/benchmark.h"
-#include "timers.h"
-
 #include <cstdlib>
-
 #include <iostream>
+#include <map>
+#include <string>
 #include <tuple>
 #include <vector>
 
+#include "benchmark/benchmark.h"
 #include "check.h"
-#include "starboard/log.h"
 #include "string_util.h"
+#include "timers.h"
 
 namespace benchmark {
-namespace {
-
-class sblog_ostreambuf : public std::streambuf {
- public:
-  explicit sblog_ostreambuf(SbLogPriority priority) : priority_(priority) {}
-
-  std::streamsize xsputn(const char_type *s, std::streamsize n) override {
-    buffer_.insert(buffer_.end(), s, s + n);
-    if (buffer_.back() == '\n') {
-      SbLog(priority_, buffer_.c_str());
-      buffer_.clear();
-    }
-    return n;
-  }
-
- private:
-  SbLogPriority priority_;
-  std::string buffer_;
-};
-
-std::ostream *GetOutputStream() {
-  static sblog_ostreambuf streambuf(kSbLogPriorityInfo);
-  static std::ostream os(&streambuf);
-  return &os;
-}
-
-std::ostream *GetErrorStream() {
-  static sblog_ostreambuf streambuf(kSbLogPriorityError);
-  static std::ostream os(&streambuf);
-  return &os;
-}
-
-}  // namespace
 
 BenchmarkReporter::BenchmarkReporter()
-    : output_stream_(benchmark::GetOutputStream()),
-      error_stream_(benchmark::GetErrorStream()) {}
+    : output_stream_(&std::cout), error_stream_(&std::cerr) {}
 
 BenchmarkReporter::~BenchmarkReporter() {}
 
 void BenchmarkReporter::PrintBasicContext(std::ostream *out,
                                           Context const &context) {
-  CHECK(out) << "cannot be null";
+  BM_CHECK(out) << "cannot be null";
   auto &Out = *out;
 
+#ifndef BENCHMARK_OS_QURT
+  // Date/time information is not available on QuRT.
+  // Attempting to get it via this call causes the binary to crash.
   Out << LocalDateTimeString() << "\n";
+#endif
 
   if (context.executable_name)
     Out << "Running " << context.executable_name << "\n";
@@ -99,7 +68,16 @@
     Out << "\n";
   }
 
-  if (info.scaling_enabled) {
+  std::map<std::string, std::string> *global_context =
+      internal::GetGlobalContext();
+
+  if (global_context != nullptr) {
+    for (const auto &kv : *global_context) {
+      Out << kv.first << ": " << kv.second << "\n";
+    }
+  }
+
+  if (CPUInfo::Scaling::ENABLED == info.scaling) {
     Out << "***WARNING*** CPU scaling is enabled, the benchmark "
            "real time measurements may be noisy and will incur extra "
            "overhead.\n";
diff --git a/third_party/google_benchmark/src/statistics.cc b/third_party/google_benchmark/src/src/statistics.cc
similarity index 79%
rename from third_party/google_benchmark/src/statistics.cc
rename to third_party/google_benchmark/src/src/statistics.cc
index bd5a3d6..c4b54b2 100644
--- a/third_party/google_benchmark/src/statistics.cc
+++ b/third_party/google_benchmark/src/src/statistics.cc
@@ -13,15 +13,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "benchmark/benchmark.h"
+#include "statistics.h"
 
 #include <algorithm>
 #include <cmath>
 #include <numeric>
 #include <string>
 #include <vector>
+
+#include "benchmark/benchmark.h"
 #include "check.h"
-#include "statistics.h"
 
 namespace benchmark {
 
@@ -74,14 +75,22 @@
   return Sqrt(v.size() / (v.size() - 1.0) * (avg_squares - Sqr(mean)));
 }
 
+double StatisticsCV(const std::vector<double>& v) {
+  // Coefficient of variation (stddev / mean); 0.0 when it is not computable.
+  if (v.size() < 2) return 0.0;
+  const auto mean = StatisticsMean(v);
+  // A zero mean would make the ratio inf/NaN; report 0.0 instead.
+  if (std::fpclassify(mean) == FP_ZERO) return 0.0;
+  return StatisticsStdDev(v) / mean;
+}
+
 std::vector<BenchmarkReporter::Run> ComputeStats(
     const std::vector<BenchmarkReporter::Run>& reports) {
   typedef BenchmarkReporter::Run Run;
   std::vector<Run> results;
 
-  auto error_count =
-      std::count_if(reports.begin(), reports.end(),
-                    [](Run const& run) { return run.error_occurred; });
+  auto error_count = std::count_if(reports.begin(), reports.end(),
+                                   [](Run const& run) { return run.skipped; });
 
   if (reports.size() - error_count < 2) {
     // We don't report aggregated data if there was a single run.
@@ -108,26 +117,28 @@
     for (auto const& cnt : r.counters) {
       auto it = counter_stats.find(cnt.first);
       if (it == counter_stats.end()) {
-        counter_stats.insert({cnt.first, {cnt.second, std::vector<double>{}}});
-        it = counter_stats.find(cnt.first);
+        it = counter_stats
+                 .emplace(cnt.first,
+                          CounterStat{cnt.second, std::vector<double>{}})
+                 .first;
         it->second.s.reserve(reports.size());
       } else {
-        CHECK_EQ(counter_stats[cnt.first].c.flags, cnt.second.flags);
+        BM_CHECK_EQ(it->second.c.flags, cnt.second.flags);
       }
     }
   }
 
   // Populate the accumulators.
   for (Run const& run : reports) {
-    CHECK_EQ(reports[0].benchmark_name(), run.benchmark_name());
-    CHECK_EQ(run_iterations, run.iterations);
-    if (run.error_occurred) continue;
+    BM_CHECK_EQ(reports[0].benchmark_name(), run.benchmark_name());
+    BM_CHECK_EQ(run_iterations, run.iterations);
+    if (run.skipped) continue;
     real_accumulated_time_stat.emplace_back(run.real_accumulated_time);
     cpu_accumulated_time_stat.emplace_back(run.cpu_accumulated_time);
     // user counters
     for (auto const& cnt : run.counters) {
       auto it = counter_stats.find(cnt.first);
-      CHECK_NE(it, counter_stats.end());
+      BM_CHECK_NE(it, counter_stats.end());
       it->second.s.emplace_back(cnt.second);
     }
   }
@@ -148,11 +159,14 @@
     // Get the data from the accumulator to BenchmarkReporter::Run's.
     Run data;
     data.run_name = reports[0].run_name;
+    data.family_index = reports[0].family_index;
+    data.per_family_instance_index = reports[0].per_family_instance_index;
     data.run_type = BenchmarkReporter::Run::RT_Aggregate;
     data.threads = reports[0].threads;
     data.repetitions = reports[0].repetitions;
     data.repetition_index = Run::no_repetition_index;
     data.aggregate_name = Stat.name_;
+    data.aggregate_unit = Stat.unit_;
     data.report_label = report_label;
 
     // It is incorrect to say that an aggregate is computed over
@@ -165,13 +179,15 @@
     data.real_accumulated_time = Stat.compute_(real_accumulated_time_stat);
     data.cpu_accumulated_time = Stat.compute_(cpu_accumulated_time_stat);
 
-    // We will divide these times by data.iterations when reporting, but the
-    // data.iterations is not nessesairly the scale of these measurements,
-    // because in each repetition, these timers are sum over all the iterations.
-    // And if we want to say that the stats are over N repetitions and not
-    // M iterations, we need to multiply these by (N/M).
-    data.real_accumulated_time *= iteration_rescale_factor;
-    data.cpu_accumulated_time *= iteration_rescale_factor;
+    if (data.aggregate_unit == StatisticUnit::kTime) {
+      // We will divide these times by data.iterations when reporting, but the
+      // data.iterations is not necessarily the scale of these measurements,
+      // because in each repetition, these timers are sum over all the iters.
+      // And if we want to say that the stats are over N repetitions and not
+      // M iterations, we need to multiply these by (N/M).
+      data.real_accumulated_time *= iteration_rescale_factor;
+      data.cpu_accumulated_time *= iteration_rescale_factor;
+    }
 
     data.time_unit = reports[0].time_unit;
 
diff --git a/third_party/google_benchmark/src/statistics.h b/third_party/google_benchmark/src/src/statistics.h
similarity index 75%
rename from third_party/google_benchmark/src/statistics.h
rename to third_party/google_benchmark/src/src/statistics.h
index 7eccc85..6e5560e 100644
--- a/third_party/google_benchmark/src/statistics.h
+++ b/third_party/google_benchmark/src/src/statistics.h
@@ -22,15 +22,22 @@
 
 namespace benchmark {
 
-// Return a vector containing the mean, median and standard devation information
-// (and any user-specified info) for the specified list of reports. If 'reports'
-// contains less than two non-errored runs an empty vector is returned
+// Return a vector containing the mean, median and standard deviation
+// information (and any user-specified info) for the specified list of reports.
+// If 'reports' contains fewer than two non-skipped runs, an empty vector is
+// returned.
+BENCHMARK_EXPORT
 std::vector<BenchmarkReporter::Run> ComputeStats(
     const std::vector<BenchmarkReporter::Run>& reports);
 
+BENCHMARK_EXPORT
 double StatisticsMean(const std::vector<double>& v);
+BENCHMARK_EXPORT
 double StatisticsMedian(const std::vector<double>& v);
+BENCHMARK_EXPORT
 double StatisticsStdDev(const std::vector<double>& v);
+BENCHMARK_EXPORT
+double StatisticsCV(const std::vector<double>& v);
 
 }  // end namespace benchmark
 
diff --git a/third_party/google_benchmark/src/string_util.cc b/third_party/google_benchmark/src/src/string_util.cc
similarity index 83%
rename from third_party/google_benchmark/src/string_util.cc
rename to third_party/google_benchmark/src/src/string_util.cc
index 39b01a1..5e2d24a 100644
--- a/third_party/google_benchmark/src/string_util.cc
+++ b/third_party/google_benchmark/src/src/string_util.cc
@@ -1,6 +1,9 @@
 #include "string_util.h"
 
 #include <array>
+#ifdef BENCHMARK_STL_ANDROID_GNUSTL
+#include <cerrno>
+#endif
 #include <cmath>
 #include <cstdarg>
 #include <cstdio>
@@ -91,10 +94,10 @@
 
   const char* array =
       (exponent > 0 ? (iec ? kBigIECUnits : kBigSIUnits) : kSmallSIUnits);
-  if (iec)
+  if (iec) {
     return array[index] + std::string("i");
-  else
-    return std::string(1, array[index]);
+  }
+  return std::string(1, array[index]);
 }
 
 std::string ToBinaryStringFullySpecified(double value, double threshold,
@@ -130,25 +133,25 @@
   // TODO(ericwf): use std::array for first attempt to avoid one memory
   // allocation guess what the size might be
   std::array<char, 256> local_buff;
-  std::size_t size = local_buff.size();
+
   // 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation
   // in the android-ndk
-  auto ret = vsnprintf(local_buff.data(), size, msg, args_cp);
+  auto ret = vsnprintf(local_buff.data(), local_buff.size(), msg, args_cp);
 
   va_end(args_cp);
 
   // handle empty expansion
   if (ret == 0) return std::string{};
-  if (static_cast<std::size_t>(ret) < size)
+  if (static_cast<std::size_t>(ret) < local_buff.size())
     return std::string(local_buff.data());
 
   // we did not provide a long enough buffer on our first attempt.
   // add 1 to size to account for null-byte in size cast to prevent overflow
-  size = static_cast<std::size_t>(ret) + 1;
+  std::size_t size = static_cast<std::size_t>(ret) + 1;
   auto buff_ptr = std::unique_ptr<char[]>(new char[size]);
   // 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation
   // in the android-ndk
-  ret = vsnprintf(buff_ptr.get(), size, msg, args);
+  vsnprintf(buff_ptr.get(), size, msg, args);
   return std::string(buff_ptr.get());
 }
 
@@ -160,6 +163,19 @@
   return tmp;
 }
 
+std::vector<std::string> StrSplit(const std::string& str, char delim) {
+  std::vector<std::string> pieces;
+  if (str.empty()) return pieces;  // empty input yields no pieces at all
+  size_t begin = 0;
+  size_t end = 0;
+  while ((end = str.find(delim, begin)) != std::string::npos) {
+    pieces.push_back(str.substr(begin, end - begin));  // may be empty
+    begin = end + 1;
+  }
+  pieces.push_back(str.substr(begin));  // remainder after the last delimiter
+  return pieces;
+}
+
 #ifdef BENCHMARK_STL_ANDROID_GNUSTL
 /*
  * GNU STL in Android NDK lacks support for some C++11 functions, including
@@ -182,11 +198,10 @@
 
   /* Check for errors and return */
   if (strtoulErrno == ERANGE) {
-    throw std::out_of_range(
-      "stoul failed: " + str + " is outside of range of unsigned long");
+    throw std::out_of_range("stoul failed: " + str +
+                            " is outside of range of unsigned long");
   } else if (strEnd == strStart || strtoulErrno != 0) {
-    throw std::invalid_argument(
-      "stoul failed: " + str + " is not an integer");
+    throw std::invalid_argument("stoul failed: " + str + " is not an integer");
   }
   if (pos != nullptr) {
     *pos = static_cast<size_t>(strEnd - strStart);
@@ -209,11 +224,10 @@
 
   /* Check for errors and return */
   if (strtolErrno == ERANGE || long(int(result)) != result) {
-    throw std::out_of_range(
-      "stoul failed: " + str + " is outside of range of int");
+    throw std::out_of_range("stoul failed: " + str +
+                            " is outside of range of int");
   } else if (strEnd == strStart || strtolErrno != 0) {
-    throw std::invalid_argument(
-      "stoul failed: " + str + " is not an integer");
+    throw std::invalid_argument("stoul failed: " + str + " is not an integer");
   }
   if (pos != nullptr) {
     *pos = static_cast<size_t>(strEnd - strStart);
@@ -236,11 +250,10 @@
 
   /* Check for errors and return */
   if (strtodErrno == ERANGE) {
-    throw std::out_of_range(
-      "stoul failed: " + str + " is outside of range of int");
+    throw std::out_of_range("stoul failed: " + str +
+                            " is outside of range of int");
   } else if (strEnd == strStart || strtodErrno != 0) {
-    throw std::invalid_argument(
-      "stoul failed: " + str + " is not an integer");
+    throw std::invalid_argument("stoul failed: " + str + " is not an integer");
   }
   if (pos != nullptr) {
     *pos = static_cast<size_t>(strEnd - strStart);
diff --git a/third_party/google_benchmark/src/string_util.h b/third_party/google_benchmark/src/src/string_util.h
similarity index 76%
rename from third_party/google_benchmark/src/string_util.h
rename to third_party/google_benchmark/src/src/string_util.h
index 09d7b4b..37bdd2e 100644
--- a/third_party/google_benchmark/src/string_util.h
+++ b/third_party/google_benchmark/src/src/string_util.h
@@ -4,6 +4,10 @@
 #include <sstream>
 #include <string>
 #include <utility>
+#include <vector>
+
+#include "benchmark/export.h"
+#include "check.h"
 #include "internal_macros.h"
 
 namespace benchmark {
@@ -12,6 +16,7 @@
 
 std::string HumanReadableNumber(double n, double one_k = 1024.0);
 
+BENCHMARK_EXPORT
 #if defined(__MINGW32__)
 __attribute__((format(__MINGW_PRINTF_FORMAT, 1, 2)))
 #elif defined(__GNUC__)
@@ -37,6 +42,11 @@
   return ss.str();
 }
 
+BENCHMARK_EXPORT
+std::vector<std::string> StrSplit(const std::string& str, char delim);
+
+// Disable lint checking for this block since it re-implements C functions.
+// NOLINTBEGIN
 #ifdef BENCHMARK_STL_ANDROID_GNUSTL
 /*
  * GNU STL in Android NDK lacks support for some C++11 functions, including
@@ -45,14 +55,15 @@
  * namespace, not std:: namespace.
  */
 unsigned long stoul(const std::string& str, size_t* pos = nullptr,
-                           int base = 10);
+                    int base = 10);
 int stoi(const std::string& str, size_t* pos = nullptr, int base = 10);
 double stod(const std::string& str, size_t* pos = nullptr);
 #else
-using std::stoul;
-using std::stoi;
-using std::stod;
+using std::stod;   // NOLINT(misc-unused-using-decls)
+using std::stoi;   // NOLINT(misc-unused-using-decls)
+using std::stoul;  // NOLINT(misc-unused-using-decls)
 #endif
+// NOLINTEND
 
 }  // end namespace benchmark
 
diff --git a/third_party/google_benchmark/src/src/sysinfo.cc b/third_party/google_benchmark/src/src/sysinfo.cc
new file mode 100644
index 0000000..4578cb0
--- /dev/null
+++ b/third_party/google_benchmark/src/src/sysinfo.cc
@@ -0,0 +1,855 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "internal_macros.h"
+
+#ifdef BENCHMARK_OS_WINDOWS
+#include <shlwapi.h>
+#undef StrCat  // Don't let StrCat in string_util.h be renamed to lstrcatA
+#include <versionhelpers.h>
+#include <windows.h>
+
+#include <codecvt>
+#else
+#include <fcntl.h>
+#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
+#include <sys/resource.h>
+#endif
+#include <sys/time.h>
+#include <sys/types.h>  // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD
+#include <unistd.h>
+#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX || \
+    defined BENCHMARK_OS_NETBSD || defined BENCHMARK_OS_OPENBSD || \
+    defined BENCHMARK_OS_DRAGONFLY
+#define BENCHMARK_HAS_SYSCTL
+#include <sys/sysctl.h>
+#endif
+#endif
+#if defined(BENCHMARK_OS_SOLARIS)
+#include <kstat.h>
+#include <netdb.h>
+#endif
+#if defined(BENCHMARK_OS_QNX)
+#include <sys/syspage.h>
+#endif
+#if defined(BENCHMARK_OS_QURT)
+#include <qurt.h>
+#endif
+#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY)
+#include <pthread.h>
+#endif
+
+#include <algorithm>
+#include <array>
+#include <bitset>
+#include <cerrno>
+#include <climits>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <limits>
+#include <locale>
+#include <memory>
+#include <random>
+#include <sstream>
+#include <utility>
+
+#include "benchmark/benchmark.h"
+#include "check.h"
+#include "cycleclock.h"
+#include "internal_macros.h"
+#include "log.h"
+#include "string_util.h"
+#include "timers.h"
+
+namespace benchmark {
+namespace {
+
+void PrintImp(std::ostream& out) { out << std::endl; }
+
+template <class First, class... Rest>
+void PrintImp(std::ostream& out, First&& f, Rest&&... rest) {
+  out << std::forward<First>(f);
+  PrintImp(out, std::forward<Rest>(rest)...);
+}
+
+template <class... Args>
+BENCHMARK_NORETURN void PrintErrorAndDie(Args&&... args) {
+  PrintImp(std::cerr, std::forward<Args>(args)...);
+  std::exit(EXIT_FAILURE);
+}
+
+#ifdef BENCHMARK_HAS_SYSCTL
+
+/// ValueUnion - A type used to correctly alias the byte-for-byte output of
+/// `sysctl` with the result type it's to be interpreted as.
+struct ValueUnion {
+  union DataT {
+    int32_t int32_value;
+    int64_t int64_value;
+    // For correct aliasing of union members from bytes.
+    char bytes[8];
+  };
+  using DataPtr = std::unique_ptr<DataT, decltype(&std::free)>;
+
+  // The size of the data union member + its trailing array size.
+  std::size_t size;
+  DataPtr buff;
+
+ public:
+  ValueUnion() : size(0), buff(nullptr, &std::free) {}
+
+  explicit ValueUnion(std::size_t buff_size)
+      : size(sizeof(DataT) + buff_size),
+        buff(::new (std::malloc(size)) DataT(), &std::free) {}
+
+  ValueUnion(ValueUnion&& other) = default;
+
+  explicit operator bool() const { return bool(buff); }
+
+  char* data() const { return buff->bytes; }
+
+  std::string GetAsString() const { return std::string(data()); }
+
+  int64_t GetAsInteger() const {
+    if (size == sizeof(buff->int32_value))
+      return buff->int32_value;
+    else if (size == sizeof(buff->int64_value))
+      return buff->int64_value;
+    BENCHMARK_UNREACHABLE();
+  }
+
+  template <class T, int N>
+  std::array<T, N> GetAsArray() {
+    const int arr_size = sizeof(T) * N;
+    BM_CHECK_LE(arr_size, size);
+    std::array<T, N> arr;
+    std::memcpy(arr.data(), data(), arr_size);
+    return arr;
+  }
+};
+
+// Queries the sysctl value identified by `name` and returns its raw bytes.
+// Returns an empty (false-y) ValueUnion when the query fails or, on OpenBSD,
+// when `name` is not one of the two keys handled below.
+ValueUnion GetSysctlImp(std::string const& name) {
+#if defined BENCHMARK_OS_OPENBSD
+  // This path goes through the numeric MIB form of sysctl(), so only the two
+  // keys that are mapped to MIB constants here are supported.
+  if ((name == "hw.ncpu") || (name == "hw.cpuspeed")) {
+    int mib[2] = {CTL_HW, (name == "hw.ncpu") ? HW_NCPU : HW_CPUSPEED};
+    ValueUnion buff(sizeof(int));
+    // buff.size is sysctl's in/out length argument: the buffer capacity going
+    // in, the number of bytes stored coming out.
+    if (sysctl(mib, 2, buff.data(), &buff.size, nullptr, 0) == -1) {
+      return ValueUnion();
+    }
+    return buff;
+  }
+  return ValueUnion();
+#else
+  // First call passes no buffer: it only asks for the size of the value.
+  std::size_t cur_buff_size = 0;
+  if (sysctlbyname(name.c_str(), nullptr, &cur_buff_size, nullptr, 0) == -1)
+    return ValueUnion();
+
+  // Second call fetches the value; buff.size is the in/out length argument.
+  ValueUnion buff(cur_buff_size);
+  if (sysctlbyname(name.c_str(), buff.data(), &buff.size, nullptr, 0) == 0)
+    return buff;
+  return ValueUnion();
+#endif
+}
+
+BENCHMARK_MAYBE_UNUSED
+bool GetSysctl(std::string const& name, std::string* out) {
+  out->clear();
+  auto buff = GetSysctlImp(name);
+  if (!buff) return false;
+  out->assign(buff.data());
+  return true;
+}
+
+template <class Tp,
+          class = typename std::enable_if<std::is_integral<Tp>::value>::type>
+bool GetSysctl(std::string const& name, Tp* out) {
+  *out = 0;
+  auto buff = GetSysctlImp(name);
+  if (!buff) return false;
+  *out = static_cast<Tp>(buff.GetAsInteger());
+  return true;
+}
+
+template <class Tp, size_t N>
+bool GetSysctl(std::string const& name, std::array<Tp, N>* out) {
+  auto buff = GetSysctlImp(name);
+  if (!buff) return false;
+  *out = buff.GetAsArray<Tp, N>();
+  return true;
+}
+#endif
+
+template <class ArgT>
+bool ReadFromFile(std::string const& fname, ArgT* arg) {
+  *arg = ArgT();
+  std::ifstream f(fname.c_str());
+  if (!f.is_open()) return false;
+  f >> *arg;
+  return f.good();
+}
+
+CPUInfo::Scaling CpuScaling(int num_cpus) {
+  // We don't have a valid CPU count, so don't even bother.
+  if (num_cpus <= 0) return CPUInfo::Scaling::UNKNOWN;
+#if defined(BENCHMARK_OS_QNX)
+  return CPUInfo::Scaling::UNKNOWN;
+#elif !defined(BENCHMARK_OS_WINDOWS)
+  // On Linux, the CPUfreq subsystem exposes CPU information as files on the
+  // local file system. If reading the exported files fails, then we may not be
+  // running on Linux, so we silently ignore all the read errors.
+  std::string res;
+  for (int cpu = 0; cpu < num_cpus; ++cpu) {
+    std::string governor_file =
+        StrCat("/sys/devices/system/cpu/cpu", cpu, "/cpufreq/scaling_governor");
+    if (ReadFromFile(governor_file, &res) && res != "performance")
+      return CPUInfo::Scaling::ENABLED;
+  }
+  return CPUInfo::Scaling::DISABLED;
+#else
+  return CPUInfo::Scaling::UNKNOWN;
+#endif
+}
+
+int CountSetBitsInCPUMap(std::string val) {
+  auto CountBits = [](std::string part) {
+    using CPUMask = std::bitset<sizeof(std::uintptr_t) * CHAR_BIT>;
+    part = "0x" + part;
+    CPUMask mask(benchmark::stoul(part, nullptr, 16));
+    return static_cast<int>(mask.count());
+  };
+  std::size_t pos;
+  int total = 0;
+  while ((pos = val.find(',')) != std::string::npos) {
+    total += CountBits(val.substr(0, pos));
+    val = val.substr(pos + 1);
+  }
+  if (!val.empty()) {
+    total += CountBits(val);
+  }
+  return total;
+}
+
+BENCHMARK_MAYBE_UNUSED
+std::vector<CPUInfo::CacheInfo> GetCacheSizesFromKVFS() {
+  std::vector<CPUInfo::CacheInfo> res;
+  std::string dir = "/sys/devices/system/cpu/cpu0/cache/";
+  int idx = 0;
+  while (true) {
+    CPUInfo::CacheInfo info;
+    std::string fpath = StrCat(dir, "index", idx++, "/");
+    std::ifstream f(StrCat(fpath, "size").c_str());
+    if (!f.is_open()) break;
+    std::string suffix;
+    f >> info.size;
+    if (f.fail())
+      PrintErrorAndDie("Failed while reading file '", fpath, "size'");
+    if (f.good()) {
+      f >> suffix;
+      if (f.bad())
+        PrintErrorAndDie(
+            "Invalid cache size format: failed to read size suffix");
+      else if (f && suffix != "K")
+        PrintErrorAndDie("Invalid cache size format: Expected bytes ", suffix);
+      else if (suffix == "K")
+        info.size *= 1024;
+    }
+    if (!ReadFromFile(StrCat(fpath, "type"), &info.type))
+      PrintErrorAndDie("Failed to read from file ", fpath, "type");
+    if (!ReadFromFile(StrCat(fpath, "level"), &info.level))
+      PrintErrorAndDie("Failed to read from file ", fpath, "level");
+    std::string map_str;
+    if (!ReadFromFile(StrCat(fpath, "shared_cpu_map"), &map_str))
+      PrintErrorAndDie("Failed to read from file ", fpath, "shared_cpu_map");
+    info.num_sharing = CountSetBitsInCPUMap(map_str);
+    res.push_back(info);
+  }
+
+  return res;
+}
+
+#ifdef BENCHMARK_OS_MACOSX
+std::vector<CPUInfo::CacheInfo> GetCacheSizesMacOSX() {
+  std::vector<CPUInfo::CacheInfo> res;
+  std::array<int, 4> cache_counts{{0, 0, 0, 0}};
+  GetSysctl("hw.cacheconfig", &cache_counts);
+
+  struct {
+    std::string name;
+    std::string type;
+    int level;
+    int num_sharing;
+  } cases[] = {{"hw.l1dcachesize", "Data", 1, cache_counts[1]},
+               {"hw.l1icachesize", "Instruction", 1, cache_counts[1]},
+               {"hw.l2cachesize", "Unified", 2, cache_counts[2]},
+               {"hw.l3cachesize", "Unified", 3, cache_counts[3]}};
+  for (auto& c : cases) {
+    int val;
+    if (!GetSysctl(c.name, &val)) continue;
+    CPUInfo::CacheInfo info;
+    info.type = c.type;
+    info.level = c.level;
+    info.size = val;
+    info.num_sharing = c.num_sharing;
+    res.push_back(std::move(info));
+  }
+  return res;
+}
+#elif defined(BENCHMARK_OS_WINDOWS)
+std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
+  std::vector<CPUInfo::CacheInfo> res;
+  DWORD buffer_size = 0;
+  using PInfo = SYSTEM_LOGICAL_PROCESSOR_INFORMATION;
+  using CInfo = CACHE_DESCRIPTOR;
+
+  using UPtr = std::unique_ptr<PInfo, decltype(&std::free)>;
+  GetLogicalProcessorInformation(nullptr, &buffer_size);
+  UPtr buff((PInfo*)malloc(buffer_size), &std::free);
+  if (!GetLogicalProcessorInformation(buff.get(), &buffer_size))
+    PrintErrorAndDie("Failed during call to GetLogicalProcessorInformation: ",
+                     GetLastError());
+
+  PInfo* it = buff.get();
+  PInfo* end = buff.get() + (buffer_size / sizeof(PInfo));
+
+  for (; it != end; ++it) {
+    if (it->Relationship != RelationCache) continue;
+    using BitSet = std::bitset<sizeof(ULONG_PTR) * CHAR_BIT>;
+    BitSet b(it->ProcessorMask);
+    // To prevent duplicates, only consider caches where CPU 0 is specified
+    if (!b.test(0)) continue;
+    const CInfo& cache = it->Cache;
+    CPUInfo::CacheInfo C;
+    C.num_sharing = static_cast<int>(b.count());
+    C.level = cache.Level;
+    C.size = cache.Size;
+    C.type = "Unknown";
+    switch (cache.Type) {
+      case CacheUnified:
+        C.type = "Unified";
+        break;
+      case CacheInstruction:
+        C.type = "Instruction";
+        break;
+      case CacheData:
+        C.type = "Data";
+        break;
+      case CacheTrace:
+        C.type = "Trace";
+        break;
+    }
+    res.push_back(C);
+  }
+  return res;
+}
+#elif BENCHMARK_OS_QNX
+// Lists the caches described by the QNX system page `cacheattr` entries.
+std::vector<CPUInfo::CacheInfo> GetCacheSizesQNX() {
+  std::vector<CPUInfo::CacheInfo> res;
+  struct cacheattr_entry* cache = SYSPAGE_ENTRY(cacheattr);
+  uint32_t const elsize = SYSPAGE_ELEMENT_SIZE(cacheattr);
+  int num = SYSPAGE_ENTRY_SIZE(cacheattr) / elsize;
+  for (int i = 0; i < num;  // Step `cache` here so `continue` cannot skip it.
+       ++i, cache = SYSPAGE_ARRAY_ADJ_OFFSET(cacheattr, cache, elsize)) {
+    CPUInfo::CacheInfo info;
+    switch (cache->flags) {
+      case CACHE_FLAG_INSTR:
+        info.type = "Instruction";
+        info.level = 1;
+        break;
+      case CACHE_FLAG_DATA:
+        info.type = "Data";
+        info.level = 1;
+        break;
+      case CACHE_FLAG_UNIFIED:
+        info.type = "Unified";
+        info.level = 2;
+        break;
+      case CACHE_FLAG_SHARED:
+        info.type = "Shared";
+        info.level = 3;
+        break;
+      default:  // Flags match none of the kinds above: skip this entry.
+        continue;
+    }
+    info.size = cache->line_size * cache->num_lines;
+    info.num_sharing = 0;
+    res.push_back(std::move(info));
+  }
+  return res;
+}
+#endif
+
+std::vector<CPUInfo::CacheInfo> GetCacheSizes() {
+#ifdef BENCHMARK_OS_MACOSX
+  return GetCacheSizesMacOSX();
+#elif defined(BENCHMARK_OS_WINDOWS)
+  return GetCacheSizesWindows();
+#elif defined(BENCHMARK_OS_QNX)
+  return GetCacheSizesQNX();
+#elif defined(BENCHMARK_OS_QURT)
+  return std::vector<CPUInfo::CacheInfo>();
+#else
+  return GetCacheSizesFromKVFS();
+#endif
+}
+
+// Returns the machine name ("" if the OS lookup fails; a fixed label on QuRT).
+std::string GetSystemName() {
+#if defined(BENCHMARK_OS_WINDOWS)
+  std::string str;
+  static constexpr int COUNT = MAX_COMPUTERNAME_LENGTH + 1;
+  TCHAR hostname[COUNT] = {'\0'};
+  DWORD DWCOUNT = COUNT;
+  if (!GetComputerName(hostname, &DWCOUNT)) return std::string("");
+#ifndef UNICODE
+  str = std::string(hostname, DWCOUNT);
+#else
+  // `WideCharToMultiByte` returns `0` when conversion fails.
+  int len = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, hostname,
+                                DWCOUNT, NULL, 0, NULL, NULL);
+  str.resize(len);
+  WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, hostname, DWCOUNT, &str[0],
+                      str.size(), NULL, NULL);
+#endif
+  return str;
+#elif defined(BENCHMARK_OS_QURT)
+  std::string str = "Hexagon DSP";
+  qurt_arch_version_t arch_version_struct;
+  if (qurt_sysenv_get_arch_version(&arch_version_struct) == QURT_EOK) {
+    str += " v";
+    str += std::to_string(arch_version_struct.arch_version);
+  }
+  return str;
+#else
+#ifndef HOST_NAME_MAX
+// BSD/Mac doesn't have HOST_NAME_MAX defined
+#if defined(BENCHMARK_HAS_SYSCTL) || defined(BENCHMARK_OS_NACL)
+#define HOST_NAME_MAX 64
+#elif defined(BENCHMARK_OS_QNX)
+#define HOST_NAME_MAX 154
+#elif defined(BENCHMARK_OS_RTEMS)
+#define HOST_NAME_MAX 256
+#elif defined(BENCHMARK_OS_SOLARIS)
+#define HOST_NAME_MAX MAXHOSTNAMELEN
+#else
+#pragma message("HOST_NAME_MAX not defined. using 64")
+#define HOST_NAME_MAX 64
+#endif
+#endif  // ndef HOST_NAME_MAX
+  char hostname[HOST_NAME_MAX + 1];  // +1 for the terminating NUL.
+  if (gethostname(hostname, sizeof(hostname)) != 0) return std::string("");
+  hostname[HOST_NAME_MAX] = '\0';  // Truncated names may be unterminated.
+  return std::string(hostname);
+#endif  // Catch-all POSIX block.
+}
+
+int GetNumCPUs() {
+#ifdef BENCHMARK_HAS_SYSCTL
+  int num_cpu = -1;
+  if (GetSysctl("hw.ncpu", &num_cpu)) return num_cpu;
+  fprintf(stderr, "Err: %s\n", strerror(errno));
+  std::exit(EXIT_FAILURE);
+#elif defined(BENCHMARK_OS_WINDOWS)
+  SYSTEM_INFO sysinfo;
+  // Use memset as opposed to = {} to avoid GCC missing initializer false
+  // positives.
+  std::memset(&sysinfo, 0, sizeof(SYSTEM_INFO));
+  GetSystemInfo(&sysinfo);
+  return sysinfo.dwNumberOfProcessors;  // number of logical
+                                        // processors in the current
+                                        // group
+#elif defined(BENCHMARK_OS_SOLARIS)
+  // Returns -1 in case of a failure.
+  long num_cpu = sysconf(_SC_NPROCESSORS_ONLN);
+  if (num_cpu < 0) {
+    fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed with error: %s\n",
+            strerror(errno));
+  }
+  return (int)num_cpu;
+#elif defined(BENCHMARK_OS_QNX)
+  return static_cast<int>(_syspage_ptr->num_cpu);
+#elif defined(BENCHMARK_OS_QURT)
+  qurt_sysenv_max_hthreads_t hardware_threads;
+  if (qurt_sysenv_get_max_hw_threads(&hardware_threads) != QURT_EOK) {
+    hardware_threads.max_hthreads = 1;
+  }
+  return hardware_threads.max_hthreads;
+#else
+  int num_cpus = 0;
+  int max_id = -1;
+  std::ifstream f("/proc/cpuinfo");
+  if (!f.is_open()) {
+    std::cerr << "failed to open /proc/cpuinfo\n";
+    return -1;
+  }
+  const std::string Key = "processor";
+  std::string ln;
+  while (std::getline(f, ln)) {
+    if (ln.empty()) continue;
+    std::size_t split_idx = ln.find(':');
+    std::string value;
+#if defined(__s390__)
+    // s390 has another format in /proc/cpuinfo
+    // it needs to be parsed differently
+    if (split_idx != std::string::npos)
+      value = ln.substr(Key.size() + 1, split_idx - Key.size() - 1);
+#else
+    if (split_idx != std::string::npos) value = ln.substr(split_idx + 1);
+#endif
+    if (ln.size() >= Key.size() && ln.compare(0, Key.size(), Key) == 0) {
+      num_cpus++;
+      if (!value.empty()) {
+        const int cur_id = benchmark::stoi(value);
+        max_id = std::max(cur_id, max_id);
+      }
+    }
+  }
+  if (f.bad()) {
+    std::cerr << "Failure reading /proc/cpuinfo\n";
+    return -1;
+  }
+  if (!f.eof()) {
+    std::cerr << "Failed to read to end of /proc/cpuinfo\n";
+    return -1;
+  }
+  f.close();
+
+  if ((max_id + 1) != num_cpus) {
+    fprintf(stderr,
+            "CPU ID assignments in /proc/cpuinfo seem messed up."
+            " This is usually caused by a bad BIOS.\n");
+  }
+  return num_cpus;
+#endif
+  BENCHMARK_UNREACHABLE();
+}
+
+class ThreadAffinityGuard final {
+ public:
+  ThreadAffinityGuard() : reset_affinity(SetAffinity()) {
+    if (!reset_affinity)
+      std::cerr << "***WARNING*** Failed to set thread affinity. Estimated CPU "
+                   "frequency may be incorrect."
+                << std::endl;
+  }
+
+  ~ThreadAffinityGuard() {
+    if (!reset_affinity) return;
+
+#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY)
+    int ret = pthread_setaffinity_np(self, sizeof(previous_affinity),
+                                     &previous_affinity);
+    if (ret == 0) return;
+#elif defined(BENCHMARK_OS_WINDOWS_WIN32)
+    DWORD_PTR ret = SetThreadAffinityMask(self, previous_affinity);
+    if (ret != 0) return;
+#endif  // def BENCHMARK_HAS_PTHREAD_AFFINITY
+    PrintErrorAndDie("Failed to reset thread affinity");
+  }
+
+  ThreadAffinityGuard(ThreadAffinityGuard&&) = delete;
+  ThreadAffinityGuard(const ThreadAffinityGuard&) = delete;
+  ThreadAffinityGuard& operator=(ThreadAffinityGuard&&) = delete;
+  ThreadAffinityGuard& operator=(const ThreadAffinityGuard&) = delete;
+
+ private:
+  bool SetAffinity() {
+#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY)
+    int ret;
+    self = pthread_self();
+    ret = pthread_getaffinity_np(self, sizeof(previous_affinity),
+                                 &previous_affinity);
+    if (ret != 0) return false;
+
+    cpu_set_t affinity;
+    memcpy(&affinity, &previous_affinity, sizeof(affinity));
+
+    bool is_first_cpu = true;
+
+    for (int i = 0; i < CPU_SETSIZE; ++i)
+      if (CPU_ISSET(i, &affinity)) {
+        if (is_first_cpu)
+          is_first_cpu = false;
+        else
+          CPU_CLR(i, &affinity);
+      }
+
+    if (is_first_cpu) return false;
+
+    ret = pthread_setaffinity_np(self, sizeof(affinity), &affinity);
+    return ret == 0;
+#elif defined(BENCHMARK_OS_WINDOWS_WIN32)
+    self = GetCurrentThread();
+    DWORD_PTR mask = static_cast<DWORD_PTR>(1) << GetCurrentProcessorNumber();
+    previous_affinity = SetThreadAffinityMask(self, mask);
+    return previous_affinity != 0;
+#else
+    return false;
+#endif  // def BENCHMARK_HAS_PTHREAD_AFFINITY
+  }
+
+#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY)
+  pthread_t self;
+  cpu_set_t previous_affinity;
+#elif defined(BENCHMARK_OS_WINDOWS_WIN32)
+  HANDLE self;
+  DWORD_PTR previous_affinity;
+#endif  // def BENCHMARK_HAS_PTHREAD_AFFINITY
+  bool reset_affinity;
+};
+
+double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
+  // Currently, scaling is only used on linux path here,
+  // suppress diagnostics about it being unused on other paths.
+  (void)scaling;
+
+#if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN
+  long freq;
+
+  // If the kernel is exporting the tsc frequency use that. There are issues
+  // where cpuinfo_max_freq cannot be relied on because the BIOS may be
+  // exporting an invalid p-state (on x86) or p-states may be used to put the
+  // processor in a new mode (turbo mode). Essentially, those frequencies
+  // cannot always be relied upon. The same reasons apply to /proc/cpuinfo as
+  // well.
+  if (ReadFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)
+      // If CPU scaling is disabled, use the *current* frequency.
+      // Note that we specifically don't want to read cpuinfo_cur_freq,
+      // because it is only readable by root.
+      || (scaling == CPUInfo::Scaling::DISABLED &&
+          ReadFromFile("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq",
+                       &freq))
+      // Otherwise, if CPU scaling may be in effect, we want to use
+      // the *maximum* frequency, not whatever CPU speed some random processor
+      // happens to be using now.
+      || ReadFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq",
+                      &freq)) {
+    // The value is in kHz (as the file name suggests).  For example, on a
+    // 2GHz warpstation, the file contains the value "2000000".
+    return freq * 1000.0;
+  }
+
+  const double error_value = -1;
+  double bogo_clock = error_value;
+
+  std::ifstream f("/proc/cpuinfo");
+  if (!f.is_open()) {
+    std::cerr << "failed to open /proc/cpuinfo\n";
+    return error_value;
+  }
+
+  auto StartsWithKey = [](std::string const& Value, std::string const& Key) {
+    if (Key.size() > Value.size()) return false;
+    auto Cmp = [&](char X, char Y) {
+      return std::tolower(X) == std::tolower(Y);
+    };
+    return std::equal(Key.begin(), Key.end(), Value.begin(), Cmp);
+  };
+
+  std::string ln;
+  while (std::getline(f, ln)) {
+    if (ln.empty()) continue;
+    std::size_t split_idx = ln.find(':');
+    std::string value;
+    if (split_idx != std::string::npos) value = ln.substr(split_idx + 1);
+    // When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only
+    // accept positive values. Some environments (virtual machines) report zero,
+    // which would cause infinite looping in WallTime_Init.
+    if (StartsWithKey(ln, "cpu MHz")) {
+      if (!value.empty()) {
+        double cycles_per_second = benchmark::stod(value) * 1000000.0;
+        if (cycles_per_second > 0) return cycles_per_second;
+      }
+    } else if (StartsWithKey(ln, "bogomips")) {
+      if (!value.empty()) {
+        bogo_clock = benchmark::stod(value) * 1000000.0;
+        if (bogo_clock < 0.0) bogo_clock = error_value;
+      }
+    }
+  }
+  if (f.bad()) {
+    std::cerr << "Failure reading /proc/cpuinfo\n";
+    return error_value;
+  }
+  if (!f.eof()) {
+    std::cerr << "Failed to read to end of /proc/cpuinfo\n";
+    return error_value;
+  }
+  f.close();
+  // If we found the bogomips clock, but nothing better, we'll use it (but
+  // we're not happy about it); otherwise, fallback to the rough estimation
+  // below.
+  if (bogo_clock >= 0.0) return bogo_clock;
+
+#elif defined BENCHMARK_HAS_SYSCTL
+  constexpr auto* freqStr =
+#if defined(BENCHMARK_OS_FREEBSD) || defined(BENCHMARK_OS_NETBSD)
+      "machdep.tsc_freq";
+#elif defined BENCHMARK_OS_OPENBSD
+      "hw.cpuspeed";
+#elif defined BENCHMARK_OS_DRAGONFLY
+      "hw.tsc_frequency";
+#else
+      "hw.cpufrequency";
+#endif
+  unsigned long long hz = 0;
+#if defined BENCHMARK_OS_OPENBSD
+  if (GetSysctl(freqStr, &hz)) return hz * 1000000;
+#else
+  if (GetSysctl(freqStr, &hz)) return hz;
+#endif
+  fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n",
+          freqStr, strerror(errno));
+  fprintf(stderr,
+          "This does not affect benchmark measurements, only the "
+          "metadata output.\n");
+
+#elif defined BENCHMARK_OS_WINDOWS_WIN32
+  // In NT, read MHz from the registry. If we fail to do so or we're in win9x
+  // then make a crude estimate.
+  DWORD data, data_size = sizeof(data);
+  if (IsWindowsXPOrGreater() &&
+      SUCCEEDED(
+          SHGetValueA(HKEY_LOCAL_MACHINE,
+                      "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
+                      "~MHz", nullptr, &data, &data_size)))
+    return static_cast<double>((int64_t)data *
+                               (int64_t)(1000 * 1000));  // was mhz
+#elif defined(BENCHMARK_OS_SOLARIS)
+  kstat_ctl_t* kc = kstat_open();
+  if (!kc) {
+    std::cerr << "failed to open /dev/kstat\n";
+    return -1;
+  }
+  kstat_t* ksp = kstat_lookup(kc, const_cast<char*>("cpu_info"), -1,
+                              const_cast<char*>("cpu_info0"));
+  if (!ksp) {
+    std::cerr << "failed to lookup in /dev/kstat\n";
+    return -1;
+  }
+  if (kstat_read(kc, ksp, NULL) < 0) {
+    std::cerr << "failed to read from /dev/kstat\n";
+    return -1;
+  }
+  kstat_named_t* knp = (kstat_named_t*)kstat_data_lookup(
+      ksp, const_cast<char*>("current_clock_Hz"));
+  if (!knp) {
+    std::cerr << "failed to lookup data in /dev/kstat\n";
+    return -1;
+  }
+  if (knp->data_type != KSTAT_DATA_UINT64) {
+    std::cerr << "current_clock_Hz is of unexpected data type: "
+              << knp->data_type << "\n";
+    return -1;
+  }
+  double clock_hz = knp->value.ui64;
+  kstat_close(kc);
+  return clock_hz;
+#elif defined(BENCHMARK_OS_QNX)
+  return static_cast<double>((int64_t)(SYSPAGE_ENTRY(cpuinfo)->speed) *
+                             (int64_t)(1000 * 1000));
+#elif defined(BENCHMARK_OS_QURT)
+  // QuRT doesn't provide any API to query Hexagon frequency.
+  return 1000000000;
+#endif
+  // If we've fallen through, attempt to roughly estimate the CPU clock rate.
+
+  // Make sure to use the same cycle counter when starting and stopping the
+  // cycle timer. We just pin the current thread to a cpu in the previous
+  // affinity set.
+  ThreadAffinityGuard affinity_guard;
+
+  static constexpr double estimate_time_s = 1.0;
+  const double start_time = ChronoClockNow();
+  const auto start_ticks = cycleclock::Now();
+
+  // Impose load instead of calling sleep() to make sure the cycle counter
+  // works.
+  using PRNG = std::minstd_rand;
+  using Result = PRNG::result_type;
+  PRNG rng(static_cast<Result>(start_ticks));
+
+  Result state = 0;
+
+  do {
+    static constexpr size_t batch_size = 10000;
+    rng.discard(batch_size);
+    state += rng();
+
+  } while (ChronoClockNow() - start_time < estimate_time_s);
+
+  DoNotOptimize(state);
+
+  const auto end_ticks = cycleclock::Now();
+  const double end_time = ChronoClockNow();
+
+  return static_cast<double>(end_ticks - start_ticks) / (end_time - start_time);
+  // Reset the affinity of current thread when the lifetime of affinity_guard
+  // ends.
+}
+
+std::vector<double> GetLoadAvg() {
+#if (defined BENCHMARK_OS_FREEBSD || defined(BENCHMARK_OS_LINUX) ||     \
+     defined BENCHMARK_OS_MACOSX || defined BENCHMARK_OS_NETBSD ||      \
+     defined BENCHMARK_OS_OPENBSD || defined BENCHMARK_OS_DRAGONFLY) && \
+    !defined(__ANDROID__)
+  static constexpr int kMaxSamples = 3;
+  std::vector<double> res(kMaxSamples, 0.0);
+  const int nelem = getloadavg(res.data(), kMaxSamples);
+  if (nelem < 1) {
+    res.clear();
+  } else {
+    res.resize(nelem);
+  }
+  return res;
+#else
+  return {};
+#endif
+}
+
+}  // end namespace
+
+const CPUInfo& CPUInfo::Get() {
+  static const CPUInfo* info = new CPUInfo();
+  return *info;
+}
+
+CPUInfo::CPUInfo()
+    : num_cpus(GetNumCPUs()),
+      scaling(CpuScaling(num_cpus)),
+      cycles_per_second(GetCPUCyclesPerSecond(scaling)),
+      caches(GetCacheSizes()),
+      load_avg(GetLoadAvg()) {}
+
+const SystemInfo& SystemInfo::Get() {
+  static const SystemInfo* info = new SystemInfo();
+  return *info;
+}
+
+SystemInfo::SystemInfo() : name(GetSystemName()) {}
+}  // end namespace benchmark
diff --git a/third_party/google_benchmark/src/thread_manager.h b/third_party/google_benchmark/src/src/thread_manager.h
similarity index 94%
rename from third_party/google_benchmark/src/thread_manager.h
rename to third_party/google_benchmark/src/src/thread_manager.h
index 28e2dd5..819b3c4 100644
--- a/third_party/google_benchmark/src/thread_manager.h
+++ b/third_party/google_benchmark/src/src/thread_manager.h
@@ -36,7 +36,6 @@
                         [this]() { return alive_threads_ == 0; });
   }
 
- public:
   struct Result {
     IterationCount iterations = 0;
     double real_time_used = 0;
@@ -44,8 +43,8 @@
     double manual_time_used = 0;
     int64_t complexity_n = 0;
     std::string report_label_;
-    std::string error_message_;
-    bool has_error_ = false;
+    std::string skip_message_;
+    internal::Skipped skipped_ = internal::NotSkipped;
     UserCounters counters;
   };
   GUARDED_BY(GetBenchmarkMutex()) Result results;
diff --git a/third_party/google_benchmark/src/thread_timer.h b/third_party/google_benchmark/src/src/thread_timer.h
similarity index 95%
rename from third_party/google_benchmark/src/thread_timer.h
rename to third_party/google_benchmark/src/src/thread_timer.h
index 1703ca0..eb23f59 100644
--- a/third_party/google_benchmark/src/thread_timer.h
+++ b/third_party/google_benchmark/src/src/thread_timer.h
@@ -28,7 +28,7 @@
 
   // Called by each thread
   void StopTimer() {
-    CHECK(running_);
+    BM_CHECK(running_);
     running_ = false;
     real_time_used_ += ChronoClockNow() - start_real_time_;
     // Floating point error can result in the subtraction producing a negative
@@ -44,19 +44,19 @@
 
   // REQUIRES: timer is not running
   double real_time_used() const {
-    CHECK(!running_);
+    BM_CHECK(!running_);
     return real_time_used_;
   }
 
   // REQUIRES: timer is not running
   double cpu_time_used() const {
-    CHECK(!running_);
+    BM_CHECK(!running_);
     return cpu_time_used_;
   }
 
   // REQUIRES: timer is not running
   double manual_time_used() const {
-    CHECK(!running_);
+    BM_CHECK(!running_);
     return manual_time_used_;
   }
 
diff --git a/third_party/google_benchmark/src/timers.cc b/third_party/google_benchmark/src/src/timers.cc
similarity index 64%
rename from third_party/google_benchmark/src/timers.cc
rename to third_party/google_benchmark/src/src/timers.cc
index 7613ff9..b23feea 100644
--- a/third_party/google_benchmark/src/timers.cc
+++ b/third_party/google_benchmark/src/src/timers.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "timers.h"
+
 #include "internal_macros.h"
 
 #ifdef BENCHMARK_OS_WINDOWS
@@ -22,13 +23,14 @@
 #include <windows.h>
 #else
 #include <fcntl.h>
-#ifndef BENCHMARK_OS_FUCHSIA
+#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
 #include <sys/resource.h>
 #endif
 #include <sys/time.h>
 #include <sys/types.h>  // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD
 #include <unistd.h>
-#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX
+#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_DRAGONFLY || \
+    defined BENCHMARK_OS_MACOSX
 #include <sys/sysctl.h>
 #endif
 #if defined(BENCHMARK_OS_MACOSX)
@@ -36,6 +38,9 @@
 #include <mach/mach_port.h>
 #include <mach/thread_act.h>
 #endif
+#if defined(BENCHMARK_OS_QURT)
+#include <qurt.h>
+#endif
 #endif
 
 #ifdef BENCHMARK_OS_EMSCRIPTEN
@@ -54,7 +59,6 @@
 
 #include "check.h"
 #include "log.h"
-#include "sleep.h"
 #include "string_util.h"
 
 namespace benchmark {
@@ -63,6 +67,9 @@
 #if defined(__GNUC__)
 #pragma GCC diagnostic ignored "-Wunused-function"
 #endif
+#if defined(__NVCOMPILER)
+#pragma diag_suppress declared_but_not_referenced
+#endif
 
 namespace {
 #if defined(BENCHMARK_OS_WINDOWS)
@@ -77,7 +84,7 @@
           static_cast<double>(user.QuadPart)) *
          1e-7;
 }
-#elif !defined(BENCHMARK_OS_FUCHSIA)
+#elif !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
 double MakeTime(struct rusage const& ru) {
   return (static_cast<double>(ru.ru_utime.tv_sec) +
           static_cast<double>(ru.ru_utime.tv_usec) * 1e-6 +
@@ -117,15 +124,19 @@
                       &user_time))
     return MakeTime(kernel_time, user_time);
   DiagnoseAndExit("GetProccessTimes() failed");
+#elif defined(BENCHMARK_OS_QURT)
+  return static_cast<double>(
+             qurt_timer_timetick_to_us(qurt_timer_get_ticks())) *
+         1.0e-6;
 #elif defined(BENCHMARK_OS_EMSCRIPTEN)
   // clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) returns 0 on Emscripten.
   // Use Emscripten-specific API. Reported CPU time would be exactly the
   // same as total time, but this is ok because there aren't long-latency
-  // syncronous system calls in Emscripten.
+  // synchronous system calls in Emscripten.
   return emscripten_get_now() * 1e-3;
 #elif defined(CLOCK_PROCESS_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX)
-  // FIXME We want to use clock_gettime, but its not available in MacOS 10.11. See
-  // https://github.com/google/benchmark/pull/292
+  // FIXME We want to use clock_gettime, but it's not available in MacOS 10.11.
+  // See https://github.com/google/benchmark/pull/292
   struct timespec spec;
   if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &spec) == 0)
     return MakeTime(spec);
@@ -147,14 +158,19 @@
   GetThreadTimes(this_thread, &creation_time, &exit_time, &kernel_time,
                  &user_time);
   return MakeTime(kernel_time, user_time);
+#elif defined(BENCHMARK_OS_QURT)
+  return static_cast<double>(
+             qurt_timer_timetick_to_us(qurt_timer_get_ticks())) *
+         1.0e-6;
 #elif defined(BENCHMARK_OS_MACOSX)
-  // FIXME We want to use clock_gettime, but its not available in MacOS 10.11. See
-  // https://github.com/google/benchmark/pull/292
+  // FIXME We want to use clock_gettime, but it's not available in MacOS 10.11.
+  // See https://github.com/google/benchmark/pull/292
   mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT;
   thread_basic_info_data_t info;
   mach_port_t thread = pthread_mach_thread_np(pthread_self());
-  if (thread_info(thread, THREAD_BASIC_INFO, (thread_info_t)&info, &count) ==
-      KERN_SUCCESS) {
+  if (thread_info(thread, THREAD_BASIC_INFO,
+                  reinterpret_cast<thread_info_t>(&info),
+                  &count) == KERN_SUCCESS) {
     return MakeTime(info);
   }
   DiagnoseAndExit("ThreadCPUUsage() failed when evaluating thread_info");
@@ -178,40 +194,79 @@
 #endif
 }
 
-namespace {
-
-std::string DateTimeString(bool local) {
+std::string LocalDateTimeString() {
+  // Write the local time in RFC3339 format yyyy-mm-ddTHH:MM:SS+/-HH:MM.
   typedef std::chrono::system_clock Clock;
   std::time_t now = Clock::to_time_t(Clock::now());
-  const std::size_t kStorageSize = 128;
-  char storage[kStorageSize];
-  std::size_t written;
+  const std::size_t kTzOffsetLen = 6;
+  const std::size_t kTimestampLen = 19;
 
-  if (local) {
+  std::size_t tz_len;
+  std::size_t timestamp_len;
+  long int offset_minutes;
+  char tz_offset_sign = '+';
+  // tz_offset is set in one of three ways:
+  // * strftime with %z - This writes either nothing or the ISO 8601 offset.
+  //   The maximum length an ISO 8601 offset string can be is 7 (e.g. -03:30,
+  //   plus the terminating NUL).
+  // * snprintf with %c%02li:%02li - The maximum length is 41 (one for %c, up
+  //   to 19 for the first %02li, one for :, up to 19 for the second %02li,
+  //   plus the terminating NUL).
+  // * A fixed string of "-00:00".  The maximum length is 7 (-00:00, plus the
+  //   terminating NUL).
+  //
+  // Thus, the maximum size this needs to be is 41.
+  char tz_offset[41];
+  // Long enough buffer to avoid format-overflow warnings
+  char storage[128];
+
 #if defined(BENCHMARK_OS_WINDOWS)
-    written =
-        std::strftime(storage, sizeof(storage), "%x %X", ::localtime(&now));
+  std::tm* timeinfo_p = ::localtime(&now);
 #else
-    std::tm timeinfo;
-    ::localtime_r(&now, &timeinfo);
-    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
+  std::tm timeinfo;
+  std::tm* timeinfo_p = &timeinfo;
+  ::localtime_r(&now, &timeinfo);
 #endif
+
+  tz_len = std::strftime(tz_offset, sizeof(tz_offset), "%z", timeinfo_p);
+
+  if (tz_len < kTzOffsetLen && tz_len > 1) {
+    // Timezone offset was written. strftime writes offset as +HHMM or -HHMM,
+    // RFC3339 specifies an offset as +HH:MM or -HH:MM. To convert, we parse
+    // the offset as an integer, then reprint it to a string.
+
+    offset_minutes = ::strtol(tz_offset, NULL, 10);
+    if (offset_minutes < 0) {
+      offset_minutes *= -1;
+      tz_offset_sign = '-';
+    }
+
+    tz_len =
+        ::snprintf(tz_offset, sizeof(tz_offset), "%c%02li:%02li",
+                   tz_offset_sign, offset_minutes / 100, offset_minutes % 100);
+    BM_CHECK(tz_len == kTzOffsetLen);
+    ((void)tz_len);  // Prevent unused variable warning in optimized build.
   } else {
+    // Unknown offset. RFC3339 specifies that unknown local offsets should be
+    // written as UTC time with -00:00 timezone.
 #if defined(BENCHMARK_OS_WINDOWS)
-    written = std::strftime(storage, sizeof(storage), "%x %X", ::gmtime(&now));
+    // Potential race condition if another thread calls localtime or gmtime.
+    timeinfo_p = ::gmtime(&now);
 #else
-    std::tm timeinfo;
     ::gmtime_r(&now, &timeinfo);
-    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
 #endif
+
+    strncpy(tz_offset, "-00:00", kTzOffsetLen + 1);
   }
-  CHECK(written < kStorageSize);
-  ((void)written);  // prevent unused variable in optimized mode.
+
+  timestamp_len =
+      std::strftime(storage, sizeof(storage), "%Y-%m-%dT%H:%M:%S", timeinfo_p);
+  BM_CHECK(timestamp_len == kTimestampLen);
+  // Prevent unused variable warning in optimized build.
+  ((void)kTimestampLen);
+
+  std::strncat(storage, tz_offset, sizeof(storage) - timestamp_len - 1);
   return std::string(storage);
 }
 
-}  // end namespace
-
-std::string LocalDateTimeString() { return DateTimeString(true); }
-
 }  // end namespace benchmark
diff --git a/third_party/google_benchmark/src/timers.h b/third_party/google_benchmark/src/src/timers.h
similarity index 100%
rename from third_party/google_benchmark/src/timers.h
rename to third_party/google_benchmark/src/src/timers.h
diff --git a/third_party/google_benchmark/src/sysinfo.cc b/third_party/google_benchmark/src/sysinfo.cc
deleted file mode 100644
index 5b7c4af..0000000
--- a/third_party/google_benchmark/src/sysinfo.cc
+++ /dev/null
@@ -1,708 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "internal_macros.h"
-
-#ifdef BENCHMARK_OS_WINDOWS
-#include <shlwapi.h>
-#undef StrCat  // Don't let StrCat in string_util.h be renamed to lstrcatA
-#include <versionhelpers.h>
-#include <windows.h>
-#include <codecvt>
-#else
-#include <fcntl.h>
-#ifndef BENCHMARK_OS_FUCHSIA
-#include <sys/resource.h>
-#endif
-#include <sys/time.h>
-#include <sys/types.h>  // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD
-#include <unistd.h>
-#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX || \
-    defined BENCHMARK_OS_NETBSD || defined BENCHMARK_OS_OPENBSD
-#define BENCHMARK_HAS_SYSCTL
-#include <sys/sysctl.h>
-#endif
-#endif
-#if defined(BENCHMARK_OS_SOLARIS)
-#include <kstat.h>
-#endif
-#if defined(BENCHMARK_OS_QNX)
-#include <sys/syspage.h>
-#endif
-
-#include <algorithm>
-#include <array>
-#include <bitset>
-#include <cerrno>
-#include <climits>
-#include <cstdint>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <iterator>
-#include <limits>
-#include <memory>
-#include <sstream>
-#include <locale>
-
-#include "check.h"
-#include "cycleclock.h"
-#include "internal_macros.h"
-#include "log.h"
-#include "sleep.h"
-#include "string_util.h"
-
-namespace benchmark {
-namespace {
-
-void PrintImp(std::ostream& out) { out << std::endl; }
-
-template <class First, class... Rest>
-void PrintImp(std::ostream& out, First&& f, Rest&&... rest) {
-  out << std::forward<First>(f);
-  PrintImp(out, std::forward<Rest>(rest)...);
-}
-
-template <class... Args>
-BENCHMARK_NORETURN void PrintErrorAndDie(Args&&... args) {
-  PrintImp(std::cerr, std::forward<Args>(args)...);
-  std::exit(EXIT_FAILURE);
-}
-
-#ifdef BENCHMARK_HAS_SYSCTL
-
-/// ValueUnion - A type used to correctly alias the byte-for-byte output of
-/// `sysctl` with the result type it's to be interpreted as.
-struct ValueUnion {
-  union DataT {
-    uint32_t uint32_value;
-    uint64_t uint64_value;
-    // For correct aliasing of union members from bytes.
-    char bytes[8];
-  };
-  using DataPtr = std::unique_ptr<DataT, decltype(&std::free)>;
-
-  // The size of the data union member + its trailing array size.
-  size_t Size;
-  DataPtr Buff;
-
- public:
-  ValueUnion() : Size(0), Buff(nullptr, &std::free) {}
-
-  explicit ValueUnion(size_t BuffSize)
-      : Size(sizeof(DataT) + BuffSize),
-        Buff(::new (std::malloc(Size)) DataT(), &std::free) {}
-
-  ValueUnion(ValueUnion&& other) = default;
-
-  explicit operator bool() const { return bool(Buff); }
-
-  char* data() const { return Buff->bytes; }
-
-  std::string GetAsString() const { return std::string(data()); }
-
-  int64_t GetAsInteger() const {
-    if (Size == sizeof(Buff->uint32_value))
-      return static_cast<int32_t>(Buff->uint32_value);
-    else if (Size == sizeof(Buff->uint64_value))
-      return static_cast<int64_t>(Buff->uint64_value);
-    BENCHMARK_UNREACHABLE();
-  }
-
-  uint64_t GetAsUnsigned() const {
-    if (Size == sizeof(Buff->uint32_value))
-      return Buff->uint32_value;
-    else if (Size == sizeof(Buff->uint64_value))
-      return Buff->uint64_value;
-    BENCHMARK_UNREACHABLE();
-  }
-
-  template <class T, int N>
-  std::array<T, N> GetAsArray() {
-    const int ArrSize = sizeof(T) * N;
-    CHECK_LE(ArrSize, Size);
-    std::array<T, N> Arr;
-    std::memcpy(Arr.data(), data(), ArrSize);
-    return Arr;
-  }
-};
-
-ValueUnion GetSysctlImp(std::string const& Name) {
-#if defined BENCHMARK_OS_OPENBSD
-  int mib[2];
-
-  mib[0] = CTL_HW;
-  if ((Name == "hw.ncpu") || (Name == "hw.cpuspeed")){
-    ValueUnion buff(sizeof(int));
-
-    if (Name == "hw.ncpu") {
-      mib[1] = HW_NCPU;
-    } else {
-      mib[1] = HW_CPUSPEED;
-    }
-
-    if (sysctl(mib, 2, buff.data(), &buff.Size, nullptr, 0) == -1) {
-      return ValueUnion();
-    }
-    return buff;
-  }
-  return ValueUnion();
-#else
-  size_t CurBuffSize = 0;
-  if (sysctlbyname(Name.c_str(), nullptr, &CurBuffSize, nullptr, 0) == -1)
-    return ValueUnion();
-
-  ValueUnion buff(CurBuffSize);
-  if (sysctlbyname(Name.c_str(), buff.data(), &buff.Size, nullptr, 0) == 0)
-    return buff;
-  return ValueUnion();
-#endif
-}
-
-BENCHMARK_MAYBE_UNUSED
-bool GetSysctl(std::string const& Name, std::string* Out) {
-  Out->clear();
-  auto Buff = GetSysctlImp(Name);
-  if (!Buff) return false;
-  Out->assign(Buff.data());
-  return true;
-}
-
-template <class Tp,
-          class = typename std::enable_if<std::is_integral<Tp>::value>::type>
-bool GetSysctl(std::string const& Name, Tp* Out) {
-  *Out = 0;
-  auto Buff = GetSysctlImp(Name);
-  if (!Buff) return false;
-  *Out = static_cast<Tp>(Buff.GetAsUnsigned());
-  return true;
-}
-
-template <class Tp, size_t N>
-bool GetSysctl(std::string const& Name, std::array<Tp, N>* Out) {
-  auto Buff = GetSysctlImp(Name);
-  if (!Buff) return false;
-  *Out = Buff.GetAsArray<Tp, N>();
-  return true;
-}
-#endif
-
-template <class ArgT>
-bool ReadFromFile(std::string const& fname, ArgT* arg) {
-  *arg = ArgT();
-  std::ifstream f(fname.c_str());
-  if (!f.is_open()) return false;
-  f >> *arg;
-  return f.good();
-}
-
-bool CpuScalingEnabled(int num_cpus) {
-  // We don't have a valid CPU count, so don't even bother.
-  if (num_cpus <= 0) return false;
-#ifdef BENCHMARK_OS_QNX
-  return false;
-#endif
-#ifndef BENCHMARK_OS_WINDOWS
-  // On Linux, the CPUfreq subsystem exposes CPU information as files on the
-  // local file system. If reading the exported files fails, then we may not be
-  // running on Linux, so we silently ignore all the read errors.
-  std::string res;
-  for (int cpu = 0; cpu < num_cpus; ++cpu) {
-    std::string governor_file =
-        StrCat("/sys/devices/system/cpu/cpu", cpu, "/cpufreq/scaling_governor");
-    if (ReadFromFile(governor_file, &res) && res != "performance") return true;
-  }
-#endif
-  return false;
-}
-
-int CountSetBitsInCPUMap(std::string Val) {
-  auto CountBits = [](std::string Part) {
-    using CPUMask = std::bitset<sizeof(std::uintptr_t) * CHAR_BIT>;
-    Part = "0x" + Part;
-    CPUMask Mask(benchmark::stoul(Part, nullptr, 16));
-    return static_cast<int>(Mask.count());
-  };
-  size_t Pos;
-  int total = 0;
-  while ((Pos = Val.find(',')) != std::string::npos) {
-    total += CountBits(Val.substr(0, Pos));
-    Val = Val.substr(Pos + 1);
-  }
-  if (!Val.empty()) {
-    total += CountBits(Val);
-  }
-  return total;
-}
-
-BENCHMARK_MAYBE_UNUSED
-std::vector<CPUInfo::CacheInfo> GetCacheSizesFromKVFS() {
-  std::vector<CPUInfo::CacheInfo> res;
-  std::string dir = "/sys/devices/system/cpu/cpu0/cache/";
-  int Idx = 0;
-  while (true) {
-    CPUInfo::CacheInfo info;
-    std::string FPath = StrCat(dir, "index", Idx++, "/");
-    std::ifstream f(StrCat(FPath, "size").c_str());
-    if (!f.is_open()) break;
-    std::string suffix;
-    f >> info.size;
-    if (f.fail())
-      PrintErrorAndDie("Failed while reading file '", FPath, "size'");
-    if (f.good()) {
-      f >> suffix;
-      if (f.bad())
-        PrintErrorAndDie(
-            "Invalid cache size format: failed to read size suffix");
-      else if (f && suffix != "K")
-        PrintErrorAndDie("Invalid cache size format: Expected bytes ", suffix);
-      else if (suffix == "K")
-        info.size *= 1024;
-    }
-    if (!ReadFromFile(StrCat(FPath, "type"), &info.type))
-      PrintErrorAndDie("Failed to read from file ", FPath, "type");
-    if (!ReadFromFile(StrCat(FPath, "level"), &info.level))
-      PrintErrorAndDie("Failed to read from file ", FPath, "level");
-    std::string map_str;
-    if (!ReadFromFile(StrCat(FPath, "shared_cpu_map"), &map_str))
-      PrintErrorAndDie("Failed to read from file ", FPath, "shared_cpu_map");
-    info.num_sharing = CountSetBitsInCPUMap(map_str);
-    res.push_back(info);
-  }
-
-  return res;
-}
-
-#ifdef BENCHMARK_OS_MACOSX
-std::vector<CPUInfo::CacheInfo> GetCacheSizesMacOSX() {
-  std::vector<CPUInfo::CacheInfo> res;
-  std::array<uint64_t, 4> CacheCounts{{0, 0, 0, 0}};
-  GetSysctl("hw.cacheconfig", &CacheCounts);
-
-  struct {
-    std::string name;
-    std::string type;
-    int level;
-    uint64_t num_sharing;
-  } Cases[] = {{"hw.l1dcachesize", "Data", 1, CacheCounts[1]},
-               {"hw.l1icachesize", "Instruction", 1, CacheCounts[1]},
-               {"hw.l2cachesize", "Unified", 2, CacheCounts[2]},
-               {"hw.l3cachesize", "Unified", 3, CacheCounts[3]}};
-  for (auto& C : Cases) {
-    int val;
-    if (!GetSysctl(C.name, &val)) continue;
-    CPUInfo::CacheInfo info;
-    info.type = C.type;
-    info.level = C.level;
-    info.size = val;
-    info.num_sharing = static_cast<int>(C.num_sharing);
-    res.push_back(std::move(info));
-  }
-  return res;
-}
-#elif defined(BENCHMARK_OS_WINDOWS)
-std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
-  std::vector<CPUInfo::CacheInfo> res;
-  DWORD buffer_size = 0;
-  using PInfo = SYSTEM_LOGICAL_PROCESSOR_INFORMATION;
-  using CInfo = CACHE_DESCRIPTOR;
-
-  using UPtr = std::unique_ptr<PInfo, decltype(&std::free)>;
-  GetLogicalProcessorInformation(nullptr, &buffer_size);
-  UPtr buff((PInfo*)malloc(buffer_size), &std::free);
-  if (!GetLogicalProcessorInformation(buff.get(), &buffer_size))
-    PrintErrorAndDie("Failed during call to GetLogicalProcessorInformation: ",
-                     GetLastError());
-
-  PInfo* it = buff.get();
-  PInfo* end = buff.get() + (buffer_size / sizeof(PInfo));
-
-  for (; it != end; ++it) {
-    if (it->Relationship != RelationCache) continue;
-    using BitSet = std::bitset<sizeof(ULONG_PTR) * CHAR_BIT>;
-    BitSet B(it->ProcessorMask);
-    // To prevent duplicates, only consider caches where CPU 0 is specified
-    if (!B.test(0)) continue;
-    CInfo* Cache = &it->Cache;
-    CPUInfo::CacheInfo C;
-    C.num_sharing = static_cast<int>(B.count());
-    C.level = Cache->Level;
-    C.size = Cache->Size;
-    switch (Cache->Type) {
-      case CacheUnified:
-        C.type = "Unified";
-        break;
-      case CacheInstruction:
-        C.type = "Instruction";
-        break;
-      case CacheData:
-        C.type = "Data";
-        break;
-      case CacheTrace:
-        C.type = "Trace";
-        break;
-      default:
-        C.type = "Unknown";
-        break;
-    }
-    res.push_back(C);
-  }
-  return res;
-}
-#elif BENCHMARK_OS_QNX
-std::vector<CPUInfo::CacheInfo> GetCacheSizesQNX() {
-  std::vector<CPUInfo::CacheInfo> res;
-  struct cacheattr_entry *cache = SYSPAGE_ENTRY(cacheattr);
-  uint32_t const elsize = SYSPAGE_ELEMENT_SIZE(cacheattr);
-  int num = SYSPAGE_ENTRY_SIZE(cacheattr) / elsize ;
-  for(int i = 0; i < num; ++i ) {
-    CPUInfo::CacheInfo info;
-    switch (cache->flags){
-      case CACHE_FLAG_INSTR :
-        info.type = "Instruction";
-        info.level = 1;
-        break;
-      case CACHE_FLAG_DATA :
-        info.type = "Data";
-        info.level = 1;
-        break;
-      case CACHE_FLAG_UNIFIED :
-        info.type = "Unified";
-        info.level = 2;
-      case CACHE_FLAG_SHARED :
-        info.type = "Shared";
-        info.level = 3;
-      default :
-        continue;
-        break;
-    }
-    info.size = cache->line_size * cache->num_lines;
-    info.num_sharing = 0;
-    res.push_back(std::move(info));
-    cache = SYSPAGE_ARRAY_ADJ_OFFSET(cacheattr, cache, elsize);
-  }
-  return res;
-}
-#endif
-
-std::vector<CPUInfo::CacheInfo> GetCacheSizes() {
-#ifdef BENCHMARK_OS_MACOSX
-  return GetCacheSizesMacOSX();
-#elif defined(BENCHMARK_OS_WINDOWS)
-  return GetCacheSizesWindows();
-#elif defined(BENCHMARK_OS_QNX)
-  return GetCacheSizesQNX();
-#else
-  return GetCacheSizesFromKVFS();
-#endif
-}
-
-std::string GetSystemName() {
-#if defined(BENCHMARK_OS_WINDOWS)
-  std::string str;
-  const unsigned COUNT = MAX_COMPUTERNAME_LENGTH+1;
-  TCHAR  hostname[COUNT] = {'\0'};
-  DWORD DWCOUNT = COUNT;
-  if (!GetComputerName(hostname, &DWCOUNT))
-    return std::string("");
-#ifndef UNICODE
-  str = std::string(hostname, DWCOUNT);
-#else
-  //Using wstring_convert, Is deprecated in C++17
-  using convert_type = std::codecvt_utf8<wchar_t>;
-  std::wstring_convert<convert_type, wchar_t> converter;
-  std::wstring wStr(hostname, DWCOUNT);
-  str = converter.to_bytes(wStr);
-#endif
-  return str;
-#else // defined(BENCHMARK_OS_WINDOWS)
-#ifndef HOST_NAME_MAX
-#ifdef BENCHMARK_HAS_SYSCTL // BSD/Mac Doesnt have HOST_NAME_MAX defined
-#define HOST_NAME_MAX 64
-#elif defined(BENCHMARK_OS_NACL)
-#define HOST_NAME_MAX 64
-#elif defined(BENCHMARK_OS_QNX)
-#define HOST_NAME_MAX 154
-#elif defined(BENCHMARK_OS_RTEMS)
-#define HOST_NAME_MAX 256
-#else
-#warning "HOST_NAME_MAX not defined. using 64"
-#define HOST_NAME_MAX 64
-#endif
-#endif // def HOST_NAME_MAX
-  char hostname[HOST_NAME_MAX];
-  int retVal = gethostname(hostname, HOST_NAME_MAX);
-  if (retVal != 0) return std::string("");
-  return std::string(hostname);
-#endif // Catch-all POSIX block.
-}
-
-int GetNumCPUs() {
-#ifdef BENCHMARK_HAS_SYSCTL
-  int NumCPU = -1;
-  if (GetSysctl("hw.ncpu", &NumCPU)) return NumCPU;
-  fprintf(stderr, "Err: %s\n", strerror(errno));
-  std::exit(EXIT_FAILURE);
-#elif defined(BENCHMARK_OS_WINDOWS)
-  SYSTEM_INFO sysinfo;
-  // Use memset as opposed to = {} to avoid GCC missing initializer false
-  // positives.
-  std::memset(&sysinfo, 0, sizeof(SYSTEM_INFO));
-  GetSystemInfo(&sysinfo);
-  return sysinfo.dwNumberOfProcessors;  // number of logical
-                                        // processors in the current
-                                        // group
-#elif defined(BENCHMARK_OS_SOLARIS)
-  // Returns -1 in case of a failure.
-  int NumCPU = sysconf(_SC_NPROCESSORS_ONLN);
-  if (NumCPU < 0) {
-    fprintf(stderr,
-            "sysconf(_SC_NPROCESSORS_ONLN) failed with error: %s\n",
-            strerror(errno));
-  }
-  return NumCPU;
-#elif defined(BENCHMARK_OS_QNX)
-  return static_cast<int>(_syspage_ptr->num_cpu);
-#else
-  int NumCPUs = 0;
-  int MaxID = -1;
-  std::ifstream f("/proc/cpuinfo");
-  if (!f.is_open()) {
-    std::cerr << "failed to open /proc/cpuinfo\n";
-    return -1;
-  }
-  const std::string Key = "processor";
-  std::string ln;
-  while (std::getline(f, ln)) {
-    if (ln.empty()) continue;
-    size_t SplitIdx = ln.find(':');
-    std::string value;
-#if defined(__s390__)
-    // s390 has another format in /proc/cpuinfo
-    // it needs to be parsed differently
-    if (SplitIdx != std::string::npos) value = ln.substr(Key.size()+1,SplitIdx-Key.size()-1);
-#else
-    if (SplitIdx != std::string::npos) value = ln.substr(SplitIdx + 1);
-#endif
-    if (ln.size() >= Key.size() && ln.compare(0, Key.size(), Key) == 0) {
-      NumCPUs++;
-      if (!value.empty()) {
-        int CurID = benchmark::stoi(value);
-        MaxID = std::max(CurID, MaxID);
-      }
-    }
-  }
-  if (f.bad()) {
-    std::cerr << "Failure reading /proc/cpuinfo\n";
-    return -1;
-  }
-  if (!f.eof()) {
-    std::cerr << "Failed to read to end of /proc/cpuinfo\n";
-    return -1;
-  }
-  f.close();
-
-  if ((MaxID + 1) != NumCPUs) {
-    fprintf(stderr,
-            "CPU ID assignments in /proc/cpuinfo seem messed up."
-            " This is usually caused by a bad BIOS.\n");
-  }
-  return NumCPUs;
-#endif
-  BENCHMARK_UNREACHABLE();
-}
-
-double GetCPUCyclesPerSecond() {
-#if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN
-  long freq;
-
-  // If the kernel is exporting the tsc frequency use that. There are issues
-  // where cpuinfo_max_freq cannot be relied on because the BIOS may be
-  // exporintg an invalid p-state (on x86) or p-states may be used to put the
-  // processor in a new mode (turbo mode). Essentially, those frequencies
-  // cannot always be relied upon. The same reasons apply to /proc/cpuinfo as
-  // well.
-  if (ReadFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)
-      // If CPU scaling is in effect, we want to use the *maximum* frequency,
-      // not whatever CPU speed some random processor happens to be using now.
-      || ReadFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq",
-                      &freq)) {
-    // The value is in kHz (as the file name suggests).  For example, on a
-    // 2GHz warpstation, the file contains the value "2000000".
-    return freq * 1000.0;
-  }
-
-  const double error_value = -1;
-  double bogo_clock = error_value;
-
-  std::ifstream f("/proc/cpuinfo");
-  if (!f.is_open()) {
-    std::cerr << "failed to open /proc/cpuinfo\n";
-    return error_value;
-  }
-
-  auto startsWithKey = [](std::string const& Value, std::string const& Key) {
-    if (Key.size() > Value.size()) return false;
-    auto Cmp = [&](char X, char Y) {
-      return std::tolower(X) == std::tolower(Y);
-    };
-    return std::equal(Key.begin(), Key.end(), Value.begin(), Cmp);
-  };
-
-  std::string ln;
-  while (std::getline(f, ln)) {
-    if (ln.empty()) continue;
-    size_t SplitIdx = ln.find(':');
-    std::string value;
-    if (SplitIdx != std::string::npos) value = ln.substr(SplitIdx + 1);
-    // When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only
-    // accept positive values. Some environments (virtual machines) report zero,
-    // which would cause infinite looping in WallTime_Init.
-    if (startsWithKey(ln, "cpu MHz")) {
-      if (!value.empty()) {
-        double cycles_per_second = benchmark::stod(value) * 1000000.0;
-        if (cycles_per_second > 0) return cycles_per_second;
-      }
-    } else if (startsWithKey(ln, "bogomips")) {
-      if (!value.empty()) {
-        bogo_clock = benchmark::stod(value) * 1000000.0;
-        if (bogo_clock < 0.0) bogo_clock = error_value;
-      }
-    }
-  }
-  if (f.bad()) {
-    std::cerr << "Failure reading /proc/cpuinfo\n";
-    return error_value;
-  }
-  if (!f.eof()) {
-    std::cerr << "Failed to read to end of /proc/cpuinfo\n";
-    return error_value;
-  }
-  f.close();
-  // If we found the bogomips clock, but nothing better, we'll use it (but
-  // we're not happy about it); otherwise, fallback to the rough estimation
-  // below.
-  if (bogo_clock >= 0.0) return bogo_clock;
-
-#elif defined BENCHMARK_HAS_SYSCTL
-  constexpr auto* FreqStr =
-#if defined(BENCHMARK_OS_FREEBSD) || defined(BENCHMARK_OS_NETBSD)
-      "machdep.tsc_freq";
-#elif defined BENCHMARK_OS_OPENBSD
-      "hw.cpuspeed";
-#else
-      "hw.cpufrequency";
-#endif
-  unsigned long long hz = 0;
-#if defined BENCHMARK_OS_OPENBSD
-  if (GetSysctl(FreqStr, &hz)) return hz * 1000000;
-#else
-  if (GetSysctl(FreqStr, &hz)) return hz;
-#endif
-  fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n",
-          FreqStr, strerror(errno));
-
-#elif defined BENCHMARK_OS_WINDOWS
-  // In NT, read MHz from the registry. If we fail to do so or we're in win9x
-  // then make a crude estimate.
-  DWORD data, data_size = sizeof(data);
-  if (IsWindowsXPOrGreater() &&
-      SUCCEEDED(
-          SHGetValueA(HKEY_LOCAL_MACHINE,
-                      "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
-                      "~MHz", nullptr, &data, &data_size)))
-    return static_cast<double>((int64_t)data *
-                               (int64_t)(1000 * 1000));  // was mhz
-#elif defined (BENCHMARK_OS_SOLARIS)
-  kstat_ctl_t *kc = kstat_open();
-  if (!kc) {
-    std::cerr << "failed to open /dev/kstat\n";
-    return -1;
-  }
-  kstat_t *ksp = kstat_lookup(kc, (char*)"cpu_info", -1, (char*)"cpu_info0");
-  if (!ksp) {
-    std::cerr << "failed to lookup in /dev/kstat\n";
-    return -1;
-  }
-  if (kstat_read(kc, ksp, NULL) < 0) {
-    std::cerr << "failed to read from /dev/kstat\n";
-    return -1;
-  }
-  kstat_named_t *knp =
-      (kstat_named_t*)kstat_data_lookup(ksp, (char*)"current_clock_Hz");
-  if (!knp) {
-    std::cerr << "failed to lookup data in /dev/kstat\n";
-    return -1;
-  }
-  if (knp->data_type != KSTAT_DATA_UINT64) {
-    std::cerr << "current_clock_Hz is of unexpected data type: "
-              << knp->data_type << "\n";
-    return -1;
-  }
-  double clock_hz = knp->value.ui64;
-  kstat_close(kc);
-  return clock_hz;
-#elif defined (BENCHMARK_OS_QNX)
-  return static_cast<double>((int64_t)(SYSPAGE_ENTRY(cpuinfo)->speed) *
-                             (int64_t)(1000 * 1000));
-#endif
-  // If we've fallen through, attempt to roughly estimate the CPU clock rate.
-  const int estimate_time_ms = 1000;
-  const auto start_ticks = cycleclock::Now();
-  SleepForMilliseconds(estimate_time_ms);
-  return static_cast<double>(cycleclock::Now() - start_ticks);
-}
-
-std::vector<double> GetLoadAvg() {
-#if (defined BENCHMARK_OS_FREEBSD || defined(BENCHMARK_OS_LINUX) || \
-    defined BENCHMARK_OS_MACOSX || defined BENCHMARK_OS_NETBSD ||  \
-    defined BENCHMARK_OS_OPENBSD) && !defined(__ANDROID__)
-  constexpr int kMaxSamples = 3;
-  std::vector<double> res(kMaxSamples, 0.0);
-  const int nelem = getloadavg(res.data(), kMaxSamples);
-  if (nelem < 1) {
-    res.clear();
-  } else {
-    res.resize(nelem);
-  }
-  return res;
-#else
-  return {};
-#endif
-}
-
-}  // end namespace
-
-const CPUInfo& CPUInfo::Get() {
-  static const CPUInfo* info = new CPUInfo();
-  return *info;
-}
-
-CPUInfo::CPUInfo()
-    : num_cpus(GetNumCPUs()),
-      cycles_per_second(GetCPUCyclesPerSecond()),
-      caches(GetCacheSizes()),
-      scaling_enabled(CpuScalingEnabled(num_cpus)),
-      load_avg(GetLoadAvg()) {}
-
-
-const SystemInfo& SystemInfo::Get() {
-  static const SystemInfo* info = new SystemInfo();
-  return *info;
-}
-
-SystemInfo::SystemInfo() : name(GetSystemName()) {}
-}  // end namespace benchmark
diff --git a/third_party/google_benchmark/test/AssemblyTests.cmake b/third_party/google_benchmark/src/test/AssemblyTests.cmake
similarity index 63%
rename from third_party/google_benchmark/test/AssemblyTests.cmake
rename to third_party/google_benchmark/src/test/AssemblyTests.cmake
index 3d07858..c43c711 100644
--- a/third_party/google_benchmark/test/AssemblyTests.cmake
+++ b/third_party/google_benchmark/src/test/AssemblyTests.cmake
@@ -1,3 +1,23 @@
+set(CLANG_SUPPORTED_VERSION "5.0.0")
+set(GCC_SUPPORTED_VERSION "5.5.0")
+
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL ${CLANG_SUPPORTED_VERSION})
+    message (WARNING
+      "Unsupported Clang version " ${CMAKE_CXX_COMPILER_VERSION}
+      ". Expected is " ${CLANG_SUPPORTED_VERSION}
+      ". Assembly tests may be broken.")
+  endif()
+elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+  if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL ${GCC_SUPPORTED_VERSION})
+    message (WARNING
+      "Unsupported GCC version " ${CMAKE_CXX_COMPILER_VERSION}
+      ". Expected is " ${GCC_SUPPORTED_VERSION}
+      ". Assembly tests may be broken.")
+  endif()
+else()
+  message (WARNING "Unsupported compiler. Assembly tests may be broken.")
+endif()
 
 include(split_list)
 
@@ -23,6 +43,7 @@
 macro(add_filecheck_test name)
   cmake_parse_arguments(ARG "" "" "CHECK_PREFIXES" ${ARGV})
   add_library(${name} OBJECT ${name}.cc)
+  target_link_libraries(${name} PRIVATE benchmark::benchmark)
   set_target_properties(${name} PROPERTIES COMPILE_FLAGS "-S ${ASM_TEST_FLAGS}")
   set(ASM_OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${name}.s")
   add_custom_target(copy_${name} ALL
diff --git a/third_party/google_benchmark/src/test/BUILD b/third_party/google_benchmark/src/test/BUILD
new file mode 100644
index 0000000..8262d08
--- /dev/null
+++ b/third_party/google_benchmark/src/test/BUILD
@@ -0,0 +1,110 @@
+load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
+
+platform(
+    name = "windows",
+    constraint_values = [
+        "@platforms//os:windows",
+    ],
+)
+
+TEST_COPTS = [
+    "-pedantic",
+    "-pedantic-errors",
+    "-std=c++11",
+    "-Wall",
+    "-Wconversion",
+    "-Wextra",
+    "-Wshadow",
+    #    "-Wshorten-64-to-32",
+    "-Wfloat-equal",
+    "-fstrict-aliasing",
+]
+
+# Some of the issues with DoNotOptimize only occur when optimization is enabled
+PER_SRC_COPTS = {
+    "donotoptimize_test.cc": ["-O3"],
+}
+
+TEST_ARGS = ["--benchmark_min_time=0.01s"]
+
+PER_SRC_TEST_ARGS = {  # Extra per-source flags; appended to TEST_ARGS in the cc_test args below.
+    "user_counters_tabular_test.cc": ["--benchmark_counters_tabular=true"],
+    "repetitions_test.cc": ["--benchmark_repetitions=3"],
+    "spec_arg_test.cc": ["--benchmark_filter=BM_NotChosen"],
+    "spec_arg_verbosity_test.cc": ["--v=42"],
+}
+
+cc_library(
+    name = "output_test_helper",
+    testonly = 1,
+    srcs = ["output_test_helper.cc"],
+    hdrs = ["output_test.h"],
+    copts = select({
+        "//:windows": [],
+        "//conditions:default": TEST_COPTS,
+    }),
+    deps = [
+        "//:benchmark",
+        "//:benchmark_internal_headers",
+    ],
+)
+
+[
+    cc_test(
+        name = test_src[:-len(".cc")],
+        size = "small",
+        srcs = [test_src],
+        args = TEST_ARGS + PER_SRC_TEST_ARGS.get(test_src, []),
+        copts = select({
+            "//:windows": [],
+            "//conditions:default": TEST_COPTS,
+        }) + PER_SRC_COPTS.get(test_src, []),
+        deps = [
+            ":output_test_helper",
+            "//:benchmark",
+            "//:benchmark_internal_headers",
+            "@com_google_googletest//:gtest",
+            "@com_google_googletest//:gtest_main",
+        ],
+        # FIXME: Add support for assembly tests to bazel.
+        # See Issue #556
+        # https://github.com/google/benchmark/issues/556
+    )
+    for test_src in glob(
+        ["*test.cc"],
+        exclude = [
+            "*_assembly_test.cc",
+            "cxx03_test.cc",
+            "link_main_test.cc",
+        ],
+    )
+]
+
+cc_test(
+    name = "cxx03_test",
+    size = "small",
+    srcs = ["cxx03_test.cc"],
+    copts = TEST_COPTS + ["-std=c++03"],
+    target_compatible_with = select({
+        "//:windows": ["@platforms//:incompatible"],
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":output_test_helper",
+        "//:benchmark",
+        "//:benchmark_internal_headers",
+        "@com_google_googletest//:gtest",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_test(
+    name = "link_main_test",
+    size = "small",
+    srcs = ["link_main_test.cc"],
+    copts = select({
+        "//:windows": [],
+        "//conditions:default": TEST_COPTS,
+    }),
+    deps = ["//:benchmark_main"],
+)
diff --git a/third_party/google_benchmark/test/CMakeLists.txt b/third_party/google_benchmark/src/test/CMakeLists.txt
similarity index 74%
rename from third_party/google_benchmark/test/CMakeLists.txt
rename to third_party/google_benchmark/src/test/CMakeLists.txt
index ddcb1a1..78d6d51 100644
--- a/third_party/google_benchmark/test/CMakeLists.txt
+++ b/third_party/google_benchmark/src/test/CMakeLists.txt
@@ -1,5 +1,7 @@
 # Enable the tests
 
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+
 find_package(Threads REQUIRED)
 include(CheckCXXCompilerFlag)
 
@@ -22,6 +24,10 @@
   endforeach()
 endif()
 
+if (NOT BUILD_SHARED_LIBS)
+  add_definitions(-DBENCHMARK_STATIC_DEFINE)
+endif()
+
 check_cxx_compiler_flag(-O3 BENCHMARK_HAS_O3_FLAG)
 set(BENCHMARK_O3_FLAG "")
 if (BENCHMARK_HAS_O3_FLAG)
@@ -35,33 +41,52 @@
 endif()
 
 add_library(output_test_helper STATIC output_test_helper.cc output_test.h)
+target_link_libraries(output_test_helper PRIVATE benchmark::benchmark)
 
 macro(compile_benchmark_test name)
   add_executable(${name} "${name}.cc")
-  target_link_libraries(${name} benchmark ${CMAKE_THREAD_LIBS_INIT})
+  target_link_libraries(${name} benchmark::benchmark ${CMAKE_THREAD_LIBS_INIT})
+  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "NVHPC")
+  target_compile_options( ${name} PRIVATE --diag_suppress partial_override )
+  endif()
 endmacro(compile_benchmark_test)
 
 macro(compile_benchmark_test_with_main name)
   add_executable(${name} "${name}.cc")
-  target_link_libraries(${name} benchmark_main)
+  target_link_libraries(${name} benchmark::benchmark_main)
 endmacro(compile_benchmark_test_with_main)
 
 macro(compile_output_test name)
   add_executable(${name} "${name}.cc" output_test.h)
-  target_link_libraries(${name} output_test_helper benchmark
+  target_link_libraries(${name} output_test_helper benchmark::benchmark_main
           ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 endmacro(compile_output_test)
 
 # Demonstration executable
 compile_benchmark_test(benchmark_test)
-add_test(NAME benchmark COMMAND benchmark_test --benchmark_min_time=0.01)
+add_test(NAME benchmark COMMAND benchmark_test --benchmark_min_time=0.01s)
+
+compile_benchmark_test(spec_arg_test)
+add_test(NAME spec_arg COMMAND spec_arg_test --benchmark_filter=BM_NotChosen)
+
+compile_benchmark_test(spec_arg_verbosity_test)
+add_test(NAME spec_arg_verbosity COMMAND spec_arg_verbosity_test --v=42)
+
+compile_benchmark_test(benchmark_setup_teardown_test)
+add_test(NAME benchmark_setup_teardown COMMAND benchmark_setup_teardown_test)
 
 compile_benchmark_test(filter_test)
 macro(add_filter_test name filter expect)
-  add_test(NAME ${name} COMMAND filter_test --benchmark_min_time=0.01 --benchmark_filter=${filter} ${expect})
+  add_test(NAME ${name} COMMAND filter_test --benchmark_min_time=0.01s --benchmark_filter=${filter} ${expect})
   add_test(NAME ${name}_list_only COMMAND filter_test --benchmark_list_tests --benchmark_filter=${filter} ${expect})
 endmacro(add_filter_test)
 
+compile_benchmark_test(benchmark_min_time_flag_time_test)
+add_test(NAME min_time_flag_time COMMAND benchmark_min_time_flag_time_test)
+
+compile_benchmark_test(benchmark_min_time_flag_iters_test)
+add_test(NAME min_time_flag_iters COMMAND benchmark_min_time_flag_iters_test)
+
 add_filter_test(filter_simple "Foo" 3)
 add_filter_test(filter_simple_negative "-Foo" 2)
 add_filter_test(filter_suffix "BM_.*" 4)
@@ -82,16 +107,19 @@
 add_filter_test(filter_regex_end_negative "-.*Ba$" 4)
 
 compile_benchmark_test(options_test)
-add_test(NAME options_benchmarks COMMAND options_test --benchmark_min_time=0.01)
+add_test(NAME options_benchmarks COMMAND options_test --benchmark_min_time=0.01s)
 
 compile_benchmark_test(basic_test)
-add_test(NAME basic_benchmark COMMAND basic_test --benchmark_min_time=0.01)
+add_test(NAME basic_benchmark COMMAND basic_test --benchmark_min_time=0.01s)
+
+compile_output_test(repetitions_test)
+add_test(NAME repetitions_benchmark COMMAND repetitions_test --benchmark_min_time=0.01s --benchmark_repetitions=3)
 
 compile_benchmark_test(diagnostics_test)
-add_test(NAME diagnostics_test COMMAND diagnostics_test --benchmark_min_time=0.01)
+add_test(NAME diagnostics_test COMMAND diagnostics_test --benchmark_min_time=0.01s)
 
 compile_benchmark_test(skip_with_error_test)
-add_test(NAME skip_with_error_test COMMAND skip_with_error_test --benchmark_min_time=0.01)
+add_test(NAME skip_with_error_test COMMAND skip_with_error_test --benchmark_min_time=0.01s)
 
 compile_benchmark_test(donotoptimize_test)
 # Some of the issues with DoNotOptimize only occur when optimization is enabled
@@ -99,74 +127,87 @@
 if (BENCHMARK_HAS_O3_FLAG)
   set_target_properties(donotoptimize_test PROPERTIES COMPILE_FLAGS "-O3")
 endif()
-add_test(NAME donotoptimize_test COMMAND donotoptimize_test --benchmark_min_time=0.01)
+add_test(NAME donotoptimize_test COMMAND donotoptimize_test --benchmark_min_time=0.01s)
 
 compile_benchmark_test(fixture_test)
-add_test(NAME fixture_test COMMAND fixture_test --benchmark_min_time=0.01)
+add_test(NAME fixture_test COMMAND fixture_test --benchmark_min_time=0.01s)
 
 compile_benchmark_test(register_benchmark_test)
-add_test(NAME register_benchmark_test COMMAND register_benchmark_test --benchmark_min_time=0.01)
+add_test(NAME register_benchmark_test COMMAND register_benchmark_test --benchmark_min_time=0.01s)
 
 compile_benchmark_test(map_test)
-add_test(NAME map_test COMMAND map_test --benchmark_min_time=0.01)
+add_test(NAME map_test COMMAND map_test --benchmark_min_time=0.01s)
 
 compile_benchmark_test(multiple_ranges_test)
-add_test(NAME multiple_ranges_test COMMAND multiple_ranges_test --benchmark_min_time=0.01)
+add_test(NAME multiple_ranges_test COMMAND multiple_ranges_test --benchmark_min_time=0.01s)
+
+compile_benchmark_test(args_product_test)
+add_test(NAME args_product_test COMMAND args_product_test --benchmark_min_time=0.01s)
 
 compile_benchmark_test_with_main(link_main_test)
-add_test(NAME link_main_test COMMAND link_main_test --benchmark_min_time=0.01)
+add_test(NAME link_main_test COMMAND link_main_test --benchmark_min_time=0.01s)
 
 compile_output_test(reporter_output_test)
-add_test(NAME reporter_output_test COMMAND reporter_output_test --benchmark_min_time=0.01)
+add_test(NAME reporter_output_test COMMAND reporter_output_test --benchmark_min_time=0.01s)
 
 compile_output_test(templated_fixture_test)
-add_test(NAME templated_fixture_test COMMAND templated_fixture_test --benchmark_min_time=0.01)
+add_test(NAME templated_fixture_test COMMAND templated_fixture_test --benchmark_min_time=0.01s)
 
 compile_output_test(user_counters_test)
-add_test(NAME user_counters_test COMMAND user_counters_test --benchmark_min_time=0.01)
+add_test(NAME user_counters_test COMMAND user_counters_test --benchmark_min_time=0.01s)
+
+compile_output_test(perf_counters_test)
+add_test(NAME perf_counters_test COMMAND perf_counters_test --benchmark_min_time=0.01s --benchmark_perf_counters=CYCLES,BRANCHES)
 
 compile_output_test(internal_threading_test)
-add_test(NAME internal_threading_test COMMAND internal_threading_test --benchmark_min_time=0.01)
+add_test(NAME internal_threading_test COMMAND internal_threading_test --benchmark_min_time=0.01s)
 
 compile_output_test(report_aggregates_only_test)
-add_test(NAME report_aggregates_only_test COMMAND report_aggregates_only_test --benchmark_min_time=0.01)
+add_test(NAME report_aggregates_only_test COMMAND report_aggregates_only_test --benchmark_min_time=0.01s)
 
 compile_output_test(display_aggregates_only_test)
-add_test(NAME display_aggregates_only_test COMMAND display_aggregates_only_test --benchmark_min_time=0.01)
+add_test(NAME display_aggregates_only_test COMMAND display_aggregates_only_test --benchmark_min_time=0.01s)
 
 compile_output_test(user_counters_tabular_test)
-add_test(NAME user_counters_tabular_test COMMAND user_counters_tabular_test --benchmark_counters_tabular=true --benchmark_min_time=0.01)
+add_test(NAME user_counters_tabular_test COMMAND user_counters_tabular_test --benchmark_counters_tabular=true --benchmark_min_time=0.01s)
 
 compile_output_test(user_counters_thousands_test)
-add_test(NAME user_counters_thousands_test COMMAND user_counters_thousands_test --benchmark_min_time=0.01)
+add_test(NAME user_counters_thousands_test COMMAND user_counters_thousands_test --benchmark_min_time=0.01s)
 
 compile_output_test(memory_manager_test)
-add_test(NAME memory_manager_test COMMAND memory_manager_test --benchmark_min_time=0.01)
+add_test(NAME memory_manager_test COMMAND memory_manager_test --benchmark_min_time=0.01s)
 
-check_cxx_compiler_flag(-std=c++03 BENCHMARK_HAS_CXX03_FLAG)
-if (BENCHMARK_HAS_CXX03_FLAG)
+# MSVC does not allow setting the language standard to C++98/03.
+if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
   compile_benchmark_test(cxx03_test)
   set_target_properties(cxx03_test
       PROPERTIES
-      COMPILE_FLAGS "-std=c++03")
+      CXX_STANDARD 98
+      CXX_STANDARD_REQUIRED YES)
   # libstdc++ provides different definitions within <map> between dialects. When
   # LTO is enabled and -Werror is specified GCC diagnoses this ODR violation
   # causing the test to fail to compile. To prevent this we explicitly disable
   # the warning.
   check_cxx_compiler_flag(-Wno-odr BENCHMARK_HAS_WNO_ODR)
-  if (BENCHMARK_ENABLE_LTO AND BENCHMARK_HAS_WNO_ODR)
-    set_target_properties(cxx03_test
-        PROPERTIES
-        LINK_FLAGS "-Wno-odr")
+  check_cxx_compiler_flag(-Wno-lto-type-mismatch BENCHMARK_HAS_WNO_LTO_TYPE_MISMATCH)
+  # Cannot set_target_properties multiple times here because the warnings will
+  # be overwritten on each call
+  set (DISABLE_LTO_WARNINGS "")
+  if (BENCHMARK_HAS_WNO_ODR)
+    set(DISABLE_LTO_WARNINGS "${DISABLE_LTO_WARNINGS} -Wno-odr")
   endif()
-  add_test(NAME cxx03 COMMAND cxx03_test --benchmark_min_time=0.01)
+  if (BENCHMARK_HAS_WNO_LTO_TYPE_MISMATCH)
+    set(DISABLE_LTO_WARNINGS "${DISABLE_LTO_WARNINGS} -Wno-lto-type-mismatch")
+  endif()
+  set_target_properties(cxx03_test PROPERTIES LINK_FLAGS "${DISABLE_LTO_WARNINGS}")
+  add_test(NAME cxx03 COMMAND cxx03_test --benchmark_min_time=0.01s)
 endif()
 
 # Attempt to work around flaky test failures when running on Appveyor servers.
 if (DEFINED ENV{APPVEYOR})
-  set(COMPLEXITY_MIN_TIME "0.5")
+  set(COMPLEXITY_MIN_TIME "0.5s")
 else()
-  set(COMPLEXITY_MIN_TIME "0.01")
+  set(COMPLEXITY_MIN_TIME "0.01s")
 endif()
 compile_output_test(complexity_test)
 add_test(NAME complexity_benchmark COMMAND complexity_test --benchmark_min_time=${COMPLEXITY_MIN_TIME})
@@ -178,7 +219,7 @@
 if (BENCHMARK_ENABLE_GTEST_TESTS)
   macro(compile_gtest name)
     add_executable(${name} "${name}.cc")
-    target_link_libraries(${name} benchmark
+    target_link_libraries(${name} benchmark::benchmark
         gmock_main ${CMAKE_THREAD_LIBS_INIT})
   endmacro(compile_gtest)
 
@@ -189,9 +230,13 @@
 
   add_gtest(benchmark_gtest)
   add_gtest(benchmark_name_gtest)
+  add_gtest(benchmark_random_interleaving_gtest)
   add_gtest(commandlineflags_gtest)
   add_gtest(statistics_gtest)
   add_gtest(string_util_gtest)
+  add_gtest(perf_counters_gtest)
+  add_gtest(time_unit_gtest)
+  add_gtest(min_time_parse_gtest)
 endif(BENCHMARK_ENABLE_GTEST_TESTS)
 
 ###############################################################################
diff --git a/third_party/google_benchmark/src/test/args_product_test.cc b/third_party/google_benchmark/src/test/args_product_test.cc
new file mode 100644
index 0000000..63b8b71
--- /dev/null
+++ b/third_party/google_benchmark/src/test/args_product_test.cc
@@ -0,0 +1,77 @@
+#include <cassert>
+#include <iostream>
+#include <set>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+
+class ArgsProductFixture : public ::benchmark::Fixture {
+ public:
+  ArgsProductFixture()
+      : expectedValues({{0, 100, 2000, 30000},
+                        {1, 15, 3, 8},
+                        {1, 15, 3, 9},
+                        {1, 15, 7, 8},
+                        {1, 15, 7, 9},
+                        {1, 15, 10, 8},
+                        {1, 15, 10, 9},
+                        {2, 15, 3, 8},
+                        {2, 15, 3, 9},
+                        {2, 15, 7, 8},
+                        {2, 15, 7, 9},
+                        {2, 15, 10, 8},
+                        {2, 15, 10, 9},
+                        {4, 5, 6, 11}}) {}
+
+  void SetUp(const ::benchmark::State& state) override {
+    std::vector<int64_t> ranges = {state.range(0), state.range(1),
+                                   state.range(2), state.range(3)};
+
+    assert(expectedValues.find(ranges) != expectedValues.end());
+
+    actualValues.insert(ranges);
+  }
+
+  // NOTE: This is not TearDown as we want to check after _all_ runs are
+  // complete.
+  ~ArgsProductFixture() override {
+    if (actualValues != expectedValues) {
+      std::cout << "EXPECTED\n";
+      for (const auto& v : expectedValues) {
+        std::cout << "{";
+        for (int64_t iv : v) {
+          std::cout << iv << ", ";
+        }
+        std::cout << "}\n";
+      }
+      std::cout << "ACTUAL\n";
+      for (const auto& v : actualValues) {
+        std::cout << "{";
+        for (int64_t iv : v) {
+          std::cout << iv << ", ";
+        }
+        std::cout << "}\n";
+      }
+    }
+  }
+
+  std::set<std::vector<int64_t>> expectedValues;
+  std::set<std::vector<int64_t>> actualValues;
+};
+
+BENCHMARK_DEFINE_F(ArgsProductFixture, Empty)(benchmark::State& state) {
+  for (auto _ : state) {
+    int64_t product =
+        state.range(0) * state.range(1) * state.range(2) * state.range(3);
+    for (int64_t x = 0; x < product; x++) {
+      benchmark::DoNotOptimize(x);
+    }
+  }
+}
+
+BENCHMARK_REGISTER_F(ArgsProductFixture, Empty)
+    ->Args({0, 100, 2000, 30000})
+    ->ArgsProduct({{1, 2}, {15}, {3, 7, 10}, {8, 9}})
+    ->Args({4, 5, 6, 11});
+
+BENCHMARK_MAIN();
diff --git a/third_party/google_benchmark/src/test/basic_test.cc b/third_party/google_benchmark/src/test/basic_test.cc
new file mode 100644
index 0000000..cba1b0f
--- /dev/null
+++ b/third_party/google_benchmark/src/test/basic_test.cc
@@ -0,0 +1,180 @@
+
+#include "benchmark/benchmark.h"
+
+#define BASIC_BENCHMARK_TEST(x) BENCHMARK(x)->Arg(8)->Arg(512)->Arg(8192)
+
+void BM_empty(benchmark::State& state) {
+  for (auto _ : state) {
+    auto iterations = state.iterations();
+    benchmark::DoNotOptimize(iterations);
+  }
+}
+BENCHMARK(BM_empty);
+BENCHMARK(BM_empty)->ThreadPerCpu();
+
+void BM_spin_empty(benchmark::State& state) {
+  for (auto _ : state) {
+    for (auto x = 0; x < state.range(0); ++x) {
+      benchmark::DoNotOptimize(x);
+    }
+  }
+}
+BASIC_BENCHMARK_TEST(BM_spin_empty);
+BASIC_BENCHMARK_TEST(BM_spin_empty)->ThreadPerCpu();
+
+void BM_spin_pause_before(benchmark::State& state) {
+  for (auto i = 0; i < state.range(0); ++i) {
+    benchmark::DoNotOptimize(i);
+  }
+  for (auto _ : state) {
+    for (auto i = 0; i < state.range(0); ++i) {
+      benchmark::DoNotOptimize(i);
+    }
+  }
+}
+BASIC_BENCHMARK_TEST(BM_spin_pause_before);
+BASIC_BENCHMARK_TEST(BM_spin_pause_before)->ThreadPerCpu();
+
+void BM_spin_pause_during(benchmark::State& state) {
+  for (auto _ : state) {
+    state.PauseTiming();
+    for (auto i = 0; i < state.range(0); ++i) {
+      benchmark::DoNotOptimize(i);
+    }
+    state.ResumeTiming();
+    for (auto i = 0; i < state.range(0); ++i) {
+      benchmark::DoNotOptimize(i);
+    }
+  }
+}
+BASIC_BENCHMARK_TEST(BM_spin_pause_during);
+BASIC_BENCHMARK_TEST(BM_spin_pause_during)->ThreadPerCpu();
+
+void BM_pause_during(benchmark::State& state) {
+  for (auto _ : state) {
+    state.PauseTiming();
+    state.ResumeTiming();
+  }
+}
+BENCHMARK(BM_pause_during);
+BENCHMARK(BM_pause_during)->ThreadPerCpu();
+BENCHMARK(BM_pause_during)->UseRealTime();
+BENCHMARK(BM_pause_during)->UseRealTime()->ThreadPerCpu();
+
+void BM_spin_pause_after(benchmark::State& state) {
+  for (auto _ : state) {
+    for (auto i = 0; i < state.range(0); ++i) {
+      benchmark::DoNotOptimize(i);
+    }
+  }
+  for (auto i = 0; i < state.range(0); ++i) {
+    benchmark::DoNotOptimize(i);
+  }
+}
+BASIC_BENCHMARK_TEST(BM_spin_pause_after);
+BASIC_BENCHMARK_TEST(BM_spin_pause_after)->ThreadPerCpu();
+
+void BM_spin_pause_before_and_after(benchmark::State& state) {
+  for (auto i = 0; i < state.range(0); ++i) {
+    benchmark::DoNotOptimize(i);
+  }
+  for (auto _ : state) {
+    for (auto i = 0; i < state.range(0); ++i) {
+      benchmark::DoNotOptimize(i);
+    }
+  }
+  for (auto i = 0; i < state.range(0); ++i) {
+    benchmark::DoNotOptimize(i);
+  }
+}
+BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after);
+BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after)->ThreadPerCpu();
+
+void BM_empty_stop_start(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_empty_stop_start);
+BENCHMARK(BM_empty_stop_start)->ThreadPerCpu();
+
+void BM_KeepRunning(benchmark::State& state) {
+  benchmark::IterationCount iter_count = 0;
+  assert(iter_count == state.iterations());
+  while (state.KeepRunning()) {
+    ++iter_count;
+  }
+  assert(iter_count == state.iterations());
+}
+BENCHMARK(BM_KeepRunning);
+
+void BM_KeepRunningBatch(benchmark::State& state) {
+  // Choose a batch size >1000 to skip the typical runs with iteration
+  // targets of 10, 100 and 1000.  If these are not actually skipped the
+  // bug would be detectable as consecutive runs with the same iteration
+  // count.  Below we assert that this does not happen.
+  const benchmark::IterationCount batch_size = 1009;
+
+  static benchmark::IterationCount prior_iter_count = 0;
+  benchmark::IterationCount iter_count = 0;
+  while (state.KeepRunningBatch(batch_size)) {
+    iter_count += batch_size;
+  }
+  assert(state.iterations() == iter_count);
+
+  // Verify that the iteration count always increases across runs (see
+  // comment above).
+  assert(iter_count == batch_size            // max_iterations == 1
+         || iter_count > prior_iter_count);  // max_iterations > batch_size
+  prior_iter_count = iter_count;
+}
+// Register with a fixed repetition count to establish the invariant that
+// the iteration count should always change across runs.  This overrides
+// the --benchmark_repetitions command line flag, which would otherwise
+// cause this test to fail if set > 1.
+BENCHMARK(BM_KeepRunningBatch)->Repetitions(1);
+
+void BM_RangedFor(benchmark::State& state) {
+  benchmark::IterationCount iter_count = 0;
+  for (auto _ : state) {
+    ++iter_count;
+  }
+  assert(iter_count == state.max_iterations);
+}
+BENCHMARK(BM_RangedFor);
+
+#ifdef BENCHMARK_HAS_CXX11
+template <typename T>
+void BM_OneTemplateFunc(benchmark::State& state) {
+  auto arg = state.range(0);
+  T sum = 0;
+  for (auto _ : state) {
+    sum += static_cast<T>(arg);
+  }
+}
+BENCHMARK(BM_OneTemplateFunc<int>)->Arg(1);
+BENCHMARK(BM_OneTemplateFunc<double>)->Arg(1);
+
+template <typename A, typename B>
+void BM_TwoTemplateFunc(benchmark::State& state) {
+  auto arg = state.range(0);
+  A sum = 0;
+  B prod = 1;
+  for (auto _ : state) {
+    sum += static_cast<A>(arg);
+    prod *= static_cast<B>(arg);
+  }
+}
+BENCHMARK(BM_TwoTemplateFunc<int, double>)->Arg(1);
+BENCHMARK(BM_TwoTemplateFunc<double, int>)->Arg(1);
+
+#endif  // BENCHMARK_HAS_CXX11
+
+// Ensure that StateIterator provides all the necessary typedefs required to
+// instantiate std::iterator_traits.
+static_assert(
+    std::is_same<typename std::iterator_traits<
+                     benchmark::State::StateIterator>::value_type,
+                 typename benchmark::State::StateIterator::value_type>::value,
+    "");
+
+BENCHMARK_MAIN();
diff --git a/third_party/google_benchmark/test/benchmark_gtest.cc b/third_party/google_benchmark/src/test/benchmark_gtest.cc
similarity index 73%
rename from third_party/google_benchmark/test/benchmark_gtest.cc
rename to third_party/google_benchmark/src/test/benchmark_gtest.cc
index 9557b20..2c9e555 100644
--- a/third_party/google_benchmark/test/benchmark_gtest.cc
+++ b/third_party/google_benchmark/src/test/benchmark_gtest.cc
@@ -1,11 +1,15 @@
+#include <map>
+#include <string>
 #include <vector>
 
 #include "../src/benchmark_register.h"
+#include "benchmark/benchmark.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
 namespace benchmark {
 namespace internal {
+
 namespace {
 
 TEST(AddRangeTest, Simple) {
@@ -34,8 +38,9 @@
 
 TEST(AddRangeTest, FullRange8) {
   std::vector<int8_t> dst;
-  AddRange(&dst, int8_t{1}, std::numeric_limits<int8_t>::max(), 8);
-  EXPECT_THAT(dst, testing::ElementsAre(1, 8, 64, 127));
+  AddRange(&dst, int8_t{1}, std::numeric_limits<int8_t>::max(), int8_t{8});
+  EXPECT_THAT(
+      dst, testing::ElementsAre(int8_t{1}, int8_t{8}, int8_t{64}, int8_t{127}));
 }
 
 TEST(AddRangeTest, FullRange64) {
@@ -90,6 +95,12 @@
   EXPECT_THAT(dst, testing::ElementsAre(0));
 }
 
+TEST(AddRangeTest, ZeroStartingRange) {
+  std::vector<int> dst;
+  AddRange(&dst, 0, 2, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(0, 1, 2));
+}
+
 TEST(AddRangeTest, NegativeRange64) {
   std::vector<int64_t> dst;
   AddRange<int64_t>(&dst, -4, 4, 2);
@@ -119,8 +130,38 @@
 
 TEST(AddRangeTest, Simple8) {
   std::vector<int8_t> dst;
-  AddRange<int8_t>(&dst, 1, 8, 2);
-  EXPECT_THAT(dst, testing::ElementsAre(1, 2, 4, 8));
+  AddRange<int8_t>(&dst, int8_t{1}, int8_t{8}, int8_t{2});
+  EXPECT_THAT(dst,
+              testing::ElementsAre(int8_t{1}, int8_t{2}, int8_t{4}, int8_t{8}));
+}
+
+TEST(AddCustomContext, Simple) {
+  std::map<std::string, std::string> *&global_context = GetGlobalContext();
+  EXPECT_THAT(global_context, nullptr);
+  // The map is dereferenced below, so adding entries is expected to create it.
+  AddCustomContext("foo", "bar");
+  AddCustomContext("baz", "qux");
+
+  EXPECT_THAT(*global_context,
+              testing::UnorderedElementsAre(testing::Pair("foo", "bar"),
+                                            testing::Pair("baz", "qux")));
+  // Free the map and reset the pointer, mirroring the nullptr check on entry.
+  delete global_context;
+  global_context = nullptr;
+}
+
+TEST(AddCustomContext, DuplicateKey) {
+  std::map<std::string, std::string> *&global_context = GetGlobalContext();
+  EXPECT_THAT(global_context, nullptr);
+
+  AddCustomContext("foo", "bar");
+  AddCustomContext("foo", "qux");  // Same key again; "bar" is expected to win.
+
+  EXPECT_THAT(*global_context,
+              testing::UnorderedElementsAre(testing::Pair("foo", "bar")));
+  // Free the map and reset the pointer, mirroring the nullptr check on entry.
+  delete global_context;
+  global_context = nullptr;
 }
 
 }  // namespace
diff --git a/third_party/google_benchmark/src/test/benchmark_min_time_flag_iters_test.cc b/third_party/google_benchmark/src/test/benchmark_min_time_flag_iters_test.cc
new file mode 100644
index 0000000..eb9414a
--- /dev/null
+++ b/third_party/google_benchmark/src/test/benchmark_min_time_flag_iters_test.cc
@@ -0,0 +1,66 @@
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+
+// Tests that we can specify the number of iterations with
+// --benchmark_min_time=<NUM>x.
+namespace {
+
+class TestReporter : public benchmark::ConsoleReporter {
+ public:
+  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE {
+    return ConsoleReporter::ReportContext(context);
+  };
+
+  virtual void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
+    assert(report.size() == 1);
+    iter_nums_.push_back(report[0].iterations);
+    ConsoleReporter::ReportRuns(report);
+  };
+
+  TestReporter() {}
+
+  virtual ~TestReporter() {}
+
+  const std::vector<benchmark::IterationCount>& GetIters() const {
+    return iter_nums_;
+  }
+
+ private:
+  std::vector<benchmark::IterationCount> iter_nums_;
+};
+
+}  // end namespace
+
+static void BM_MyBench(benchmark::State& state) {
+  for (auto s : state) {
+  }
+}
+BENCHMARK(BM_MyBench);
+
+int main(int argc, char** argv) {
+  // Copy argv and append --benchmark_min_time=4x (an explicit iteration count).
+  int fake_argc = argc + 1;
+  const char** fake_argv = new const char*[fake_argc];
+  for (int i = 0; i < argc; ++i) fake_argv[i] = argv[i];
+  fake_argv[argc] = "--benchmark_min_time=4x";
+
+  benchmark::Initialize(&fake_argc, const_cast<char**>(fake_argv));
+
+  TestReporter test_reporter;
+  const size_t returned_count =
+      benchmark::RunSpecifiedBenchmarks(&test_reporter, "BM_MyBench");
+  assert(returned_count == 1);
+
+  // Check the executed iters; GetIters() returns a const&, so bind, don't copy.
+  const std::vector<benchmark::IterationCount>& iters =
+      test_reporter.GetIters();
+  assert(!iters.empty() && iters[0] == 4);
+  delete[] fake_argv;
+  return 0;
+}
diff --git a/third_party/google_benchmark/src/test/benchmark_min_time_flag_time_test.cc b/third_party/google_benchmark/src/test/benchmark_min_time_flag_time_test.cc
new file mode 100644
index 0000000..b172ccc
--- /dev/null
+++ b/third_party/google_benchmark/src/test/benchmark_min_time_flag_time_test.cc
@@ -0,0 +1,90 @@
+#include <cassert>
+#include <climits>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+
+// Tests that we can specify the min time with
+// --benchmark_min_time=<NUM> (no suffix needed) OR
+// --benchmark_min_time=<NUM>s
+namespace {
+
+// This is from benchmark.h
+typedef int64_t IterationCount;
+
+class TestReporter : public benchmark::ConsoleReporter {
+ public:
+  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE {
+    return ConsoleReporter::ReportContext(context);
+  };
+
+  virtual void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
+    assert(report.size() == 1);
+    ConsoleReporter::ReportRuns(report);
+  };
+
+  virtual void ReportRunsConfig(double min_time, bool /* has_explicit_iters */,
+                                IterationCount /* iters */) BENCHMARK_OVERRIDE {
+    min_times_.push_back(min_time);
+  }
+
+  TestReporter() {}
+
+  virtual ~TestReporter() {}
+
+  const std::vector<double>& GetMinTimes() const { return min_times_; }
+
+ private:
+  std::vector<double> min_times_;
+};
+
+bool AlmostEqual(double a, double b) {
+  return std::fabs(a - b) < std::numeric_limits<double>::epsilon();
+}
+
+void DoTestHelper(int* argc, const char** argv, double expected) {
+  benchmark::Initialize(argc, const_cast<char**>(argv));
+
+  TestReporter test_reporter;
+  const size_t returned_count =
+      benchmark::RunSpecifiedBenchmarks(&test_reporter, "BM_MyBench");
+  assert(returned_count == 1);
+
+  // Check the min_time
+  const std::vector<double>& min_times = test_reporter.GetMinTimes();
+  assert(!min_times.empty() && AlmostEqual(min_times[0], expected));
+}
+
+}  // end namespace
+
+static void BM_MyBench(benchmark::State& state) {
+  for (auto s : state) {
+  }
+}
+BENCHMARK(BM_MyBench);
+
+int main(int argc, char** argv) {
+  // Make a fake argv and append the new --benchmark_min_time=<foo> to it.
+  int fake_argc = argc + 1;
+  const char** fake_argv = new const char*[fake_argc];
+  for (int i = 0; i < argc; ++i) fake_argv[i] = argv[i];
+
+  const char* no_suffix = "--benchmark_min_time=4";
+  const char* with_suffix = "--benchmark_min_time=4.0s";
+  double expected = 4.0;
+
+  fake_argv[argc] = no_suffix;
+  DoTestHelper(&fake_argc, fake_argv, expected);
+
+  // Initialize() removes parsed flags and decrements fake_argc; restore it.
+  fake_argc = argc + 1;
+  fake_argv[argc] = with_suffix;
+  DoTestHelper(&fake_argc, fake_argv, expected);
+  delete[] fake_argv;
+  return 0;
+}
diff --git a/third_party/google_benchmark/test/benchmark_name_gtest.cc b/third_party/google_benchmark/src/test/benchmark_name_gtest.cc
similarity index 88%
rename from third_party/google_benchmark/test/benchmark_name_gtest.cc
rename to third_party/google_benchmark/src/test/benchmark_name_gtest.cc
index afb401c..0a6746d 100644
--- a/third_party/google_benchmark/test/benchmark_name_gtest.cc
+++ b/third_party/google_benchmark/src/test/benchmark_name_gtest.cc
@@ -32,6 +32,14 @@
   EXPECT_EQ(name.str(), "function_name/some_args:3/4/min_time:3.4s");
 }
 
+TEST(BenchmarkNameTest, MinWarmUpTime) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.args = "some_args:3/4";
+  name.min_warmup_time = "min_warmup_time:3.5s";
+  EXPECT_EQ(name.str(), "function_name/some_args:3/4/min_warmup_time:3.5s");
+}
+
 TEST(BenchmarkNameTest, Iterations) {
   auto name = BenchmarkName();
   name.function_name = "function_name";
diff --git a/third_party/google_benchmark/src/test/benchmark_random_interleaving_gtest.cc b/third_party/google_benchmark/src/test/benchmark_random_interleaving_gtest.cc
new file mode 100644
index 0000000..7f20867
--- /dev/null
+++ b/third_party/google_benchmark/src/test/benchmark_random_interleaving_gtest.cc
@@ -0,0 +1,126 @@
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "../src/commandlineflags.h"
+#include "../src/string_util.h"
+#include "benchmark/benchmark.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace benchmark {
+
+BM_DECLARE_bool(benchmark_enable_random_interleaving);
+BM_DECLARE_string(benchmark_filter);
+BM_DECLARE_int32(benchmark_repetitions);
+
+namespace internal {
+namespace {
+
+class EventQueue : public std::queue<std::string> {
+ public:
+  void Put(const std::string& event) { emplace(event); }
+  // Discards every queued event.
+  void Clear() {
+    while (size() > 0) {
+      pop();
+    }
+  }
+  // Removes and returns the oldest event; must not be called when empty.
+  std::string Get() {
+    std::string oldest = front();
+    pop();
+    return oldest;
+  }
+};
+
+EventQueue* queue = new EventQueue();
+// Reporter that returns true for the context and ignores every run report.
+class NullReporter : public BenchmarkReporter {
+ public:
+  bool ReportContext(const Context& /*context*/) override { return true; }
+  void ReportRuns(const std::vector<Run>& /* report */) override {}
+};
+
+class BenchmarkTest : public testing::Test {
+ public:
+  // Hooks that append a "Setup" / "Teardown" event to the shared queue.
+  static void SetupHook(int /* num_threads */) { queue->Put("Setup"); }
+  static void TeardownHook(int /* num_threads */) { queue->Put("Teardown"); }
+
+  // Runs benchmarks matching `pattern` (output discarded), then queues "DONE".
+  void Execute(const std::string& pattern) {
+    queue->Clear();
+    FLAGS_benchmark_filter = pattern;
+    // Stack-allocated reporter: no std::unique_ptr, so no <memory> needed.
+    NullReporter reporter;
+    RunSpecifiedBenchmarks(&reporter);
+    queue->Put("DONE");  // End marker
+  }
+};
+
+void BM_Match1(benchmark::State& state) {
+  const int64_t arg = state.range(0);
+  // The timed loop is empty; one event is queued per call, after the loop.
+  for (auto _ : state) {
+  }
+  queue->Put(StrFormat("BM_Match1/%d", static_cast<int>(arg)));
+}
+BENCHMARK(BM_Match1)
+    ->Iterations(100)
+    ->Arg(1)
+    ->Arg(2)
+    ->Arg(3)
+    ->Range(10, 80)
+    ->Args({90})
+    ->Args({100});
+
+TEST_F(BenchmarkTest, Match1) {
+  Execute("BM_Match1");  // Expect one event per registered argument, in order.
+  ASSERT_EQ("BM_Match1/1", queue->Get());
+  ASSERT_EQ("BM_Match1/2", queue->Get());
+  ASSERT_EQ("BM_Match1/3", queue->Get());
+  ASSERT_EQ("BM_Match1/10", queue->Get());
+  ASSERT_EQ("BM_Match1/64", queue->Get());
+  ASSERT_EQ("BM_Match1/80", queue->Get());
+  ASSERT_EQ("BM_Match1/90", queue->Get());
+  ASSERT_EQ("BM_Match1/100", queue->Get());
+  ASSERT_EQ("DONE", queue->Get());
+}
+
+TEST_F(BenchmarkTest, Match1WithRepetition) {
+  FLAGS_benchmark_repetitions = 2;
+  // NOTE(review): the flag is never restored, so it stays set for later tests.
+  Execute("BM_Match1/(64|80)");
+  ASSERT_EQ("BM_Match1/64", queue->Get());
+  ASSERT_EQ("BM_Match1/64", queue->Get());
+  ASSERT_EQ("BM_Match1/80", queue->Get());
+  ASSERT_EQ("BM_Match1/80", queue->Get());
+  ASSERT_EQ("DONE", queue->Get());
+}
+
+TEST_F(BenchmarkTest, Match1WithRandomInterleaving) {
+  FLAGS_benchmark_enable_random_interleaving = true;
+  FLAGS_benchmark_repetitions = 100;
+
+  std::map<std::string, int> runs_per_name;
+  std::map<std::string, int> runs_per_ordering;
+  Execute("BM_Match1/(64|80)");
+  // Consume the recorded events two at a time, counting every event name and
+  // every ordered pair of names.
+  for (int pair = 0; pair < 100; ++pair) {
+    const std::string first = queue->Get();
+    const std::string second = queue->Get();
+    ++runs_per_name[first];
+    ++runs_per_name[second];
+    ++runs_per_ordering[StrFormat("%s,%s", first.c_str(), second.c_str())];
+  }
+  EXPECT_EQ(runs_per_name["BM_Match1/64"], 100) << "Unexpected repetitions.";
+  EXPECT_EQ(runs_per_name["BM_Match1/80"], 100) << "Unexpected repetitions.";
+  EXPECT_GE(runs_per_ordering.size(), 2) << "Interleaving was not randomized.";
+  ASSERT_EQ("DONE", queue->Get());
+}
+
+}  // namespace
+}  // namespace internal
+}  // namespace benchmark
diff --git a/third_party/google_benchmark/src/test/benchmark_setup_teardown_test.cc b/third_party/google_benchmark/src/test/benchmark_setup_teardown_test.cc
new file mode 100644
index 0000000..6c3cc2e
--- /dev/null
+++ b/third_party/google_benchmark/src/test/benchmark_setup_teardown_test.cc
@@ -0,0 +1,157 @@
+#include <atomic>
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <string>
+
+#include "benchmark/benchmark.h"
+
+// Test that Setup() and Teardown() are called exactly once
+// for each benchmark run (single-threaded).
+namespace singlethreaded {
+static int setup_call = 0;
+static int teardown_call = 0;
+}  // namespace singlethreaded
+static void DoSetup1(const benchmark::State& state) {
+  ++singlethreaded::setup_call;
+
+  // Setup/Teardown should never be called with any thread_idx != 0.
+  assert(state.thread_index() == 0);
+}
+
+static void DoTeardown1(const benchmark::State& state) {
+  ++singlethreaded::teardown_call;
+  assert(state.thread_index() == 0);
+}
+
+static void BM_with_setup(benchmark::State& state) {
+  for (auto s : state) {
+  }
+}
+BENCHMARK(BM_with_setup)
+    ->Arg(1)
+    ->Arg(3)
+    ->Arg(5)
+    ->Arg(7)
+    ->Iterations(100)
+    ->Setup(DoSetup1)
+    ->Teardown(DoTeardown1);
+
+// Test that Setup() and Teardown() are called once for each group of threads.
+namespace concurrent {
+static std::atomic<int> setup_call(0);
+static std::atomic<int> teardown_call(0);
+static std::atomic<int> func_call(0);
+}  // namespace concurrent
+
+static void DoSetup2(const benchmark::State& state) {
+  concurrent::setup_call.fetch_add(1, std::memory_order_acquire);
+  assert(state.thread_index() == 0);
+}
+
+static void DoTeardown2(const benchmark::State& state) {
+  concurrent::teardown_call.fetch_add(1, std::memory_order_acquire);
+  assert(state.thread_index() == 0);
+}
+
+static void BM_concurrent(benchmark::State& state) {
+  for (auto s : state) {
+  }
+  concurrent::func_call.fetch_add(1, std::memory_order_acquire);
+}
+
+BENCHMARK(BM_concurrent)
+    ->Setup(DoSetup2)
+    ->Teardown(DoTeardown2)
+    ->Iterations(100)
+    ->Threads(5)
+    ->Threads(10)
+    ->Threads(15);
+
+// Testing interaction with Fixture::SetUp/TearDown
+namespace fixture_interaction {
+int setup = 0;
+int fixture_setup = 0;
+}  // namespace fixture_interaction
+
+#define FIXTURE_BECHMARK_NAME MyFixture
+
+class FIXTURE_BECHMARK_NAME : public ::benchmark::Fixture {
+ public:
+  void SetUp(const ::benchmark::State&) override {
+    fixture_interaction::fixture_setup++;
+  }
+
+  ~FIXTURE_BECHMARK_NAME() override {}
+};
+
+BENCHMARK_F(FIXTURE_BECHMARK_NAME, BM_WithFixture)(benchmark::State& st) {
+  for (auto _ : st) {
+  }
+}
+
+static void DoSetupWithFixture(const benchmark::State&) {
+  fixture_interaction::setup++;
+}
+
+BENCHMARK_REGISTER_F(FIXTURE_BECHMARK_NAME, BM_WithFixture)
+    ->Arg(1)
+    ->Arg(3)
+    ->Arg(5)
+    ->Arg(7)
+    ->Setup(DoSetupWithFixture)
+    ->Repetitions(1)
+    ->Iterations(100);
+
+// Testing repetitions.
+namespace repetitions {
+int setup = 0;
+}
+
+static void DoSetupWithRepetitions(const benchmark::State&) {
+  repetitions::setup++;
+}
+static void BM_WithRep(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+
+BENCHMARK(BM_WithRep)
+    ->Arg(1)
+    ->Arg(3)
+    ->Arg(5)
+    ->Arg(7)
+    ->Setup(DoSetupWithRepetitions)
+    ->Iterations(100)
+    ->Repetitions(4);
+
+int main(int argc, char** argv) {
+  benchmark::Initialize(&argc, argv);
+  // NOTE: every check below is an assert(), which compiles away under NDEBUG.
+  size_t ret = benchmark::RunSpecifiedBenchmarks(".");
+  assert(ret > 0);
+
+  // Setup/Teardown is called once for each arg group (1,3,5,7).
+  assert(singlethreaded::setup_call == 4);
+  assert(singlethreaded::teardown_call == 4);
+
+  // 3 groups of threads call this function (5,10,15).
+  assert(concurrent::setup_call.load(std::memory_order_relaxed) == 3);
+  assert(concurrent::teardown_call.load(std::memory_order_relaxed) == 3);
+  assert((5 + 10 + 15) ==
+         concurrent::func_call.load(std::memory_order_relaxed));
+
+  // Setup is called 4 times, once for each arg group (1,3,5,7)
+  assert(fixture_interaction::setup == 4);
+  // Fixture::SetUp is called every time the bm routine is run.
+  // The exact number is non-deterministic, so we just assert that
+  // it's more than setup.
+  assert(fixture_interaction::fixture_setup > fixture_interaction::setup);
+
+  // Setup is called once for each repetition * num_arg = 4 * 4 = 16.
+  assert(repetitions::setup == 16);
+
+  return 0;
+}
diff --git a/third_party/google_benchmark/test/benchmark_test.cc b/third_party/google_benchmark/src/test/benchmark_test.cc
similarity index 78%
rename from third_party/google_benchmark/test/benchmark_test.cc
rename to third_party/google_benchmark/src/test/benchmark_test.cc
index 3cd4f55..94590d5 100644
--- a/third_party/google_benchmark/test/benchmark_test.cc
+++ b/third_party/google_benchmark/src/test/benchmark_test.cc
@@ -5,6 +5,7 @@
 #include <stdint.h>
 
 #include <chrono>
+#include <complex>
 #include <cstdlib>
 #include <iostream>
 #include <limits>
@@ -26,7 +27,7 @@
 
 namespace {
 
-int BENCHMARK_NOINLINE Factorial(uint32_t n) {
+int BENCHMARK_NOINLINE Factorial(int n) {
   return (n == 1) ? 1 : n * Factorial(n - 1);
 }
 
@@ -74,7 +75,8 @@
 static void BM_CalculatePi(benchmark::State& state) {
   static const int depth = 1024;
   for (auto _ : state) {
-    benchmark::DoNotOptimize(CalculatePi(static_cast<int>(depth)));
+    double pi = CalculatePi(static_cast<int>(depth));
+    benchmark::DoNotOptimize(pi);
   }
 }
 BENCHMARK(BM_CalculatePi)->Threads(8);
@@ -90,11 +92,13 @@
     for (int j = 0; j < state.range(1); ++j) data.insert(rand());
   }
   state.SetItemsProcessed(state.iterations() * state.range(1));
-  state.SetBytesProcessed(state.iterations() * state.range(1) * sizeof(int));
+  state.SetBytesProcessed(state.iterations() * state.range(1) *
+                          static_cast<int64_t>(sizeof(int)));
 }
 
-// Test many inserts at once to reduce the total iterations needed. Otherwise, the slower,
-// non-timed part of each iteration will make the benchmark take forever.
+// Test many inserts at once to reduce the total iterations needed. Otherwise,
+// the slower, non-timed part of each iteration will make the benchmark take
+// forever.
 BENCHMARK(BM_SetInsert)->Ranges({{1 << 10, 8 << 10}, {128, 512}});
 
 template <typename Container,
@@ -107,7 +111,7 @@
   }
   const int64_t items_processed = state.iterations() * state.range(0);
   state.SetItemsProcessed(items_processed);
-  state.SetBytesProcessed(items_processed * sizeof(v));
+  state.SetBytesProcessed(items_processed * static_cast<int64_t>(sizeof(v)));
 }
 BENCHMARK_TEMPLATE2(BM_Sequential, std::vector<int>, int)
     ->Range(1 << 0, 1 << 10);
@@ -121,12 +125,15 @@
   size_t len = static_cast<size_t>(state.range(0));
   std::string s1(len, '-');
   std::string s2(len, '-');
-  for (auto _ : state) benchmark::DoNotOptimize(s1.compare(s2));
+  for (auto _ : state) {
+    auto comp = s1.compare(s2);
+    benchmark::DoNotOptimize(comp);
+  }
 }
 BENCHMARK(BM_StringCompare)->Range(1, 1 << 20);
 
 static void BM_SetupTeardown(benchmark::State& state) {
-  if (state.thread_index == 0) {
+  if (state.thread_index() == 0) {
     // No need to lock test_vector_mu here as this is running single-threaded.
     test_vector = new std::vector<int>();
   }
@@ -139,7 +146,7 @@
       test_vector->pop_back();
     ++i;
   }
-  if (state.thread_index == 0) {
+  if (state.thread_index() == 0) {
     delete test_vector;
   }
 }
@@ -156,11 +163,11 @@
 
 static void BM_ParallelMemset(benchmark::State& state) {
   int64_t size = state.range(0) / static_cast<int64_t>(sizeof(int));
-  int thread_size = static_cast<int>(size) / state.threads;
-  int from = thread_size * state.thread_index;
+  int thread_size = static_cast<int>(size) / state.threads();
+  int from = thread_size * state.thread_index();
   int to = from + thread_size;
 
-  if (state.thread_index == 0) {
+  if (state.thread_index() == 0) {
     test_vector = new std::vector<int>(static_cast<size_t>(size));
   }
 
@@ -168,11 +175,11 @@
     for (int i = from; i < to; i++) {
       // No need to lock test_vector_mu as ranges
       // do not overlap between threads.
-      benchmark::DoNotOptimize(test_vector->at(i) = 1);
+      benchmark::DoNotOptimize(test_vector->at(static_cast<size_t>(i)) = 1);
     }
   }
 
-  if (state.thread_index == 0) {
+  if (state.thread_index() == 0) {
     delete test_vector;
   }
 }
@@ -214,7 +221,8 @@
                   std::pair<int, double>(42, 3.8));
 
 void BM_non_template_args(benchmark::State& state, int, double) {
-  while(state.KeepRunning()) {}
+  while (state.KeepRunning()) {
+  }
 }
 BENCHMARK_CAPTURE(BM_non_template_args, basic_test, 0, 0);
 
@@ -223,14 +231,14 @@
 static void BM_DenseThreadRanges(benchmark::State& st) {
   switch (st.range(0)) {
     case 1:
-      assert(st.threads == 1 || st.threads == 2 || st.threads == 3);
+      assert(st.threads() == 1 || st.threads() == 2 || st.threads() == 3);
       break;
     case 2:
-      assert(st.threads == 1 || st.threads == 3 || st.threads == 4);
+      assert(st.threads() == 1 || st.threads() == 3 || st.threads() == 4);
       break;
     case 3:
-      assert(st.threads == 5 || st.threads == 8 || st.threads == 11 ||
-             st.threads == 14);
+      assert(st.threads() == 5 || st.threads() == 8 || st.threads() == 11 ||
+             st.threads() == 14);
       break;
     default:
       assert(false && "Invalid test case number");
@@ -242,4 +250,25 @@
 BENCHMARK(BM_DenseThreadRanges)->Arg(2)->DenseThreadRange(1, 4, 2);
 BENCHMARK(BM_DenseThreadRanges)->Arg(3)->DenseThreadRange(5, 14, 3);
 
+static void BM_BenchmarkName(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+
+  // Check that the benchmark name is passed correctly to `state`.
+  assert("BM_BenchmarkName" == state.name());
+}
+BENCHMARK(BM_BenchmarkName);
+
+// regression test for #1446
+template <typename T>
+static void BM_templated_test(benchmark::State& state) {
+  for (auto _ : state) {
+    T instance;  // Default-initialized afresh on every iteration.
+    benchmark::DoNotOptimize(instance);
+  }
+}
+
+static auto BM_templated_test_double = BM_templated_test<std::complex<double>>;
+BENCHMARK(BM_templated_test_double);
+
 BENCHMARK_MAIN();
diff --git a/third_party/google_benchmark/test/clobber_memory_assembly_test.cc b/third_party/google_benchmark/src/test/clobber_memory_assembly_test.cc
similarity index 97%
rename from third_party/google_benchmark/test/clobber_memory_assembly_test.cc
rename to third_party/google_benchmark/src/test/clobber_memory_assembly_test.cc
index f41911a..54e26cc 100644
--- a/third_party/google_benchmark/test/clobber_memory_assembly_test.cc
+++ b/third_party/google_benchmark/src/test/clobber_memory_assembly_test.cc
@@ -3,13 +3,13 @@
 #ifdef __clang__
 #pragma clang diagnostic ignored "-Wreturn-type"
 #endif
+BENCHMARK_DISABLE_DEPRECATED_WARNING
 
 extern "C" {
 
 extern int ExternInt;
 extern int ExternInt2;
 extern int ExternInt3;
-
 }
 
 // CHECK-LABEL: test_basic:
diff --git a/third_party/google_benchmark/src/test/commandlineflags_gtest.cc b/third_party/google_benchmark/src/test/commandlineflags_gtest.cc
new file mode 100644
index 0000000..8412008
--- /dev/null
+++ b/third_party/google_benchmark/src/test/commandlineflags_gtest.cc
@@ -0,0 +1,228 @@
+#include <cstdlib>
+
+#include "../src/commandlineflags.h"
+#include "../src/internal_macros.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace benchmark {
+namespace {
+
+#if defined(BENCHMARK_OS_WINDOWS)
+// Windows stand-in for POSIX setenv(). Returns 0 on success, otherwise the
+// error code reported by _putenv_s().
+int setenv(const char* name, const char* value, int overwrite) {
+  // With overwrite == 0 an existing variable must be left untouched.
+  // NOTE: getenv_s is far superior but not available under mingw.
+  if (!overwrite && getenv(name) != nullptr) {
+    return 0;
+  }
+  return _putenv_s(name, value);
+}
+
+int unsetenv(const char* name) { return _putenv_s(name, ""); }
+
+#endif  // BENCHMARK_OS_WINDOWS
+
+TEST(BoolFromEnv, Default) {
+  ASSERT_EQ(unsetenv("NOT_IN_ENV"), 0);
+  EXPECT_EQ(BoolFromEnv("not_in_env", true), true);
+}
+
+TEST(BoolFromEnv, False) {
+  ASSERT_EQ(setenv("IN_ENV", "0", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "N", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "n", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "NO", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "No", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "no", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "F", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "f", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "FALSE", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "False", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "false", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "OFF", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "Off", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "off", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", true), false);
+  unsetenv("IN_ENV");
+}
+
+TEST(BoolFromEnv, True) {
+  ASSERT_EQ(setenv("IN_ENV", "1", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "Y", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "y", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "YES", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "Yes", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "yes", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "T", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "t", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "TRUE", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "True", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "true", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "ON", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "On", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+
+  ASSERT_EQ(setenv("IN_ENV", "on", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+  // Empty value reads as true; Windows cannot store "" (see unsetenv above).
+#ifndef BENCHMARK_OS_WINDOWS
+  ASSERT_EQ(setenv("IN_ENV", "", 1), 0);
+  EXPECT_EQ(BoolFromEnv("in_env", false), true);
+  unsetenv("IN_ENV");
+#endif
+}
+
+TEST(Int32FromEnv, NotInEnv) {
+  ASSERT_EQ(unsetenv("NOT_IN_ENV"), 0);
+  EXPECT_EQ(Int32FromEnv("not_in_env", 42), 42);
+}
+
+TEST(Int32FromEnv, InvalidInteger) {
+  ASSERT_EQ(setenv("IN_ENV", "foo", 1), 0);
+  EXPECT_EQ(Int32FromEnv("in_env", 42), 42);
+  unsetenv("IN_ENV");
+}
+
+TEST(Int32FromEnv, ValidInteger) {
+  ASSERT_EQ(setenv("IN_ENV", "42", 1), 0);
+  EXPECT_EQ(Int32FromEnv("in_env", 64), 42);
+  unsetenv("IN_ENV");
+}
+
+TEST(DoubleFromEnv, NotInEnv) {
+  ASSERT_EQ(unsetenv("NOT_IN_ENV"), 0);
+  EXPECT_EQ(DoubleFromEnv("not_in_env", 0.51), 0.51);
+}
+
+TEST(DoubleFromEnv, InvalidReal) {
+  ASSERT_EQ(setenv("IN_ENV", "foo", 1), 0);
+  EXPECT_EQ(DoubleFromEnv("in_env", 0.51), 0.51);
+  unsetenv("IN_ENV");
+}
+
+TEST(DoubleFromEnv, ValidReal) {
+  ASSERT_EQ(setenv("IN_ENV", "0.51", 1), 0);
+  EXPECT_EQ(DoubleFromEnv("in_env", 0.71), 0.51);
+  unsetenv("IN_ENV");
+}
+
+TEST(StringFromEnv, Default) {
+  ASSERT_EQ(unsetenv("NOT_IN_ENV"), 0);
+  EXPECT_STREQ(StringFromEnv("not_in_env", "foo"), "foo");
+}
+
+TEST(StringFromEnv, Valid) {
+  ASSERT_EQ(setenv("IN_ENV", "foo", 1), 0);
+  EXPECT_STREQ(StringFromEnv("in_env", "bar"), "foo");
+  unsetenv("IN_ENV");
+}
+
+TEST(KvPairsFromEnv, Default) {
+  ASSERT_EQ(unsetenv("NOT_IN_ENV"), 0);
+  EXPECT_THAT(KvPairsFromEnv("not_in_env", {{"foo", "bar"}}),
+              testing::ElementsAre(testing::Pair("foo", "bar")));
+}
+
+TEST(KvPairsFromEnv, MalformedReturnsDefault) {
+  ASSERT_EQ(setenv("IN_ENV", "foo", 1), 0);
+  EXPECT_THAT(KvPairsFromEnv("in_env", {{"foo", "bar"}}),
+              testing::ElementsAre(testing::Pair("foo", "bar")));
+  unsetenv("IN_ENV");
+}
+
+TEST(KvPairsFromEnv, Single) {
+  ASSERT_EQ(setenv("IN_ENV", "foo=bar", 1), 0);
+  EXPECT_THAT(KvPairsFromEnv("in_env", {}),
+              testing::ElementsAre(testing::Pair("foo", "bar")));
+  unsetenv("IN_ENV");
+}
+
+TEST(KvPairsFromEnv, Multiple) {
+  ASSERT_EQ(setenv("IN_ENV", "foo=bar,baz=qux", 1), 0);
+  EXPECT_THAT(KvPairsFromEnv("in_env", {}),
+              testing::UnorderedElementsAre(testing::Pair("foo", "bar"),
+                                            testing::Pair("baz", "qux")));
+  unsetenv("IN_ENV");
+}
+
+}  // namespace
+}  // namespace benchmark
diff --git a/third_party/google_benchmark/test/complexity_test.cc b/third_party/google_benchmark/src/test/complexity_test.cc
similarity index 71%
rename from third_party/google_benchmark/test/complexity_test.cc
rename to third_party/google_benchmark/src/test/complexity_test.cc
index 5681fdc..76891e0 100644
--- a/third_party/google_benchmark/test/complexity_test.cc
+++ b/third_party/google_benchmark/src/test/complexity_test.cc
@@ -4,6 +4,7 @@
 #include <cmath>
 #include <cstdlib>
 #include <vector>
+
 #include "benchmark/benchmark.h"
 #include "output_test.h"
 
@@ -12,8 +13,10 @@
 #define ADD_COMPLEXITY_CASES(...) \
   int CONCAT(dummy, __LINE__) = AddComplexityTest(__VA_ARGS__)
 
-int AddComplexityTest(std::string test_name, std::string big_o_test_name,
-                      std::string rms_test_name, std::string big_o) {
+int AddComplexityTest(const std::string &test_name,
+                      const std::string &big_o_test_name,
+                      const std::string &rms_test_name,
+                      const std::string &big_o, int family_index) {
   SetSubstitutions({{"%name", test_name},
                     {"%bigo_name", big_o_test_name},
                     {"%rms_name", rms_test_name},
@@ -25,25 +28,33 @@
       {{"^%bigo_name %bigo_str %bigo_str[ ]*$"},
        {"^%bigo_name", MR_Not},  // Assert we we didn't only matched a name.
        {"^%rms_name %rms %rms[ ]*$", MR_Next}});
-  AddCases(TC_JSONOut, {{"\"name\": \"%bigo_name\",$"},
-                        {"\"run_name\": \"%name\",$", MR_Next},
-                        {"\"run_type\": \"aggregate\",$", MR_Next},
-                        {"\"repetitions\": %int,$", MR_Next},
-                        {"\"threads\": 1,$", MR_Next},
-                        {"\"aggregate_name\": \"BigO\",$", MR_Next},
-                        {"\"cpu_coefficient\": %float,$", MR_Next},
-                        {"\"real_coefficient\": %float,$", MR_Next},
-                        {"\"big_o\": \"%bigo\",$", MR_Next},
-                        {"\"time_unit\": \"ns\"$", MR_Next},
-                        {"}", MR_Next},
-                        {"\"name\": \"%rms_name\",$"},
-                        {"\"run_name\": \"%name\",$", MR_Next},
-                        {"\"run_type\": \"aggregate\",$", MR_Next},
-                        {"\"repetitions\": %int,$", MR_Next},
-                        {"\"threads\": 1,$", MR_Next},
-                        {"\"aggregate_name\": \"RMS\",$", MR_Next},
-                        {"\"rms\": %float$", MR_Next},
-                        {"}", MR_Next}});
+  AddCases(
+      TC_JSONOut,
+      {{"\"name\": \"%bigo_name\",$"},
+       {"\"family_index\": " + std::to_string(family_index) + ",$", MR_Next},
+       {"\"per_family_instance_index\": 0,$", MR_Next},
+       {"\"run_name\": \"%name\",$", MR_Next},
+       {"\"run_type\": \"aggregate\",$", MR_Next},
+       {"\"repetitions\": %int,$", MR_Next},
+       {"\"threads\": 1,$", MR_Next},
+       {"\"aggregate_name\": \"BigO\",$", MR_Next},
+       {"\"aggregate_unit\": \"time\",$", MR_Next},
+       {"\"cpu_coefficient\": %float,$", MR_Next},
+       {"\"real_coefficient\": %float,$", MR_Next},
+       {"\"big_o\": \"%bigo\",$", MR_Next},
+       {"\"time_unit\": \"ns\"$", MR_Next},
+       {"}", MR_Next},
+       {"\"name\": \"%rms_name\",$"},
+       {"\"family_index\": " + std::to_string(family_index) + ",$", MR_Next},
+       {"\"per_family_instance_index\": 0,$", MR_Next},
+       {"\"run_name\": \"%name\",$", MR_Next},
+       {"\"run_type\": \"aggregate\",$", MR_Next},
+       {"\"repetitions\": %int,$", MR_Next},
+       {"\"threads\": 1,$", MR_Next},
+       {"\"aggregate_name\": \"RMS\",$", MR_Next},
+       {"\"aggregate_unit\": \"percentage\",$", MR_Next},
+       {"\"rms\": %float$", MR_Next},
+       {"}", MR_Next}});
   AddCases(TC_CSVOut, {{"^\"%bigo_name\",,%float,%float,%bigo,,,,,$"},
                        {"^\"%bigo_name\"", MR_Not},
                        {"^\"%rms_name\",,%float,%float,,,,,,$", MR_Next}});
@@ -56,10 +67,10 @@
 // --------------------------- Testing BigO O(1) --------------------------- //
 // ========================================================================= //
 
-void BM_Complexity_O1(benchmark::State& state) {
+void BM_Complexity_O1(benchmark::State &state) {
   for (auto _ : state) {
     for (int i = 0; i < 1024; ++i) {
-      benchmark::DoNotOptimize(&i);
+      benchmark::DoNotOptimize(i);
     }
   }
   state.SetComplexityN(state.range(0));
@@ -82,15 +93,15 @@
 
 // Add enum tests
 ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
-                     enum_big_o_1);
+                     enum_big_o_1, /*family_index=*/0);
 
 // Add auto enum tests
 ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
-                     auto_big_o_1);
+                     auto_big_o_1, /*family_index=*/1);
 
 // Add lambda tests
 ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
-                     lambda_big_o_1);
+                     lambda_big_o_1, /*family_index=*/2);
 
 // ========================================================================= //
 // --------------------------- Testing BigO O(N) --------------------------- //
@@ -98,19 +109,20 @@
 
 std::vector<int> ConstructRandomVector(int64_t size) {
   std::vector<int> v;
-  v.reserve(static_cast<int>(size));
+  v.reserve(static_cast<size_t>(size));
   for (int i = 0; i < size; ++i) {
     v.push_back(static_cast<int>(std::rand() % size));
   }
   return v;
 }
 
-void BM_Complexity_O_N(benchmark::State& state) {
+void BM_Complexity_O_N(benchmark::State &state) {
   auto v = ConstructRandomVector(state.range(0));
   // Test worst case scenario (item not in vector)
   const int64_t item_not_in_vector = state.range(0) * 2;
   for (auto _ : state) {
-    benchmark::DoNotOptimize(std::find(v.begin(), v.end(), item_not_in_vector));
+    auto it = std::find(v.begin(), v.end(), item_not_in_vector);
+    benchmark::DoNotOptimize(it);
   }
   state.SetComplexityN(state.range(0));
 }
@@ -137,17 +149,17 @@
 
 // Add enum tests
 ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name,
-                     enum_auto_big_o_n);
+                     enum_auto_big_o_n, /*family_index=*/3);
 
 // Add lambda tests
 ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name,
-                     lambda_big_o_n);
+                     lambda_big_o_n, /*family_index=*/4);
 
 // ========================================================================= //
 // ------------------------- Testing BigO O(N*lgN) ------------------------- //
 // ========================================================================= //
 
-static void BM_Complexity_O_N_log_N(benchmark::State& state) {
+static void BM_Complexity_O_N_log_N(benchmark::State &state) {
   auto v = ConstructRandomVector(state.range(0));
   for (auto _ : state) {
     std::sort(v.begin(), v.end());
@@ -163,7 +175,7 @@
     ->RangeMultiplier(2)
     ->Range(1 << 10, 1 << 16)
     ->Complexity([](benchmark::IterationCount n) {
-      return kLog2E * n * log(static_cast<double>(n));
+      return kLog2E * static_cast<double>(n) * log(static_cast<double>(n));
     });
 BENCHMARK(BM_Complexity_O_N_log_N)
     ->RangeMultiplier(2)
@@ -178,20 +190,23 @@
 
 // Add enum tests
 ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
-                     rms_o_n_lg_n_test_name, enum_auto_big_o_n_lg_n);
+                     rms_o_n_lg_n_test_name, enum_auto_big_o_n_lg_n,
+                     /*family_index=*/6);
 
 // Add lambda tests
 ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
-                     rms_o_n_lg_n_test_name, lambda_big_o_n_lg_n);
+                     rms_o_n_lg_n_test_name, lambda_big_o_n_lg_n,
+                     /*family_index=*/7);
 
 // ========================================================================= //
 // -------- Testing formatting of Complexity with captured args ------------ //
 // ========================================================================= //
 
-void BM_ComplexityCaptureArgs(benchmark::State& state, int n) {
+void BM_ComplexityCaptureArgs(benchmark::State &state, int n) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = state.iterations();
+    benchmark::DoNotOptimize(iterations);
   }
   state.SetComplexityN(n);
 }
@@ -204,7 +219,7 @@
     "BM_ComplexityCaptureArgs/capture_test";
 
 ADD_COMPLEXITY_CASES(complexity_capture_name, complexity_capture_name + "_BigO",
-                     complexity_capture_name + "_RMS", "N");
+                     complexity_capture_name + "_RMS", "N", /*family_index=*/9);
 
 // ========================================================================= //
 // --------------------------- TEST CASES END ------------------------------ //
diff --git a/third_party/google_benchmark/test/cxx03_test.cc b/third_party/google_benchmark/src/test/cxx03_test.cc
similarity index 93%
rename from third_party/google_benchmark/test/cxx03_test.cc
rename to third_party/google_benchmark/src/test/cxx03_test.cc
index c4c9a52..9711c1b 100644
--- a/third_party/google_benchmark/test/cxx03_test.cc
+++ b/third_party/google_benchmark/src/test/cxx03_test.cc
@@ -44,8 +44,7 @@
 BENCHMARK_TEMPLATE1(BM_template1, int);
 
 template <class T>
-struct BM_Fixture : public ::benchmark::Fixture {
-};
+struct BM_Fixture : public ::benchmark::Fixture {};
 
 BENCHMARK_TEMPLATE_F(BM_Fixture, BM_template1, long)(benchmark::State& state) {
   BM_empty(state);
@@ -55,8 +54,8 @@
 }
 
 void BM_counters(benchmark::State& state) {
-    BM_empty(state);
-    state.counters["Foo"] = 2;
+  BM_empty(state);
+  state.counters["Foo"] = 2;
 }
 BENCHMARK(BM_counters);
 
diff --git a/third_party/google_benchmark/test/diagnostics_test.cc b/third_party/google_benchmark/src/test/diagnostics_test.cc
similarity index 84%
rename from third_party/google_benchmark/test/diagnostics_test.cc
rename to third_party/google_benchmark/src/test/diagnostics_test.cc
index dd64a33..fda14b3 100644
--- a/third_party/google_benchmark/test/diagnostics_test.cc
+++ b/third_party/google_benchmark/src/test/diagnostics_test.cc
@@ -26,7 +26,8 @@
 }
 
 void try_invalid_pause_resume(benchmark::State& state) {
-#if !defined(TEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS) && !defined(TEST_HAS_NO_EXCEPTIONS)
+#if !defined(TEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS) && \
+    !defined(TEST_HAS_NO_EXCEPTIONS)
   try {
     state.PauseTiming();
     std::abort();
@@ -48,7 +49,8 @@
   if (called_once == false) try_invalid_pause_resume(state);
 
   for (auto _ : state) {
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = state.iterations();
+    benchmark::DoNotOptimize(iterations);
   }
 
   if (called_once == false) try_invalid_pause_resume(state);
@@ -57,14 +59,14 @@
 }
 BENCHMARK(BM_diagnostic_test);
 
-
 void BM_diagnostic_test_keep_running(benchmark::State& state) {
   static bool called_once = false;
 
   if (called_once == false) try_invalid_pause_resume(state);
 
-  while(state.KeepRunning()) {
-    benchmark::DoNotOptimize(state.iterations());
+  while (state.KeepRunning()) {
+    auto iterations = state.iterations();
+    benchmark::DoNotOptimize(iterations);
   }
 
   if (called_once == false) try_invalid_pause_resume(state);
diff --git a/third_party/google_benchmark/test/display_aggregates_only_test.cc b/third_party/google_benchmark/src/test/display_aggregates_only_test.cc
similarity index 85%
rename from third_party/google_benchmark/test/display_aggregates_only_test.cc
rename to third_party/google_benchmark/src/test/display_aggregates_only_test.cc
index 3c36d3f..6ad65e7 100644
--- a/third_party/google_benchmark/test/display_aggregates_only_test.cc
+++ b/third_party/google_benchmark/src/test/display_aggregates_only_test.cc
@@ -19,21 +19,23 @@
 int main(int argc, char* argv[]) {
   const std::string output = GetFileReporterOutput(argc, argv);
 
-  if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 6 ||
+  if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 7 ||
       SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3\"") != 3 ||
       SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_mean\"") != 1 ||
       SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_median\"") !=
           1 ||
       SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"") !=
-          1) {
-    std::cout << "Precondition mismatch. Expected to only find 6 "
+          1 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_cv\"") != 1) {
+    std::cout << "Precondition mismatch. Expected to only find 7 "
                  "occurrences of \"BM_SummaryRepeat/repeats:3\" substring:\n"
                  "\"name\": \"BM_SummaryRepeat/repeats:3\", "
                  "\"name\": \"BM_SummaryRepeat/repeats:3\", "
                  "\"name\": \"BM_SummaryRepeat/repeats:3\", "
                  "\"name\": \"BM_SummaryRepeat/repeats:3_mean\", "
                  "\"name\": \"BM_SummaryRepeat/repeats:3_median\", "
-                 "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"\nThe entire "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_cv\"\nThe entire "
                  "output:\n";
     std::cout << output;
     return 1;
diff --git a/third_party/google_benchmark/test/donotoptimize_assembly_test.cc b/third_party/google_benchmark/src/test/donotoptimize_assembly_test.cc
similarity index 78%
rename from third_party/google_benchmark/test/donotoptimize_assembly_test.cc
rename to third_party/google_benchmark/src/test/donotoptimize_assembly_test.cc
index d4b0bab..dc286f5 100644
--- a/third_party/google_benchmark/test/donotoptimize_assembly_test.cc
+++ b/third_party/google_benchmark/src/test/donotoptimize_assembly_test.cc
@@ -3,19 +3,23 @@
 #ifdef __clang__
 #pragma clang diagnostic ignored "-Wreturn-type"
 #endif
+BENCHMARK_DISABLE_DEPRECATED_WARNING
 
 extern "C" {
 
 extern int ExternInt;
 extern int ExternInt2;
 extern int ExternInt3;
+extern int BigArray[2049];
+
+const int ConstBigArray[2049]{};
 
 inline int Add42(int x) { return x + 42; }
 
 struct NotTriviallyCopyable {
   NotTriviallyCopyable();
   explicit NotTriviallyCopyable(int x) : value(x) {}
-  NotTriviallyCopyable(NotTriviallyCopyable const&);
+  NotTriviallyCopyable(NotTriviallyCopyable const &);
   int value;
 };
 
@@ -24,7 +28,14 @@
   int data[2];
 };
 
+struct ExtraLarge {
+  int arr[2049];
+};
 }
+
+extern ExtraLarge ExtraLargeObj;
+const ExtraLarge ConstExtraLargeObj{};
+
 // CHECK-LABEL: test_with_rvalue:
 extern "C" void test_with_rvalue() {
   benchmark::DoNotOptimize(Add42(0));
@@ -69,6 +80,22 @@
   // CHECK: ret
 }
 
+// CHECK-LABEL: test_with_extra_large_lvalue_with_op:
+extern "C" void test_with_extra_large_lvalue_with_op() {
+  ExtraLargeObj.arr[16] = 42;
+  benchmark::DoNotOptimize(ExtraLargeObj);
+  // CHECK: movl $42, ExtraLargeObj+64(%rip)
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_with_big_array_with_op
+extern "C" void test_with_big_array_with_op() {
+  BigArray[16] = 42;
+  benchmark::DoNotOptimize(BigArray);
+  // CHECK: movl $42, BigArray+64(%rip)
+  // CHECK: ret
+}
+
 // CHECK-LABEL: test_with_non_trivial_lvalue:
 extern "C" void test_with_non_trivial_lvalue() {
   NotTriviallyCopyable NTC(ExternInt);
@@ -97,6 +124,18 @@
   // CHECK: ret
 }
 
+// CHECK-LABEL: test_with_const_extra_large_obj:
+extern "C" void test_with_const_extra_large_obj() {
+  benchmark::DoNotOptimize(ConstExtraLargeObj);
+  // CHECK: ret
+}
+
+// CHECK-LABEL: test_with_const_big_array
+extern "C" void test_with_const_big_array() {
+  benchmark::DoNotOptimize(ConstBigArray);
+  // CHECK: ret
+}
+
 // CHECK-LABEL: test_with_non_trivial_const_lvalue:
 extern "C" void test_with_non_trivial_const_lvalue() {
   const NotTriviallyCopyable Obj(ExternInt);
@@ -118,8 +157,7 @@
 // CHECK-LABEL: test_inc_integer:
 extern "C" int test_inc_integer() {
   int x = 0;
-  for (int i=0; i < 5; ++i)
-    benchmark::DoNotOptimize(++x);
+  for (int i = 0; i < 5; ++i) benchmark::DoNotOptimize(++x);
   // CHECK: movl $1, [[DEST:.*]]
   // CHECK: {{(addl \$1,|incl)}} [[DEST]]
   // CHECK: {{(addl \$1,|incl)}} [[DEST]]
@@ -147,7 +185,7 @@
   // CHECK-CLANG: movq %rax, -{{[0-9]+}}(%[[REG:[a-z]+]])
   // CHECK: ret
   int x = 42;
-  int * const xp = &x;
+  int *const xp = &x;
   benchmark::DoNotOptimize(xp);
 }
 
diff --git a/third_party/google_benchmark/test/donotoptimize_test.cc b/third_party/google_benchmark/src/test/donotoptimize_test.cc
similarity index 61%
rename from third_party/google_benchmark/test/donotoptimize_test.cc
rename to third_party/google_benchmark/src/test/donotoptimize_test.cc
index 2ce92d1..90d5af3 100644
--- a/third_party/google_benchmark/test/donotoptimize_test.cc
+++ b/third_party/google_benchmark/src/test/donotoptimize_test.cc
@@ -1,33 +1,43 @@
-#include "benchmark/benchmark.h"
-
 #include <cstdint>
 
+#include "benchmark/benchmark.h"
+
 namespace {
 #if defined(__GNUC__)
-std::uint64_t double_up(const std::uint64_t x) __attribute__((const));
+std::int64_t double_up(const std::int64_t x) __attribute__((const));
 #endif
-std::uint64_t double_up(const std::uint64_t x) { return x * 2; }
-}
+std::int64_t double_up(const std::int64_t x) { return x * 2; }
+}  // namespace
 
 // Using DoNotOptimize on types like BitRef seem to cause a lot of problems
 // with the inline assembly on both GCC and Clang.
 struct BitRef {
   int index;
-  unsigned char &byte;
+  unsigned char& byte;
 
-public:
+ public:
   static BitRef Make() {
     static unsigned char arr[2] = {};
     BitRef b(1, arr[0]);
     return b;
   }
-private:
+
+ private:
   BitRef(int i, unsigned char& b) : index(i), byte(b) {}
 };
 
 int main(int, char*[]) {
   // this test verifies compilation of DoNotOptimize() for some types
 
+  char buffer1[1] = "";
+  benchmark::DoNotOptimize(buffer1);
+
+  char buffer2[2] = "";
+  benchmark::DoNotOptimize(buffer2);
+
+  char buffer3[3] = "";
+  benchmark::DoNotOptimize(buffer3);
+
   char buffer8[8] = "";
   benchmark::DoNotOptimize(buffer8);
 
@@ -36,17 +46,19 @@
 
   char buffer1024[1024] = "";
   benchmark::DoNotOptimize(buffer1024);
-  benchmark::DoNotOptimize(&buffer1024[0]);
+  char* bptr = &buffer1024[0];
+  benchmark::DoNotOptimize(bptr);
 
   int x = 123;
   benchmark::DoNotOptimize(x);
-  benchmark::DoNotOptimize(&x);
+  int* xp = &x;
+  benchmark::DoNotOptimize(xp);
   benchmark::DoNotOptimize(x += 42);
 
-  benchmark::DoNotOptimize(double_up(x));
+  std::int64_t y = double_up(x);
+  benchmark::DoNotOptimize(y);
 
   // These tests are to e
-  benchmark::DoNotOptimize(BitRef::Make());
   BitRef lval = BitRef::Make();
   benchmark::DoNotOptimize(lval);
 }
diff --git a/third_party/google_benchmark/test/filter_test.cc b/third_party/google_benchmark/src/test/filter_test.cc
similarity index 60%
rename from third_party/google_benchmark/test/filter_test.cc
rename to third_party/google_benchmark/src/test/filter_test.cc
index 0e27065..4c8b8ea 100644
--- a/third_party/google_benchmark/test/filter_test.cc
+++ b/third_party/google_benchmark/src/test/filter_test.cc
@@ -1,36 +1,40 @@
-#include "benchmark/benchmark.h"
-
+#include <algorithm>
 #include <cassert>
 #include <cmath>
 #include <cstdint>
 #include <cstdlib>
-
 #include <iostream>
 #include <limits>
 #include <sstream>
 #include <string>
 
+#include "benchmark/benchmark.h"
+
 namespace {
 
 class TestReporter : public benchmark::ConsoleReporter {
  public:
-  virtual bool ReportContext(const Context& context) {
+  bool ReportContext(const Context& context) override {
     return ConsoleReporter::ReportContext(context);
   };
 
-  virtual void ReportRuns(const std::vector<Run>& report) {
+  void ReportRuns(const std::vector<Run>& report) override {
     ++count_;
+    max_family_index_ = std::max(max_family_index_, report[0].family_index);
     ConsoleReporter::ReportRuns(report);
   };
 
-  TestReporter() : count_(0) {}
+  TestReporter() : count_(0), max_family_index_(0) {}
 
-  virtual ~TestReporter() {}
+  ~TestReporter() override {}
 
-  size_t GetCount() const { return count_; }
+  int GetCount() const { return count_; }
+
+  int64_t GetMaxFamilyIndex() const { return max_family_index_; }
 
  private:
-  mutable size_t count_;
+  mutable int count_;
+  mutable int64_t max_family_index_;
 };
 
 }  // end namespace
@@ -65,7 +69,7 @@
 }
 BENCHMARK(BM_FooBa);
 
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   bool list_only = false;
   for (int i = 0; i < argc; ++i)
     list_only |= std::string(argv[i]).find("--benchmark_list_tests") !=
@@ -74,13 +78,13 @@
   benchmark::Initialize(&argc, argv);
 
   TestReporter test_reporter;
-  const size_t returned_count =
-      benchmark::RunSpecifiedBenchmarks(&test_reporter);
+  const int64_t returned_count =
+      static_cast<int64_t>(benchmark::RunSpecifiedBenchmarks(&test_reporter));
 
   if (argc == 2) {
     // Make sure we ran all of the tests
     std::stringstream ss(argv[1]);
-    size_t expected_return;
+    int64_t expected_return;
     ss >> expected_return;
 
     if (returned_count != expected_return) {
@@ -90,14 +94,23 @@
       return -1;
     }
 
-    const size_t expected_reports = list_only ? 0 : expected_return;
-    const size_t reports_count = test_reporter.GetCount();
+    const int64_t expected_reports = list_only ? 0 : expected_return;
+    const int64_t reports_count = test_reporter.GetCount();
     if (reports_count != expected_reports) {
       std::cerr << "ERROR: Expected " << expected_reports
                 << " tests to be run but reported_count = " << reports_count
                 << std::endl;
       return -1;
     }
+
+    const int64_t max_family_index = test_reporter.GetMaxFamilyIndex();
+    const int64_t num_families = reports_count == 0 ? 0 : 1 + max_family_index;
+    if (num_families != expected_reports) {
+      std::cerr << "ERROR: Expected " << expected_reports
+                << " test families to be run but num_families = "
+                << num_families << std::endl;
+      return -1;
+    }
   }
 
   return 0;
diff --git a/third_party/google_benchmark/src/test/fixture_test.cc b/third_party/google_benchmark/src/test/fixture_test.cc
new file mode 100644
index 0000000..d1093eb
--- /dev/null
+++ b/third_party/google_benchmark/src/test/fixture_test.cc
@@ -0,0 +1,51 @@
+
+#include <cassert>
+#include <memory>
+
+#include "benchmark/benchmark.h"
+
+#define FIXTURE_BECHMARK_NAME MyFixture
+
+class FIXTURE_BECHMARK_NAME : public ::benchmark::Fixture {
+ public:
+  void SetUp(const ::benchmark::State& state) override {
+    if (state.thread_index() == 0) {
+      assert(data.get() == nullptr);
+      data.reset(new int(42));
+    }
+  }
+
+  void TearDown(const ::benchmark::State& state) override {
+    if (state.thread_index() == 0) {
+      assert(data.get() != nullptr);
+      data.reset();
+    }
+  }
+
+  ~FIXTURE_BECHMARK_NAME() override { assert(data == nullptr); }
+
+  std::unique_ptr<int> data;
+};
+
+BENCHMARK_F(FIXTURE_BECHMARK_NAME, Foo)(benchmark::State& st) {
+  assert(data.get() != nullptr);
+  assert(*data == 42);
+  for (auto _ : st) {
+  }
+}
+
+BENCHMARK_DEFINE_F(FIXTURE_BECHMARK_NAME, Bar)(benchmark::State& st) {
+  if (st.thread_index() == 0) {
+    assert(data.get() != nullptr);
+    assert(*data == 42);
+  }
+  for (auto _ : st) {
+    assert(data.get() != nullptr);
+    assert(*data == 42);
+  }
+  st.SetItemsProcessed(st.range(0));
+}
+BENCHMARK_REGISTER_F(FIXTURE_BECHMARK_NAME, Bar)->Arg(42);
+BENCHMARK_REGISTER_F(FIXTURE_BECHMARK_NAME, Bar)->Arg(42)->ThreadPerCpu();
+
+BENCHMARK_MAIN();
diff --git a/third_party/google_benchmark/test/internal_threading_test.cc b/third_party/google_benchmark/src/test/internal_threading_test.cc
similarity index 99%
rename from third_party/google_benchmark/test/internal_threading_test.cc
rename to third_party/google_benchmark/src/test/internal_threading_test.cc
index 039d7c1..62b5b95 100644
--- a/third_party/google_benchmark/test/internal_threading_test.cc
+++ b/third_party/google_benchmark/src/test/internal_threading_test.cc
@@ -3,6 +3,7 @@
 
 #include <chrono>
 #include <thread>
+
 #include "../src/timers.h"
 #include "benchmark/benchmark.h"
 #include "output_test.h"
diff --git a/third_party/google_benchmark/test/link_main_test.cc b/third_party/google_benchmark/src/test/link_main_test.cc
similarity index 60%
rename from third_party/google_benchmark/test/link_main_test.cc
rename to third_party/google_benchmark/src/test/link_main_test.cc
index 241ad5c..e806500 100644
--- a/third_party/google_benchmark/test/link_main_test.cc
+++ b/third_party/google_benchmark/src/test/link_main_test.cc
@@ -2,7 +2,8 @@
 
 void BM_empty(benchmark::State& state) {
   for (auto _ : state) {
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = state.iterations();
+    benchmark::DoNotOptimize(iterations);
   }
 }
 BENCHMARK(BM_empty);
diff --git a/third_party/google_benchmark/test/map_test.cc b/third_party/google_benchmark/src/test/map_test.cc
similarity index 81%
rename from third_party/google_benchmark/test/map_test.cc
rename to third_party/google_benchmark/src/test/map_test.cc
index dbf7982..0fdba7c 100644
--- a/third_party/google_benchmark/test/map_test.cc
+++ b/third_party/google_benchmark/src/test/map_test.cc
@@ -1,8 +1,8 @@
-#include "benchmark/benchmark.h"
-
 #include <cstdlib>
 #include <map>
 
+#include "benchmark/benchmark.h"
+
 namespace {
 
 std::map<int, int> ConstructRandomMap(int size) {
@@ -24,7 +24,8 @@
     m = ConstructRandomMap(size);
     state.ResumeTiming();
     for (int i = 0; i < size; ++i) {
-      benchmark::DoNotOptimize(m.find(std::rand() % size));
+      auto it = m.find(std::rand() % size);
+      benchmark::DoNotOptimize(it);
     }
   }
   state.SetItemsProcessed(state.iterations() * size);
@@ -34,11 +35,11 @@
 // Using fixtures.
 class MapFixture : public ::benchmark::Fixture {
  public:
-  void SetUp(const ::benchmark::State& st) {
+  void SetUp(const ::benchmark::State& st) override {
     m = ConstructRandomMap(static_cast<int>(st.range(0)));
   }
 
-  void TearDown(const ::benchmark::State&) { m.clear(); }
+  void TearDown(const ::benchmark::State&) override { m.clear(); }
 
   std::map<int, int> m;
 };
@@ -47,7 +48,8 @@
   const int size = static_cast<int>(state.range(0));
   for (auto _ : state) {
     for (int i = 0; i < size; ++i) {
-      benchmark::DoNotOptimize(m.find(std::rand() % size));
+      auto it = m.find(std::rand() % size);
+      benchmark::DoNotOptimize(it);
     }
   }
   state.SetItemsProcessed(state.iterations() * size);
diff --git a/third_party/google_benchmark/test/memory_manager_test.cc b/third_party/google_benchmark/src/test/memory_manager_test.cc
similarity index 76%
rename from third_party/google_benchmark/test/memory_manager_test.cc
rename to third_party/google_benchmark/src/test/memory_manager_test.cc
index 90bed16..d94bd51 100644
--- a/third_party/google_benchmark/test/memory_manager_test.cc
+++ b/third_party/google_benchmark/src/test/memory_manager_test.cc
@@ -5,25 +5,28 @@
 #include "output_test.h"
 
 class TestMemoryManager : public benchmark::MemoryManager {
-  void Start() {}
-  void Stop(Result* result) {
-    result->num_allocs = 42;
-    result->max_bytes_used = 42000;
+  void Start() override {}
+  void Stop(Result& result) override {
+    result.num_allocs = 42;
+    result.max_bytes_used = 42000;
   }
 };
 
 void BM_empty(benchmark::State& state) {
   for (auto _ : state) {
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = state.iterations();
+    benchmark::DoNotOptimize(iterations);
   }
 }
 BENCHMARK(BM_empty);
 
 ADD_CASES(TC_ConsoleOut, {{"^BM_empty %console_report$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_empty\",$"},
+                       {"\"family_index\": 0,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_empty\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
-                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
                        {"\"repetition_index\": 0,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
diff --git a/third_party/google_benchmark/src/test/min_time_parse_gtest.cc b/third_party/google_benchmark/src/test/min_time_parse_gtest.cc
new file mode 100644
index 0000000..e2bdf67
--- /dev/null
+++ b/third_party/google_benchmark/src/test/min_time_parse_gtest.cc
@@ -0,0 +1,30 @@
+#include "../src/benchmark_runner.h"
+#include "gtest/gtest.h"
+
+namespace {
+
+TEST(ParseMinTimeTest, InvalidInput) {
+#if GTEST_HAS_DEATH_TEST
+  // Tests only runnable in debug mode (when BM_CHECK is enabled).
+#ifndef NDEBUG
+#ifndef TEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS
+  ASSERT_DEATH_IF_SUPPORTED(
+      { benchmark::internal::ParseBenchMinTime("abc"); },
+      "Malformed seconds value passed to --benchmark_min_time: `abc`");
+
+  ASSERT_DEATH_IF_SUPPORTED(
+      { benchmark::internal::ParseBenchMinTime("123ms"); },
+      "Malformed seconds value passed to --benchmark_min_time: `123ms`");
+
+  ASSERT_DEATH_IF_SUPPORTED(
+      { benchmark::internal::ParseBenchMinTime("1z"); },
+      "Malformed seconds value passed to --benchmark_min_time: `1z`");
+
+  ASSERT_DEATH_IF_SUPPORTED(
+      { benchmark::internal::ParseBenchMinTime("1hs"); },
+      "Malformed seconds value passed to --benchmark_min_time: `1hs`");
+#endif
+#endif
+#endif
+}
+}  // namespace
diff --git a/third_party/google_benchmark/test/multiple_ranges_test.cc b/third_party/google_benchmark/src/test/multiple_ranges_test.cc
similarity index 93%
rename from third_party/google_benchmark/test/multiple_ranges_test.cc
rename to third_party/google_benchmark/src/test/multiple_ranges_test.cc
index b25f40e..5300a96 100644
--- a/third_party/google_benchmark/test/multiple_ranges_test.cc
+++ b/third_party/google_benchmark/src/test/multiple_ranges_test.cc
@@ -1,10 +1,10 @@
-#include "benchmark/benchmark.h"
-
 #include <cassert>
 #include <iostream>
 #include <set>
 #include <vector>
 
+#include "benchmark/benchmark.h"
+
 class MultipleRangesFixture : public ::benchmark::Fixture {
  public:
   MultipleRangesFixture()
@@ -28,7 +28,7 @@
                         {2, 7, 15},
                         {7, 6, 3}}) {}
 
-  void SetUp(const ::benchmark::State& state) {
+  void SetUp(const ::benchmark::State& state) override {
     std::vector<int64_t> ranges = {state.range(0), state.range(1),
                                    state.range(2)};
 
@@ -39,10 +39,10 @@
 
   // NOTE: This is not TearDown as we want to check after _all_ runs are
   // complete.
-  virtual ~MultipleRangesFixture() {
+  ~MultipleRangesFixture() override {
     if (actualValues != expectedValues) {
       std::cout << "EXPECTED\n";
-      for (auto v : expectedValues) {
+      for (const auto& v : expectedValues) {
         std::cout << "{";
         for (int64_t iv : v) {
           std::cout << iv << ", ";
@@ -50,7 +50,7 @@
         std::cout << "}\n";
       }
       std::cout << "ACTUAL\n";
-      for (auto v : actualValues) {
+      for (const auto& v : actualValues) {
         std::cout << "{";
         for (int64_t iv : v) {
           std::cout << iv << ", ";
diff --git a/third_party/google_benchmark/test/options_test.cc b/third_party/google_benchmark/src/test/options_test.cc
similarity index 91%
rename from third_party/google_benchmark/test/options_test.cc
rename to third_party/google_benchmark/src/test/options_test.cc
index 7bfc235..a1b209f 100644
--- a/third_party/google_benchmark/test/options_test.cc
+++ b/third_party/google_benchmark/src/test/options_test.cc
@@ -1,7 +1,8 @@
-#include "benchmark/benchmark.h"
 #include <chrono>
 #include <thread>
 
+#include "benchmark/benchmark.h"
+
 #if defined(NDEBUG)
 #undef NDEBUG
 #endif
@@ -25,12 +26,15 @@
 BENCHMARK(BM_basic_slow)->Arg(10)->Unit(benchmark::kNanosecond);
 BENCHMARK(BM_basic_slow)->Arg(100)->Unit(benchmark::kMicrosecond);
 BENCHMARK(BM_basic_slow)->Arg(1000)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_basic_slow)->Arg(1000)->Unit(benchmark::kSecond);
 BENCHMARK(BM_basic)->Range(1, 8);
 BENCHMARK(BM_basic)->RangeMultiplier(2)->Range(1, 8);
 BENCHMARK(BM_basic)->DenseRange(10, 15);
 BENCHMARK(BM_basic)->Args({42, 42});
 BENCHMARK(BM_basic)->Ranges({{64, 512}, {64, 512}});
 BENCHMARK(BM_basic)->MinTime(0.7);
+BENCHMARK(BM_basic)->MinWarmUpTime(0.8);
+BENCHMARK(BM_basic)->MinTime(0.1)->MinWarmUpTime(0.2);
 BENCHMARK(BM_basic)->UseRealTime();
 BENCHMARK(BM_basic)->ThreadRange(2, 4);
 BENCHMARK(BM_basic)->ThreadPerCpu();
@@ -63,12 +67,10 @@
 
   // Test that the requested iteration count is respected.
   assert(state.max_iterations == 42);
-  size_t actual_iterations = 0;
-  for (auto _ : state)
-    ++actual_iterations;
+  for (auto _ : state) {
+  }
   assert(state.iterations() == state.max_iterations);
   assert(state.iterations() == 42);
-
 }
 BENCHMARK(BM_explicit_iteration_count)->Iterations(42);
 
diff --git a/third_party/google_benchmark/test/output_test.h b/third_party/google_benchmark/src/test/output_test.h
similarity index 84%
rename from third_party/google_benchmark/test/output_test.h
rename to third_party/google_benchmark/src/test/output_test.h
index 9385761..c08fe1d 100644
--- a/third_party/google_benchmark/test/output_test.h
+++ b/third_party/google_benchmark/src/test/output_test.h
@@ -85,7 +85,7 @@
 struct Results;
 typedef std::function<void(Results const&)> ResultsCheckFn;
 
-size_t AddChecker(const char* bm_name_pattern, ResultsCheckFn fn);
+size_t AddChecker(const std::string& bm_name_pattern, const ResultsCheckFn& fn);
 
 // Class holding the results of a benchmark.
 // It is passed in calls to checker functions.
@@ -113,13 +113,11 @@
     return NumIterations() * GetTime(kRealTime);
   }
   // get the cpu_time duration of the benchmark in seconds
-  double DurationCPUTime() const {
-    return NumIterations() * GetTime(kCpuTime);
-  }
+  double DurationCPUTime() const { return NumIterations() * GetTime(kCpuTime); }
 
   // get the string for a result by name, or nullptr if the name
   // is not found
-  const std::string* Get(const char* entry_name) const {
+  const std::string* Get(const std::string& entry_name) const {
     auto it = values.find(entry_name);
     if (it == values.end()) return nullptr;
     return &it->second;
@@ -128,12 +126,12 @@
   // get a result by name, parsed as a specific type.
   // NOTE: for counters, use GetCounterAs instead.
   template <class T>
-  T GetAs(const char* entry_name) const;
+  T GetAs(const std::string& entry_name) const;
 
   // counters are written as doubles, so they have to be read first
   // as a double, and only then converted to the asked type.
   template <class T>
-  T GetCounterAs(const char* entry_name) const {
+  T GetCounterAs(const std::string& entry_name) const {
     double dval = GetAs<double>(entry_name);
     T tval = static_cast<T>(dval);
     return tval;
@@ -141,14 +139,14 @@
 };
 
 template <class T>
-T Results::GetAs(const char* entry_name) const {
+T Results::GetAs(const std::string& entry_name) const {
   auto* sv = Get(entry_name);
-  CHECK(sv != nullptr && !sv->empty());
+  BM_CHECK(sv != nullptr && !sv->empty());
   std::stringstream ss;
   ss << *sv;
   T out;
   ss >> out;
-  CHECK(!ss.fail());
+  BM_CHECK(!ss.fail());
   return out;
 }
 
@@ -158,8 +156,8 @@
 
 // clang-format off
 
-#define _CHECK_RESULT_VALUE(entry, getfn, var_type, var_name, relationship, value) \
-    CONCAT(CHECK_, relationship)                                        \
+#define CHECK_RESULT_VALUE_IMPL(entry, getfn, var_type, var_name, relationship, value) \
+    CONCAT(BM_CHECK_, relationship)                                        \
     (entry.getfn< var_type >(var_name), (value)) << "\n"                \
     << __FILE__ << ":" << __LINE__ << ": " << (entry).name << ":\n"     \
     << __FILE__ << ":" << __LINE__ << ": "                              \
@@ -169,8 +167,8 @@
 
 // check with tolerance. eps_factor is the tolerance window, which is
 // interpreted relative to value (eg, 0.1 means 10% of value).
-#define _CHECK_FLOAT_RESULT_VALUE(entry, getfn, var_type, var_name, relationship, value, eps_factor) \
-    CONCAT(CHECK_FLOAT_, relationship)                                  \
+#define CHECK_FLOAT_RESULT_VALUE_IMPL(entry, getfn, var_type, var_name, relationship, value, eps_factor) \
+    CONCAT(BM_CHECK_FLOAT_, relationship)                                  \
     (entry.getfn< var_type >(var_name), (value), (eps_factor) * (value)) << "\n" \
     << __FILE__ << ":" << __LINE__ << ": " << (entry).name << ":\n"     \
     << __FILE__ << ":" << __LINE__ << ": "                              \
@@ -187,16 +185,16 @@
     << "%)"
 
 #define CHECK_RESULT_VALUE(entry, var_type, var_name, relationship, value) \
-    _CHECK_RESULT_VALUE(entry, GetAs, var_type, var_name, relationship, value)
+    CHECK_RESULT_VALUE_IMPL(entry, GetAs, var_type, var_name, relationship, value)
 
 #define CHECK_COUNTER_VALUE(entry, var_type, var_name, relationship, value) \
-    _CHECK_RESULT_VALUE(entry, GetCounterAs, var_type, var_name, relationship, value)
+    CHECK_RESULT_VALUE_IMPL(entry, GetCounterAs, var_type, var_name, relationship, value)
 
 #define CHECK_FLOAT_RESULT_VALUE(entry, var_name, relationship, value, eps_factor) \
-    _CHECK_FLOAT_RESULT_VALUE(entry, GetAs, double, var_name, relationship, value, eps_factor)
+    CHECK_FLOAT_RESULT_VALUE_IMPL(entry, GetAs, double, var_name, relationship, value, eps_factor)
 
 #define CHECK_FLOAT_COUNTER_VALUE(entry, var_name, relationship, value, eps_factor) \
-    _CHECK_FLOAT_RESULT_VALUE(entry, GetCounterAs, double, var_name, relationship, value, eps_factor)
+    CHECK_FLOAT_RESULT_VALUE_IMPL(entry, GetCounterAs, double, var_name, relationship, value, eps_factor)
 
 // clang-format on
 
diff --git a/third_party/google_benchmark/test/output_test_helper.cc b/third_party/google_benchmark/src/test/output_test_helper.cc
similarity index 81%
rename from third_party/google_benchmark/test/output_test_helper.cc
rename to third_party/google_benchmark/src/test/output_test_helper.cc
index bdb34c8..241af5c 100644
--- a/third_party/google_benchmark/test/output_test_helper.cc
+++ b/third_party/google_benchmark/src/test/output_test_helper.cc
@@ -10,6 +10,7 @@
 
 #include "../src/benchmark_api_internal.h"
 #include "../src/check.h"  // NOTE: check.h is for internal use only!
+#include "../src/log.h"    // NOTE: log.h is for internal use only
 #include "../src/re.h"     // NOTE: re.h is for internal use only
 #include "output_test.h"
 
@@ -40,14 +41,20 @@
   // clang-format off
   static std::string safe_dec_re = "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?";
   static std::string time_re = "([0-9]+[.])?[0-9]+";
+  static std::string percentage_re = "[0-9]+[.][0-9]{2}";
   static SubMap map = {
       {"%float", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?"},
       // human-readable float
       {"%hrfloat", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?[kMGTPEZYmunpfazy]?"},
+      {"%percentage", percentage_re},
       {"%int", "[ ]*[0-9]+"},
       {" %s ", "[ ]+"},
       {"%time", "[ ]*" + time_re + "[ ]+ns"},
       {"%console_report", "[ ]*" + time_re + "[ ]+ns [ ]*" + time_re + "[ ]+ns [ ]*[0-9]+"},
+      {"%console_percentage_report", "[ ]*" + percentage_re + "[ ]+% [ ]*" + percentage_re + "[ ]+% [ ]*[0-9]+"},
+      {"%console_us_report", "[ ]*" + time_re + "[ ]+us [ ]*" + time_re + "[ ]+us [ ]*[0-9]+"},
+      {"%console_ms_report", "[ ]*" + time_re + "[ ]+ms [ ]*" + time_re + "[ ]+ms [ ]*[0-9]+"},
+      {"%console_s_report", "[ ]*" + time_re + "[ ]+s [ ]*" + time_re + "[ ]+s [ ]*[0-9]+"},
       {"%console_time_only_report", "[ ]*" + time_re + "[ ]+ns [ ]*" + time_re + "[ ]+ns"},
       {"%console_us_report", "[ ]*" + time_re + "[ ]+us [ ]*" + time_re + "[ ]+us [ ]*[0-9]+"},
       {"%console_us_time_only_report", "[ ]*" + time_re + "[ ]+us [ ]*" + time_re + "[ ]+us"},
@@ -56,6 +63,8 @@
        "items_per_second,label,error_occurred,error_message"},
       {"%csv_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,,,,,"},
       {"%csv_us_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",us,,,,,"},
+      {"%csv_ms_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ms,,,,,"},
+      {"%csv_s_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",s,,,,,"},
       {"%csv_bytes_report",
        "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns," + safe_dec_re + ",,,,"},
       {"%csv_items_report",
@@ -89,27 +98,27 @@
   bool on_first = true;
   std::string line;
   while (remaining_output.eof() == false) {
-    CHECK(remaining_output.good());
+    BM_CHECK(remaining_output.good());
     std::getline(remaining_output, line);
     if (on_first) {
       first_line = line;
       on_first = false;
     }
     for (const auto& NC : not_checks) {
-      CHECK(!NC.regex->Match(line))
+      BM_CHECK(!NC.regex->Match(line))
           << "Unexpected match for line \"" << line << "\" for MR_Not regex \""
           << NC.regex_str << "\""
           << "\n    actual regex string \"" << TC.substituted_regex << "\""
           << "\n    started matching near: " << first_line;
     }
     if (TC.regex->Match(line)) return;
-    CHECK(TC.match_rule != MR_Next)
+    BM_CHECK(TC.match_rule != MR_Next)
         << "Expected line \"" << line << "\" to match regex \"" << TC.regex_str
         << "\""
         << "\n    actual regex string \"" << TC.substituted_regex << "\""
         << "\n    started matching near: " << first_line;
   }
-  CHECK(remaining_output.eof() == false)
+  BM_CHECK(remaining_output.eof() == false)
       << "End of output reached before match for regex \"" << TC.regex_str
       << "\" was found"
       << "\n    actual regex string \"" << TC.substituted_regex << "\""
@@ -132,14 +141,14 @@
 class TestReporter : public benchmark::BenchmarkReporter {
  public:
   TestReporter(std::vector<benchmark::BenchmarkReporter*> reps)
-      : reporters_(reps) {}
+      : reporters_(std::move(reps)) {}
 
-  virtual bool ReportContext(const Context& context) {
+  bool ReportContext(const Context& context) override {
     bool last_ret = false;
     bool first = true;
     for (auto rep : reporters_) {
       bool new_ret = rep->ReportContext(context);
-      CHECK(first || new_ret == last_ret)
+      BM_CHECK(first || new_ret == last_ret)
           << "Reports return different values for ReportContext";
       first = false;
       last_ret = new_ret;
@@ -148,10 +157,10 @@
     return last_ret;
   }
 
-  void ReportRuns(const std::vector<Run>& report) {
+  void ReportRuns(const std::vector<Run>& report) override {
     for (auto rep : reporters_) rep->ReportRuns(report);
   }
-  void Finalize() {
+  void Finalize() override {
     for (auto rep : reporters_) rep->Finalize();
   }
 
@@ -174,7 +183,7 @@
  public:
   struct PatternAndFn : public TestCase {  // reusing TestCase for its regexes
     PatternAndFn(const std::string& rx, ResultsCheckFn fn_)
-        : TestCase(rx), fn(fn_) {}
+        : TestCase(rx), fn(std::move(fn_)) {}
     ResultsCheckFn fn;
   };
 
@@ -182,7 +191,7 @@
   std::vector<Results> results;
   std::vector<std::string> field_names;
 
-  void Add(const std::string& entry_pattern, ResultsCheckFn fn);
+  void Add(const std::string& entry_pattern, const ResultsCheckFn& fn);
 
   void CheckResults(std::stringstream& output);
 
@@ -201,7 +210,8 @@
 }
 
 // add a results checker for a benchmark
-void ResultsChecker::Add(const std::string& entry_pattern, ResultsCheckFn fn) {
+void ResultsChecker::Add(const std::string& entry_pattern,
+                         const ResultsCheckFn& fn) {
   check_patterns.emplace_back(entry_pattern, fn);
 }
 
@@ -221,7 +231,7 @@
   std::string line;
   bool on_first = true;
   while (output.eof() == false) {
-    CHECK(output.good());
+    BM_CHECK(output.good());
     std::getline(output, line);
     if (on_first) {
       SetHeader_(line);  // this is important
@@ -232,18 +242,17 @@
   }
   // finally we can call the subscribed check functions
   for (const auto& p : check_patterns) {
-    VLOG(2) << "--------------------------------\n";
-    VLOG(2) << "checking for benchmarks matching " << p.regex_str << "...\n";
+    BM_VLOG(2) << "--------------------------------\n";
+    BM_VLOG(2) << "checking for benchmarks matching " << p.regex_str << "...\n";
     for (const auto& r : results) {
       if (!p.regex->Match(r.name)) {
-        VLOG(2) << p.regex_str << " is not matched by " << r.name << "\n";
+        BM_VLOG(2) << p.regex_str << " is not matched by " << r.name << "\n";
         continue;
-      } else {
-        VLOG(2) << p.regex_str << " is matched by " << r.name << "\n";
       }
-      VLOG(1) << "Checking results of " << r.name << ": ... \n";
+      BM_VLOG(2) << p.regex_str << " is matched by " << r.name << "\n";
+      BM_VLOG(1) << "Checking results of " << r.name << ": ... \n";
       p.fn(r);
-      VLOG(1) << "Checking results of " << r.name << ": OK.\n";
+      BM_VLOG(1) << "Checking results of " << r.name << ": OK.\n";
     }
   }
 }
@@ -256,9 +265,9 @@
 // set the values for a benchmark
 void ResultsChecker::SetValues_(const std::string& entry_csv_line) {
   if (entry_csv_line.empty()) return;  // some lines are empty
-  CHECK(!field_names.empty());
+  BM_CHECK(!field_names.empty());
   auto vals = SplitCsv_(entry_csv_line);
-  CHECK_EQ(vals.size(), field_names.size());
+  BM_CHECK_EQ(vals.size(), field_names.size());
   results.emplace_back(vals[0]);  // vals[0] is the benchmark name
   auto& entry = results.back();
   for (size_t i = 1, e = vals.size(); i < e; ++i) {
@@ -273,7 +282,7 @@
   if (!field_names.empty()) out.reserve(field_names.size());
   size_t prev = 0, pos = line.find_first_of(','), curr = pos;
   while (pos != line.npos) {
-    CHECK(curr > 0);
+    BM_CHECK(curr > 0);
     if (line[prev] == '"') ++prev;
     if (line[curr - 1] == '"') --curr;
     out.push_back(line.substr(prev, curr - prev));
@@ -290,7 +299,7 @@
 
 }  // end namespace internal
 
-size_t AddChecker(const char* bm_name, ResultsCheckFn fn) {
+size_t AddChecker(const std::string& bm_name, const ResultsCheckFn& fn) {
   auto& rc = internal::GetResultsChecker();
   rc.Add(bm_name, fn);
   return rc.results.size();
@@ -304,32 +313,32 @@
   ss << name.substr(pos + 9, end);
   int num = 1;
   ss >> num;
-  CHECK(!ss.fail());
+  BM_CHECK(!ss.fail());
   return num;
 }
 
-double Results::NumIterations() const {
-  return GetAs<double>("iterations");
-}
+double Results::NumIterations() const { return GetAs<double>("iterations"); }
 
 double Results::GetTime(BenchmarkTime which) const {
-  CHECK(which == kCpuTime || which == kRealTime);
+  BM_CHECK(which == kCpuTime || which == kRealTime);
   const char* which_str = which == kCpuTime ? "cpu_time" : "real_time";
   double val = GetAs<double>(which_str);
   auto unit = Get("time_unit");
-  CHECK(unit);
+  BM_CHECK(unit);
   if (*unit == "ns") {
     return val * 1.e-9;
-  } else if (*unit == "us") {
-    return val * 1.e-6;
-  } else if (*unit == "ms") {
-    return val * 1.e-3;
-  } else if (*unit == "s") {
-    return val;
-  } else {
-    CHECK(1 == 0) << "unknown time unit: " << *unit;
-    return 0;
   }
+  if (*unit == "us") {
+    return val * 1.e-6;
+  }
+  if (*unit == "ms") {
+    return val * 1.e-3;
+  }
+  if (*unit == "s") {
+    return val;
+  }
+  BM_CHECK(1 == 0) << "unknown time unit: " << *unit;
+  return 0;
 }
 
 // ========================================================================= //
@@ -343,10 +352,10 @@
       regex(std::make_shared<benchmark::Regex>()) {
   std::string err_str;
   regex->Init(substituted_regex, &err_str);
-  CHECK(err_str.empty()) << "Could not construct regex \"" << substituted_regex
-                         << "\""
-                         << "\n    originally \"" << regex_str << "\""
-                         << "\n    got error: " << err_str;
+  BM_CHECK(err_str.empty())
+      << "Could not construct regex \"" << substituted_regex << "\""
+      << "\n    originally \"" << regex_str << "\""
+      << "\n    got error: " << err_str;
 }
 
 int AddCases(TestCaseID ID, std::initializer_list<TestCase> il) {
@@ -374,11 +383,9 @@
 }
 
 // Disable deprecated warnings temporarily because we need to reference
-// CSVReporter but don't want to trigger -Werror=-Wdeprecated
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated"
-#endif
+// CSVReporter but don't want to trigger -Werror=-Wdeprecated-declarations
+BENCHMARK_DISABLE_DEPRECATED_WARNING
+
 void RunOutputTests(int argc, char* argv[]) {
   using internal::GetTestCaseList;
   benchmark::Initialize(&argc, argv);
@@ -387,14 +394,14 @@
   benchmark::JSONReporter JR;
   benchmark::CSVReporter CSVR;
   struct ReporterTest {
-    const char* name;
+    std::string name;
     std::vector<TestCase>& output_cases;
     std::vector<TestCase>& error_cases;
     benchmark::BenchmarkReporter& reporter;
     std::stringstream out_stream;
     std::stringstream err_stream;
 
-    ReporterTest(const char* n, std::vector<TestCase>& out_tc,
+    ReporterTest(const std::string& n, std::vector<TestCase>& out_tc,
                  std::vector<TestCase>& err_tc,
                  benchmark::BenchmarkReporter& br)
         : name(n), output_cases(out_tc), error_cases(err_tc), reporter(br) {
@@ -402,12 +409,12 @@
       reporter.SetErrorStream(&err_stream);
     }
   } TestCases[] = {
-      {"ConsoleReporter", GetTestCaseList(TC_ConsoleOut),
+      {std::string("ConsoleReporter"), GetTestCaseList(TC_ConsoleOut),
        GetTestCaseList(TC_ConsoleErr), CR},
-      {"JSONReporter", GetTestCaseList(TC_JSONOut), GetTestCaseList(TC_JSONErr),
-       JR},
-      {"CSVReporter", GetTestCaseList(TC_CSVOut), GetTestCaseList(TC_CSVErr),
-       CSVR},
+      {std::string("JSONReporter"), GetTestCaseList(TC_JSONOut),
+       GetTestCaseList(TC_JSONErr), JR},
+      {std::string("CSVReporter"), GetTestCaseList(TC_CSVOut),
+       GetTestCaseList(TC_CSVErr), CSVR},
   };
 
   // Create the test reporter and run the benchmarks.
@@ -416,7 +423,8 @@
   benchmark::RunSpecifiedBenchmarks(&test_rep);
 
   for (auto& rep_test : TestCases) {
-    std::string msg = std::string("\nTesting ") + rep_test.name + " Output\n";
+    std::string msg =
+        std::string("\nTesting ") + rep_test.name + std::string(" Output\n");
     std::string banner(msg.size() - 1, '-');
     std::cout << banner << msg << banner << "\n";
 
@@ -433,13 +441,11 @@
   // the checks to subscribees.
   auto& csv = TestCases[2];
   // would use == but gcc spits a warning
-  CHECK(std::strcmp(csv.name, "CSVReporter") == 0);
+  BM_CHECK(csv.name == std::string("CSVReporter"));
   internal::GetResultsChecker().CheckResults(csv.out_stream);
 }
 
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
+BENCHMARK_RESTORE_DEPRECATED_WARNING
 
 int SubstrCnt(const std::string& haystack, const std::string& pat) {
   if (pat.length() == 0) return 0;
@@ -463,9 +469,8 @@
 
 static std::string GetRandomFileName() {
   std::string model = "test.%%%%%%";
-  for (auto & ch :  model) {
-    if (ch == '%')
-      ch = RandomHexChar();
+  for (auto& ch : model) {
+    if (ch == '%') ch = RandomHexChar();
   }
   return model;
 }
@@ -482,8 +487,7 @@
   int retries = 3;
   while (--retries) {
     std::string name = GetRandomFileName();
-    if (!FileExists(name))
-      return name;
+    if (!FileExists(name)) return name;
   }
   std::cerr << "Failed to create unique temporary file name" << std::endl;
   std::abort();
diff --git a/third_party/google_benchmark/src/test/perf_counters_gtest.cc b/third_party/google_benchmark/src/test/perf_counters_gtest.cc
new file mode 100644
index 0000000..e73ebc5
--- /dev/null
+++ b/third_party/google_benchmark/src/test/perf_counters_gtest.cc
@@ -0,0 +1,316 @@
+#include <random>
+#include <thread>
+
+#include "../src/perf_counters.h"
+#include "gtest/gtest.h"
+
+#ifndef GTEST_SKIP
+struct MsgHandler {
+  void operator=(std::ostream&) {}
+};
+#define GTEST_SKIP() return MsgHandler() = std::cout
+#endif
+
+using benchmark::internal::PerfCounters;
+using benchmark::internal::PerfCountersMeasurement;
+using benchmark::internal::PerfCounterValues;
+
+namespace {
+const char kGenericPerfEvent1[] = "CYCLES";
+const char kGenericPerfEvent2[] = "BRANCHES";
+const char kGenericPerfEvent3[] = "INSTRUCTIONS";
+
+TEST(PerfCountersTest, Init) {
+  EXPECT_EQ(PerfCounters::Initialize(), PerfCounters::kSupported);
+}
+
+TEST(PerfCountersTest, OneCounter) {
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Performance counters not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  EXPECT_EQ(PerfCounters::Create({kGenericPerfEvent1}).num_counters(), 1);
+}
+
+TEST(PerfCountersTest, NegativeTest) {
+  if (!PerfCounters::kSupported) {
+    EXPECT_FALSE(PerfCounters::Initialize());
+    return;
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  // Sanity checks:
+  // Create() always returns a valid object, even when passed no arguments
+  // or wrong ones, because the new behavior is to warn about unsupported
+  // counters and drop them.
+  EXPECT_EQ(PerfCounters::Create({}).num_counters(), 0);
+  EXPECT_EQ(PerfCounters::Create({""}).num_counters(), 0);
+  EXPECT_EQ(PerfCounters::Create({"not a counter name"}).num_counters(), 0);
+  {
+    // Try sneaking in a bad egg to see if it is filtered out. The
+    // number of counters has to be two, not zero
+    auto counter =
+        PerfCounters::Create({kGenericPerfEvent2, "", kGenericPerfEvent1});
+    EXPECT_EQ(counter.num_counters(), 2);
+    EXPECT_EQ(counter.names(), std::vector<std::string>(
+                                   {kGenericPerfEvent2, kGenericPerfEvent1}));
+  }
+  {
+    // Try sneaking in an outrageous counter, like a fat finger mistake
+    auto counter = PerfCounters::Create(
+        {kGenericPerfEvent3, "not a counter name", kGenericPerfEvent1});
+    EXPECT_EQ(counter.num_counters(), 2);
+    EXPECT_EQ(counter.names(), std::vector<std::string>(
+                                   {kGenericPerfEvent3, kGenericPerfEvent1}));
+  }
+  {
+    // Finally try a golden input - all three names should be accepted
+    EXPECT_EQ(PerfCounters::Create(
+                  {kGenericPerfEvent1, kGenericPerfEvent2, kGenericPerfEvent3})
+                  .num_counters(),
+              3);
+  }
+  {
+    // Add a bad apple at the end of the chain to check the edges
+    auto counter = PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
+                                         kGenericPerfEvent3,
+                                         "MISPREDICTED_BRANCH_RETIRED"});
+    EXPECT_EQ(counter.num_counters(), 3);
+    EXPECT_EQ(counter.names(),
+              std::vector<std::string>({kGenericPerfEvent1, kGenericPerfEvent2,
+                                        kGenericPerfEvent3}));
+  }
+}
+
+TEST(PerfCountersTest, Read1Counter) {
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  auto counters = PerfCounters::Create({kGenericPerfEvent1});
+  EXPECT_EQ(counters.num_counters(), 1);
+  PerfCounterValues values1(1);
+  EXPECT_TRUE(counters.Snapshot(&values1));
+  EXPECT_GT(values1[0], 0);
+  PerfCounterValues values2(1);
+  EXPECT_TRUE(counters.Snapshot(&values2));
+  EXPECT_GT(values2[0], 0);
+  EXPECT_GT(values2[0], values1[0]);
+}
+
+TEST(PerfCountersTest, Read2Counters) {
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  auto counters =
+      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
+  EXPECT_EQ(counters.num_counters(), 2);
+  PerfCounterValues values1(2);
+  EXPECT_TRUE(counters.Snapshot(&values1));
+  EXPECT_GT(values1[0], 0);
+  EXPECT_GT(values1[1], 0);
+  PerfCounterValues values2(2);
+  EXPECT_TRUE(counters.Snapshot(&values2));
+  EXPECT_GT(values2[0], 0);
+  EXPECT_GT(values2[1], 0);
+}
+
+TEST(PerfCountersTest, ReopenExistingCounters) {
+  // This test works in recent and old Intel hardware
+  // However we cannot make assumptions beyond 3 HW counters
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  std::vector<std::string> kMetrics({kGenericPerfEvent1});
+  std::vector<PerfCounters> counters(3);
+  for (auto& counter : counters) {
+    counter = PerfCounters::Create(kMetrics);
+  }
+  PerfCounterValues values(1);
+  EXPECT_TRUE(counters[0].Snapshot(&values));
+  EXPECT_TRUE(counters[1].Snapshot(&values));
+  EXPECT_TRUE(counters[2].Snapshot(&values));
+}
+
+TEST(PerfCountersTest, CreateExistingMeasurements) {
+  // The test works (i.e. causes read to fail) for the assumptions
+  // about hardware capabilities (i.e. small number (3) hardware
+  // counters) at this date,
+  // the same as previous test ReopenExistingCounters.
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+
+  // This means we will try 10 counters but we can only guarantee
+  // for sure at this time that only 3 will work. Perhaps in the future
+  // we could use libpfm to query for the hardware limits on this
+  // particular platform.
+  const int kMaxCounters = 10;
+  const int kMinValidCounters = 3;
+
+  // Let's use a ubiquitous counter that is guaranteed to work
+  // on all platforms
+  const std::vector<std::string> kMetrics{"cycles"};
+
+  // Cannot create a vector of actual objects because the
+  // copy constructor of PerfCounters is deleted - and so is
+  // implicitly deleted on PerfCountersMeasurement too
+  std::vector<std::unique_ptr<PerfCountersMeasurement>>
+      perf_counter_measurements;
+
+  perf_counter_measurements.reserve(kMaxCounters);
+  for (int j = 0; j < kMaxCounters; ++j) {
+    perf_counter_measurements.emplace_back(
+        new PerfCountersMeasurement(kMetrics));
+  }
+
+  std::vector<std::pair<std::string, double>> measurements;
+
+  // Start all counters together to see if they hold
+  int max_counters = kMaxCounters;
+  for (int i = 0; i < kMaxCounters; ++i) {
+    auto& counter(*perf_counter_measurements[i]);
+    EXPECT_EQ(counter.num_counters(), 1);
+    if (!counter.Start()) {
+      max_counters = i;
+      break;
+    };
+  }
+
+  ASSERT_GE(max_counters, kMinValidCounters);
+
+  // Stop all together
+  for (int i = 0; i < max_counters; ++i) {
+    auto& counter(*perf_counter_measurements[i]);
+    EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters));
+  }
+
+  // Start/stop individually
+  for (int i = 0; i < max_counters; ++i) {
+    auto& counter(*perf_counter_measurements[i]);
+    measurements.clear();
+    counter.Start();
+    EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters));
+  }
+}
+
+// We try to do some meaningful work here, but the compiler
+// insists on optimizing away our loop, so we had to add a
+// no-optimize macro. In case that fails, we added some entropy
+// to this pool as well.
+
+BENCHMARK_DONT_OPTIMIZE size_t do_work() {
+  static std::mt19937 rd{std::random_device{}()};
+  static std::uniform_int_distribution<size_t> mrand(0, 10);
+  const size_t kNumLoops = 1000000;
+  size_t sum = 0;
+  for (size_t j = 0; j < kNumLoops; ++j) {
+    sum += mrand(rd);
+  }
+  benchmark::DoNotOptimize(sum);
+  return sum;
+}
+
+void measure(size_t threadcount, PerfCounterValues* before,
+             PerfCounterValues* after) {
+  BM_CHECK_NE(before, nullptr);
+  BM_CHECK_NE(after, nullptr);
+  std::vector<std::thread> threads(threadcount);
+  auto work = [&]() { BM_CHECK(do_work() > 1000); };
+
+  // We need to first set up the counters, then start the threads, so the
+  // threads would inherit the counters. But later, we need to first destroy
+  // the thread pool (so all the work finishes), then measure the counters. So
+  // the scopes overlap, and we need to explicitly control the scope of the
+  // threadpool.
+  auto counters =
+      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent3});
+  for (auto& t : threads) t = std::thread(work);
+  counters.Snapshot(before);
+  for (auto& t : threads) t.join();
+  counters.Snapshot(after);
+}
+
+TEST(PerfCountersTest, MultiThreaded) {
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  PerfCounterValues before(2);
+  PerfCounterValues after(2);
+
+  // Notice that this test will work even if we taskset it to a single CPU
+  // In this case the threads will run sequentially
+  // Start two threads and measure the number of combined cycles and
+  // instructions
+  measure(2, &before, &after);
+  std::vector<double> Elapsed2Threads{
+      static_cast<double>(after[0] - before[0]),
+      static_cast<double>(after[1] - before[1])};
+
+  // Start four threads and measure the number of combined cycles and
+  // instructions
+  measure(4, &before, &after);
+  std::vector<double> Elapsed4Threads{
+      static_cast<double>(after[0] - before[0]),
+      static_cast<double>(after[1] - before[1])};
+
+  // Some extra work will happen on the main thread - like joining the threads
+  // - so the ratio won't be quite 2.0, but very close.
+  EXPECT_GE(Elapsed4Threads[0], 1.9 * Elapsed2Threads[0]);
+  EXPECT_GE(Elapsed4Threads[1], 1.9 * Elapsed2Threads[1]);
+}
+
+TEST(PerfCountersTest, HardwareLimits) {
+  // The test works (i.e. causes read to fail) for the assumptions
+  // about hardware capabilities (i.e. small number (3-4) hardware
+  // counters) at this date,
+  // the same as previous test ReopenExistingCounters.
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+
+  // Taken straight from `perf list` on x86-64
+  // Got all hardware names since these are the problematic ones
+  std::vector<std::string> counter_names{"cycles",  // leader
+                                         "instructions",
+                                         "branches",
+                                         "L1-dcache-loads",
+                                         "L1-dcache-load-misses",
+                                         "L1-dcache-prefetches",
+                                         "L1-icache-load-misses",  // leader
+                                         "L1-icache-loads",
+                                         "branch-load-misses",
+                                         "branch-loads",
+                                         "dTLB-load-misses",
+                                         "dTLB-loads",
+                                         "iTLB-load-misses",  // leader
+                                         "iTLB-loads",
+                                         "branch-instructions",
+                                         "branch-misses",
+                                         "cache-misses",
+                                         "cache-references",
+                                         "stalled-cycles-backend",  // leader
+                                         "stalled-cycles-frontend"};
+
+  // In the off-chance that some of these values are not supported,
+  // we filter them out so the test will complete without failure
+  // albeit it might not actually test the grouping on that platform
+  std::vector<std::string> valid_names;
+  for (const std::string& name : counter_names) {
+    if (PerfCounters::IsCounterSupported(name)) {
+      valid_names.push_back(name);
+    }
+  }
+  PerfCountersMeasurement counter(valid_names);
+
+  std::vector<std::pair<std::string, double>> measurements;
+
+  counter.Start();
+  EXPECT_TRUE(counter.Stop(measurements));
+}
+
+}  // namespace
diff --git a/third_party/google_benchmark/src/test/perf_counters_test.cc b/third_party/google_benchmark/src/test/perf_counters_test.cc
new file mode 100644
index 0000000..f0e9a17
--- /dev/null
+++ b/third_party/google_benchmark/src/test/perf_counters_test.cc
@@ -0,0 +1,28 @@
+#undef NDEBUG
+
+#include "../src/perf_counters.h"
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+static void BM_Simple(benchmark::State& state) {
+  for (auto _ : state) {
+    auto iterations = state.iterations();
+    benchmark::DoNotOptimize(iterations);
+  }
+}
+BENCHMARK(BM_Simple);
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Simple\",$"}});
+
+static void CheckSimple(Results const& e) {
+  CHECK_COUNTER_VALUE(e, double, "CYCLES", GT, 0);
+  CHECK_COUNTER_VALUE(e, double, "BRANCHES", GT, 0.0);
+}
+CHECK_BENCHMARK_RESULTS("BM_Simple", &CheckSimple);
+
+int main(int argc, char* argv[]) {
+  if (!benchmark::internal::PerfCounters::kSupported) {
+    return 0;
+  }
+  RunOutputTests(argc, argv);
+}
diff --git a/third_party/google_benchmark/test/register_benchmark_test.cc b/third_party/google_benchmark/src/test/register_benchmark_test.cc
similarity index 82%
rename from third_party/google_benchmark/test/register_benchmark_test.cc
rename to third_party/google_benchmark/src/test/register_benchmark_test.cc
index 3ac5b21..d69d144 100644
--- a/third_party/google_benchmark/test/register_benchmark_test.cc
+++ b/third_party/google_benchmark/src/test/register_benchmark_test.cc
@@ -10,7 +10,7 @@
 
 class TestReporter : public benchmark::ConsoleReporter {
  public:
-  virtual void ReportRuns(const std::vector<Run>& report) {
+  void ReportRuns(const std::vector<Run>& report) override {
     all_runs_.insert(all_runs_.end(), begin(report), end(report));
     ConsoleReporter::ReportRuns(report);
   }
@@ -19,24 +19,24 @@
 };
 
 struct TestCase {
-  std::string name;
-  const char* label;
+  const std::string name;
+  const std::string label;
   // Note: not explicit as we rely on it being converted through ADD_CASES.
-  TestCase(const char* xname) : TestCase(xname, nullptr) {}
-  TestCase(const char* xname, const char* xlabel)
+  TestCase(const std::string& xname) : TestCase(xname, "") {}
+  TestCase(const std::string& xname, const std::string& xlabel)
       : name(xname), label(xlabel) {}
 
   typedef benchmark::BenchmarkReporter::Run Run;
 
   void CheckRun(Run const& run) const {
     // clang-format off
-    CHECK(name == run.benchmark_name()) << "expected " << name << " got "
+    BM_CHECK(name == run.benchmark_name()) << "expected " << name << " got "
                                       << run.benchmark_name();
-    if (label) {
-      CHECK(run.report_label == label) << "expected " << label << " got "
+    if (!label.empty()) {
+      BM_CHECK(run.report_label == label) << "expected " << label << " got "
                                        << run.report_label;
     } else {
-      CHECK(run.report_label == "");
+      BM_CHECK(run.report_label.empty());
     }
     // clang-format on
   }
@@ -45,7 +45,7 @@
 std::vector<TestCase> ExpectedResults;
 
 int AddCases(std::initializer_list<TestCase> const& v) {
-  for (auto N : v) {
+  for (const auto& N : v) {
     ExpectedResults.push_back(N);
   }
   return 0;
@@ -96,6 +96,18 @@
 #endif  // BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
 
 //----------------------------------------------------------------------------//
+// Test RegisterBenchmark with DISABLED_ benchmark
+//----------------------------------------------------------------------------//
+void DISABLED_BM_function(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(DISABLED_BM_function);
+ReturnVal dummy3 = benchmark::RegisterBenchmark("DISABLED_BM_function_manual",
+                                                DISABLED_BM_function);
+// No need to add cases because we don't expect them to run.
+
+//----------------------------------------------------------------------------//
 // Test RegisterBenchmark with different callable types
 //----------------------------------------------------------------------------//
 
@@ -111,7 +123,7 @@
   {
     CustomFixture fx;
     benchmark::RegisterBenchmark("custom_fixture", fx);
-    AddCases({"custom_fixture"});
+    AddCases({std::string("custom_fixture")});
   }
 #endif
 #ifndef BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
diff --git a/third_party/google_benchmark/src/test/repetitions_test.cc b/third_party/google_benchmark/src/test/repetitions_test.cc
new file mode 100644
index 0000000..569777d
--- /dev/null
+++ b/third_party/google_benchmark/src/test/repetitions_test.cc
@@ -0,0 +1,214 @@
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+// ========================================================================= //
+// ------------------------ Testing Explicit Repetitions ------------------- //
+// ========================================================================= //
+
+static void BM_ExplicitRepetitions(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_ExplicitRepetitions)->Repetitions(2);
+
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_ExplicitRepetitions/repeats:2 %console_report$"}});
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_ExplicitRepetitions/repeats:2 %console_report$"}});
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_ExplicitRepetitions/repeats:2_mean %console_report$"}});
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_ExplicitRepetitions/repeats:2_median %console_report$"}});
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_ExplicitRepetitions/repeats:2_stddev %console_report$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_ExplicitRepetitions/repeats:2\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_ExplicitRepetitions/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_ExplicitRepetitions/repeats:2\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_ExplicitRepetitions/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 1,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_ExplicitRepetitions/repeats:2_mean\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_ExplicitRepetitions/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_ExplicitRepetitions/repeats:2_median\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_ExplicitRepetitions/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_ExplicitRepetitions/repeats:2_stddev\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_ExplicitRepetitions/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ExplicitRepetitions/repeats:2\",%csv_report$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ExplicitRepetitions/repeats:2\",%csv_report$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_ExplicitRepetitions/repeats:2_mean\",%csv_report$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_ExplicitRepetitions/repeats:2_median\",%csv_report$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_ExplicitRepetitions/repeats:2_stddev\",%csv_report$"}});
+
+// ========================================================================= //
+// ------------------------ Testing Implicit Repetitions ------------------- //
+// ========================================================================= //
+
+static void BM_ImplicitRepetitions(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_ImplicitRepetitions);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions %console_report$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions %console_report$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions %console_report$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions_mean %console_report$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions_median %console_report$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_ImplicitRepetitions_stddev %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions\",$"},
+                       {"\"family_index\": 1,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions\",$"},
+                       {"\"family_index\": 1,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"repetition_index\": 1,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions\",$"},
+                       {"\"family_index\": 1,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"repetition_index\": 2,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions_mean\",$"},
+                       {"\"family_index\": 1,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"mean\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions_median\",$"},
+                       {"\"family_index\": 1,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"median\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_ImplicitRepetitions_stddev\",$"},
+                       {"\"family_index\": 1,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_ImplicitRepetitions\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"repetitions\": 3,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"aggregate_name\": \"stddev\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions\",%csv_report$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions\",%csv_report$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions_mean\",%csv_report$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions_median\",%csv_report$"}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_ImplicitRepetitions_stddev\",%csv_report$"}});
+
+// ========================================================================= //
+// --------------------------- TEST CASES END ------------------------------ //
+// ========================================================================= //
+
+int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
diff --git a/third_party/google_benchmark/test/report_aggregates_only_test.cc b/third_party/google_benchmark/src/test/report_aggregates_only_test.cc
similarity index 82%
rename from third_party/google_benchmark/test/report_aggregates_only_test.cc
rename to third_party/google_benchmark/src/test/report_aggregates_only_test.cc
index 9646b9b..47da503 100644
--- a/third_party/google_benchmark/test/report_aggregates_only_test.cc
+++ b/third_party/google_benchmark/src/test/report_aggregates_only_test.cc
@@ -19,17 +19,19 @@
 int main(int argc, char* argv[]) {
   const std::string output = GetFileReporterOutput(argc, argv);
 
-  if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 3 ||
+  if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 4 ||
       SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_mean\"") != 1 ||
       SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_median\"") !=
           1 ||
       SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"") !=
-          1) {
-    std::cout << "Precondition mismatch. Expected to only find three "
+          1 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_cv\"") != 1) {
+    std::cout << "Precondition mismatch. Expected to only find four "
                  "occurrences of \"BM_SummaryRepeat/repeats:3\" substring:\n"
                  "\"name\": \"BM_SummaryRepeat/repeats:3_mean\", "
                  "\"name\": \"BM_SummaryRepeat/repeats:3_median\", "
-                 "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"\nThe entire "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_cv\"\nThe entire "
                  "output:\n";
     std::cout << output;
     return 1;
diff --git a/third_party/google_benchmark/test/reporter_output_test.cc b/third_party/google_benchmark/src/test/reporter_output_test.cc
similarity index 64%
rename from third_party/google_benchmark/test/reporter_output_test.cc
rename to third_party/google_benchmark/src/test/reporter_output_test.cc
index 1a96b5f..2eb545a 100644
--- a/third_party/google_benchmark/test/reporter_output_test.cc
+++ b/third_party/google_benchmark/src/test/reporter_output_test.cc
@@ -1,5 +1,6 @@
 
 #undef NDEBUG
+#include <numeric>
 #include <utility>
 
 #include "benchmark/benchmark.h"
@@ -15,8 +16,8 @@
 static int AddContextCases() {
   AddCases(TC_ConsoleErr,
            {
-               {"%int[-/]%int[-/]%int %int:%int:%int$", MR_Default},
-               {"Running .*/reporter_output_test(\\.exe)?$", MR_Next},
+               {"^%int-%int-%intT%int:%int:%int[-+]%int:%int$", MR_Default},
+               {"Running .*(/|\\\\)reporter_output_test(\\.exe)?$", MR_Next},
                {"Run on \\(%int X %float MHz CPU s?\\)", MR_Next},
            });
   AddCases(TC_JSONOut,
@@ -28,8 +29,7 @@
              MR_Next},
             {"\"num_cpus\": %int,$", MR_Next},
             {"\"mhz_per_cpu\": %float,$", MR_Next},
-            {"\"cpu_scaling_enabled\": ", MR_Next},
-            {"\"caches\": \\[$", MR_Next}});
+            {"\"caches\": \\[$", MR_Default}});
   auto const& Info = benchmark::CPUInfo::Get();
   auto const& Caches = Info.caches;
   if (!Caches.empty()) {
@@ -72,9 +72,11 @@
 
 ADD_CASES(TC_ConsoleOut, {{"^BM_basic %console_report$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_basic\",$"},
+                       {"\"family_index\": 0,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_basic\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
-                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
                        {"\"repetition_index\": 0,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
@@ -91,7 +93,8 @@
 void BM_bytes_per_second(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = state.iterations();
+    benchmark::DoNotOptimize(iterations);
   }
   state.SetBytesProcessed(1);
 }
@@ -100,9 +103,11 @@
 ADD_CASES(TC_ConsoleOut, {{"^BM_bytes_per_second %console_report "
                            "bytes_per_second=%float[kM]{0,1}/s$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_bytes_per_second\",$"},
+                       {"\"family_index\": 1,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_bytes_per_second\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
-                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
                        {"\"repetition_index\": 0,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
@@ -120,7 +125,8 @@
 void BM_items_per_second(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = state.iterations();
+    benchmark::DoNotOptimize(iterations);
   }
   state.SetItemsProcessed(1);
 }
@@ -129,9 +135,11 @@
 ADD_CASES(TC_ConsoleOut, {{"^BM_items_per_second %console_report "
                            "items_per_second=%float[kM]{0,1}/s$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_items_per_second\",$"},
+                       {"\"family_index\": 2,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_items_per_second\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
-                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
                        {"\"repetition_index\": 0,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
@@ -155,9 +163,11 @@
 
 ADD_CASES(TC_ConsoleOut, {{"^BM_label %console_report some label$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_label\",$"},
+                       {"\"family_index\": 3,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_label\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
-                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
                        {"\"repetition_index\": 0,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
@@ -170,6 +180,101 @@
                        "label\"%csv_label_report_end$"}});
 
 // ========================================================================= //
+// ------------------------ Testing Time Label Output ---------------------- //
+// ========================================================================= //
+
+void BM_time_label_nanosecond(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_time_label_nanosecond)->Unit(benchmark::kNanosecond);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_time_label_nanosecond %console_report$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_time_label_nanosecond\",$"},
+           {"\"family_index\": 4,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_time_label_nanosecond\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_time_label_nanosecond\",%csv_report$"}});
+
+void BM_time_label_microsecond(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_time_label_microsecond)->Unit(benchmark::kMicrosecond);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_time_label_microsecond %console_us_report$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_time_label_microsecond\",$"},
+           {"\"family_index\": 5,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_time_label_microsecond\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"us\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_time_label_microsecond\",%csv_us_report$"}});
+
+void BM_time_label_millisecond(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_time_label_millisecond)->Unit(benchmark::kMillisecond);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_time_label_millisecond %console_ms_report$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_time_label_millisecond\",$"},
+           {"\"family_index\": 6,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_time_label_millisecond\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ms\"$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_time_label_millisecond\",%csv_ms_report$"}});
+
+void BM_time_label_second(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_time_label_second)->Unit(benchmark::kSecond);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_time_label_second %console_s_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_time_label_second\",$"},
+                       {"\"family_index\": 7,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_time_label_second\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"s\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_time_label_second\",%csv_s_report$"}});
+
+// ========================================================================= //
 // ------------------------ Testing Error Output --------------------------- //
 // ========================================================================= //
 
@@ -181,9 +286,11 @@
 BENCHMARK(BM_error);
 ADD_CASES(TC_ConsoleOut, {{"^BM_error[ ]+ERROR OCCURRED: 'message'$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_error\",$"},
+                       {"\"family_index\": 8,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_error\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
-                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
                        {"\"repetition_index\": 0,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"error_occurred\": true,$", MR_Next},
@@ -203,15 +310,17 @@
 BENCHMARK(BM_no_arg_name)->Arg(3);
 ADD_CASES(TC_ConsoleOut, {{"^BM_no_arg_name/3 %console_report$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_no_arg_name/3\",$"},
+                       {"\"family_index\": 9,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_no_arg_name/3\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
-                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
                        {"\"repetition_index\": 0,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_no_arg_name/3\",%csv_report$"}});
 
 // ========================================================================= //
-// ------------------------ Testing Arg Name Output ----------------------- //
+// ------------------------ Testing Arg Name Output ------------------------ //
 // ========================================================================= //
 
 void BM_arg_name(benchmark::State& state) {
@@ -221,9 +330,11 @@
 BENCHMARK(BM_arg_name)->ArgName("first")->Arg(3);
 ADD_CASES(TC_ConsoleOut, {{"^BM_arg_name/first:3 %console_report$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_arg_name/first:3\",$"},
+                       {"\"family_index\": 10,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_arg_name/first:3\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
-                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
                        {"\"repetition_index\": 0,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_arg_name/first:3\",%csv_report$"}});
@@ -241,14 +352,42 @@
           {{"^BM_arg_names/first:2/5/third:4 %console_report$"}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_arg_names/first:2/5/third:4\",$"},
+           {"\"family_index\": 11,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_arg_names/first:2/5/third:4\",$", MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
-           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
            {"\"repetition_index\": 0,$", MR_Next},
            {"\"threads\": 1,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_arg_names/first:2/5/third:4\",%csv_report$"}});
 
 // ========================================================================= //
+// ------------------------ Testing Name Output ---------------------------- //
+// ========================================================================= //
+
+void BM_name(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_name)->Name("BM_custom_name");
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_custom_name %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_custom_name\",$"},
+                       {"\"family_index\": 12,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
+                       {"\"run_name\": \"BM_custom_name\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
+                       {"\"repetition_index\": 0,$", MR_Next},
+                       {"\"threads\": 1,$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\"$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_custom_name\",%csv_report$"}});
+
+// ========================================================================= //
 // ------------------------ Testing Big Args Output ------------------------ //
 // ========================================================================= //
 
@@ -267,7 +406,8 @@
 void BM_Complexity_O1(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = state.iterations();
+    benchmark::DoNotOptimize(iterations);
   }
   state.SetComplexityN(state.range(0));
 }
@@ -295,37 +435,50 @@
            {"^BM_Repeat/repeats:2_median %console_time_only_report [ ]*2$"},
            {"^BM_Repeat/repeats:2_stddev %console_time_only_report [ ]*2$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:2\",$"},
+                       {"\"family_index\": 15,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:2\"", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"repetitions\": 2,$", MR_Next},
                        {"\"repetition_index\": 0,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:2\",$"},
+                       {"\"family_index\": 15,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"repetitions\": 2,$", MR_Next},
                        {"\"repetition_index\": 1,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:2_mean\",$"},
+                       {"\"family_index\": 15,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
                        {"\"run_type\": \"aggregate\",$", MR_Next},
                        {"\"repetitions\": 2,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"mean\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
                        {"\"iterations\": 2,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:2_median\",$"},
+                       {"\"family_index\": 15,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
                        {"\"run_type\": \"aggregate\",$", MR_Next},
                        {"\"repetitions\": 2,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"median\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
                        {"\"iterations\": 2,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:2_stddev\",$"},
+                       {"\"family_index\": 15,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
                        {"\"run_type\": \"aggregate\",$", MR_Next},
                        {"\"repetitions\": 2,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"stddev\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
                        {"\"iterations\": 2,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:2\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:2\",%csv_report$"},
@@ -342,43 +495,58 @@
            {"^BM_Repeat/repeats:3_median %console_time_only_report [ ]*3$"},
            {"^BM_Repeat/repeats:3_stddev %console_time_only_report [ ]*3$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:3\",$"},
+                       {"\"family_index\": 16,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"repetitions\": 3,$", MR_Next},
                        {"\"repetition_index\": 0,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:3\",$"},
+                       {"\"family_index\": 16,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"repetitions\": 3,$", MR_Next},
                        {"\"repetition_index\": 1,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:3\",$"},
+                       {"\"family_index\": 16,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"repetitions\": 3,$", MR_Next},
                        {"\"repetition_index\": 2,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:3_mean\",$"},
+                       {"\"family_index\": 16,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
                        {"\"run_type\": \"aggregate\",$", MR_Next},
                        {"\"repetitions\": 3,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"mean\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
                        {"\"iterations\": 3,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:3_median\",$"},
+                       {"\"family_index\": 16,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
                        {"\"run_type\": \"aggregate\",$", MR_Next},
                        {"\"repetitions\": 3,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"median\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
                        {"\"iterations\": 3,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:3_stddev\",$"},
+                       {"\"family_index\": 16,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
                        {"\"run_type\": \"aggregate\",$", MR_Next},
                        {"\"repetitions\": 3,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"stddev\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
                        {"\"iterations\": 3,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:3\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:3\",%csv_report$"},
@@ -397,49 +565,66 @@
            {"^BM_Repeat/repeats:4_median %console_time_only_report [ ]*4$"},
            {"^BM_Repeat/repeats:4_stddev %console_time_only_report [ ]*4$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"family_index\": 17,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"repetitions\": 4,$", MR_Next},
                        {"\"repetition_index\": 0,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"family_index\": 17,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"repetitions\": 4,$", MR_Next},
                        {"\"repetition_index\": 1,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"family_index\": 17,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"repetitions\": 4,$", MR_Next},
                        {"\"repetition_index\": 2,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"family_index\": 17,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"repetitions\": 4,$", MR_Next},
                        {"\"repetition_index\": 3,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4_mean\",$"},
+                       {"\"family_index\": 17,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
                        {"\"run_type\": \"aggregate\",$", MR_Next},
                        {"\"repetitions\": 4,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"mean\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
                        {"\"iterations\": 4,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4_median\",$"},
+                       {"\"family_index\": 17,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
                        {"\"run_type\": \"aggregate\",$", MR_Next},
                        {"\"repetitions\": 4,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"median\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
                        {"\"iterations\": 4,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4_stddev\",$"},
+                       {"\"family_index\": 17,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
                        {"\"run_type\": \"aggregate\",$", MR_Next},
                        {"\"repetitions\": 4,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"aggregate_name\": \"stddev\",$", MR_Next},
+                       {"\"aggregate_unit\": \"time\",$", MR_Next},
                        {"\"iterations\": 4,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:4\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:4\",%csv_report$"},
@@ -458,6 +643,8 @@
 BENCHMARK(BM_RepeatOnce)->Repetitions(1)->ReportAggregatesOnly();
 ADD_CASES(TC_ConsoleOut, {{"^BM_RepeatOnce/repeats:1 %console_report$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_RepeatOnce/repeats:1\",$"},
+                       {"\"family_index\": 18,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_RepeatOnce/repeats:1\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"repetitions\": 1,$", MR_Next},
@@ -480,25 +667,34 @@
 ADD_CASES(TC_JSONOut,
           {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
            {"\"name\": \"BM_SummaryRepeat/repeats:3_mean\",$"},
+           {"\"family_index\": 19,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
            {"\"repetitions\": 3,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 3,$", MR_Next},
            {"\"name\": \"BM_SummaryRepeat/repeats:3_median\",$"},
+           {"\"family_index\": 19,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
            {"\"repetitions\": 3,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 3,$", MR_Next},
            {"\"name\": \"BM_SummaryRepeat/repeats:3_stddev\",$"},
+           {"\"family_index\": 19,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
            {"\"repetitions\": 3,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 3,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
                       {"^\"BM_SummaryRepeat/repeats:3_mean\",%csv_report$"},
@@ -522,25 +718,34 @@
 ADD_CASES(TC_JSONOut,
           {{".*BM_SummaryDisplay/repeats:2 ", MR_Not},
            {"\"name\": \"BM_SummaryDisplay/repeats:2_mean\",$"},
+           {"\"family_index\": 20,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
            {"\"repetitions\": 2,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 2,$", MR_Next},
            {"\"name\": \"BM_SummaryDisplay/repeats:2_median\",$"},
+           {"\"family_index\": 20,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
            {"\"repetitions\": 2,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 2,$", MR_Next},
            {"\"name\": \"BM_SummaryDisplay/repeats:2_stddev\",$"},
+           {"\"family_index\": 20,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
            {"\"repetitions\": 2,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 2,$", MR_Next}});
 ADD_CASES(TC_CSVOut,
           {{".*BM_SummaryDisplay/repeats:2 ", MR_Not},
@@ -568,27 +773,36 @@
 ADD_CASES(TC_JSONOut,
           {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not},
            {"\"name\": \"BM_RepeatTimeUnit/repeats:3_mean\",$"},
+           {"\"family_index\": 21,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
            {"\"repetitions\": 3,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 3,$", MR_Next},
            {"\"time_unit\": \"us\",?$"},
            {"\"name\": \"BM_RepeatTimeUnit/repeats:3_median\",$"},
+           {"\"family_index\": 21,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
            {"\"repetitions\": 3,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 3,$", MR_Next},
            {"\"time_unit\": \"us\",?$"},
            {"\"name\": \"BM_RepeatTimeUnit/repeats:3_stddev\",$"},
+           {"\"family_index\": 21,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
            {"\"repetitions\": 3,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 3,$", MR_Next},
            {"\"time_unit\": \"us\",?$"}});
 ADD_CASES(TC_CSVOut,
@@ -636,6 +850,8 @@
 ADD_CASES(
     TC_JSONOut,
     {{"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"},
+     {"\"family_index\": 22,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
      {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
       MR_Next},
      {"\"run_type\": \"iteration\",$", MR_Next},
@@ -645,6 +861,8 @@
      {"\"iterations\": 5,$", MR_Next},
      {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
      {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"},
+     {"\"family_index\": 22,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
      {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
       MR_Next},
      {"\"run_type\": \"iteration\",$", MR_Next},
@@ -654,6 +872,8 @@
      {"\"iterations\": 5,$", MR_Next},
      {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
      {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"},
+     {"\"family_index\": 22,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
      {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
       MR_Next},
      {"\"run_type\": \"iteration\",$", MR_Next},
@@ -663,39 +883,51 @@
      {"\"iterations\": 5,$", MR_Next},
      {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
      {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_mean\",$"},
+     {"\"family_index\": 22,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
      {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
       MR_Next},
      {"\"run_type\": \"aggregate\",$", MR_Next},
      {"\"repetitions\": 3,$", MR_Next},
      {"\"threads\": 1,$", MR_Next},
      {"\"aggregate_name\": \"mean\",$", MR_Next},
+     {"\"aggregate_unit\": \"time\",$", MR_Next},
      {"\"iterations\": 3,$", MR_Next},
      {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
      {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_median\",$"},
+     {"\"family_index\": 22,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
      {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
       MR_Next},
      {"\"run_type\": \"aggregate\",$", MR_Next},
      {"\"repetitions\": 3,$", MR_Next},
      {"\"threads\": 1,$", MR_Next},
      {"\"aggregate_name\": \"median\",$", MR_Next},
+     {"\"aggregate_unit\": \"time\",$", MR_Next},
      {"\"iterations\": 3,$", MR_Next},
      {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
      {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_stddev\",$"},
+     {"\"family_index\": 22,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
      {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
       MR_Next},
      {"\"run_type\": \"aggregate\",$", MR_Next},
      {"\"repetitions\": 3,$", MR_Next},
      {"\"threads\": 1,$", MR_Next},
      {"\"aggregate_name\": \"stddev\",$", MR_Next},
+     {"\"aggregate_unit\": \"time\",$", MR_Next},
      {"\"iterations\": 3,$", MR_Next},
      {"\"real_time\": %float,$", MR_Next},
      {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_\",$"},
+     {"\"family_index\": 22,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
      {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
       MR_Next},
      {"\"run_type\": \"aggregate\",$", MR_Next},
      {"\"repetitions\": 3,$", MR_Next},
      {"\"threads\": 1,$", MR_Next},
      {"\"aggregate_name\": \"\",$", MR_Next},
+     {"\"aggregate_unit\": \"time\",$", MR_Next},
      {"\"iterations\": 3,$", MR_Next},
      {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next}});
 ADD_CASES(
@@ -711,9 +943,157 @@
      {"^\"BM_UserStats/iterations:5/repeats:3/manual_time_\",%csv_report$"}});
 
 // ========================================================================= //
+// ------------- Testing relative standard deviation statistics ------------ //
+// ========================================================================= //
+
+const auto UserPercentStatistics = [](const std::vector<double>&) {
+  return 1. / 100.;
+};
+void BM_UserPercentStats(benchmark::State& state) {
+  for (auto _ : state) {
+    state.SetIterationTime(150 / 10e8);
+  }
+}
+// clang-format off
+BENCHMARK(BM_UserPercentStats)
+  ->Repetitions(3)
+  ->Iterations(5)
+  ->UseManualTime()
+  ->Unit(benchmark::TimeUnit::kNanosecond)
+  ->ComputeStatistics("", UserPercentStatistics, benchmark::StatisticUnit::kPercentage);
+// clang-format on
+
+// Check that the user-provided percentage statistic is calculated and is
+// reported after the default ones. The empty string as its name is
+// intentional: it would sort before anything else.
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_UserPercentStats/iterations:5/repeats:3/manual_time [ "
+            "]* 150 ns %time [ ]*5$"},
+           {"^BM_UserPercentStats/iterations:5/repeats:3/manual_time [ "
+            "]* 150 ns %time [ ]*5$"},
+           {"^BM_UserPercentStats/iterations:5/repeats:3/manual_time [ "
+            "]* 150 ns %time [ ]*5$"},
+           {"^BM_UserPercentStats/iterations:5/repeats:3/"
+            "manual_time_mean [ ]* 150 ns %time [ ]*3$"},
+           {"^BM_UserPercentStats/iterations:5/repeats:3/"
+            "manual_time_median [ ]* 150 ns %time [ ]*3$"},
+           {"^BM_UserPercentStats/iterations:5/repeats:3/"
+            "manual_time_stddev [ ]* 0.000 ns %time [ ]*3$"},
+           {"^BM_UserPercentStats/iterations:5/repeats:3/manual_time_ "
+            "[ ]* 1.00 % [ ]* 1.00 %[ ]*3$"}});
+ADD_CASES(
+    TC_JSONOut,
+    {{"\"name\": \"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$"},
+     {"\"family_index\": 23,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": "
+      "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"iteration\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"repetition_index\": 0,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"iterations\": 5,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": \"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$"},
+     {"\"family_index\": 23,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": "
+      "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"iteration\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"repetition_index\": 1,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"iterations\": 5,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": \"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$"},
+     {"\"family_index\": 23,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": "
+      "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"iteration\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"repetition_index\": 2,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"iterations\": 5,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": "
+      "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time_mean\",$"},
+     {"\"family_index\": 23,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": "
+      "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"aggregate_name\": \"mean\",$", MR_Next},
+     {"\"aggregate_unit\": \"time\",$", MR_Next},
+     {"\"iterations\": 3,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": "
+      "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time_median\",$"},
+     {"\"family_index\": 23,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": "
+      "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"aggregate_name\": \"median\",$", MR_Next},
+     {"\"aggregate_unit\": \"time\",$", MR_Next},
+     {"\"iterations\": 3,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": "
+      "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time_stddev\",$"},
+     {"\"family_index\": 23,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": "
+      "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"aggregate_name\": \"stddev\",$", MR_Next},
+     {"\"aggregate_unit\": \"time\",$", MR_Next},
+     {"\"iterations\": 3,$", MR_Next},
+     {"\"real_time\": %float,$", MR_Next},
+     {"\"name\": "
+      "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time_\",$"},
+     {"\"family_index\": 23,$", MR_Next},
+     {"\"per_family_instance_index\": 0,$", MR_Next},
+     {"\"run_name\": "
+      "\"BM_UserPercentStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"repetitions\": 3,$", MR_Next},
+     {"\"threads\": 1,$", MR_Next},
+     {"\"aggregate_name\": \"\",$", MR_Next},
+     {"\"aggregate_unit\": \"percentage\",$", MR_Next},
+     {"\"iterations\": 3,$", MR_Next},
+     {"\"real_time\": 1\\.(0)*e-(0)*2,$", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_UserPercentStats/iterations:5/repeats:3/"
+                       "manual_time\",%csv_report$"},
+                      {"^\"BM_UserPercentStats/iterations:5/repeats:3/"
+                       "manual_time\",%csv_report$"},
+                      {"^\"BM_UserPercentStats/iterations:5/repeats:3/"
+                       "manual_time\",%csv_report$"},
+                      {"^\"BM_UserPercentStats/iterations:5/repeats:3/"
+                       "manual_time_mean\",%csv_report$"},
+                      {"^\"BM_UserPercentStats/iterations:5/repeats:3/"
+                       "manual_time_median\",%csv_report$"},
+                      {"^\"BM_UserPercentStats/iterations:5/repeats:3/"
+                       "manual_time_stddev\",%csv_report$"},
+                      {"^\"BM_UserPercentStats/iterations:5/repeats:3/"
+                       "manual_time_\",%csv_report$"}});
+
+// ========================================================================= //
 // ------------------------- Testing StrEscape JSON ------------------------ //
 // ========================================================================= //
-#if 0 // enable when csv testing code correctly handles multi-line fields
+#if 0  // enable when csv testing code correctly handles multi-line fields
 void BM_JSON_Format(benchmark::State& state) {
   state.SkipWithError("val\b\f\n\r\t\\\"with\"es,capes");
   for (auto _ : state) {
@@ -721,9 +1101,11 @@
 }
 BENCHMARK(BM_JSON_Format);
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_JSON_Format\",$"},
+                                              {"\"family_index\": 23,$", MR_Next},
+{"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_JSON_Format\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
-                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
                        {"\"repetition_index\": 0,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"error_occurred\": true,$", MR_Next},
diff --git a/third_party/google_benchmark/test/skip_with_error_test.cc b/third_party/google_benchmark/src/test/skip_with_error_test.cc
similarity index 83%
rename from third_party/google_benchmark/test/skip_with_error_test.cc
rename to third_party/google_benchmark/src/test/skip_with_error_test.cc
index 0657977..b4c5e15 100644
--- a/third_party/google_benchmark/test/skip_with_error_test.cc
+++ b/third_party/google_benchmark/src/test/skip_with_error_test.cc
@@ -10,17 +10,17 @@
 
 class TestReporter : public benchmark::ConsoleReporter {
  public:
-  virtual bool ReportContext(const Context& context) {
+  bool ReportContext(const Context& context) override {
     return ConsoleReporter::ReportContext(context);
   };
 
-  virtual void ReportRuns(const std::vector<Run>& report) {
+  void ReportRuns(const std::vector<Run>& report) override {
     all_runs_.insert(all_runs_.end(), begin(report), end(report));
     ConsoleReporter::ReportRuns(report);
   }
 
   TestReporter() {}
-  virtual ~TestReporter() {}
+  ~TestReporter() override {}
 
   mutable std::vector<Run> all_runs_;
 };
@@ -33,21 +33,23 @@
   typedef benchmark::BenchmarkReporter::Run Run;
 
   void CheckRun(Run const& run) const {
-    CHECK(name == run.benchmark_name())
+    BM_CHECK(name == run.benchmark_name())
         << "expected " << name << " got " << run.benchmark_name();
-    CHECK(error_occurred == run.error_occurred);
-    CHECK(error_message == run.error_message);
+    BM_CHECK_EQ(error_occurred,
+                benchmark::internal::SkippedWithError == run.skipped);
+    BM_CHECK(error_message == run.skip_message);
     if (error_occurred) {
-      // CHECK(run.iterations == 0);
+      // BM_CHECK(run.iterations == 0);
     } else {
-      CHECK(run.iterations != 0);
+      BM_CHECK(run.iterations != 0);
     }
   }
 };
 
 std::vector<TestCase> ExpectedResults;
 
-int AddCases(const char* base_name, std::initializer_list<TestCase> const& v) {
+int AddCases(const std::string& base_name,
+             std::initializer_list<TestCase> const& v) {
   for (auto TC : v) {
     TC.name = base_name + TC.name;
     ExpectedResults.push_back(std::move(TC));
@@ -61,6 +63,12 @@
 
 }  // end namespace
 
+void BM_error_no_running(benchmark::State& state) {
+  state.SkipWithError("error message");
+}
+BENCHMARK(BM_error_no_running);
+ADD_CASES("BM_error_no_running", {{"", true, "error message"}});
+
 void BM_error_before_running(benchmark::State& state) {
   state.SkipWithError("error message");
   while (state.KeepRunning()) {
@@ -91,7 +99,7 @@
 void BM_error_during_running(benchmark::State& state) {
   int first_iter = true;
   while (state.KeepRunning()) {
-    if (state.range(0) == 1 && state.thread_index <= (state.threads / 2)) {
+    if (state.range(0) == 1 && state.thread_index() <= (state.threads() / 2)) {
       assert(first_iter);
       first_iter = false;
       state.SkipWithError("error message");
@@ -113,12 +121,13 @@
 
 void BM_error_during_running_ranged_for(benchmark::State& state) {
   assert(state.max_iterations > 3 && "test requires at least a few iterations");
-  int first_iter = true;
+  bool first_iter = true;
   // NOTE: Users should not write the for loop explicitly.
   for (auto It = state.begin(), End = state.end(); It != End; ++It) {
     if (state.range(0) == 1) {
       assert(first_iter);
       first_iter = false;
+      (void)first_iter;
       state.SkipWithError("error message");
       // Test the unfortunate but documented behavior that the ranged-for loop
       // doesn't automatically terminate when SkipWithError is set.
@@ -134,9 +143,10 @@
 
 void BM_error_after_running(benchmark::State& state) {
   for (auto _ : state) {
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = state.iterations();
+    benchmark::DoNotOptimize(iterations);
   }
-  if (state.thread_index <= (state.threads / 2))
+  if (state.thread_index() <= (state.threads() / 2))
     state.SkipWithError("error message");
 }
 BENCHMARK(BM_error_after_running)->ThreadRange(1, 8);
@@ -148,7 +158,7 @@
 void BM_error_while_paused(benchmark::State& state) {
   bool first_iter = true;
   while (state.KeepRunning()) {
-    if (state.range(0) == 1 && state.thread_index <= (state.threads / 2)) {
+    if (state.range(0) == 1 && state.thread_index() <= (state.threads() / 2)) {
       assert(first_iter);
       first_iter = false;
       state.PauseTiming();
diff --git a/third_party/google_benchmark/src/test/spec_arg_test.cc b/third_party/google_benchmark/src/test/spec_arg_test.cc
new file mode 100644
index 0000000..06aafbe
--- /dev/null
+++ b/third_party/google_benchmark/src/test/spec_arg_test.cc
@@ -0,0 +1,105 @@
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+
+// Tests that we can override benchmark-spec value from FLAGS_benchmark_filter
+// with argument to RunSpecifiedBenchmarks(...).
+
+namespace {
+// Console reporter that also remembers the function name of each reported run.
+class TestReporter : public benchmark::ConsoleReporter {
+ public:
+  bool ReportContext(const Context& context) override {
+    return ConsoleReporter::ReportContext(context);
+  }
+  // Asserts one run per report, records its function name, then forwards.
+  void ReportRuns(const std::vector<Run>& report) override {
+    assert(report.size() == 1);
+    matched_functions.push_back(report[0].run_name.function_name);
+    ConsoleReporter::ReportRuns(report);
+  }
+
+  TestReporter() {}
+
+  ~TestReporter() override {}
+
+  const std::vector<std::string>& GetMatchedFunctions() const {
+    return matched_functions;
+  }
+
+ private:
+  std::vector<std::string> matched_functions;  // in ReportRuns() call order
+};
+
+}  // end namespace
+// Must not run: main() passes a spec that selects BM_Chosen instead.
+static void BM_NotChosen(benchmark::State& state) {
+  assert(false && "SHOULD NOT BE CALLED");
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_NotChosen);
+// Empty benchmark; main() expects it to be the only one that is run.
+static void BM_Chosen(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_Chosen);
+// Returns 0 on success; 1, 2 or 3 tells which of the checks below failed.
+int main(int argc, char** argv) {
+  const std::string flag = "BM_NotChosen";
+
+  // Verify that argv specifies --benchmark_filter=BM_NotChosen.
+  bool found = false;
+  for (int i = 0; i < argc; ++i) {
+    if (strcmp("--benchmark_filter=BM_NotChosen", argv[i]) == 0) {
+      found = true;
+      break;
+    }
+  }
+  assert(found);
+
+  benchmark::Initialize(&argc, argv);
+
+  // Check that the current flag value is reported accurately via the
+  // GetBenchmarkFilter() function.
+  if (flag != benchmark::GetBenchmarkFilter()) {
+    std::cerr
+        << "Seeing different value for flags. GetBenchmarkFilter() returns ["
+        << benchmark::GetBenchmarkFilter() << "] expected flag=[" << flag
+        << "]\n";
+    return 1;
+  }
+  TestReporter test_reporter;
+  const char* const spec = "BM_Chosen";  // deliberately differs from the flag
+  const size_t returned_count =
+      benchmark::RunSpecifiedBenchmarks(&test_reporter, spec);
+  assert(returned_count == 1);
+  const std::vector<std::string> matched_functions =
+      test_reporter.GetMatchedFunctions();
+  assert(matched_functions.size() == 1);
+  if (strcmp(spec, matched_functions.front().c_str()) != 0) {
+    std::cerr << "Expected benchmark [" << spec << "] to run, but got ["
+              << matched_functions.front() << "]\n";
+    return 2;
+  }
+
+  // Test that SetBenchmarkFilter works.
+  const std::string golden_value = "golden_value";
+  benchmark::SetBenchmarkFilter(golden_value);
+  std::string current_value = benchmark::GetBenchmarkFilter();
+  if (golden_value != current_value) {
+    std::cerr << "Expected [" << golden_value
+              << "] for --benchmark_filter but got [" << current_value << "]\n";
+    return 3;
+  }
+  return 0;
+}
diff --git a/third_party/google_benchmark/src/test/spec_arg_verbosity_test.cc b/third_party/google_benchmark/src/test/spec_arg_verbosity_test.cc
new file mode 100644
index 0000000..8f8eb6d
--- /dev/null
+++ b/third_party/google_benchmark/src/test/spec_arg_verbosity_test.cc
@@ -0,0 +1,43 @@
+#include <string.h>
+
+#include <iostream>
+
+#include "benchmark/benchmark.h"
+
+// Tests that the user-specified verbosity level can be retrieved.
+static void BM_Verbosity(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_Verbosity);
+// Returns 0 on success; 1 if --v=42 is missing or the reported level differs.
+int main(int argc, char** argv) {
+  const int32_t flagv = 42;
+
+  // Verify that argv specifies --v=42.
+  bool found = false;
+  for (int i = 0; i < argc; ++i) {
+    if (strcmp("--v=42", argv[i]) == 0) {
+      found = true;
+      break;
+    }
+  }
+  if (!found) {
+    std::cerr << "This test requires '--v=42' to be passed as a command-line "
+              << "argument.\n";
+    return 1;
+  }
+
+  benchmark::Initialize(&argc, argv);
+
+  // Check that the current flag value is reported accurately via the
+  // GetBenchmarkVerbosity() function.
+  if (flagv != benchmark::GetBenchmarkVerbosity()) {
+    std::cerr
+        << "Seeing different value for flags. GetBenchmarkVerbosity() returns ["
+        << benchmark::GetBenchmarkVerbosity() << "] expected flag=[" << flagv
+        << "]\n";
+    return 1;
+  }
+  return 0;
+}
diff --git a/third_party/google_benchmark/test/state_assembly_test.cc b/third_party/google_benchmark/src/test/state_assembly_test.cc
similarity index 100%
rename from third_party/google_benchmark/test/state_assembly_test.cc
rename to third_party/google_benchmark/src/test/state_assembly_test.cc
diff --git a/third_party/google_benchmark/test/statistics_gtest.cc b/third_party/google_benchmark/src/test/statistics_gtest.cc
similarity index 71%
rename from third_party/google_benchmark/test/statistics_gtest.cc
rename to third_party/google_benchmark/src/test/statistics_gtest.cc
index 99e3149..1de2d87 100644
--- a/third_party/google_benchmark/test/statistics_gtest.cc
+++ b/third_party/google_benchmark/src/test/statistics_gtest.cc
@@ -21,8 +21,15 @@
 TEST(StatisticsTest, StdDev) {
   EXPECT_DOUBLE_EQ(benchmark::StatisticsStdDev({101, 101, 101, 101}), 0.0);
   EXPECT_DOUBLE_EQ(benchmark::StatisticsStdDev({1, 2, 3}), 1.0);
-  EXPECT_FLOAT_EQ(benchmark::StatisticsStdDev({1.5, 2.4, 3.3, 4.2, 5.1}),
-                  1.42302495);
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsStdDev({2.5, 2.4, 3.3, 4.2, 5.1}),
+                   1.151086443322134);
+}
+
+TEST(StatisticsTest, CV) {
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsCV({101, 101, 101, 101}), 0.0);
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsCV({1, 2, 3}), 1. / 2.);
+  EXPECT_DOUBLE_EQ(benchmark::StatisticsCV({2.5, 2.4, 3.3, 4.2, 5.1}),
+                   0.32888184094918121);
 }
 
 }  // end namespace
diff --git a/third_party/google_benchmark/src/test/string_util_gtest.cc b/third_party/google_benchmark/src/test/string_util_gtest.cc
new file mode 100644
index 0000000..8bfdb7a
--- /dev/null
+++ b/third_party/google_benchmark/src/test/string_util_gtest.cc
@@ -0,0 +1,163 @@
+//===---------------------------------------------------------------------===//
+// string_util_gtest - Unit tests for src/string_util.cc
+//===---------------------------------------------------------------------===//
+
+#include <tuple>
+
+#include "../src/internal_macros.h"
+#include "../src/string_util.h"
+#include "gtest/gtest.h"
+
+namespace {
+TEST(StringUtilTest, stoul) {  // each case checks the value and resulting pos
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0ul, benchmark::stoul("0", &pos));
+    EXPECT_EQ(1ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(7ul, benchmark::stoul("7", &pos));
+    EXPECT_EQ(1ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(135ul, benchmark::stoul("135", &pos));
+    EXPECT_EQ(3ul, pos);
+  }
+#if ULONG_MAX == 0xFFFFFFFFul  // largest value of a 32-bit unsigned long
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0xFFFFFFFFul, benchmark::stoul("4294967295", &pos));
+    EXPECT_EQ(10ul, pos);
+  }
+#elif ULONG_MAX == 0xFFFFFFFFFFFFFFFFul  // largest 64-bit unsigned long
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0xFFFFFFFFFFFFFFFFul,
+              benchmark::stoul("18446744073709551615", &pos));
+    EXPECT_EQ(20ul, pos);
+  }
+#endif
+  {
+    size_t pos = 0;
+    EXPECT_EQ(10ul, benchmark::stoul("1010", &pos, 2));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(520ul, benchmark::stoul("1010", &pos, 8));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(1010ul, benchmark::stoul("1010", &pos, 10));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(4112ul, benchmark::stoul("1010", &pos, 16));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0xBEEFul, benchmark::stoul("BEEF", &pos, 16));
+    EXPECT_EQ(4ul, pos);
+  }
+#ifndef BENCHMARK_HAS_NO_EXCEPTIONS
+  {
+    ASSERT_THROW(std::ignore = benchmark::stoul("this is a test"),
+                 std::invalid_argument);
+  }
+#endif  // BENCHMARK_HAS_NO_EXCEPTIONS
+}
+
+TEST(StringUtilTest, stoi) {  // each case checks the value and resulting pos
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0, benchmark::stoi("0", &pos));
+    EXPECT_EQ(1ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(-17, benchmark::stoi("-17", &pos));
+    EXPECT_EQ(3ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(1357, benchmark::stoi("1357", &pos));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(10, benchmark::stoi("1010", &pos, 2));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(520, benchmark::stoi("1010", &pos, 8));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(1010, benchmark::stoi("1010", &pos, 10));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(4112, benchmark::stoi("1010", &pos, 16));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0xBEEF, benchmark::stoi("BEEF", &pos, 16));
+    EXPECT_EQ(4ul, pos);
+  }
+#ifndef BENCHMARK_HAS_NO_EXCEPTIONS
+  ASSERT_THROW(std::ignore = benchmark::stoi("this is a test"),
+               std::invalid_argument);
+#endif  // BENCHMARK_HAS_NO_EXCEPTIONS
+}
+
+TEST(StringUtilTest, stod) {  // each case checks the value and resulting pos
+  {
+    size_t pos = 0;
+    EXPECT_EQ(0.0, benchmark::stod("0", &pos));
+    EXPECT_EQ(1ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(-84.0, benchmark::stod("-84", &pos));
+    EXPECT_EQ(3ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(1234.0, benchmark::stod("1234", &pos));
+    EXPECT_EQ(4ul, pos);
+  }
+  {
+    size_t pos = 0;
+    EXPECT_EQ(1.5, benchmark::stod("1.5", &pos));
+    EXPECT_EQ(3ul, pos);
+  }
+  {
+    size_t pos = 0;
+    /* Note: exactly representable as double */
+    EXPECT_EQ(-1.25e+9, benchmark::stod("-1.25e+9", &pos));
+    EXPECT_EQ(8ul, pos);
+  }
+#ifndef BENCHMARK_HAS_NO_EXCEPTIONS
+  ASSERT_THROW(std::ignore = benchmark::stod("this is a test"),
+               std::invalid_argument);
+#endif  // BENCHMARK_HAS_NO_EXCEPTIONS
+}
+// Covers empty input, a single token, and four comma-separated tokens.
+TEST(StringUtilTest, StrSplit) {
+  EXPECT_EQ(benchmark::StrSplit("", ','), std::vector<std::string>{});
+  EXPECT_EQ(benchmark::StrSplit("hello", ','),
+            std::vector<std::string>({"hello"}));
+  EXPECT_EQ(benchmark::StrSplit("hello,there,is,more", ','),
+            std::vector<std::string>({"hello", "there", "is", "more"}));
+}
+
+}  // end namespace
diff --git a/third_party/google_benchmark/test/templated_fixture_test.cc b/third_party/google_benchmark/src/test/templated_fixture_test.cc
similarity index 99%
rename from third_party/google_benchmark/test/templated_fixture_test.cc
rename to third_party/google_benchmark/src/test/templated_fixture_test.cc
index fe9865c..af239c3 100644
--- a/third_party/google_benchmark/test/templated_fixture_test.cc
+++ b/third_party/google_benchmark/src/test/templated_fixture_test.cc
@@ -1,9 +1,9 @@
 
-#include "benchmark/benchmark.h"
-
 #include <cassert>
 #include <memory>
 
+#include "benchmark/benchmark.h"
+
 template <typename T>
 class MyFixture : public ::benchmark::Fixture {
  public:
diff --git a/third_party/google_benchmark/src/test/time_unit_gtest.cc b/third_party/google_benchmark/src/test/time_unit_gtest.cc
new file mode 100644
index 0000000..484ecbc
--- /dev/null
+++ b/third_party/google_benchmark/src/test/time_unit_gtest.cc
@@ -0,0 +1,37 @@
+#include "../include/benchmark/benchmark.h"
+#include "gtest/gtest.h"
+
+namespace benchmark {
+namespace internal {
+
+namespace {
+// Minimal concrete Benchmark named "dummy"; its Run() does nothing.
+class DummyBenchmark : public Benchmark {
+ public:
+  DummyBenchmark() : Benchmark("dummy") {}
+  void Run(State&) override {}
+};
+// With no unit set on the benchmark, GetTimeUnit() reports kNanosecond.
+TEST(DefaultTimeUnitTest, TimeUnitIsNotSet) {
+  DummyBenchmark benchmark;
+  EXPECT_EQ(benchmark.GetTimeUnit(), kNanosecond);
+}
+// A benchmark without an explicit unit follows SetDefaultTimeUnit().
+TEST(DefaultTimeUnitTest, DefaultIsSet) {
+  DummyBenchmark benchmark;
+  EXPECT_EQ(benchmark.GetTimeUnit(), kNanosecond);
+  SetDefaultTimeUnit(kMillisecond);
+  EXPECT_EQ(benchmark.GetTimeUnit(), kMillisecond);
+  SetDefaultTimeUnit(kNanosecond);  // restore the default other tests expect
+}
+TEST(DefaultTimeUnitTest, DefaultAndExplicitUnitIsSet) {
+  DummyBenchmark benchmark;
+  benchmark.Unit(kMillisecond);  // an explicit unit must win over the default
+  SetDefaultTimeUnit(kMicrosecond);
+  EXPECT_EQ(benchmark.GetTimeUnit(), kMillisecond);
+  SetDefaultTimeUnit(kNanosecond);  // restore the default other tests expect
+}
+
+}  // namespace
+}  // namespace internal
+}  // namespace benchmark
diff --git a/third_party/google_benchmark/src/test/user_counters_tabular_test.cc b/third_party/google_benchmark/src/test/user_counters_tabular_test.cc
new file mode 100644
index 0000000..c98b769
--- /dev/null
+++ b/third_party/google_benchmark/src/test/user_counters_tabular_test.cc
@@ -0,0 +1,558 @@
+
+#undef NDEBUG
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+// @todo: <jpmag> this checks the full output at once; the rule for
+// CounterSet1 was failing because it was not matching "^[-]+$".
+// @todo: <jpmag> check that the counters are vertically aligned.
+ADD_CASES(TC_ConsoleOut,
+          {
+              // keeping these lines long improves readability, so:
+              // clang-format off
+    {"^[-]+$", MR_Next},
+    {"^Benchmark %s Time %s CPU %s Iterations %s Bar %s Bat %s Baz %s Foo %s Frob %s Lob$", MR_Next},
+    {"^[-]+$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:1 %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:1 %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:1_mean %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:1_median %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+            {"^BM_Counters_Tabular/repeats:2/threads:1_stddev %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+            {"^BM_Counters_Tabular/repeats:2/threads:1_cv %console_percentage_report [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*%$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:2 %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:2 %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:2_mean %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+      {"^BM_Counters_Tabular/repeats:2/threads:2_median %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+            {"^BM_Counters_Tabular/repeats:2/threads:2_stddev %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+            {"^BM_Counters_Tabular/repeats:2/threads:2_cv %console_percentage_report [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*% [ ]*%percentage[ ]*%$", MR_Next},
+    {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
+    {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
+    {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
+    {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
+    {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
+    {"^[-]+$", MR_Next},
+    {"^Benchmark %s Time %s CPU %s Iterations %s Bar %s Baz %s Foo$", MR_Next},
+    {"^[-]+$", MR_Next},
+    {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^[-]+$", MR_Next},
+    {"^Benchmark %s Time %s CPU %s Iterations %s Bat %s Baz %s Foo$", MR_Next},
+    {"^[-]+$", MR_Next},
+    {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
+    {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$"},
+              // clang-format on
+          });
+ADD_CASES(TC_CSVOut, {{"%csv_header,"
+                       "\"Bar\",\"Bat\",\"Baz\",\"Foo\",\"Frob\",\"Lob\""}});
+
+// ========================================================================= //
+// ------------------------- Tabular Counters Output ----------------------- //
+// ========================================================================= //
+// Runs an empty benchmark loop, then sets six counters flagged kAvgThreads.
+void BM_Counters_Tabular(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+  namespace bm = benchmark;
+  state.counters.insert({
+      {"Foo", {1, bm::Counter::kAvgThreads}},
+      {"Bar", {2, bm::Counter::kAvgThreads}},
+      {"Baz", {4, bm::Counter::kAvgThreads}},
+      {"Bat", {8, bm::Counter::kAvgThreads}},
+      {"Frob", {16, bm::Counter::kAvgThreads}},
+      {"Lob", {32, bm::Counter::kAvgThreads}},
+  });
+}
+BENCHMARK(BM_Counters_Tabular)->ThreadRange(1, 2)->Repetitions(2);
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 1,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1_mean\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+            MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1_median\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+            MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1_stddev\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+            MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:1_cv\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:1\",$",
+            MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"aggregate_name\": \"cv\",$", MR_Next},
+           {"\"aggregate_unit\": \"percentage\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 1,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 2,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 1,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"repetition_index\": 1,$", MR_Next},
+           {"\"threads\": 2,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:2_median\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 1,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$",
+            MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 2,$", MR_Next},
+           {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:2_stddev\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 1,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$",
+            MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 2,$", MR_Next},
+           {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/repeats:2/threads:2_cv\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 1,$", MR_Next},
+           {"\"run_name\": \"BM_Counters_Tabular/repeats:2/threads:2\",$",
+            MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"repetitions\": 2,$", MR_Next},
+           {"\"threads\": 2,$", MR_Next},
+           {"\"aggregate_name\": \"cv\",$", MR_Next},
+           {"\"aggregate_unit\": \"percentage\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:1\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:1\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:1_mean\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:1_median\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:1_stddev\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:1_cv\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:2\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:2\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:2_mean\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:2_median\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:2_stddev\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+ADD_CASES(TC_CSVOut,
+          {{"^\"BM_Counters_Tabular/repeats:2/threads:2_cv\",%csv_report,"
+            "%float,%float,%float,%float,%float,%float$"}});
+// Checks the six counter values set by BM_Counters_Tabular. A named function
+// is used because VS2013 rejects a lambda passed to CHECK_BENCHMARK_RESULTS().
+void CheckTabular(Results const& e) {
+  CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 1);
+  CHECK_COUNTER_VALUE(e, int, "Bar", EQ, 2);
+  CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 4);
+  CHECK_COUNTER_VALUE(e, int, "Bat", EQ, 8);
+  CHECK_COUNTER_VALUE(e, int, "Frob", EQ, 16);
+  CHECK_COUNTER_VALUE(e, int, "Lob", EQ, 32);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_Tabular/repeats:2/threads:1$",
+                        &CheckTabular);
+CHECK_BENCHMARK_RESULTS("BM_Counters_Tabular/repeats:2/threads:2$",
+                        &CheckTabular);
+
+// ========================================================================= //
+// -------------------- Tabular+Rate Counters Output ----------------------- //
+// ========================================================================= //
+
+void BM_CounterRates_Tabular(benchmark::State& state) {
+  for (auto _ : state) {
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    auto iterations = state.iterations();
+    benchmark::DoNotOptimize(iterations);
+  }
+  namespace bm = benchmark;
+  state.counters.insert({
+      {"Foo", {1, bm::Counter::kAvgThreadsRate}},
+      {"Bar", {2, bm::Counter::kAvgThreadsRate}},
+      {"Baz", {4, bm::Counter::kAvgThreadsRate}},
+      {"Bat", {8, bm::Counter::kAvgThreadsRate}},
+      {"Frob", {16, bm::Counter::kAvgThreadsRate}},
+      {"Lob", {32, bm::Counter::kAvgThreadsRate}},
+  });
+}
+BENCHMARK(BM_CounterRates_Tabular)->ThreadRange(1, 16);
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_CounterRates_Tabular/threads:%int\",$"},
+           {"\"family_index\": 1,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_CounterRates_Tabular/threads:%int\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_CounterRates_Tabular/threads:%int\",%csv_report,"
+                       "%float,%float,%float,%float,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckTabularRate(Results const& e) {
+  double t = e.DurationCPUTime();
+  CHECK_FLOAT_COUNTER_VALUE(e, "Foo", EQ, 1. / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "Bar", EQ, 2. / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "Baz", EQ, 4. / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "Bat", EQ, 8. / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "Frob", EQ, 16. / t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "Lob", EQ, 32. / t, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_CounterRates_Tabular/threads:%int",
+                        &CheckTabularRate);
+
+// ========================================================================= //
+// ------------------------- Tabular Counters Output ----------------------- //
+// ========================================================================= //
+
+// set only some of the counters
+void BM_CounterSet0_Tabular(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+  namespace bm = benchmark;
+  state.counters.insert({
+      {"Foo", {10, bm::Counter::kAvgThreads}},
+      {"Bar", {20, bm::Counter::kAvgThreads}},
+      {"Baz", {40, bm::Counter::kAvgThreads}},
+  });
+}
+BENCHMARK(BM_CounterSet0_Tabular)->ThreadRange(1, 16);
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_CounterSet0_Tabular/threads:%int\",$"},
+           {"\"family_index\": 2,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_CounterSet0_Tabular/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet0_Tabular/threads:%int\",%csv_report,"
+                       "%float,,%float,%float,,"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckSet0(Results const& e) {
+  CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 10);
+  CHECK_COUNTER_VALUE(e, int, "Bar", EQ, 20);
+  CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 40);
+}
+CHECK_BENCHMARK_RESULTS("BM_CounterSet0_Tabular", &CheckSet0);
+
+// again: the same subset of counters as set 0 (Foo/Bar/Baz), new values.
+void BM_CounterSet1_Tabular(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+  namespace bm = benchmark;
+  state.counters.insert({
+      {"Foo", {15, bm::Counter::kAvgThreads}},
+      {"Bar", {25, bm::Counter::kAvgThreads}},
+      {"Baz", {45, bm::Counter::kAvgThreads}},
+  });
+}
+BENCHMARK(BM_CounterSet1_Tabular)->ThreadRange(1, 16);
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_CounterSet1_Tabular/threads:%int\",$"},
+           {"\"family_index\": 3,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_CounterSet1_Tabular/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet1_Tabular/threads:%int\",%csv_report,"
+                       "%float,,%float,%float,,"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckSet1(Results const& e) {
+  CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 15);
+  CHECK_COUNTER_VALUE(e, int, "Bar", EQ, 25);
+  CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 45);
+}
+CHECK_BENCHMARK_RESULTS("BM_CounterSet1_Tabular/threads:%int", &CheckSet1);
+
+// ========================================================================= //
+// ------------------------- Tabular Counters Output ----------------------- //
+// ========================================================================= //
+
+// set only some of the counters, different set now.
+void BM_CounterSet2_Tabular(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+  namespace bm = benchmark;
+  state.counters.insert({
+      {"Foo", {10, bm::Counter::kAvgThreads}},
+      {"Bat", {30, bm::Counter::kAvgThreads}},
+      {"Baz", {40, bm::Counter::kAvgThreads}},
+  });
+}
+BENCHMARK(BM_CounterSet2_Tabular)->ThreadRange(1, 16);
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_CounterSet2_Tabular/threads:%int\",$"},
+           {"\"family_index\": 4,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
+           {"\"run_name\": \"BM_CounterSet2_Tabular/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
+           {"\"repetition_index\": 0,$", MR_Next},
+           {"\"threads\": 1,$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet2_Tabular/threads:%int\",%csv_report,"
+                       ",%float,%float,%float,,"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckSet2(Results const& e) {
+  CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 10);
+  CHECK_COUNTER_VALUE(e, int, "Bat", EQ, 30);
+  CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 40);
+}
+CHECK_BENCHMARK_RESULTS("BM_CounterSet2_Tabular", &CheckSet2);
+
+// ========================================================================= //
+// --------------------------- TEST CASES END ------------------------------ //
+// ========================================================================= //
+
+int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
diff --git a/third_party/google_benchmark/test/user_counters_test.cc b/third_party/google_benchmark/src/test/user_counters_test.cc
similarity index 89%
rename from third_party/google_benchmark/test/user_counters_test.cc
rename to third_party/google_benchmark/src/test/user_counters_test.cc
index 5699f4f..4cd8ee3 100644
--- a/third_party/google_benchmark/test/user_counters_test.cc
+++ b/third_party/google_benchmark/src/test/user_counters_test.cc
@@ -26,15 +26,17 @@
   for (auto _ : state) {
   }
   state.counters["foo"] = 1;
-  state.counters["bar"] = 2 * (double)state.iterations();
+  state.counters["bar"] = 2 * static_cast<double>(state.iterations());
 }
 BENCHMARK(BM_Counters_Simple);
 ADD_CASES(TC_ConsoleOut,
           {{"^BM_Counters_Simple %console_report bar=%hrfloat foo=%hrfloat$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Simple\",$"},
+                       {"\"family_index\": 0,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Counters_Simple\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
-                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
                        {"\"repetition_index\": 0,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
@@ -65,7 +67,8 @@
 void BM_Counters_WithBytesAndItemsPSec(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = state.iterations();
+    benchmark::DoNotOptimize(iterations);
   }
   state.counters["foo"] = 1;
   state.counters["bar"] = ++num_calls1;
@@ -78,9 +81,11 @@
                            "foo=%hrfloat items_per_second=%hrfloat/s$"}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_WithBytesAndItemsPSec\",$"},
+           {"\"family_index\": 1,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_WithBytesAndItemsPSec\",$", MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
-           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
            {"\"repetition_index\": 0,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
@@ -114,7 +119,8 @@
 void BM_Counters_Rate(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = state.iterations();
+    benchmark::DoNotOptimize(iterations);
   }
   namespace bm = benchmark;
   state.counters["foo"] = bm::Counter{1, bm::Counter::kIsRate};
@@ -125,9 +131,11 @@
     TC_ConsoleOut,
     {{"^BM_Counters_Rate %console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Rate\",$"},
+                       {"\"family_index\": 2,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Counters_Rate\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
-                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
                        {"\"repetition_index\": 0,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
@@ -155,7 +163,8 @@
 void BM_Invert(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = state.iterations();
+    benchmark::DoNotOptimize(iterations);
   }
   namespace bm = benchmark;
   state.counters["foo"] = bm::Counter{0.0001, bm::Counter::kInvert};
@@ -165,9 +174,11 @@
 ADD_CASES(TC_ConsoleOut,
           {{"^BM_Invert %console_report bar=%hrfloatu foo=%hrfloatk$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Invert\",$"},
+                       {"\"family_index\": 3,$", MR_Next},
+                       {"\"per_family_instance_index\": 0,$", MR_Next},
                        {"\"run_name\": \"BM_Invert\",$", MR_Next},
                        {"\"run_type\": \"iteration\",$", MR_Next},
-                       {"\"repetitions\": 0,$", MR_Next},
+                       {"\"repetitions\": 1,$", MR_Next},
                        {"\"repetition_index\": 0,$", MR_Next},
                        {"\"threads\": 1,$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
@@ -187,14 +198,14 @@
 CHECK_BENCHMARK_RESULTS("BM_Invert", &CheckInvert);
 
 // ========================================================================= //
-// ------------------------- InvertedRate Counters Output
-// -------------------------- //
+// --------------------- InvertedRate Counters Output ---------------------- //
 // ========================================================================= //
 
 void BM_Counters_InvertedRate(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = state.iterations();
+    benchmark::DoNotOptimize(iterations);
   }
   namespace bm = benchmark;
   state.counters["foo"] =
@@ -207,9 +218,11 @@
                            "bar=%hrfloats foo=%hrfloats$"}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_InvertedRate\",$"},
+           {"\"family_index\": 4,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_InvertedRate\",$", MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
-           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
            {"\"repetition_index\": 0,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
@@ -246,9 +259,11 @@
                            "bar=%hrfloat foo=%hrfloat$"}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_Threads/threads:%int\",$"},
+           {"\"family_index\": 5,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_Threads/threads:%int\",$", MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
-           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
            {"\"repetition_index\": 0,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
@@ -285,9 +300,11 @@
                            "%console_report bar=%hrfloat foo=%hrfloat$"}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_AvgThreads/threads:%int\",$"},
+           {"\"family_index\": 6,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_AvgThreads/threads:%int\",$", MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
-           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
            {"\"repetition_index\": 0,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
@@ -316,7 +333,8 @@
 void BM_Counters_AvgThreadsRate(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = state.iterations();
+    benchmark::DoNotOptimize(iterations);
   }
   namespace bm = benchmark;
   state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgThreadsRate};
@@ -327,10 +345,12 @@
                            "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$"},
+           {"\"family_index\": 7,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$",
             MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
-           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
            {"\"repetition_index\": 0,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
@@ -367,9 +387,11 @@
                            "bar=%hrfloat foo=%hrfloat$"}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_IterationInvariant\",$"},
+           {"\"family_index\": 8,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_IterationInvariant\",$", MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
-           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
            {"\"repetition_index\": 0,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
@@ -399,7 +421,8 @@
 void BM_Counters_kIsIterationInvariantRate(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = state.iterations();
+    benchmark::DoNotOptimize(iterations);
   }
   namespace bm = benchmark;
   state.counters["foo"] =
@@ -412,10 +435,12 @@
                            "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_kIsIterationInvariantRate\",$"},
+           {"\"family_index\": 9,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_kIsIterationInvariantRate\",$",
             MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
-           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
            {"\"repetition_index\": 0,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
@@ -440,7 +465,7 @@
                         &CheckIsIterationInvariantRate);
 
 // ========================================================================= //
-// ------------------- AvgIterations Counters Output ------------------ //
+// --------------------- AvgIterations Counters Output --------------------- //
 // ========================================================================= //
 
 void BM_Counters_AvgIterations(benchmark::State& state) {
@@ -455,9 +480,11 @@
                            "bar=%hrfloat foo=%hrfloat$"}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_AvgIterations\",$"},
+           {"\"family_index\": 10,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_AvgIterations\",$", MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
-           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
            {"\"repetition_index\": 0,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
@@ -480,13 +507,14 @@
 CHECK_BENCHMARK_RESULTS("BM_Counters_AvgIterations", &CheckAvgIterations);
 
 // ========================================================================= //
-// ----------------- AvgIterationsRate Counters Output ---------------- //
+// ------------------- AvgIterationsRate Counters Output ------------------- //
 // ========================================================================= //
 
 void BM_Counters_kAvgIterationsRate(benchmark::State& state) {
   for (auto _ : state) {
     // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
+    auto iterations = state.iterations();
+    benchmark::DoNotOptimize(iterations);
   }
   namespace bm = benchmark;
   state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgIterationsRate};
@@ -498,9 +526,11 @@
                            "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_kAvgIterationsRate\",$"},
+           {"\"family_index\": 11,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_kAvgIterationsRate\",$", MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
-           {"\"repetitions\": 0,$", MR_Next},
+           {"\"repetitions\": 1,$", MR_Next},
            {"\"repetition_index\": 0,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
diff --git a/third_party/google_benchmark/test/user_counters_thousands_test.cc b/third_party/google_benchmark/src/test/user_counters_thousands_test.cc
similarity index 92%
rename from third_party/google_benchmark/test/user_counters_thousands_test.cc
rename to third_party/google_benchmark/src/test/user_counters_thousands_test.cc
index 21d8285..a42683b 100644
--- a/third_party/google_benchmark/test/user_counters_thousands_test.cc
+++ b/third_party/google_benchmark/src/test/user_counters_thousands_test.cc
@@ -51,6 +51,8 @@
     });
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_Thousands/repeats:2\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
            {"\"repetitions\": 2,$", MR_Next},
@@ -68,6 +70,8 @@
            {"}", MR_Next}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_Thousands/repeats:2\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
            {"\"run_type\": \"iteration\",$", MR_Next},
            {"\"repetitions\": 2,$", MR_Next},
@@ -85,11 +89,14 @@
            {"}", MR_Next}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_Thousands/repeats:2_mean\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
            {"\"repetitions\": 2,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 2,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -102,11 +109,14 @@
            {"}", MR_Next}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_Thousands/repeats:2_median\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
            {"\"repetitions\": 2,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 2,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -119,11 +129,14 @@
            {"}", MR_Next}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_Thousands/repeats:2_stddev\",$"},
+           {"\"family_index\": 0,$", MR_Next},
+           {"\"per_family_instance_index\": 0,$", MR_Next},
            {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
            {"\"run_type\": \"aggregate\",$", MR_Next},
            {"\"repetitions\": 2,$", MR_Next},
            {"\"threads\": 1,$", MR_Next},
            {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"aggregate_unit\": \"time\",$", MR_Next},
            {"\"iterations\": 2,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
diff --git a/third_party/google_benchmark/src/tools/BUILD.bazel b/third_party/google_benchmark/src/tools/BUILD.bazel
new file mode 100644
index 0000000..5895883
--- /dev/null
+++ b/third_party/google_benchmark/src/tools/BUILD.bazel
@@ -0,0 +1,19 @@
+load("@py_deps//:requirements.bzl", "requirement")
+
+py_library(
+    name = "gbench",
+    srcs = glob(["gbench/*.py"]),
+    deps = [
+        requirement("numpy"),
+        requirement("scipy"),
+    ],
+)
+
+py_binary(
+    name = "compare",
+    srcs = ["compare.py"],
+    python_version = "PY3",  # compare.py is a Python 3 script (python3 shebang).
+    deps = [
+        ":gbench",
+    ],
+)
diff --git a/third_party/google_benchmark/tools/compare.py b/third_party/google_benchmark/src/tools/compare.py
similarity index 89%
rename from third_party/google_benchmark/tools/compare.py
rename to third_party/google_benchmark/src/tools/compare.py
index 539ace6..e5eeb24 100755
--- a/third_party/google_benchmark/tools/compare.py
+++ b/third_party/google_benchmark/src/tools/compare.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 import unittest
 """
@@ -7,26 +7,30 @@
 
 import argparse
 from argparse import ArgumentParser
+import json
 import sys
+import os
 import gbench
 from gbench import util, report
-from gbench.util import *
 
 
 def check_inputs(in1, in2, flags):
     """
     Perform checking on the user provided inputs and diagnose any abnormalities
     """
-    in1_kind, in1_err = classify_input_file(in1)
-    in2_kind, in2_err = classify_input_file(in2)
-    output_file = find_benchmark_flag('--benchmark_out=', flags)
-    output_type = find_benchmark_flag('--benchmark_out_format=', flags)
-    if in1_kind == IT_Executable and in2_kind == IT_Executable and output_file:
+    in1_kind, in1_err = util.classify_input_file(in1)
+    in2_kind, in2_err = util.classify_input_file(in2)
+    output_file = util.find_benchmark_flag('--benchmark_out=', flags)
+    output_type = util.find_benchmark_flag('--benchmark_out_format=', flags)
+    if in1_kind == util.IT_Executable and in2_kind == util.IT_Executable and output_file:
         print(("WARNING: '--benchmark_out=%s' will be passed to both "
                "benchmarks causing it to be overwritten") % output_file)
-    if in1_kind == IT_JSON and in2_kind == IT_JSON and len(flags) > 0:
-        print("WARNING: passing optional flags has no effect since both "
-              "inputs are JSON")
+    if in1_kind == util.IT_JSON and in2_kind == util.IT_JSON:
+        # When both sides are JSON the only supported flag is
+        # --benchmark_filter=
+        for flag in util.remove_benchmark_flags('--benchmark_filter=', flags):
+            print("WARNING: passing %s has no effect since both "
+                  "inputs are JSON" % flag)
     if output_type is not None and output_type != 'json':
         print(("ERROR: passing '--benchmark_out_format=%s' to 'compare.py`"
                " is not supported.") % output_type)
@@ -48,6 +52,20 @@
              "of repetitions. Do note that only the display is affected. "
              "Internally, all the actual runs are still used, e.g. for U test.")
 
+    parser.add_argument(
+        '--no-color',
+        dest='color',
+        default=True,
+        action="store_false",
+        help="Do not use colors in the terminal output"
+    )
+
+    parser.add_argument(
+        '-d',
+        '--dump_to_json',
+        dest='dump_to_json',
+        help="Additionally, dump benchmark comparison output to this file in JSON format.")
+
     utest = parser.add_argument_group()
     utest.add_argument(
         '--no-utest',
@@ -223,10 +241,10 @@
         options_contender = ['--benchmark_filter=%s' % filter_contender]
 
     # Run the benchmarks and report the results
-    json1 = json1_orig = gbench.util.run_or_load_benchmark(
-        test_baseline, benchmark_options + options_baseline)
-    json2 = json2_orig = gbench.util.run_or_load_benchmark(
-        test_contender, benchmark_options + options_contender)
+    json1 = json1_orig = gbench.util.sort_benchmark_results(gbench.util.run_or_load_benchmark(
+        test_baseline, benchmark_options + options_baseline))
+    json2 = json2_orig = gbench.util.sort_benchmark_results(gbench.util.run_or_load_benchmark(
+        test_contender, benchmark_options + options_contender))
 
     # Now, filter the benchmarks so that the difference report can work
     if filter_baseline and filter_contender:
@@ -236,14 +254,20 @@
         json2 = gbench.report.filter_benchmark(
             json2_orig, filter_contender, replacement)
 
-    # Diff and output
-    output_lines = gbench.report.generate_difference_report(
-        json1, json2, args.display_aggregates_only,
-        args.utest, args.utest_alpha)
+    diff_report = gbench.report.get_difference_report(
+        json1, json2, args.utest)
+    output_lines = gbench.report.print_difference_report(
+        diff_report,
+        args.display_aggregates_only,
+        args.utest, args.utest_alpha, args.color)
     print(description)
     for ln in output_lines:
         print(ln)
 
+    # Optionally, dump the computed difference report to a file as JSON
+    if args.dump_to_json is not None:
+        with open(args.dump_to_json, 'w') as f_json:
+            json.dump(diff_report, f_json)
 
 class TestParser(unittest.TestCase):
     def setUp(self):
diff --git a/third_party/google_benchmark/tools/gbench/Inputs/test1_run1.json b/third_party/google_benchmark/src/tools/gbench/Inputs/test1_run1.json
similarity index 93%
rename from third_party/google_benchmark/tools/gbench/Inputs/test1_run1.json
rename to third_party/google_benchmark/src/tools/gbench/Inputs/test1_run1.json
index 601e327..9daed0b 100644
--- a/third_party/google_benchmark/tools/gbench/Inputs/test1_run1.json
+++ b/third_party/google_benchmark/src/tools/gbench/Inputs/test1_run1.json
@@ -114,6 +114,14 @@
       "real_time": 1,
       "cpu_time": 1,
       "time_unit": "s"
+    },
+    {
+      "name": "BM_hasLabel",
+      "label": "a label",
+      "iterations": 1,
+      "real_time": 1,
+      "cpu_time": 1,
+      "time_unit": "s"
     }
   ]
 }
diff --git a/third_party/google_benchmark/tools/gbench/Inputs/test1_run2.json b/third_party/google_benchmark/src/tools/gbench/Inputs/test1_run2.json
similarity index 94%
rename from third_party/google_benchmark/tools/gbench/Inputs/test1_run2.json
rename to third_party/google_benchmark/src/tools/gbench/Inputs/test1_run2.json
index 3cbcf39..dc52970 100644
--- a/third_party/google_benchmark/tools/gbench/Inputs/test1_run2.json
+++ b/third_party/google_benchmark/src/tools/gbench/Inputs/test1_run2.json
@@ -114,6 +114,14 @@
       "real_time": 1,
       "cpu_time": 1,
       "time_unit": "ns"
+    },
+    {
+      "name": "BM_hasLabel",
+      "label": "a label",
+      "iterations": 1,
+      "real_time": 1,
+      "cpu_time": 1,
+      "time_unit": "s"
     }
   ]
 }
diff --git a/third_party/google_benchmark/tools/gbench/Inputs/test2_run.json b/third_party/google_benchmark/src/tools/gbench/Inputs/test2_run.json
similarity index 100%
rename from third_party/google_benchmark/tools/gbench/Inputs/test2_run.json
rename to third_party/google_benchmark/src/tools/gbench/Inputs/test2_run.json
diff --git a/third_party/google_benchmark/tools/gbench/Inputs/test3_run0.json b/third_party/google_benchmark/src/tools/gbench/Inputs/test3_run0.json
similarity index 100%
rename from third_party/google_benchmark/tools/gbench/Inputs/test3_run0.json
rename to third_party/google_benchmark/src/tools/gbench/Inputs/test3_run0.json
diff --git a/third_party/google_benchmark/tools/gbench/Inputs/test3_run1.json b/third_party/google_benchmark/src/tools/gbench/Inputs/test3_run1.json
similarity index 100%
rename from third_party/google_benchmark/tools/gbench/Inputs/test3_run1.json
rename to third_party/google_benchmark/src/tools/gbench/Inputs/test3_run1.json
diff --git a/third_party/google_benchmark/src/tools/gbench/Inputs/test4_run.json b/third_party/google_benchmark/src/tools/gbench/Inputs/test4_run.json
new file mode 100644
index 0000000..eaa005f
--- /dev/null
+++ b/third_party/google_benchmark/src/tools/gbench/Inputs/test4_run.json
@@ -0,0 +1,96 @@
+{
+  "benchmarks": [
+    {
+      "name": "99 family 0 instance 0 repetition 0",
+      "run_type": "iteration",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "repetition_index": 0
+    },
+    {
+      "name": "98 family 0 instance 0 repetition 1",
+      "run_type": "iteration",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "repetition_index": 1
+    },
+    {
+      "name": "97 family 0 instance 0 aggregate",
+      "run_type": "aggregate",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "aggregate_name": "9 aggregate"
+    },
+
+
+    {
+      "name": "96 family 0 instance 1 repetition 0",
+      "run_type": "iteration",
+      "family_index": 0,
+      "per_family_instance_index": 1,
+      "repetition_index": 0
+    },
+    {
+      "name": "95 family 0 instance 1 repetition 1",
+      "run_type": "iteration",
+      "family_index": 0,
+      "per_family_instance_index": 1,
+      "repetition_index": 1
+    },
+    {
+      "name": "94 family 0 instance 1 aggregate",
+      "run_type": "aggregate",
+      "family_index": 0,
+      "per_family_instance_index": 1,
+      "aggregate_name": "9 aggregate"
+    },
+
+
+
+
+    {
+      "name": "93 family 1 instance 0 repetition 0",
+      "run_type": "iteration",
+      "family_index": 1,
+      "per_family_instance_index": 0,
+      "repetition_index": 0
+    },
+    {
+      "name": "92 family 1 instance 0 repetition 1",
+      "run_type": "iteration",
+      "family_index": 1,
+      "per_family_instance_index": 0,
+      "repetition_index": 1
+    },
+    {
+      "name": "91 family 1 instance 0 aggregate",
+      "run_type": "aggregate",
+      "family_index": 1,
+      "per_family_instance_index": 0,
+      "aggregate_name": "9 aggregate"
+    },
+
+
+    {
+      "name": "90 family 1 instance 1 repetition 0",
+      "run_type": "iteration",
+      "family_index": 1,
+      "per_family_instance_index": 1,
+      "repetition_index": 0
+    },
+    {
+      "name": "89 family 1 instance 1 repetition 1",
+      "run_type": "iteration",
+      "family_index": 1,
+      "per_family_instance_index": 1,
+      "repetition_index": 1
+    },
+    {
+      "name": "88 family 1 instance 1 aggregate",
+      "run_type": "aggregate",
+      "family_index": 1,
+      "per_family_instance_index": 1,
+      "aggregate_name": "9 aggregate"
+    }
+  ]
+}
diff --git a/third_party/google_benchmark/src/tools/gbench/Inputs/test4_run0.json b/third_party/google_benchmark/src/tools/gbench/Inputs/test4_run0.json
new file mode 100644
index 0000000..54cf127
--- /dev/null
+++ b/third_party/google_benchmark/src/tools/gbench/Inputs/test4_run0.json
@@ -0,0 +1,21 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "whocares",
+      "run_type": "aggregate",
+      "aggregate_name": "zz",
+      "aggregate_unit": "percentage",
+      "iterations": 1000,
+      "real_time": 0.01,
+      "cpu_time": 0.10,
+      "time_unit": "ns"
+    }
+  ]
+}
diff --git a/third_party/google_benchmark/src/tools/gbench/Inputs/test4_run1.json b/third_party/google_benchmark/src/tools/gbench/Inputs/test4_run1.json
new file mode 100644
index 0000000..25d5605
--- /dev/null
+++ b/third_party/google_benchmark/src/tools/gbench/Inputs/test4_run1.json
@@ -0,0 +1,21 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "whocares",
+      "run_type": "aggregate",
+      "aggregate_name": "zz",
+      "aggregate_unit": "percentage",
+      "iterations": 1000,
+      "real_time": 0.005,
+      "cpu_time": 0.15,
+      "time_unit": "ns"
+    }
+  ]
+}
diff --git a/third_party/google_benchmark/tools/gbench/__init__.py b/third_party/google_benchmark/src/tools/gbench/__init__.py
similarity index 100%
rename from third_party/google_benchmark/tools/gbench/__init__.py
rename to third_party/google_benchmark/src/tools/gbench/__init__.py
diff --git a/third_party/google_benchmark/src/tools/gbench/report.py b/third_party/google_benchmark/src/tools/gbench/report.py
new file mode 100644
index 0000000..b2bbfb9
--- /dev/null
+++ b/third_party/google_benchmark/src/tools/gbench/report.py
@@ -0,0 +1,1201 @@
+"""report.py - Utilities for reporting statistics about benchmark results
+"""
+
+import unittest
+import os
+import re
+import copy
+import random
+
+from scipy.stats import mannwhitneyu, gmean
+from numpy import array
+
+
+class BenchmarkColor(object):
+    """A named ANSI escape sequence; format() yields the raw escape code."""
+    def __init__(self, name, code):
+        self.name, self.code = name, code
+
+    def __repr__(self):
+        pair = (self.name, self.code)
+        return '%s%r' % (self.__class__.__name__, pair)
+
+    def __format__(self, format):
+        return self.code
+
+
+# Benchmark Colors Enumeration
+BC_NONE = BenchmarkColor('NONE', '')
+BC_MAGENTA = BenchmarkColor('MAGENTA', '\033[95m')
+BC_CYAN = BenchmarkColor('CYAN', '\033[96m')
+BC_OKBLUE = BenchmarkColor('OKBLUE', '\033[94m')
+BC_OKGREEN = BenchmarkColor('OKGREEN', '\033[32m')
+BC_HEADER = BenchmarkColor('HEADER', '\033[92m')
+BC_WARNING = BenchmarkColor('WARNING', '\033[93m')
+BC_WHITE = BenchmarkColor('WHITE', '\033[97m')
+BC_FAIL = BenchmarkColor('FAIL', '\033[91m')
+BC_ENDC = BenchmarkColor('ENDC', '\033[0m')
+BC_BOLD = BenchmarkColor('BOLD', '\033[1m')
+BC_UNDERLINE = BenchmarkColor('UNDERLINE', '\033[4m')
+
+UTEST_MIN_REPETITIONS = 2
+UTEST_OPTIMAL_REPETITIONS = 9  # Lowest reasonable number, More is better.
+UTEST_COL_NAME = "_pvalue"
+
+_TIME_UNIT_TO_SECONDS_MULTIPLIER = {
+    "s": 1.0,
+    "ms": 1e-3,
+    "us": 1e-6,
+    "ns": 1e-9,
+}
+
+
+def color_format(use_color, fmt_str, *args, **kwargs):
+    """
+    Format 'fmt_str' with 'args' and 'kwargs', honouring 'use_color'.
+
+    With 'use_color' False, every BenchmarkColor among the arguments is
+    swapped for BC_NONE (whose code is the empty string) before formatting.
+    """
+    assert use_color is True or use_color is False
+    if use_color:
+        return fmt_str.format(*args, **kwargs)
+    def strip(value):
+        return BC_NONE if isinstance(value, BenchmarkColor) else value
+    return fmt_str.format(*[strip(a) for a in args],
+                          **{key: strip(v) for key, v in kwargs.items()})
+
+
+def find_longest_name(benchmark_list):
+    """
+    Return the length of the longest 'name' among the given benchmark JSON
+    objects. The result is never below 1, even for an empty list.
+    """
+    # The leading 1 is the floor returned when there are no non-empty names.
+    name_lengths = [1]
+    for entry in benchmark_list:
+        name_lengths.append(len(entry['name']))
+    return max(name_lengths)
+
+
+def calculate_change(old_val, new_val):
+    """
+    Return the relative change from old_val to new_val as a float
+    (e.g. -0.5 when the value halved). A zero old_val uses the mean of both.
+    """
+    if old_val == 0:
+        return 0.0 if new_val == 0 else \
+            float(new_val - old_val) / (float(old_val + new_val) / 2)
+    return float(new_val - old_val) / abs(old_val)
+
+
+def filter_benchmark(json_orig, family, replacement=""):
+    """
+    Keep only the benchmarks whose name matches the regex 'family', and
+    substitute 'replacement' for every match of that regex in the kept names.
+    The input is left untouched: each kept entry is a deep copy.
+    """
+    pattern = re.compile(family)
+    kept = []
+    for bench in json_orig['benchmarks']:
+        if pattern.search(bench['name']):
+            renamed = copy.deepcopy(bench)  # never rename the caller's entry
+            renamed['name'] = pattern.sub(replacement, renamed['name'])
+            kept.append(renamed)
+    return {'benchmarks': kept}
+
+
+def get_unique_benchmark_names(json):
+    """Return each distinct benchmark 'name' once, in first-seen order."""
+    ordered = []
+    known = set()
+    for entry in json['benchmarks']:
+        if entry['name'] not in known:
+            known.add(entry['name'])
+            ordered.append(entry['name'])
+    return ordered
+
+
+def intersect(list1, list2):
+    """
+    Return the elements of list1 that also occur in list2, in list1's order
+    (duplicates in list1 are kept).
+    """
+    return list(filter(lambda item: item in list2, list1))
+
+
+def is_potentially_comparable_benchmark(x):
+    return all(key in x for key in ('time_unit', 'real_time', 'cpu_time'))
+
+
+def partition_benchmarks(json1, json2):
+    """
+    Group same-named benchmarks of json1 and json2, in json1's name order.
+    Returns a list of [lhs, rhs] pairs, one per name present in both inputs;
+    names with no comparable entry in json1 are skipped.
+    """
+    json1_unique_names = get_unique_benchmark_names(json1)
+    json2_unique_names = get_unique_benchmark_names(json2)
+    names = intersect(json1_unique_names, json2_unique_names)
+    partitions = []
+    for name in names:
+        time_unit = None
+        # Pick the time unit from the first entry of the lhs benchmark.
+        # We should be careful not to crash with unexpected input.
+        for x in json1['benchmarks']:
+            if (x['name'] == name and is_potentially_comparable_benchmark(x)):
+                time_unit = x['time_unit']
+                break
+        if time_unit is None:
+            continue
+        # Filter by name and time unit. .get() keeps a same-named entry that
+        # lacks 'time_unit' from raising KeyError; it is simply left out.
+        lhs = [x for x in json1['benchmarks'] if x['name'] == name and
+               x.get('time_unit') == time_unit]
+        rhs = [x for x in json2['benchmarks'] if x['name'] == name and
+               x.get('time_unit') == time_unit]
+        partitions.append([lhs, rhs])
+    return partitions
+
+
+def get_timedelta_field_as_seconds(benchmark, field_name):
+    """
+    Return benchmark[field_name] converted to seconds, using the benchmark's
+    'time_unit' (seconds are assumed when the benchmark has no 'time_unit').
+    """
+    value = benchmark[field_name]
+    unit = benchmark.get('time_unit', 's')
+    return value * _TIME_UNIT_TO_SECONDS_MULTIPLIER.get(unit)
+
+
+def calculate_geomean(json):
+    """
+    Geomean of the real/cpu times (in seconds) of all non-aggregate
+    benchmarks; entries lacking either timing are skipped, not fatal.
+    """
+    times = [[get_timedelta_field_as_seconds(benchmark, 'real_time'),
+              get_timedelta_field_as_seconds(benchmark, 'cpu_time')]
+             for benchmark in json['benchmarks']
+             if benchmark.get('run_type') != 'aggregate'
+             and 'real_time' in benchmark and 'cpu_time' in benchmark]
+    # gmean works column-wise: the result is array([real, cpu]).
+    return gmean(times) if times else array([])
+
+
+def extract_field(partition, field_name):
+    """Return [lhs_values, rhs_values] of field_name for both partition sides."""
+    # The two sides may differ in length; every entry of each side is kept.
+    return [[run[field_name] for run in partition[0]],
+            [run[field_name] for run in partition[1]]]
+
+
+def calc_utest(timings_cpu, timings_time):
+    """
+    Run a two-sided Mann-Whitney U test on the cpu and real-time samples.
+    Returns (have_optimal_repetitions, cpu_pvalue, time_pvalue), or
+    (False, None, None) when any sample is below UTEST_MIN_REPETITIONS.
+    """
+    samples = (timings_time[0], timings_time[1],
+               timings_cpu[0], timings_cpu[1])
+    fewest = min(len(sample) for sample in samples)
+    if fewest < UTEST_MIN_REPETITIONS:
+        return False, None, None
+
+    def pvalue(lhs, rhs):
+        return mannwhitneyu(lhs, rhs, alternative='two-sided').pvalue
+    time_pvalue, cpu_pvalue = pvalue(*samples[:2]), pvalue(*samples[2:])
+    return (fewest >= UTEST_OPTIMAL_REPETITIONS), cpu_pvalue, time_pvalue
+
+
+def print_utest(bc_name, utest, utest_alpha, first_col_width, use_color=True):
+    """
+    Render the U-test row for benchmark 'bc_name' as a list holding one
+    formatted line, or [] when 'utest' carries no p-values at all.
+    P-values at or above 'utest_alpha' are coloured as failures.
+    """
+    def get_utest_color(pval):
+        return BC_FAIL if pval >= utest_alpha else BC_OKGREEN
+
+    have_optimal = utest['have_optimal_repetitions']
+    # Too few repetitions for any test to have been run: nothing to print.
+    if (not have_optimal and utest['cpu_pvalue'] is None
+            and utest['time_pvalue'] is None):
+        return []
+    dsc = "U Test, Repetitions: {} vs {}".format(
+        utest['nr_of_repetitions'], utest['nr_of_repetitions_other'])
+    dsc_color = BC_OKGREEN
+    # We still got some results to show but issue a warning about it.
+    if not have_optimal:
+        dsc_color = BC_WARNING
+        dsc += ". WARNING: Results unreliable! {}+ repetitions recommended.".format(
+            UTEST_OPTIMAL_REPETITIONS)
+    # The trailing {endc} resets the description colour at end of line.
+    special_str = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}{endc}{}      {}{endc}"
+    name = "{}{}".format(bc_name, UTEST_COL_NAME)
+    time_pvalue, cpu_pvalue = utest['time_pvalue'], utest['cpu_pvalue']
+    return [color_format(use_color, special_str, BC_HEADER, name,
+                         first_col_width,
+                         get_utest_color(time_pvalue), time_pvalue,
+                         get_utest_color(cpu_pvalue), cpu_pvalue,
+                         dsc_color, dsc, endc=BC_ENDC)]
+
+
+def get_difference_report(json1, json2, utest=False):
+    """
+    Calculate the difference between each test of two benchmark runs
+    specified as 'json1' and 'json2'. Returns a list of dicts, one per
+    compared benchmark ('name', 'label', 'measurements', 'time_unit',
+    'run_type', 'aggregate_name', 'utest'), followed by an
+    'OVERALL_GEOMEAN' entry when both geomeans have a non-zero component.
+    """
+    assert utest is True or utest is False
+
+    diff_report = []
+    partitions = partition_benchmarks(json1, json2)
+    for partition in partitions:
+        benchmark_name = partition[0][0]['name']
+        label = partition[0][0]['label'] if 'label' in partition[0][0] else ''
+        time_unit = partition[0][0]['time_unit']
+        measurements = []
+        utest_results = {}
+        # Careful, we may have different repetition count.
+        for i in range(min(len(partition[0]), len(partition[1]))):
+            bn = partition[0][i]
+            other_bench = partition[1][i]
+            measurements.append({
+                'real_time': bn['real_time'],
+                'cpu_time': bn['cpu_time'],
+                'real_time_other': other_bench['real_time'],
+                'cpu_time_other': other_bench['cpu_time'],
+                'time': calculate_change(bn['real_time'], other_bench['real_time']),
+                'cpu': calculate_change(bn['cpu_time'], other_bench['cpu_time'])
+            })
+
+        # After processing the whole partition, if requested, do the U test.
+        if utest:
+            timings_cpu = extract_field(partition, 'cpu_time')
+            timings_time = extract_field(partition, 'real_time')
+            have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(
+                timings_cpu, timings_time)
+            # Compare against None: a p-value of exactly 0.0 is a valid result.
+            if cpu_pvalue is not None and time_pvalue is not None:
+                utest_results = {
+                    'have_optimal_repetitions': have_optimal_repetitions,
+                    'cpu_pvalue': cpu_pvalue,
+                    'time_pvalue': time_pvalue,
+                    'nr_of_repetitions': len(timings_cpu[0]),
+                    'nr_of_repetitions_other': len(timings_cpu[1])
+                }
+
+        # Store only if we had any measurements for given benchmark.
+        # E.g. partition_benchmarks will filter out the benchmarks having
+        # time units which are not compatible with other time units in the
+        # benchmark suite.
+        if measurements:
+            run_type = partition[0][0]['run_type'] if 'run_type' in partition[0][0] else ''
+            aggregate_name = partition[0][0]['aggregate_name'] if run_type == 'aggregate' and 'aggregate_name' in partition[0][0] else ''
+            diff_report.append({
+                'name': benchmark_name,
+                'label': label,
+                'measurements': measurements,
+                'time_unit': time_unit,
+                'run_type': run_type,
+                'aggregate_name': aggregate_name,
+                'utest': utest_results
+            })
+
+    lhs_gmean = calculate_geomean(json1)
+    rhs_gmean = calculate_geomean(json2)
+    if lhs_gmean.any() and rhs_gmean.any():
+        diff_report.append({
+            'name': 'OVERALL_GEOMEAN',
+            'label': '',
+            'measurements': [{
+                'real_time': lhs_gmean[0],
+                'cpu_time': lhs_gmean[1],
+                'real_time_other': rhs_gmean[0],
+                'cpu_time_other': rhs_gmean[1],
+                'time': calculate_change(lhs_gmean[0], rhs_gmean[0]),
+                'cpu': calculate_change(lhs_gmean[1], rhs_gmean[1])
+            }],
+            'time_unit': 's',
+            'run_type': 'aggregate',
+            'aggregate_name': 'geomean',
+            'utest': {}
+        })
+
+    return diff_report
+
+
+def print_difference_report(
+        json_diff_report,
+        include_aggregates_only=False,
+        utest=False,
+        utest_alpha=0.05,
+        use_color=True):
+    """
+    Render 'json_diff_report' (a list of per-benchmark diff entries) as a
+    table and return it as a list of lines: a header, a dashed rule, one row
+    per measurement and, when 'utest' is set, the U-test row of each
+    benchmark that has U-test results.
+
+    With 'include_aggregates_only' set, measurement rows are emitted only
+    for entries whose 'run_type' is 'aggregate' or that have no 'run_type'.
+    """
+    assert utest is True or utest is False
+
+    def get_color(res):
+        # > +5% is flagged as a failure, <= -7% as an improvement.
+        if res > 0.05:
+            return BC_FAIL
+        if res > -0.07:
+            return BC_WHITE
+        return BC_CYAN
+
+    widest_name = max(find_longest_name(json_diff_report), len('Benchmark'))
+    first_col_width = widest_name + len(UTEST_COL_NAME)
+    first_line = "{:<{}s}Time             CPU      Time Old      Time New       CPU Old       CPU New".format(
+        'Benchmark', 12 + first_col_width)
+    output_strs = [first_line, '-' * len(first_line)]
+
+    fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}"
+    for benchmark in json_diff_report:
+        # Entries without a 'run_type' are always shown; non-aggregates are
+        # hidden only when the caller asked for aggregates alone.
+        show_rows = (not include_aggregates_only
+                     or 'run_type' not in benchmark
+                     or benchmark['run_type'] == 'aggregate')
+        rows = benchmark['measurements'] if show_rows else []
+        for row in rows:
+            line = color_format(
+                use_color, fmt_str,
+                BC_HEADER,
+                benchmark['name'],
+                first_col_width,
+                get_color(row['time']), row['time'],
+                get_color(row['cpu']), row['cpu'],
+                row['real_time'], row['real_time_other'],
+                row['cpu_time'], row['cpu_time_other'],
+                endc=BC_ENDC)
+            output_strs.append(line)
+
+        # The U-test row is independent of the aggregate filter above.
+        if utest and benchmark['utest']:
+            output_strs += print_utest(benchmark['name'],
+                                       benchmark['utest'],
+                                       utest_alpha=utest_alpha,
+                                       first_col_width=first_col_width,
+                                       use_color=use_color)
+
+    return output_strs
+
+
+###############################################################################
+# Unit tests
+
+
+class TestGetUniqueBenchmarkNames(unittest.TestCase):
+    """get_unique_benchmark_names must keep first-seen order, not sort."""
+
+    def load_results(self):
+        """Load the Inputs/test3_run0.json fixture next to this file."""
+        import json
+        here = os.path.dirname(os.path.realpath(__file__))
+        fixture = os.path.join(os.path.join(here, 'Inputs'),
+                               'test3_run0.json')
+        with open(fixture, 'r') as f:
+            return json.load(f)
+
+    def test_basic(self):
+        expect_lines = [
+            'BM_One',
+            'BM_Two',
+            'short',  # These two are not sorted
+            'medium',  # These two are not sorted
+        ]
+        output_lines = get_unique_benchmark_names(self.load_results())
+        print("\n")
+        print("\n".join(output_lines))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        # Pairwise comparison; lengths were already checked to be equal.
+        for expected, actual in zip(expect_lines, output_lines):
+            self.assertEqual(expected, actual)
+
+
+class TestReportDifference(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        def load_results():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput1 = os.path.join(testInputs, 'test1_run1.json')
+            testOutput2 = os.path.join(testInputs, 'test1_run2.json')
+            with open(testOutput1, 'r') as f:
+                json1 = json.load(f)
+            with open(testOutput2, 'r') as f:
+                json2 = json.load(f)
+            return json1, json2
+
+        json1, json2 = load_results()
+        cls.json_diff_report = get_difference_report(json1, json2)
+
+    def test_json_diff_report_pretty_printing(self):
+        expect_lines = [
+            ['BM_SameTimes', '+0.0000', '+0.0000', '10', '10', '10', '10'],
+            ['BM_2xFaster', '-0.5000', '-0.5000', '50', '25', '50', '25'],
+            ['BM_2xSlower', '+1.0000', '+1.0000', '50', '100', '50', '100'],
+            ['BM_1PercentFaster', '-0.0100', '-0.0100', '100', '99', '100', '99'],
+            ['BM_1PercentSlower', '+0.0100', '+0.0100', '100', '101', '100', '101'],
+            ['BM_10PercentFaster', '-0.1000', '-0.1000', '100', '90', '100', '90'],
+            ['BM_10PercentSlower', '+0.1000', '+0.1000', '100', '110', '100', '110'],
+            ['BM_100xSlower', '+99.0000', '+99.0000',
+                '100', '10000', '100', '10000'],
+            ['BM_100xFaster', '-0.9900', '-0.9900',
+                '10000', '100', '10000', '100'],
+            ['BM_10PercentCPUToTime', '+0.1000',
+                '-0.1000', '100', '110', '100', '90'],
+            ['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'],
+            ['BM_NotBadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'],
+            ['BM_hasLabel', '+0.0000', '+0.0000', '1', '1', '1', '1'],
+            ['OVERALL_GEOMEAN', '-0.8113', '-0.7779', '0', '0', '0', '0']
+        ]
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report, use_color=False)
+        output_lines = output_lines_with_header[2:]
+        print("\n")
+        print("\n".join(output_lines_with_header))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            parts = [x for x in output_lines[i].split(' ') if x]
+            self.assertEqual(len(parts), 7)
+            self.assertEqual(expect_lines[i], parts)
+
+    def test_json_diff_report_output(self):
+        expected_output = [
+            {
+                'name': 'BM_SameTimes',
+                'label': '',
+                'measurements': [{'time': 0.0000, 'cpu': 0.0000,
+                                  'real_time': 10, 'real_time_other': 10,
+                                  'cpu_time': 10, 'cpu_time_other': 10}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_2xFaster',
+                'label': '',
+                'measurements': [{'time': -0.5000, 'cpu': -0.5000,
+                                  'real_time': 50, 'real_time_other': 25,
+                                  'cpu_time': 50, 'cpu_time_other': 25}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_2xSlower',
+                'label': '',
+                'measurements': [{'time': 1.0000, 'cpu': 1.0000,
+                                  'real_time': 50, 'real_time_other': 100,
+                                  'cpu_time': 50, 'cpu_time_other': 100}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_1PercentFaster',
+                'label': '',
+                'measurements': [{'time': -0.0100, 'cpu': -0.0100,
+                                  'real_time': 100, 'real_time_other': 98.9999999,
+                                  'cpu_time': 100, 'cpu_time_other': 98.9999999}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_1PercentSlower',
+                'label': '',
+                'measurements': [{'time': 0.0100, 'cpu': 0.0100,
+                                  'real_time': 100, 'real_time_other': 101,
+                                  'cpu_time': 100, 'cpu_time_other': 101}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_10PercentFaster',
+                'label': '',
+                'measurements': [{'time': -0.1000, 'cpu': -0.1000,
+                                  'real_time': 100, 'real_time_other': 90,
+                                  'cpu_time': 100, 'cpu_time_other': 90}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_10PercentSlower',
+                'label': '',
+                'measurements': [{'time': 0.1000, 'cpu': 0.1000,
+                                  'real_time': 100, 'real_time_other': 110,
+                                  'cpu_time': 100, 'cpu_time_other': 110}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_100xSlower',
+                'label': '',
+                'measurements': [{'time': 99.0000, 'cpu': 99.0000,
+                                  'real_time': 100, 'real_time_other': 10000,
+                                  'cpu_time': 100, 'cpu_time_other': 10000}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_100xFaster',
+                'label': '',
+                'measurements': [{'time': -0.9900, 'cpu': -0.9900,
+                                  'real_time': 10000, 'real_time_other': 100,
+                                  'cpu_time': 10000, 'cpu_time_other': 100}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_10PercentCPUToTime',
+                'label': '',
+                'measurements': [{'time': 0.1000, 'cpu': -0.1000,
+                                  'real_time': 100, 'real_time_other': 110,
+                                  'cpu_time': 100, 'cpu_time_other': 90}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_ThirdFaster',
+                'label': '',
+                'measurements': [{'time': -0.3333, 'cpu': -0.3334,
+                                  'real_time': 100, 'real_time_other': 67,
+                                  'cpu_time': 100, 'cpu_time_other': 67}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_NotBadTimeUnit',
+                'label': '',
+                'measurements': [{'time': -0.9000, 'cpu': 0.2000,
+                                  'real_time': 0.4, 'real_time_other': 0.04,
+                                  'cpu_time': 0.5, 'cpu_time_other': 0.6}],
+                'time_unit': 's',
+                'utest': {}
+            },
+            {
+                'name': 'BM_hasLabel',
+                'label': 'a label',
+                'measurements': [{'time': 0.0000, 'cpu': 0.0000,
+                                  'real_time': 1, 'real_time_other': 1,
+                                  'cpu_time': 1, 'cpu_time_other': 1}],
+                'time_unit': 's',
+                'utest': {}
+            },
+            {
+                'name': 'OVERALL_GEOMEAN',
+                'label': '',
+                'measurements': [{'real_time': 3.1622776601683826e-06, 'cpu_time': 3.2130844755623912e-06,
+                                  'real_time_other': 1.9768988699420897e-07, 'cpu_time_other': 2.397447755209533e-07,
+                                  'time': -0.8112976497120911, 'cpu': -0.7778551721181174}],
+                'time_unit': 's',
+                'run_type': 'aggregate',
+                'aggregate_name': 'geomean', 'utest': {}
+            },
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for out, expected in zip(
+                self.json_diff_report, expected_output):
+            self.assertEqual(out['name'], expected['name'])
+            self.assertEqual(out['label'], expected['label'])
+            self.assertEqual(out['time_unit'], expected['time_unit'])
+            assert_utest(self, out, expected)
+            assert_measurements(self, out, expected)
+
+
+class TestReportDifferenceBetweenFamilies(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        def load_result():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput = os.path.join(testInputs, 'test2_run.json')
+            with open(testOutput, 'r') as f:
+                json = json.load(f)
+            return json
+
+        json = load_result()
+        json1 = filter_benchmark(json, "BM_Z.ro", ".")
+        json2 = filter_benchmark(json, "BM_O.e", ".")
+        cls.json_diff_report = get_difference_report(json1, json2)
+
+    def test_json_diff_report_pretty_printing(self):
+        expect_lines = [
+            ['.', '-0.5000', '-0.5000', '10', '5', '10', '5'],
+            ['./4', '-0.5000', '-0.5000', '40', '20', '40', '20'],
+            ['Prefix/.', '-0.5000', '-0.5000', '20', '10', '20', '10'],
+            ['Prefix/./3', '-0.5000', '-0.5000', '30', '15', '30', '15'],
+            ['OVERALL_GEOMEAN', '-0.5000', '-0.5000', '0', '0', '0', '0']
+        ]
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report, use_color=False)
+        output_lines = output_lines_with_header[2:]
+        print("\n")
+        print("\n".join(output_lines_with_header))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            parts = [x for x in output_lines[i].split(' ') if x]
+            self.assertEqual(len(parts), 7)
+            self.assertEqual(expect_lines[i], parts)
+
+    def test_json_diff_report(self):
+        expected_output = [
+            {
+                'name': u'.',
+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 10, 'real_time_other': 5, 'cpu_time': 10, 'cpu_time_other': 5}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': u'./4',
+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 40, 'real_time_other': 20, 'cpu_time': 40, 'cpu_time_other': 20}],
+                'time_unit': 'ns',
+                'utest': {},
+            },
+            {
+                'name': u'Prefix/.',
+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 20, 'real_time_other': 10, 'cpu_time': 20, 'cpu_time_other': 10}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': u'Prefix/./3',
+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 30, 'real_time_other': 15, 'cpu_time': 30, 'cpu_time_other': 15}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'OVERALL_GEOMEAN',
+                'measurements': [{'real_time': 2.213363839400641e-08, 'cpu_time': 2.213363839400641e-08,
+                                  'real_time_other': 1.1066819197003185e-08, 'cpu_time_other': 1.1066819197003185e-08,
+                                  'time': -0.5000000000000009, 'cpu': -0.5000000000000009}],
+                'time_unit': 's',
+                'run_type': 'aggregate',
+                'aggregate_name': 'geomean',
+                'utest': {}
+            }
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for out, expected in zip(
+                self.json_diff_report, expected_output):
+            self.assertEqual(out['name'], expected['name'])
+            self.assertEqual(out['time_unit'], expected['time_unit'])
+            assert_utest(self, out, expected)
+            assert_measurements(self, out, expected)
+
+
+class TestReportDifferenceWithUTest(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        def load_results():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput1 = os.path.join(testInputs, 'test3_run0.json')
+            testOutput2 = os.path.join(testInputs, 'test3_run1.json')
+            with open(testOutput1, 'r') as f:
+                json1 = json.load(f)
+            with open(testOutput2, 'r') as f:
+                json2 = json.load(f)
+            return json1, json2
+
+        json1, json2 = load_results()
+        cls.json_diff_report = get_difference_report(
+            json1, json2, utest=True)
+
+    def test_json_diff_report_pretty_printing(self):
+        expect_lines = [
+            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
+            ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
+            ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
+            ['BM_Two_pvalue',
+             '1.0000',
+             '0.6667',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '2.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
+            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
+            ['short_pvalue',
+             '0.7671',
+             '0.2000',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '3.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
+            ['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0']
+        ]
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report, utest=True, utest_alpha=0.05, use_color=False)
+        output_lines = output_lines_with_header[2:]
+        print("\n")
+        print("\n".join(output_lines_with_header))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            parts = [x for x in output_lines[i].split(' ') if x]
+            self.assertEqual(expect_lines[i], parts)
+
+    def test_json_diff_report_pretty_printing_aggregates_only(self):
+        expect_lines = [
+            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
+            ['BM_Two_pvalue',
+             '1.0000',
+             '0.6667',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '2.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
+            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
+            ['short_pvalue',
+             '0.7671',
+             '0.2000',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '3.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0']
+        ]
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report, include_aggregates_only=True, utest=True, utest_alpha=0.05, use_color=False)
+        output_lines = output_lines_with_header[2:]
+        print("\n")
+        print("\n".join(output_lines_with_header))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            parts = [x for x in output_lines[i].split(' ') if x]
+            self.assertEqual(expect_lines[i], parts)
+
+    def test_json_diff_report(self):
+        expected_output = [
+            {
+                'name': u'BM_One',
+                'measurements': [
+                    {'time': -0.1,
+                     'cpu': 0.1,
+                     'real_time': 10,
+                     'real_time_other': 9,
+                     'cpu_time': 100,
+                     'cpu_time_other': 110}
+                ],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': u'BM_Two',
+                'measurements': [
+                    {'time': 0.1111111111111111,
+                     'cpu': -0.011111111111111112,
+                     'real_time': 9,
+                     'real_time_other': 10,
+                     'cpu_time': 90,
+                     'cpu_time_other': 89},
+                    {'time': -0.125, 'cpu': -0.16279069767441862, 'real_time': 8,
+                        'real_time_other': 7, 'cpu_time': 86, 'cpu_time_other': 72}
+                ],
+                'time_unit': 'ns',
+                'utest': {
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.6666666666666666, 'time_pvalue': 1.0
+                }
+            },
+            {
+                'name': u'short',
+                'measurements': [
+                    {'time': -0.125,
+                     'cpu': -0.0625,
+                     'real_time': 8,
+                     'real_time_other': 7,
+                     'cpu_time': 80,
+                     'cpu_time_other': 75},
+                    {'time': -0.4325,
+                     'cpu': -0.13506493506493514,
+                     'real_time': 8,
+                     'real_time_other': 4.54,
+                     'cpu_time': 77,
+                     'cpu_time_other': 66.6}
+                ],
+                'time_unit': 'ns',
+                'utest': {
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.2, 'time_pvalue': 0.7670968684102772
+                }
+            },
+            {
+                'name': u'medium',
+                'measurements': [
+                    {'time': -0.375,
+                     'cpu': -0.3375,
+                     'real_time': 8,
+                     'real_time_other': 5,
+                     'cpu_time': 80,
+                     'cpu_time_other': 53}
+                ],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'OVERALL_GEOMEAN',
+                'measurements': [{'real_time': 8.48528137423858e-09, 'cpu_time': 8.441336246629233e-08,
+                                  'real_time_other': 2.2405267593145244e-08, 'cpu_time_other': 2.5453661413660466e-08,
+                                  'time': 1.6404861082353634, 'cpu': -0.6984640740519662}],
+                'time_unit': 's',
+                'run_type': 'aggregate',
+                'aggregate_name': 'geomean',
+                'utest': {}
+            }
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for out, expected in zip(
+                self.json_diff_report, expected_output):
+            self.assertEqual(out['name'], expected['name'])
+            self.assertEqual(out['time_unit'], expected['time_unit'])
+            assert_utest(self, out, expected)
+            assert_measurements(self, out, expected)
+
+
+class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
+        unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        def load_results():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput1 = os.path.join(testInputs, 'test3_run0.json')
+            testOutput2 = os.path.join(testInputs, 'test3_run1.json')
+            with open(testOutput1, 'r') as f:
+                json1 = json.load(f)
+            with open(testOutput2, 'r') as f:
+                json2 = json.load(f)
+            return json1, json2
+
+        json1, json2 = load_results()
+        cls.json_diff_report = get_difference_report(
+            json1, json2, utest=True)
+
+    def test_json_diff_report_pretty_printing(self):
+        expect_lines = [
+            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
+            ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
+            ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
+            ['BM_Two_pvalue',
+             '1.0000',
+             '0.6667',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '2.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
+            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
+            ['short_pvalue',
+             '0.7671',
+             '0.2000',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '3.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
+            ['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0']
+        ]
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report,
+            utest=True, utest_alpha=0.05, use_color=False)
+        output_lines = output_lines_with_header[2:]
+        print("\n")
+        print("\n".join(output_lines_with_header))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            parts = [x for x in output_lines[i].split(' ') if x]
+            self.assertEqual(expect_lines[i], parts)
+
+    def test_json_diff_report(self):
+        expected_output = [
+            {
+                'name': u'BM_One',
+                'measurements': [
+                    {'time': -0.1,
+                     'cpu': 0.1,
+                     'real_time': 10,
+                     'real_time_other': 9,
+                     'cpu_time': 100,
+                     'cpu_time_other': 110}
+                ],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': u'BM_Two',
+                'measurements': [
+                    {'time': 0.1111111111111111,
+                     'cpu': -0.011111111111111112,
+                     'real_time': 9,
+                     'real_time_other': 10,
+                     'cpu_time': 90,
+                     'cpu_time_other': 89},
+                    {'time': -0.125, 'cpu': -0.16279069767441862, 'real_time': 8,
+                        'real_time_other': 7, 'cpu_time': 86, 'cpu_time_other': 72}
+                ],
+                'time_unit': 'ns',
+                'utest': {
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.6666666666666666, 'time_pvalue': 1.0
+                }
+            },
+            {
+                'name': u'short',
+                'measurements': [
+                    {'time': -0.125,
+                     'cpu': -0.0625,
+                     'real_time': 8,
+                     'real_time_other': 7,
+                     'cpu_time': 80,
+                     'cpu_time_other': 75},
+                    {'time': -0.4325,
+                     'cpu': -0.13506493506493514,
+                     'real_time': 8,
+                     'real_time_other': 4.54,
+                     'cpu_time': 77,
+                     'cpu_time_other': 66.6}
+                ],
+                'time_unit': 'ns',
+                'utest': {
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.2, 'time_pvalue': 0.7670968684102772
+                }
+            },
+            {
+                'name': u'medium',
+                'measurements': [
+                    {'real_time_other': 5,
+                     'cpu_time': 80,
+                     'time': -0.375,
+                     'real_time': 8,
+                     'cpu_time_other': 53,
+                     'cpu': -0.3375
+                     }
+                ],
+                'utest': {},
+                'time_unit': u'ns',
+                'aggregate_name': ''
+            },
+            {
+                'name': 'OVERALL_GEOMEAN',
+                'measurements': [{'real_time': 8.48528137423858e-09, 'cpu_time': 8.441336246629233e-08,
+                                  'real_time_other': 2.2405267593145244e-08, 'cpu_time_other': 2.5453661413660466e-08,
+                                  'time': 1.6404861082353634, 'cpu': -0.6984640740519662}],
+                'time_unit': 's',
+                'run_type': 'aggregate',
+                'aggregate_name': 'geomean',
+                'utest': {}
+            }
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for out, expected in zip(
+                self.json_diff_report, expected_output):
+            self.assertEqual(out['name'], expected['name'])
+            self.assertEqual(out['time_unit'], expected['time_unit'])
+            assert_utest(self, out, expected)
+            assert_measurements(self, out, expected)
+
+
+class TestReportDifferenceForPercentageAggregates(
+        unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        def load_results():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput1 = os.path.join(testInputs, 'test4_run0.json')
+            testOutput2 = os.path.join(testInputs, 'test4_run1.json')
+            with open(testOutput1, 'r') as f:
+                json1 = json.load(f)
+            with open(testOutput2, 'r') as f:
+                json2 = json.load(f)
+            return json1, json2
+
+        json1, json2 = load_results()
+        cls.json_diff_report = get_difference_report(
+            json1, json2, utest=True)
+
+    def test_json_diff_report_pretty_printing(self):
+        expect_lines = [
+            ['whocares', '-0.5000', '+0.5000', '0', '0', '0', '0']
+        ]
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report,
+            utest=True, utest_alpha=0.05, use_color=False)
+        output_lines = output_lines_with_header[2:]
+        print("\n")
+        print("\n".join(output_lines_with_header))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            parts = [x for x in output_lines[i].split(' ') if x]
+            self.assertEqual(expect_lines[i], parts)
+
+    def test_json_diff_report(self):
+        expected_output = [
+            {
+                'name': u'whocares',
+                'measurements': [
+                    {'time': -0.5,
+                     'cpu': 0.5,
+                     'real_time': 0.01,
+                     'real_time_other': 0.005,
+                     'cpu_time': 0.10,
+                     'cpu_time_other': 0.15}
+                ],
+                'time_unit': 'ns',
+                'utest': {}
+            }
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for out, expected in zip(
+                self.json_diff_report, expected_output):
+            self.assertEqual(out['name'], expected['name'])
+            self.assertEqual(out['time_unit'], expected['time_unit'])
+            assert_utest(self, out, expected)
+            assert_measurements(self, out, expected)
+
+
+class TestReportSorting(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        def load_result():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput = os.path.join(testInputs, 'test4_run.json')
+            with open(testOutput, 'r') as f:
+                json = json.load(f)
+            return json
+
+        cls.json = load_result()
+
+    def test_json_diff_report_pretty_printing(self):
+        import util
+
+        expected_names = [
+            "99 family 0 instance 0 repetition 0",
+            "98 family 0 instance 0 repetition 1",
+            "97 family 0 instance 0 aggregate",
+            "96 family 0 instance 1 repetition 0",
+            "95 family 0 instance 1 repetition 1",
+            "94 family 0 instance 1 aggregate",
+            "93 family 1 instance 0 repetition 0",
+            "92 family 1 instance 0 repetition 1",
+            "91 family 1 instance 0 aggregate",
+            "90 family 1 instance 1 repetition 0",
+            "89 family 1 instance 1 repetition 1",
+            "88 family 1 instance 1 aggregate"
+        ]
+
+        for n in range(len(self.json['benchmarks']) ** 2):
+            random.shuffle(self.json['benchmarks'])
+            sorted_benchmarks = util.sort_benchmark_results(self.json)[
+                'benchmarks']
+            self.assertEqual(len(expected_names), len(sorted_benchmarks))
+            for out, expected in zip(sorted_benchmarks, expected_names):
+                self.assertEqual(out['name'], expected)
+
+
+def assert_utest(unittest_instance, lhs, rhs):
+    """Check that the 'utest' entries of two report rows agree."""
+    actual, expected = lhs['utest'], rhs['utest']
+    if not actual:
+        # Nothing was computed for lhs, so rhs must be empty as well.
+        unittest_instance.assertEqual(actual, expected)
+        return
+    # p-values are floating point results: compare approximately.
+    for pvalue_key in ('cpu_pvalue', 'time_pvalue'):
+        unittest_instance.assertAlmostEqual(
+            actual[pvalue_key], expected[pvalue_key])
+    unittest_instance.assertEqual(
+        actual['have_optimal_repetitions'],
+        expected['have_optimal_repetitions'])
+
+
+def assert_measurements(unittest_instance, lhs, rhs):
+    """Compare the paired 'measurements' of two report rows."""
+    pairs = zip(lhs['measurements'], rhs['measurements'])
+    for got, want in pairs:
+        for key in ('real_time', 'cpu_time'):
+            unittest_instance.assertEqual(got[key], want[key])
+        for key in ('time', 'cpu'):  # calculated values: almost-equal
+            unittest_instance.assertAlmostEqual(got[key], want[key], places=4)
+
+
+if __name__ == '__main__':
+    unittest.main()
+
+# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
+# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
+# kate: indent-mode python; remove-trailing-spaces modified;
diff --git a/third_party/google_benchmark/tools/gbench/util.py b/third_party/google_benchmark/src/tools/gbench/util.py
similarity index 71%
rename from third_party/google_benchmark/tools/gbench/util.py
rename to third_party/google_benchmark/src/tools/gbench/util.py
index 1f8e8e2..5e79da8 100644
--- a/third_party/google_benchmark/tools/gbench/util.py
+++ b/third_party/google_benchmark/src/tools/gbench/util.py
@@ -2,9 +2,11 @@
 """
 import json
 import os
-import tempfile
+import re
 import subprocess
 import sys
+import tempfile
+
 
 # Input file type enumeration
 IT_Invalid = 0
@@ -57,7 +59,7 @@
     """
     Return a tuple (type, msg) where 'type' specifies the classified type
     of 'filename'. If 'type' is 'IT_Invalid' then 'msg' is a human readable
-    string represeting the error.
+    string representing the error.
     """
     ftype = IT_Invalid
     err_msg = None
@@ -110,13 +112,49 @@
     return [f for f in benchmark_flags if not f.startswith(prefix)]
 
 
-def load_benchmark_results(fname):
+def load_benchmark_results(fname, benchmark_filter=None):
     """
     Read benchmark output from a file and return the JSON object.
+
+    Apply benchmark_filter, a regular expression, with nearly the same
+    semantics as the --benchmark_filter argument.  None (the default)
+    keeps every benchmark.  Note: the Python regular expression engine
+    is used instead of the one used by the C++ code, which may produce
+    different results in complex cases.
+
     REQUIRES: 'fname' names a file containing JSON benchmark output.
     """
+    def benchmark_wanted(benchmark):
+        """Return True if 'benchmark' passes benchmark_filter."""
+        if benchmark_filter is None:
+            return True
+        # Prefer 'run_name'; fall back to 'name' if absent or empty.
+        name = benchmark.get('run_name', None) or benchmark['name']
+        return re.search(benchmark_filter, name) is not None
+
     with open(fname, 'r') as f:
-        return json.load(f)
+        results = json.load(f)
+    # Filter once the file is closed; other output passes through as is.
+    if 'benchmarks' in results:
+        results['benchmarks'] = [b for b in results['benchmarks']
+                                 if benchmark_wanted(b)]
+    return results
+
+
+def sort_benchmark_results(result):
+    """
+    Sort result['benchmarks'] by family index, then per-family instance
+    index, then run kind (aggregates after other runs), then repetition
+    index, and return 'result' with the sorted list stored back into it.
+    A missing index key is treated as -1.
+    """
+    def sort_key(benchmark):
+        return (benchmark.get('family_index', -1),
+                benchmark.get('per_family_instance_index', -1),
+                1 if benchmark.get('run_type') == "aggregate" else 0,
+                benchmark.get('repetition_index', -1))
+
+    result['benchmarks'] = sorted(result['benchmarks'], key=sort_key)
+    return result
 
 
 def run_benchmark(exe_name, benchmark_flags):
@@ -142,7 +180,7 @@
     if exitCode != 0:
         print('TEST FAILED...')
         sys.exit(exitCode)
-    json_res = load_benchmark_results(output_name)
+    json_res = load_benchmark_results(output_name, None)
     if is_temp_output:
         os.unlink(output_name)
     return json_res
@@ -157,8 +195,9 @@
     """
     ftype = check_input_file(filename)
     if ftype == IT_JSON:
-        return load_benchmark_results(filename)
-    elif ftype == IT_Executable:
+        benchmark_filter = find_benchmark_flag('--benchmark_filter=',
+                                               benchmark_flags)
+        return load_benchmark_results(filename, benchmark_filter)
+    if ftype == IT_Executable:
         return run_benchmark(filename, benchmark_flags)
-    else:
-        assert False  # This branch is unreachable
+    raise ValueError('Unknown file type %s' % ftype)
diff --git a/third_party/google_benchmark/src/tools/libpfm.BUILD.bazel b/third_party/google_benchmark/src/tools/libpfm.BUILD.bazel
new file mode 100644
index 0000000..6269534
--- /dev/null
+++ b/third_party/google_benchmark/src/tools/libpfm.BUILD.bazel
@@ -0,0 +1,22 @@
+# Build rule for libpfm, which is required to collect performance counters for
+# BENCHMARK_ENABLE_LIBPFM builds.
+
+load("@rules_foreign_cc//foreign_cc:defs.bzl", "make")
+
+filegroup(
+    name = "pfm_srcs",
+    srcs = glob(["**"]),
+)
+
+make(
+    name = "libpfm",
+    lib_source = ":pfm_srcs",
+    lib_name = "libpfm",
+    copts = [
+        "-Wno-format-truncation",
+        "-Wno-use-after-free",
+    ],
+    visibility = [
+        "//visibility:public",
+    ],
+)
diff --git a/third_party/google_benchmark/src/tools/requirements.txt b/third_party/google_benchmark/src/tools/requirements.txt
new file mode 100644
index 0000000..3b3331b
--- /dev/null
+++ b/third_party/google_benchmark/src/tools/requirements.txt
@@ -0,0 +1 @@
+scipy>=1.5.0
\ No newline at end of file
diff --git a/third_party/google_benchmark/tools/strip_asm.py b/third_party/google_benchmark/src/tools/strip_asm.py
similarity index 99%
rename from third_party/google_benchmark/tools/strip_asm.py
rename to third_party/google_benchmark/src/tools/strip_asm.py
index 9030550..d131dc7 100755
--- a/third_party/google_benchmark/tools/strip_asm.py
+++ b/third_party/google_benchmark/src/tools/strip_asm.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 """
 strip_asm.py - Cleanup ASM output for the specified file
diff --git a/third_party/google_benchmark/test/BUILD b/third_party/google_benchmark/test/BUILD
deleted file mode 100644
index 9bb8cb0..0000000
--- a/third_party/google_benchmark/test/BUILD
+++ /dev/null
@@ -1,73 +0,0 @@
-TEST_COPTS = [
-    "-pedantic",
-    "-pedantic-errors",
-    "-std=c++11",
-    "-Wall",
-    "-Wextra",
-    "-Wshadow",
-    #    "-Wshorten-64-to-32",
-    "-Wfloat-equal",
-    "-fstrict-aliasing",
-]
-
-PER_SRC_COPTS = ({
-    "cxx03_test.cc": ["-std=c++03"],
-    # Some of the issues with DoNotOptimize only occur when optimization is enabled
-    "donotoptimize_test.cc": ["-O3"],
-})
-
-TEST_ARGS = ["--benchmark_min_time=0.01"]
-
-PER_SRC_TEST_ARGS = ({
-    "user_counters_tabular_test.cc": ["--benchmark_counters_tabular=true"],
-})
-
-load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
-
-cc_library(
-    name = "output_test_helper",
-    testonly = 1,
-    srcs = ["output_test_helper.cc"],
-    hdrs = ["output_test.h"],
-    copts = TEST_COPTS,
-    deps = [
-        "//:benchmark",
-        "//:benchmark_internal_headers",
-    ],
-)
-
-[
-    cc_test(
-        name = test_src[:-len(".cc")],
-        size = "small",
-        srcs = [test_src],
-        args = TEST_ARGS + PER_SRC_TEST_ARGS.get(test_src, []),
-        copts = TEST_COPTS + PER_SRC_COPTS.get(test_src, []),
-        deps = [
-            ":output_test_helper",
-            "//:benchmark",
-            "//:benchmark_internal_headers",
-            "@com_google_googletest//:gtest",
-        ] + (
-            ["@com_google_googletest//:gtest_main"] if (test_src[-len("gtest.cc"):] == "gtest.cc") else []
-        ),
-        # FIXME: Add support for assembly tests to bazel.
-        # See Issue #556
-        # https://github.com/google/benchmark/issues/556
-    )
-    for test_src in glob(
-        ["*test.cc"],
-        exclude = [
-            "*_assembly_test.cc",
-            "link_main_test.cc",
-        ],
-    )
-]
-
-cc_test(
-    name = "link_main_test",
-    size = "small",
-    srcs = ["link_main_test.cc"],
-    copts = TEST_COPTS,
-    deps = ["//:benchmark_main"],
-)
diff --git a/third_party/google_benchmark/test/basic_test.cc b/third_party/google_benchmark/test/basic_test.cc
deleted file mode 100644
index 5f3dd1a..0000000
--- a/third_party/google_benchmark/test/basic_test.cc
+++ /dev/null
@@ -1,136 +0,0 @@
-
-#include "benchmark/benchmark.h"
-
-#define BASIC_BENCHMARK_TEST(x) BENCHMARK(x)->Arg(8)->Arg(512)->Arg(8192)
-
-void BM_empty(benchmark::State& state) {
-  for (auto _ : state) {
-    benchmark::DoNotOptimize(state.iterations());
-  }
-}
-BENCHMARK(BM_empty);
-BENCHMARK(BM_empty)->ThreadPerCpu();
-
-void BM_spin_empty(benchmark::State& state) {
-  for (auto _ : state) {
-    for (int x = 0; x < state.range(0); ++x) {
-      benchmark::DoNotOptimize(x);
-    }
-  }
-}
-BASIC_BENCHMARK_TEST(BM_spin_empty);
-BASIC_BENCHMARK_TEST(BM_spin_empty)->ThreadPerCpu();
-
-void BM_spin_pause_before(benchmark::State& state) {
-  for (int i = 0; i < state.range(0); ++i) {
-    benchmark::DoNotOptimize(i);
-  }
-  for (auto _ : state) {
-    for (int i = 0; i < state.range(0); ++i) {
-      benchmark::DoNotOptimize(i);
-    }
-  }
-}
-BASIC_BENCHMARK_TEST(BM_spin_pause_before);
-BASIC_BENCHMARK_TEST(BM_spin_pause_before)->ThreadPerCpu();
-
-void BM_spin_pause_during(benchmark::State& state) {
-  for (auto _ : state) {
-    state.PauseTiming();
-    for (int i = 0; i < state.range(0); ++i) {
-      benchmark::DoNotOptimize(i);
-    }
-    state.ResumeTiming();
-    for (int i = 0; i < state.range(0); ++i) {
-      benchmark::DoNotOptimize(i);
-    }
-  }
-}
-BASIC_BENCHMARK_TEST(BM_spin_pause_during);
-BASIC_BENCHMARK_TEST(BM_spin_pause_during)->ThreadPerCpu();
-
-void BM_pause_during(benchmark::State& state) {
-  for (auto _ : state) {
-    state.PauseTiming();
-    state.ResumeTiming();
-  }
-}
-BENCHMARK(BM_pause_during);
-BENCHMARK(BM_pause_during)->ThreadPerCpu();
-BENCHMARK(BM_pause_during)->UseRealTime();
-BENCHMARK(BM_pause_during)->UseRealTime()->ThreadPerCpu();
-
-void BM_spin_pause_after(benchmark::State& state) {
-  for (auto _ : state) {
-    for (int i = 0; i < state.range(0); ++i) {
-      benchmark::DoNotOptimize(i);
-    }
-  }
-  for (int i = 0; i < state.range(0); ++i) {
-    benchmark::DoNotOptimize(i);
-  }
-}
-BASIC_BENCHMARK_TEST(BM_spin_pause_after);
-BASIC_BENCHMARK_TEST(BM_spin_pause_after)->ThreadPerCpu();
-
-void BM_spin_pause_before_and_after(benchmark::State& state) {
-  for (int i = 0; i < state.range(0); ++i) {
-    benchmark::DoNotOptimize(i);
-  }
-  for (auto _ : state) {
-    for (int i = 0; i < state.range(0); ++i) {
-      benchmark::DoNotOptimize(i);
-    }
-  }
-  for (int i = 0; i < state.range(0); ++i) {
-    benchmark::DoNotOptimize(i);
-  }
-}
-BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after);
-BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after)->ThreadPerCpu();
-
-void BM_empty_stop_start(benchmark::State& state) {
-  for (auto _ : state) {
-  }
-}
-BENCHMARK(BM_empty_stop_start);
-BENCHMARK(BM_empty_stop_start)->ThreadPerCpu();
-
-
-void BM_KeepRunning(benchmark::State& state) {
-  benchmark::IterationCount iter_count = 0;
-  assert(iter_count == state.iterations());
-  while (state.KeepRunning()) {
-    ++iter_count;
-  }
-  assert(iter_count == state.iterations());
-}
-BENCHMARK(BM_KeepRunning);
-
-void BM_KeepRunningBatch(benchmark::State& state) {
-  // Choose a prime batch size to avoid evenly dividing max_iterations.
-  const benchmark::IterationCount batch_size = 101;
-  benchmark::IterationCount iter_count = 0;
-  while (state.KeepRunningBatch(batch_size)) {
-    iter_count += batch_size;
-  }
-  assert(state.iterations() == iter_count);
-}
-BENCHMARK(BM_KeepRunningBatch);
-
-void BM_RangedFor(benchmark::State& state) {
-  benchmark::IterationCount iter_count = 0;
-  for (auto _ : state) {
-    ++iter_count;
-  }
-  assert(iter_count == state.max_iterations);
-}
-BENCHMARK(BM_RangedFor);
-
-// Ensure that StateIterator provides all the necessary typedefs required to
-// instantiate std::iterator_traits.
-static_assert(std::is_same<
-  typename std::iterator_traits<benchmark::State::StateIterator>::value_type,
-  typename benchmark::State::StateIterator::value_type>::value, "");
-
-BENCHMARK_MAIN();
diff --git a/third_party/google_benchmark/test/commandlineflags_gtest.cc b/third_party/google_benchmark/test/commandlineflags_gtest.cc
deleted file mode 100644
index 36bdb44..0000000
--- a/third_party/google_benchmark/test/commandlineflags_gtest.cc
+++ /dev/null
@@ -1,201 +0,0 @@
-#include <cstdlib>
-
-#include "../src/commandlineflags.h"
-#include "../src/internal_macros.h"
-#include "gtest/gtest.h"
-
-namespace benchmark {
-namespace {
-
-#if defined(BENCHMARK_OS_WINDOWS)
-int setenv(const char* name, const char* value, int overwrite) {
-  if (!overwrite) {
-    // NOTE: getenv_s is far superior but not available under mingw.
-    char* env_value = getenv(name);
-    if (env_value == nullptr) {
-      return -1;
-    }
-  }
-  return _putenv_s(name, value);
-}
-
-int unsetenv(const char* name) {
-  return _putenv_s(name, "");
-}
-
-#endif  // BENCHMARK_OS_WINDOWS
-
-TEST(BoolFromEnv, Default) {
-  ASSERT_EQ(unsetenv("BENCHMARK_NOT_IN_ENV"), 0);
-  EXPECT_EQ(BoolFromEnv("not_in_env", true), true);
-}
-
-TEST(BoolFromEnv, False) {
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "0", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", true), false);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "N", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", true), false);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "n", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", true), false);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "NO", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", true), false);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "No", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", true), false);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "no", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", true), false);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "F", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", true), false);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "f", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", true), false);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "FALSE", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", true), false);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "False", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", true), false);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "false", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", true), false);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "OFF", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", true), false);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "Off", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", true), false);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "off", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", true), false);
-  unsetenv("BENCHMARK_IN_ENV");
-}
-
-TEST(BoolFromEnv, True) {
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "1", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", false), true);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "Y", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", false), true);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "y", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", false), true);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "YES", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", false), true);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "Yes", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", false), true);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "yes", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", false), true);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "T", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", false), true);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "t", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", false), true);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "TRUE", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", false), true);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "True", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", false), true);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "true", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", false), true);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "ON", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", false), true);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "On", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", false), true);
-  unsetenv("BENCHMARK_IN_ENV");
-
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "on", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", false), true);
-  unsetenv("BENCHMARK_IN_ENV");
-
-#ifndef BENCHMARK_OS_WINDOWS
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "", 1), 0);
-  EXPECT_EQ(BoolFromEnv("in_env", false), true);
-  unsetenv("BENCHMARK_IN_ENV");
-#endif
-}
-
-TEST(Int32FromEnv, NotInEnv) {
-  ASSERT_EQ(unsetenv("BENCHMARK_NOT_IN_ENV"), 0);
-  EXPECT_EQ(Int32FromEnv("not_in_env", 42), 42);
-}
-
-TEST(Int32FromEnv, InvalidInteger) {
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "foo", 1), 0);
-  EXPECT_EQ(Int32FromEnv("in_env", 42), 42);
-  unsetenv("BENCHMARK_IN_ENV");
-}
-
-TEST(Int32FromEnv, ValidInteger) {
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "42", 1), 0);
-  EXPECT_EQ(Int32FromEnv("in_env", 64), 42);
-  unsetenv("BENCHMARK_IN_ENV");
-}
-
-TEST(DoubleFromEnv, NotInEnv) {
-  ASSERT_EQ(unsetenv("BENCHMARK_NOT_IN_ENV"), 0);
-  EXPECT_EQ(DoubleFromEnv("not_in_env", 0.51), 0.51);
-}
-
-TEST(DoubleFromEnv, InvalidReal) {
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "foo", 1), 0);
-  EXPECT_EQ(DoubleFromEnv("in_env", 0.51), 0.51);
-  unsetenv("BENCHMARK_IN_ENV");
-}
-
-TEST(DoubleFromEnv, ValidReal) {
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "0.51", 1), 0);
-  EXPECT_EQ(DoubleFromEnv("in_env", 0.71), 0.51);
-  unsetenv("BENCHMARK_IN_ENV");
-}
-
-TEST(StringFromEnv, Default) {
-  ASSERT_EQ(unsetenv("BENCHMARK_NOT_IN_ENV"), 0);
-  EXPECT_STREQ(StringFromEnv("not_in_env", "foo"), "foo");
-}
-
-TEST(StringFromEnv, Valid) {
-  ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "foo", 1), 0);
-  EXPECT_STREQ(StringFromEnv("in_env", "bar"), "foo");
-  unsetenv("BENCHMARK_IN_ENV");
-}
-
-}  // namespace
-}  // namespace benchmark
diff --git a/third_party/google_benchmark/test/fixture_test.cc b/third_party/google_benchmark/test/fixture_test.cc
deleted file mode 100644
index 1462b10..0000000
--- a/third_party/google_benchmark/test/fixture_test.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-
-#include "benchmark/benchmark.h"
-
-#include <cassert>
-#include <memory>
-
-class MyFixture : public ::benchmark::Fixture {
- public:
-  void SetUp(const ::benchmark::State& state) {
-    if (state.thread_index == 0) {
-      assert(data.get() == nullptr);
-      data.reset(new int(42));
-    }
-  }
-
-  void TearDown(const ::benchmark::State& state) {
-    if (state.thread_index == 0) {
-      assert(data.get() != nullptr);
-      data.reset();
-    }
-  }
-
-  ~MyFixture() { assert(data == nullptr); }
-
-  std::unique_ptr<int> data;
-};
-
-BENCHMARK_F(MyFixture, Foo)(benchmark::State &st) {
-  assert(data.get() != nullptr);
-  assert(*data == 42);
-  for (auto _ : st) {
-  }
-}
-
-BENCHMARK_DEFINE_F(MyFixture, Bar)(benchmark::State& st) {
-  if (st.thread_index == 0) {
-    assert(data.get() != nullptr);
-    assert(*data == 42);
-  }
-  for (auto _ : st) {
-    assert(data.get() != nullptr);
-    assert(*data == 42);
-  }
-  st.SetItemsProcessed(st.range(0));
-}
-BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42);
-BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42)->ThreadPerCpu();
-
-BENCHMARK_MAIN();
diff --git a/third_party/google_benchmark/test/string_util_gtest.cc b/third_party/google_benchmark/test/string_util_gtest.cc
deleted file mode 100644
index 01bf155..0000000
--- a/third_party/google_benchmark/test/string_util_gtest.cc
+++ /dev/null
@@ -1,153 +0,0 @@
-//===---------------------------------------------------------------------===//
-// statistics_test - Unit tests for src/statistics.cc
-//===---------------------------------------------------------------------===//
-
-#include "../src/string_util.h"
-#include "../src/internal_macros.h"
-#include "gtest/gtest.h"
-
-namespace {
-TEST(StringUtilTest, stoul) {
-  {
-    size_t pos = 0;
-    EXPECT_EQ(0ul, benchmark::stoul("0", &pos));
-    EXPECT_EQ(1ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(7ul, benchmark::stoul("7", &pos));
-    EXPECT_EQ(1ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(135ul, benchmark::stoul("135", &pos));
-    EXPECT_EQ(3ul, pos);
-  }
-#if ULONG_MAX == 0xFFFFFFFFul
-  {
-    size_t pos = 0;
-    EXPECT_EQ(0xFFFFFFFFul, benchmark::stoul("4294967295", &pos));
-    EXPECT_EQ(10ul, pos);
-  }
-#elif ULONG_MAX == 0xFFFFFFFFFFFFFFFFul
-  {
-    size_t pos = 0;
-    EXPECT_EQ(0xFFFFFFFFFFFFFFFFul, benchmark::stoul("18446744073709551615", &pos));
-    EXPECT_EQ(20ul, pos);
-  }
-#endif
-  {
-    size_t pos = 0;
-    EXPECT_EQ(10ul, benchmark::stoul("1010", &pos, 2));
-    EXPECT_EQ(4ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(520ul, benchmark::stoul("1010", &pos, 8));
-    EXPECT_EQ(4ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(1010ul, benchmark::stoul("1010", &pos, 10));
-    EXPECT_EQ(4ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(4112ul, benchmark::stoul("1010", &pos, 16));
-    EXPECT_EQ(4ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(0xBEEFul, benchmark::stoul("BEEF", &pos, 16));
-    EXPECT_EQ(4ul, pos);
-  }
-#ifndef BENCHMARK_HAS_NO_EXCEPTIONS
-  {
-    ASSERT_THROW(benchmark::stoul("this is a test"), std::invalid_argument);
-  }
-#endif
-}
-
-TEST(StringUtilTest, stoi) {
-  {
-    size_t pos = 0;
-    EXPECT_EQ(0, benchmark::stoi("0", &pos));
-    EXPECT_EQ(1ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(-17, benchmark::stoi("-17", &pos));
-    EXPECT_EQ(3ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(1357, benchmark::stoi("1357", &pos));
-    EXPECT_EQ(4ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(10, benchmark::stoi("1010", &pos, 2));
-    EXPECT_EQ(4ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(520, benchmark::stoi("1010", &pos, 8));
-    EXPECT_EQ(4ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(1010, benchmark::stoi("1010", &pos, 10));
-    EXPECT_EQ(4ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(4112, benchmark::stoi("1010", &pos, 16));
-    EXPECT_EQ(4ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(0xBEEF, benchmark::stoi("BEEF", &pos, 16));
-    EXPECT_EQ(4ul, pos);
-  }
-#ifndef BENCHMARK_HAS_NO_EXCEPTIONS
-  {
-    ASSERT_THROW(benchmark::stoi("this is a test"), std::invalid_argument);
-  }
-#endif
-}
-
-TEST(StringUtilTest, stod) {
-  {
-    size_t pos = 0;
-    EXPECT_EQ(0.0, benchmark::stod("0", &pos));
-    EXPECT_EQ(1ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(-84.0, benchmark::stod("-84", &pos));
-    EXPECT_EQ(3ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(1234.0, benchmark::stod("1234", &pos));
-    EXPECT_EQ(4ul, pos);
-  }
-  {
-    size_t pos = 0;
-    EXPECT_EQ(1.5, benchmark::stod("1.5", &pos));
-    EXPECT_EQ(3ul, pos);
-  }
-  {
-    size_t pos = 0;
-    /* Note: exactly representable as double */
-    EXPECT_EQ(-1.25e+9, benchmark::stod("-1.25e+9", &pos));
-    EXPECT_EQ(8ul, pos);
-  }
-#ifndef BENCHMARK_HAS_NO_EXCEPTIONS
-  {
-    ASSERT_THROW(benchmark::stod("this is a test"), std::invalid_argument);
-  }
-#endif
-}
-
-}  // end namespace
diff --git a/third_party/google_benchmark/test/user_counters_tabular_test.cc b/third_party/google_benchmark/test/user_counters_tabular_test.cc
deleted file mode 100644
index 18373c0..0000000
--- a/third_party/google_benchmark/test/user_counters_tabular_test.cc
+++ /dev/null
@@ -1,285 +0,0 @@
-
-#undef NDEBUG
-
-#include "benchmark/benchmark.h"
-#include "output_test.h"
-
-// @todo: <jpmag> this checks the full output at once; the rule for
-// CounterSet1 was failing because it was not matching "^[-]+$".
-// @todo: <jpmag> check that the counters are vertically aligned.
-ADD_CASES(
-    TC_ConsoleOut,
-    {
-        // keeping these lines long improves readability, so:
-        // clang-format off
-    {"^[-]+$", MR_Next},
-    {"^Benchmark %s Time %s CPU %s Iterations %s Bar %s Bat %s Baz %s Foo %s Frob %s Lob$", MR_Next},
-    {"^[-]+$", MR_Next},
-    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
-    {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
-    {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
-    {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
-    {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next},
-    {"^[-]+$", MR_Next},
-    {"^Benchmark %s Time %s CPU %s Iterations %s Bar %s Baz %s Foo$", MR_Next},
-    {"^[-]+$", MR_Next},
-    {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^[-]+$", MR_Next},
-    {"^Benchmark %s Time %s CPU %s Iterations %s Bat %s Baz %s Foo$", MR_Next},
-    {"^[-]+$", MR_Next},
-    {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next},
-    {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$"},
-        // clang-format on
-    });
-ADD_CASES(TC_CSVOut, {{"%csv_header,"
-                       "\"Bar\",\"Bat\",\"Baz\",\"Foo\",\"Frob\",\"Lob\""}});
-
-// ========================================================================= //
-// ------------------------- Tabular Counters Output ----------------------- //
-// ========================================================================= //
-
-void BM_Counters_Tabular(benchmark::State& state) {
-  for (auto _ : state) {
-  }
-  namespace bm = benchmark;
-  state.counters.insert({
-      {"Foo", {1, bm::Counter::kAvgThreads}},
-      {"Bar", {2, bm::Counter::kAvgThreads}},
-      {"Baz", {4, bm::Counter::kAvgThreads}},
-      {"Bat", {8, bm::Counter::kAvgThreads}},
-      {"Frob", {16, bm::Counter::kAvgThreads}},
-      {"Lob", {32, bm::Counter::kAvgThreads}},
-  });
-}
-BENCHMARK(BM_Counters_Tabular)->ThreadRange(1, 16);
-ADD_CASES(TC_JSONOut,
-          {{"\"name\": \"BM_Counters_Tabular/threads:%int\",$"},
-           {"\"run_name\": \"BM_Counters_Tabular/threads:%int\",$", MR_Next},
-           {"\"run_type\": \"iteration\",$", MR_Next},
-           {"\"repetitions\": 0,$", MR_Next},
-           {"\"repetition_index\": 0,$", MR_Next},
-           {"\"threads\": 1,$", MR_Next},
-           {"\"iterations\": %int,$", MR_Next},
-           {"\"real_time\": %float,$", MR_Next},
-           {"\"cpu_time\": %float,$", MR_Next},
-           {"\"time_unit\": \"ns\",$", MR_Next},
-           {"\"Bar\": %float,$", MR_Next},
-           {"\"Bat\": %float,$", MR_Next},
-           {"\"Baz\": %float,$", MR_Next},
-           {"\"Foo\": %float,$", MR_Next},
-           {"\"Frob\": %float,$", MR_Next},
-           {"\"Lob\": %float$", MR_Next},
-           {"}", MR_Next}});
-ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Tabular/threads:%int\",%csv_report,"
-                       "%float,%float,%float,%float,%float,%float$"}});
-// VS2013 does not allow this function to be passed as a lambda argument
-// to CHECK_BENCHMARK_RESULTS()
-void CheckTabular(Results const& e) {
-  CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 1);
-  CHECK_COUNTER_VALUE(e, int, "Bar", EQ, 2);
-  CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 4);
-  CHECK_COUNTER_VALUE(e, int, "Bat", EQ, 8);
-  CHECK_COUNTER_VALUE(e, int, "Frob", EQ, 16);
-  CHECK_COUNTER_VALUE(e, int, "Lob", EQ, 32);
-}
-CHECK_BENCHMARK_RESULTS("BM_Counters_Tabular/threads:%int", &CheckTabular);
-
-// ========================================================================= //
-// -------------------- Tabular+Rate Counters Output ----------------------- //
-// ========================================================================= //
-
-void BM_CounterRates_Tabular(benchmark::State& state) {
-  for (auto _ : state) {
-    // This test requires a non-zero CPU time to avoid divide-by-zero
-    benchmark::DoNotOptimize(state.iterations());
-  }
-  namespace bm = benchmark;
-  state.counters.insert({
-      {"Foo", {1, bm::Counter::kAvgThreadsRate}},
-      {"Bar", {2, bm::Counter::kAvgThreadsRate}},
-      {"Baz", {4, bm::Counter::kAvgThreadsRate}},
-      {"Bat", {8, bm::Counter::kAvgThreadsRate}},
-      {"Frob", {16, bm::Counter::kAvgThreadsRate}},
-      {"Lob", {32, bm::Counter::kAvgThreadsRate}},
-  });
-}
-BENCHMARK(BM_CounterRates_Tabular)->ThreadRange(1, 16);
-ADD_CASES(TC_JSONOut,
-          {{"\"name\": \"BM_CounterRates_Tabular/threads:%int\",$"},
-           {"\"run_name\": \"BM_CounterRates_Tabular/threads:%int\",$",
-            MR_Next},
-           {"\"run_type\": \"iteration\",$", MR_Next},
-           {"\"repetitions\": 0,$", MR_Next},
-           {"\"repetition_index\": 0,$", MR_Next},
-           {"\"threads\": 1,$", MR_Next},
-           {"\"iterations\": %int,$", MR_Next},
-           {"\"real_time\": %float,$", MR_Next},
-           {"\"cpu_time\": %float,$", MR_Next},
-           {"\"time_unit\": \"ns\",$", MR_Next},
-           {"\"Bar\": %float,$", MR_Next},
-           {"\"Bat\": %float,$", MR_Next},
-           {"\"Baz\": %float,$", MR_Next},
-           {"\"Foo\": %float,$", MR_Next},
-           {"\"Frob\": %float,$", MR_Next},
-           {"\"Lob\": %float$", MR_Next},
-           {"}", MR_Next}});
-ADD_CASES(TC_CSVOut, {{"^\"BM_CounterRates_Tabular/threads:%int\",%csv_report,"
-                       "%float,%float,%float,%float,%float,%float$"}});
-// VS2013 does not allow this function to be passed as a lambda argument
-// to CHECK_BENCHMARK_RESULTS()
-void CheckTabularRate(Results const& e) {
-  double t = e.DurationCPUTime();
-  CHECK_FLOAT_COUNTER_VALUE(e, "Foo", EQ, 1. / t, 0.001);
-  CHECK_FLOAT_COUNTER_VALUE(e, "Bar", EQ, 2. / t, 0.001);
-  CHECK_FLOAT_COUNTER_VALUE(e, "Baz", EQ, 4. / t, 0.001);
-  CHECK_FLOAT_COUNTER_VALUE(e, "Bat", EQ, 8. / t, 0.001);
-  CHECK_FLOAT_COUNTER_VALUE(e, "Frob", EQ, 16. / t, 0.001);
-  CHECK_FLOAT_COUNTER_VALUE(e, "Lob", EQ, 32. / t, 0.001);
-}
-CHECK_BENCHMARK_RESULTS("BM_CounterRates_Tabular/threads:%int",
-                        &CheckTabularRate);
-
-// ========================================================================= //
-// ------------------------- Tabular Counters Output ----------------------- //
-// ========================================================================= //
-
-// set only some of the counters
-void BM_CounterSet0_Tabular(benchmark::State& state) {
-  for (auto _ : state) {
-  }
-  namespace bm = benchmark;
-  state.counters.insert({
-      {"Foo", {10, bm::Counter::kAvgThreads}},
-      {"Bar", {20, bm::Counter::kAvgThreads}},
-      {"Baz", {40, bm::Counter::kAvgThreads}},
-  });
-}
-BENCHMARK(BM_CounterSet0_Tabular)->ThreadRange(1, 16);
-ADD_CASES(TC_JSONOut,
-          {{"\"name\": \"BM_CounterSet0_Tabular/threads:%int\",$"},
-           {"\"run_name\": \"BM_CounterSet0_Tabular/threads:%int\",$", MR_Next},
-           {"\"run_type\": \"iteration\",$", MR_Next},
-           {"\"repetitions\": 0,$", MR_Next},
-           {"\"repetition_index\": 0,$", MR_Next},
-           {"\"threads\": 1,$", MR_Next},
-           {"\"iterations\": %int,$", MR_Next},
-           {"\"real_time\": %float,$", MR_Next},
-           {"\"cpu_time\": %float,$", MR_Next},
-           {"\"time_unit\": \"ns\",$", MR_Next},
-           {"\"Bar\": %float,$", MR_Next},
-           {"\"Baz\": %float,$", MR_Next},
-           {"\"Foo\": %float$", MR_Next},
-           {"}", MR_Next}});
-ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet0_Tabular/threads:%int\",%csv_report,"
-                       "%float,,%float,%float,,"}});
-// VS2013 does not allow this function to be passed as a lambda argument
-// to CHECK_BENCHMARK_RESULTS()
-void CheckSet0(Results const& e) {
-  CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 10);
-  CHECK_COUNTER_VALUE(e, int, "Bar", EQ, 20);
-  CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 40);
-}
-CHECK_BENCHMARK_RESULTS("BM_CounterSet0_Tabular", &CheckSet0);
-
-// again.
-void BM_CounterSet1_Tabular(benchmark::State& state) {
-  for (auto _ : state) {
-  }
-  namespace bm = benchmark;
-  state.counters.insert({
-      {"Foo", {15, bm::Counter::kAvgThreads}},
-      {"Bar", {25, bm::Counter::kAvgThreads}},
-      {"Baz", {45, bm::Counter::kAvgThreads}},
-  });
-}
-BENCHMARK(BM_CounterSet1_Tabular)->ThreadRange(1, 16);
-ADD_CASES(TC_JSONOut,
-          {{"\"name\": \"BM_CounterSet1_Tabular/threads:%int\",$"},
-           {"\"run_name\": \"BM_CounterSet1_Tabular/threads:%int\",$", MR_Next},
-           {"\"run_type\": \"iteration\",$", MR_Next},
-           {"\"repetitions\": 0,$", MR_Next},
-           {"\"repetition_index\": 0,$", MR_Next},
-           {"\"threads\": 1,$", MR_Next},
-           {"\"iterations\": %int,$", MR_Next},
-           {"\"real_time\": %float,$", MR_Next},
-           {"\"cpu_time\": %float,$", MR_Next},
-           {"\"time_unit\": \"ns\",$", MR_Next},
-           {"\"Bar\": %float,$", MR_Next},
-           {"\"Baz\": %float,$", MR_Next},
-           {"\"Foo\": %float$", MR_Next},
-           {"}", MR_Next}});
-ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet1_Tabular/threads:%int\",%csv_report,"
-                       "%float,,%float,%float,,"}});
-// VS2013 does not allow this function to be passed as a lambda argument
-// to CHECK_BENCHMARK_RESULTS()
-void CheckSet1(Results const& e) {
-  CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 15);
-  CHECK_COUNTER_VALUE(e, int, "Bar", EQ, 25);
-  CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 45);
-}
-CHECK_BENCHMARK_RESULTS("BM_CounterSet1_Tabular/threads:%int", &CheckSet1);
-
-// ========================================================================= //
-// ------------------------- Tabular Counters Output ----------------------- //
-// ========================================================================= //
-
-// set only some of the counters, different set now.
-void BM_CounterSet2_Tabular(benchmark::State& state) {
-  for (auto _ : state) {
-  }
-  namespace bm = benchmark;
-  state.counters.insert({
-      {"Foo", {10, bm::Counter::kAvgThreads}},
-      {"Bat", {30, bm::Counter::kAvgThreads}},
-      {"Baz", {40, bm::Counter::kAvgThreads}},
-  });
-}
-BENCHMARK(BM_CounterSet2_Tabular)->ThreadRange(1, 16);
-ADD_CASES(TC_JSONOut,
-          {{"\"name\": \"BM_CounterSet2_Tabular/threads:%int\",$"},
-           {"\"run_name\": \"BM_CounterSet2_Tabular/threads:%int\",$", MR_Next},
-           {"\"run_type\": \"iteration\",$", MR_Next},
-           {"\"repetitions\": 0,$", MR_Next},
-           {"\"repetition_index\": 0,$", MR_Next},
-           {"\"threads\": 1,$", MR_Next},
-           {"\"iterations\": %int,$", MR_Next},
-           {"\"real_time\": %float,$", MR_Next},
-           {"\"cpu_time\": %float,$", MR_Next},
-           {"\"time_unit\": \"ns\",$", MR_Next},
-           {"\"Bat\": %float,$", MR_Next},
-           {"\"Baz\": %float,$", MR_Next},
-           {"\"Foo\": %float$", MR_Next},
-           {"}", MR_Next}});
-ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet2_Tabular/threads:%int\",%csv_report,"
-                       ",%float,%float,%float,,"}});
-// VS2013 does not allow this function to be passed as a lambda argument
-// to CHECK_BENCHMARK_RESULTS()
-void CheckSet2(Results const& e) {
-  CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 10);
-  CHECK_COUNTER_VALUE(e, int, "Bat", EQ, 30);
-  CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 40);
-}
-CHECK_BENCHMARK_RESULTS("BM_CounterSet2_Tabular", &CheckSet2);
-
-// ========================================================================= //
-// --------------------------- TEST CASES END ------------------------------ //
-// ========================================================================= //
-
-int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
diff --git a/third_party/google_benchmark/tools/gbench/report.py b/third_party/google_benchmark/tools/gbench/report.py
deleted file mode 100644
index 5bd3a8d..0000000
--- a/third_party/google_benchmark/tools/gbench/report.py
+++ /dev/null
@@ -1,541 +0,0 @@
-import unittest
-"""report.py - Utilities for reporting statistics about benchmark results
-"""
-import os
-import re
-import copy
-
-from scipy.stats import mannwhitneyu
-
-
-class BenchmarkColor(object):
-    def __init__(self, name, code):
-        self.name = name
-        self.code = code
-
-    def __repr__(self):
-        return '%s%r' % (self.__class__.__name__,
-                         (self.name, self.code))
-
-    def __format__(self, format):
-        return self.code
-
-
-# Benchmark Colors Enumeration
-BC_NONE = BenchmarkColor('NONE', '')
-BC_MAGENTA = BenchmarkColor('MAGENTA', '\033[95m')
-BC_CYAN = BenchmarkColor('CYAN', '\033[96m')
-BC_OKBLUE = BenchmarkColor('OKBLUE', '\033[94m')
-BC_OKGREEN = BenchmarkColor('OKGREEN', '\033[32m')
-BC_HEADER = BenchmarkColor('HEADER', '\033[92m')
-BC_WARNING = BenchmarkColor('WARNING', '\033[93m')
-BC_WHITE = BenchmarkColor('WHITE', '\033[97m')
-BC_FAIL = BenchmarkColor('FAIL', '\033[91m')
-BC_ENDC = BenchmarkColor('ENDC', '\033[0m')
-BC_BOLD = BenchmarkColor('BOLD', '\033[1m')
-BC_UNDERLINE = BenchmarkColor('UNDERLINE', '\033[4m')
-
-UTEST_MIN_REPETITIONS = 2
-UTEST_OPTIMAL_REPETITIONS = 9  # Lowest reasonable number, More is better.
-UTEST_COL_NAME = "_pvalue"
-
-
-def color_format(use_color, fmt_str, *args, **kwargs):
-    """
-    Return the result of 'fmt_str.format(*args, **kwargs)' after transforming
-    'args' and 'kwargs' according to the value of 'use_color'. If 'use_color'
-    is False then all color codes in 'args' and 'kwargs' are replaced with
-    the empty string.
-    """
-    assert use_color is True or use_color is False
-    if not use_color:
-        args = [arg if not isinstance(arg, BenchmarkColor) else BC_NONE
-                for arg in args]
-        kwargs = {key: arg if not isinstance(arg, BenchmarkColor) else BC_NONE
-                  for key, arg in kwargs.items()}
-    return fmt_str.format(*args, **kwargs)
-
-
-def find_longest_name(benchmark_list):
-    """
-    Return the length of the longest benchmark name in a given list of
-    benchmark JSON objects
-    """
-    longest_name = 1
-    for bc in benchmark_list:
-        if len(bc['name']) > longest_name:
-            longest_name = len(bc['name'])
-    return longest_name
-
-
-def calculate_change(old_val, new_val):
-    """
-    Return a float representing the decimal change between old_val and new_val.
-    """
-    if old_val == 0 and new_val == 0:
-        return 0.0
-    if old_val == 0:
-        return float(new_val - old_val) / (float(old_val + new_val) / 2)
-    return float(new_val - old_val) / abs(old_val)
-
-
-def filter_benchmark(json_orig, family, replacement=""):
-    """
-    Apply a filter to the json, and only leave the 'family' of benchmarks.
-    """
-    regex = re.compile(family)
-    filtered = {}
-    filtered['benchmarks'] = []
-    for be in json_orig['benchmarks']:
-        if not regex.search(be['name']):
-            continue
-        filteredbench = copy.deepcopy(be)  # Do NOT modify the old name!
-        filteredbench['name'] = regex.sub(replacement, filteredbench['name'])
-        filtered['benchmarks'].append(filteredbench)
-    return filtered
-
-
-def get_unique_benchmark_names(json):
-    """
-    While *keeping* the order, give all the unique 'names' used for benchmarks.
-    """
-    seen = set()
-    uniqued = [x['name'] for x in json['benchmarks']
-               if x['name'] not in seen and
-               (seen.add(x['name']) or True)]
-    return uniqued
-
-
-def intersect(list1, list2):
-    """
-    Given two lists, get a new list consisting of the elements only contained
-    in *both of the input lists*, while preserving the ordering.
-    """
-    return [x for x in list1 if x in list2]
-
-
-def is_potentially_comparable_benchmark(x):
-    return ('time_unit' in x and 'real_time' in x and 'cpu_time' in x)
-
-
-def partition_benchmarks(json1, json2):
-    """
-    While preserving the ordering, find benchmarks with the same names in
-    both of the inputs, and group them.
-    (i.e. partition/filter into groups with common name)
-    """
-    json1_unique_names = get_unique_benchmark_names(json1)
-    json2_unique_names = get_unique_benchmark_names(json2)
-    names = intersect(json1_unique_names, json2_unique_names)
-    partitions = []
-    for name in names:
-        time_unit = None
-        # Pick the time unit from the first entry of the lhs benchmark.
-        # We should be careful not to crash with unexpected input.
-        for x in json1['benchmarks']:
-            if (x['name'] == name and is_potentially_comparable_benchmark(x)):
-                time_unit = x['time_unit']
-                break
-        if time_unit is None:
-            continue
-        # Filter by name and time unit.
-        # All the repetitions are assumed to be comparable.
-        lhs = [x for x in json1['benchmarks'] if x['name'] == name and
-               x['time_unit'] == time_unit]
-        rhs = [x for x in json2['benchmarks'] if x['name'] == name and
-               x['time_unit'] == time_unit]
-        partitions.append([lhs, rhs])
-    return partitions
-
-
-def extract_field(partition, field_name):
-    # The count of elements may be different. We want *all* of them.
-    lhs = [x[field_name] for x in partition[0]]
-    rhs = [x[field_name] for x in partition[1]]
-    return [lhs, rhs]
-
-def calc_utest(timings_cpu, timings_time):
-    min_rep_cnt = min(len(timings_time[0]),
-                      len(timings_time[1]),
-                      len(timings_cpu[0]),
-                      len(timings_cpu[1]))
-
-    # Does *everything* has at least UTEST_MIN_REPETITIONS repetitions?
-    if min_rep_cnt < UTEST_MIN_REPETITIONS:
-        return False, None, None
-
-    time_pvalue = mannwhitneyu(
-        timings_time[0], timings_time[1], alternative='two-sided').pvalue
-    cpu_pvalue = mannwhitneyu(
-        timings_cpu[0], timings_cpu[1], alternative='two-sided').pvalue
-
-    return (min_rep_cnt >= UTEST_OPTIMAL_REPETITIONS), cpu_pvalue, time_pvalue
-
-def print_utest(partition, utest_alpha, first_col_width, use_color=True):
-    def get_utest_color(pval):
-        return BC_FAIL if pval >= utest_alpha else BC_OKGREEN
-
-    timings_time = extract_field(partition, 'real_time')
-    timings_cpu = extract_field(partition, 'cpu_time')
-    have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(timings_cpu, timings_time)
-
-    # Check if we failed miserably with minimum required repetitions for utest
-    if not have_optimal_repetitions and cpu_pvalue is None and time_pvalue is None:
-        return []
-
-    dsc = "U Test, Repetitions: {} vs {}".format(
-        len(timings_cpu[0]), len(timings_cpu[1]))
-    dsc_color = BC_OKGREEN
-
-    # We still got some results to show but issue a warning about it.
-    if not have_optimal_repetitions:
-        dsc_color = BC_WARNING
-        dsc += ". WARNING: Results unreliable! {}+ repetitions recommended.".format(
-            UTEST_OPTIMAL_REPETITIONS)
-
-    special_str = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}{endc}{}      {}"
-
-    last_name = partition[0][0]['name']
-    return [color_format(use_color,
-                         special_str,
-                         BC_HEADER,
-                         "{}{}".format(last_name, UTEST_COL_NAME),
-                         first_col_width,
-                         get_utest_color(time_pvalue), time_pvalue,
-                         get_utest_color(cpu_pvalue), cpu_pvalue,
-                         dsc_color, dsc,
-                         endc=BC_ENDC)]
-
-
-def generate_difference_report(
-        json1,
-        json2,
-        display_aggregates_only=False,
-        utest=False,
-        utest_alpha=0.05,
-        use_color=True):
-    """
-    Calculate and report the difference between each test of two benchmarks
-    runs specified as 'json1' and 'json2'.
-    """
-    assert utest is True or utest is False
-    first_col_width = find_longest_name(json1['benchmarks'])
-
-    def find_test(name):
-        for b in json2['benchmarks']:
-            if b['name'] == name:
-                return b
-        return None
-
-    first_col_width = max(
-        first_col_width,
-        len('Benchmark'))
-    first_col_width += len(UTEST_COL_NAME)
-    first_line = "{:<{}s}Time             CPU      Time Old      Time New       CPU Old       CPU New".format(
-        'Benchmark', 12 + first_col_width)
-    output_strs = [first_line, '-' * len(first_line)]
-
-    partitions = partition_benchmarks(json1, json2)
-    for partition in partitions:
-        # Careful, we may have different repetition count.
-        for i in range(min(len(partition[0]), len(partition[1]))):
-            bn = partition[0][i]
-            other_bench = partition[1][i]
-
-            # *If* we were asked to only display aggregates,
-            # and if it is non-aggregate, then skip it.
-            if display_aggregates_only and 'run_type' in bn and 'run_type' in other_bench:
-                assert bn['run_type'] == other_bench['run_type']
-                if bn['run_type'] != 'aggregate':
-                    continue
-
-            fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}"
-
-            def get_color(res):
-                if res > 0.05:
-                    return BC_FAIL
-                elif res > -0.07:
-                    return BC_WHITE
-                else:
-                    return BC_CYAN
-
-            tres = calculate_change(bn['real_time'], other_bench['real_time'])
-            cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time'])
-            output_strs += [color_format(use_color,
-                                         fmt_str,
-                                         BC_HEADER,
-                                         bn['name'],
-                                         first_col_width,
-                                         get_color(tres),
-                                         tres,
-                                         get_color(cpures),
-                                         cpures,
-                                         bn['real_time'],
-                                         other_bench['real_time'],
-                                         bn['cpu_time'],
-                                         other_bench['cpu_time'],
-                                         endc=BC_ENDC)]
-
-        # After processing the whole partition, if requested, do the U test.
-        if utest:
-            output_strs += print_utest(partition,
-                                       utest_alpha=utest_alpha,
-                                       first_col_width=first_col_width,
-                                       use_color=use_color)
-
-    return output_strs
-
-
-###############################################################################
-# Unit tests
-
-
-class TestGetUniqueBenchmarkNames(unittest.TestCase):
-    def load_results(self):
-        import json
-        testInputs = os.path.join(
-            os.path.dirname(
-                os.path.realpath(__file__)),
-            'Inputs')
-        testOutput = os.path.join(testInputs, 'test3_run0.json')
-        with open(testOutput, 'r') as f:
-            json = json.load(f)
-        return json
-
-    def test_basic(self):
-        expect_lines = [
-            'BM_One',
-            'BM_Two',
-            'short',  # These two are not sorted
-            'medium',  # These two are not sorted
-        ]
-        json = self.load_results()
-        output_lines = get_unique_benchmark_names(json)
-        print("\n")
-        print("\n".join(output_lines))
-        self.assertEqual(len(output_lines), len(expect_lines))
-        for i in range(0, len(output_lines)):
-            self.assertEqual(expect_lines[i], output_lines[i])
-
-
-class TestReportDifference(unittest.TestCase):
-    def load_results(self):
-        import json
-        testInputs = os.path.join(
-            os.path.dirname(
-                os.path.realpath(__file__)),
-            'Inputs')
-        testOutput1 = os.path.join(testInputs, 'test1_run1.json')
-        testOutput2 = os.path.join(testInputs, 'test1_run2.json')
-        with open(testOutput1, 'r') as f:
-            json1 = json.load(f)
-        with open(testOutput2, 'r') as f:
-            json2 = json.load(f)
-        return json1, json2
-
-    def test_basic(self):
-        expect_lines = [
-            ['BM_SameTimes', '+0.0000', '+0.0000', '10', '10', '10', '10'],
-            ['BM_2xFaster', '-0.5000', '-0.5000', '50', '25', '50', '25'],
-            ['BM_2xSlower', '+1.0000', '+1.0000', '50', '100', '50', '100'],
-            ['BM_1PercentFaster', '-0.0100', '-0.0100', '100', '99', '100', '99'],
-            ['BM_1PercentSlower', '+0.0100', '+0.0100', '100', '101', '100', '101'],
-            ['BM_10PercentFaster', '-0.1000', '-0.1000', '100', '90', '100', '90'],
-            ['BM_10PercentSlower', '+0.1000', '+0.1000', '100', '110', '100', '110'],
-            ['BM_100xSlower', '+99.0000', '+99.0000',
-                '100', '10000', '100', '10000'],
-            ['BM_100xFaster', '-0.9900', '-0.9900',
-                '10000', '100', '10000', '100'],
-            ['BM_10PercentCPUToTime', '+0.1000',
-                '-0.1000', '100', '110', '100', '90'],
-            ['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'],
-            ['BM_NotBadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'],
-        ]
-        json1, json2 = self.load_results()
-        output_lines_with_header = generate_difference_report(
-            json1, json2, use_color=False)
-        output_lines = output_lines_with_header[2:]
-        print("\n")
-        print("\n".join(output_lines_with_header))
-        self.assertEqual(len(output_lines), len(expect_lines))
-        for i in range(0, len(output_lines)):
-            parts = [x for x in output_lines[i].split(' ') if x]
-            self.assertEqual(len(parts), 7)
-            self.assertEqual(expect_lines[i], parts)
-
-
-class TestReportDifferenceBetweenFamilies(unittest.TestCase):
-    def load_result(self):
-        import json
-        testInputs = os.path.join(
-            os.path.dirname(
-                os.path.realpath(__file__)),
-            'Inputs')
-        testOutput = os.path.join(testInputs, 'test2_run.json')
-        with open(testOutput, 'r') as f:
-            json = json.load(f)
-        return json
-
-    def test_basic(self):
-        expect_lines = [
-            ['.', '-0.5000', '-0.5000', '10', '5', '10', '5'],
-            ['./4', '-0.5000', '-0.5000', '40', '20', '40', '20'],
-            ['Prefix/.', '-0.5000', '-0.5000', '20', '10', '20', '10'],
-            ['Prefix/./3', '-0.5000', '-0.5000', '30', '15', '30', '15'],
-        ]
-        json = self.load_result()
-        json1 = filter_benchmark(json, "BM_Z.ro", ".")
-        json2 = filter_benchmark(json, "BM_O.e", ".")
-        output_lines_with_header = generate_difference_report(
-            json1, json2, use_color=False)
-        output_lines = output_lines_with_header[2:]
-        print("\n")
-        print("\n".join(output_lines_with_header))
-        self.assertEqual(len(output_lines), len(expect_lines))
-        for i in range(0, len(output_lines)):
-            parts = [x for x in output_lines[i].split(' ') if x]
-            self.assertEqual(len(parts), 7)
-            self.assertEqual(expect_lines[i], parts)
-
-
-class TestReportDifferenceWithUTest(unittest.TestCase):
-    def load_results(self):
-        import json
-        testInputs = os.path.join(
-            os.path.dirname(
-                os.path.realpath(__file__)),
-            'Inputs')
-        testOutput1 = os.path.join(testInputs, 'test3_run0.json')
-        testOutput2 = os.path.join(testInputs, 'test3_run1.json')
-        with open(testOutput1, 'r') as f:
-            json1 = json.load(f)
-        with open(testOutput2, 'r') as f:
-            json2 = json.load(f)
-        return json1, json2
-
-    def test_utest(self):
-        expect_lines = []
-        expect_lines = [
-            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
-            ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
-            ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
-            ['BM_Two_pvalue',
-             '0.6985',
-             '0.6985',
-             'U',
-             'Test,',
-             'Repetitions:',
-             '2',
-             'vs',
-             '2.',
-             'WARNING:',
-             'Results',
-             'unreliable!',
-             '9+',
-             'repetitions',
-             'recommended.'],
-            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
-            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
-            ['short_pvalue',
-             '0.7671',
-             '0.1489',
-             'U',
-             'Test,',
-             'Repetitions:',
-             '2',
-             'vs',
-             '3.',
-             'WARNING:',
-             'Results',
-             'unreliable!',
-             '9+',
-             'repetitions',
-             'recommended.'],
-            ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
-        ]
-        json1, json2 = self.load_results()
-        output_lines_with_header = generate_difference_report(
-            json1, json2, utest=True, utest_alpha=0.05, use_color=False)
-        output_lines = output_lines_with_header[2:]
-        print("\n")
-        print("\n".join(output_lines_with_header))
-        self.assertEqual(len(output_lines), len(expect_lines))
-        for i in range(0, len(output_lines)):
-            parts = [x for x in output_lines[i].split(' ') if x]
-            self.assertEqual(expect_lines[i], parts)
-
-
-class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
-        unittest.TestCase):
-    def load_results(self):
-        import json
-        testInputs = os.path.join(
-            os.path.dirname(
-                os.path.realpath(__file__)),
-            'Inputs')
-        testOutput1 = os.path.join(testInputs, 'test3_run0.json')
-        testOutput2 = os.path.join(testInputs, 'test3_run1.json')
-        with open(testOutput1, 'r') as f:
-            json1 = json.load(f)
-        with open(testOutput2, 'r') as f:
-            json2 = json.load(f)
-        return json1, json2
-
-    def test_utest(self):
-        expect_lines = []
-        expect_lines = [
-            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
-            ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
-            ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
-            ['BM_Two_pvalue',
-             '0.6985',
-             '0.6985',
-             'U',
-             'Test,',
-             'Repetitions:',
-             '2',
-             'vs',
-             '2.',
-             'WARNING:',
-             'Results',
-             'unreliable!',
-             '9+',
-             'repetitions',
-             'recommended.'],
-            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
-            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
-            ['short_pvalue',
-             '0.7671',
-             '0.1489',
-             'U',
-             'Test,',
-             'Repetitions:',
-             '2',
-             'vs',
-             '3.',
-             'WARNING:',
-             'Results',
-             'unreliable!',
-             '9+',
-             'repetitions',
-             'recommended.'],
-        ]
-        json1, json2 = self.load_results()
-        output_lines_with_header = generate_difference_report(
-            json1, json2, display_aggregates_only=True,
-            utest=True, utest_alpha=0.05, use_color=False)
-        output_lines = output_lines_with_header[2:]
-        print("\n")
-        print("\n".join(output_lines_with_header))
-        self.assertEqual(len(output_lines), len(expect_lines))
-        for i in range(0, len(output_lines)):
-            parts = [x for x in output_lines[i].split(' ') if x]
-            self.assertEqual(expect_lines[i], parts)
-
-
-if __name__ == '__main__':
-    unittest.main()
-
-# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
-# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
-# kate: indent-mode python; remove-trailing-spaces modified;