// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/allocator/partition_allocator/partition_bucket.h"
#include <algorithm>
#include <cstdint>
#include <tuple>
#include "base/allocator/partition_allocator/address_pool_manager.h"
#include "base/allocator/partition_allocator/freeslot_bitmap.h"
#include "base/allocator/partition_allocator/freeslot_bitmap_constants.h"
#include "base/allocator/partition_allocator/oom.h"
#include "base/allocator/partition_allocator/page_allocator.h"
#include "base/allocator/partition_allocator/page_allocator_constants.h"
#include "base/allocator/partition_allocator/partition_address_space.h"
#include "base/allocator/partition_allocator/partition_alloc.h"
#include "base/allocator/partition_allocator/partition_alloc_base/bits.h"
#include "base/allocator/partition_allocator/partition_alloc_base/compiler_specific.h"
#include "base/allocator/partition_allocator/partition_alloc_base/component_export.h"
#include "base/allocator/partition_allocator/partition_alloc_base/debug/alias.h"
#include "base/allocator/partition_allocator/partition_alloc_base/debug/debugging_buildflags.h"
#include "base/allocator/partition_allocator/partition_alloc_base/immediate_crash.h"
#include "base/allocator/partition_allocator/partition_alloc_base/thread_annotations.h"
#include "base/allocator/partition_allocator/partition_alloc_buildflags.h"
#include "base/allocator/partition_allocator/partition_alloc_check.h"
#include "base/allocator/partition_allocator/partition_alloc_config.h"
#include "base/allocator/partition_allocator/partition_alloc_constants.h"
#include "base/allocator/partition_allocator/partition_alloc_forward.h"
#include "base/allocator/partition_allocator/partition_direct_map_extent.h"
#include "base/allocator/partition_allocator/partition_oom.h"
#include "base/allocator/partition_allocator/partition_page.h"
#include "base/allocator/partition_allocator/reservation_offset_table.h"
#include "base/allocator/partition_allocator/tagging.h"
#include "build/build_config.h"
#if BUILDFLAG(USE_STARSCAN)
#include "base/allocator/partition_allocator/starscan/pcscan.h"
#endif
namespace partition_alloc::internal {
namespace {
#if PA_CONFIG(ENABLE_SHADOW_METADATA)
PA_ALWAYS_INLINE uintptr_t ShadowMetadataStart(uintptr_t super_page,
pool_handle pool) {
uintptr_t shadow_metadata_start =
super_page + SystemPageSize() + ShadowPoolOffset(pool);
PA_DCHECK(!PartitionAddressSpace::IsInRegularPool(shadow_metadata_start));
PA_DCHECK(!PartitionAddressSpace::IsInBRPPool(shadow_metadata_start));
return shadow_metadata_start;
}
#endif
template <bool thread_safe>
[[noreturn]] PA_NOINLINE void PartitionOutOfMemoryMappingFailure(
PartitionRoot<thread_safe>* root,
size_t size) PA_LOCKS_EXCLUDED(root->lock_) {
PA_NO_CODE_FOLDING();
root->OutOfMemory(size);
PA_IMMEDIATE_CRASH(); // Not required, kept as documentation.
}
template <bool thread_safe>
[[noreturn]] PA_NOINLINE void PartitionOutOfMemoryCommitFailure(
PartitionRoot<thread_safe>* root,
size_t size) PA_LOCKS_EXCLUDED(root->lock_) {
PA_NO_CODE_FOLDING();
root->OutOfMemory(size);
PA_IMMEDIATE_CRASH(); // Not required, kept as documentation.
}
#if !BUILDFLAG(HAS_64_BIT_POINTERS) && BUILDFLAG(ENABLE_BACKUP_REF_PTR_SUPPORT)
// |start| has to be aligned to kSuperPageSize, but |end| doesn't. This means
// that a partial super page is allowed at the end. Since the block list uses
// kSuperPageSize granularity, a partial super page is considered blocked if
// there is a raw_ptr<T> pointing anywhere in that super page, even if it doesn't
// point to that partially allocated region.
bool AreAllowedSuperPagesForBRPPool(uintptr_t start, uintptr_t end) {
PA_DCHECK(!(start % kSuperPageSize));
for (uintptr_t super_page = start; super_page < end;
super_page += kSuperPageSize) {
// If any blocked super page is found inside the given memory region,
// the memory region is blocked.
if (!AddressPoolManagerBitmap::IsAllowedSuperPageForBRPPool(super_page)) {
AddressPoolManagerBitmap::IncrementBlocklistHitCount();
return false;
}
}
return true;
}
#endif // !BUILDFLAG(HAS_64_BIT_POINTERS) &&
// BUILDFLAG(ENABLE_BACKUP_REF_PTR_SUPPORT)
// Reserves |requested_size| worth of super pages from the specified pool.
// If the BRP pool is requested, this function will honor the BRP block list.
//
// The returned address will be aligned to kSuperPageSize, and so
// |requested_address| should be. |requested_size| doesn't have to be, however.
//
// |requested_address| is merely a hint, which will be attempted, but easily
// given up on if it doesn't work the first time.
//
// The function doesn't need to hold root->lock_ or any other locks, because:
// - It (1) reserves memory, (2) then consults AreAllowedSuperPagesForBRPPool
// for that memory, and (3) returns the memory if
// allowed, or unreserves and decommits if not allowed. So no other
// overlapping region can be allocated while executing
// AreAllowedSuperPagesForBRPPool.
// - IsAllowedSuperPageForBRPPool (used by AreAllowedSuperPagesForBRPPool) is
// designed to not need locking.
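//
// When allocating from the BRP pool on 32-bit configurations, the reservation
// below proceeds in up to three stages: (1) try |requested_address| (or let
// the pool pick an address if it is 0); (2) if the result overlaps the block
// list, retry up to kMaxRandomAddressTries times at pool-chosen addresses;
// (3) as a last resort, sequentially scan the address space in kSuperPageSize
// steps, verifying the final candidate once more after the loop.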
uintptr_t ReserveMemoryFromPool(pool_handle pool,
uintptr_t requested_address,
size_t requested_size) {
PA_DCHECK(!(requested_address % kSuperPageSize));
uintptr_t reserved_address = AddressPoolManager::GetInstance().Reserve(
pool, requested_address, requested_size);
// In 32-bit mode, when allocating from BRP pool, verify that the requested
// allocation honors the block list. Find a better address otherwise.
#if !BUILDFLAG(HAS_64_BIT_POINTERS) && BUILDFLAG(ENABLE_BACKUP_REF_PTR_SUPPORT)
if (pool == kBRPPoolHandle) {
constexpr int kMaxRandomAddressTries = 10;
for (int i = 0; i < kMaxRandomAddressTries; ++i) {
if (!reserved_address ||
AreAllowedSuperPagesForBRPPool(reserved_address,
reserved_address + requested_size)) {
break;
}
AddressPoolManager::GetInstance().UnreserveAndDecommit(
pool, reserved_address, requested_size);
// No longer try to honor |requested_address|, because it didn't work for
// us last time.
reserved_address =
AddressPoolManager::GetInstance().Reserve(pool, 0, requested_size);
}
// If the allocation attempt succeeds, we will break out of the following
// loop immediately.
//
// Last resort: sequentially scan the whole 32-bit address space. The number
// of blocked super-pages should be very small, so we expect to practically
// never need to run the following code. Note that it may fail to find an
// available super page, e.g., when it becomes available after the scan
// passes through it, but we accept the risk.
for (uintptr_t address_to_try = kSuperPageSize; address_to_try != 0;
address_to_try += kSuperPageSize) {
if (!reserved_address ||
AreAllowedSuperPagesForBRPPool(reserved_address,
reserved_address + requested_size)) {
break;
}
AddressPoolManager::GetInstance().UnreserveAndDecommit(
pool, reserved_address, requested_size);
// Reserve() can return a different pointer than attempted.
reserved_address = AddressPoolManager::GetInstance().Reserve(
pool, address_to_try, requested_size);
}
// If the loop ends naturally, the last allocated region hasn't been
// verified. Do it now.
if (reserved_address &&
!AreAllowedSuperPagesForBRPPool(reserved_address,
reserved_address + requested_size)) {
AddressPoolManager::GetInstance().UnreserveAndDecommit(
pool, reserved_address, requested_size);
reserved_address = 0;
}
}
#endif // !BUILDFLAG(HAS_64_BIT_POINTERS) &&
// BUILDFLAG(ENABLE_BACKUP_REF_PTR_SUPPORT)
#if !BUILDFLAG(HAS_64_BIT_POINTERS)
// Only mark the region as belonging to the pool after it has passed the
// blocklist check in order to avoid a potential race with destructing a
// raw_ptr<T> object that points to non-PA memory in another thread.
// If `MarkUsed` was called earlier, the other thread could incorrectly
// determine that the allocation had come from PartitionAlloc.
if (reserved_address) {
AddressPoolManager::GetInstance().MarkUsed(pool, reserved_address,
requested_size);
}
#endif
PA_DCHECK(!(reserved_address % kSuperPageSize));
return reserved_address;
}
template <bool thread_safe>
SlotSpanMetadata<thread_safe>* PartitionDirectMap(
PartitionRoot<thread_safe>* root,
unsigned int flags,
size_t raw_size,
size_t slot_span_alignment) {
PA_DCHECK((slot_span_alignment >= PartitionPageSize()) &&
base::bits::IsPowerOfTwo(slot_span_alignment));
// No static EXCLUSIVE_LOCKS_REQUIRED(), as the checker doesn't understand
// scoped unlocking.
root->lock_.AssertAcquired();
const bool return_null = flags & AllocFlags::kReturnNull;
if (PA_UNLIKELY(raw_size > MaxDirectMapped())) {
if (return_null) {
return nullptr;
}
// The lock is here to protect PA from:
// 1. Concurrent calls
// 2. Reentrant calls
//
// This is fine here however, as:
// 1. Concurrency: |PartitionRoot::OutOfMemory()| never returns, so the lock
// will not be re-acquired, which would lead to acting on inconsistent
// data that could have been modified in-between releasing and acquiring
// it.
// 2. Reentrancy: This is why we release the lock. On some platforms,
// terminating the process may free() memory, or even possibly try to
// allocate some. Calling free() is fine, but will deadlock since
// |PartitionRoot::lock_| is not recursive.
//
// Supporting reentrant calls properly is hard, and not a requirement for
// PA. However up to that point, we've only *read* data, not *written* to
// any state. Reentrant calls are then fine, especially as we don't continue
// on this path. The only downside is possibly endless recursion if the OOM
// handler allocates and fails to use UncheckedMalloc() or equivalent, but
// that's violating the contract of base::TerminateBecauseOutOfMemory().
ScopedUnlockGuard unlock{root->lock_};
PartitionExcessiveAllocationSize(raw_size);
}
PartitionDirectMapExtent<thread_safe>* map_extent = nullptr;
PartitionPage<thread_safe>* page = nullptr;
{
// Getting memory for direct-mapped allocations doesn't interact with the
// rest of the allocator, but takes a long time, as it involves several
// system calls. Although no mmap() (or equivalent) calls are made on
// 64 bit systems, page permissions are changed with mprotect(), which is
// a syscall.
//
// These calls are almost always slow (at least a couple of µs per syscall on a
// desktop Linux machine), and they also have a very long latency tail,
// possibly from getting descheduled. As a consequence, we should not hold
// the lock when performing a syscall. This is not the only problematic
// location, but since this one doesn't interact with the rest of the
// allocator, we can safely drop and then re-acquire the lock.
//
// Note that this only affects allocations that are not served out of the
// thread cache, but as a simple example the buffer partition in blink is
// frequently used for large allocations (e.g. ArrayBuffer), and frequent,
// small ones (e.g. WTF::String), and does not have a thread cache.
ScopedUnlockGuard scoped_unlock{root->lock_};
const size_t slot_size =
PartitionRoot<thread_safe>::GetDirectMapSlotSize(raw_size);
// The super page starts with a partition page worth of metadata and guard
// pages, hence alignment requests ==PartitionPageSize() will be
// automatically satisfied. Padding is needed for higher-order alignment
// requests. Note, |slot_span_alignment| is at least 1 partition page.
const size_t padding_for_alignment =
slot_span_alignment - PartitionPageSize();
const size_t reservation_size =
PartitionRoot<thread_safe>::GetDirectMapReservationSize(
raw_size + padding_for_alignment);
#if BUILDFLAG(PA_DCHECK_IS_ON)
const size_t available_reservation_size =
reservation_size - padding_for_alignment -
PartitionRoot<thread_safe>::GetDirectMapMetadataAndGuardPagesSize();
PA_DCHECK(slot_size <= available_reservation_size);
#endif
pool_handle pool = root->ChoosePool();
uintptr_t reservation_start;
{
// Reserving memory from the pool is actually not a syscall on 64 bit
// platforms.
#if !BUILDFLAG(HAS_64_BIT_POINTERS)
ScopedSyscallTimer timer{root};
#endif
reservation_start = ReserveMemoryFromPool(pool, 0, reservation_size);
}
if (PA_UNLIKELY(!reservation_start)) {
if (return_null) {
return nullptr;
}
PartitionOutOfMemoryMappingFailure(root, reservation_size);
}
root->total_size_of_direct_mapped_pages.fetch_add(
reservation_size, std::memory_order_relaxed);
// Shift by 1 partition page (metadata + guard pages) and alignment padding.
const uintptr_t slot_start =
reservation_start + PartitionPageSize() + padding_for_alignment;
{
ScopedSyscallTimer timer{root};
RecommitSystemPages(reservation_start + SystemPageSize(),
SystemPageSize(),
#if PA_CONFIG(ENABLE_SHADOW_METADATA)
root->PageAccessibilityWithPkeyIfEnabled(
PageAccessibilityConfiguration::kRead),
#else
root->PageAccessibilityWithPkeyIfEnabled(
PageAccessibilityConfiguration::kReadWrite),
#endif
PageAccessibilityDisposition::kRequireUpdate);
}
#if BUILDFLAG(PUT_REF_COUNT_IN_PREVIOUS_SLOT)
// If PUT_REF_COUNT_IN_PREVIOUS_SLOT is on, and if the BRP pool is
// used, allocate a SystemPage for RefCount "bitmap" (only one of its
// elements will be used).
if (pool == kBRPPoolHandle) {
ScopedSyscallTimer timer{root};
RecommitSystemPages(reservation_start + SystemPageSize() * 2,
SystemPageSize(),
root->PageAccessibilityWithPkeyIfEnabled(
PageAccessibilityConfiguration::kReadWrite),
PageAccessibilityDisposition::kRequireUpdate);
}
#endif
#if PA_CONFIG(ENABLE_SHADOW_METADATA)
{
ScopedSyscallTimer timer{root};
RecommitSystemPages(ShadowMetadataStart(reservation_start, pool),
SystemPageSize(),
root->PageAccessibilityWithPkeyIfEnabled(
PageAccessibilityConfiguration::kReadWrite),
PageAccessibilityDisposition::kRequireUpdate);
}
#endif
// No need to hold root->lock_. Now that memory is reserved, no other
// overlapping region can be allocated (because of how pools work),
// so no other thread can update the same offset table entries at the
// same time. Furthermore, nobody will be reading these offsets until this
// function returns.
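// E.g. a direct map spanning 3 super pages writes offsets 0, 1 and 2 into
// consecutive reservation offset table entries below, so any address within
// the reservation can be traced back to |reservation_start|.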
uintptr_t address_start = reservation_start;
uintptr_t address_end = address_start + reservation_size;
auto* offset_ptr = ReservationOffsetPointer(address_start);
uint16_t offset = 0;
while (address_start < address_end) {
PA_DCHECK(offset_ptr < GetReservationOffsetTableEnd(address_start));
PA_DCHECK(offset < kOffsetTagNormalBuckets);
*offset_ptr++ = offset++;
address_start += kSuperPageSize;
}
auto* super_page_extent =
PartitionSuperPageToExtent<thread_safe>(reservation_start);
super_page_extent->root = root;
// The new structures are all located inside a fresh system page so they
// will all be zeroed out. These DCHECKs are for documentation and to assert
// our expectations of the kernel.
PA_DCHECK(!super_page_extent->number_of_consecutive_super_pages);
PA_DCHECK(!super_page_extent->next);
PartitionPage<thread_safe>* first_page =
reinterpret_cast<PartitionPage<thread_safe>*>(super_page_extent) + 1;
page = PartitionPage<thread_safe>::FromAddr(slot_start);
// |first_page| and |page| may be equal, if there is no alignment padding.
if (page != first_page) {
PA_DCHECK(page > first_page);
PA_DCHECK(page - first_page <=
PartitionPage<thread_safe>::kMaxSlotSpanMetadataOffset);
PA_CHECK(!first_page->is_valid);
first_page->has_valid_span_after_this = true;
first_page->slot_span_metadata_offset = page - first_page;
}
auto* metadata =
reinterpret_cast<PartitionDirectMapMetadata<thread_safe>*>(page);
// Since direct map metadata is larger than PartitionPage, make sure the
// first and the last bytes are on the same system page, i.e. within the
// super page metadata region.
PA_DCHECK(base::bits::AlignDown(reinterpret_cast<uintptr_t>(metadata),
SystemPageSize()) ==
base::bits::AlignDown(
reinterpret_cast<uintptr_t>(metadata) +
sizeof(PartitionDirectMapMetadata<thread_safe>) - 1,
SystemPageSize()));
PA_DCHECK(page == &metadata->page);
page->is_valid = true;
PA_DCHECK(!page->has_valid_span_after_this);
PA_DCHECK(!page->slot_span_metadata_offset);
PA_DCHECK(!page->slot_span_metadata.next_slot_span);
PA_DCHECK(!page->slot_span_metadata.marked_full);
PA_DCHECK(!page->slot_span_metadata.num_allocated_slots);
PA_DCHECK(!page->slot_span_metadata.num_unprovisioned_slots);
PA_DCHECK(!page->slot_span_metadata.in_empty_cache());
PA_DCHECK(!metadata->subsequent_page.subsequent_page_metadata.raw_size);
// Raw size is set later, by the caller.
metadata->subsequent_page.slot_span_metadata_offset = 1;
PA_DCHECK(!metadata->bucket.active_slot_spans_head);
PA_DCHECK(!metadata->bucket.empty_slot_spans_head);
PA_DCHECK(!metadata->bucket.decommitted_slot_spans_head);
PA_DCHECK(!metadata->bucket.num_system_pages_per_slot_span);
PA_DCHECK(!metadata->bucket.num_full_slot_spans);
metadata->bucket.slot_size = slot_size;
new (&page->slot_span_metadata)
SlotSpanMetadata<thread_safe>(&metadata->bucket);
// It is typically possible to map a large range of inaccessible pages, and
// this is leveraged in multiple places, including the pools. However,
// this doesn't mean that we can commit all this memory. For the vast
// majority of allocations, this just means that we crash in a slightly
// different place, but for callers ready to handle failures, we have to
// return nullptr. See crbug.com/1187404.
//
// Note that we didn't check above, because if we cannot even commit a
// single page, then this is likely hopeless anyway, and we will crash very
// soon.
const bool ok = root->TryRecommitSystemPagesForData(
slot_start, slot_size, PageAccessibilityDisposition::kRequireUpdate);
if (!ok) {
if (!return_null) {
PartitionOutOfMemoryCommitFailure(root, slot_size);
}
{
ScopedSyscallTimer timer{root};
#if !BUILDFLAG(HAS_64_BIT_POINTERS)
AddressPoolManager::GetInstance().MarkUnused(pool, reservation_start,
reservation_size);
#endif
AddressPoolManager::GetInstance().UnreserveAndDecommit(
pool, reservation_start, reservation_size);
}
root->total_size_of_direct_mapped_pages.fetch_sub(
reservation_size, std::memory_order_relaxed);
return nullptr;
}
auto* next_entry = PartitionFreelistEntry::EmplaceAndInitNull(slot_start);
page->slot_span_metadata.SetFreelistHead(next_entry);
map_extent = &metadata->direct_map_extent;
map_extent->reservation_size = reservation_size;
map_extent->padding_for_alignment = padding_for_alignment;
map_extent->bucket = &metadata->bucket;
}
root->lock_.AssertAcquired();
// Maintain the doubly-linked list of all direct mappings.
map_extent->next_extent = root->direct_map_list;
if (map_extent->next_extent) {
map_extent->next_extent->prev_extent = map_extent;
}
map_extent->prev_extent = nullptr;
root->direct_map_list = map_extent;
return &page->slot_span_metadata;
}
uint8_t ComputeSystemPagesPerSlotSpanPreferSmall(size_t slot_size) {
if (slot_size > MaxRegularSlotSpanSize()) {
// This is technically not needed, as for now all the larger slot sizes are
// multiples of the system page size.
return base::bits::AlignUp(slot_size, SystemPageSize()) / SystemPageSize();
}
// Smaller slot spans waste less address space, as well as potentially lower
// fragmentation:
// - Address space: This comes from fuller SuperPages (since the tail end of a
//   SuperPage is more likely to be used when the slot span is smaller). Also,
// if a slot span is partially used, a smaller slot span will use less
// address space.
// - In-slot fragmentation: Slot span management code will prioritize
// almost-full slot spans, as well as trying to keep empty slot spans
//   empty. The more granular this logic can be, the better.
//
// Since metadata space overhead is constant per-PartitionPage, keeping
// smaller slot spans makes sense.
//
// Underlying memory allocation is done per-PartitionPage, but memory commit
// is done per system page. This means that we prefer to fill the entirety of
// a PartitionPage with a slot span, but we can tolerate some system pages
// being empty at the end, as these will not cost committed or dirty memory.
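// E.g. (assuming 4 KiB system pages and 16 KiB partition pages) a slot span
// that needs only 5 system pages still consumes 2 PartitionPages (8 system
// pages) of address space, but only those 5 pages ever get committed.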
//
// The choice below is, for multi-slot slot spans:
// - If a full PartitionPage slot span is possible with less than 2% of a
// *single* system page wasted, use it. The smallest possible size wins.
// - Otherwise, select the size with the smallest virtual address space
// loss. Allow a SlotSpan to leave some slack in its PartitionPage, up to
// 1/4 of the total.
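//
// Illustrative example (assuming 4 KiB system pages and 16 KiB partition
// pages): for slot_size == 96, a single-PartitionPage slot span wastes
// 16384 % 96 == 64 bytes, which is under 2% of a system page (~82 bytes), so
// the first loop below settles on 4 system pages.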
for (size_t partition_page_count = 1;
partition_page_count <= kMaxPartitionPagesPerRegularSlotSpan;
partition_page_count++) {
size_t candidate_size = partition_page_count * PartitionPageSize();
size_t waste = candidate_size % slot_size;
if (waste <= .02 * SystemPageSize()) {
return partition_page_count * NumSystemPagesPerPartitionPage();
}
}
size_t best_count = 0;
size_t best_waste = std::numeric_limits<size_t>::max();
for (size_t partition_page_count = 1;
partition_page_count <= kMaxPartitionPagesPerRegularSlotSpan;
partition_page_count++) {
// Prefer no slack.
for (size_t slack = 0; slack < partition_page_count; slack++) {
size_t system_page_count =
partition_page_count * NumSystemPagesPerPartitionPage() - slack;
size_t candidate_size = system_page_count * SystemPageSize();
size_t waste = candidate_size % slot_size;
if (waste < best_waste) {
best_waste = waste;
best_count = system_page_count;
}
}
}
return best_count;
}
uint8_t ComputeSystemPagesPerSlotSpanInternal(size_t slot_size) {
// This works out reasonably for the current bucket sizes of the generic
// allocator, and the current values of partition page size and constants.
// Specifically, we have enough room to always pack the slots perfectly into
// some number of system pages. The only waste is the waste associated with
// unfaulted pages (i.e. wasted address space).
// TODO: we end up using a lot of system pages for very small sizes. For
// example, we'll use 12 system pages for slot size 24. The slot size is so
// small that the waste would be tiny with just 4, or 1, system pages. Later,
// we can investigate whether there are anti-fragmentation benefits to using
// fewer system pages.
double best_waste_ratio = 1.0f;
uint16_t best_pages = 0;
if (slot_size > MaxRegularSlotSpanSize()) {
// TODO(ajwong): Why is there a DCHECK here for this?
// http://crbug.com/776537
PA_DCHECK(!(slot_size % SystemPageSize()));
best_pages = static_cast<uint16_t>(slot_size >> SystemPageShift());
PA_CHECK(best_pages <= std::numeric_limits<uint8_t>::max());
return static_cast<uint8_t>(best_pages);
}
PA_DCHECK(slot_size <= MaxRegularSlotSpanSize());
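// Illustrative example (assuming 4 KiB system pages and 8-byte pointers): for
// slot_size == 320 and i == 5, |page_size| == 20480 holds exactly 64 slots, so
// the packing waste is 0; the 3 unfaulted trailing system pages of the second
// partition page add 3 * sizeof(void*) == 24 bytes, i.e. a waste ratio of
// ~0.1%. The loop keeps whichever i yields the lowest ratio.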
for (uint16_t i = NumSystemPagesPerPartitionPage() - 1;
i <= MaxSystemPagesPerRegularSlotSpan(); ++i) {
size_t page_size = i << SystemPageShift();
size_t num_slots = page_size / slot_size;
size_t waste = page_size - (num_slots * slot_size);
// Leaving a page unfaulted is not free; the page will occupy an empty page
// table entry. Make a simple attempt to account for that.
//
// TODO(ajwong): This looks wrong. PTEs are allocated for all pages
// regardless of whether or not they are wasted. Should it just
// be waste += i * sizeof(void*)?
// http://crbug.com/776537
size_t num_remainder_pages = i & (NumSystemPagesPerPartitionPage() - 1);
size_t num_unfaulted_pages =
num_remainder_pages
? (NumSystemPagesPerPartitionPage() - num_remainder_pages)
: 0;
waste += sizeof(void*) * num_unfaulted_pages;
double waste_ratio =
static_cast<double>(waste) / static_cast<double>(page_size);
if (waste_ratio < best_waste_ratio) {
best_waste_ratio = waste_ratio;
best_pages = i;
}
}
PA_DCHECK(best_pages > 0);
PA_CHECK(best_pages <= MaxSystemPagesPerRegularSlotSpan());
return static_cast<uint8_t>(best_pages);
}
} // namespace
uint8_t ComputeSystemPagesPerSlotSpan(size_t slot_size,
bool prefer_smaller_slot_spans) {
if (prefer_smaller_slot_spans) {
size_t system_page_count =
ComputeSystemPagesPerSlotSpanPreferSmall(slot_size);
size_t waste = (system_page_count * SystemPageSize()) % slot_size;
// In case the waste is too large (more than 5% of a page), don't try to use
// the "small" slot span formula. This happens when we have a lot of
// buckets; in some cases the formula doesn't find a nice, small size.
if (waste <= .05 * SystemPageSize()) {
return system_page_count;
}
}
return ComputeSystemPagesPerSlotSpanInternal(slot_size);
}
template <bool thread_safe>
void PartitionBucket<thread_safe>::Init(uint32_t new_slot_size) {
slot_size = new_slot_size;
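// Precompute a fixed-point reciprocal of |slot_size| so that code mapping an
// offset to a slot index can use a multiply-and-shift instead of an integer
// division; the +1 compensates for the truncating division, guaranteeing the
// reciprocal is never too small.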
slot_size_reciprocal = kReciprocalMask / new_slot_size + 1;
active_slot_spans_head =
SlotSpanMetadata<thread_safe>::get_sentinel_slot_span_non_const();
empty_slot_spans_head = nullptr;
decommitted_slot_spans_head = nullptr;
num_full_slot_spans = 0;
bool prefer_smaller_slot_spans =
#if PA_CONFIG(PREFER_SMALLER_SLOT_SPANS)
true
#else
false
#endif
;
num_system_pages_per_slot_span =
ComputeSystemPagesPerSlotSpan(slot_size, prefer_smaller_slot_spans);
}
template <bool thread_safe>
PA_ALWAYS_INLINE SlotSpanMetadata<thread_safe>*
PartitionBucket<thread_safe>::AllocNewSlotSpan(PartitionRoot<thread_safe>* root,
unsigned int flags,
size_t slot_span_alignment) {
PA_DCHECK(!(root->next_partition_page % PartitionPageSize()));
PA_DCHECK(!(root->next_partition_page_end % PartitionPageSize()));
size_t num_partition_pages = get_pages_per_slot_span();
size_t slot_span_reservation_size = num_partition_pages
<< PartitionPageShift();
size_t slot_span_committed_size = get_bytes_per_span();
PA_DCHECK(num_partition_pages <= NumPartitionPagesPerSuperPage());
PA_DCHECK(slot_span_committed_size % SystemPageSize() == 0);
PA_DCHECK(slot_span_committed_size <= slot_span_reservation_size);
uintptr_t adjusted_next_partition_page =
base::bits::AlignUp(root->next_partition_page, slot_span_alignment);
if (PA_UNLIKELY(adjusted_next_partition_page + slot_span_reservation_size >
root->next_partition_page_end)) {
// AllocNewSuperPage() may crash (e.g. address space exhaustion), so put data
// on the stack.
PA_DEBUG_DATA_ON_STACK("slotsize", slot_size);
PA_DEBUG_DATA_ON_STACK("spansize", slot_span_reservation_size);
// In this case, we can no longer hand out pages from the current super page
// allocation. Get a new super page.
if (!AllocNewSuperPage(root, flags)) {
return nullptr;
}
// AllocNewSuperPage() updates root->next_partition_page, re-query.
adjusted_next_partition_page =
base::bits::AlignUp(root->next_partition_page, slot_span_alignment);
PA_CHECK(adjusted_next_partition_page + slot_span_reservation_size <=
root->next_partition_page_end);
}
auto* gap_start_page =
PartitionPage<thread_safe>::FromAddr(root->next_partition_page);
auto* gap_end_page =
PartitionPage<thread_safe>::FromAddr(adjusted_next_partition_page);
for (auto* page = gap_start_page; page < gap_end_page; ++page) {
PA_DCHECK(!page->is_valid);
page->has_valid_span_after_this = 1;
}
root->next_partition_page =
adjusted_next_partition_page + slot_span_reservation_size;
uintptr_t slot_span_start = adjusted_next_partition_page;
auto* slot_span = &gap_end_page->slot_span_metadata;
InitializeSlotSpan(slot_span);
// Now that slot span is initialized, it's safe to call FromSlotStart.
PA_DCHECK(slot_span ==
SlotSpanMetadata<thread_safe>::FromSlotStart(slot_span_start));
// System pages in the super page come in a decommitted state. Commit them
// before vending them back.
// If lazy commit is enabled, pages will be committed when provisioning slots,
// in ProvisionMoreSlotsAndAllocOne(), not here.
if (!kUseLazyCommit) {
PA_DEBUG_DATA_ON_STACK("slotsize", slot_size);
PA_DEBUG_DATA_ON_STACK("spansize", slot_span_reservation_size);
PA_DEBUG_DATA_ON_STACK("spancmt", slot_span_committed_size);
root->RecommitSystemPagesForData(
slot_span_start, slot_span_committed_size,
PageAccessibilityDisposition::kRequireUpdate);
}
PA_CHECK(get_slots_per_span() <=
SlotSpanMetadata<ThreadSafe>::kMaxSlotsPerSlotSpan);
// Double check that we had enough space in the super page for the new slot
// span.
PA_DCHECK(root->next_partition_page <= root->next_partition_page_end);
return slot_span;
}
template <bool thread_safe>
uintptr_t PartitionBucket<thread_safe>::AllocNewSuperPageSpan(
PartitionRoot<thread_safe>* root,
size_t super_page_count,
unsigned int flags) {
PA_CHECK(super_page_count > 0);
PA_CHECK(super_page_count <=
std::numeric_limits<size_t>::max() / kSuperPageSize);
// Need a new super page. We want to allocate super pages in a contiguous
// address region as much as possible. This is important for not causing
// page table bloat and not fragmenting address spaces in 32 bit
// architectures.
uintptr_t requested_address = root->next_super_page;
pool_handle pool = root->ChoosePool();
uintptr_t super_page_span_start = ReserveMemoryFromPool(
pool, requested_address, super_page_count * kSuperPageSize);
if (PA_UNLIKELY(!super_page_span_start)) {
if (flags & AllocFlags::kReturnNull) {
return 0;
}
// Didn't manage to get a new uncommitted super page -> address space issue.
::partition_alloc::internal::ScopedUnlockGuard unlock{root->lock_};
PartitionOutOfMemoryMappingFailure(root, kSuperPageSize);
}
uintptr_t super_page_span_end =
super_page_span_start + super_page_count * kSuperPageSize;
for (uintptr_t super_page = super_page_span_start;
super_page < super_page_span_end; super_page += kSuperPageSize) {
InitializeSuperPage(root, super_page, 0);
}
return super_page_span_start;
}
template <bool thread_safe>
PA_ALWAYS_INLINE uintptr_t PartitionBucket<thread_safe>::AllocNewSuperPage(
PartitionRoot<thread_safe>* root,
unsigned int flags) {
auto super_page = AllocNewSuperPageSpan(root, 1, flags);
if (PA_UNLIKELY(!super_page)) {
// If the `kReturnNull` flag isn't set and the allocation attempt fails,
// `AllocNewSuperPageSpan` should've failed with an OOM crash.
PA_DCHECK(flags & AllocFlags::kReturnNull);
return 0;
}
return SuperPagePayloadBegin(super_page, root->IsQuarantineAllowed());
}
template <bool thread_safe>
PA_ALWAYS_INLINE uintptr_t PartitionBucket<thread_safe>::InitializeSuperPage(
PartitionRoot<thread_safe>* root,
uintptr_t super_page,
uintptr_t requested_address) {
*ReservationOffsetPointer(super_page) = kOffsetTagNormalBuckets;
root->total_size_of_super_pages.fetch_add(kSuperPageSize,
std::memory_order_relaxed);
root->next_super_page = super_page + kSuperPageSize;
uintptr_t state_bitmap =
super_page + PartitionPageSize() +
(is_direct_mapped() ? 0 : ReservedFreeSlotBitmapSize());
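// Layout at the front of the super page: one partition page of guard pages
// and metadata, then (for normal buckets) the reserved free-slot bitmap, then
// the state bitmap (when quarantine is allowed), and finally the payload.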
#if BUILDFLAG(USE_STARSCAN)
PA_DCHECK(SuperPageStateBitmapAddr(super_page) == state_bitmap);
const size_t state_bitmap_reservation_size =
root->IsQuarantineAllowed() ? ReservedStateBitmapSize() : 0;
const size_t state_bitmap_size_to_commit =
root->IsQuarantineAllowed() ? CommittedStateBitmapSize() : 0;
PA_DCHECK(state_bitmap_reservation_size % PartitionPageSize() == 0);
PA_DCHECK(state_bitmap_size_to_commit % SystemPageSize() == 0);
PA_DCHECK(state_bitmap_size_to_commit <= state_bitmap_reservation_size);
uintptr_t payload = state_bitmap + state_bitmap_reservation_size;
#else
uintptr_t payload = state_bitmap;
#endif // BUILDFLAG(USE_STARSCAN)
root->next_partition_page = payload;
root->next_partition_page_end = root->next_super_page - PartitionPageSize();
PA_DCHECK(payload ==
SuperPagePayloadBegin(super_page, root->IsQuarantineAllowed()));
PA_DCHECK(root->next_partition_page_end == SuperPagePayloadEnd(super_page));
// Keep the first partition page in the super page inaccessible to serve as a
// guard page, except for an "island" in the middle where we put page metadata and
// also a tiny amount of extent metadata.
{
ScopedSyscallTimer timer{root};
RecommitSystemPages(super_page + SystemPageSize(), SystemPageSize(),
#if PA_CONFIG(ENABLE_SHADOW_METADATA)
root->PageAccessibilityWithPkeyIfEnabled(
PageAccessibilityConfiguration::kRead),
#else
root->PageAccessibilityWithPkeyIfEnabled(
PageAccessibilityConfiguration::kReadWrite),
#endif
PageAccessibilityDisposition::kRequireUpdate);
}
#if BUILDFLAG(PUT_REF_COUNT_IN_PREVIOUS_SLOT)
// If PUT_REF_COUNT_IN_PREVIOUS_SLOT is on, and if the BRP pool is
// used, allocate a SystemPage for RefCount bitmap.
if (root->ChoosePool() == kBRPPoolHandle) {
ScopedSyscallTimer timer{root};
RecommitSystemPages(super_page + SystemPageSize() * 2, SystemPageSize(),
root->PageAccessibilityWithPkeyIfEnabled(
PageAccessibilityConfiguration::kReadWrite),
PageAccessibilityDisposition::kRequireUpdate);
}
#endif
#if PA_CONFIG(ENABLE_SHADOW_METADATA)
{
ScopedSyscallTimer timer{root};
RecommitSystemPages(ShadowMetadataStart(super_page, root->ChoosePool()),
SystemPageSize(),
root->PageAccessibilityWithPkeyIfEnabled(
PageAccessibilityConfiguration::kReadWrite),
PageAccessibilityDisposition::kRequireUpdate);
}
#endif
// If we were after a specific address, but didn't get it, assume that
// the system chose a lousy address. Here most OSes have a default
// algorithm that isn't randomized. For example, most Linux
// distributions will allocate the mapping directly before the last
// successful mapping, which is far from random. So we just get fresh
// randomness for the next mapping attempt.
if (requested_address && requested_address != super_page) {
root->next_super_page = 0;
}
// We allocated a new super page so update super page metadata.
// First check if this is a new extent or not.
auto* latest_extent = PartitionSuperPageToExtent<thread_safe>(super_page);
// By storing the root in every extent metadata object, we have a fast way
// to go from a pointer within the partition to the root object.
latest_extent->root = root;
// Most new extents will be part of a larger extent, and these two fields
// are unused, but we initialize them to 0 so that we get a clear signal
// in case they are accidentally used.
latest_extent->number_of_consecutive_super_pages = 0;
latest_extent->next = nullptr;
latest_extent->number_of_nonempty_slot_spans = 0;
PartitionSuperPageExtentEntry<thread_safe>* current_extent =
root->current_extent;
const bool is_new_extent = super_page != requested_address;
if (PA_UNLIKELY(is_new_extent)) {
if (PA_UNLIKELY(!current_extent)) {
PA_DCHECK(!root->first_extent);
root->first_extent = latest_extent;
} else {
PA_DCHECK(current_extent->number_of_consecutive_super_pages);
current_extent->next = latest_extent;
}
root->current_extent = latest_extent;
latest_extent->number_of_consecutive_super_pages = 1;
} else {
// We allocated next to an existing extent so just nudge the size up a
// little.
PA_DCHECK(current_extent->number_of_consecutive_super_pages);
++current_extent->number_of_consecutive_super_pages;
PA_DCHECK(payload > SuperPagesBeginFromExtent(current_extent) &&
payload < SuperPagesEndFromExtent(current_extent));
}
// If PCScan is used, commit the state bitmap. Otherwise, leave it uncommitted
// and let PartitionRoot::RegisterScannableRoot() commit it when needed. Make
// sure to register the super-page after it has been fully initialized.
// Otherwise, the concurrent scanner may try to access |extent->root|, which
// might not be initialized yet.
#if BUILDFLAG(USE_STARSCAN)
if (root->IsQuarantineEnabled()) {
{
ScopedSyscallTimer timer{root};
RecommitSystemPages(state_bitmap, state_bitmap_size_to_commit,
root->PageAccessibilityWithPkeyIfEnabled(
PageAccessibilityConfiguration::kReadWrite),
PageAccessibilityDisposition::kRequireUpdate);
}
PCScan::RegisterNewSuperPage(root, super_page);
}
#endif // BUILDFLAG(USE_STARSCAN)
#if BUILDFLAG(USE_FREESLOT_BITMAP)
// Commit the pages for freeslot bitmap.
if (!is_direct_mapped()) {
uintptr_t freeslot_bitmap_addr = super_page + PartitionPageSize();
PA_DCHECK(SuperPageFreeSlotBitmapAddr(super_page) == freeslot_bitmap_addr);
ScopedSyscallTimer timer{root};
RecommitSystemPages(freeslot_bitmap_addr, CommittedFreeSlotBitmapSize(),
root->PageAccessibilityWithPkeyIfEnabled(
PageAccessibilityConfiguration::kReadWrite),
PageAccessibilityDisposition::kRequireUpdate);
}
#endif
return payload;
}
template <bool thread_safe>
PA_ALWAYS_INLINE void PartitionBucket<thread_safe>::InitializeSlotSpan(
SlotSpanMetadata<thread_safe>* slot_span) {
new (slot_span) SlotSpanMetadata<thread_safe>(this);
slot_span->Reset();
uint16_t num_partition_pages = get_pages_per_slot_span();
auto* page = reinterpret_cast<PartitionPage<thread_safe>*>(slot_span);
for (uint16_t i = 0; i < num_partition_pages; ++i, ++page) {
PA_DCHECK(i <= PartitionPage<thread_safe>::kMaxSlotSpanMetadataOffset);
page->slot_span_metadata_offset = i;
page->is_valid = true;
}
}
template <bool thread_safe>
PA_ALWAYS_INLINE uintptr_t
PartitionBucket<thread_safe>::ProvisionMoreSlotsAndAllocOne(
PartitionRoot<thread_safe>* root,
SlotSpanMetadata<thread_safe>* slot_span) {
PA_DCHECK(slot_span !=
SlotSpanMetadata<thread_safe>::get_sentinel_slot_span());
size_t num_slots = slot_span->num_unprovisioned_slots;
PA_DCHECK(num_slots);
PA_DCHECK(num_slots <= get_slots_per_span());
// We should only get here when _every_ slot is either used or unprovisioned.
// (The third possible state is "on the freelist". If we have a non-empty
// freelist, we should not get here.)
PA_DCHECK(num_slots + slot_span->num_allocated_slots == get_slots_per_span());
// Similarly, explicitly make sure that the freelist is empty.
PA_DCHECK(!slot_span->get_freelist_head());
PA_DCHECK(!slot_span->is_full());
uintptr_t slot_span_start =
SlotSpanMetadata<thread_safe>::ToSlotSpanStart(slot_span);
// If we got here, the first unallocated slot is either partially or fully on
// an uncommitted page. If the latter, it must be at the start of that page.
uintptr_t return_slot =
slot_span_start + (slot_size * slot_span->num_allocated_slots);
uintptr_t next_slot = return_slot + slot_size;
uintptr_t commit_start = base::bits::AlignUp(return_slot, SystemPageSize());
PA_DCHECK(next_slot > commit_start);
uintptr_t commit_end = base::bits::AlignUp(next_slot, SystemPageSize());
// If the slot was partially committed, |return_slot| and |next_slot| fall
// in different pages. If the slot was fully uncommitted, |return_slot| points
// to the page start and |next_slot| doesn't, thus only the latter gets
// rounded up.
PA_DCHECK(commit_end > commit_start);
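// Illustrative example (assuming 4 KiB system pages): with slot_size == 320
// and 51 slots already allocated, |return_slot| is at offset 16320 and
// |next_slot| at offset 16640 from |slot_span_start|, so |commit_start| ==
// 16384 and |commit_end| == 20480: exactly one new system page is committed,
// and the returned slot straddles the old and the new page.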
// The slot being returned is considered allocated.
slot_span->num_allocated_slots++;
// Round down, because a slot that doesn't fully fit in the new page(s) isn't
// provisioned.
size_t slots_to_provision = (commit_end - return_slot) / slot_size;
slot_span->num_unprovisioned_slots -= slots_to_provision;
PA_DCHECK(slot_span->num_allocated_slots +
slot_span->num_unprovisioned_slots <=
get_slots_per_span());
// If lazy commit is enabled, meaning system pages in the slot span come
// in an initially decommitted state, commit them here.
// Note, we can't use PageAccessibilityDisposition::kAllowKeepForPerf, because
// we have no knowledge which pages have been committed before (it doesn't
// matter on Windows anyway).
if (kUseLazyCommit) {
// TODO(lizeb): Handle commit failure.
root->RecommitSystemPagesForData(
commit_start, commit_end - commit_start,
PageAccessibilityDisposition::kRequireUpdate);
}
if (PA_LIKELY(slot_size <= kMaxMemoryTaggingSize &&
root->IsMemoryTaggingEnabled())) {
// Ensure the MTE-tag of the memory pointed to by |return_slot| is unguessable.
TagMemoryRangeRandomly(return_slot, slot_size);
}
// Add all slots that fit within the pages committed so far to the free list.
PartitionFreelistEntry* prev_entry = nullptr;
uintptr_t next_slot_end = next_slot + slot_size;
size_t free_list_entries_added = 0;
while (next_slot_end <= commit_end) {
void* next_slot_ptr;
if (PA_LIKELY(slot_size <= kMaxMemoryTaggingSize)) {
// Ensure the MTE-tag of the memory pointed to by each other provisioned slot
// is unguessable. The slots will be returned to the app as is, and the MTE-tag
// will only change upon calling Free().
next_slot_ptr = TagMemoryRangeRandomly(next_slot, slot_size);
} else {
// No MTE-tagging for larger slots, just cast.
next_slot_ptr = reinterpret_cast<void*>(next_slot);
}
auto* entry = PartitionFreelistEntry::EmplaceAndInitNull(next_slot_ptr);
if (!slot_span->get_freelist_head()) {
PA_DCHECK(!prev_entry);
PA_DCHECK(!free_list_entries_added);
slot_span->SetFreelistHead(entry);
} else {
PA_DCHECK(free_list_entries_added);
prev_entry->SetNext(entry);
}
#if BUILDFLAG(USE_FREESLOT_BITMAP)
FreeSlotBitmapMarkSlotAsFree(next_slot);
#endif
next_slot = next_slot_end;
next_slot_end = next_slot + slot_size;
prev_entry = entry;
#if BUILDFLAG(PA_DCHECK_IS_ON)
free_list_entries_added++;
#endif
}
#if BUILDFLAG(USE_FREESLOT_BITMAP)
FreeSlotBitmapMarkSlotAsFree(return_slot);
#endif
#if BUILDFLAG(PA_DCHECK_IS_ON)
// The only provisioned slot not added to the free list is the one being
// returned.
PA_DCHECK(slots_to_provision == free_list_entries_added + 1);
// We didn't necessarily provision more than one slot (e.g. if |slot_size|
// is large), meaning that |slot_span->freelist_head| can be nullptr.
if (slot_span->get_freelist_head()) {
PA_DCHECK(free_list_entries_added);
slot_span->get_freelist_head()->CheckFreeList(slot_size);
}
#endif
// We had no free slots, and created some (potentially 0) in sorted order.
slot_span->set_freelist_sorted();
return return_slot;
}
template <bool thread_safe>
bool PartitionBucket<thread_safe>::SetNewActiveSlotSpan() {
SlotSpanMetadata<thread_safe>* slot_span = active_slot_spans_head;
if (slot_span == SlotSpanMetadata<thread_safe>::get_sentinel_slot_span()) {
return false;
}
SlotSpanMetadata<thread_safe>* next_slot_span;
// The goal here is to find a suitable slot span in the active list. Suitable
// slot spans are |is_active()|, i.e. they either have (a) freelist entries,
// or (b) unprovisioned free space. The first case is preferable, since it
// doesn't cost a system call, and doesn't cause new memory to become dirty.
//
// While looking for a new slot span, active list maintenance is performed,
// that is:
// - Empty and decommitted slot spans are moved to their respective lists.
// - Full slot spans are removed from the active list but are not moved
// anywhere. They could be tracked in a separate list, but this would
//   increase cost non-trivially. Indeed, a full slot span is likely to become
// non-full at some point (due to a free() hitting it). Since we only have
// space in the metadata for a single linked list pointer, removing the
// newly-non-full slot span from the "full" list would require walking it
// (to know what's before it in the full list).
//
// Since we prefer slot spans with provisioned freelist entries, maintenance
// happens in two stages:
// 1. Walk the list to find candidates. Each of the skipped slot spans is moved
// to either:
// - one of the long-lived lists: empty, decommitted
//   - the temporary "active slot spans with no freelist entry" list
//   - nowhere for full slot spans.
// 2. Once we have a candidate:
// - Set it as the new active list head
// - Reattach the temporary list
//
// Note that in most cases, the whole list will not be walked and maintained
// at this stage.
SlotSpanMetadata<thread_safe>* to_provision_head = nullptr;
SlotSpanMetadata<thread_safe>* to_provision_tail = nullptr;
for (; slot_span; slot_span = next_slot_span) {
next_slot_span = slot_span->next_slot_span;
PA_DCHECK(slot_span->bucket == this);
PA_DCHECK(slot_span != empty_slot_spans_head);
PA_DCHECK(slot_span != decommitted_slot_spans_head);
if (slot_span->is_active()) {
// Has provisioned slots.
if (slot_span->get_freelist_head()) {
// Will use this slot span, no need to go further.
break;
} else {
// Keeping head and tail because we don't want to reverse the list.
if (!to_provision_head) {
to_provision_head = slot_span;
}
if (to_provision_tail) {
to_provision_tail->next_slot_span = slot_span;
}
to_provision_tail = slot_span;
slot_span->next_slot_span = nullptr;
}
} else if (slot_span->is_empty()) {
slot_span->next_slot_span = empty_slot_spans_head;
empty_slot_spans_head = slot_span;
} else if (PA_LIKELY(slot_span->is_decommitted())) {
slot_span->next_slot_span = decommitted_slot_spans_head;
decommitted_slot_spans_head = slot_span;
} else {
PA_DCHECK(slot_span->is_full());
// Move this slot span... nowhere, and also mark it as full. We need it
// marked so that free'ing can tell, and move it back into the active
// list.
slot_span->marked_full = 1;
++num_full_slot_spans;
// Overflow. Most likely a correctness issue in the code. It is in theory
// possible that the number of full slot spans really reaches (1 << 24),
// but this is very unlikely (and not possible with most pool settings).
PA_CHECK(num_full_slot_spans);
// Not necessary but might help stop accidents.
slot_span->next_slot_span = nullptr;
}
}
bool usable_active_list_head = false;
// Found an active slot span with provisioned entries on the freelist.
if (slot_span) {
usable_active_list_head = true;
// We have active slot spans with unprovisioned entries. Re-attach them into
// the active list, past the span with freelist entries.
if (to_provision_head) {
auto* next = slot_span->next_slot_span;
slot_span->next_slot_span = to_provision_head;
to_provision_tail->next_slot_span = next;
}
active_slot_spans_head = slot_span;
} else if (to_provision_head) {
usable_active_list_head = true;
// Need to provision new slots.
active_slot_spans_head = to_provision_head;
} else {
// Active list is now empty.
active_slot_spans_head =
SlotSpanMetadata<thread_safe>::get_sentinel_slot_span_non_const();
}
return usable_active_list_head;
}
template <bool thread_safe>
void PartitionBucket<thread_safe>::MaintainActiveList() {
SlotSpanMetadata<thread_safe>* slot_span = active_slot_spans_head;
if (slot_span == SlotSpanMetadata<thread_safe>::get_sentinel_slot_span()) {
return;
}
SlotSpanMetadata<thread_safe>* new_active_slot_spans_head = nullptr;
SlotSpanMetadata<thread_safe>* new_active_slot_spans_tail = nullptr;
SlotSpanMetadata<thread_safe>* next_slot_span;
for (; slot_span; slot_span = next_slot_span) {
next_slot_span = slot_span->next_slot_span;
if (slot_span->is_active()) {
// Ordering in the active slot span list matters, don't reverse it.
if (!new_active_slot_spans_head) {
new_active_slot_spans_head = slot_span;
}
if (new_active_slot_spans_tail) {
new_active_slot_spans_tail->next_slot_span = slot_span;
}
new_active_slot_spans_tail = slot_span;
slot_span->next_slot_span = nullptr;
} else if (slot_span->is_empty()) {
// For the empty and decommitted lists, LIFO ordering makes sense (since
// it would lead to reusing memory which has been touched relatively
// recently, which only matters for committed spans though).
slot_span->next_slot_span = empty_slot_spans_head;
empty_slot_spans_head = slot_span;
} else if (slot_span->is_decommitted()) {
slot_span->next_slot_span = decommitted_slot_spans_head;
decommitted_slot_spans_head = slot_span;
} else {
// Full slot spans are not tracked, just accounted for.
PA_DCHECK(slot_span->is_full());
slot_span->marked_full = 1;
++num_full_slot_spans;
PA_CHECK(num_full_slot_spans); // Overflow.
slot_span->next_slot_span = nullptr;
}
}
if (!new_active_slot_spans_head) {
new_active_slot_spans_head =
SlotSpanMetadata<thread_safe>::get_sentinel_slot_span_non_const();
}
active_slot_spans_head = new_active_slot_spans_head;
}
template <bool thread_safe>
void PartitionBucket<thread_safe>::SortSlotSpanFreelists() {
for (auto* slot_span = active_slot_spans_head; slot_span;
slot_span = slot_span->next_slot_span) {
// No need to sort the freelist if it's already sorted. Note that if the
// freelist is sorted, this means that it didn't change at all since the
// last call. This may be a good signal to shrink it if possible (if an
// entire OS page is free, we can decommit it).
//
// Besides saving CPU, this also avoids touching memory of fully idle slot
// spans, which may require paging.
if (slot_span->num_allocated_slots > 0 &&
!slot_span->freelist_is_sorted()) {
slot_span->SortFreelist();
}
}
}
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
bool CompareSlotSpans(SlotSpanMetadata<ThreadSafe>* a,
SlotSpanMetadata<ThreadSafe>* b) {
auto criteria_tuple = [](SlotSpanMetadata<ThreadSafe> const* a) {
size_t freelist_length = a->GetFreelistLength();
// The criteria are, in order (hence the lexicographic comparison below):
// 1. Prefer slot spans with freelist entries. The ones without freelist
// entries would be skipped in SetNewActiveSlotSpan() anyway.
// 2. Then the ones with the fewest freelist entries. They are either close
// to being full (for the provisioned memory), or close to being pushed
// at the end of the list (since they would not have freelist entries
// anymore, and would either fall into the first case, or be skipped by
// SetNewActiveSlotSpan()).
// 3. The ones with the fewest unprovisioned slots, meaning that they are
// close to being completely full.
//
// Note that this sorting order is not necessarily the best one when slot
// spans are partially provisioned. From local testing, in steady-state,
// most slot spans are entirely provisioned (or decommitted), which may be a
// consequence of the lack of partial slot span decommit, or of fairly
// effective fragmentation avoidance heuristics. Make sure to evaluate
// whether an alternative sorting order (sorting according to freelist size
// + unprovisioned slots) makes more sense.
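// E.g. a span with 1 freelist entry sorts before one with 3 entries, and both
// sort before any span whose freelist is empty, regardless of their
// unprovisioned slot counts.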
return std::tuple<bool, size_t, size_t>{
freelist_length == 0, freelist_length, a->num_unprovisioned_slots};
};
return criteria_tuple(a) < criteria_tuple(b);
}
template <bool thread_safe>
void PartitionBucket<thread_safe>::SortActiveSlotSpans() {
// Sorting up to |kMaxSlotSpansToSort| slot spans. This is capped for two
// reasons:
// - Limiting execution time
// - Current code cannot allocate.
//
// In practice though, it's rare to have that many active slot spans.
SlotSpanMetadata<thread_safe>* active_spans_array[kMaxSlotSpansToSort];
size_t index = 0;
SlotSpanMetadata<thread_safe>* overflow_spans_start = nullptr;
for (auto* slot_span = active_slot_spans_head; slot_span;
slot_span = slot_span->next_slot_span) {
if (index < kMaxSlotSpansToSort) {
active_spans_array[index++] = slot_span;
} else {
// Starting from this one, not sorting the slot spans.
overflow_spans_start = slot_span;
break;
}
}
// We sort the active slot spans so that allocations are preferably serviced
// from the fullest ones. This way we hope to reduce fragmentation by keeping
// as few slot spans as full as possible.
//
// With perfect information on allocation lifespan, we would be able to pack
// allocations and get almost no fragmentation. This is obviously not the
// case, so we have partially full SlotSpans. Nevertheless, as a heuristic we
// want to:
// - Keep almost-empty slot spans as empty as possible
// - Keep mostly-full slot spans as full as possible
//
// The first part is done in the hope that future free()s will make these
// slot spans completely empty, allowing us to reclaim them. To that end, sort
// SlotSpans periodically so that the fullest ones are preferred.
//
// std::sort() is not completely guaranteed to never allocate memory. However,
// it is not allowed to throw std::bad_alloc, which constrains the
// implementation. In addition, this is protected by the reentrancy guard, so
// we would detect such an allocation.
std::sort(active_spans_array, active_spans_array + index, CompareSlotSpans);
active_slot_spans_head = overflow_spans_start;
// Reverse order, since we insert at the head of the list.
for (int i = index - 1; i >= 0; i--) {
if (active_spans_array[i] ==
SlotSpanMetadata<thread_safe>::get_sentinel_slot_span()) {
// The sentinel is const, don't try to write to it.
PA_DCHECK(active_slot_spans_head == nullptr);
} else {
active_spans_array[i]->next_slot_span = active_slot_spans_head;
}
active_slot_spans_head = active_spans_array[i];
}
}
template <bool thread_safe>
uintptr_t PartitionBucket<thread_safe>::SlowPathAlloc(
PartitionRoot<thread_safe>* root,
unsigned int flags,
size_t raw_size,
size_t slot_span_alignment,
bool* is_already_zeroed) {
PA_DCHECK((slot_span_alignment >= PartitionPageSize()) &&
base::bits::IsPowerOfTwo(slot_span_alignment));
// The slow path is called when the freelist is empty. The only exception is
// when a higher-order alignment is requested, in which case the freelist
// logic is bypassed and we go directly for slot span allocation.
bool allocate_aligned_slot_span = slot_span_alignment > PartitionPageSize();
PA_DCHECK(!active_slot_spans_head->get_freelist_head() ||
allocate_aligned_slot_span);
SlotSpanMetadata<thread_safe>* new_slot_span = nullptr;
// |new_slot_span->bucket| will always be |this|, except when |this| is the
// sentinel bucket, which is used to signal a direct mapped allocation. In
// this case |new_bucket| will be set properly later. This avoids a read for
// most allocations.
PartitionBucket* new_bucket = this;
*is_already_zeroed = false;
// For the PartitionRoot::Alloc() API, we have a bunch of buckets
// marked as special cases. We bounce them through to the slow path so that
// we can still have a blazing fast hot path due to lack of corner-case
// branches.
//
// Note: The ordering of the conditionals matters! In particular,
// SetNewActiveSlotSpan() has a side effect even when returning false: it
// sweeps the active list and may move things into the empty or decommitted
// lists, which affects the subsequent conditional.
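//
// The order below is: (1) direct-mapped sizes; (2) a usable slot span from the
// active list; (3) an empty or decommitted slot span; (4) a brand new slot
// span.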
if (PA_UNLIKELY(is_direct_mapped())) {
PA_DCHECK(raw_size > kMaxBucketed);
PA_DCHECK(this == &root->sentinel_bucket);
PA_DCHECK(active_slot_spans_head ==
SlotSpanMetadata<thread_safe>::get_sentinel_slot_span());
// No fast path for direct-mapped allocations.
if (flags & AllocFlags::kFastPathOrReturnNull) {
return 0;
}
new_slot_span =
PartitionDirectMap(root, flags, raw_size, slot_span_alignment);
if (new_slot_span) {
new_bucket = new_slot_span->bucket;
}
// Memory from PageAllocator is always zeroed.
*is_already_zeroed = true;
} else if (PA_LIKELY(!allocate_aligned_slot_span && SetNewActiveSlotSpan())) {
// First, did we find an active slot span in the active list?
new_slot_span = active_slot_spans_head;
PA_DCHECK(new_slot_span->is_active());
} else if (PA_LIKELY(!allocate_aligned_slot_span &&
(empty_slot_spans_head != nullptr ||
decommitted_slot_spans_head != nullptr))) {
// Second, look in our lists of empty and decommitted slot spans.
// Check empty slot spans first, which are preferred, but beware that an
// empty slot span might have been decommitted.
while (PA_LIKELY((new_slot_span = empty_slot_spans_head) != nullptr)) {
PA_DCHECK(new_slot_span->bucket == this);
PA_DCHECK(new_slot_span->is_empty() || new_slot_span->is_decommitted());
empty_slot_spans_head = new_slot_span->next_slot_span;
// Accept the empty slot span unless it got decommitted.
if (new_slot_span->get_freelist_head()) {
new_slot_span->next_slot_span = nullptr;
new_slot_span->ToSuperPageExtent()
->IncrementNumberOfNonemptySlotSpans();
// Re-activating an empty slot span, update accounting.
size_t dirty_size = base::bits::AlignUp(
new_slot_span->GetProvisionedSize(), SystemPageSize());
PA_DCHECK(root->empty_slot_spans_dirty_bytes >= dirty_size);
root->empty_slot_spans_dirty_bytes -= dirty_size;
break;
}
PA_DCHECK(new_slot_span->is_decommitted());
new_slot_span->next_slot_span = decommitted_slot_spans_head;
decommitted_slot_spans_head = new_slot_span;
}
if (PA_UNLIKELY(!new_slot_span) &&
PA_LIKELY(decommitted_slot_spans_head != nullptr)) {
// Commit can be expensive, don't do it.
if (flags & AllocFlags::kFastPathOrReturnNull) {
return 0;
}
new_slot_span = decommitted_slot_spans_head;
PA_DCHECK(new_slot_span->bucket == this);
PA_DCHECK(new_slot_span->is_decommitted());
decommitted_slot_spans_head = new_slot_span->next_slot_span;
// If lazy commit is enabled, pages will be recommitted when provisioning
// slots, in ProvisionMoreSlotsAndAllocOne(), not here.
if (!kUseLazyCommit) {
uintptr_t slot_span_start =
SlotSpanMetadata<thread_safe>::ToSlotSpanStart(new_slot_span);
// Since lazy commit isn't used, we have a guarantee that all slot span
// pages have been previously committed, and then decommitted using
// PageAccessibilityDisposition::kAllowKeepForPerf, so use the
// same option as an optimization.
// TODO(lizeb): Handle commit failure.
root->RecommitSystemPagesForData(
slot_span_start, new_slot_span->bucket->get_bytes_per_span(),
PageAccessibilityDisposition::kAllowKeepForPerf);
}
new_slot_span->Reset();
*is_already_zeroed = DecommittedMemoryIsAlwaysZeroed();
}
PA_DCHECK(new_slot_span);
} else {
// Getting a new slot span is expensive, don't do it.
if (flags & AllocFlags::kFastPathOrReturnNull) {
return 0;
}
// Third, if we get here, we need a brand new slot span.
// TODO(bartekn): For single-slot slot spans, we can use rounded raw_size
// as slot_span_committed_size.
new_slot_span = AllocNewSlotSpan(root, flags, slot_span_alignment);
// New memory from PageAllocator is always zeroed.
*is_already_zeroed = true;
}
// Bail if we had a memory allocation failure.
if (PA_UNLIKELY(!new_slot_span)) {
PA_DCHECK(active_slot_spans_head ==
SlotSpanMetadata<thread_safe>::get_sentinel_slot_span());
if (flags & AllocFlags::kReturnNull) {
return 0;
}
// See comment in PartitionDirectMap() for unlocking.
ScopedUnlockGuard unlock{root->lock_};
root->OutOfMemory(raw_size);
PA_IMMEDIATE_CRASH(); // Not required, kept as documentation.
}
PA_DCHECK(new_bucket != &root->sentinel_bucket);
new_bucket->active_slot_spans_head = new_slot_span;
if (new_slot_span->CanStoreRawSize()) {
new_slot_span->SetRawSize(raw_size);
}
// If we found an active slot span with free slots, or an empty slot span, we
// have a usable freelist head.
if (PA_LIKELY(new_slot_span->get_freelist_head() != nullptr)) {
PartitionFreelistEntry* entry =
new_slot_span->PopForAlloc(new_bucket->slot_size);
// We may have set *is_already_zeroed to true above, make sure that the
// freelist entry doesn't contain data. Either way, it wouldn't be a good
// idea to let users see our internal data.
uintptr_t slot_start = entry->ClearForAllocation();
return slot_start;
}
// Otherwise, we need to provision more slots by committing more pages. Build
// the free list for the newly provisioned slots.
PA_DCHECK(new_slot_span->num_unprovisioned_slots);
return ProvisionMoreSlotsAndAllocOne(root, new_slot_span);
}
template <bool thread_safe>
uintptr_t PartitionBucket<thread_safe>::AllocNewSuperPageSpanForGwpAsan(
PartitionRoot<thread_safe>* root,
size_t super_page_count,
unsigned int flags) {
return AllocNewSuperPageSpan(root, super_page_count, flags);
}
template <bool thread_safe>
void PartitionBucket<thread_safe>::InitializeSlotSpanForGwpAsan(
SlotSpanMetadata<thread_safe>* slot_span) {
InitializeSlotSpan(slot_span);
}
template struct PartitionBucket<ThreadSafe>;
} // namespace partition_alloc::internal