blob: aff8f24abebb90b692803d957af3ec157c972ab9 [file] [log] [blame]
// Copyright (c) 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/allocator/partition_allocator/partition_bucket.h"
#include "base/allocator/partition_allocator/oom.h"
#include "base/allocator/partition_allocator/page_allocator.h"
#include "base/allocator/partition_allocator/partition_alloc_constants.h"
#include "base/allocator/partition_allocator/partition_direct_map_extent.h"
#include "base/allocator/partition_allocator/partition_oom.h"
#include "base/allocator/partition_allocator/partition_page.h"
#include "base/allocator/partition_allocator/partition_root_base.h"
#include "build/build_config.h"
namespace base {
namespace internal {
namespace {
ALWAYS_INLINE PartitionPage* PartitionDirectMap(PartitionRootBase* root,
int flags,
size_t raw_size) {
size_t size = PartitionBucket::get_direct_map_size(raw_size);
// Because we need to fake looking like a super page, we need to allocate
// a bunch of system pages more than "size":
// - The first few system pages are the partition page in which the super
// page metadata is stored. We fault just one system page out of a partition
// page sized clump.
// - We add a trailing guard page on 32-bit (on 64-bit we rely on the
// massive address space plus randomization instead).
size_t map_size = size + kPartitionPageSize;
#if !defined(ARCH_CPU_64_BITS)
map_size += kSystemPageSize;
#endif
// Round up to the allocation granularity.
map_size += kPageAllocationGranularityOffsetMask;
map_size &= kPageAllocationGranularityBaseMask;
char* ptr = reinterpret_cast<char*>(
AllocPages(nullptr, map_size, kSuperPageSize, PageReadWrite));
if (UNLIKELY(!ptr))
return nullptr;
size_t committed_page_size = size + kSystemPageSize;
root->total_size_of_direct_mapped_pages += committed_page_size;
root->IncreaseCommittedPages(committed_page_size);
char* slot = ptr + kPartitionPageSize;
CHECK(SetSystemPagesAccess(ptr + (kSystemPageSize * 2),
kPartitionPageSize - (kSystemPageSize * 2),
PageInaccessible));
#if !defined(ARCH_CPU_64_BITS)
CHECK(SetSystemPagesAccess(ptr, kSystemPageSize, PageInaccessible));
CHECK(SetSystemPagesAccess(slot + size, kSystemPageSize, PageInaccessible));
#endif
PartitionSuperPageExtentEntry* extent =
reinterpret_cast<PartitionSuperPageExtentEntry*>(
PartitionSuperPageToMetadataArea(ptr));
extent->root = root;
// The new structures are all located inside a fresh system page so they
// will all be zeroed out. These DCHECKs are for documentation.
DCHECK(!extent->super_page_base);
DCHECK(!extent->super_pages_end);
DCHECK(!extent->next);
PartitionPage* page = PartitionPage::FromPointerNoAlignmentCheck(slot);
PartitionBucket* bucket = reinterpret_cast<PartitionBucket*>(
reinterpret_cast<char*>(page) + (kPageMetadataSize * 2));
DCHECK(!page->next_page);
DCHECK(!page->num_allocated_slots);
DCHECK(!page->num_unprovisioned_slots);
DCHECK(!page->page_offset);
DCHECK(!page->empty_cache_index);
page->bucket = bucket;
page->freelist_head = reinterpret_cast<PartitionFreelistEntry*>(slot);
PartitionFreelistEntry* next_entry =
reinterpret_cast<PartitionFreelistEntry*>(slot);
next_entry->next = PartitionFreelistEntry::Transform(nullptr);
DCHECK(!bucket->active_pages_head);
DCHECK(!bucket->empty_pages_head);
DCHECK(!bucket->decommitted_pages_head);
DCHECK(!bucket->num_system_pages_per_slot_span);
DCHECK(!bucket->num_full_pages);
bucket->slot_size = size;
PartitionDirectMapExtent* map_extent =
PartitionDirectMapExtent::FromPage(page);
map_extent->map_size = map_size - kPartitionPageSize - kSystemPageSize;
map_extent->bucket = bucket;
// Maintain the doubly-linked list of all direct mappings.
map_extent->next_extent = root->direct_map_list;
if (map_extent->next_extent)
map_extent->next_extent->prev_extent = map_extent;
map_extent->prev_extent = nullptr;
root->direct_map_list = map_extent;
return page;
}
} // namespace
// static
PartitionBucket PartitionBucket::sentinel_bucket_;
PartitionBucket* PartitionBucket::get_sentinel_bucket() {
return &sentinel_bucket_;
}
// TODO(ajwong): This seems to interact badly with
// get_pages_per_slot_span() which rounds the value from this up to a
// multiple of kNumSystemPagesPerPartitionPage (aka 4) anyways.
// http://crbug.com/776537
//
// TODO(ajwong): The waste calculation seems wrong. The PTE usage should cover
// both used and unsed pages.
// http://crbug.com/776537
uint8_t PartitionBucket::get_system_pages_per_slot_span() {
// This works out reasonably for the current bucket sizes of the generic
// allocator, and the current values of partition page size and constants.
// Specifically, we have enough room to always pack the slots perfectly into
// some number of system pages. The only waste is the waste associated with
// unfaulted pages (i.e. wasted address space).
// TODO: we end up using a lot of system pages for very small sizes. For
// example, we'll use 12 system pages for slot size 24. The slot size is
// so small that the waste would be tiny with just 4, or 1, system pages.
// Later, we can investigate whether there are anti-fragmentation benefits
// to using fewer system pages.
double best_waste_ratio = 1.0f;
uint16_t best_pages = 0;
if (this->slot_size > kMaxSystemPagesPerSlotSpan * kSystemPageSize) {
// TODO(ajwong): Why is there a DCHECK here for this?
// http://crbug.com/776537
DCHECK(!(this->slot_size % kSystemPageSize));
best_pages = static_cast<uint16_t>(this->slot_size / kSystemPageSize);
// TODO(ajwong): Should this be checking against
// kMaxSystemPagesPerSlotSpan or numeric_limits<uint8_t>::max?
// http://crbug.com/776537
CHECK(best_pages < (1 << 8));
return static_cast<uint8_t>(best_pages);
}
DCHECK(this->slot_size <= kMaxSystemPagesPerSlotSpan * kSystemPageSize);
for (uint16_t i = kNumSystemPagesPerPartitionPage - 1;
i <= kMaxSystemPagesPerSlotSpan; ++i) {
size_t page_size = kSystemPageSize * i;
size_t num_slots = page_size / this->slot_size;
size_t waste = page_size - (num_slots * this->slot_size);
// Leaving a page unfaulted is not free; the page will occupy an empty page
// table entry. Make a simple attempt to account for that.
//
// TODO(ajwong): This looks wrong. PTEs are allocated for all pages
// regardless of whether or not they are wasted. Should it just
// be waste += i * sizeof(void*)?
// http://crbug.com/776537
size_t num_remainder_pages = i & (kNumSystemPagesPerPartitionPage - 1);
size_t num_unfaulted_pages =
num_remainder_pages
? (kNumSystemPagesPerPartitionPage - num_remainder_pages)
: 0;
waste += sizeof(void*) * num_unfaulted_pages;
double waste_ratio =
static_cast<double>(waste) / static_cast<double>(page_size);
if (waste_ratio < best_waste_ratio) {
best_waste_ratio = waste_ratio;
best_pages = i;
}
}
DCHECK(best_pages > 0);
CHECK(best_pages <= kMaxSystemPagesPerSlotSpan);
return static_cast<uint8_t>(best_pages);
}
void PartitionBucket::Init(uint32_t new_slot_size) {
slot_size = new_slot_size;
active_pages_head = PartitionPage::get_sentinel_page();
empty_pages_head = nullptr;
decommitted_pages_head = nullptr;
num_full_pages = 0;
num_system_pages_per_slot_span = get_system_pages_per_slot_span();
}
NOINLINE void PartitionBucket::OnFull() {
OOM_CRASH();
}
ALWAYS_INLINE void* PartitionBucket::AllocNewSlotSpan(
PartitionRootBase* root,
int flags,
uint16_t num_partition_pages) {
DCHECK(!(reinterpret_cast<uintptr_t>(root->next_partition_page) %
kPartitionPageSize));
DCHECK(!(reinterpret_cast<uintptr_t>(root->next_partition_page_end) %
kPartitionPageSize));
DCHECK(num_partition_pages <= kNumPartitionPagesPerSuperPage);
size_t total_size = kPartitionPageSize * num_partition_pages;
size_t num_partition_pages_left =
(root->next_partition_page_end - root->next_partition_page) >>
kPartitionPageShift;
if (LIKELY(num_partition_pages_left >= num_partition_pages)) {
// In this case, we can still hand out pages from the current super page
// allocation.
char* ret = root->next_partition_page;
// Fresh System Pages in the SuperPages are decommited. Commit them
// before vending them back.
CHECK(SetSystemPagesAccess(ret, total_size, PageReadWrite));
root->next_partition_page += total_size;
root->IncreaseCommittedPages(total_size);
return ret;
}
// Need a new super page. We want to allocate super pages in a continguous
// address region as much as possible. This is important for not causing
// page table bloat and not fragmenting address spaces in 32 bit
// architectures.
char* requested_address = root->next_super_page;
char* super_page = reinterpret_cast<char*>(AllocPages(
requested_address, kSuperPageSize, kSuperPageSize, PageReadWrite));
if (UNLIKELY(!super_page))
return nullptr;
root->total_size_of_super_pages += kSuperPageSize;
root->IncreaseCommittedPages(total_size);
// |total_size| MUST be less than kSuperPageSize - (kPartitionPageSize*2).
// This is a trustworthy value because num_partition_pages is not user
// controlled.
//
// TODO(ajwong): Introduce a DCHECK.
root->next_super_page = super_page + kSuperPageSize;
char* ret = super_page + kPartitionPageSize;
root->next_partition_page = ret + total_size;
root->next_partition_page_end = root->next_super_page - kPartitionPageSize;
// Make the first partition page in the super page a guard page, but leave a
// hole in the middle.
// This is where we put page metadata and also a tiny amount of extent
// metadata.
CHECK(SetSystemPagesAccess(super_page, kSystemPageSize, PageInaccessible));
CHECK(SetSystemPagesAccess(super_page + (kSystemPageSize * 2),
kPartitionPageSize - (kSystemPageSize * 2),
PageInaccessible));
// CHECK(SetSystemPagesAccess(super_page + (kSuperPageSize -
// kPartitionPageSize),
// kPartitionPageSize, PageInaccessible));
// All remaining slotspans for the unallocated PartitionPages inside the
// SuperPage are conceptually decommitted. Correctly set the state here
// so they do not occupy resources.
//
// TODO(ajwong): Refactor Page Allocator API so the SuperPage comes in
// decommited initially.
CHECK(SetSystemPagesAccess(super_page + kPartitionPageSize + total_size,
(kSuperPageSize - kPartitionPageSize - total_size),
PageInaccessible));
// If we were after a specific address, but didn't get it, assume that
// the system chose a lousy address. Here most OS'es have a default
// algorithm that isn't randomized. For example, most Linux
// distributions will allocate the mapping directly before the last
// successful mapping, which is far from random. So we just get fresh
// randomness for the next mapping attempt.
if (requested_address && requested_address != super_page)
root->next_super_page = nullptr;
// We allocated a new super page so update super page metadata.
// First check if this is a new extent or not.
PartitionSuperPageExtentEntry* latest_extent =
reinterpret_cast<PartitionSuperPageExtentEntry*>(
PartitionSuperPageToMetadataArea(super_page));
// By storing the root in every extent metadata object, we have a fast way
// to go from a pointer within the partition to the root object.
latest_extent->root = root;
// Most new extents will be part of a larger extent, and these three fields
// are unused, but we initialize them to 0 so that we get a clear signal
// in case they are accidentally used.
latest_extent->super_page_base = nullptr;
latest_extent->super_pages_end = nullptr;
latest_extent->next = nullptr;
PartitionSuperPageExtentEntry* current_extent = root->current_extent;
bool is_new_extent = (super_page != requested_address);
if (UNLIKELY(is_new_extent)) {
if (UNLIKELY(!current_extent)) {
DCHECK(!root->first_extent);
root->first_extent = latest_extent;
} else {
DCHECK(current_extent->super_page_base);
current_extent->next = latest_extent;
}
root->current_extent = latest_extent;
latest_extent->super_page_base = super_page;
latest_extent->super_pages_end = super_page + kSuperPageSize;
} else {
// We allocated next to an existing extent so just nudge the size up a
// little.
DCHECK(current_extent->super_pages_end);
current_extent->super_pages_end += kSuperPageSize;
DCHECK(ret >= current_extent->super_page_base &&
ret < current_extent->super_pages_end);
}
return ret;
}
ALWAYS_INLINE uint16_t PartitionBucket::get_pages_per_slot_span() {
// Rounds up to nearest multiple of kNumSystemPagesPerPartitionPage.
return (num_system_pages_per_slot_span +
(kNumSystemPagesPerPartitionPage - 1)) /
kNumSystemPagesPerPartitionPage;
}
ALWAYS_INLINE void PartitionBucket::InitializeSlotSpan(PartitionPage* page) {
// The bucket never changes. We set it up once.
page->bucket = this;
page->empty_cache_index = -1;
page->Reset();
// If this page has just a single slot, do not set up page offsets for any
// page metadata other than the first one. This ensures that attempts to
// touch invalid page metadata fail.
if (page->num_unprovisioned_slots == 1)
return;
uint16_t num_partition_pages = get_pages_per_slot_span();
char* page_char_ptr = reinterpret_cast<char*>(page);
for (uint16_t i = 1; i < num_partition_pages; ++i) {
page_char_ptr += kPageMetadataSize;
PartitionPage* secondary_page =
reinterpret_cast<PartitionPage*>(page_char_ptr);
secondary_page->page_offset = i;
}
}
ALWAYS_INLINE char* PartitionBucket::AllocAndFillFreelist(PartitionPage* page) {
DCHECK(page != PartitionPage::get_sentinel_page());
uint16_t num_slots = page->num_unprovisioned_slots;
DCHECK(num_slots);
// We should only get here when _every_ slot is either used or unprovisioned.
// (The third state is "on the freelist". If we have a non-empty freelist, we
// should not get here.)
DCHECK(num_slots + page->num_allocated_slots == this->get_slots_per_span());
// Similarly, make explicitly sure that the freelist is empty.
DCHECK(!page->freelist_head);
DCHECK(page->num_allocated_slots >= 0);
size_t size = this->slot_size;
char* base = reinterpret_cast<char*>(PartitionPage::ToPointer(page));
char* return_object = base + (size * page->num_allocated_slots);
char* first_freelist_pointer = return_object + size;
char* first_freelist_pointer_extent =
first_freelist_pointer + sizeof(PartitionFreelistEntry*);
// Our goal is to fault as few system pages as possible. We calculate the
// page containing the "end" of the returned slot, and then allow freelist
// pointers to be written up to the end of that page.
char* sub_page_limit = reinterpret_cast<char*>(
RoundUpToSystemPage(reinterpret_cast<size_t>(first_freelist_pointer)));
char* slots_limit = return_object + (size * num_slots);
char* freelist_limit = sub_page_limit;
if (UNLIKELY(slots_limit < freelist_limit))
freelist_limit = slots_limit;
uint16_t num_new_freelist_entries = 0;
if (LIKELY(first_freelist_pointer_extent <= freelist_limit)) {
// Only consider used space in the slot span. If we consider wasted
// space, we may get an off-by-one when a freelist pointer fits in the
// wasted space, but a slot does not.
// We know we can fit at least one freelist pointer.
num_new_freelist_entries = 1;
// Any further entries require space for the whole slot span.
num_new_freelist_entries += static_cast<uint16_t>(
(freelist_limit - first_freelist_pointer_extent) / size);
}
// We always return an object slot -- that's the +1 below.
// We do not neccessarily create any new freelist entries, because we cross
// sub page boundaries frequently for large bucket sizes.
DCHECK(num_new_freelist_entries + 1 <= num_slots);
num_slots -= (num_new_freelist_entries + 1);
page->num_unprovisioned_slots = num_slots;
page->num_allocated_slots++;
if (LIKELY(num_new_freelist_entries)) {
char* freelist_pointer = first_freelist_pointer;
PartitionFreelistEntry* entry =
reinterpret_cast<PartitionFreelistEntry*>(freelist_pointer);
page->freelist_head = entry;
while (--num_new_freelist_entries) {
freelist_pointer += size;
PartitionFreelistEntry* next_entry =
reinterpret_cast<PartitionFreelistEntry*>(freelist_pointer);
entry->next = PartitionFreelistEntry::Transform(next_entry);
entry = next_entry;
}
entry->next = PartitionFreelistEntry::Transform(nullptr);
} else {
page->freelist_head = nullptr;
}
return return_object;
}
bool PartitionBucket::SetNewActivePage() {
PartitionPage* page = this->active_pages_head;
if (page == PartitionPage::get_sentinel_page())
return false;
PartitionPage* next_page;
for (; page; page = next_page) {
next_page = page->next_page;
DCHECK(page->bucket == this);
DCHECK(page != this->empty_pages_head);
DCHECK(page != this->decommitted_pages_head);
if (LIKELY(page->is_active())) {
// This page is usable because it has freelist entries, or has
// unprovisioned slots we can create freelist entries from.
this->active_pages_head = page;
return true;
}
// Deal with empty and decommitted pages.
if (LIKELY(page->is_empty())) {
page->next_page = this->empty_pages_head;
this->empty_pages_head = page;
} else if (LIKELY(page->is_decommitted())) {
page->next_page = this->decommitted_pages_head;
this->decommitted_pages_head = page;
} else {
DCHECK(page->is_full());
// If we get here, we found a full page. Skip over it too, and also
// tag it as full (via a negative value). We need it tagged so that
// free'ing can tell, and move it back into the active page list.
page->num_allocated_slots = -page->num_allocated_slots;
++this->num_full_pages;
// num_full_pages is a uint16_t for efficient packing so guard against
// overflow to be safe.
if (UNLIKELY(!this->num_full_pages))
OnFull();
// Not necessary but might help stop accidents.
page->next_page = nullptr;
}
}
this->active_pages_head = PartitionPage::get_sentinel_page();
return false;
}
void* PartitionBucket::SlowPathAlloc(PartitionRootBase* root,
int flags,
size_t size,
bool* is_already_zeroed) {
// The slow path is called when the freelist is empty.
DCHECK(!this->active_pages_head->freelist_head);
PartitionPage* new_page = nullptr;
*is_already_zeroed = false;
// For the PartitionRootGeneric::Alloc() API, we have a bunch of buckets
// marked as special cases. We bounce them through to the slow path so that
// we can still have a blazing fast hot path due to lack of corner-case
// branches.
//
// Note: The ordering of the conditionals matter! In particular,
// SetNewActivePage() has a side-effect even when returning
// false where it sweeps the active page list and may move things into
// the empty or decommitted lists which affects the subsequent conditional.
bool return_null = flags & PartitionAllocReturnNull;
if (UNLIKELY(this->is_direct_mapped())) {
DCHECK(size > kGenericMaxBucketed);
DCHECK(this == get_sentinel_bucket());
DCHECK(this->active_pages_head == PartitionPage::get_sentinel_page());
if (size > kGenericMaxDirectMapped) {
if (return_null)
return nullptr;
PartitionExcessiveAllocationSize();
}
new_page = PartitionDirectMap(root, flags, size);
#if !defined(OS_MACOSX)
// Turn off the optimization to see if it helps https://crbug.com/892550.
*is_already_zeroed = true;
#endif
} else if (LIKELY(this->SetNewActivePage())) {
// First, did we find an active page in the active pages list?
new_page = this->active_pages_head;
DCHECK(new_page->is_active());
} else if (LIKELY(this->empty_pages_head != nullptr) ||
LIKELY(this->decommitted_pages_head != nullptr)) {
// Second, look in our lists of empty and decommitted pages.
// Check empty pages first, which are preferred, but beware that an
// empty page might have been decommitted.
while (LIKELY((new_page = this->empty_pages_head) != nullptr)) {
DCHECK(new_page->bucket == this);
DCHECK(new_page->is_empty() || new_page->is_decommitted());
this->empty_pages_head = new_page->next_page;
// Accept the empty page unless it got decommitted.
if (new_page->freelist_head) {
new_page->next_page = nullptr;
break;
}
DCHECK(new_page->is_decommitted());
new_page->next_page = this->decommitted_pages_head;
this->decommitted_pages_head = new_page;
}
if (UNLIKELY(!new_page) &&
LIKELY(this->decommitted_pages_head != nullptr)) {
new_page = this->decommitted_pages_head;
DCHECK(new_page->bucket == this);
DCHECK(new_page->is_decommitted());
this->decommitted_pages_head = new_page->next_page;
void* addr = PartitionPage::ToPointer(new_page);
root->RecommitSystemPages(addr, new_page->bucket->get_bytes_per_span());
new_page->Reset();
// TODO(https://crbug.com/890752): Optimizing here might cause pages to
// not be zeroed.
// *is_already_zeroed = true;
}
DCHECK(new_page);
} else {
// Third. If we get here, we need a brand new page.
uint16_t num_partition_pages = this->get_pages_per_slot_span();
void* raw_pages = AllocNewSlotSpan(root, flags, num_partition_pages);
if (LIKELY(raw_pages != nullptr)) {
new_page = PartitionPage::FromPointerNoAlignmentCheck(raw_pages);
InitializeSlotSpan(new_page);
// TODO(https://crbug.com/890752): Optimizing here causes pages to not be
// zeroed on at least macOS.
// *is_already_zeroed = true;
}
}
// Bail if we had a memory allocation failure.
if (UNLIKELY(!new_page)) {
DCHECK(this->active_pages_head == PartitionPage::get_sentinel_page());
if (return_null)
return nullptr;
root->OutOfMemory();
}
// TODO(ajwong): Is there a way to avoid the reading of bucket here?
// It seems like in many of the conditional branches above, |this| ==
// |new_page->bucket|. Maybe pull this into another function?
PartitionBucket* bucket = new_page->bucket;
DCHECK(bucket != get_sentinel_bucket());
bucket->active_pages_head = new_page;
new_page->set_raw_size(size);
// If we found an active page with free slots, or an empty page, we have a
// usable freelist head.
if (LIKELY(new_page->freelist_head != nullptr)) {
PartitionFreelistEntry* entry = new_page->freelist_head;
PartitionFreelistEntry* new_head =
PartitionFreelistEntry::Transform(entry->next);
new_page->freelist_head = new_head;
new_page->num_allocated_slots++;
return entry;
}
// Otherwise, we need to build the freelist.
DCHECK(new_page->num_unprovisioned_slots);
return AllocAndFillFreelist(new_page);
}
} // namespace internal
} // namespace base