// blob: 17a611eeacb865cecbc80c0defea9bc141cc86e6 [file] [log] [blame]
//===-- xray_profile_collector.cc ------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file is a part of XRay, a dynamic runtime instrumentation system.
//
// This implements the interface for the profileCollectorService.
//
//===----------------------------------------------------------------------===//
#include "xray_profile_collector.h"
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_vector.h"
#include "xray_profiling_flags.h"
#include <memory>
#include <pthread.h>
#include <utility>
namespace __xray {
namespace profileCollectorService {
namespace {
// Taken by post(), serialize(), reset() and nextBuffer() before they touch the
// collector's file-level state (ThreadTries, ProfileBuffers, GlobalAllocators).
SpinMutex GlobalMutex;
// Pairs a thread id with the collector-owned FunctionCallTrie that receives
// the deep copy made by post() for that thread.
struct ThreadTrie {
// Id of the thread the trie was posted for.
tid_t TId;
// Allocated with InternalAlloc() in post(); destroyed and freed in reset().
FunctionCallTrie *Trie;
};
// One serialized block produced by serialize(): a contiguous byte range that
// starts with a BlockHeader and is followed by the block's records.
struct ProfileBuffer {
// Start of the block; obtained from InternalAlloc() and released with
// InternalFree() by serialize() and reset().
void *Data;
// Length of the block in bytes.
size_t Size;
};
// Current version of the profile format; stored in every
// XRayProfilingFileHeader.
constexpr u64 XRayProfilingVersion = 0x20180424;
// Identifier for XRay profiling files: the ASCII bytes of 'xrayprof' written
// as a hexadecimal constant.
constexpr u64 XRayMagicBytes = 0x7872617970726f66;
// File header handed out by nextBuffer() as the very first buffer. The magic
// and version are fixed; Timestamp and PID are filled in by nextBuffer() each
// time iteration starts over.
struct XRayProfilingFileHeader {
const u64 MagicBytes = XRayMagicBytes;
const u64 Version = XRayProfilingVersion;
u64 Timestamp = 0; // System time in nanoseconds.
u64 PID = 0; // Process ID.
};
// Prefix of every serialized block. serializeRecords() copies it verbatim to
// the front of the block and nextBuffer() copies it back out, so the field
// order and sizes are part of the on-disk format.
struct BlockHeader {
// Size in bytes as computed by serialize(): 16 plus the summed record sizes.
u32 BlockSize;
// Block number; nextBuffer() uses BlockNum + 1 to locate the following block.
u32 BlockNum;
// Id of the thread whose trie this block was built from.
u64 ThreadId;
};
// These need to be pointers that point to heap/internal-allocator-allocated
// objects because these are accessed even at program exit.
//
// All three are created lazily by the first post() call and re-created by
// reset().
// One entry per post() call since the last reset().
Vector<ThreadTrie> *ThreadTries = nullptr;
// Blocks produced by the most recent serialize() call.
Vector<ProfileBuffer> *ProfileBuffers = nullptr;
// Allocators backing every collector-owned FunctionCallTrie.
FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
} // namespace
void post(const FunctionCallTrie &T, tid_t TId) {
static pthread_once_t Once = PTHREAD_ONCE_INIT;
pthread_once(&Once, +[] {
SpinMutexLock Lock(&GlobalMutex);
GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
new (GlobalAllocators) FunctionCallTrie::Allocators();
*GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom(
profilingFlags()->global_allocator_max);
ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
InternalAlloc(sizeof(Vector<ThreadTrie>)));
new (ThreadTries) Vector<ThreadTrie>();
ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
InternalAlloc(sizeof(Vector<ProfileBuffer>)));
new (ProfileBuffers) Vector<ProfileBuffer>();
});
DCHECK_NE(GlobalAllocators, nullptr);
DCHECK_NE(ThreadTries, nullptr);
DCHECK_NE(ProfileBuffers, nullptr);
ThreadTrie *Item = nullptr;
{
SpinMutexLock Lock(&GlobalMutex);
if (GlobalAllocators == nullptr)
return;
Item = ThreadTries->PushBack();
Item->TId = TId;
// Here we're using the internal allocator instead of the managed allocator
// because:
//
// 1) We're not using the segmented array data structure to host
// FunctionCallTrie objects. We're using a Vector (from sanitizer_common)
// which works like a std::vector<...> keeping elements contiguous in
// memory. The segmented array data structure assumes that elements are
// trivially destructible, where FunctionCallTrie isn't.
//
// 2) Using a managed allocator means we need to manage that separately,
// which complicates the nature of this code. To get around that, we're
// using the internal allocator instead, which has its own global state
// and is decoupled from the lifetime management required by the managed
// allocator we have in XRay.
//
Item->Trie = reinterpret_cast<FunctionCallTrie *>(InternalAlloc(
sizeof(FunctionCallTrie), nullptr, alignof(FunctionCallTrie)));
DCHECK_NE(Item->Trie, nullptr);
new (Item->Trie) FunctionCallTrie(*GlobalAllocators);
}
T.deepCopyInto(*Item->Trie);
}
// A PathArray holds the function ids that make up one stack trace. Within this
// file a path is stored starting at the leaf function of a call stack and
// ending at a root of the call trie.
using PathArray = Array<int32_t>;

// Couples a trie node with the leaf-to-root path that leads to it.
struct ProfileRecord {
  using PathAllocator = typename PathArray::AllocatorType;

  // Function ids from the leaf up to the root of the call stack, as found in a
  // FunctionCallTrie. The storage comes from InternalAlloc().
  PathArray *Path = nullptr;

  // The trie node whose data this record describes.
  const FunctionCallTrie::Node *Node = nullptr;

  // Constructor for in-place construction: allocates an empty path backed by
  // the allocator `A` and remembers the node `N`.
  ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N) : Node(N) {
    void *Storage = InternalAlloc(sizeof(PathArray));
    Path = new (Storage) PathArray(A);
  }
};
namespace {
// Sequence of ProfileRecords gathered from one trie before it is serialized.
using ProfileRecordArray = Array<ProfileRecord>;
// Performs a depth-first walk from every root of `Trie`, appending one
// ProfileRecord per visited node to `PRs`. Each record's path lists the
// function ids from the visited node up through its parents. `PA` backs the
// path arrays. The walk stops early if a record cannot be appended.
static void populateRecords(ProfileRecordArray &PRs,
                            ProfileRecord::PathAllocator &PA,
                            const FunctionCallTrie &Trie) {
  using NodeStack = Array<const FunctionCallTrie::Node *>;
  using NodeStackAllocator = typename NodeStack::AllocatorType;

  NodeStackAllocator StackMemory(profilingFlags()->stack_allocator_max);
  NodeStack Pending(StackMemory);

  for (const auto Root : Trie.getRoots()) {
    Pending.Append(Root);
    while (!Pending.empty()) {
      // Pop the most recently pushed node.
      const auto Current = Pending.back();
      Pending.trim(1);

      auto Record = PRs.AppendEmplace(PA, Current);
      if (Record == nullptr)
        return;

      // Climb the parent links, recording the function ids in the order they
      // are encountered.
      const FunctionCallTrie::Node *Ancestor = Current;
      while (Ancestor != nullptr) {
        Record->Path->Append(Ancestor->FId);
        Ancestor = Ancestor->Parent;
      }
      DCHECK(!Record->Path->empty());

      // Queue the callees so they are visited next.
      for (const auto Callee : Current->Callees)
        Pending.Append(Callee.NodePtr);
    }
  }
}
// Writes `Header` followed by every record in `ProfileRecords` into
// `Buffer->Data`. Each record is laid out as: the path's function ids, a
// zero sentinel id, the node's call count, and the node's cumulative local
// time. The caller must have sized the buffer to fit exactly.
static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
                             const ProfileRecordArray &ProfileRecords) {
  char *Cursor = static_cast<char *>(Buffer->Data);

  // Copies `Len` bytes from `Src` to the write position and advances it.
  const auto Emit = [&Cursor](const void *Src, uptr Len) {
    internal_memcpy(Cursor, Src, Len);
    Cursor += Len;
  };

  Emit(&Header, sizeof(Header));

  for (const auto &Record : ProfileRecords) {
    // List of IDs follow:
    for (const auto FId : *Record.Path)
      Emit(&FId, sizeof(FId));

    // Add the sentinel here.
    constexpr int32_t SentinelFId = 0;
    internal_memset(Cursor, SentinelFId, sizeof(SentinelFId));
    Cursor += sizeof(SentinelFId);

    // Add the node data here.
    Emit(&Record.Node->CallCount, sizeof(Record.Node->CallCount));
    Emit(&Record.Node->CumulativeLocalTime,
         sizeof(Record.Node->CumulativeLocalTime));
  }

  DCHECK_EQ(Cursor - static_cast<char *>(Buffer->Data), Buffer->Size);
}
} // namespace
void serialize() {
SpinMutexLock Lock(&GlobalMutex);
// Clear out the global ProfileBuffers.
for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
InternalFree((*ProfileBuffers)[I].Data);
ProfileBuffers->Reset();
if (ThreadTries->Size() == 0)
return;
// Then repopulate the global ProfileBuffers.
for (u32 I = 0; I < ThreadTries->Size(); ++I) {
using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max);
ProfileRecord::PathAllocator PathAlloc(
profilingFlags()->global_allocator_max);
ProfileRecordArray ProfileRecords(PRAlloc);
// First, we want to compute the amount of space we're going to need. We'll
// use a local allocator and an __xray::Array<...> to store the intermediary
// data, then compute the size as we're going along. Then we'll allocate the
// contiguous space to contain the thread buffer data.
const auto &Trie = *(*ThreadTries)[I].Trie;
if (Trie.getRoots().empty())
continue;
populateRecords(ProfileRecords, PathAlloc, Trie);
DCHECK(!Trie.getRoots().empty());
DCHECK(!ProfileRecords.empty());
// Go through each record, to compute the sizes.
//
// header size = block size (4 bytes)
// + block number (4 bytes)
// + thread id (8 bytes)
// record size = path ids (4 bytes * number of ids + sentinel 4 bytes)
// + call count (8 bytes)
// + local time (8 bytes)
// + end of record (8 bytes)
u32 CumulativeSizes = 0;
for (const auto &Record : ProfileRecords)
CumulativeSizes += 20 + (4 * Record.Path->size());
BlockHeader Header{16 + CumulativeSizes, I, (*ThreadTries)[I].TId};
auto Buffer = ProfileBuffers->PushBack();
Buffer->Size = sizeof(Header) + CumulativeSizes;
Buffer->Data = InternalAlloc(Buffer->Size, nullptr, 64);
DCHECK_NE(Buffer->Data, nullptr);
serializeRecords(Buffer, Header, ProfileRecords);
// Now clean up the ProfileRecords array, one at a time.
for (auto &Record : ProfileRecords) {
Record.Path->~PathArray();
InternalFree(Record.Path);
}
}
}
void reset() {
SpinMutexLock Lock(&GlobalMutex);
if (ProfileBuffers != nullptr) {
// Clear out the profile buffers that have been serialized.
for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
InternalFree((*ProfileBuffers)[I].Data);
ProfileBuffers->Reset();
InternalFree(ProfileBuffers);
ProfileBuffers = nullptr;
}
if (ThreadTries != nullptr) {
// Clear out the function call tries per thread.
for (uptr I = 0; I < ThreadTries->Size(); ++I) {
auto &T = (*ThreadTries)[I];
T.Trie->~FunctionCallTrie();
InternalFree(T.Trie);
}
ThreadTries->Reset();
InternalFree(ThreadTries);
ThreadTries = nullptr;
}
// Reset the global allocators.
if (GlobalAllocators != nullptr) {
GlobalAllocators->~Allocators();
InternalFree(GlobalAllocators);
GlobalAllocators = nullptr;
}
GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
new (GlobalAllocators) FunctionCallTrie::Allocators();
*GlobalAllocators = FunctionCallTrie::InitAllocators();
ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
InternalAlloc(sizeof(Vector<ThreadTrie>)));
new (ThreadTries) Vector<ThreadTrie>();
ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
InternalAlloc(sizeof(Vector<ProfileBuffer>)));
new (ProfileBuffers) Vector<ProfileBuffer>();
}
// Steps through the serialized profile one buffer at a time.
//
//   - `B.Data == nullptr`        -> the file header (timestamp/pid refreshed).
//   - `B` is the file header     -> the first serialized block.
//   - `B` is a serialized block  -> the block numbered BlockNum + 1, or
//                                   {nullptr, 0} once past the last block.
//
// Returns {nullptr, 0} whenever there are no serialized blocks at all.
XRayBuffer nextBuffer(XRayBuffer B) {
  SpinMutexLock Lock(&GlobalMutex);

  const bool HaveBlocks =
      ProfileBuffers != nullptr && ProfileBuffers->Size() != 0;
  if (!HaveBlocks)
    return {nullptr, 0};

  // The file header lives in static storage and is constructed exactly once.
  static pthread_once_t Once = PTHREAD_ONCE_INIT;
  static typename std::aligned_storage<sizeof(XRayProfilingFileHeader)>::type
      FileHeaderStorage;
  pthread_once(&Once,
               +[] { new (&FileHeaderStorage) XRayProfilingFileHeader{}; });

  if (UNLIKELY(B.Data == nullptr)) {
    // The first buffer should always contain the file header information.
    auto *FileHeader =
        reinterpret_cast<XRayProfilingFileHeader *>(&FileHeaderStorage);
    FileHeader->Timestamp = NanoTime();
    FileHeader->PID = internal_getpid();
    return {&FileHeaderStorage, sizeof(XRayProfilingFileHeader)};
  }

  if (UNLIKELY(B.Data == &FileHeaderStorage)) {
    auto &First = (*ProfileBuffers)[0];
    return {First.Data, First.Size};
  }

  // Otherwise `B` is a block: read its header to find out which one follows.
  BlockHeader Previous;
  internal_memcpy(&Previous, B.Data, sizeof(BlockHeader));
  const auto Successor = Previous.BlockNum + 1;
  if (Successor >= ProfileBuffers->Size())
    return {nullptr, 0};

  auto &Following = (*ProfileBuffers)[Successor];
  return {Following.Data, Following.Size};
}
} // namespace profileCollectorService
} // namespace __xray