src/third_party/llvm-project/compiler-rt/lib/esan/working_set.cpp - cobalt - Git at Google

 //===-- working_set.cpp ---------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // This file is a part of EfficiencySanitizer, a family of performance tuners.
 //
 // This file contains working-set-specific code.
 //===----------------------------------------------------------------------===//

 #include "working_set.h"
 #include "esan.h"
 #include "esan_circular_buffer.h"
 #include "esan_flags.h"
 #include "esan_shadow.h"
 #include "esan_sideline.h"
 #include "sanitizer_common/sanitizer_procmaps.h"

 // We shadow every cache line of app memory with one shadow byte.
 // - The highest bit of each shadow byte indicates whether the corresponding
 //   cache line has ever been accessed.
 // - The lowest bit of each shadow byte indicates whether the corresponding
 //   cache line was accessed since the last sample.
 // - The other bits are used for working set snapshots at successively
 //   lower frequencies, each bit to the left from the lowest bit stepping
 //   down the frequency by 2 to the power of getFlags()->snapshot_step.
 // Thus we have something like this:
 //   Bit 0: Since last sample
 //   Bit 1: Since last 2^2 samples
 //   Bit 2: Since last 2^4 samples
 //   Bit 3: ...
 //   Bit 7: Ever accessed.
 // We live with races in accessing each shadow byte.
 // One shadow byte; its bit layout is described in the comment above.
 using byte = unsigned char;

 namespace __esan {

 // Our shadow memory assumes that the line size is 64.
 // initializeShadowWorkingSet() CHECKs the cache_line_size flag against this.
 static const u32 CacheLineSize = 64;

 // See the shadow byte layout description above.
 static const u32 TotalWorkingSetBitIdx = 7;
 // We accumulate to the left until we hit this bit.
 // We don't need to accumulate to the final bit as it's set on each ref
 // by the compiler instrumentation.
 static const u32 MaxAccumBitIdx = 6;
 static const u32 CurWorkingSetBitIdx = 0;
 // The ever-accessed bit and the since-last-sample bit together (0x81); this is
 // the value OR-ed into shadow bytes by processRangeAccessWorkingSet().
 static const byte ShadowAccessedVal =
   (1 << TotalWorkingSetBitIdx) | (1 << CurWorkingSetBitIdx);

 // Sideline thread that runs takeSample(); launched in initializeWorkingSet()
 // and joined in finalizeWorkingSet().
 static SidelineThread Thread;
 // Number of samples taken so far; incremented once per takeSample() call.
 // If we use real-time-based timer samples this won't overflow in any realistic
 // scenario, but if we switch to some other unit (such as memory accesses) we
 // may want to consider a 64-bit int.
 static u32 SnapshotNum;

 // We store the wset size for each of 8 different sampling frequencies.
 static const u32 NumFreq = 8; // One for each bit of our shadow bytes.
 // We cannot use static objects as the global destructor is called
 // prior to our finalize routine.
 // These are each circular buffers, sized up front.
 // SizePerFreq[i] receives the sizes computed for shadow bit i in takeSample().
 CircularBuffer<u32> SizePerFreq[NumFreq];
 // We cannot rely on static initializers (they may run too late) but
 // we record the size here for clarity:
 // Entry i is the value passed to SizePerFreq[i].initialize() in
 // initializeWorkingSet().
 u32 CircularBufferSizes[NumFreq] = {
   // These are each mmap-ed so our minimum is one page.
   32*1024,
   16*1024,
   8*1024,
   4*1024,
   4*1024,
   4*1024,
   4*1024,
   4*1024,
 };

 // Marks every shadow byte covering the app range starting at Addr and spanning
 // Size bytes as accessed, by OR-ing in ShadowAccessedVal (the ever-accessed
 // and since-last-sample bits). A zero Size is a no-op.
 // PC and IsWrite are not used by this routine.
 // Each shadow location is only written when it does not already hold the
 // accessed bits.
 void processRangeAccessWorkingSet(uptr PC, uptr Addr, SIZE_T Size,
                                   bool IsWrite) {
   if (Size == 0)
     return;
   const uptr LineBytes = getFlags()->cache_line_size;
   // As Addr+Size could overflow at the top of a 32-bit address space,
   // the line count is derived from the remainders rather than by rounding
   // the start and end addresses: whole lines, plus any extra line caused by
   // the partial pieces at the start and end.
   const SIZE_T LineCount =
       Size / LineBytes +
       (LineBytes - 1 + Addr % LineBytes + Size % LineBytes) / LineBytes;
   byte *Cursor = (byte *)appToShadow(Addr);
   SIZE_T Done = 0;
   // Leading shadow bytes, one at a time, until the cursor is word-aligned.
   for (; Done < LineCount && (uptr)Cursor % 4 != 0; ++Cursor, ++Done) {
     if ((*Cursor & ShadowAccessedVal) != ShadowAccessedVal)
       *Cursor |= ShadowAccessedVal;
   }
   // Middle part, a whole shadow word (four shadow bytes) at a time.
   // Using a word-stride loop improves the runtime of a microbenchmark of
   // memset calls by 10%.
   const u32 FourAccessed = ShadowAccessedVal | ShadowAccessedVal << 8 |
                            ShadowAccessedVal << 16 | ShadowAccessedVal << 24;
   for (; Done + 4 <= LineCount; Cursor += 4, Done += 4) {
     u32 *Word = (u32 *)Cursor;
     if ((*Word & FourAccessed) != FourAccessed)
       *Word |= FourAccessed;
   }
   // Trailing shadow bytes that do not fill a whole word.
   for (; Done < LineCount; ++Cursor, ++Done) {
     if ((*Cursor & ShadowAccessedVal) != ShadowAccessedVal)
       *Cursor |= ShadowAccessedVal;
   }
 }

 // Counts the shadow bytes in [ShadowStart, ShadowEnd) that have bit BitIdx set
 // and returns that count.
 // ShadowStart is rounded down to a u32 boundary and the range is scanned a
 // whole word at a time, so the last word scanned extends past ShadowEnd when
 // ShadowEnd is not itself word-aligned.
 // When the record_snapshots flag is set and BitIdx < MaxAccumBitIdx, each set
 // bit is also accumulated into the next bit to the left.
 // The bit is cleared in every word where it was found, except for
 // BitIdx==TotalWorkingSetBitIdx, as that top bit measures the access during
 // the entire execution and should never be cleared.
 static u32 countAndClearShadowValues(u32 BitIdx, uptr ShadowStart,
                                      uptr ShadowEnd) {
   const u32 BitMask = 0x1 << BitIdx;
   // The same bit replicated into each of the four bytes of a word.
   const u32 WordMask =
       BitMask | BitMask << 8 | BitMask << 16 | BitMask << 24;
   const bool PropagateLeft =
       getFlags()->record_snapshots && BitIdx < MaxAccumBitIdx;
   // Do not clear the bit that measures access during the entire execution.
   const bool ResetBit = BitIdx < TotalWorkingSetBitIdx;
   u32 LinesSeen = 0;
   u32 *Word = (u32 *)RoundDownTo(ShadowStart, sizeof(u32));
   u32 *const Limit = (u32 *)ShadowEnd;
   for (; Word < Limit; ++Word) {
     // Skip words in which no byte has the bit set.
     if ((*Word & WordMask) == 0)
       continue;
     byte *Bytes = (byte *)Word;
     for (u32 K = 0; K < sizeof(u32); ++K) {
       if ((Bytes[K] & BitMask) == 0)
         continue;
       ++LinesSeen;
       // Accumulate to the lower-frequency bit to the left.
       if (PropagateLeft)
         Bytes[K] |= (BitMask << 1);
     }
     // Clear this bit from every shadow byte of the word.
     if (ResetBit)
       *Word &= ~WordMask;
   }
   return LinesSeen;
 }

 // Scan shadow memory to calculate the number of cache lines being accessed,
 // i.e., the number of shadow bytes whose bit BitIdx is set.
 // Every mapped segment whose start address is shadow memory and which is
 // writable is walked with countAndClearShadowValues(), which also clears the
 // bit as it is counted.
 // We do *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit
 // measures the access during the entire execution and should never be cleared.
 // Returns the count summed over all walked segments.
 static u32 computeWorkingSizeAndReset(u32 BitIdx) {
   u32 WorkingSetSize = 0;
   MemoryMappingLayout MemIter(true/*cache*/);
   MemoryMappedSegment Segment;
   while (MemIter.Next(&Segment)) {
     // The arguments follow the order of the format string: app, shadow, prot.
     VPrintf(4, "%s: considering %p-%p app=%d shadow=%d prot=%u\n", __FUNCTION__,
             Segment.start, Segment.end, isAppMem(Segment.start),
             isShadowMem(Segment.start), Segment.protection);
     if (isShadowMem(Segment.start) && Segment.IsWritable()) {
       VPrintf(3, "%s: walking %p-%p\n", __FUNCTION__, Segment.start,
               Segment.end);
       WorkingSetSize +=
           countAndClearShadowValues(BitIdx, Segment.start, Segment.end);
     }
   }
   return WorkingSetSize;
 }

 // Sample callback handed to Thread.launchThread() in initializeWorkingSet().
 // Arg is unused.
 // This is invoked from a signal handler but in a sideline thread doing nothing
 // else so it is a little less fragile than a typical signal handler.
 // Starting at CurWorkingSetBitIdx and going up to MaxAccumBitIdx, for each bit
 // whose period (Freq) divides the new SnapshotNum, the working set size for
 // that bit is computed (resetting the bit) and appended to SizePerFreq[bit].
 // The walk stops at the first bit whose period does not divide SnapshotNum.
 static void takeSample(void *Arg) {
   u32 BitIdx = CurWorkingSetBitIdx;
   u32 Freq = 1;
   ++SnapshotNum; // Simpler to skip 0 whose mod matches everything.
   // Freq != 0: with a large snapshot_step the repeated left shift can wrap
   // Freq around to zero, and the modulo would then divide by zero.
   while (BitIdx <= MaxAccumBitIdx && Freq != 0 &&
          (SnapshotNum % Freq) == 0) {
     u32 NumLines = computeWorkingSizeAndReset(BitIdx);
     VReport(1, "%s: snapshot #%5d bit %d freq %4d: %8u\n", SanitizerToolName,
             SnapshotNum, BitIdx, Freq, NumLines);
     SizePerFreq[BitIdx].push_back(NumLines);
     // Each bit to the left is sampled 2^snapshot_step times less often.
     Freq = Freq << getFlags()->snapshot_step;
     BitIdx++;
   }
 }

 // Returns the current value of SnapshotNum, i.e. how many samples
 // takeSample() has taken so far.
 unsigned int getSampleCountWorkingSet() { return SnapshotNum; }

 // Initialization that must be done before any instrumented code is executed.
 // Checks that the cache_line_size flag equals the CacheLineSize the shadow
 // layout assumes, then calls registerMemoryFaultHandler().
 void initializeShadowWorkingSet() {
   CHECK(getFlags()->cache_line_size == CacheLineSize);
   registerMemoryFaultHandler();
 }

 void initializeWorkingSet() {
   if (getFlags()->record_snapshots) {
     for (u32 i = 0; i < NumFreq; ++i)
       SizePerFreq[i].initialize(CircularBufferSizes[i]);
     Thread.launchThread(takeSample, nullptr, getFlags()->sample_freq);
   }
 }

 // Chooses a display unit for a period given in milliseconds: minutes above
 // 600000 ms, seconds above 10000 ms, milliseconds otherwise.
 // Stores the unit name in Unit and returns the period converted to that unit
 // using integer division.
 static u32 getPeriodForPrinting(u32 MilliSec, const char *&Unit) {
   if (MilliSec > 600000) {
     Unit = "min";
     return MilliSec / 60000;
   }
   if (MilliSec > 10000) {
     Unit = "sec";
     return MilliSec / 1000;
   }
   Unit = "ms";
   return MilliSec;
 }

 // Converts a cache line count into a size for display: megabytes above 10 MB
 // worth of lines, kilobytes above 10 KB worth of lines, bytes otherwise.
 // Stores the unit name in Unit and returns the size in that unit (integer
 // division for KB and MB).
 static u32 getSizeForPrinting(u32 NumOfCachelines, const char *&Unit) {
   // Cache lines per KB and per MB. We need constants here to avoid software
   // divide support.
   static const u32 LinesPerKB = (0x1 << 10) / CacheLineSize;
   static const u32 LinesPerMB = LinesPerKB << 10;

   if (NumOfCachelines > 10 * LinesPerMB) {
     Unit = "MB";
     return NumOfCachelines / LinesPerMB;
   }
   if (NumOfCachelines > 10 * LinesPerKB) {
     Unit = "KB";
     return NumOfCachelines / LinesPerKB;
   }
   Unit = "Bytes";
   return NumOfCachelines * CacheLineSize;
 }

 void reportWorkingSet() {
   const char *Unit;
   if (getFlags()->record_snapshots) {
     u32 Freq = 1;
     Report(" Total number of samples: %u\n", SnapshotNum);
     for (u32 i = 0; i < NumFreq; ++i) {
       u32 Time = getPeriodForPrinting(getFlags()->sample_freq*Freq, Unit);
       Report(" Samples array #%d at period %u %s\n", i, Time, Unit);
       // FIXME: report whether we wrapped around and thus whether we
       // have data on the whole run or just the last N samples.
       for (u32 j = 0; j < SizePerFreq[i].size(); ++j) {
         u32 Size = getSizeForPrinting(SizePerFreq[i][j], Unit);
         Report("#%4d: %8u %s (%9u cache lines)\n", j, Size, Unit,
                SizePerFreq[i][j]);
       }
       Freq = Freq << getFlags()->snapshot_step;
     }
   }

   // Get the working set size for the entire execution.
   u32 NumOfCachelines = computeWorkingSizeAndReset(TotalWorkingSetBitIdx);
   u32 Size = getSizeForPrinting(NumOfCachelines, Unit);
   Report(" %s: the total working set size: %u %s (%u cache lines)\n",
          SanitizerToolName, Size, Unit, NumOfCachelines);
 }

 int finalizeWorkingSet() {
   if (getFlags()->record_snapshots)
     Thread.joinThread();
   reportWorkingSet();
   if (getFlags()->record_snapshots) {
     for (u32 i = 0; i < NumFreq; ++i)
       SizePerFreq[i].free();
   }
   return 0;
 }

 } // namespace __esan
	//===-- working_set.cpp ---------------------------------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file is a part of EfficiencySanitizer, a family of performance tuners.
	//
	// This file contains working-set-specific code.
	//===----------------------------------------------------------------------===//

	#include "working_set.h"
	#include "esan.h"
	#include "esan_circular_buffer.h"
	#include "esan_flags.h"
	#include "esan_shadow.h"
	#include "esan_sideline.h"
	#include "sanitizer_common/sanitizer_procmaps.h"

	// We shadow every cache line of app memory with one shadow byte.
	// - The highest bit of each shadow byte indicates whether the corresponding
	// cache line has ever been accessed.
	// - The lowest bit of each shadow byte indicates whether the corresponding
	// cache line was accessed since the last sample.
	// - The other bits are used for working set snapshots at successively
	// lower frequencies, each bit to the left from the lowest bit stepping
	// down the frequency by 2 to the power of getFlags()->snapshot_step.
	// Thus we have something like this:
	// Bit 0: Since last sample
	// Bit 1: Since last 2^2 samples
	// Bit 2: Since last 2^4 samples
	// Bit 3: ...
	// Bit 7: Ever accessed.
	// We live with races in accessing each shadow byte.
	// One shadow byte; its bit layout is described in the comment above.
	using byte = unsigned char;

	namespace __esan {

	// Our shadow memory assumes that the line size is 64.
	// initializeShadowWorkingSet() CHECKs the cache_line_size flag against this.
	static const u32 CacheLineSize = 64;

	// See the shadow byte layout description above.
	static const u32 TotalWorkingSetBitIdx = 7;
	// We accumulate to the left until we hit this bit.
	// We don't need to accumulate to the final bit as it's set on each ref
	// by the compiler instrumentation.
	static const u32 MaxAccumBitIdx = 6;
	static const u32 CurWorkingSetBitIdx = 0;
	// The ever-accessed bit and the since-last-sample bit together (0x81); this is
	// the value OR-ed into shadow bytes by processRangeAccessWorkingSet().
	static const byte ShadowAccessedVal =
	  (1 << TotalWorkingSetBitIdx) | (1 << CurWorkingSetBitIdx);

	// Sideline thread that runs takeSample(); launched in initializeWorkingSet()
	// and joined in finalizeWorkingSet().
	static SidelineThread Thread;
	// Number of samples taken so far; incremented once per takeSample() call.
	// If we use real-time-based timer samples this won't overflow in any realistic
	// scenario, but if we switch to some other unit (such as memory accesses) we
	// may want to consider a 64-bit int.
	static u32 SnapshotNum;

	// We store the wset size for each of 8 different sampling frequencies.
	static const u32 NumFreq = 8; // One for each bit of our shadow bytes.
	// We cannot use static objects as the global destructor is called
	// prior to our finalize routine.
	// These are each circular buffers, sized up front.
	// SizePerFreq[i] receives the sizes computed for shadow bit i in takeSample().
	CircularBuffer<u32> SizePerFreq[NumFreq];
	// We cannot rely on static initializers (they may run too late) but
	// we record the size here for clarity:
	// Entry i is the value passed to SizePerFreq[i].initialize() in
	// initializeWorkingSet().
	u32 CircularBufferSizes[NumFreq] = {
	  // These are each mmap-ed so our minimum is one page.
	  32*1024,
	  16*1024,
	  8*1024,
	  4*1024,
	  4*1024,
	  4*1024,
	  4*1024,
	  4*1024,
	};

	// Marks every shadow byte covering the app range starting at Addr and spanning
	// Size bytes as accessed, by OR-ing in ShadowAccessedVal (the ever-accessed
	// and since-last-sample bits). A zero Size is a no-op.
	// PC and IsWrite are not used by this routine.
	// Each shadow location is only written when it does not already hold the
	// accessed bits.
	void processRangeAccessWorkingSet(uptr PC, uptr Addr, SIZE_T Size,
	                                  bool IsWrite) {
	  if (Size == 0)
	    return;
	  SIZE_T I = 0;
	  uptr LineSize = getFlags()->cache_line_size;
	  // As Addr+Size could overflow at the top of a 32-bit address space,
	  // we avoid the simpler formula that rounds the start and end.
	  SIZE_T NumLines = Size / LineSize +
	    // Add any extra at the start or end adding on an extra line:
	    (LineSize - 1 + Addr % LineSize + Size % LineSize) / LineSize;
	  // Shadow is a pointer to the first shadow byte of the range.
	  byte *Shadow = (byte *)appToShadow(Addr);
	  // Write shadow bytes until we're word-aligned.
	  while (I < NumLines && (uptr)Shadow % 4 != 0) {
	    if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal)
	      *Shadow |= ShadowAccessedVal;
	    ++Shadow;
	    ++I;
	  }
	  // Write whole shadow words at a time.
	  // Using a word-stride loop improves the runtime of a microbenchmark of
	  // memset calls by 10%.
	  u32 WordValue = ShadowAccessedVal | ShadowAccessedVal << 8 |
	    ShadowAccessedVal << 16 | ShadowAccessedVal << 24;
	  while (I + 4 <= NumLines) {
	    // Access four shadow bytes at once through a u32 pointer.
	    if ((*(u32*)Shadow & WordValue) != WordValue)
	      *(u32*)Shadow |= WordValue;
	    Shadow += 4;
	    I += 4;
	  }
	  // Write any trailing shadow bytes.
	  while (I < NumLines) {
	    if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal)
	      *Shadow |= ShadowAccessedVal;
	    ++Shadow;
	    ++I;
	  }
	}

	// Counts the shadow bytes in [ShadowStart, ShadowEnd) that have bit BitIdx set
	// and returns that count.
	// ShadowStart is rounded down to a u32 boundary and the range is scanned a
	// whole word at a time, so the last word scanned extends past ShadowEnd when
	// ShadowEnd is not itself word-aligned.
	// When the record_snapshots flag is set and BitIdx < MaxAccumBitIdx, each set
	// bit is also accumulated into the next bit to the left.
	// The bit is cleared in every word where it was found, except for
	// BitIdx==TotalWorkingSetBitIdx, as that top bit measures the access during
	// the entire execution and should never be cleared.
	static u32 countAndClearShadowValues(u32 BitIdx, uptr ShadowStart,
	                                     uptr ShadowEnd) {
	  u32 WorkingSetSize = 0;
	  u32 ByteValue = 0x1 << BitIdx;
	  // The same bit replicated into each of the four bytes of a word.
	  u32 WordValue = ByteValue | ByteValue << 8 | ByteValue << 16 |
	    ByteValue << 24;
	  // Get word aligned start.
	  ShadowStart = RoundDownTo(ShadowStart, sizeof(u32));
	  bool Accum = getFlags()->record_snapshots && BitIdx < MaxAccumBitIdx;
	  // Do not clear the bit that measures access during the entire execution.
	  bool Clear = BitIdx < TotalWorkingSetBitIdx;
	  for (u32 *Ptr = (u32 *)ShadowStart; Ptr < (u32 *)ShadowEnd; ++Ptr) {
	    // Only look at the individual bytes when some byte has the bit set.
	    if ((*Ptr & WordValue) != 0) {
	      byte *BytePtr = (byte *)Ptr;
	      for (u32 j = 0; j < sizeof(u32); ++j) {
	        if (BytePtr[j] & ByteValue) {
	          ++WorkingSetSize;
	          if (Accum) {
	            // Accumulate to the lower-frequency bit to the left.
	            BytePtr[j] |= (ByteValue << 1);
	          }
	        }
	      }
	      if (Clear) {
	        // Clear this bit from every shadow byte.
	        *Ptr &= ~WordValue;
	      }
	    }
	  }
	  return WorkingSetSize;
	}

	// Scan shadow memory to calculate the number of cache lines being accessed,
	// i.e., the number of shadow bytes whose bit BitIdx is set.
	// Every mapped segment whose start address is shadow memory and which is
	// writable is walked with countAndClearShadowValues(), which also clears the
	// bit as it is counted.
	// We do not clear for BitIdx==TotalWorkingSetBitIdx, as that top bit
	// measures the access during the entire execution and should never be cleared.
	// Returns the count summed over all walked segments.
	static u32 computeWorkingSizeAndReset(u32 BitIdx) {
	  u32 WorkingSetSize = 0;
	  MemoryMappingLayout MemIter(true/*cache*/);
	  MemoryMappedSegment Segment;
	  while (MemIter.Next(&Segment)) {
	    // The arguments follow the order of the format string: app, shadow, prot.
	    VPrintf(4, "%s: considering %p-%p app=%d shadow=%d prot=%u\n", __FUNCTION__,
	            Segment.start, Segment.end, isAppMem(Segment.start),
	            isShadowMem(Segment.start), Segment.protection);
	    if (isShadowMem(Segment.start) && Segment.IsWritable()) {
	      VPrintf(3, "%s: walking %p-%p\n", __FUNCTION__, Segment.start,
	              Segment.end);
	      WorkingSetSize +=
	          countAndClearShadowValues(BitIdx, Segment.start, Segment.end);
	    }
	  }
	  return WorkingSetSize;
	}

	// Sample callback handed to Thread.launchThread() in initializeWorkingSet().
	// Arg is unused.
	// This is invoked from a signal handler but in a sideline thread doing nothing
	// else so it is a little less fragile than a typical signal handler.
	// Starting at CurWorkingSetBitIdx and going up to MaxAccumBitIdx, for each bit
	// whose period (Freq) divides the new SnapshotNum, the working set size for
	// that bit is computed (resetting the bit) and appended to SizePerFreq[bit].
	// The walk stops at the first bit whose period does not divide SnapshotNum.
	static void takeSample(void *Arg) {
	  u32 BitIdx = CurWorkingSetBitIdx;
	  u32 Freq = 1;
	  ++SnapshotNum; // Simpler to skip 0 whose mod matches everything.
	  // Freq != 0: with a large snapshot_step the repeated left shift can wrap
	  // Freq around to zero, and the modulo would then divide by zero.
	  while (BitIdx <= MaxAccumBitIdx && Freq != 0 &&
	         (SnapshotNum % Freq) == 0) {
	    u32 NumLines = computeWorkingSizeAndReset(BitIdx);
	    VReport(1, "%s: snapshot #%5d bit %d freq %4d: %8u\n", SanitizerToolName,
	            SnapshotNum, BitIdx, Freq, NumLines);
	    SizePerFreq[BitIdx].push_back(NumLines);
	    // Each bit to the left is sampled 2^snapshot_step times less often.
	    Freq = Freq << getFlags()->snapshot_step;
	    BitIdx++;
	  }
	}

	// Returns the current value of SnapshotNum, i.e. how many samples
	// takeSample() has taken so far.
	unsigned int getSampleCountWorkingSet() { return SnapshotNum; }

	// Initialization that must be done before any instrumented code is executed.
	// Checks that the cache_line_size flag equals the CacheLineSize the shadow
	// layout assumes, then calls registerMemoryFaultHandler().
	void initializeShadowWorkingSet() {
	CHECK(getFlags()->cache_line_size == CacheLineSize);
	registerMemoryFaultHandler();
	}

	void initializeWorkingSet() {
	if (getFlags()->record_snapshots) {
	for (u32 i = 0; i < NumFreq; ++i)
	SizePerFreq[i].initialize(CircularBufferSizes[i]);
	Thread.launchThread(takeSample, nullptr, getFlags()->sample_freq);
	}
	}

	// Chooses a display unit for a period given in milliseconds: minutes above
	// 600000 ms, seconds above 10000 ms, milliseconds otherwise.
	// Stores the unit name in Unit and returns the period converted to that unit
	// using integer division.
	static u32 getPeriodForPrinting(u32 MilliSec, const char *&Unit) {
	  if (MilliSec > 600000) {
	    Unit = "min";
	    return MilliSec / 60000;
	  }
	  if (MilliSec > 10000) {
	    Unit = "sec";
	    return MilliSec / 1000;
	  }
	  Unit = "ms";
	  return MilliSec;
	}

	// Converts a cache line count into a size for display: megabytes above 10 MB
	// worth of lines, kilobytes above 10 KB worth of lines, bytes otherwise.
	// Stores the unit name in Unit and returns the size in that unit (integer
	// division for KB and MB).
	static u32 getSizeForPrinting(u32 NumOfCachelines, const char *&Unit) {
	  // Cache lines per KB and per MB. We need constants here to avoid software
	  // divide support.
	  static const u32 LinesPerKB = (0x1 << 10) / CacheLineSize;
	  static const u32 LinesPerMB = LinesPerKB << 10;

	  if (NumOfCachelines > 10 * LinesPerMB) {
	    Unit = "MB";
	    return NumOfCachelines / LinesPerMB;
	  }
	  if (NumOfCachelines > 10 * LinesPerKB) {
	    Unit = "KB";
	    return NumOfCachelines / LinesPerKB;
	  }
	  Unit = "Bytes";
	  return NumOfCachelines * CacheLineSize;
	}

	void reportWorkingSet() {
	const char *Unit;
	if (getFlags()->record_snapshots) {
	u32 Freq = 1;
	Report(" Total number of samples: %u\n", SnapshotNum);
	for (u32 i = 0; i < NumFreq; ++i) {
	u32 Time = getPeriodForPrinting(getFlags()->sample_freq*Freq, Unit);
	Report(" Samples array #%d at period %u %s\n", i, Time, Unit);
	// FIXME: report whether we wrapped around and thus whether we
	// have data on the whole run or just the last N samples.
	for (u32 j = 0; j < SizePerFreq[i].size(); ++j) {
	u32 Size = getSizeForPrinting(SizePerFreq[i][j], Unit);
	Report("#%4d: %8u %s (%9u cache lines)\n", j, Size, Unit,
	SizePerFreq[i][j]);
	}
	Freq = Freq << getFlags()->snapshot_step;
	}
	}

	// Get the working set size for the entire execution.
	u32 NumOfCachelines = computeWorkingSizeAndReset(TotalWorkingSetBitIdx);
	u32 Size = getSizeForPrinting(NumOfCachelines, Unit);
	Report(" %s: the total working set size: %u %s (%u cache lines)\n",
	SanitizerToolName, Size, Unit, NumOfCachelines);
	}

	int finalizeWorkingSet() {
	if (getFlags()->record_snapshots)
	Thread.joinThread();
	reportWorkingSet();
	if (getFlags()->record_snapshots) {
	for (u32 i = 0; i < NumFreq; ++i)
	SizePerFreq[i].free();
	}
	return 0;
	}

	} // namespace __esan