//
// Copyright (c) Microsoft Corporation. All rights reserved.
//

namespace Microsoft.Singularity.Diagnostics
{
    using FileSystem.Utils;

    using Microsoft.Singularity;
    using Microsoft.Singularity.Channels;
    using Microsoft.Singularity.Directory;
    using Microsoft.Singularity.V1.Services;

    using System;
    using System.Collections;
    using System.GCs;
    using System.IO;
    using System.Runtime.CompilerServices;
    using System.Threading;

    // Receive notifications from the GC and deliver them to a profiler, via a log file.
    public class CLRProfiler : GCProfiler
    {
        // The public API for a typical client app.
        static public void StartProfiling()
        {
            CLRProfiler prof = new CLRProfiler();
            prof.Initialize();
            GC.SetProfiler(prof);
        }
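
        // Illustrative only -- not part of the original source. A minimal sketch of how a
        // client app might install the profiler; the Main and RunWorkload names below are
        // hypothetical placeholders for the application's own entry point and workload.
        //
        //     public static void Main()
        //     {
        //         CLRProfiler.StartProfiling();   // install the profiler before allocating heavily
        //         RunWorkload();                  // allocations are now logged to /fs/gcProfiler/<pid>
        //     }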

        internal abstract class Buffer
        {
            // This must be big enough for whatever the biggest logical entry is that someone
            // might write from a mutator thread. The GC thread has unbounded entries and
            // doesn't need to observe this rule. I think the biggest entry is probably a
            // typeName, which could be 512 bytes for the name, plus some slop.
            //
            // In a checked build, we assert that we remain within this requirement. In a
            // retail build, we quietly generate a corrupt log with entries intermingled.
            // If you don't like this policy, change LocalBuffer.FlushOnOverflow to do
            // something different.
            protected const int bufferSize = 800;
            protected byte[]! data;     // data buffer
            protected int iBuff;        // index into data buffer

            protected Buffer()
            {
                data = new byte[bufferSize];
                iBuff = 0;
                base();
            }

            // Write a byte
            internal void Write(byte c)
            {
                if (iBuff >= bufferSize) {
                    DebugStub.Assert(iBuff == bufferSize);
                    FlushOnOverflow();
                    DebugStub.Assert(iBuff == 0);   // no races possible
                }
                data[iBuff++] = c;
            }

            // Write a byte and a space-separator
            internal void WritePair(byte c)
            {
                Write(c);
                Write(' ');
            }

            // Perform a Write of the number.ToString -- without allocating anything, with
            // a space-separator
            internal void WriteNumber(ulong ul)
            {
                if (ul == 0) {
                    WritePair('0');
                    return;
                }

                Write('0');
                Write('x');

                bool leadingZero = true;
                byte nybble;

                // 16 nybbles in an 8-byte quantity
                for (int i=0; i<16; i++) {
                    nybble = (byte) ((ul & 0xf000000000000000) >> 60);
                    ul <<= 4;

                    if (leadingZero) {
                        if (nybble == 0) {
                            continue;   // no need to print leading zeroes
                        }
                        else {
                            leadingZero = false;
                        }
                    }

                    nybble += (byte) ((nybble > 9) ? 'a'-10 : '0');
                    Write(nybble);
                }
                Write(' ');
            }
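
            // Illustrative note -- not part of the original source: as written above,
            // WriteNumber(255) emits the characters "0xff " (lowercase hex plus the
            // trailing space-separator), while WriteNumber(0) emits just "0 ".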

            // No space-separator!
            internal void WriteString(string! s)
            {
                for (int i=0; i<s.Length; i++) {
                    Write((byte) s[i]);     // simplest Unicode -> ASCII
                }
            }

            // Terminate an entry
            internal void WriteCRLF()
            {
                Write('\r');
                Write('\n');
            }

            // For some buffer subtypes, it is a bug to overflow an entry while writing a
            // partial entry. That's why we distinguish the overflow and consistent
            // forms of flush.
            internal virtual void FlushOnOverflow()
            {
                FlushConsistentEntry();
            }

            internal abstract void FlushConsistentEntry();

            // A child buffer can ask its parent to accept some bytes. The parent is responsible
            // for the appropriate synchronization (including with the GC!).
            internal abstract void TakeBytes(byte[]! bytes, int count);
        }

        // A LocalBuffer that -- when completely consistent -- can flush up to its Parent.
        internal class LocalBuffer : Buffer
        {
            private Buffer! parent;

            // With everyone allocating and using local buffers, we need at least a minimal
            // cache for recycling.
            private LocalBuffer next;       // create linked list for a cache of buffers

            static private LocalBuffer cache;
            static private SpinLock cacheLock;

            internal LocalBuffer(Buffer! parent)
            {
                this.parent = parent;
                base();
            }

            // The LocalBuffers are supposed to be large enough to accommodate any non-GC
            // logging. If we get here, that isn't the case, and we are about to corrupt
            // the log by interleaving output from multiple entries.
            internal override void FlushOnOverflow()
            {
#if _DEBUG
                DebugStub.Assert(false);
#endif
                base.FlushOnOverflow();     // emit a corruption
            }

            internal override void FlushConsistentEntry()
            {
                if (iBuff > 0) {
                    parent.TakeBytes(data, iBuff);
                    iBuff = 0;
                }
            }

            internal override void TakeBytes(byte[]! bytes, int count)
            {
                DebugStub.Assert(false, "Never implemented arbitrarily nested LocalBuffers");
            }

            // We don't want to create a new buffer on every log entry. Recycle
            // buffers using a pool.
            static internal LocalBuffer! FindOrCreateBuffer(FileBuffer! fileBuffer)
            {
                cacheLock.Acquire();
                LocalBuffer temp = cache;
                if (temp != null) {
                    cache = temp.next;
                    temp.next = null;
                }
                cacheLock.Release();
                if (temp != null) {
                    // Not a very general cache. Make sure it isn't being abused.
                    DebugStub.Assert(fileBuffer == temp.parent);
                    return temp;
                }
                // cache underflow:
                return new LocalBuffer(fileBuffer);
            }

            // The below is unsafe. If there are any outstanding references to this buffer, then
            // we no longer have single-threaded access and the entries will become corrupt.
            internal void Recycle()
            {
                FlushConsistentEntry();

                cacheLock.Acquire();        // cache grows to a high water mark
                this.next = cache;
                cache = this;
                cacheLock.Release();
            }
        }

        // A buffer associated with a file to which it directly writes.
        internal class FileBuffer : Buffer
        {
            private TRef<FileContract.Imp:Ready>! connRef;
            private long filePos;
            private SpinLock fileLock;
            private Thread owningThread;

            internal FileBuffer(TRef<FileContract.Imp:Ready>! connRef)
            {
                this.connRef = connRef;
                base();
            }

            internal override void FlushConsistentEntry()
            {
                if (iBuff > 0) {
                    FileContract.Imp conn = connRef.Acquire();
                    byte[]! in ExHeap bytes = Bitter.FromByteArray(data, 0, iBuff);

                    try {
                        long bytesWritten;
                        int error;

                        conn.SendWrite(bytes, 0, filePos, bytes.Length);
                        conn.RecvAckWrite(out bytes, out bytesWritten, out error);
                        delete bytes;

                        filePos += bytesWritten;
                        if (error != 0) {
                            Console.WriteLine("Failed to write to GC Profiler log\n");
                            // TODO: We need a real error story for the platform.
                            throw new Exception("Failed to write to GC Profiler log");
                        }
                    }
                    finally {
                        // Other threads could have passed their 'enabled' check and be
                        // blocked on the monitor. So even on failure we must empty
                        // the buffer
                        iBuff = 0;
                        connRef.Release(conn);
                    }
                }
            }

            // TakeBytes from a child LocalBuffer in a single consistent cycle. And do
            // so without allocating or making a kernel transition or otherwise yielding
            // to the GC.
            //
            // If the FileBuffer is currently Owned for exclusive access (i.e. the GC is
            // streaming inconsistent records of arbitrary size and cannot be interrupted)
            // then Yield to the GC and try again. Sleeping should go through the kernel
            // and provide an adequate Transition.
            //
            // Everything happens in a SpinLock because these locks don't themselves
            // allocate. Also, the SpinLock guarantees a consistent read of state like
            // 'OwningThread' across all the CPUs.
            internal override void TakeBytes(byte[]! bytes, int count)
            {
                DebugStub.Assert(count <= bufferSize);      // Guaranteed.
                while (true) {
                    fileLock.Acquire();

                    // Is the GC thread holding onto exclusive access right now?
                    if (OwningThread != null) {
                        DebugStub.Assert(OwningThread != Thread.CurrentThread);
                        fileLock.Release();
                        Thread.Sleep(0);    // yield to the GC

                        // It's possible that OwningThread has changed again (though
                        // not in a manner that involves the current thread). Start over.
                        continue;
                    }

                    // Ensure there is enough space in the FileBuffer for all our
                    // bytes. Once we start accepting bytes, there must be no
                    // allocations. And we cannot write to disk while holding the
                    // spinlock.
                    if (iBuff + count > bufferSize) {
                        fileLock.Release();
                        FlushConsistentEntry();

                        // It's possible that OwningThread has changed, or another
                        // writer has intruded. Start over.
                        continue;
                    }

                    // We are holding the lock. The GC hasn't taken control from us.
                    // There is room for our entry. Blast it without allocating or yielding.
                    for (int i=0; i<count; i++) {
                        data[iBuff++] = bytes[i];
                    }
                    fileLock.Release();
                    return;     // success.
                }
            }

            // A FileBuffer can be owned by a client thread, in which case it will not TakeBytes
            // from any children.
            internal Thread OwningThread
            {
                get {
                    // Reads are either performed for Assertion purposes by the thread that
                    // is the owner, or they are only performed by threads that hold the
                    // spinlock. However, this invariant cannot be easily expressed in an
                    // Assert.
                    return owningThread;
                }
                set {
                    fileLock.Acquire();
                    owningThread = value;
                    fileLock.Release();
                }
            }
        }

        private bool enabled;               // profiling is enabled?

        private ulong lastCycle;            // time entries
        private ulong firstCycle;
        private const int cycleGranularity = 5000000;

        private const int maxGeneration = 3;    // CLR Profiler only handles this many
        private int[]! generations;

        private TRef<FileContract.Imp:Ready> connRef;

        private Hashtable! typeTable;       // hash of types->ids
        private Hashtable! stackTable;      // hash of stacks->ids
        private Hashtable! funcTable;       // hash of Eips->ids

        private uint nextTypeNo;            // hand out IDs monotonically
        private uint nextStackNo;
        private uint nextFuncNo;

        private FileBuffer _fileBuffer;
        private FileBuffer! fileBuffer {
            get {
                // We can't capture the specs (in the type system) that
                // one needs to call Initialize before any other ops
                assert _fileBuffer != null;
                return _fileBuffer;
            }
            set {
                this._fileBuffer = value;
            }
        }

        private int rootCount;              // break up 'r' entries into chunks

        private Thread recurseThread;       // prevent recursion when the profiler allocates
        private SpinLock allocationLock;

        // For stack traces, the Eips and corresponding stack numbers
        private const int stackSize = 8;    // increase for bigger / slower stack traces
        private UIntPtr[]! stackEips;
        private uint[]! stackNos;

        internal CLRProfiler()
        {
            this.generations = new int[maxGeneration];
            this.typeTable = new Hashtable();
            this.stackTable = new Hashtable();
            this.funcTable = new Hashtable();
            this.stackEips = new UIntPtr[stackSize];
            this.stackNos = new uint[stackSize];
            firstCycle = Processor.CycleCount;
            base();
        }

        // Initialization, prior to attempting to set this profiler into the GC. It's
        // inappropriate to do this stuff inside a constructor.
        internal void Initialize()
        {
            string logDirectory = "/fs/gcProfiler";
            bool isDirectory;

            if ((DirectoryUtils.PathIsDirectory(logDirectory, out isDirectory) != 0) ||
                !isDirectory) {

                // If it doesn't look like a healthy directory, try & create it. There
                // seems to be a bug where valid nested directories fail the PathIsDirectory
                // call. So make a best effort to create it, but if it already exists this
                // will be a NOP. Therefore, ignore failure at this stage.
                DirectoryUtils.CreateDirectory(logDirectory);
            }

            ushort pid = ProcessService.GetCurrentProcessId();

            string logFile = logDirectory + "/" + pid.ToString();

            // If the file already exists, blow it away. Don't worry about failures on the
            // delete... Create & Open are what we really care about.
            FileUtils.DeleteFile(logFile);

            if (FileUtils.CreateFile(logFile) != 0) {
                Console.WriteLine("Couldn't create file '" + logFile + "'");
                return;
            }

            FileContract.Imp conn = FileUtils.OpenFile(logFile);
            if (conn == null) {
                Console.WriteLine("Couldn't open file '" + logFile + "'");
                return;
            }

            connRef = new TRef<FileContract.Imp:Ready>(conn);
            fileBuffer = new FileBuffer(connRef);

            // Write out the log header, while we are still single-threaded.
            WriteHeader(fileBuffer);

            // When we set this, we are no longer single-threaded:
            this.enabled = true;
        }

        // the API by which our base class calls us

        protected override void Shutdown()
        {
            // This is presumed to be called when the process is single-threaded, since
            // the entire GC heap is shutting down.
            if (enabled) {
                // flush & close the log
                fileBuffer.FlushConsistentEntry();
                enabled = false;
            }
        }

        // There are 3 entry points that must write to the log, causing multi-threaded
        // concerns.
        //
        // 1. PreGC
        // 2. PostGC
        // 3. Allocation
        //
        // Here are those concerns:
        //
        // o Multiple Allocation notifications will race with each other.
        //
        // o Some GCs might use multiple threads. For now, I assume a single-threaded
        //   GC.
        //
        // o Some GCs might be concurrent with the application. For now, I assume that
        //   GC allows mutator threads to run, so long as they don't allocate, make a
        //   user/kernel transition, or otherwise synchronize with the GC.
        //
        // o Some GCs might relocate objects. I haven't implemented the notifications
        //   for this yet.
        //
        // The solution is that non-GC logging is performed to LocalBuffers. When a buffer
        // is in a consistent state, it can be flushed to the shared FileBuffer.
        // There are some subtleties to this:
        //
        // 1. The GC writes some entries that are unbounded in size (like the elements
        //    coming out of a huge object array). So it cannot use a LocalBuffer.
        //
        // 2. The mutator threads might have to write a set of entries like a stack
        //    trace, a type definition, and an allocation entry which are all related.
        //
        // The second issue is addressed by making the local buffers big enough to write
        // fully formed consistent entries without overflowing. We also tolerate races
        // in the definitions of some of the related entries, leading to duplicates.
        //
        // The first issue is addressed by putting the GC in a special position where
        // it can stream to the file on overflow, and prevent any mutator threads from
        // injecting their consistent entries into that stream, thereby corrupting it.
        // This means that a mutator might allocate in one GC, but not report that
        // allocation until the next GC. The CLR Profiler tool seems resilient to these
        // anomalies.
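
        // Illustrative summary -- not part of the original source -- of the two logging
        // paths described above, as implemented by the buffer classes in this file:
        //
        //     mutator thread (Allocation):
        //         LocalBuffer.FindOrCreateBuffer(fileBuffer)
        //             -> Write*() a complete, bounded entry
        //             -> Recycle() / FlushConsistentEntry()
        //             -> FileBuffer.TakeBytes(), which spins until the GC releases ownership
        //
        //     GC thread (PreGC .. PostGC):
        //         fileBuffer.OwningThread = Thread.CurrentThread
        //             -> Write*() directly to the FileBuffer, streaming to disk on overflow
        //             -> fileBuffer.OwningThread = null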

        // A GC is about to start. The world is in a sane place, except that we may
        // have stopped the mutator threads if this is a StopTheWorld collection.
        protected override void PreGC(int generation)
        {
            try {
                // Take ownership of the fileBuffer to prevent mutating threads from
                // interleaving with us.
                DebugStub.Assert(fileBuffer.OwningThread == null);
                fileBuffer.OwningThread = Thread.CurrentThread;

                // emit the fact that a GC will occur
                MaybeTick(fileBuffer);

                if (generation >= maxGeneration) {
                    generation = maxGeneration-1;
                }
                generations[generation]++;

                fileBuffer.WritePair('g');
                fileBuffer.WriteNumber((ulong) generations[0]);
                fileBuffer.WriteNumber((ulong) generations[1]);
                fileBuffer.WriteNumber((ulong) generations[2]);
                fileBuffer.WriteCRLF();
            }
            catch (Exception) {
                enabled = false;
                throw;
            }
        }

        // A GC has finished. The world is in a sane place, except that we might not
        // have started up all the mutator threads if this is a StopTheWorld collection.
        protected override void PostGC()
        {
            try {
                // emit the fact a GC has happened, including the state of the heap.
                MaybeTick(fileBuffer);

                // Write all the Roots
                fileBuffer.WritePair('r');
                rootCount = 0;
                ScanRoots();
                fileBuffer.WriteCRLF();     // finish off the open log entry

                // Write the reachability graph of the heap
                ScanObjects();

                // Once we have finished writing everything, we can allow mutator threads to
                // share access to the fileBuffer with their own consistent entries.
                fileBuffer.FlushConsistentEntry();
                DebugStub.Assert(fileBuffer.OwningThread == Thread.CurrentThread);
                fileBuffer.OwningThread = null;
            }
            catch (Exception) {
                enabled = false;
                throw;
            }
        }

        // In the list of roots, we have found another object
        protected override void ScanOneRoot(UIntPtr objectAddress)
        {
            if (objectAddress != 0) {
                // We want to put a reasonable chunk into each 'r' entry and then use multiple
                // such entries. But I'm tempted to say there is a bug in CLR Profiler that
                // prevents it from correctly handling multiple consecutive 'r' log entries.
                // So make the chunk size enormous for now.
                if (rootCount > 10000) {
                    fileBuffer.WriteCRLF();
                    fileBuffer.WritePair('r');
                    rootCount = 0;
                }
                fileBuffer.WriteNumber((ulong) objectAddress);
                rootCount++;
            }
        }

        // In the heap reachability graph, we are starting to dump a new object
        protected override void StartScanOneObject(UIntPtr objectAddress, Type type, UIntPtr size)
        {
            uint typeNo = MaybeWriteType(fileBuffer, (!)type);

            fileBuffer.WritePair('o');
            fileBuffer.WriteNumber((ulong) objectAddress);
            fileBuffer.WriteNumber((ulong) typeNo);
            fileBuffer.WriteNumber((ulong) size);

            // This will be followed by repeated calls to ScanOneObjectField, then
            // EndScanOneObject
        }

        // This is one of the references that the current object holds.
        protected override void ScanOneObjectField(UIntPtr objectAddress)
        {
            if (objectAddress != 0) {
                fileBuffer.WriteNumber((ulong) objectAddress);
            }
        }

        // We are finished scanning one object
        protected override void EndScanOneObject()
        {
            fileBuffer.WriteCRLF();
        }

        // Create a log entry for the allocation that just occurred on this thread.
        protected override void Allocation(UIntPtr objAddr, Type type, UIntPtr size)
        {
            // We cannot recurse inside an Allocation notification, or we will simply
            // blow the stack on the first entry. Also, we don't want to log allocations
            // that occur as a consequence of logging the state of the GC heap -- though
            // we could support that if we chose to.
            if (enabled &&
                recurseThread != Thread.CurrentThread &&            // recurse?
                fileBuffer.OwningThread != Thread.CurrentThread) {  // GC logging?
                allocationLock.Acquire();
                try {
                    DebugStub.Assert(recurseThread == null);
                    recurseThread = Thread.CurrentThread;
                    // record the fact that an object was allocated, into a local
                    // buffer that will be written to the fileBuffer only when we
                    // are no longer subject to the GC.
                    LocalBuffer! localBuffer = LocalBuffer.FindOrCreateBuffer(fileBuffer);
                    MaybeTick(localBuffer);

                    // Emit a stack trace.
                    // TODO: Processor.GetStackEips doesn't indicate how much of the array
                    // it modified, so we have to zero it before each call. Better to change
                    // the GetStackEips API!
                    for (int i=0; i<stackEips.Length; i++) {
                        stackEips[i] = 0;
                    }
                    Processor.GetStackEips(stackEips);
                    uint stkNo = MaybeWriteStack(localBuffer, (!) type, size, stackEips);

                    localBuffer.WritePair('!');
                    localBuffer.WriteNumber((ulong) Thread.CurrentThread.GetThreadId());
                    localBuffer.WriteNumber((ulong) objAddr);
                    localBuffer.WriteNumber((ulong) stkNo);
                    localBuffer.WriteCRLF();

                    localBuffer.Recycle();
                }
                finally {
                    recurseThread = null;
                    allocationLock.Release();
                }
            }
        }

        // The rest of our implementation details:

        private uint MaybeWriteStack(Buffer! buffer, Type! type, UIntPtr size, UIntPtr[]! stackEips)
        {
            // First make sure that we have a type record for the object being
            // instantiated at this stack.
            uint typeNo = MaybeWriteType(buffer, type);

            // Then, make sure we have a function record for each Eip in the stack. Of course
            // we don't know when a bunch of Eips map to different offsets in the same function.
            // So make a function for each unique Eip & fix it up in the post-processing.
            // Hopefully there aren't too many unique call sites in each method.
            ulong hash = typeNo;    // perhaps "typeNo ^ size" ?
            int edge = -1;
            for (int i=0; i<stackEips.Length && stackEips[i] != 0; i++) {
                edge++;
                // Map the individual Eips into their corresponding functions
                stackNos[i] = MaybeWriteFunction(buffer, stackEips[i]);
                hash = (hash << 11) + (hash ^ stackNos[i]);
            }

            // Note that we will statistically map some distinct stacks into the same
            // stack if they have the same hash. Oh well.
            object o = stackTable[hash];
            if (o != null) {
                return (uint) o;
            }

            // It's a novel stack. Note that we embed the size into the stack, but we
            // don't include the size in the hash. There's a technique for sharing
            // prefixes of stacks that could be explored here to get more accurate profiles
            // without huge stack expansion.
            // TODO: consider the above.
            uint stackNo = nextStackNo++;

            buffer.WritePair('n');
            buffer.WriteNumber((ulong) stackNo);
            buffer.WritePair('1');      // bits to indicate variant of record
            buffer.WriteNumber((ulong) typeNo);
            buffer.WriteNumber((ulong) size);

            // Stacks are emitted backwards, presumably to support common prefixes better.
            for (int i=edge; i>=0; i--) {
                buffer.WriteNumber((ulong) stackNos[i]);
            }
            buffer.WriteCRLF();

            // Flush this entry before putting it into the hashtable, where other threads might
            // see it. This means we may have duplicate conflicting entries for the same logical
            // stack, but this is tolerated by the profiler.
            buffer.FlushConsistentEntry();
            stackTable[hash] = stackNo;

            return stackNo;
        }
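
        // Illustrative note -- not part of the original source: with the writes above, a
        // call-stack record in the log takes roughly the form
        //
        //     n <stackNo> 1 <typeNo> <size> <funcNo_edge> ... <funcNo_0>
        //
        // where each number is rendered by WriteNumber (hex, space-separated), the function
        // numbers appear in reverse array order per the loop above, and the line is
        // terminated by WriteCRLF.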

        private uint MaybeWriteFunction(Buffer! buffer, UIntPtr eip)
        {
            // Given the EIP of a Function, make sure the function has been defined. Since we
            // don't have enough runtime information to map Eips to functions, we must rely on
            // post-processing.
            object o = funcTable[eip];
            if (o != null) {
                return (uint) o;
            }

            uint funcNo = nextFuncNo++;

            // Emit the function record

            buffer.WritePair('f');
            buffer.WriteNumber(funcNo);
            buffer.WriteString("Function_EIP_");
            buffer.WriteNumber((ulong) eip);
            buffer.WriteString("(UNKNOWN ARGUMENTS) 0 0");
            buffer.WriteCRLF();

            // Flush this entry before putting it into the hashtable, where other threads might
            // see it. This means we may have duplicate conflicting entries for the same logical
            // function, but this is tolerated by the profiler.
            buffer.FlushConsistentEntry();
            funcTable[eip] = funcNo;

            return funcNo;
        }

        private uint MaybeWriteType(Buffer! buffer, Type! type)
        {
            // Given a Type, make sure that it has been defined.
            object o = typeTable[type];
            if (o != null) {
                return (uint) o;
            }

            uint typeNo = nextTypeNo++;

            // Emit the type record

            buffer.WritePair('t');
            buffer.WriteNumber((ulong) typeNo);
            buffer.WritePair('0');      // TODO: assume not Finalizable for now
            buffer.WriteString((!)type.FullName);
            buffer.WriteCRLF();

            // Flush this entry before putting it into the hashtable, where other threads might
            // see it. This means we may have duplicate conflicting entries for the same logical
            // type, but this is tolerated by the profiler.
            buffer.FlushConsistentEntry();
            typeTable[type] = typeNo;

            return typeNo;
        }

        // Use the fast 'rdtsc' to decide when to emit a tick value.
        private void MaybeTick(Buffer! buffer)
        {
            ulong nowCycle = Processor.CycleCount;

            // Emit a tick-count record only if it's been a while since the last one.
            if (nowCycle > lastCycle + cycleGranularity) {
                lastCycle = nowCycle;
                Tick(buffer);
            }
        }

        private void Tick(Buffer! buffer)
        {
            // Time since profiling began on this application, in millions of processor cycles.
            buffer.WritePair('i');
            buffer.WriteNumber((ulong) ((lastCycle - firstCycle) / 1000000));
            buffer.WriteCRLF();
        }

        // Write the start of the Log file.
        [MethodImplAttribute(MethodImplOptions.NoInlining)]
        private void WriteHeader(Buffer! buffer)
        {
            // We need a comment record that gives the address of a well-known symbol. This
            // allows us to adjust all the EIPs in the log for the actual load address.
            // Obtain the value before doing anything else, so the only noise in the address
            // comes from the method prolog.
            UIntPtr eip = GetAddressOfWriteHeader();
            MaybeTick(buffer);

            buffer.WritePair('z');
            buffer.WriteNumber((ulong) eip);
            buffer.WriteString("CLRProfiler_WriteHeader");
            buffer.WriteCRLF();

            buffer.FlushConsistentEntry();
        }

        [MethodImplAttribute(MethodImplOptions.NoInlining)]
        private UIntPtr GetAddressOfWriteHeader()
        {
            // Well, this actually includes the prolog of this method, so it is off by a
            // few bytes.
            return Processor.GetCallerEip();
        }
    }
}