From eb22b742839180a0bdb3953c061da15ba822d56d Mon Sep 17 00:00:00 2001
From: Tim Windelschmidt <tim@monogon.tech>
Date: Tue, 12 Sep 2023 15:06:49 +0200
Subject: [PATCH] fix debug builds
---
pkg/sentry/platform/kvm/address_space.go | 3 +
.../platform/kvm/address_space_debug.go | 242 +++++
.../platform/kvm/bluepill_debug_unsafe.go | 215 +++++
pkg/sentry/platform/kvm/bluepill_unsafe.go | 4 +-
pkg/sentry/platform/kvm/machine.go | 3 +
pkg/sentry/platform/kvm/machine_debug.go | 826 ++++++++++++++++++
6 files changed, 1291 insertions(+), 2 deletions(-)
create mode 100644 pkg/sentry/platform/kvm/address_space_debug.go
create mode 100644 pkg/sentry/platform/kvm/bluepill_debug_unsafe.go
create mode 100644 pkg/sentry/platform/kvm/machine_debug.go
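
The fix splits each affected source file on a kvm_debug build tag: the
existing files gain a !kvm_debug constraint, and a parallel *_debug.go copy
carries the kvm_debug constraint, so exactly one variant of each file is
compiled into any given build. Below is a minimal sketch of that build-tag
pattern, using hypothetical file and function names rather than the actual
gVisor sources:

    // file: exit.go
    //go:build !kvm_debug
    // +build !kvm_debug

    package kvm

    // handleExit is the variant compiled into normal (release) builds.
    func handleExit() string { return "release" }

    // file: exit_debug.go
    //go:build kvm_debug
    // +build kvm_debug

    package kvm

    // handleExit is the variant compiled when the kvm_debug tag is set.
    func handleExit() string { return "debug" }

With the standard Go toolchain, a plain "go build" compiles only the
!kvm_debug files, while "go build -tags kvm_debug" swaps in the *_debug.go
variants without changing any callers.
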
diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go
index 79ccbea35..7e30d0365 100644
--- a/pkg/sentry/platform/kvm/address_space.go
+++ b/pkg/sentry/platform/kvm/address_space.go
@@ -12,6 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+//go:build !kvm_debug
+// +build !kvm_debug
+
 package kvm
 
 import (
diff --git a/pkg/sentry/platform/kvm/address_space_debug.go b/pkg/sentry/platform/kvm/address_space_debug.go
new file mode 100644
index 000000000..69aeba45a
--- /dev/null
+++ b/pkg/sentry/platform/kvm/address_space_debug.go
@@ -0,0 +1,242 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build kvm_debug
+// +build kvm_debug
+
+package kvm
+
+import (
+ "gvisor.dev/gvisor/pkg/atomicbitops"
+ "gvisor.dev/gvisor/pkg/hostarch"
+ "gvisor.dev/gvisor/pkg/ring0/pagetables"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sync"
+)
+
+// dirtySet tracks vCPUs for invalidation.
+type dirtySet struct {
+ vCPUMasks []atomicbitops.Uint64
+}
+
+// forEach iterates over all CPUs in the dirty set.
+func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) {
+ for index := range ds.vCPUMasks {
+ mask := ds.vCPUMasks[index].Swap(0)
+ if mask != 0 {
+ for bit := 0; bit < 64; bit++ {
+ if mask&(1<<uint64(bit)) == 0 {
+ continue
+ }
+ id := 64*index + bit
+ fn(m.vCPUsByID[id])
+ }
+ }
+ }
+}
+
+// mark marks the given vCPU as dirty and returns whether it was previously
+// clean. Being previously clean implies that a flush is needed on entry.
+func (ds *dirtySet) mark(c *vCPU) bool {
+ index := uint64(c.id) / 64
+ bit := uint64(1) << uint(c.id%64)
+
+ oldValue := ds.vCPUMasks[index].Load()
+ if oldValue&bit != 0 {
+ return false // Not clean.
+ }
+
+ // Set the bit unilaterally, and ensure that a flush takes place. Note
+ // that it's possible for races to occur here, but since the flush is
+ // taking place long after these lines there's no race in practice.
+ atomicbitops.OrUint64(&ds.vCPUMasks[index], bit)
+ return true // Previously clean.
+}
+
+// addressSpace is a wrapper for PageTables.
+type addressSpace struct {
+ platform.NoAddressSpaceIO
+
+ // mu is the lock for modifications to the address space.
+ //
+ // Note that the page tables themselves are not locked.
+ mu sync.Mutex
+
+ // machine is the underlying machine.
+ machine *machine
+
+ // pageTables are for this particular address space.
+ pageTables *pagetables.PageTables
+
+ // dirtySet is the set of dirty vCPUs.
+ dirtySet *dirtySet
+}
+
+// Invalidate interrupts all dirty contexts.
+func (as *addressSpace) Invalidate() {
+ as.mu.Lock()
+ defer as.mu.Unlock()
+ as.invalidate()
+}
+
+// Touch adds the given vCPU to the dirty list.
+//
+// The return value indicates whether a flush is required.
+func (as *addressSpace) Touch(c *vCPU) bool {
+ return as.dirtySet.mark(c)
+}
+
+type hostMapEntry struct {
+ addr uintptr
+ length uintptr
+}
+
+// mapLocked maps the given host entry.
+//
+// +checkescape:hard,stack
+func (as *addressSpace) mapLocked(addr hostarch.Addr, m hostMapEntry, at hostarch.AccessType) (inv bool) {
+ for m.length > 0 {
+ physical, length, ok := translateToPhysical(m.addr)
+ if !ok {
+ panic("unable to translate segment")
+ }
+ if length > m.length {
+ length = m.length
+ }
+
+ // Ensure that this map has physical mappings. If the page does
+ // not have physical mappings, the KVM module may inject
+ // spurious exceptions when emulation fails (i.e. it tries to
+ // emulate because the RIP is pointed at those pages).
+ as.machine.mapPhysical(physical, length, physicalRegions)
+
+ // Install the page table mappings. Note that the ordering is
+ // important; if the pagetable mappings were installed before
+ // ensuring the physical pages were available, then some other
+ // thread could theoretically access them.
+ inv = as.pageTables.Map(addr, length, pagetables.MapOpts{
+ AccessType: at,
+ User: true,
+ }, physical) || inv
+ m.addr += length
+ m.length -= length
+ addr += hostarch.Addr(length)
+ }
+
+ return inv
+}
+
+// MapFile implements platform.AddressSpace.MapFile.
+func (as *addressSpace) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error {
+ as.mu.Lock()
+ defer as.mu.Unlock()
+
+ // Get mappings in the sentry's address space, which are guaranteed to be
+ // valid as long as a reference is held on the mapped pages (which is in
+ // turn required by AddressSpace.MapFile precondition).
+ //
+ // If precommit is true, we will touch mappings to commit them, so ensure
+ // that mappings are readable from sentry context.
+ //
+ // We don't execute from application file-mapped memory, and guest page
+ // tables don't care if we have execute permission (but they do need pages
+ // to be readable).
+ bs, err := f.MapInternal(fr, hostarch.AccessType{
+ Read: at.Read || at.Execute || precommit,
+ Write: at.Write,
+ })
+ if err != nil {
+ return err
+ }
+
+ // See block in mapLocked.
+ as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
+ defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)
+
+ // Map the mappings in the sentry's address space (guest physical memory)
+ // into the application's address space (guest virtual memory).
+ inv := false
+ for !bs.IsEmpty() {
+ b := bs.Head()
+ bs = bs.Tail()
+ // Since fr was page-aligned, b should also be page-aligned. We do the
+ // lookup in our host page tables for this translation.
+ if precommit {
+ s := b.ToSlice()
+ for i := 0; i < len(s); i += hostarch.PageSize {
+ _ = s[i] // Touch to commit.
+ }
+ }
+
+ // See bluepill_allocator.go.
+ bluepill(as.pageTables.Allocator.(*allocator).cpu)
+
+ // Perform the mapping.
+ prev := as.mapLocked(addr, hostMapEntry{
+ addr: b.Addr(),
+ length: uintptr(b.Len()),
+ }, at)
+ inv = inv || prev
+ addr += hostarch.Addr(b.Len())
+ }
+ if inv {
+ as.invalidate()
+ }
+
+ return nil
+}
+
+// unmapLocked is an escape-checked wrapper around Unmap.
+//
+// +checkescape:hard,stack
+func (as *addressSpace) unmapLocked(addr hostarch.Addr, length uint64) bool {
+ return as.pageTables.Unmap(addr, uintptr(length))
+}
+
+// Unmap unmaps the given range by calling pagetables.PageTables.Unmap.
+func (as *addressSpace) Unmap(addr hostarch.Addr, length uint64) {
+ as.mu.Lock()
+ defer as.mu.Unlock()
+
+ // See above & bluepill_allocator.go.
+ as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
+ defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)
+ bluepill(as.pageTables.Allocator.(*allocator).cpu)
+
+ if prev := as.unmapLocked(addr, length); prev {
+ // Invalidate all active vCPUs.
+ as.invalidate()
+
+ // Recycle any freed intermediate pages.
+ as.pageTables.Allocator.Recycle()
+ }
+}
+
+// Release releases the page tables.
+func (as *addressSpace) Release() {
+ as.Unmap(0, ^uint64(0))
+
+ // Free all pages from the allocator.
+ as.pageTables.Allocator.(*allocator).base.Drain()
+
+ // Drop all cached machine references.
+ as.machine.dropPageTables(as.pageTables)
+}
+
+// PreFork implements platform.AddressSpace.PreFork.
+func (as *addressSpace) PreFork() {}
+
+// PostFork implements platform.AddressSpace.PostFork.
+func (as *addressSpace) PostFork() {}
diff --git a/pkg/sentry/platform/kvm/bluepill_debug_unsafe.go b/pkg/sentry/platform/kvm/bluepill_debug_unsafe.go
new file mode 100644
index 000000000..5feb45c19
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_debug_unsafe.go
@@ -0,0 +1,215 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build go1.18 && kvm_debug
+// +build go1.18,kvm_debug
+
+// //go:linkname directives type-checked by checklinkname. Any other
+// non-linkname assumptions outside the Go 1 compatibility guarantee should
+// have an accompanying vet check or version guard build tag.
+
+package kvm
+
+import (
+ "unsafe"
+
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+)
+
+//go:linkname throw runtime.throw
+func throw(s string)
+
+// vCPUPtr returns a CPU for the given address.
+func vCPUPtr(addr uintptr) *vCPU {
+ return (*vCPU)(unsafe.Pointer(addr))
+}
+
+// bytePtr returns a bytePtr for the given address.
+func bytePtr(addr uintptr) *byte {
+ return (*byte)(unsafe.Pointer(addr))
+}
+
+// uintptrValue returns a uintptr for the given address.
+func uintptrValue(addr *byte) uintptr {
+ return (uintptr)(unsafe.Pointer(addr))
+}
+
+// bluepillArchContext returns the UContext64.
+func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
+ return &((*arch.UContext64)(context).MContext)
+}
+
+// bluepillGuestExit is responsible for handling a VM-exit.
+func bluepillGuestExit(c *vCPU, context unsafe.Pointer) {
+ // Increment our counter.
+ c.guestExits.Add(1)
+
+ // Copy out registers.
+ bluepillArchExit(c, bluepillArchContext(context))
+
+ // Return to the vCPUReady state; notify any waiters.
+ user := c.state.Load() & vCPUUser
+ switch c.state.Swap(user) {
+ case user | vCPUGuest: // Expected case.
+ case user | vCPUGuest | vCPUWaiter:
+ c.notify()
+ default:
+ throw("invalid state")
+ }
+}
+
+var hexSyms = []byte("0123456789abcdef")
+
+func printHex(title []byte, val uint64) {
+ var str [18]byte
+ for i := 0; i < 16; i++ {
+ str[16-i] = hexSyms[val&0xf]
+ val = val >> 4
+ }
+ str[0] = ' '
+ str[17] = '\n'
+ unix.RawSyscall(unix.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&title[0])), uintptr(len(title)))
+ unix.RawSyscall(unix.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&str)), 18)
+}
+
+// bluepillHandler is called from the signal stub.
+//
+// The world may be stopped while this is executing, and it executes on the
+// signal stack. It should only execute raw system calls and functions that are
+// explicitly marked go:nosplit.
+//
+// Ideally, this function should switch to gsignal, as runtime.sigtramp does,
+// but that is tedious given all the runtime internals. That said, using
+// gsignal inside a signal handler is not _required_, provided we avoid stack
+// splits and allocations. Note that calling any splittable function here will
+// be flaky; if the signal stack is below the G stack then we will trigger a
+// split and crash. If above, we won't trigger a split.
+//
+// +checkescape:all
+func bluepillHandler(context unsafe.Pointer) {
+ // Sanitize the registers; interrupts must always be disabled.
+ c := bluepillArchEnter(bluepillArchContext(context))
+
+ // Mark this as guest mode.
+ switch c.state.Swap(vCPUGuest | vCPUUser) {
+ case vCPUUser: // Expected case.
+ case vCPUUser | vCPUWaiter:
+ c.notify()
+ default:
+ throw("invalid state")
+ }
+
+ for {
+ hostExitCounter.Increment()
+ _, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0) // escapes: no.
+ switch errno {
+ case 0: // Expected case.
+ case unix.EINTR:
+ interruptCounter.Increment()
+ // First, we process whatever pending signal
+ // interrupted KVM. Since we're in a signal handler
+ // currently, all signals are masked and the signal
+ // must have been delivered directly to this thread.
+ timeout := unix.Timespec{}
+ sig, _, errno := unix.RawSyscall6( // escapes: no.
+ unix.SYS_RT_SIGTIMEDWAIT,
+ uintptr(unsafe.Pointer(&bounceSignalMask)),
+ 0, // siginfo.
+ uintptr(unsafe.Pointer(&timeout)), // timeout.
+ 8, // sigset size.
+ 0, 0)
+ if errno == unix.EAGAIN {
+ continue
+ }
+ if errno != 0 {
+ throw("error waiting for pending signal")
+ }
+ if sig != uintptr(bounceSignal) {
+ throw("unexpected signal")
+ }
+
+ // Check whether the current state of the vCPU is ready
+ // for interrupt injection. Because we don't have a
+ // PIC, we can't inject an interrupt while they are
+ // masked. We need to request a window if it's not
+ // ready.
+ if bluepillReadyStopGuest(c) {
+ // Force injection below; the vCPU is ready.
+ c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN
+ } else {
+ c.runData.requestInterruptWindow = 1
+ continue // Rerun vCPU.
+ }
+ case unix.EFAULT:
+ // If a fault is not serviceable due to the host
+ // backing pages having insufficient permissions, instead of an
+ // MMIO exit we receive EFAULT from the run ioctl. We
+ // always inject an NMI here since we may be in kernel
+ // mode and have interrupts disabled.
+ bluepillSigBus(c)
+ continue // Rerun vCPU.
+ case unix.ENOSYS:
+ bluepillHandleEnosys(c)
+ continue
+ default:
+ throw("run failed")
+ }
+
+ switch c.runData.exitReason {
+ case _KVM_EXIT_EXCEPTION:
+ c.die(bluepillArchContext(context), "exception")
+ return
+ case _KVM_EXIT_IO:
+ c.die(bluepillArchContext(context), "I/O")
+ return
+ case _KVM_EXIT_INTERNAL_ERROR:
+ // An internal error is typically thrown when emulation
+ // fails. This can occur via the MMIO path below (and
+ // it might fail because we have multiple regions that
+ // are not mapped). We would actually prefer that no
+ // emulation occur, and don't mind at all if it fails.
+ case _KVM_EXIT_HYPERCALL:
+ c.die(bluepillArchContext(context), "hypercall")
+ return
+ case _KVM_EXIT_DEBUG:
+ c.die(bluepillArchContext(context), "debug")
+ return
+ case _KVM_EXIT_HLT:
+ c.hltSanityCheck()
+ bluepillGuestExit(c, context)
+ return
+ case _KVM_EXIT_MMIO:
+ physical := uintptr(c.runData.data[0])
+ if getHypercallID(physical) == _KVM_HYPERCALL_VMEXIT {
+ bluepillGuestExit(c, context)
+ return
+ }
+
+ c.die(bluepillArchContext(context), "exit_mmio")
+ return
+ case _KVM_EXIT_IRQ_WINDOW_OPEN:
+ bluepillStopGuest(c)
+ case _KVM_EXIT_SHUTDOWN:
+ c.die(bluepillArchContext(context), "shutdown")
+ return
+ case _KVM_EXIT_FAIL_ENTRY:
+ c.die(bluepillArchContext(context), "entry failed")
+ return
+ default:
+ bluepillArchHandleExit(c, context)
+ return
+ }
+ }
+}
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index 81bd9f814..ad8b966e7 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-//go:build go1.18
-// +build go1.18
+//go:build go1.18 && !kvm_debug
+// +build go1.18,!kvm_debug
 
 // //go:linkname directives type-checked by checklinkname. Any other
 // non-linkname assumptions outside the Go 1 compatibility guarantee should
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index f39bf1f06..4f0264db7 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -12,6 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+//go:build !kvm_debug
+// +build !kvm_debug
+
 package kvm
 
 import (
diff --git a/pkg/sentry/platform/kvm/machine_debug.go b/pkg/sentry/platform/kvm/machine_debug.go
new file mode 100644
index 000000000..0a4735d2d
--- /dev/null
+++ b/pkg/sentry/platform/kvm/machine_debug.go
@@ -0,0 +1,826 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build kvm_debug
+// +build kvm_debug
+
+package kvm
+
+import (
+ "fmt"
+ "runtime"
+ gosync "sync"
+ "sync/atomic"
+ "time"
+
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/atomicbitops"
+ "gvisor.dev/gvisor/pkg/hostarch"
+ "gvisor.dev/gvisor/pkg/hosttid"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/metric"
+ "gvisor.dev/gvisor/pkg/ring0"
+ "gvisor.dev/gvisor/pkg/ring0/pagetables"
+ "gvisor.dev/gvisor/pkg/seccomp"
+ ktime "gvisor.dev/gvisor/pkg/sentry/time"
+ "gvisor.dev/gvisor/pkg/sighandling"
+ "gvisor.dev/gvisor/pkg/sync"
+)
+
+// machine contains state associated with the VM as a whole.
+type machine struct {
+ // fd is the vm fd.
+ fd int
+
+ // machinePoolIndex is the index in the machinePool array.
+ machinePoolIndex uint32
+
+ // nextSlot is the next slot for setMemoryRegion.
+ //
+ // If nextSlot is ^uint32(0), then slots are currently being updated, and the
+ // caller should retry.
+ nextSlot atomicbitops.Uint32
+
+ // upperSharedPageTables tracks the read-only shared upper of all the pagetables.
+ upperSharedPageTables *pagetables.PageTables
+
+ // kernel is the set of global structures.
+ kernel ring0.Kernel
+
+ // mu protects vCPUs.
+ mu sync.RWMutex
+
+ // available is notified when vCPUs are available.
+ available sync.Cond
+
+ // vCPUsByTID are the machine vCPUs.
+ //
+ // These are populated dynamically.
+ vCPUsByTID map[uint64]*vCPU
+
+ // vCPUsByID are the machine vCPUs, can be indexed by the vCPU's ID.
+ vCPUsByID []*vCPU
+
+ // usedVCPUs is the number of vCPUs that have been used from the
+ // vCPUsByID pool.
+ usedVCPUs int
+
+ // maxVCPUs is the maximum number of vCPUs supported by the machine.
+ maxVCPUs int
+
+ // maxSlots is the maximum number of memory slots supported by the machine.
+ maxSlots int
+
+ // tscControl checks whether cpu supports TSC scaling
+ tscControl bool
+
+ // usedSlots is the set of used physical addresses (not sorted).
+ usedSlots []uintptr
+}
+
+const (
+ // vCPUReady is an alias for all the below clear.
+ vCPUReady uint32 = 0
+
+ // vCPUUser indicates that the vCPU is in or about to enter user mode.
+ vCPUUser uint32 = 1 << 0
+
+ // vCPUGuest indicates the vCPU is in guest mode.
+ vCPUGuest uint32 = 1 << 1
+
+ // vCPUWaiter indicates that there is a waiter.
+ //
+ // If this is set, then notify must be called on any state transitions.
+ vCPUWaiter uint32 = 1 << 2
+)
+
+// Field values for the get_vcpu metric acquisition path used.
+var (
+ getVCPUAcquisitionFastReused = metric.FieldValue{"fast_reused"}
+ getVCPUAcquisitionReused = metric.FieldValue{"reused"}
+ getVCPUAcquisitionUnused = metric.FieldValue{"unused"}
+ getVCPUAcquisitionStolen = metric.FieldValue{"stolen"}
+)
+
+var (
+ // hostExitCounter is a metric that tracks how many times the sentry
+ // performed a host to guest world switch.
+ hostExitCounter = metric.MustCreateNewProfilingUint64Metric(
+ "/kvm/host_exits", false, "The number of times the sentry performed a host to guest world switch.")
+
+ // userExitCounter is a metric that tracks how many times the sentry has
+ // had an exit from userspace. Analogous to vCPU.userExits.
+ userExitCounter = metric.MustCreateNewProfilingUint64Metric(
+ "/kvm/user_exits", false, "The number of times the sentry has had an exit from userspace.")
+
+ // interruptCounter is a metric that tracks how many times execution returned
+ // to the KVM host to handle a pending signal.
+ interruptCounter = metric.MustCreateNewProfilingUint64Metric(
+ "/kvm/interrupts", false, "The number of times the signal handler was invoked.")
+
+ // mmapCallCounter is a metric that tracks how many times the function
+ // seccompMmapSyscall has been called.
+ mmapCallCounter = metric.MustCreateNewProfilingUint64Metric(
+ "/kvm/mmap_calls", false, "The number of times seccompMmapSyscall has been called.")
+
+ // getVCPUCounter is a metric that tracks how many times different paths of
+ // machine.Get() are triggered.
+ getVCPUCounter = metric.MustCreateNewProfilingUint64Metric(
+ "/kvm/get_vcpu", false, "The number of times that machine.Get() was called, split by path the function took.",
+ metric.NewField("acquisition_type", &getVCPUAcquisitionFastReused, &getVCPUAcquisitionReused, &getVCPUAcquisitionUnused, &getVCPUAcquisitionStolen))
+
+ // asInvalidateDuration are durations of calling addressSpace.invalidate().
+ asInvalidateDuration = metric.MustCreateNewProfilingTimerMetric("/kvm/address_space_invalidate",
+ metric.NewExponentialBucketer(15, uint64(time.Nanosecond*100), 1, 2),
+ "Duration of calling addressSpace.invalidate().")
+)
+
+// vCPU is a single KVM vCPU.
+type vCPU struct {
+ // CPU is the kernel CPU data.
+ //
+ // This must be the first element of this structure, it is referenced
+ // by the bluepill code (see bluepill_amd64.s).
+ ring0.CPU
+
+ // id is the vCPU id.
+ id int
+
+ // fd is the vCPU fd.
+ fd int
+
+ // tid is the last set tid.
+ tid atomicbitops.Uint64
+
+ // userExits is the count of user exits.
+ userExits atomicbitops.Uint64
+
+ // guestExits is the count of guest to host world switches.
+ guestExits atomicbitops.Uint64
+
+ // faults is a count of world faults (informational only).
+ faults uint32
+
+ // state is the vCPU state.
+ //
+ // This is a bitmask of the three fields (vCPU*) described above.
+ state atomicbitops.Uint32
+
+ // runData for this vCPU.
+ runData *runData
+
+ // machine associated with this vCPU.
+ machine *machine
+
+ // active is the current addressSpace: this is set and read atomically,
+ // it is used to elide unnecessary interrupts due to invalidations.
+ active atomicAddressSpace
+
+ // vCPUArchState is the architecture-specific state.
+ vCPUArchState
+
+ // dieState holds state related to vCPU death.
+ dieState dieState
+}
+
+type dieState struct {
+ // message is thrown from die.
+ message string
+
+ // guestRegs is used to store register state during vCPU.die() to prevent
+ // allocation inside nosplit function.
+ guestRegs userRegs
+}
+
+// createVCPU creates and returns a new vCPU.
+//
+// Precondition: mu must be held.
+func (m *machine) createVCPU(id int) *vCPU {
+ // Create the vCPU.
+ fd, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id))
+ if errno != 0 {
+ panic(fmt.Sprintf("error creating new vCPU: %v", errno))
+ }
+
+ c := &vCPU{
+ id: id,
+ fd: int(fd),
+ machine: m,
+ }
+ c.CPU.Init(&m.kernel, c.id, c)
+ m.vCPUsByID[c.id] = c
+
+ // Ensure the signal mask is correct.
+ if err := c.setSignalMask(); err != nil {
+ panic(fmt.Sprintf("error setting signal mask: %v", err))
+ }
+
+ // Map the run data.
+ runData, err := mapRunData(int(fd))
+ if err != nil {
+ panic(fmt.Sprintf("error mapping run data: %v", err))
+ }
+ c.runData = runData
+
+ // Initialize architecture state.
+ if err := c.initArchState(); err != nil {
+ panic(fmt.Sprintf("error initialization vCPU state: %v", err))
+ }
+
+ return c // Done.
+}
+
+// newMachine returns a new VM context.
+func newMachine(vm int) (*machine, error) {
+ // Create the machine.
+ m := &machine{fd: vm}
+ m.available.L = &m.mu
+
+ // Pull the maximum vCPUs.
+ m.getMaxVCPU()
+ log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
+ m.vCPUsByTID = make(map[uint64]*vCPU)
+ m.vCPUsByID = make([]*vCPU, m.maxVCPUs)
+ m.kernel.Init(m.maxVCPUs)
+
+ // Pull the maximum slots.
+ maxSlots, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_MEMSLOTS)
+ if errno != 0 {
+ m.maxSlots = _KVM_NR_MEMSLOTS
+ } else {
+ m.maxSlots = int(maxSlots)
+ }
+ log.Debugf("The maximum number of slots is %d.", m.maxSlots)
+ m.usedSlots = make([]uintptr, m.maxSlots)
+
+ // Check TSC Scaling
+ hasTSCControl, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_TSC_CONTROL)
+ m.tscControl = errno == 0 && hasTSCControl == 1
+ log.Debugf("TSC scaling support: %t.", m.tscControl)
+
+ // Create the upper shared pagetables and kernel(sentry) pagetables.
+ m.upperSharedPageTables = pagetables.New(newAllocator())
+ m.mapUpperHalf(m.upperSharedPageTables)
+ m.upperSharedPageTables.Allocator.(*allocator).base.Drain()
+ m.upperSharedPageTables.MarkReadOnlyShared()
+ m.kernel.PageTables = pagetables.NewWithUpper(newAllocator(), m.upperSharedPageTables, ring0.KernelStartAddress)
+
+ // Install seccomp rules to trap runtime mmap system calls. They will
+ // be handled by seccompMmapHandler.
+ seccompMmapRules(m)
+
+ // Apply the physical mappings. Note that these mappings may point to
+ // guest physical addresses that are not actually available. These
+ // physical pages are mapped on demand, see kernel_unsafe.go.
+ applyPhysicalRegions(func(pr physicalRegion) bool {
+ // Map everything in the lower half.
+ m.kernel.PageTables.Map(
+ hostarch.Addr(pr.virtual),
+ pr.length,
+ pagetables.MapOpts{AccessType: hostarch.ReadWrite},
+ pr.physical)
+
+ return true // Keep iterating.
+ })
+
+ // Ensure that the currently mapped virtual regions are actually
+ // available in the VM. Note that this doesn't guarantee no future
+ // faults, however it should guarantee that everything is available to
+ // ensure successful vCPU entry.
+ mapRegion := func(vr virtualRegion, flags uint32) {
+ for virtual := vr.virtual; virtual < vr.virtual+vr.length; {
+ physical, length, ok := translateToPhysical(virtual)
+ if !ok {
+ // This must be an invalid region that was
+ // knocked out by creation of the physical map.
+ return
+ }
+ if virtual+length > vr.virtual+vr.length {
+ // Cap the length to the end of the area.
+ length = vr.virtual + vr.length - virtual
+ }
+ // Update page tables for executable mappings.
+ if vr.accessType.Execute {
+ if vr.accessType.Write {
+ panic(fmt.Sprintf("executable mapping can't be writable: %#v", vr))
+ }
+ m.kernel.PageTables.Map(
+ hostarch.Addr(virtual),
+ length,
+ pagetables.MapOpts{AccessType: vr.accessType},
+ physical)
+ }
+
+ // Ensure the physical range is mapped.
+ m.mapPhysical(physical, length, physicalRegions)
+ virtual += length
+ }
+ }
+
+ // handleBluepillFault takes the slot spinlock and it is called from
+ // seccompMmapHandler, so here we have to guarantee that mmap is not
+ // called while we hold the slot spinlock.
+ disableAsyncPreemption()
+ applyVirtualRegions(func(vr virtualRegion) {
+ if excludeVirtualRegion(vr) {
+ return // skip region.
+ }
+ // Take into account that the stack can grow down.
+ if vr.filename == "[stack]" {
+ vr.virtual -= 1 << 20
+ vr.length += 1 << 20
+ }
+
+ mapRegion(vr, 0)
+
+ })
+ enableAsyncPreemption()
+
+ // Initialize architecture state.
+ if err := m.initArchState(); err != nil {
+ m.Destroy()
+ return nil, err
+ }
+
+ // Ensure the machine is cleaned up properly.
+ runtime.SetFinalizer(m, (*machine).Destroy)
+ return m, nil
+}
+
+// hasSlot returns true if the given address is mapped.
+//
+// This must be done via a linear scan.
+//
+//go:nosplit
+func (m *machine) hasSlot(physical uintptr) bool {
+ slotLen := int(m.nextSlot.Load())
+ // When slots are being updated, nextSlot is ^uint32(0). As this situation
+ // is less likely to happen, we just set the slotLen to m.maxSlots, and scan
+ // the whole usedSlots array.
+ if slotLen == int(^uint32(0)) {
+ slotLen = m.maxSlots
+ }
+ for i := 0; i < slotLen; i++ {
+ if p := atomic.LoadUintptr(&m.usedSlots[i]); p == physical {
+ return true
+ }
+ }
+ return false
+}
+
+// mapPhysical checks for the mapping of a physical range, and installs one if
+// not available. This attempts to be efficient for calls in the hot path.
+//
+// This throws on error.
+func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalRegion) {
+ for end := physical + length; physical < end; {
+ _, physicalStart, length, pr := calculateBluepillFault(physical, phyRegions)
+ if pr == nil {
+ // Should never happen.
+ throw("mapPhysical on unknown physical address")
+ }
+
+ // Is this already mapped? Check the usedSlots.
+ if !m.hasSlot(physicalStart) {
+ if _, ok := handleBluepillFault(m, physical, phyRegions); !ok {
+ throw("handleBluepillFault failed")
+ }
+ }
+
+ // Move to the next chunk.
+ physical = physicalStart + length
+ }
+}
+
+// Destroy frees associated resources.
+//
+// Destroy should only be called once all active users of the machine are gone.
+// The machine object should not be used after calling Destroy.
+//
+// Precondition: all vCPUs must be returned to the machine.
+func (m *machine) Destroy() {
+ runtime.SetFinalizer(m, nil)
+
+ // Destroy vCPUs.
+ for _, c := range m.vCPUsByID {
+ if c == nil {
+ continue
+ }
+
+ // Ensure the vCPU is not still running in guest mode. This is
+ // possible iff teardown has been done by other threads, and
+ // somehow a single thread has not executed any system calls.
+ c.BounceToHost()
+
+ // Note that the runData may not be mapped if an error occurs
+ // during the middle of initialization.
+ if c.runData != nil {
+ if err := unmapRunData(c.runData); err != nil {
+ panic(fmt.Sprintf("error unmapping rundata: %v", err))
+ }
+ }
+ if err := unix.Close(int(c.fd)); err != nil {
+ panic(fmt.Sprintf("error closing vCPU fd: %v", err))
+ }
+ }
+
+ machinePool[m.machinePoolIndex].Store(nil)
+ seccompMmapSync()
+
+ // vCPUs are gone: teardown machine state.
+ if err := unix.Close(m.fd); err != nil {
+ panic(fmt.Sprintf("error closing VM fd: %v", err))
+ }
+}
+
+// Get gets an available vCPU.
+//
+// This will return with the OS thread locked.
+//
+// It is guaranteed that if any OS thread TID is in guest, m.vCPUs[TID] points
+// to the vCPU in which the OS thread TID is running. So if Get() returns with
+// the current context in guest mode, that context must be running on the
+// vCPU that Get() returns.
+func (m *machine) Get() *vCPU {
+ m.mu.RLock()
+ runtime.LockOSThread()
+ tid := hosttid.Current()
+
+ // Check for an exact match.
+ if c := m.vCPUsByTID[tid]; c != nil {
+ c.lock()
+ m.mu.RUnlock()
+ getVCPUCounter.Increment(&getVCPUAcquisitionFastReused)
+ return c
+ }
+
+ // The happy path failed. We now proceed to acquire an exclusive lock
+ // (because the vCPU map may change), and scan all available vCPUs.
+ // In this case, we first unlock the OS thread. Otherwise, if mu is
+ // not available, the current system thread will be parked and a new
+ // system thread spawned. We avoid this situation by simply refreshing
+ // tid after relocking the system thread.
+ m.mu.RUnlock()
+ runtime.UnlockOSThread()
+ m.mu.Lock()
+ runtime.LockOSThread()
+ tid = hosttid.Current()
+
+ // Recheck for an exact match.
+ if c := m.vCPUsByTID[tid]; c != nil {
+ c.lock()
+ m.mu.Unlock()
+ getVCPUCounter.Increment(&getVCPUAcquisitionReused)
+ return c
+ }
+
+ for {
+ // Get vCPU from the m.vCPUsByID pool.
+ if m.usedVCPUs < m.maxVCPUs {
+ c := m.vCPUsByID[m.usedVCPUs]
+ m.usedVCPUs++
+ c.lock()
+ m.vCPUsByTID[tid] = c
+ m.mu.Unlock()
+ c.loadSegments(tid)
+ getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
+ return c
+ }
+
+ // Scan for an available vCPU.
+ for origTID, c := range m.vCPUsByTID {
+ if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
+ delete(m.vCPUsByTID, origTID)
+ m.vCPUsByTID[tid] = c
+ m.mu.Unlock()
+ c.loadSegments(tid)
+ getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
+ return c
+ }
+ }
+
+ // Scan for something not in user mode.
+ for origTID, c := range m.vCPUsByTID {
+ if !c.state.CompareAndSwap(vCPUGuest, vCPUGuest|vCPUWaiter) {
+ continue
+ }
+
+ // The vCPU is not able to transition to
+ // vCPUGuest|vCPUWaiter or to vCPUUser because that
+ // transition requires holding the machine mutex, as we
+ // do now. There is no path to register a waiter on
+ // just the vCPUReady state.
+ for {
+ c.waitUntilNot(vCPUGuest | vCPUWaiter)
+ if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
+ break
+ }
+ }
+
+ // Steal the vCPU.
+ delete(m.vCPUsByTID, origTID)
+ m.vCPUsByTID[tid] = c
+ m.mu.Unlock()
+ c.loadSegments(tid)
+ getVCPUCounter.Increment(&getVCPUAcquisitionStolen)
+ return c
+ }
+
+ // Everything is executing in user mode. Wait until something
+ // is available. Note that signaling the condition variable
+ // will have the extra effect of kicking the vCPUs out of guest
+ // mode if that's where they were.
+ m.available.Wait()
+ }
+}
+
+// Put puts the current vCPU.
+func (m *machine) Put(c *vCPU) {
+ c.unlock()
+ runtime.UnlockOSThread()
+
+ m.mu.RLock()
+ m.available.Signal()
+ m.mu.RUnlock()
+}
+
+// newDirtySet returns a new dirty set.
+func (m *machine) newDirtySet() *dirtySet {
+ return &dirtySet{
+ vCPUMasks: make([]atomicbitops.Uint64,
+ (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64),
+ }
+}
+
+// dropPageTables drops cached page table entries.
+func (m *machine) dropPageTables(pt *pagetables.PageTables) {
+ m.mu.Lock()
+ defer m.mu.Unlock()
+
+ // Clear from all PCIDs.
+ for _, c := range m.vCPUsByID {
+ if c != nil && c.PCIDs != nil {
+ c.PCIDs.Drop(pt)
+ }
+ }
+}
+
+// lock marks the vCPU as in user mode.
+//
+// This should only be called directly when known to be safe, i.e. when
+// the vCPU is owned by the current TID with no chance of theft.
+//
+//go:nosplit
+func (c *vCPU) lock() {
+ atomicbitops.OrUint32(&c.state, vCPUUser)
+}
+
+// unlock clears the vCPUUser bit.
+//
+//go:nosplit
+func (c *vCPU) unlock() {
+ origState := atomicbitops.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest)
+ if origState == vCPUUser|vCPUGuest {
+ // Happy path: no exits are forced, and we can continue
+ // executing on our merry way with a single atomic access.
+ return
+ }
+
+ // Clear the lock.
+ for {
+ state := atomicbitops.CompareAndSwapUint32(&c.state, origState, origState&^vCPUUser)
+ if state == origState {
+ break
+ }
+ origState = state
+ }
+ switch origState {
+ case vCPUUser:
+ // Normal state.
+ case vCPUUser | vCPUGuest | vCPUWaiter:
+ // Force a transition: this must trigger a notification when we
+ // return from guest mode. We must clear vCPUWaiter here
+ // anyway, because BounceToKernel will force a transition only
+ // from ring3 to ring0, which will not clear this bit. Halt may
+ // work around the issue, but if there is no exception or
+ // syscall in this period, BounceToKernel will hang.
+ atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
+ c.notify()
+ case vCPUUser | vCPUWaiter:
+ // Waiting for the lock to be released; the responsibility is
+ // on us to notify the waiter and clear the associated bit.
+ atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
+ c.notify()
+ default:
+ panic("invalid state")
+ }
+}
+
+// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
+//
+//go:nosplit
+func (c *vCPU) NotifyInterrupt() {
+ c.BounceToKernel()
+}
+
+// pid is used below in bounce.
+var pid = unix.Getpid()
+
+// bounce forces a return to the kernel or to host mode.
+//
+// This effectively unwinds the state machine.
+func (c *vCPU) bounce(forceGuestExit bool) {
+ origGuestExits := c.guestExits.Load()
+ origUserExits := c.userExits.Load()
+ for {
+ switch state := c.state.Load(); state {
+ case vCPUReady, vCPUWaiter:
+ // There is nothing to be done, we're already in the
+ // kernel pre-acquisition. The Bounce criteria have
+ // been satisfied.
+ return
+ case vCPUUser:
+ // We need to register a waiter for the actual guest
+ // transition. When the transition takes place, then we
+ // can inject an interrupt to ensure a return to host
+ // mode.
+ c.state.CompareAndSwap(state, state|vCPUWaiter)
+ case vCPUUser | vCPUWaiter:
+ // Wait for the transition to guest mode. This should
+ // come from the bluepill handler.
+ c.waitUntilNot(state)
+ case vCPUGuest, vCPUUser | vCPUGuest:
+ if state == vCPUGuest && !forceGuestExit {
+ // The vCPU is already not acquired, so there's
+ // no need to do a fresh injection here.
+ return
+ }
+ // The vCPU is in user or kernel mode. Attempt to
+ // register a notification on change.
+ if !c.state.CompareAndSwap(state, state|vCPUWaiter) {
+ break // Retry.
+ }
+ for {
+ // We need to spin here until the signal is
+ // delivered, because Tgkill can return EAGAIN
+ // under memory pressure. Since we already
+ // marked ourselves as a waiter, we need to
+ // ensure that a signal is actually delivered.
+ if err := unix.Tgkill(pid, int(c.tid.Load()), bounceSignal); err == nil {
+ break
+ } else if err.(unix.Errno) == unix.EAGAIN {
+ continue
+ } else {
+ // Nothing else should be returned by tgkill.
+ panic(fmt.Sprintf("unexpected tgkill error: %v", err))
+ }
+ }
+ case vCPUGuest | vCPUWaiter, vCPUUser | vCPUGuest | vCPUWaiter:
+ if state == vCPUGuest|vCPUWaiter && !forceGuestExit {
+ // See above.
+ return
+ }
+ // Wait for the transition. This again should happen
+ // from the bluepill handler, but on the way out.
+ c.waitUntilNot(state)
+ default:
+ // Should not happen: the above is exhaustive.
+ panic("invalid state")
+ }
+
+ // Check if we've missed the state transition, but
+ // we can safely return at this point in time.
+ newGuestExits := c.guestExits.Load()
+ newUserExits := c.userExits.Load()
+ if newUserExits != origUserExits && (!forceGuestExit || newGuestExits != origGuestExits) {
+ return
+ }
+ }
+}
+
+// BounceToKernel ensures that the vCPU bounces back to the kernel.
+//
+//go:nosplit
+func (c *vCPU) BounceToKernel() {
+ c.bounce(false)
+}
+
+// BounceToHost ensures that the vCPU is in host mode.
+//
+//go:nosplit
+func (c *vCPU) BounceToHost() {
+ c.bounce(true)
+}
+
+// setSystemTimeLegacy calibrates and sets an approximate system time.
+func (c *vCPU) setSystemTimeLegacy() error {
+ const minIterations = 10
+ minimum := uint64(0)
+ for iter := 0; ; iter++ {
+ // Try to set the TSC to an estimate of where it will be
+ // on the host during a "fast" system call iteration.
+ start := uint64(ktime.Rdtsc())
+ if err := c.setTSC(start + (minimum / 2)); err != nil {
+ return err
+ }
+ // See if this is our new minimum call time. Note that this
+ // serves two functions: one, we make sure that we are
+ // accurately predicting the offset we need to set. Second, we
+ // don't want to do the final set on a slow call, which could
+ // produce a really bad result.
+ end := uint64(ktime.Rdtsc())
+ if end < start {
+ continue // Totally bogus: unstable TSC?
+ }
+ current := end - start
+ if current < minimum || iter == 0 {
+ minimum = current // Set our new minimum.
+ }
+ // Is this past minIterations and within ~10% of minimum?
+ upperThreshold := (((minimum << 3) + minimum) >> 3)
+ if iter >= minIterations && current <= upperThreshold {
+ return nil
+ }
+ }
+}
+
+const machinePoolSize = 16
+
+// machinePool is enumerated from the seccompMmapHandler signal handler
+var (
+ machinePool [machinePoolSize]machineAtomicPtr
+ machinePoolLen atomicbitops.Uint32
+ machinePoolMu sync.Mutex
+ seccompMmapRulesOnce gosync.Once
+)
+
+func sigsysHandler()
+func addrOfSigsysHandler() uintptr
+
+// seccompMmapRules adds seccomp rules to trap mmap system calls that will be
+// handled in seccompMmapHandler.
+func seccompMmapRules(m *machine) {
+ seccompMmapRulesOnce.Do(func() {
+ // Install the handler.
+ if err := sighandling.ReplaceSignalHandler(unix.SIGSYS, addrOfSigsysHandler(), &savedSigsysHandler); err != nil {
+ panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err))
+ }
+ rules := []seccomp.RuleSet{}
+ rules = append(rules, []seccomp.RuleSet{
+ // Trap mmap system calls and handle them in sigsysGoHandler
+ {
+ Rules: seccomp.SyscallRules{
+ unix.SYS_MMAP: {
+ {
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MaskedEqual(unix.PROT_EXEC, 0),
+ /* MAP_DENYWRITE is ignored and used only for filtering. */
+ seccomp.MaskedEqual(unix.MAP_DENYWRITE, 0),
+ },
+ },
+ },
+ Action: linux.SECCOMP_RET_TRAP,
+ },
+ }...)
+ instrs, err := seccomp.BuildProgram(rules, linux.SECCOMP_RET_ALLOW, linux.SECCOMP_RET_ALLOW)
+ if err != nil {
+ panic(fmt.Sprintf("failed to build rules: %v", err))
+ }
+ // Perform the actual installation.
+ if err := seccomp.SetFilter(instrs); err != nil {
+ panic(fmt.Sprintf("failed to set filter: %v", err))
+ }
+ })
+
+ machinePoolMu.Lock()
+ n := machinePoolLen.Load()
+ i := uint32(0)
+ for ; i < n; i++ {
+ if machinePool[i].Load() == nil {
+ break
+ }
+ }
+ if i == n {
+ if i == machinePoolSize {
+ machinePoolMu.Unlock()
+ panic("machinePool is full")
+ }
+ machinePoolLen.Add(1)
+ }
+ machinePool[i].Store(m)
+ m.machinePoolIndex = i
+ machinePoolMu.Unlock()
+}
--
2.41.0