|  | From eb22b742839180a0bdb3953c061da15ba822d56d Mon Sep 17 00:00:00 2001 | 
|  | From: Tim Windelschmidt <tim@monogon.tech> | 
|  | Date: Tue, 12 Sep 2023 15:06:49 +0200 | 
|  | Subject: [PATCH] fix debug builds | 
|  |  | 
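|  | The KVM platform's address space, bluepill handler and machine code are | 
|  | now built only when the kvm_debug tag is not set, and new *_debug.go | 
|  | copies of these files provide the implementation when the kvm_debug | 
|  | build tag is set, so that debug builds of the package compile again. | 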
|  | --- | 
|  | pkg/sentry/platform/kvm/address_space.go      |   3 + | 
|  | .../platform/kvm/address_space_debug.go       | 242 +++++ | 
|  | .../platform/kvm/bluepill_debug_unsafe.go     | 215 +++++ | 
|  | pkg/sentry/platform/kvm/bluepill_unsafe.go    |   4 +- | 
|  | pkg/sentry/platform/kvm/machine.go            |   3 + | 
|  | pkg/sentry/platform/kvm/machine_debug.go      | 826 ++++++++++++++++++ | 
|  | 6 files changed, 1291 insertions(+), 2 deletions(-) | 
|  | create mode 100644 pkg/sentry/platform/kvm/address_space_debug.go | 
|  | create mode 100644 pkg/sentry/platform/kvm/bluepill_debug_unsafe.go | 
|  | create mode 100644 pkg/sentry/platform/kvm/machine_debug.go | 
|  |  | 
|  | diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go | 
|  | index 79ccbea35..7e30d0365 100644 | 
|  | --- a/pkg/sentry/platform/kvm/address_space.go | 
|  | +++ b/pkg/sentry/platform/kvm/address_space.go | 
|  | @@ -12,6 +12,9 @@ | 
|  | // See the License for the specific language governing permissions and | 
|  | // limitations under the License. | 
|  |  | 
|  | +//go:build !kvm_debug | 
|  | +// +build !kvm_debug | 
|  | + | 
|  | package kvm | 
|  |  | 
|  | import ( | 
|  | diff --git a/pkg/sentry/platform/kvm/address_space_debug.go b/pkg/sentry/platform/kvm/address_space_debug.go | 
|  | new file mode 100644 | 
|  | index 000000000..69aeba45a | 
|  | --- /dev/null | 
|  | +++ b/pkg/sentry/platform/kvm/address_space_debug.go | 
|  | @@ -0,0 +1,242 @@ | 
|  | +// Copyright 2018 The gVisor Authors. | 
|  | +// | 
|  | +// Licensed under the Apache License, Version 2.0 (the "License"); | 
|  | +// you may not use this file except in compliance with the License. | 
|  | +// You may obtain a copy of the License at | 
|  | +// | 
|  | +//     http://www.apache.org/licenses/LICENSE-2.0 | 
|  | +// | 
|  | +// Unless required by applicable law or agreed to in writing, software | 
|  | +// distributed under the License is distributed on an "AS IS" BASIS, | 
|  | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | +// See the License for the specific language governing permissions and | 
|  | +// limitations under the License. | 
|  | + | 
|  | +//go:build kvm_debug | 
|  | +// +build kvm_debug | 
|  | + | 
|  | +package kvm | 
|  | + | 
|  | +import ( | 
|  | +	"gvisor.dev/gvisor/pkg/atomicbitops" | 
|  | +	"gvisor.dev/gvisor/pkg/hostarch" | 
|  | +	"gvisor.dev/gvisor/pkg/ring0/pagetables" | 
|  | +	"gvisor.dev/gvisor/pkg/sentry/memmap" | 
|  | +	"gvisor.dev/gvisor/pkg/sentry/platform" | 
|  | +	"gvisor.dev/gvisor/pkg/sync" | 
|  | +) | 
|  | + | 
|  | +// dirtySet tracks vCPUs for invalidation. | 
|  | +type dirtySet struct { | 
|  | +	vCPUMasks []atomicbitops.Uint64 | 
|  | +} | 
|  | + | 
|  | +// forEach iterates over all CPUs in the dirty set. | 
|  | +func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) { | 
|  | +	for index := range ds.vCPUMasks { | 
|  | +		mask := ds.vCPUMasks[index].Swap(0) | 
|  | +		if mask != 0 { | 
|  | +			for bit := 0; bit < 64; bit++ { | 
|  | +				if mask&(1<<uint64(bit)) == 0 { | 
|  | +					continue | 
|  | +				} | 
|  | +				id := 64*index + bit | 
|  | +				fn(m.vCPUsByID[id]) | 
|  | +			} | 
|  | +		} | 
|  | +	} | 
|  | +} | 
|  | + | 
|  | +// mark marks the given vCPU as dirty and returns whether it was previously | 
|  | +// clean. Being previously clean implies that a flush is needed on entry. | 
|  | +func (ds *dirtySet) mark(c *vCPU) bool { | 
|  | +	index := uint64(c.id) / 64 | 
|  | +	bit := uint64(1) << uint(c.id%64) | 
|  | + | 
|  | +	oldValue := ds.vCPUMasks[index].Load() | 
|  | +	if oldValue&bit != 0 { | 
|  | +		return false // Not clean. | 
|  | +	} | 
|  | + | 
|  | +	// Set the bit unilaterally, and ensure that a flush takes place. Note | 
|  | +	// that it's possible for races to occur here, but since the flush is | 
|  | +	// taking place long after these lines there's no race in practice. | 
|  | +	atomicbitops.OrUint64(&ds.vCPUMasks[index], bit) | 
|  | +	return true // Previously clean. | 
|  | +} | 
|  | + | 
|  | +// addressSpace is a wrapper for PageTables. | 
|  | +type addressSpace struct { | 
|  | +	platform.NoAddressSpaceIO | 
|  | + | 
|  | +	// mu is the lock for modifications to the address space. | 
|  | +	// | 
|  | +	// Note that the page tables themselves are not locked. | 
|  | +	mu sync.Mutex | 
|  | + | 
|  | +	// machine is the underlying machine. | 
|  | +	machine *machine | 
|  | + | 
|  | +	// pageTables are for this particular address space. | 
|  | +	pageTables *pagetables.PageTables | 
|  | + | 
|  | +	// dirtySet is the set of dirty vCPUs. | 
|  | +	dirtySet *dirtySet | 
|  | +} | 
|  | + | 
|  | +// Invalidate interrupts all dirty contexts. | 
|  | +func (as *addressSpace) Invalidate() { | 
|  | +	as.mu.Lock() | 
|  | +	defer as.mu.Unlock() | 
|  | +	as.invalidate() | 
|  | +} | 
|  | + | 
|  | +// Touch adds the given vCPU to the dirty list. | 
|  | +// | 
|  | +// The return value indicates whether a flush is required. | 
|  | +func (as *addressSpace) Touch(c *vCPU) bool { | 
|  | +	return as.dirtySet.mark(c) | 
|  | +} | 
|  | + | 
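|  | +// hostMapEntry describes a contiguous range of host virtual memory to map. | 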
|  | +type hostMapEntry struct { | 
|  | +	addr   uintptr | 
|  | +	length uintptr | 
|  | +} | 
|  | + | 
|  | +// mapLocked maps the given host entry. | 
|  | +// | 
|  | +// +checkescape:hard,stack | 
|  | +func (as *addressSpace) mapLocked(addr hostarch.Addr, m hostMapEntry, at hostarch.AccessType) (inv bool) { | 
|  | +	for m.length > 0 { | 
|  | +		physical, length, ok := translateToPhysical(m.addr) | 
|  | +		if !ok { | 
|  | +			panic("unable to translate segment") | 
|  | +		} | 
|  | +		if length > m.length { | 
|  | +			length = m.length | 
|  | +		} | 
|  | + | 
|  | +		// Ensure that this map has physical mappings. If the page does | 
|  | +		// not have physical mappings, the KVM module may inject | 
|  | +		// spurious exceptions when emulation fails (i.e. it tries to | 
|  | +		// emulate because the RIP is pointed at those pages). | 
|  | +		as.machine.mapPhysical(physical, length, physicalRegions) | 
|  | + | 
|  | +		// Install the page table mappings. Note that the ordering is | 
|  | +		// important; if the pagetable mappings were installed before | 
|  | +		// ensuring the physical pages were available, then some other | 
|  | +		// thread could theoretically access them. | 
|  | +		inv = as.pageTables.Map(addr, length, pagetables.MapOpts{ | 
|  | +			AccessType: at, | 
|  | +			User:       true, | 
|  | +		}, physical) || inv | 
|  | +		m.addr += length | 
|  | +		m.length -= length | 
|  | +		addr += hostarch.Addr(length) | 
|  | +	} | 
|  | + | 
|  | +	return inv | 
|  | +} | 
|  | + | 
|  | +// MapFile implements platform.AddressSpace.MapFile. | 
|  | +func (as *addressSpace) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error { | 
|  | +	as.mu.Lock() | 
|  | +	defer as.mu.Unlock() | 
|  | + | 
|  | +	// Get mappings in the sentry's address space, which are guaranteed to be | 
|  | +	// valid as long as a reference is held on the mapped pages (which is in | 
|  | +	// turn required by AddressSpace.MapFile precondition). | 
|  | +	// | 
|  | +	// If precommit is true, we will touch mappings to commit them, so ensure | 
|  | +	// that mappings are readable from sentry context. | 
|  | +	// | 
|  | +	// We don't execute from application file-mapped memory, and guest page | 
|  | +	// tables don't care if we have execute permission (but they do need pages | 
|  | +	// to be readable). | 
|  | +	bs, err := f.MapInternal(fr, hostarch.AccessType{ | 
|  | +		Read:  at.Read || at.Execute || precommit, | 
|  | +		Write: at.Write, | 
|  | +	}) | 
|  | +	if err != nil { | 
|  | +		return err | 
|  | +	} | 
|  | + | 
|  | +	// See block in mapLocked. | 
|  | +	as.pageTables.Allocator.(*allocator).cpu = as.machine.Get() | 
|  | +	defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu) | 
|  | + | 
|  | +	// Map the mappings in the sentry's address space (guest physical memory) | 
|  | +	// into the application's address space (guest virtual memory). | 
|  | +	inv := false | 
|  | +	for !bs.IsEmpty() { | 
|  | +		b := bs.Head() | 
|  | +		bs = bs.Tail() | 
|  | +		// Since fr was page-aligned, b should also be page-aligned. We do the | 
|  | +		// lookup in our host page tables for this translation. | 
|  | +		if precommit { | 
|  | +			s := b.ToSlice() | 
|  | +			for i := 0; i < len(s); i += hostarch.PageSize { | 
|  | +				_ = s[i] // Touch to commit. | 
|  | +			} | 
|  | +		} | 
|  | + | 
|  | +		// See bluepill_allocator.go. | 
|  | +		bluepill(as.pageTables.Allocator.(*allocator).cpu) | 
|  | + | 
|  | +		// Perform the mapping. | 
|  | +		prev := as.mapLocked(addr, hostMapEntry{ | 
|  | +			addr:   b.Addr(), | 
|  | +			length: uintptr(b.Len()), | 
|  | +		}, at) | 
|  | +		inv = inv || prev | 
|  | +		addr += hostarch.Addr(b.Len()) | 
|  | +	} | 
|  | +	if inv { | 
|  | +		as.invalidate() | 
|  | +	} | 
|  | + | 
|  | +	return nil | 
|  | +} | 
|  | + | 
|  | +// unmapLocked is an escape-checked wrapper around Unmap. | 
|  | +// | 
|  | +// +checkescape:hard,stack | 
|  | +func (as *addressSpace) unmapLocked(addr hostarch.Addr, length uint64) bool { | 
|  | +	return as.pageTables.Unmap(addr, uintptr(length)) | 
|  | +} | 
|  | + | 
|  | +// Unmap unmaps the given range by calling pagetables.PageTables.Unmap. | 
|  | +func (as *addressSpace) Unmap(addr hostarch.Addr, length uint64) { | 
|  | +	as.mu.Lock() | 
|  | +	defer as.mu.Unlock() | 
|  | + | 
|  | +	// See above & bluepill_allocator.go. | 
|  | +	as.pageTables.Allocator.(*allocator).cpu = as.machine.Get() | 
|  | +	defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu) | 
|  | +	bluepill(as.pageTables.Allocator.(*allocator).cpu) | 
|  | + | 
|  | +	if prev := as.unmapLocked(addr, length); prev { | 
|  | +		// Invalidate all active vCPUs. | 
|  | +		as.invalidate() | 
|  | + | 
|  | +		// Recycle any freed intermediate pages. | 
|  | +		as.pageTables.Allocator.Recycle() | 
|  | +	} | 
|  | +} | 
|  | + | 
|  | +// Release releases the page tables. | 
|  | +func (as *addressSpace) Release() { | 
|  | +	as.Unmap(0, ^uint64(0)) | 
|  | + | 
|  | +	// Free all pages from the allocator. | 
|  | +	as.pageTables.Allocator.(*allocator).base.Drain() | 
|  | + | 
|  | +	// Drop all cached machine references. | 
|  | +	as.machine.dropPageTables(as.pageTables) | 
|  | +} | 
|  | + | 
|  | +// PreFork implements platform.AddressSpace.PreFork. | 
|  | +func (as *addressSpace) PreFork() {} | 
|  | + | 
|  | +// PostFork implements platform.AddressSpace.PostFork. | 
|  | +func (as *addressSpace) PostFork() {} | 
|  | diff --git a/pkg/sentry/platform/kvm/bluepill_debug_unsafe.go b/pkg/sentry/platform/kvm/bluepill_debug_unsafe.go | 
|  | new file mode 100644 | 
|  | index 000000000..5feb45c19 | 
|  | --- /dev/null | 
|  | +++ b/pkg/sentry/platform/kvm/bluepill_debug_unsafe.go | 
|  | @@ -0,0 +1,215 @@ | 
|  | +// Copyright 2018 The gVisor Authors. | 
|  | +// | 
|  | +// Licensed under the Apache License, Version 2.0 (the "License"); | 
|  | +// you may not use this file except in compliance with the License. | 
|  | +// You may obtain a copy of the License at | 
|  | +// | 
|  | +//     http://www.apache.org/licenses/LICENSE-2.0 | 
|  | +// | 
|  | +// Unless required by applicable law or agreed to in writing, software | 
|  | +// distributed under the License is distributed on an "AS IS" BASIS, | 
|  | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | +// See the License for the specific language governing permissions and | 
|  | +// limitations under the License. | 
|  | + | 
|  | +//go:build go1.18 && kvm_debug | 
|  | +// +build go1.18,kvm_debug | 
|  | + | 
|  | +// //go:linkname directives type-checked by checklinkname. Any other | 
|  | +// non-linkname assumptions outside the Go 1 compatibility guarantee should | 
|  | +// have an accompanied vet check or version guard build tag. | 
|  | + | 
|  | +package kvm | 
|  | + | 
|  | +import ( | 
|  | +	"unsafe" | 
|  | + | 
|  | +	"golang.org/x/sys/unix" | 
|  | +	"gvisor.dev/gvisor/pkg/sentry/arch" | 
|  | +) | 
|  | + | 
|  | +//go:linkname throw runtime.throw | 
|  | +func throw(s string) | 
|  | + | 
|  | +// vCPUPtr returns a vCPU for the given address. | 
|  | +func vCPUPtr(addr uintptr) *vCPU { | 
|  | +	return (*vCPU)(unsafe.Pointer(addr)) | 
|  | +} | 
|  | + | 
|  | +// bytePtr returns a byte pointer for the given address. | 
|  | +func bytePtr(addr uintptr) *byte { | 
|  | +	return (*byte)(unsafe.Pointer(addr)) | 
|  | +} | 
|  | + | 
|  | +// uintptrValue returns a uintptr for the given address. | 
|  | +func uintptrValue(addr *byte) uintptr { | 
|  | +	return (uintptr)(unsafe.Pointer(addr)) | 
|  | +} | 
|  | + | 
|  | +// bluepillArchContext returns the SignalContext64 embedded in the given UContext64. | 
|  | +func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 { | 
|  | +	return &((*arch.UContext64)(context).MContext) | 
|  | +} | 
|  | + | 
|  | +// bluepillGuestExit is responsible for handling a VM exit. | 
|  | +func bluepillGuestExit(c *vCPU, context unsafe.Pointer) { | 
|  | +	// Increment our counter. | 
|  | +	c.guestExits.Add(1) | 
|  | + | 
|  | +	// Copy out registers. | 
|  | +	bluepillArchExit(c, bluepillArchContext(context)) | 
|  | + | 
|  | +	// Return to the vCPUReady state; notify any waiters. | 
|  | +	user := c.state.Load() & vCPUUser | 
|  | +	switch c.state.Swap(user) { | 
|  | +	case user | vCPUGuest: // Expected case. | 
|  | +	case user | vCPUGuest | vCPUWaiter: | 
|  | +		c.notify() | 
|  | +	default: | 
|  | +		throw("invalid state") | 
|  | +	} | 
|  | +} | 
|  | + | 
|  | +var hexSyms = []byte("0123456789abcdef") | 
|  | + | 
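|  | +// printHex writes title followed by val rendered as 16 hex digits to stderr, | 
|  | +// using raw write syscalls. | 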
|  | +func printHex(title []byte, val uint64) { | 
|  | +	var str [18]byte | 
|  | +	for i := 0; i < 16; i++ { | 
|  | +		str[16-i] = hexSyms[val&0xf] | 
|  | +		val = val >> 4 | 
|  | +	} | 
|  | +	str[0] = ' ' | 
|  | +	str[17] = '\n' | 
|  | +	unix.RawSyscall(unix.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&title[0])), uintptr(len(title))) | 
|  | +	unix.RawSyscall(unix.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&str)), 18) | 
|  | +} | 
|  | + | 
|  | +// bluepillHandler is called from the signal stub. | 
|  | +// | 
|  | +// The world may be stopped while this is executing, and it executes on the | 
|  | +// signal stack. It should only execute raw system calls and functions that are | 
|  | +// explicitly marked go:nosplit. | 
|  | +// | 
|  | +// Ideally, this function should switch to gsignal, as runtime.sigtramp does, | 
|  | +// but that is tedious given all the runtime internals. That said, using | 
|  | +// gsignal inside a signal handler is not _required_, provided we avoid stack | 
|  | +// splits and allocations. Note that calling any splittable function here will | 
|  | +// be flaky; if the signal stack is below the G stack then we will trigger a | 
|  | +// split and crash. If above, we won't trigger a split. | 
|  | +// | 
|  | +// +checkescape:all | 
|  | +func bluepillHandler(context unsafe.Pointer) { | 
|  | +	// Sanitize the registers; interrupts must always be disabled. | 
|  | +	c := bluepillArchEnter(bluepillArchContext(context)) | 
|  | + | 
|  | +	// Mark this as guest mode. | 
|  | +	switch c.state.Swap(vCPUGuest | vCPUUser) { | 
|  | +	case vCPUUser: // Expected case. | 
|  | +	case vCPUUser | vCPUWaiter: | 
|  | +		c.notify() | 
|  | +	default: | 
|  | +		throw("invalid state") | 
|  | +	} | 
|  | + | 
|  | +	for { | 
|  | +		hostExitCounter.Increment() | 
|  | +		_, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0) // escapes: no. | 
|  | +		switch errno { | 
|  | +		case 0: // Expected case. | 
|  | +		case unix.EINTR: | 
|  | +			interruptCounter.Increment() | 
|  | +			// First, we process whatever pending signal | 
|  | +			// interrupted KVM. Since we're in a signal handler | 
|  | +			// currently, all signals are masked and the signal | 
|  | +			// must have been delivered directly to this thread. | 
|  | +			timeout := unix.Timespec{} | 
|  | +			sig, _, errno := unix.RawSyscall6( // escapes: no. | 
|  | +				unix.SYS_RT_SIGTIMEDWAIT, | 
|  | +				uintptr(unsafe.Pointer(&bounceSignalMask)), | 
|  | +				0,                                 // siginfo. | 
|  | +				uintptr(unsafe.Pointer(&timeout)), // timeout. | 
|  | +				8,                                 // sigset size. | 
|  | +				0, 0) | 
|  | +			if errno == unix.EAGAIN { | 
|  | +				continue | 
|  | +			} | 
|  | +			if errno != 0 { | 
|  | +				throw("error waiting for pending signal") | 
|  | +			} | 
|  | +			if sig != uintptr(bounceSignal) { | 
|  | +				throw("unexpected signal") | 
|  | +			} | 
|  | + | 
|  | +			// Check whether the current state of the vCPU is ready | 
|  | +			// for interrupt injection. Because we don't have a | 
|  | +			// PIC, we can't inject an interrupt while they are | 
|  | +			// masked. We need to request a window if it's not | 
|  | +			// ready. | 
|  | +			if bluepillReadyStopGuest(c) { | 
|  | +				// Force injection below; the vCPU is ready. | 
|  | +				c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN | 
|  | +			} else { | 
|  | +				c.runData.requestInterruptWindow = 1 | 
|  | +				continue // Rerun vCPU. | 
|  | +			} | 
|  | +		case unix.EFAULT: | 
|  | +			// If a fault is not serviceable because of the permissions | 
|  | +			// on the host backing pages, we receive EFAULT from the run | 
|  | +			// ioctl instead of an MMIO exit. We | 
|  | +			// always inject an NMI here since we may be in kernel | 
|  | +			// mode and have interrupts disabled. | 
|  | +			bluepillSigBus(c) | 
|  | +			continue // Rerun vCPU. | 
|  | +		case unix.ENOSYS: | 
|  | +			bluepillHandleEnosys(c) | 
|  | +			continue | 
|  | +		default: | 
|  | +			throw("run failed") | 
|  | +		} | 
|  | + | 
|  | +		switch c.runData.exitReason { | 
|  | +		case _KVM_EXIT_EXCEPTION: | 
|  | +			c.die(bluepillArchContext(context), "exception") | 
|  | +			return | 
|  | +		case _KVM_EXIT_IO: | 
|  | +			c.die(bluepillArchContext(context), "I/O") | 
|  | +			return | 
|  | +		case _KVM_EXIT_INTERNAL_ERROR: | 
|  | +			// An internal error is typically thrown when emulation | 
|  | +			// fails. This can occur via the MMIO path below (and | 
|  | +			// it might fail because we have multiple regions that | 
|  | +			// are not mapped). We would actually prefer that no | 
|  | +			// emulation occur, and don't mind at all if it fails. | 
|  | +		case _KVM_EXIT_HYPERCALL: | 
|  | +			c.die(bluepillArchContext(context), "hypercall") | 
|  | +			return | 
|  | +		case _KVM_EXIT_DEBUG: | 
|  | +			c.die(bluepillArchContext(context), "debug") | 
|  | +			return | 
|  | +		case _KVM_EXIT_HLT: | 
|  | +			c.hltSanityCheck() | 
|  | +			bluepillGuestExit(c, context) | 
|  | +			return | 
|  | +		case _KVM_EXIT_MMIO: | 
|  | +			physical := uintptr(c.runData.data[0]) | 
|  | +			if getHypercallID(physical) == _KVM_HYPERCALL_VMEXIT { | 
|  | +				bluepillGuestExit(c, context) | 
|  | +				return | 
|  | +			} | 
|  | + | 
|  | +			c.die(bluepillArchContext(context), "exit_mmio") | 
|  | +			return | 
|  | +		case _KVM_EXIT_IRQ_WINDOW_OPEN: | 
|  | +			bluepillStopGuest(c) | 
|  | +		case _KVM_EXIT_SHUTDOWN: | 
|  | +			c.die(bluepillArchContext(context), "shutdown") | 
|  | +			return | 
|  | +		case _KVM_EXIT_FAIL_ENTRY: | 
|  | +			c.die(bluepillArchContext(context), "entry failed") | 
|  | +			return | 
|  | +		default: | 
|  | +			bluepillArchHandleExit(c, context) | 
|  | +			return | 
|  | +		} | 
|  | +	} | 
|  | +} | 
|  | diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go | 
|  | index 81bd9f814..ad8b966e7 100644 | 
|  | --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go | 
|  | +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go | 
|  | @@ -12,8 +12,8 @@ | 
|  | // See the License for the specific language governing permissions and | 
|  | // limitations under the License. | 
|  |  | 
|  | -//go:build go1.18 | 
|  | -// +build go1.18 | 
|  | +//go:build go1.18 && !kvm_debug | 
|  | +// +build go1.18,!kvm_debug | 
|  |  | 
|  | // //go:linkname directives type-checked by checklinkname. Any other | 
|  | // non-linkname assumptions outside the Go 1 compatibility guarantee should | 
|  | diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go | 
|  | index f39bf1f06..4f0264db7 100644 | 
|  | --- a/pkg/sentry/platform/kvm/machine.go | 
|  | +++ b/pkg/sentry/platform/kvm/machine.go | 
|  | @@ -12,6 +12,9 @@ | 
|  | // See the License for the specific language governing permissions and | 
|  | // limitations under the License. | 
|  |  | 
|  | +//go:build !kvm_debug | 
|  | +// +build !kvm_debug | 
|  | + | 
|  | package kvm | 
|  |  | 
|  | import ( | 
|  | diff --git a/pkg/sentry/platform/kvm/machine_debug.go b/pkg/sentry/platform/kvm/machine_debug.go | 
|  | new file mode 100644 | 
|  | index 000000000..0a4735d2d | 
|  | --- /dev/null | 
|  | +++ b/pkg/sentry/platform/kvm/machine_debug.go | 
|  | @@ -0,0 +1,826 @@ | 
|  | +// Copyright 2018 The gVisor Authors. | 
|  | +// | 
|  | +// Licensed under the Apache License, Version 2.0 (the "License"); | 
|  | +// you may not use this file except in compliance with the License. | 
|  | +// You may obtain a copy of the License at | 
|  | +// | 
|  | +//     http://www.apache.org/licenses/LICENSE-2.0 | 
|  | +// | 
|  | +// Unless required by applicable law or agreed to in writing, software | 
|  | +// distributed under the License is distributed on an "AS IS" BASIS, | 
|  | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | +// See the License for the specific language governing permissions and | 
|  | +// limitations under the License. | 
|  | + | 
|  | +//go:build kvm_debug | 
|  | +// +build kvm_debug | 
|  | + | 
|  | +package kvm | 
|  | + | 
|  | +import ( | 
|  | +	"fmt" | 
|  | +	"runtime" | 
|  | +	gosync "sync" | 
|  | +	"sync/atomic" | 
|  | +	"time" | 
|  | + | 
|  | +	"golang.org/x/sys/unix" | 
|  | +	"gvisor.dev/gvisor/pkg/abi/linux" | 
|  | +	"gvisor.dev/gvisor/pkg/atomicbitops" | 
|  | +	"gvisor.dev/gvisor/pkg/hostarch" | 
|  | +	"gvisor.dev/gvisor/pkg/hosttid" | 
|  | +	"gvisor.dev/gvisor/pkg/log" | 
|  | +	"gvisor.dev/gvisor/pkg/metric" | 
|  | +	"gvisor.dev/gvisor/pkg/ring0" | 
|  | +	"gvisor.dev/gvisor/pkg/ring0/pagetables" | 
|  | +	"gvisor.dev/gvisor/pkg/seccomp" | 
|  | +	ktime "gvisor.dev/gvisor/pkg/sentry/time" | 
|  | +	"gvisor.dev/gvisor/pkg/sighandling" | 
|  | +	"gvisor.dev/gvisor/pkg/sync" | 
|  | +) | 
|  | + | 
|  | +// machine contains state associated with the VM as a whole. | 
|  | +type machine struct { | 
|  | +	// fd is the vm fd. | 
|  | +	fd int | 
|  | + | 
|  | +	// machinePoolIndex is the index in the machinePool array. | 
|  | +	machinePoolIndex uint32 | 
|  | + | 
|  | +	// nextSlot is the next slot for setMemoryRegion. | 
|  | +	// | 
|  | +	// If nextSlot is ^uint32(0), then slots are currently being updated, and the | 
|  | +	// caller should retry. | 
|  | +	nextSlot atomicbitops.Uint32 | 
|  | + | 
|  | +	// upperSharedPageTables tracks the read-only shared upper of all the pagetables. | 
|  | +	upperSharedPageTables *pagetables.PageTables | 
|  | + | 
|  | +	// kernel is the set of global structures. | 
|  | +	kernel ring0.Kernel | 
|  | + | 
|  | +	// mu protects vCPUs. | 
|  | +	mu sync.RWMutex | 
|  | + | 
|  | +	// available is notified when vCPUs are available. | 
|  | +	available sync.Cond | 
|  | + | 
|  | +	// vCPUsByTID are the machine vCPUs. | 
|  | +	// | 
|  | +	// These are populated dynamically. | 
|  | +	vCPUsByTID map[uint64]*vCPU | 
|  | + | 
|  | +	// vCPUsByID are the machine vCPUs, indexed by the vCPU's ID. | 
|  | +	vCPUsByID []*vCPU | 
|  | + | 
|  | +	// usedVCPUs is the number of vCPUs that have been used from the | 
|  | +	// vCPUsByID pool. | 
|  | +	usedVCPUs int | 
|  | + | 
|  | +	// maxVCPUs is the maximum number of vCPUs supported by the machine. | 
|  | +	maxVCPUs int | 
|  | + | 
|  | +	// maxSlots is the maximum number of memory slots supported by the machine. | 
|  | +	maxSlots int | 
|  | + | 
|  | +	// tscControl indicates whether the CPU supports TSC scaling. | 
|  | +	tscControl bool | 
|  | + | 
|  | +	// usedSlots is the set of used physical addresses (not sorted). | 
|  | +	usedSlots []uintptr | 
|  | +} | 
|  | + | 
|  | +const ( | 
|  | +	// vCPUReady is an alias for all the below clear. | 
|  | +	vCPUReady uint32 = 0 | 
|  | + | 
|  | +	// vCPUUser indicates that the vCPU is in or about to enter user mode. | 
|  | +	vCPUUser uint32 = 1 << 0 | 
|  | + | 
|  | +	// vCPUGuest indicates the vCPU is in guest mode. | 
|  | +	vCPUGuest uint32 = 1 << 1 | 
|  | + | 
|  | +	// vCPUWaiter indicates that there is a waiter. | 
|  | +	// | 
|  | +	// If this is set, then notify must be called on any state transitions. | 
|  | +	vCPUWaiter uint32 = 1 << 2 | 
|  | +) | 
|  | + | 
|  | +// Field values for the get_vcpu metric acquisition path used. | 
|  | +var ( | 
|  | +	getVCPUAcquisitionFastReused = metric.FieldValue{"fast_reused"} | 
|  | +	getVCPUAcquisitionReused     = metric.FieldValue{"reused"} | 
|  | +	getVCPUAcquisitionUnused     = metric.FieldValue{"unused"} | 
|  | +	getVCPUAcquisitionStolen     = metric.FieldValue{"stolen"} | 
|  | +) | 
|  | + | 
|  | +var ( | 
|  | +	// hostExitCounter is a metric that tracks how many times the sentry | 
|  | +	// performed a host to guest world switch. | 
|  | +	hostExitCounter = metric.MustCreateNewProfilingUint64Metric( | 
|  | +		"/kvm/host_exits", false, "The number of times the sentry performed a host to guest world switch.") | 
|  | + | 
|  | +	// userExitCounter is a metric that tracks how many times the sentry has | 
|  | +	// had an exit from userspace. Analogous to vCPU.userExits. | 
|  | +	userExitCounter = metric.MustCreateNewProfilingUint64Metric( | 
|  | +		"/kvm/user_exits", false, "The number of times the sentry has had an exit from userspace.") | 
|  | + | 
|  | +	// interruptCounter is a metric that tracks how many times execution returned | 
|  | +	// to the KVM host to handle a pending signal. | 
|  | +	interruptCounter = metric.MustCreateNewProfilingUint64Metric( | 
|  | +		"/kvm/interrupts", false, "The number of times the signal handler was invoked.") | 
|  | + | 
|  | +	// mmapCallCounter is a metric that tracks how many times the function | 
|  | +	// seccompMmapSyscall has been called. | 
|  | +	mmapCallCounter = metric.MustCreateNewProfilingUint64Metric( | 
|  | +		"/kvm/mmap_calls", false, "The number of times seccompMmapSyscall has been called.") | 
|  | + | 
|  | +	// getVCPUCounter is a metric that tracks how many times different paths of | 
|  | +	// machine.Get() are triggered. | 
|  | +	getVCPUCounter = metric.MustCreateNewProfilingUint64Metric( | 
|  | +		"/kvm/get_vcpu", false, "The number of times that machine.Get() was called, split by path the function took.", | 
|  | +		metric.NewField("acquisition_type", &getVCPUAcquisitionFastReused, &getVCPUAcquisitionReused, &getVCPUAcquisitionUnused, &getVCPUAcquisitionStolen)) | 
|  | + | 
|  | +	// asInvalidateDuration are durations of calling addressSpace.invalidate(). | 
|  | +	asInvalidateDuration = metric.MustCreateNewProfilingTimerMetric("/kvm/address_space_invalidate", | 
|  | +		metric.NewExponentialBucketer(15, uint64(time.Nanosecond*100), 1, 2), | 
|  | +		"Duration of calling addressSpace.invalidate().") | 
|  | +) | 
|  | + | 
|  | +// vCPU is a single KVM vCPU. | 
|  | +type vCPU struct { | 
|  | +	// CPU is the kernel CPU data. | 
|  | +	// | 
|  | +	// This must be the first element of this structure, it is referenced | 
|  | +	// by the bluepill code (see bluepill_amd64.s). | 
|  | +	ring0.CPU | 
|  | + | 
|  | +	// id is the vCPU id. | 
|  | +	id int | 
|  | + | 
|  | +	// fd is the vCPU fd. | 
|  | +	fd int | 
|  | + | 
|  | +	// tid is the last set tid. | 
|  | +	tid atomicbitops.Uint64 | 
|  | + | 
|  | +	// userExits is the count of user exits. | 
|  | +	userExits atomicbitops.Uint64 | 
|  | + | 
|  | +	// guestExits is the count of guest to host world switches. | 
|  | +	guestExits atomicbitops.Uint64 | 
|  | + | 
|  | +	// faults is a count of world faults (informational only). | 
|  | +	faults uint32 | 
|  | + | 
|  | +	// state is the vCPU state. | 
|  | +	// | 
|  | +	// This is a bitmask of the three fields (vCPU*) described above. | 
|  | +	state atomicbitops.Uint32 | 
|  | + | 
|  | +	// runData for this vCPU. | 
|  | +	runData *runData | 
|  | + | 
|  | +	// machine associated with this vCPU. | 
|  | +	machine *machine | 
|  | + | 
|  | +	// active is the current addressSpace: it is set and read atomically and | 
|  | +	// is used to elide unnecessary interrupts due to invalidations. | 
|  | +	active atomicAddressSpace | 
|  | + | 
|  | +	// vCPUArchState is the architecture-specific state. | 
|  | +	vCPUArchState | 
|  | + | 
|  | +	// dieState holds state related to vCPU death. | 
|  | +	dieState dieState | 
|  | +} | 
|  | + | 
|  | +type dieState struct { | 
|  | +	// message is thrown from die. | 
|  | +	message string | 
|  | + | 
|  | +	// guestRegs is used to store register state during vCPU.die() to prevent | 
|  | +	// allocation inside nosplit function. | 
|  | +	guestRegs userRegs | 
|  | +} | 
|  | + | 
|  | +// createVCPU creates and returns a new vCPU. | 
|  | +// | 
|  | +// Precondition: mu must be held. | 
|  | +func (m *machine) createVCPU(id int) *vCPU { | 
|  | +	// Create the vCPU. | 
|  | +	fd, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id)) | 
|  | +	if errno != 0 { | 
|  | +		panic(fmt.Sprintf("error creating new vCPU: %v", errno)) | 
|  | +	} | 
|  | + | 
|  | +	c := &vCPU{ | 
|  | +		id:      id, | 
|  | +		fd:      int(fd), | 
|  | +		machine: m, | 
|  | +	} | 
|  | +	c.CPU.Init(&m.kernel, c.id, c) | 
|  | +	m.vCPUsByID[c.id] = c | 
|  | + | 
|  | +	// Ensure the signal mask is correct. | 
|  | +	if err := c.setSignalMask(); err != nil { | 
|  | +		panic(fmt.Sprintf("error setting signal mask: %v", err)) | 
|  | +	} | 
|  | + | 
|  | +	// Map the run data. | 
|  | +	runData, err := mapRunData(int(fd)) | 
|  | +	if err != nil { | 
|  | +		panic(fmt.Sprintf("error mapping run data: %v", err)) | 
|  | +	} | 
|  | +	c.runData = runData | 
|  | + | 
|  | +	// Initialize architecture state. | 
|  | +	if err := c.initArchState(); err != nil { | 
|  | +		panic(fmt.Sprintf("error initializing vCPU state: %v", err)) | 
|  | +	} | 
|  | + | 
|  | +	return c // Done. | 
|  | +} | 
|  | + | 
|  | +// newMachine returns a new VM context. | 
|  | +func newMachine(vm int) (*machine, error) { | 
|  | +	// Create the machine. | 
|  | +	m := &machine{fd: vm} | 
|  | +	m.available.L = &m.mu | 
|  | + | 
|  | +	// Pull the maximum vCPUs. | 
|  | +	m.getMaxVCPU() | 
|  | +	log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs) | 
|  | +	m.vCPUsByTID = make(map[uint64]*vCPU) | 
|  | +	m.vCPUsByID = make([]*vCPU, m.maxVCPUs) | 
|  | +	m.kernel.Init(m.maxVCPUs) | 
|  | + | 
|  | +	// Pull the maximum slots. | 
|  | +	maxSlots, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_MEMSLOTS) | 
|  | +	if errno != 0 { | 
|  | +		m.maxSlots = _KVM_NR_MEMSLOTS | 
|  | +	} else { | 
|  | +		m.maxSlots = int(maxSlots) | 
|  | +	} | 
|  | +	log.Debugf("The maximum number of slots is %d.", m.maxSlots) | 
|  | +	m.usedSlots = make([]uintptr, m.maxSlots) | 
|  | + | 
|  | +	// Check TSC Scaling | 
|  | +	hasTSCControl, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_TSC_CONTROL) | 
|  | +	m.tscControl = errno == 0 && hasTSCControl == 1 | 
|  | +	log.Debugf("TSC scaling support: %t.", m.tscControl) | 
|  | + | 
|  | +	// Create the upper shared pagetables and kernel(sentry) pagetables. | 
|  | +	m.upperSharedPageTables = pagetables.New(newAllocator()) | 
|  | +	m.mapUpperHalf(m.upperSharedPageTables) | 
|  | +	m.upperSharedPageTables.Allocator.(*allocator).base.Drain() | 
|  | +	m.upperSharedPageTables.MarkReadOnlyShared() | 
|  | +	m.kernel.PageTables = pagetables.NewWithUpper(newAllocator(), m.upperSharedPageTables, ring0.KernelStartAddress) | 
|  | + | 
|  | +	// Install seccomp rules to trap runtime mmap system calls. They will | 
|  | +	// be handled by seccompMmapHandler. | 
|  | +	seccompMmapRules(m) | 
|  | + | 
|  | +	// Apply the physical mappings. Note that these mappings may point to | 
|  | +	// guest physical addresses that are not actually available. These | 
|  | +	// physical pages are mapped on demand, see kernel_unsafe.go. | 
|  | +	applyPhysicalRegions(func(pr physicalRegion) bool { | 
|  | +		// Map everything in the lower half. | 
|  | +		m.kernel.PageTables.Map( | 
|  | +			hostarch.Addr(pr.virtual), | 
|  | +			pr.length, | 
|  | +			pagetables.MapOpts{AccessType: hostarch.ReadWrite}, | 
|  | +			pr.physical) | 
|  | + | 
|  | +		return true // Keep iterating. | 
|  | +	}) | 
|  | + | 
|  | +	// Ensure that the currently mapped virtual regions are actually | 
|  | +	// available in the VM. Note that this doesn't guarantee no future | 
|  | +	// faults, however it should guarantee that everything is available to | 
|  | +	// ensure successful vCPU entry. | 
|  | +	mapRegion := func(vr virtualRegion, flags uint32) { | 
|  | +		for virtual := vr.virtual; virtual < vr.virtual+vr.length; { | 
|  | +			physical, length, ok := translateToPhysical(virtual) | 
|  | +			if !ok { | 
|  | +				// This must be an invalid region that was | 
|  | +				// knocked out by creation of the physical map. | 
|  | +				return | 
|  | +			} | 
|  | +			if virtual+length > vr.virtual+vr.length { | 
|  | +				// Cap the length to the end of the area. | 
|  | +				length = vr.virtual + vr.length - virtual | 
|  | +			} | 
|  | +			// Update page tables for executable mappings. | 
|  | +			if vr.accessType.Execute { | 
|  | +				if vr.accessType.Write { | 
|  | +					panic(fmt.Sprintf("executable mapping can't be writable: %#v", vr)) | 
|  | +				} | 
|  | +				m.kernel.PageTables.Map( | 
|  | +					hostarch.Addr(virtual), | 
|  | +					length, | 
|  | +					pagetables.MapOpts{AccessType: vr.accessType}, | 
|  | +					physical) | 
|  | +			} | 
|  | + | 
|  | +			// Ensure the physical range is mapped. | 
|  | +			m.mapPhysical(physical, length, physicalRegions) | 
|  | +			virtual += length | 
|  | +		} | 
|  | +	} | 
|  | + | 
|  | +	// handleBluepillFault takes the slot spinlock and it is called from | 
|  | +	// seccompMmapHandler, so here we have to guarantee that mmap is not | 
|  | +	// called while we hold the slot spinlock. | 
|  | +	disableAsyncPreemption() | 
|  | +	applyVirtualRegions(func(vr virtualRegion) { | 
|  | +		if excludeVirtualRegion(vr) { | 
|  | +			return // skip region. | 
|  | +		} | 
|  | +		// Take into account that the stack can grow down. | 
|  | +		if vr.filename == "[stack]" { | 
|  | +			vr.virtual -= 1 << 20 | 
|  | +			vr.length += 1 << 20 | 
|  | +		} | 
|  | + | 
|  | +		mapRegion(vr, 0) | 
|  | + | 
|  | +	}) | 
|  | +	enableAsyncPreemption() | 
|  | + | 
|  | +	// Initialize architecture state. | 
|  | +	if err := m.initArchState(); err != nil { | 
|  | +		m.Destroy() | 
|  | +		return nil, err | 
|  | +	} | 
|  | + | 
|  | +	// Ensure the machine is cleaned up properly. | 
|  | +	runtime.SetFinalizer(m, (*machine).Destroy) | 
|  | +	return m, nil | 
|  | +} | 
|  | + | 
|  | +// hasSlot returns true if the given address is mapped. | 
|  | +// | 
|  | +// This must be done via a linear scan. | 
|  | +// | 
|  | +//go:nosplit | 
|  | +func (m *machine) hasSlot(physical uintptr) bool { | 
|  | +	slotLen := int(m.nextSlot.Load()) | 
|  | +	// When slots are being updated, nextSlot is ^uint32(0). As this situation | 
|  | +	// is unlikely to happen, we just set slotLen to m.maxSlots and scan | 
|  | +	// the whole usedSlots array. | 
|  | +	if slotLen == int(^uint32(0)) { | 
|  | +		slotLen = m.maxSlots | 
|  | +	} | 
|  | +	for i := 0; i < slotLen; i++ { | 
|  | +		if p := atomic.LoadUintptr(&m.usedSlots[i]); p == physical { | 
|  | +			return true | 
|  | +		} | 
|  | +	} | 
|  | +	return false | 
|  | +} | 
|  | + | 
|  | +// mapPhysical checks for the mapping of a physical range, and installs one if | 
|  | +// not available. This attempts to be efficient for calls in the hot path. | 
|  | +// | 
|  | +// This throws on error. | 
|  | +func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalRegion) { | 
|  | +	for end := physical + length; physical < end; { | 
|  | +		_, physicalStart, length, pr := calculateBluepillFault(physical, phyRegions) | 
|  | +		if pr == nil { | 
|  | +			// Should never happen. | 
|  | +			throw("mapPhysical on unknown physical address") | 
|  | +		} | 
|  | + | 
|  | +		// Is this already mapped? Check the usedSlots. | 
|  | +		if !m.hasSlot(physicalStart) { | 
|  | +			if _, ok := handleBluepillFault(m, physical, phyRegions); !ok { | 
|  | +				throw("handleBluepillFault failed") | 
|  | +			} | 
|  | +		} | 
|  | + | 
|  | +		// Move to the next chunk. | 
|  | +		physical = physicalStart + length | 
|  | +	} | 
|  | +} | 
|  | + | 
|  | +// Destroy frees associated resources. | 
|  | +// | 
|  | +// Destroy should only be called once all active users of the machine are gone. | 
|  | +// The machine object should not be used after calling Destroy. | 
|  | +// | 
|  | +// Precondition: all vCPUs must be returned to the machine. | 
|  | +func (m *machine) Destroy() { | 
|  | +	runtime.SetFinalizer(m, nil) | 
|  | + | 
|  | +	// Destroy vCPUs. | 
|  | +	for _, c := range m.vCPUsByID { | 
|  | +		if c == nil { | 
|  | +			continue | 
|  | +		} | 
|  | + | 
|  | +		// Ensure the vCPU is not still running in guest mode. This is | 
|  | +		// possible iff teardown has been done by other threads, and | 
|  | +		// somehow a single thread has not executed any system calls. | 
|  | +		c.BounceToHost() | 
|  | + | 
|  | +		// Note that the runData may not be mapped if an error occurs | 
|  | +		// during the middle of initialization. | 
|  | +		if c.runData != nil { | 
|  | +			if err := unmapRunData(c.runData); err != nil { | 
|  | +				panic(fmt.Sprintf("error unmapping rundata: %v", err)) | 
|  | +			} | 
|  | +		} | 
|  | +		if err := unix.Close(int(c.fd)); err != nil { | 
|  | +			panic(fmt.Sprintf("error closing vCPU fd: %v", err)) | 
|  | +		} | 
|  | +	} | 
|  | + | 
|  | +	machinePool[m.machinePoolIndex].Store(nil) | 
|  | +	seccompMmapSync() | 
|  | + | 
|  | +	// vCPUs are gone: teardown machine state. | 
|  | +	if err := unix.Close(m.fd); err != nil { | 
|  | +		panic(fmt.Sprintf("error closing VM fd: %v", err)) | 
|  | +	} | 
|  | +} | 
|  | + | 
|  | +// Get gets an available vCPU. | 
|  | +// | 
|  | +// This will return with the OS thread locked. | 
|  | +// | 
|  | +// It is guaranteed that, if any OS thread TID is in guest mode, then | 
|  | +// m.vCPUsByTID[TID] points to the vCPU on which that OS thread is running. | 
|  | +// So if Get() returns while the current context is in guest mode, the vCPU | 
|  | +// it is running on must be the same vCPU that Get() returns. | 
|  | +func (m *machine) Get() *vCPU { | 
|  | +	m.mu.RLock() | 
|  | +	runtime.LockOSThread() | 
|  | +	tid := hosttid.Current() | 
|  | + | 
|  | +	// Check for an exact match. | 
|  | +	if c := m.vCPUsByTID[tid]; c != nil { | 
|  | +		c.lock() | 
|  | +		m.mu.RUnlock() | 
|  | +		getVCPUCounter.Increment(&getVCPUAcquisitionFastReused) | 
|  | +		return c | 
|  | +	} | 
|  | + | 
|  | +	// The happy path failed. We now proceed to acquire an exclusive lock | 
|  | +	// (because the vCPU map may change), and scan all available vCPUs. | 
|  | +	// In this case, we first unlock the OS thread. Otherwise, if mu is | 
|  | +	// not available, the current system thread will be parked and a new | 
|  | +	// system thread spawned. We avoid this situation by simply refreshing | 
|  | +	// tid after relocking the system thread. | 
|  | +	m.mu.RUnlock() | 
|  | +	runtime.UnlockOSThread() | 
|  | +	m.mu.Lock() | 
|  | +	runtime.LockOSThread() | 
|  | +	tid = hosttid.Current() | 
|  | + | 
|  | +	// Recheck for an exact match. | 
|  | +	if c := m.vCPUsByTID[tid]; c != nil { | 
|  | +		c.lock() | 
|  | +		m.mu.Unlock() | 
|  | +		getVCPUCounter.Increment(&getVCPUAcquisitionReused) | 
|  | +		return c | 
|  | +	} | 
|  | + | 
|  | +	for { | 
|  | +		// Get vCPU from the m.vCPUsByID pool. | 
|  | +		if m.usedVCPUs < m.maxVCPUs { | 
|  | +			c := m.vCPUsByID[m.usedVCPUs] | 
|  | +			m.usedVCPUs++ | 
|  | +			c.lock() | 
|  | +			m.vCPUsByTID[tid] = c | 
|  | +			m.mu.Unlock() | 
|  | +			c.loadSegments(tid) | 
|  | +			getVCPUCounter.Increment(&getVCPUAcquisitionUnused) | 
|  | +			return c | 
|  | +		} | 
|  | + | 
|  | +		// Scan for an available vCPU. | 
|  | +		for origTID, c := range m.vCPUsByTID { | 
|  | +			if c.state.CompareAndSwap(vCPUReady, vCPUUser) { | 
|  | +				delete(m.vCPUsByTID, origTID) | 
|  | +				m.vCPUsByTID[tid] = c | 
|  | +				m.mu.Unlock() | 
|  | +				c.loadSegments(tid) | 
|  | +				getVCPUCounter.Increment(&getVCPUAcquisitionUnused) | 
|  | +				return c | 
|  | +			} | 
|  | +		} | 
|  | + | 
|  | +		// Scan for something not in user mode. | 
|  | +		for origTID, c := range m.vCPUsByTID { | 
|  | +			if !c.state.CompareAndSwap(vCPUGuest, vCPUGuest|vCPUWaiter) { | 
|  | +				continue | 
|  | +			} | 
|  | + | 
|  | +			// The vCPU is not able to transition to | 
|  | +			// vCPUGuest|vCPUWaiter or to vCPUUser because that | 
|  | +			// transition requires holding the machine mutex, as we | 
|  | +			// do now. There is no path to register a waiter on | 
|  | +			// just the vCPUReady state. | 
|  | +			for { | 
|  | +				c.waitUntilNot(vCPUGuest | vCPUWaiter) | 
|  | +				if c.state.CompareAndSwap(vCPUReady, vCPUUser) { | 
|  | +					break | 
|  | +				} | 
|  | +			} | 
|  | + | 
|  | +			// Steal the vCPU. | 
|  | +			delete(m.vCPUsByTID, origTID) | 
|  | +			m.vCPUsByTID[tid] = c | 
|  | +			m.mu.Unlock() | 
|  | +			c.loadSegments(tid) | 
|  | +			getVCPUCounter.Increment(&getVCPUAcquisitionStolen) | 
|  | +			return c | 
|  | +		} | 
|  | + | 
|  | +		// Everything is executing in user mode. Wait until something | 
|  | +		// is available.  Note that signaling the condition variable | 
|  | +		// will have the extra effect of kicking the vCPUs out of guest | 
|  | +		// mode if that's where they were. | 
|  | +		m.available.Wait() | 
|  | +	} | 
|  | +} | 
|  | + | 
|  | +// Put puts the current vCPU. | 
|  | +func (m *machine) Put(c *vCPU) { | 
|  | +	c.unlock() | 
|  | +	runtime.UnlockOSThread() | 
|  | + | 
|  | +	m.mu.RLock() | 
|  | +	m.available.Signal() | 
|  | +	m.mu.RUnlock() | 
|  | +} | 
|  | + | 
|  | +// newDirtySet returns a new dirty set. | 
|  | +func (m *machine) newDirtySet() *dirtySet { | 
|  | +	return &dirtySet{ | 
|  | +		vCPUMasks: make([]atomicbitops.Uint64, | 
|  | +			(m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64), | 
|  | +	} | 
|  | +} | 
|  | + | 
|  | +// dropPageTables drops cached page table entries. | 
|  | +func (m *machine) dropPageTables(pt *pagetables.PageTables) { | 
|  | +	m.mu.Lock() | 
|  | +	defer m.mu.Unlock() | 
|  | + | 
|  | +	// Clear from all PCIDs. | 
|  | +	for _, c := range m.vCPUsByID { | 
|  | +		if c != nil && c.PCIDs != nil { | 
|  | +			c.PCIDs.Drop(pt) | 
|  | +		} | 
|  | +	} | 
|  | +} | 
|  | + | 
|  | +// lock marks the vCPU as in user mode. | 
|  | +// | 
|  | +// This should only be called directly when known to be safe, i.e. when | 
|  | +// the vCPU is owned by the current TID with no chance of theft. | 
|  | +// | 
|  | +//go:nosplit | 
|  | +func (c *vCPU) lock() { | 
|  | +	atomicbitops.OrUint32(&c.state, vCPUUser) | 
|  | +} | 
|  | + | 
|  | +// unlock clears the vCPUUser bit. | 
|  | +// | 
|  | +//go:nosplit | 
|  | +func (c *vCPU) unlock() { | 
|  | +	origState := atomicbitops.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest) | 
|  | +	if origState == vCPUUser|vCPUGuest { | 
|  | +		// Happy path: no exits are forced, and we can continue | 
|  | +		// executing on our merry way with a single atomic access. | 
|  | +		return | 
|  | +	} | 
|  | + | 
|  | +	// Clear the lock. | 
|  | +	for { | 
|  | +		state := atomicbitops.CompareAndSwapUint32(&c.state, origState, origState&^vCPUUser) | 
|  | +		if state == origState { | 
|  | +			break | 
|  | +		} | 
|  | +		origState = state | 
|  | +	} | 
|  | +	switch origState { | 
|  | +	case vCPUUser: | 
|  | +		// Normal state. | 
|  | +	case vCPUUser | vCPUGuest | vCPUWaiter: | 
|  | +		// Force a transition: this must trigger a notification when we | 
|  | +		// return from guest mode. We must clear vCPUWaiter here | 
|  | +		// anyway, because BounceToKernel will force a transition only | 
|  | +		// from ring3 to ring0, which will not clear this bit. Halt may | 
|  | +		// work around the issue, but if there is no exception or | 
|  | +		// syscall in this period, BounceToKernel will hang. | 
|  | +		atomicbitops.AndUint32(&c.state, ^vCPUWaiter) | 
|  | +		c.notify() | 
|  | +	case vCPUUser | vCPUWaiter: | 
|  | +		// Waiting for the lock to be released; the responsibility is | 
|  | +		// on us to notify the waiter and clear the associated bit. | 
|  | +		atomicbitops.AndUint32(&c.state, ^vCPUWaiter) | 
|  | +		c.notify() | 
|  | +	default: | 
|  | +		panic("invalid state") | 
|  | +	} | 
|  | +} | 
|  | + | 
|  | +// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt. | 
|  | +// | 
|  | +//go:nosplit | 
|  | +func (c *vCPU) NotifyInterrupt() { | 
|  | +	c.BounceToKernel() | 
|  | +} | 
|  | + | 
|  | +// pid is used below in bounce. | 
|  | +var pid = unix.Getpid() | 
|  | + | 
|  | +// bounce forces a return to the kernel or to host mode. | 
|  | +// | 
|  | +// This effectively unwinds the state machine. | 
|  | +func (c *vCPU) bounce(forceGuestExit bool) { | 
|  | +	origGuestExits := c.guestExits.Load() | 
|  | +	origUserExits := c.userExits.Load() | 
|  | +	for { | 
|  | +		switch state := c.state.Load(); state { | 
|  | +		case vCPUReady, vCPUWaiter: | 
|  | +			// There is nothing to be done, we're already in the | 
|  | +			// kernel pre-acquisition. The Bounce criteria have | 
|  | +			// been satisfied. | 
|  | +			return | 
|  | +		case vCPUUser: | 
|  | +			// We need to register a waiter for the actual guest | 
|  | +			// transition. When the transition takes place, then we | 
|  | +			// can inject an interrupt to ensure a return to host | 
|  | +			// mode. | 
|  | +			c.state.CompareAndSwap(state, state|vCPUWaiter) | 
|  | +		case vCPUUser | vCPUWaiter: | 
|  | +			// Wait for the transition to guest mode. This should | 
|  | +			// come from the bluepill handler. | 
|  | +			c.waitUntilNot(state) | 
|  | +		case vCPUGuest, vCPUUser | vCPUGuest: | 
|  | +			if state == vCPUGuest && !forceGuestExit { | 
|  | +				// The vCPU is already not acquired, so there's | 
|  | +				// no need to do a fresh injection here. | 
|  | +				return | 
|  | +			} | 
|  | +			// The vCPU is in user or kernel mode. Attempt to | 
|  | +			// register a notification on change. | 
|  | +			if !c.state.CompareAndSwap(state, state|vCPUWaiter) { | 
|  | +				break // Retry. | 
|  | +			} | 
|  | +			for { | 
|  | +				// We need to spin here until the signal is | 
|  | +				// delivered, because Tgkill can return EAGAIN | 
|  | +				// under memory pressure. Since we already | 
|  | +				// marked ourselves as a waiter, we need to | 
|  | +				// ensure that a signal is actually delivered. | 
|  | +				if err := unix.Tgkill(pid, int(c.tid.Load()), bounceSignal); err == nil { | 
|  | +					break | 
|  | +				} else if err.(unix.Errno) == unix.EAGAIN { | 
|  | +					continue | 
|  | +				} else { | 
|  | +					// Nothing else should be returned by tgkill. | 
|  | +					panic(fmt.Sprintf("unexpected tgkill error: %v", err)) | 
|  | +				} | 
|  | +			} | 
|  | +		case vCPUGuest | vCPUWaiter, vCPUUser | vCPUGuest | vCPUWaiter: | 
|  | +			if state == vCPUGuest|vCPUWaiter && !forceGuestExit { | 
|  | +				// See above. | 
|  | +				return | 
|  | +			} | 
|  | +			// Wait for the transition. This again should happen | 
|  | +			// from the bluepill handler, but on the way out. | 
|  | +			c.waitUntilNot(state) | 
|  | +		default: | 
|  | +			// Should not happen: the above is exhaustive. | 
|  | +			panic("invalid state") | 
|  | +		} | 
|  | + | 
|  | +		// Check if we've missed the state transition, but | 
|  | +		// we can safely return at this point in time. | 
|  | +		newGuestExits := c.guestExits.Load() | 
|  | +		newUserExits := c.userExits.Load() | 
|  | +		if newUserExits != origUserExits && (!forceGuestExit || newGuestExits != origGuestExits) { | 
|  | +			return | 
|  | +		} | 
|  | +	} | 
|  | +} | 
|  | + | 
|  | +// BounceToKernel ensures that the vCPU bounces back to the kernel. | 
|  | +// | 
|  | +//go:nosplit | 
|  | +func (c *vCPU) BounceToKernel() { | 
|  | +	c.bounce(false) | 
|  | +} | 
|  | + | 
|  | +// BounceToHost ensures that the vCPU is in host mode. | 
|  | +// | 
|  | +//go:nosplit | 
|  | +func (c *vCPU) BounceToHost() { | 
|  | +	c.bounce(true) | 
|  | +} | 
|  | + | 
|  | +// setSystemTimeLegacy calibrates and sets an approximate system time. | 
|  | +func (c *vCPU) setSystemTimeLegacy() error { | 
|  | +	const minIterations = 10 | 
|  | +	minimum := uint64(0) | 
|  | +	for iter := 0; ; iter++ { | 
|  | +		// Try to set the TSC to an estimate of where it will be | 
|  | +		// on the host during a "fast" system call iteration. | 
|  | +		start := uint64(ktime.Rdtsc()) | 
|  | +		if err := c.setTSC(start + (minimum / 2)); err != nil { | 
|  | +			return err | 
|  | +		} | 
|  | +		// See if this is our new minimum call time. Note that this | 
|  | +		// serves two functions: one, we make sure that we are | 
|  | +		// accurately predicting the offset we need to set. Second, we | 
|  | +		// don't want to do the final set on a slow call, which could | 
|  | +		// produce a really bad result. | 
|  | +		end := uint64(ktime.Rdtsc()) | 
|  | +		if end < start { | 
|  | +			continue // Totally bogus: unstable TSC? | 
|  | +		} | 
|  | +		current := end - start | 
|  | +		if current < minimum || iter == 0 { | 
|  | +			minimum = current // Set our new minimum. | 
|  | +		} | 
|  | +		// Is this past minIterations and within ~10% of minimum? | 
|  | +		upperThreshold := (((minimum << 3) + minimum) >> 3) | 
|  | +		if iter >= minIterations && current <= upperThreshold { | 
|  | +			return nil | 
|  | +		} | 
|  | +	} | 
|  | +} | 
|  | + | 
|  | +const machinePoolSize = 16 | 
|  | + | 
|  | +// machinePool is enumerated from the seccompMmapHandler signal handler | 
|  | +var ( | 
|  | +	machinePool          [machinePoolSize]machineAtomicPtr | 
|  | +	machinePoolLen       atomicbitops.Uint32 | 
|  | +	machinePoolMu        sync.Mutex | 
|  | +	seccompMmapRulesOnce gosync.Once | 
|  | +) | 
|  | + | 
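|  | +// sigsysHandler is the raw SIGSYS handler installed by seccompMmapRules; | 
|  | +// addrOfSigsysHandler returns its address. Both are implemented outside | 
|  | +// this file. | 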
|  | +func sigsysHandler() | 
|  | +func addrOfSigsysHandler() uintptr | 
|  | + | 
|  | +// seccompMmapRules adds seccomp rules to trap mmap system calls that will be | 
|  | +// handled in seccompMmapHandler. | 
|  | +func seccompMmapRules(m *machine) { | 
|  | +	seccompMmapRulesOnce.Do(func() { | 
|  | +		// Install the handler. | 
|  | +		if err := sighandling.ReplaceSignalHandler(unix.SIGSYS, addrOfSigsysHandler(), &savedSigsysHandler); err != nil { | 
|  | +			panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err)) | 
|  | +		} | 
|  | +		rules := []seccomp.RuleSet{} | 
|  | +		rules = append(rules, []seccomp.RuleSet{ | 
|  | +			// Trap mmap system calls and handle them in sigsysGoHandler | 
|  | +			{ | 
|  | +				Rules: seccomp.SyscallRules{ | 
|  | +					unix.SYS_MMAP: { | 
|  | +						{ | 
|  | +							seccomp.MatchAny{}, | 
|  | +							seccomp.MatchAny{}, | 
|  | +							seccomp.MaskedEqual(unix.PROT_EXEC, 0), | 
|  | +							/* MAP_DENYWRITE is ignored and used only for filtering. */ | 
|  | +							seccomp.MaskedEqual(unix.MAP_DENYWRITE, 0), | 
|  | +						}, | 
|  | +					}, | 
|  | +				}, | 
|  | +				Action: linux.SECCOMP_RET_TRAP, | 
|  | +			}, | 
|  | +		}...) | 
|  | +		instrs, err := seccomp.BuildProgram(rules, linux.SECCOMP_RET_ALLOW, linux.SECCOMP_RET_ALLOW) | 
|  | +		if err != nil { | 
|  | +			panic(fmt.Sprintf("failed to build rules: %v", err)) | 
|  | +		} | 
|  | +		// Perform the actual installation. | 
|  | +		if err := seccomp.SetFilter(instrs); err != nil { | 
|  | +			panic(fmt.Sprintf("failed to set filter: %v", err)) | 
|  | +		} | 
|  | +	}) | 
|  | + | 
|  | +	machinePoolMu.Lock() | 
|  | +	n := machinePoolLen.Load() | 
|  | +	i := uint32(0) | 
|  | +	for ; i < n; i++ { | 
|  | +		if machinePool[i].Load() == nil { | 
|  | +			break | 
|  | +		} | 
|  | +	} | 
|  | +	if i == n { | 
|  | +		if i == machinePoolSize { | 
|  | +			machinePoolMu.Unlock() | 
|  | +			panic("machinePool is full") | 
|  | +		} | 
|  | +		machinePoolLen.Add(1) | 
|  | +	} | 
|  | +	machinePool[i].Store(m) | 
|  | +	m.machinePoolIndex = i | 
|  | +	machinePoolMu.Unlock() | 
|  | +} | 
|  | -- | 
|  | 2.41.0 | 
|  |  |