| From eb22b742839180a0bdb3953c061da15ba822d56d Mon Sep 17 00:00:00 2001 |
| From: Tim Windelschmidt <tim@monogon.tech> |
| Date: Tue, 12 Sep 2023 15:06:49 +0200 |
| Subject: [PATCH] fix debug builds |
| |
| --- |
| pkg/sentry/platform/kvm/address_space.go | 3 + |
| .../platform/kvm/address_space_debug.go | 242 +++++ |
| .../platform/kvm/bluepill_debug_unsafe.go | 215 +++++ |
| pkg/sentry/platform/kvm/bluepill_unsafe.go | 4 +- |
| pkg/sentry/platform/kvm/machine.go | 3 + |
| pkg/sentry/platform/kvm/machine_debug.go | 826 ++++++++++++++++++ |
| 6 files changed, 1291 insertions(+), 2 deletions(-) |
| create mode 100644 pkg/sentry/platform/kvm/address_space_debug.go |
| create mode 100644 pkg/sentry/platform/kvm/bluepill_debug_unsafe.go |
| create mode 100644 pkg/sentry/platform/kvm/machine_debug.go |
| |
| diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go |
| index 79ccbea35..7e30d0365 100644 |
| --- a/pkg/sentry/platform/kvm/address_space.go |
| +++ b/pkg/sentry/platform/kvm/address_space.go |
| @@ -12,6 +12,9 @@ |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| +//go:build !kvm_debug |
| +// +build !kvm_debug |
| + |
| package kvm |
| |
| import ( |
| diff --git a/pkg/sentry/platform/kvm/address_space_debug.go b/pkg/sentry/platform/kvm/address_space_debug.go |
| new file mode 100644 |
| index 000000000..69aeba45a |
| --- /dev/null |
| +++ b/pkg/sentry/platform/kvm/address_space_debug.go |
| @@ -0,0 +1,242 @@ |
| +// Copyright 2018 The gVisor Authors. |
| +// |
| +// Licensed under the Apache License, Version 2.0 (the "License"); |
| +// you may not use this file except in compliance with the License. |
| +// You may obtain a copy of the License at |
| +// |
| +// http://www.apache.org/licenses/LICENSE-2.0 |
| +// |
| +// Unless required by applicable law or agreed to in writing, software |
| +// distributed under the License is distributed on an "AS IS" BASIS, |
| +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| +// See the License for the specific language governing permissions and |
| +// limitations under the License. |
| + |
| +//go:build kvm_debug |
| +// +build kvm_debug |
| + |
| +package kvm |
| + |
| +import ( |
| + "gvisor.dev/gvisor/pkg/atomicbitops" |
| + "gvisor.dev/gvisor/pkg/hostarch" |
| + "gvisor.dev/gvisor/pkg/ring0/pagetables" |
| + "gvisor.dev/gvisor/pkg/sentry/memmap" |
| + "gvisor.dev/gvisor/pkg/sentry/platform" |
| + "gvisor.dev/gvisor/pkg/sync" |
| +) |
| + |
| +// dirtySet tracks vCPUs for invalidation. |
| +type dirtySet struct { |
| + vCPUMasks []atomicbitops.Uint64 |
| +} |
| + |
| +// forEach iterates over all CPUs in the dirty set. |
| +func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) { |
| + for index := range ds.vCPUMasks { |
| + mask := ds.vCPUMasks[index].Swap(0) |
| + if mask != 0 { |
| + for bit := 0; bit < 64; bit++ { |
| + if mask&(1<<uint64(bit)) == 0 { |
| + continue |
| + } |
| + id := 64*index + bit |
| + fn(m.vCPUsByID[id]) |
| + } |
| + } |
| + } |
| +} |
| + |
| +// mark marks the given vCPU as dirty and returns whether it was previously |
| +// clean. Being previously clean implies that a flush is needed on entry. |
| +func (ds *dirtySet) mark(c *vCPU) bool { |
| + index := uint64(c.id) / 64 |
| + bit := uint64(1) << uint(c.id%64) |
| + |
| + oldValue := ds.vCPUMasks[index].Load() |
| + if oldValue&bit != 0 { |
| + return false // Not clean. |
| + } |
| + |
| + // Set the bit unilaterally, and ensure that a flush takes place. Note |
| + // that it's possible for races to occur here, but since the flush is |
| + // taking place long after these lines there's no race in practice. |
| + atomicbitops.OrUint64(&ds.vCPUMasks[index], bit) |
| + return true // Previously clean. |
| +} |
| + |
| +// addressSpace is a wrapper for PageTables. |
| +type addressSpace struct { |
| + platform.NoAddressSpaceIO |
| + |
| + // mu is the lock for modifications to the address space. |
| + // |
| + // Note that the page tables themselves are not locked. |
| + mu sync.Mutex |
| + |
| + // machine is the underlying machine. |
| + machine *machine |
| + |
| + // pageTables are for this particular address space. |
| + pageTables *pagetables.PageTables |
| + |
| + // dirtySet is the set of dirty vCPUs. |
| + dirtySet *dirtySet |
| +} |
| + |
| +// Invalidate interrupts all dirty contexts. |
| +func (as *addressSpace) Invalidate() { |
| + as.mu.Lock() |
| + defer as.mu.Unlock() |
| + as.invalidate() |
| +} |
| + |
| +// Touch adds the given vCPU to the dirty list. |
| +// |
| +// The return value indicates whether a flush is required. |
| +func (as *addressSpace) Touch(c *vCPU) bool { |
| + return as.dirtySet.mark(c) |
| +} |
| + |
| +type hostMapEntry struct { |
| + addr uintptr |
| + length uintptr |
| +} |
| + |
| +// mapLocked maps the given host entry. |
| +// |
| +// +checkescape:hard,stack |
| +func (as *addressSpace) mapLocked(addr hostarch.Addr, m hostMapEntry, at hostarch.AccessType) (inv bool) { |
| + for m.length > 0 { |
| + physical, length, ok := translateToPhysical(m.addr) |
| + if !ok { |
| + panic("unable to translate segment") |
| + } |
| + if length > m.length { |
| + length = m.length |
| + } |
| + |
| + // Ensure that this map has physical mappings. If the page does |
| + // not have physical mappings, the KVM module may inject |
| + // spurious exceptions when emulation fails (i.e. it tries to |
| + // emulate because the RIP is pointed at those pages). |
| + as.machine.mapPhysical(physical, length, physicalRegions) |
| + |
| + // Install the page table mappings. Note that the ordering is |
| + // important; if the pagetable mappings were installed before |
| + // ensuring the physical pages were available, then some other |
| + // thread could theoretically access them. |
| + inv = as.pageTables.Map(addr, length, pagetables.MapOpts{ |
| + AccessType: at, |
| + User: true, |
| + }, physical) || inv |
| + m.addr += length |
| + m.length -= length |
| + addr += hostarch.Addr(length) |
| + } |
| + |
| + return inv |
| +} |
| + |
| +// MapFile implements platform.AddressSpace.MapFile. |
| +func (as *addressSpace) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error { |
| + as.mu.Lock() |
| + defer as.mu.Unlock() |
| + |
| + // Get mappings in the sentry's address space, which are guaranteed to be |
| + // valid as long as a reference is held on the mapped pages (which is in |
| + // turn required by AddressSpace.MapFile precondition). |
| + // |
| + // If precommit is true, we will touch mappings to commit them, so ensure |
| + // that mappings are readable from sentry context. |
| + // |
| + // We don't execute from application file-mapped memory, and guest page |
| + // tables don't care if we have execute permission (but they do need pages |
| + // to be readable). |
| + bs, err := f.MapInternal(fr, hostarch.AccessType{ |
| + Read: at.Read || at.Execute || precommit, |
| + Write: at.Write, |
| + }) |
| + if err != nil { |
| + return err |
| + } |
| + |
| + // See block in mapLocked. |
| + as.pageTables.Allocator.(*allocator).cpu = as.machine.Get() |
| + defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu) |
| + |
| + // Map the mappings in the sentry's address space (guest physical memory) |
| + // into the application's address space (guest virtual memory). |
| + inv := false |
| + for !bs.IsEmpty() { |
| + b := bs.Head() |
| + bs = bs.Tail() |
| + // Since fr was page-aligned, b should also be page-aligned. We do the |
| + // lookup in our host page tables for this translation. |
| + if precommit { |
| + s := b.ToSlice() |
| + for i := 0; i < len(s); i += hostarch.PageSize { |
| + _ = s[i] // Touch to commit. |
| + } |
| + } |
| + |
| + // See bluepill_allocator.go. |
| + bluepill(as.pageTables.Allocator.(*allocator).cpu) |
| + |
| + // Perform the mapping. |
| + prev := as.mapLocked(addr, hostMapEntry{ |
| + addr: b.Addr(), |
| + length: uintptr(b.Len()), |
| + }, at) |
| + inv = inv || prev |
| + addr += hostarch.Addr(b.Len()) |
| + } |
| + if inv { |
| + as.invalidate() |
| + } |
| + |
| + return nil |
| +} |
| + |
| +// unmapLocked is an escape-checked wrapper around Unmap. |
| +// |
| +// +checkescape:hard,stack |
| +func (as *addressSpace) unmapLocked(addr hostarch.Addr, length uint64) bool { |
| + return as.pageTables.Unmap(addr, uintptr(length)) |
| +} |
| + |
| +// Unmap unmaps the given range by calling pagetables.PageTables.Unmap. |
| +func (as *addressSpace) Unmap(addr hostarch.Addr, length uint64) { |
| + as.mu.Lock() |
| + defer as.mu.Unlock() |
| + |
| + // See above & bluepill_allocator.go. |
| + as.pageTables.Allocator.(*allocator).cpu = as.machine.Get() |
| + defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu) |
| + bluepill(as.pageTables.Allocator.(*allocator).cpu) |
| + |
| + if prev := as.unmapLocked(addr, length); prev { |
| + // Invalidate all active vCPUs. |
| + as.invalidate() |
| + |
| + // Recycle any freed intermediate pages. |
| + as.pageTables.Allocator.Recycle() |
| + } |
| +} |
| + |
| +// Release releases the page tables. |
| +func (as *addressSpace) Release() { |
| + as.Unmap(0, ^uint64(0)) |
| + |
| + // Free all pages from the allocator. |
| + as.pageTables.Allocator.(*allocator).base.Drain() |
| + |
| + // Drop all cached machine references. |
| + as.machine.dropPageTables(as.pageTables) |
| +} |
| + |
| +// PreFork implements platform.AddressSpace.PreFork. |
| +func (as *addressSpace) PreFork() {} |
| + |
| +// PostFork implements platform.AddressSpace.PostFork. |
| +func (as *addressSpace) PostFork() {} |
| diff --git a/pkg/sentry/platform/kvm/bluepill_debug_unsafe.go b/pkg/sentry/platform/kvm/bluepill_debug_unsafe.go |
| new file mode 100644 |
| index 000000000..5feb45c19 |
| --- /dev/null |
| +++ b/pkg/sentry/platform/kvm/bluepill_debug_unsafe.go |
| @@ -0,0 +1,215 @@ |
| +// Copyright 2018 The gVisor Authors. |
| +// |
| +// Licensed under the Apache License, Version 2.0 (the "License"); |
| +// you may not use this file except in compliance with the License. |
| +// You may obtain a copy of the License at |
| +// |
| +// http://www.apache.org/licenses/LICENSE-2.0 |
| +// |
| +// Unless required by applicable law or agreed to in writing, software |
| +// distributed under the License is distributed on an "AS IS" BASIS, |
| +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| +// See the License for the specific language governing permissions and |
| +// limitations under the License. |
| + |
| +//go:build go1.18 && kvm_debug |
| +// +build go1.18,kvm_debug |
| + |
| +// //go:linkname directives type-checked by checklinkname. Any other |
| +// non-linkname assumptions outside the Go 1 compatibility guarantee should |
| +// have an accompanied vet check or version guard build tag. |
| + |
| +package kvm |
| + |
| +import ( |
| + "unsafe" |
| + |
| + "golang.org/x/sys/unix" |
| + "gvisor.dev/gvisor/pkg/sentry/arch" |
| +) |
| + |
| +//go:linkname throw runtime.throw |
| +func throw(s string) |
| + |
| +// vCPUPtr returns a vCPU for the given address. |
| +func vCPUPtr(addr uintptr) *vCPU { |
| + return (*vCPU)(unsafe.Pointer(addr)) |
| +} |
| + |
| +// bytePtr returns a bytePtr for the given address. |
| +func bytePtr(addr uintptr) *byte { |
| + return (*byte)(unsafe.Pointer(addr)) |
| +} |
| + |
| +// uintptrValue returns a uintptr for the given address. |
| +func uintptrValue(addr *byte) uintptr { |
| + return (uintptr)(unsafe.Pointer(addr)) |
| +} |
| + |
| +// bluepillArchContext returns the UContext64. |
| +func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 { |
| + return &((*arch.UContext64)(context).MContext) |
| +} |
| + |
| +// bluepillGuestExit is responsible for handling a VM-exit. |
| +func bluepillGuestExit(c *vCPU, context unsafe.Pointer) { |
| + // Increment our counter. |
| + c.guestExits.Add(1) |
| + |
| + // Copy out registers. |
| + bluepillArchExit(c, bluepillArchContext(context)) |
| + |
| + // Return to the vCPUReady state; notify any waiters. |
| + user := c.state.Load() & vCPUUser |
| + switch c.state.Swap(user) { |
| + case user | vCPUGuest: // Expected case. |
| + case user | vCPUGuest | vCPUWaiter: |
| + c.notify() |
| + default: |
| + throw("invalid state") |
| + } |
| +} |
| + |
| +var hexSyms = []byte("0123456789abcdef") |
| + |
| +func printHex(title []byte, val uint64) { |
| + var str [18]byte |
| + for i := 0; i < 16; i++ { |
| + str[16-i] = hexSyms[val&0xf] |
| + val = val >> 4 |
| + } |
| + str[0] = ' ' |
| + str[17] = '\n' |
| + unix.RawSyscall(unix.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&title[0])), uintptr(len(title))) |
| + unix.RawSyscall(unix.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&str)), 18) |
| +} |
| + |
| +// bluepillHandler is called from the signal stub. |
| +// |
| +// The world may be stopped while this is executing, and it executes on the |
| +// signal stack. It should only execute raw system calls and functions that are |
| +// explicitly marked go:nosplit. |
| +// |
| +// Ideally, this function should switch to gsignal, as runtime.sigtramp does, |
| +// but that is tedious given all the runtime internals. That said, using |
| +// gsignal inside a signal handler is not _required_, provided we avoid stack |
| +// splits and allocations. Note that calling any splittable function here will |
| +// be flaky; if the signal stack is below the G stack then we will trigger a |
| +// split and crash. If above, we won't trigger a split. |
| +// |
| +// +checkescape:all |
| +func bluepillHandler(context unsafe.Pointer) { |
| + // Sanitize the registers; interrupts must always be disabled. |
| + c := bluepillArchEnter(bluepillArchContext(context)) |
| + |
| + // Mark this as guest mode. |
| + switch c.state.Swap(vCPUGuest | vCPUUser) { |
| + case vCPUUser: // Expected case. |
| + case vCPUUser | vCPUWaiter: |
| + c.notify() |
| + default: |
| + throw("invalid state") |
| + } |
| + |
| + for { |
| + hostExitCounter.Increment() |
| + _, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0) // escapes: no. |
| + switch errno { |
| + case 0: // Expected case. |
| + case unix.EINTR: |
| + interruptCounter.Increment() |
| + // First, we process whatever pending signal |
| + // interrupted KVM. Since we're in a signal handler |
| + // currently, all signals are masked and the signal |
| + // must have been delivered directly to this thread. |
| + timeout := unix.Timespec{} |
| + sig, _, errno := unix.RawSyscall6( // escapes: no. |
| + unix.SYS_RT_SIGTIMEDWAIT, |
| + uintptr(unsafe.Pointer(&bounceSignalMask)), |
| + 0, // siginfo. |
| + uintptr(unsafe.Pointer(&timeout)), // timeout. |
| + 8, // sigset size. |
| + 0, 0) |
| + if errno == unix.EAGAIN { |
| + continue |
| + } |
| + if errno != 0 { |
| + throw("error waiting for pending signal") |
| + } |
| + if sig != uintptr(bounceSignal) { |
| + throw("unexpected signal") |
| + } |
| + |
| + // Check whether the current state of the vCPU is ready |
| + // for interrupt injection. Because we don't have a |
| + // PIC, we can't inject an interrupt while they are |
| + // masked. We need to request a window if it's not |
| + // ready. |
| + if bluepillReadyStopGuest(c) { |
| + // Force injection below; the vCPU is ready. |
| + c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN |
| + } else { |
| + c.runData.requestInterruptWindow = 1 |
| + continue // Rerun vCPU. |
| + } |
| + case unix.EFAULT: |
| + // If a fault is not serviceable due to the host |
| + // backing pages having page permissions, instead of an |
| + // MMIO exit we receive EFAULT from the run ioctl. We |
| + // always inject an NMI here since we may be in kernel |
| + // mode and have interrupts disabled. |
| + bluepillSigBus(c) |
| + continue // Rerun vCPU. |
| + case unix.ENOSYS: |
| + bluepillHandleEnosys(c) |
| + continue |
| + default: |
| + throw("run failed") |
| + } |
| + |
| + switch c.runData.exitReason { |
| + case _KVM_EXIT_EXCEPTION: |
| + c.die(bluepillArchContext(context), "exception") |
| + return |
| + case _KVM_EXIT_IO: |
| + c.die(bluepillArchContext(context), "I/O") |
| + return |
| + case _KVM_EXIT_INTERNAL_ERROR: |
| + // An internal error is typically thrown when emulation |
| + // fails. This can occur via the MMIO path below (and |
| + // it might fail because we have multiple regions that |
| + // are not mapped). We would actually prefer that no |
| + // emulation occur, and don't mind at all if it fails. |
| + case _KVM_EXIT_HYPERCALL: |
| + c.die(bluepillArchContext(context), "hypercall") |
| + return |
| + case _KVM_EXIT_DEBUG: |
| + c.die(bluepillArchContext(context), "debug") |
| + return |
| + case _KVM_EXIT_HLT: |
| + c.hltSanityCheck() |
| + bluepillGuestExit(c, context) |
| + return |
| + case _KVM_EXIT_MMIO: |
| + physical := uintptr(c.runData.data[0]) |
| + if getHypercallID(physical) == _KVM_HYPERCALL_VMEXIT { |
| + bluepillGuestExit(c, context) |
| + return |
| + } |
| + |
| + c.die(bluepillArchContext(context), "exit_mmio") |
| + return |
| + case _KVM_EXIT_IRQ_WINDOW_OPEN: |
| + bluepillStopGuest(c) |
| + case _KVM_EXIT_SHUTDOWN: |
| + c.die(bluepillArchContext(context), "shutdown") |
| + return |
| + case _KVM_EXIT_FAIL_ENTRY: |
| + c.die(bluepillArchContext(context), "entry failed") |
| + return |
| + default: |
| + bluepillArchHandleExit(c, context) |
| + return |
| + } |
| + } |
| +} |
| diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go |
| index 81bd9f814..ad8b966e7 100644 |
| --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go |
| +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go |
| @@ -12,8 +12,8 @@ |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| -//go:build go1.18 |
| -// +build go1.18 |
| +//go:build go1.18 && !kvm_debug |
| +// +build go1.18,!kvm_debug |
| |
| // //go:linkname directives type-checked by checklinkname. Any other |
| // non-linkname assumptions outside the Go 1 compatibility guarantee should |
| diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go |
| index f39bf1f06..4f0264db7 100644 |
| --- a/pkg/sentry/platform/kvm/machine.go |
| +++ b/pkg/sentry/platform/kvm/machine.go |
| @@ -12,6 +12,9 @@ |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| +//go:build !kvm_debug |
| +// +build !kvm_debug |
| + |
| package kvm |
| |
| import ( |
| diff --git a/pkg/sentry/platform/kvm/machine_debug.go b/pkg/sentry/platform/kvm/machine_debug.go |
| new file mode 100644 |
| index 000000000..0a4735d2d |
| --- /dev/null |
| +++ b/pkg/sentry/platform/kvm/machine_debug.go |
| @@ -0,0 +1,826 @@ |
| +// Copyright 2018 The gVisor Authors. |
| +// |
| +// Licensed under the Apache License, Version 2.0 (the "License"); |
| +// you may not use this file except in compliance with the License. |
| +// You may obtain a copy of the License at |
| +// |
| +// http://www.apache.org/licenses/LICENSE-2.0 |
| +// |
| +// Unless required by applicable law or agreed to in writing, software |
| +// distributed under the License is distributed on an "AS IS" BASIS, |
| +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| +// See the License for the specific language governing permissions and |
| +// limitations under the License. |
| + |
| +//go:build kvm_debug |
| +// +build kvm_debug |
| + |
| +package kvm |
| + |
| +import ( |
| + "fmt" |
| + "runtime" |
| + gosync "sync" |
| + "sync/atomic" |
| + "time" |
| + |
| + "golang.org/x/sys/unix" |
| + "gvisor.dev/gvisor/pkg/abi/linux" |
| + "gvisor.dev/gvisor/pkg/atomicbitops" |
| + "gvisor.dev/gvisor/pkg/hostarch" |
| + "gvisor.dev/gvisor/pkg/hosttid" |
| + "gvisor.dev/gvisor/pkg/log" |
| + "gvisor.dev/gvisor/pkg/metric" |
| + "gvisor.dev/gvisor/pkg/ring0" |
| + "gvisor.dev/gvisor/pkg/ring0/pagetables" |
| + "gvisor.dev/gvisor/pkg/seccomp" |
| + ktime "gvisor.dev/gvisor/pkg/sentry/time" |
| + "gvisor.dev/gvisor/pkg/sighandling" |
| + "gvisor.dev/gvisor/pkg/sync" |
| +) |
| + |
| +// machine contains state associated with the VM as a whole. |
| +type machine struct { |
| + // fd is the vm fd. |
| + fd int |
| + |
| + // machinePoolIndex is the index in the machinePool array. |
| + machinePoolIndex uint32 |
| + |
| + // nextSlot is the next slot for setMemoryRegion. |
| + // |
| + // If nextSlot is ^uint32(0), then slots are currently being updated, and the |
| + // caller should retry. |
| + nextSlot atomicbitops.Uint32 |
| + |
| + // upperSharedPageTables tracks the read-only shared upper of all the pagetables. |
| + upperSharedPageTables *pagetables.PageTables |
| + |
| + // kernel is the set of global structures. |
| + kernel ring0.Kernel |
| + |
| + // mu protects vCPUs. |
| + mu sync.RWMutex |
| + |
| + // available is notified when vCPUs are available. |
| + available sync.Cond |
| + |
| + // vCPUsByTID are the machine vCPUs. |
| + // |
| + // These are populated dynamically. |
| + vCPUsByTID map[uint64]*vCPU |
| + |
| + // vCPUsByID are the machine vCPUs, indexed by the vCPU's ID. |
| + vCPUsByID []*vCPU |
| + |
| + // usedVCPUs is the number of vCPUs that have been used from the |
| + // vCPUsByID pool. |
| + usedVCPUs int |
| + |
| + // maxVCPUs is the maximum number of vCPUs supported by the machine. |
| + maxVCPUs int |
| + |
| + // maxSlots is the maximum number of memory slots supported by the machine. |
| + maxSlots int |
| + |
| + // tscControl indicates whether the CPU supports TSC scaling. |
| + tscControl bool |
| + |
| + // usedSlots is the set of used physical addresses (not sorted). |
| + usedSlots []uintptr |
| +} |
| + |
| +const ( |
| + // vCPUReady indicates that all of the bits below are clear. |
| + vCPUReady uint32 = 0 |
| + |
| + // vCPUUser indicates that the vCPU is in or about to enter user mode. |
| + vCPUUser uint32 = 1 << 0 |
| + |
| + // vCPUGuest indicates the vCPU is in guest mode. |
| + vCPUGuest uint32 = 1 << 1 |
| + |
| + // vCPUWaiter indicates that there is a waiter. |
| + // |
| + // If this is set, then notify must be called on any state transitions. |
| + vCPUWaiter uint32 = 1 << 2 |
| +) |
| + |
| +// Field values for the acquisition path used by the get_vcpu metric. |
| +var ( |
| + getVCPUAcquisitionFastReused = metric.FieldValue{"fast_reused"} |
| + getVCPUAcquisitionReused = metric.FieldValue{"reused"} |
| + getVCPUAcquisitionUnused = metric.FieldValue{"unused"} |
| + getVCPUAcquisitionStolen = metric.FieldValue{"stolen"} |
| +) |
| + |
| +var ( |
| + // hostExitCounter is a metric that tracks how many times the sentry |
| + // performed a host to guest world switch. |
| + hostExitCounter = metric.MustCreateNewProfilingUint64Metric( |
| + "/kvm/host_exits", false, "The number of times the sentry performed a host to guest world switch.") |
| + |
| + // userExitCounter is a metric that tracks how many times the sentry has |
| + // had an exit from userspace. Analogous to vCPU.userExits. |
| + userExitCounter = metric.MustCreateNewProfilingUint64Metric( |
| + "/kvm/user_exits", false, "The number of times the sentry has had an exit from userspace.") |
| + |
| + // interruptCounter is a metric that tracks how many times execution returned |
| + // to the KVM host to handle a pending signal. |
| + interruptCounter = metric.MustCreateNewProfilingUint64Metric( |
| + "/kvm/interrupts", false, "The number of times the signal handler was invoked.") |
| + |
| + // mmapCallCounter is a metric that tracks how many times the function |
| + // seccompMmapSyscall has been called. |
| + mmapCallCounter = metric.MustCreateNewProfilingUint64Metric( |
| + "/kvm/mmap_calls", false, "The number of times seccompMmapSyscall has been called.") |
| + |
| + // getVCPUCounter is a metric that tracks how many times different paths of |
| + // machine.Get() are triggered. |
| + getVCPUCounter = metric.MustCreateNewProfilingUint64Metric( |
| + "/kvm/get_vcpu", false, "The number of times that machine.Get() was called, split by path the function took.", |
| + metric.NewField("acquisition_type", &getVCPUAcquisitionFastReused, &getVCPUAcquisitionReused, &getVCPUAcquisitionUnused, &getVCPUAcquisitionStolen)) |
| + |
| + // asInvalidateDuration are durations of calling addressSpace.invalidate(). |
| + asInvalidateDuration = metric.MustCreateNewProfilingTimerMetric("/kvm/address_space_invalidate", |
| + metric.NewExponentialBucketer(15, uint64(time.Nanosecond*100), 1, 2), |
| + "Duration of calling addressSpace.invalidate().") |
| +) |
| + |
| +// vCPU is a single KVM vCPU. |
| +type vCPU struct { |
| + // CPU is the kernel CPU data. |
| + // |
| + // This must be the first element of this structure, it is referenced |
| + // by the bluepill code (see bluepill_amd64.s). |
| + ring0.CPU |
| + |
| + // id is the vCPU id. |
| + id int |
| + |
| + // fd is the vCPU fd. |
| + fd int |
| + |
| + // tid is the last set tid. |
| + tid atomicbitops.Uint64 |
| + |
| + // userExits is the count of user exits. |
| + userExits atomicbitops.Uint64 |
| + |
| + // guestExits is the count of guest to host world switches. |
| + guestExits atomicbitops.Uint64 |
| + |
| + // faults is a count of world faults (informational only). |
| + faults uint32 |
| + |
| + // state is the vCPU state. |
| + // |
| + // This is a bitmask of the three fields (vCPU*) described above. |
| + state atomicbitops.Uint32 |
| + |
| + // runData for this vCPU. |
| + runData *runData |
| + |
| + // machine associated with this vCPU. |
| + machine *machine |
| + |
| + // active is the current addressSpace: this is set and read atomically, |
| + // it is used to elide unnecessary interrupts due to invalidations. |
| + active atomicAddressSpace |
| + |
| + // vCPUArchState is the architecture-specific state. |
| + vCPUArchState |
| + |
| + // dieState holds state related to vCPU death. |
| + dieState dieState |
| +} |
| + |
| +type dieState struct { |
| + // message is thrown from die. |
| + message string |
| + |
| + // guestRegs is used to store register state during vCPU.die() to prevent |
| + // allocation inside a nosplit function. |
| + guestRegs userRegs |
| +} |
| + |
| +// createVCPU creates and returns a new vCPU. |
| +// |
| +// Precondition: mu must be held. |
| +func (m *machine) createVCPU(id int) *vCPU { |
| + // Create the vCPU. |
| + fd, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id)) |
| + if errno != 0 { |
| + panic(fmt.Sprintf("error creating new vCPU: %v", errno)) |
| + } |
| + |
| + c := &vCPU{ |
| + id: id, |
| + fd: int(fd), |
| + machine: m, |
| + } |
| + c.CPU.Init(&m.kernel, c.id, c) |
| + m.vCPUsByID[c.id] = c |
| + |
| + // Ensure the signal mask is correct. |
| + if err := c.setSignalMask(); err != nil { |
| + panic(fmt.Sprintf("error setting signal mask: %v", err)) |
| + } |
| + |
| + // Map the run data. |
| + runData, err := mapRunData(int(fd)) |
| + if err != nil { |
| + panic(fmt.Sprintf("error mapping run data: %v", err)) |
| + } |
| + c.runData = runData |
| + |
| + // Initialize architecture state. |
| + if err := c.initArchState(); err != nil { |
| + panic(fmt.Sprintf("error initializing vCPU state: %v", err)) |
| + } |
| + |
| + return c // Done. |
| +} |
| + |
| +// newMachine returns a new VM context. |
| +func newMachine(vm int) (*machine, error) { |
| + // Create the machine. |
| + m := &machine{fd: vm} |
| + m.available.L = &m.mu |
| + |
| + // Pull the maximum vCPUs. |
| + m.getMaxVCPU() |
| + log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs) |
| + m.vCPUsByTID = make(map[uint64]*vCPU) |
| + m.vCPUsByID = make([]*vCPU, m.maxVCPUs) |
| + m.kernel.Init(m.maxVCPUs) |
| + |
| + // Pull the maximum slots. |
| + maxSlots, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_MEMSLOTS) |
| + if errno != 0 { |
| + m.maxSlots = _KVM_NR_MEMSLOTS |
| + } else { |
| + m.maxSlots = int(maxSlots) |
| + } |
| + log.Debugf("The maximum number of slots is %d.", m.maxSlots) |
| + m.usedSlots = make([]uintptr, m.maxSlots) |
| + |
| + // Check TSC Scaling |
| + hasTSCControl, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_TSC_CONTROL) |
| + m.tscControl = errno == 0 && hasTSCControl == 1 |
| + log.Debugf("TSC scaling support: %t.", m.tscControl) |
| + |
| + // Create the upper shared pagetables and kernel(sentry) pagetables. |
| + m.upperSharedPageTables = pagetables.New(newAllocator()) |
| + m.mapUpperHalf(m.upperSharedPageTables) |
| + m.upperSharedPageTables.Allocator.(*allocator).base.Drain() |
| + m.upperSharedPageTables.MarkReadOnlyShared() |
| + m.kernel.PageTables = pagetables.NewWithUpper(newAllocator(), m.upperSharedPageTables, ring0.KernelStartAddress) |
| + |
| + // Install seccomp rules to trap runtime mmap system calls. They will |
| + // be handled by seccompMmapHandler. |
| + seccompMmapRules(m) |
| + |
| + // Apply the physical mappings. Note that these mappings may point to |
| + // guest physical addresses that are not actually available. These |
| + // physical pages are mapped on demand, see kernel_unsafe.go. |
| + applyPhysicalRegions(func(pr physicalRegion) bool { |
| + // Map everything in the lower half. |
| + m.kernel.PageTables.Map( |
| + hostarch.Addr(pr.virtual), |
| + pr.length, |
| + pagetables.MapOpts{AccessType: hostarch.ReadWrite}, |
| + pr.physical) |
| + |
| + return true // Keep iterating. |
| + }) |
| + |
| + // Ensure that the currently mapped virtual regions are actually |
| + // available in the VM. Note that this doesn't guarantee no future |
| + // faults, however it should guarantee that everything is available to |
| + // ensure successful vCPU entry. |
| + mapRegion := func(vr virtualRegion, flags uint32) { |
| + for virtual := vr.virtual; virtual < vr.virtual+vr.length; { |
| + physical, length, ok := translateToPhysical(virtual) |
| + if !ok { |
| + // This must be an invalid region that was |
| + // knocked out by creation of the physical map. |
| + return |
| + } |
| + if virtual+length > vr.virtual+vr.length { |
| + // Cap the length to the end of the area. |
| + length = vr.virtual + vr.length - virtual |
| + } |
| + // Update page tables for executable mappings. |
| + if vr.accessType.Execute { |
| + if vr.accessType.Write { |
| + panic(fmt.Sprintf("executable mapping can't be writable: %#v", vr)) |
| + } |
| + m.kernel.PageTables.Map( |
| + hostarch.Addr(virtual), |
| + length, |
| + pagetables.MapOpts{AccessType: vr.accessType}, |
| + physical) |
| + } |
| + |
| + // Ensure the physical range is mapped. |
| + m.mapPhysical(physical, length, physicalRegions) |
| + virtual += length |
| + } |
| + } |
| + |
| + // handleBluepillFault takes the slot spinlock and it is called from |
| + // seccompMmapHandler, so here we have to guarantee that mmap is not |
| + // called while we hold the slot spinlock. |
| + disableAsyncPreemption() |
| + applyVirtualRegions(func(vr virtualRegion) { |
| + if excludeVirtualRegion(vr) { |
| + return // skip region. |
| + } |
| + // Take into account that the stack can grow down. |
| + if vr.filename == "[stack]" { |
| + vr.virtual -= 1 << 20 |
| + vr.length += 1 << 20 |
| + } |
| + |
| + mapRegion(vr, 0) |
| + |
| + }) |
| + enableAsyncPreemption() |
| + |
| + // Initialize architecture state. |
| + if err := m.initArchState(); err != nil { |
| + m.Destroy() |
| + return nil, err |
| + } |
| + |
| + // Ensure the machine is cleaned up properly. |
| + runtime.SetFinalizer(m, (*machine).Destroy) |
| + return m, nil |
| +} |
| + |
| +// hasSlot returns true if the given address is mapped. |
| +// |
| +// This must be done via a linear scan. |
| +// |
| +//go:nosplit |
| +func (m *machine) hasSlot(physical uintptr) bool { |
| + slotLen := int(m.nextSlot.Load()) |
| + // When slots are being updated, nextSlot is ^uint32(0). As this situation |
| + // is unlikely to happen, we just set slotLen to m.maxSlots and scan |
| + // the whole usedSlots array. |
| + if slotLen == int(^uint32(0)) { |
| + slotLen = m.maxSlots |
| + } |
| + for i := 0; i < slotLen; i++ { |
| + if p := atomic.LoadUintptr(&m.usedSlots[i]); p == physical { |
| + return true |
| + } |
| + } |
| + return false |
| +} |
| + |
| +// mapPhysical checks for the mapping of a physical range, and installs one if |
| +// not available. This attempts to be efficient for calls in the hot path. |
| +// |
| +// This throws on error. |
| +func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalRegion) { |
| + for end := physical + length; physical < end; { |
| + _, physicalStart, length, pr := calculateBluepillFault(physical, phyRegions) |
| + if pr == nil { |
| + // Should never happen. |
| + throw("mapPhysical on unknown physical address") |
| + } |
| + |
| + // Is this already mapped? Check the usedSlots. |
| + if !m.hasSlot(physicalStart) { |
| + if _, ok := handleBluepillFault(m, physical, phyRegions); !ok { |
| + throw("handleBluepillFault failed") |
| + } |
| + } |
| + |
| + // Move to the next chunk. |
| + physical = physicalStart + length |
| + } |
| +} |
| + |
| +// Destroy frees associated resources. |
| +// |
| +// Destroy should only be called once all active users of the machine are gone. |
| +// The machine object should not be used after calling Destroy. |
| +// |
| +// Precondition: all vCPUs must be returned to the machine. |
| +func (m *machine) Destroy() { |
| + runtime.SetFinalizer(m, nil) |
| + |
| + // Destroy vCPUs. |
| + for _, c := range m.vCPUsByID { |
| + if c == nil { |
| + continue |
| + } |
| + |
| + // Ensure the vCPU is not still running in guest mode. This is |
| + // possible iff teardown has been done by other threads, and |
| + // somehow a single thread has not executed any system calls. |
| + c.BounceToHost() |
| + |
| + // Note that the runData may not be mapped if an error occurs |
| + // during the middle of initialization. |
| + if c.runData != nil { |
| + if err := unmapRunData(c.runData); err != nil { |
| + panic(fmt.Sprintf("error unmapping rundata: %v", err)) |
| + } |
| + } |
| + if err := unix.Close(int(c.fd)); err != nil { |
| + panic(fmt.Sprintf("error closing vCPU fd: %v", err)) |
| + } |
| + } |
| + |
| + machinePool[m.machinePoolIndex].Store(nil) |
| + seccompMmapSync() |
| + |
| + // vCPUs are gone: teardown machine state. |
| + if err := unix.Close(m.fd); err != nil { |
| + panic(fmt.Sprintf("error closing VM fd: %v", err)) |
| + } |
| +} |
| + |
| +// Get gets an available vCPU. |
| +// |
| +// This will return with the OS thread locked. |
| +// |
| +// It is guaranteed that if any OS thread TID is in guest mode, then |
| +// m.vCPUsByTID[TID] points to the vCPU on which that thread is running. So if |
| +// Get() returns with the current context in guest mode, its vCPU must be the |
| +// same one that Get() returns. |
| +func (m *machine) Get() *vCPU { |
| + m.mu.RLock() |
| + runtime.LockOSThread() |
| + tid := hosttid.Current() |
| + |
| + // Check for an exact match. |
| + if c := m.vCPUsByTID[tid]; c != nil { |
| + c.lock() |
| + m.mu.RUnlock() |
| + getVCPUCounter.Increment(&getVCPUAcquisitionFastReused) |
| + return c |
| + } |
| + |
| + // The happy path failed. We now proceed to acquire an exclusive lock |
| + // (because the vCPU map may change), and scan all available vCPUs. |
| + // In this case, we first unlock the OS thread. Otherwise, if mu is |
| + // not available, the current system thread will be parked and a new |
| + // system thread spawned. We avoid this situation by simply refreshing |
| + // tid after relocking the system thread. |
| + m.mu.RUnlock() |
| + runtime.UnlockOSThread() |
| + m.mu.Lock() |
| + runtime.LockOSThread() |
| + tid = hosttid.Current() |
| + |
| + // Recheck for an exact match. |
| + if c := m.vCPUsByTID[tid]; c != nil { |
| + c.lock() |
| + m.mu.Unlock() |
| + getVCPUCounter.Increment(&getVCPUAcquisitionReused) |
| + return c |
| + } |
| + |
| + for { |
| + // Get vCPU from the m.vCPUsByID pool. |
| + if m.usedVCPUs < m.maxVCPUs { |
| + c := m.vCPUsByID[m.usedVCPUs] |
| + m.usedVCPUs++ |
| + c.lock() |
| + m.vCPUsByTID[tid] = c |
| + m.mu.Unlock() |
| + c.loadSegments(tid) |
| + getVCPUCounter.Increment(&getVCPUAcquisitionUnused) |
| + return c |
| + } |
| + |
| + // Scan for an available vCPU. |
| + for origTID, c := range m.vCPUsByTID { |
| + if c.state.CompareAndSwap(vCPUReady, vCPUUser) { |
| + delete(m.vCPUsByTID, origTID) |
| + m.vCPUsByTID[tid] = c |
| + m.mu.Unlock() |
| + c.loadSegments(tid) |
| + getVCPUCounter.Increment(&getVCPUAcquisitionUnused) |
| + return c |
| + } |
| + } |
| + |
| + // Scan for something not in user mode. |
| + for origTID, c := range m.vCPUsByTID { |
| + if !c.state.CompareAndSwap(vCPUGuest, vCPUGuest|vCPUWaiter) { |
| + continue |
| + } |
| + |
| + // The vCPU will not be able to transition to |
| + // vCPUGuest|vCPUWaiter or to vCPUUser because that |
| + // transition requires holding the machine mutex, as we |
| + // do now. There is no path to register a waiter on |
| + // just the vCPUReady state. |
| + for { |
| + c.waitUntilNot(vCPUGuest | vCPUWaiter) |
| + if c.state.CompareAndSwap(vCPUReady, vCPUUser) { |
| + break |
| + } |
| + } |
| + |
| + // Steal the vCPU. |
| + delete(m.vCPUsByTID, origTID) |
| + m.vCPUsByTID[tid] = c |
| + m.mu.Unlock() |
| + c.loadSegments(tid) |
| + getVCPUCounter.Increment(&getVCPUAcquisitionStolen) |
| + return c |
| + } |
| + |
| + // Everything is executing in user mode. Wait until something |
| + // is available. Note that signaling the condition variable |
| + // will have the extra effect of kicking the vCPUs out of guest |
| + // mode if that's where they were. |
| + m.available.Wait() |
| + } |
| +} |
| + |
| +// Put puts the current vCPU. |
| +func (m *machine) Put(c *vCPU) { |
| + c.unlock() |
| + runtime.UnlockOSThread() |
| + |
| + m.mu.RLock() |
| + m.available.Signal() |
| + m.mu.RUnlock() |
| +} |
| + |
| +// newDirtySet returns a new dirty set. |
| +func (m *machine) newDirtySet() *dirtySet { |
| + return &dirtySet{ |
| + vCPUMasks: make([]atomicbitops.Uint64, |
| + (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64), |
| + } |
| +} |
| + |
| +// dropPageTables drops cached page table entries. |
| +func (m *machine) dropPageTables(pt *pagetables.PageTables) { |
| + m.mu.Lock() |
| + defer m.mu.Unlock() |
| + |
| + // Clear from all PCIDs. |
| + for _, c := range m.vCPUsByID { |
| + if c != nil && c.PCIDs != nil { |
| + c.PCIDs.Drop(pt) |
| + } |
| + } |
| +} |
| + |
| +// lock marks the vCPU as in user mode. |
| +// |
| +// This should only be called directly when known to be safe, i.e. when |
| +// the vCPU is owned by the current TID with no chance of theft. |
| +// |
| +//go:nosplit |
| +func (c *vCPU) lock() { |
| + atomicbitops.OrUint32(&c.state, vCPUUser) |
| +} |
| + |
| +// unlock clears the vCPUUser bit. |
| +// |
| +//go:nosplit |
| +func (c *vCPU) unlock() { |
| + origState := atomicbitops.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest) |
| + if origState == vCPUUser|vCPUGuest { |
| + // Happy path: no exits are forced, and we can continue |
| + // executing on our merry way with a single atomic access. |
| + return |
| + } |
| + |
| + // Clear the lock. |
| + for { |
| + state := atomicbitops.CompareAndSwapUint32(&c.state, origState, origState&^vCPUUser) |
| + if state == origState { |
| + break |
| + } |
| + origState = state |
| + } |
| + switch origState { |
| + case vCPUUser: |
| + // Normal state. |
| + case vCPUUser | vCPUGuest | vCPUWaiter: |
| + // Force a transition: this must trigger a notification when we |
| + // return from guest mode. We must clear vCPUWaiter here |
| + // anyway, because BounceToKernel will force a transition only |
| + // from ring3 to ring0, which will not clear this bit. Halt may |
| + // work around the issue, but if there is no exception or |
| + // syscall in this period, BounceToKernel will hang. |
| + atomicbitops.AndUint32(&c.state, ^vCPUWaiter) |
| + c.notify() |
| + case vCPUUser | vCPUWaiter: |
| + // Waiting for the lock to be released; the responsibility is |
| + // on us to notify the waiter and clear the associated bit. |
| + atomicbitops.AndUint32(&c.state, ^vCPUWaiter) |
| + c.notify() |
| + default: |
| + panic("invalid state") |
| + } |
| +} |
| + |
| +// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt. |
| +// |
| +//go:nosplit |
| +func (c *vCPU) NotifyInterrupt() { |
| + c.BounceToKernel() |
| +} |
| + |
| +// pid is used below in bounce. |
| +var pid = unix.Getpid() |
| + |
| +// bounce forces a return to the kernel or to host mode. |
| +// |
| +// This effectively unwinds the state machine. |
| +func (c *vCPU) bounce(forceGuestExit bool) { |
| + origGuestExits := c.guestExits.Load() |
| + origUserExits := c.userExits.Load() |
| + for { |
| + switch state := c.state.Load(); state { |
| + case vCPUReady, vCPUWaiter: |
| + // There is nothing to be done, we're already in the |
| + // kernel pre-acquisition. The Bounce criteria have |
| + // been satisfied. |
| + return |
| + case vCPUUser: |
| + // We need to register a waiter for the actual guest |
| + // transition. When the transition takes place, then we |
| + // can inject an interrupt to ensure a return to host |
| + // mode. |
| + c.state.CompareAndSwap(state, state|vCPUWaiter) |
| + case vCPUUser | vCPUWaiter: |
| + // Wait for the transition to guest mode. This should |
| + // come from the bluepill handler. |
| + c.waitUntilNot(state) |
| + case vCPUGuest, vCPUUser | vCPUGuest: |
| + if state == vCPUGuest && !forceGuestExit { |
| + // The vCPU is already not acquired, so there's |
| + // no need to do a fresh injection here. |
| + return |
| + } |
| + // The vCPU is in user or kernel mode. Attempt to |
| + // register a notification on change. |
| + if !c.state.CompareAndSwap(state, state|vCPUWaiter) { |
| + break // Retry. |
| + } |
| + for { |
| + // We need to spin here until the signal is |
| + // delivered, because Tgkill can return EAGAIN |
| + // under memory pressure. Since we already |
| + // marked ourselves as a waiter, we need to |
| + // ensure that a signal is actually delivered. |
| + if err := unix.Tgkill(pid, int(c.tid.Load()), bounceSignal); err == nil { |
| + break |
| + } else if err.(unix.Errno) == unix.EAGAIN { |
| + continue |
| + } else { |
| + // Nothing else should be returned by tgkill. |
| + panic(fmt.Sprintf("unexpected tgkill error: %v", err)) |
| + } |
| + } |
| + case vCPUGuest | vCPUWaiter, vCPUUser | vCPUGuest | vCPUWaiter: |
| + if state == vCPUGuest|vCPUWaiter && !forceGuestExit { |
| + // See above. |
| + return |
| + } |
| + // Wait for the transition. This again should happen |
| + // from the bluepill handler, but on the way out. |
| + c.waitUntilNot(state) |
| + default: |
| + // Should not happen: the above is exhaustive. |
| + panic("invalid state") |
| + } |
| + |
| + // Check if we've missed the state transition, but |
| + // we can safely return at this point in time. |
| + newGuestExits := c.guestExits.Load() |
| + newUserExits := c.userExits.Load() |
| + if newUserExits != origUserExits && (!forceGuestExit || newGuestExits != origGuestExits) { |
| + return |
| + } |
| + } |
| +} |
| + |
| +// BounceToKernel ensures that the vCPU bounces back to the kernel. |
| +// |
| +//go:nosplit |
| +func (c *vCPU) BounceToKernel() { |
| + c.bounce(false) |
| +} |
| + |
| +// BounceToHost ensures that the vCPU is in host mode. |
| +// |
| +//go:nosplit |
| +func (c *vCPU) BounceToHost() { |
| + c.bounce(true) |
| +} |
| + |
| +// setSystemTimeLegacy calibrates and sets an approximate system time. |
| +func (c *vCPU) setSystemTimeLegacy() error { |
| + const minIterations = 10 |
| + minimum := uint64(0) |
| + for iter := 0; ; iter++ { |
| + // Try to set the TSC to an estimate of where it will be |
| + // on the host during a "fast" system call iteration. |
| + start := uint64(ktime.Rdtsc()) |
| + if err := c.setTSC(start + (minimum / 2)); err != nil { |
| + return err |
| + } |
| + // See if this is our new minimum call time. Note that this |
| + // serves two functions: one, we make sure that we are |
| + // accurately predicting the offset we need to set. Second, we |
| + // don't want to do the final set on a slow call, which could |
| + // produce a really bad result. |
| + end := uint64(ktime.Rdtsc()) |
| + if end < start { |
| + continue // Totally bogus: unstable TSC? |
| + } |
| + current := end - start |
| + if current < minimum || iter == 0 { |
| + minimum = current // Set our new minimum. |
| + } |
| + // Is this past minIterations and within ~10% of minimum? |
| + upperThreshold := (((minimum << 3) + minimum) >> 3) |
| + if iter >= minIterations && current <= upperThreshold { |
| + return nil |
| + } |
| + } |
| +} |
| + |
| +const machinePoolSize = 16 |
| + |
| +// machinePool is enumerated from the seccompMmapHandler signal handler |
| +var ( |
| + machinePool [machinePoolSize]machineAtomicPtr |
| + machinePoolLen atomicbitops.Uint32 |
| + machinePoolMu sync.Mutex |
| + seccompMmapRulesOnce gosync.Once |
| +) |
| + |
| +func sigsysHandler() |
| +func addrOfSigsysHandler() uintptr |
| + |
| +// seccompMmapRules adds seccomp rules to trap mmap system calls that will be |
| +// handled in seccompMmapHandler. |
| +func seccompMmapRules(m *machine) { |
| + seccompMmapRulesOnce.Do(func() { |
| + // Install the handler. |
| + if err := sighandling.ReplaceSignalHandler(unix.SIGSYS, addrOfSigsysHandler(), &savedSigsysHandler); err != nil { |
| + panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err)) |
| + } |
| + rules := []seccomp.RuleSet{} |
| + rules = append(rules, []seccomp.RuleSet{ |
| + // Trap mmap system calls and handle them in sigsysGoHandler |
| + { |
| + Rules: seccomp.SyscallRules{ |
| + unix.SYS_MMAP: { |
| + { |
| + seccomp.MatchAny{}, |
| + seccomp.MatchAny{}, |
| + seccomp.MaskedEqual(unix.PROT_EXEC, 0), |
| + /* MAP_DENYWRITE is ignored and used only for filtering. */ |
| + seccomp.MaskedEqual(unix.MAP_DENYWRITE, 0), |
| + }, |
| + }, |
| + }, |
| + Action: linux.SECCOMP_RET_TRAP, |
| + }, |
| + }...) |
| + instrs, err := seccomp.BuildProgram(rules, linux.SECCOMP_RET_ALLOW, linux.SECCOMP_RET_ALLOW) |
| + if err != nil { |
| + panic(fmt.Sprintf("failed to build rules: %v", err)) |
| + } |
| + // Perform the actual installation. |
| + if err := seccomp.SetFilter(instrs); err != nil { |
| + panic(fmt.Sprintf("failed to set filter: %v", err)) |
| + } |
| + }) |
| + |
| + machinePoolMu.Lock() |
| + n := machinePoolLen.Load() |
| + i := uint32(0) |
| + for ; i < n; i++ { |
| + if machinePool[i].Load() == nil { |
| + break |
| + } |
| + } |
| + if i == n { |
| + if i == machinePoolSize { |
| + machinePoolMu.Unlock() |
| + panic("machinePool is full") |
| + } |
| + machinePoolLen.Add(1) |
| + } |
| + machinePool[i].Store(m) |
| + m.machinePoolIndex = i |
| + machinePoolMu.Unlock() |
| +} |
| -- |
| 2.41.0 |
| |