workspace: rules_go, gazelle, go, gVisor update
This commit not only updates rules_go and friends, but also updates
gVisor, removes legacy protobuf usage, and switches from using
build_configuration to a Bazel config flag.
Change-Id: Idb383f35ca0fec4cb7329e9d991f08f28cf9b1fb
Reviewed-on: https://review.monogon.dev/c/monogon/+/2129
Tested-by: Jenkins CI
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
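
For context on how the bundled gVisor patch below selects its debug code:
each touched file is constrained to !kvm_debug and gains a *_debug twin
constrained to kvm_debug, so a build tag decides which implementation is
compiled in. A minimal sketch of that tag-gated pattern, using an
illustrative package and names rather than the real gVisor layout (how the
tag is ultimately passed to the Go toolchain is an assumption here):

    // fastpath.go: compiled when the kvm_debug tag is absent. A twin file,
    // fastpath_debug.go, constrained with //go:build kvm_debug, would hold
    // the slower, assertion-friendly variant instead.
    //
    //go:build !kvm_debug
    // +build !kvm_debug

    package kvmsketch

    // hostExitCount is an illustrative counter, not a gVisor symbol.
    var hostExitCount uint64

    // recordHostExit is the lean release-mode variant; a debug twin could
    // drop directives like //go:nosplit and add extra checks without
    // affecting normal builds.
    //
    //go:nosplit
    func recordHostExit() {
    	hostExitCount++
    }
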
diff --git a/third_party/go/patches/gvisor-fix-debug-builds.patch b/third_party/go/patches/gvisor-fix-debug-builds.patch
new file mode 100644
index 0000000..ea5b04d
--- /dev/null
+++ b/third_party/go/patches/gvisor-fix-debug-builds.patch
@@ -0,0 +1,1364 @@
+From eb22b742839180a0bdb3953c061da15ba822d56d Mon Sep 17 00:00:00 2001
+From: Tim Windelschmidt <tim@monogon.tech>
+Date: Tue, 12 Sep 2023 15:06:49 +0200
+Subject: [PATCH] fix debug builds
+
+---
+ pkg/sentry/platform/kvm/address_space.go | 3 +
+ .../platform/kvm/address_space_debug.go | 242 +++++
+ .../platform/kvm/bluepill_debug_unsafe.go | 215 +++++
+ pkg/sentry/platform/kvm/bluepill_unsafe.go | 4 +-
+ pkg/sentry/platform/kvm/machine.go | 3 +
+ pkg/sentry/platform/kvm/machine_debug.go | 826 ++++++++++++++++++
+ 6 files changed, 1291 insertions(+), 2 deletions(-)
+ create mode 100644 pkg/sentry/platform/kvm/address_space_debug.go
+ create mode 100644 pkg/sentry/platform/kvm/bluepill_debug_unsafe.go
+ create mode 100644 pkg/sentry/platform/kvm/machine_debug.go
+
+diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go
+index 79ccbea35..7e30d0365 100644
+--- a/pkg/sentry/platform/kvm/address_space.go
++++ b/pkg/sentry/platform/kvm/address_space.go
+@@ -12,6 +12,9 @@
+ // See the License for the specific language governing permissions and
+ // limitations under the License.
+
++//go:build !kvm_debug
++// +build !kvm_debug
++
+ package kvm
+
+ import (
+diff --git a/pkg/sentry/platform/kvm/address_space_debug.go b/pkg/sentry/platform/kvm/address_space_debug.go
+new file mode 100644
+index 000000000..69aeba45a
+--- /dev/null
++++ b/pkg/sentry/platform/kvm/address_space_debug.go
+@@ -0,0 +1,242 @@
++// Copyright 2018 The gVisor Authors.
++//
++// Licensed under the Apache License, Version 2.0 (the "License");
++// you may not use this file except in compliance with the License.
++// You may obtain a copy of the License at
++//
++// http://www.apache.org/licenses/LICENSE-2.0
++//
++// Unless required by applicable law or agreed to in writing, software
++// distributed under the License is distributed on an "AS IS" BASIS,
++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++// See the License for the specific language governing permissions and
++// limitations under the License.
++
++//go:build kvm_debug
++// +build kvm_debug
++
++package kvm
++
++import (
++ "gvisor.dev/gvisor/pkg/atomicbitops"
++ "gvisor.dev/gvisor/pkg/hostarch"
++ "gvisor.dev/gvisor/pkg/ring0/pagetables"
++ "gvisor.dev/gvisor/pkg/sentry/memmap"
++ "gvisor.dev/gvisor/pkg/sentry/platform"
++ "gvisor.dev/gvisor/pkg/sync"
++)
++
++// dirtySet tracks vCPUs for invalidation.
++type dirtySet struct {
++ vCPUMasks []atomicbitops.Uint64
++}
++
++// forEach iterates over all CPUs in the dirty set.
++func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) {
++ for index := range ds.vCPUMasks {
++ mask := ds.vCPUMasks[index].Swap(0)
++ if mask != 0 {
++ for bit := 0; bit < 64; bit++ {
++ if mask&(1<<uint64(bit)) == 0 {
++ continue
++ }
++ id := 64*index + bit
++ fn(m.vCPUsByID[id])
++ }
++ }
++ }
++}
++
++// mark marks the given vCPU as dirty and returns whether it was previously
++// clean. Being previously clean implies that a flush is needed on entry.
++func (ds *dirtySet) mark(c *vCPU) bool {
++ index := uint64(c.id) / 64
++ bit := uint64(1) << uint(c.id%64)
++
++ oldValue := ds.vCPUMasks[index].Load()
++ if oldValue&bit != 0 {
++ return false // Not clean.
++ }
++
++ // Set the bit unilaterally, and ensure that a flush takes place. Note
++ // that it's possible for races to occur here, but since the flush is
++ // taking place long after these lines there's no race in practice.
++ atomicbitops.OrUint64(&ds.vCPUMasks[index], bit)
++ return true // Previously clean.
++}
++
++// addressSpace is a wrapper for PageTables.
++type addressSpace struct {
++ platform.NoAddressSpaceIO
++
++ // mu is the lock for modifications to the address space.
++ //
++ // Note that the page tables themselves are not locked.
++ mu sync.Mutex
++
++ // machine is the underlying machine.
++ machine *machine
++
++ // pageTables are for this particular address space.
++ pageTables *pagetables.PageTables
++
++ // dirtySet is the set of dirty vCPUs.
++ dirtySet *dirtySet
++}
++
++// Invalidate interrupts all dirty contexts.
++func (as *addressSpace) Invalidate() {
++ as.mu.Lock()
++ defer as.mu.Unlock()
++ as.invalidate()
++}
++
++// Touch adds the given vCPU to the dirty list.
++//
++// The return value indicates whether a flush is required.
++func (as *addressSpace) Touch(c *vCPU) bool {
++ return as.dirtySet.mark(c)
++}
++
++type hostMapEntry struct {
++ addr uintptr
++ length uintptr
++}
++
++// mapLocked maps the given host entry.
++//
++// +checkescape:hard,stack
++func (as *addressSpace) mapLocked(addr hostarch.Addr, m hostMapEntry, at hostarch.AccessType) (inv bool) {
++ for m.length > 0 {
++ physical, length, ok := translateToPhysical(m.addr)
++ if !ok {
++ panic("unable to translate segment")
++ }
++ if length > m.length {
++ length = m.length
++ }
++
++ // Ensure that this map has physical mappings. If the page does
++ // not have physical mappings, the KVM module may inject
++ // spurious exceptions when emulation fails (i.e. it tries to
++ // emulate because the RIP is pointed at those pages).
++ as.machine.mapPhysical(physical, length, physicalRegions)
++
++ // Install the page table mappings. Note that the ordering is
++ // important; if the pagetable mappings were installed before
++ // ensuring the physical pages were available, then some other
++ // thread could theoretically access them.
++ inv = as.pageTables.Map(addr, length, pagetables.MapOpts{
++ AccessType: at,
++ User: true,
++ }, physical) || inv
++ m.addr += length
++ m.length -= length
++ addr += hostarch.Addr(length)
++ }
++
++ return inv
++}
++
++// MapFile implements platform.AddressSpace.MapFile.
++func (as *addressSpace) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error {
++ as.mu.Lock()
++ defer as.mu.Unlock()
++
++ // Get mappings in the sentry's address space, which are guaranteed to be
++ // valid as long as a reference is held on the mapped pages (which is in
++ // turn required by AddressSpace.MapFile precondition).
++ //
++ // If precommit is true, we will touch mappings to commit them, so ensure
++ // that mappings are readable from sentry context.
++ //
++ // We don't execute from application file-mapped memory, and guest page
++ // tables don't care if we have execute permission (but they do need pages
++ // to be readable).
++ bs, err := f.MapInternal(fr, hostarch.AccessType{
++ Read: at.Read || at.Execute || precommit,
++ Write: at.Write,
++ })
++ if err != nil {
++ return err
++ }
++
++ // See block in mapLocked.
++ as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
++ defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)
++
++ // Map the mappings in the sentry's address space (guest physical memory)
++ // into the application's address space (guest virtual memory).
++ inv := false
++ for !bs.IsEmpty() {
++ b := bs.Head()
++ bs = bs.Tail()
++ // Since fr was page-aligned, b should also be page-aligned. We do the
++ // lookup in our host page tables for this translation.
++ if precommit {
++ s := b.ToSlice()
++ for i := 0; i < len(s); i += hostarch.PageSize {
++ _ = s[i] // Touch to commit.
++ }
++ }
++
++ // See bluepill_allocator.go.
++ bluepill(as.pageTables.Allocator.(*allocator).cpu)
++
++ // Perform the mapping.
++ prev := as.mapLocked(addr, hostMapEntry{
++ addr: b.Addr(),
++ length: uintptr(b.Len()),
++ }, at)
++ inv = inv || prev
++ addr += hostarch.Addr(b.Len())
++ }
++ if inv {
++ as.invalidate()
++ }
++
++ return nil
++}
++
++// unmapLocked is an escape-checked wrapper around Unmap.
++//
++// +checkescape:hard,stack
++func (as *addressSpace) unmapLocked(addr hostarch.Addr, length uint64) bool {
++ return as.pageTables.Unmap(addr, uintptr(length))
++}
++
++// Unmap unmaps the given range by calling pagetables.PageTables.Unmap.
++func (as *addressSpace) Unmap(addr hostarch.Addr, length uint64) {
++ as.mu.Lock()
++ defer as.mu.Unlock()
++
++ // See above & bluepill_allocator.go.
++ as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
++ defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)
++ bluepill(as.pageTables.Allocator.(*allocator).cpu)
++
++ if prev := as.unmapLocked(addr, length); prev {
++ // Invalidate all active vCPUs.
++ as.invalidate()
++
++ // Recycle any freed intermediate pages.
++ as.pageTables.Allocator.Recycle()
++ }
++}
++
++// Release releases the page tables.
++func (as *addressSpace) Release() {
++ as.Unmap(0, ^uint64(0))
++
++ // Free all pages from the allocator.
++ as.pageTables.Allocator.(*allocator).base.Drain()
++
++ // Drop all cached machine references.
++ as.machine.dropPageTables(as.pageTables)
++}
++
++// PreFork implements platform.AddressSpace.PreFork.
++func (as *addressSpace) PreFork() {}
++
++// PostFork implements platform.AddressSpace.PostFork.
++func (as *addressSpace) PostFork() {}
+diff --git a/pkg/sentry/platform/kvm/bluepill_debug_unsafe.go b/pkg/sentry/platform/kvm/bluepill_debug_unsafe.go
+new file mode 100644
+index 000000000..5feb45c19
+--- /dev/null
++++ b/pkg/sentry/platform/kvm/bluepill_debug_unsafe.go
+@@ -0,0 +1,215 @@
++// Copyright 2018 The gVisor Authors.
++//
++// Licensed under the Apache License, Version 2.0 (the "License");
++// you may not use this file except in compliance with the License.
++// You may obtain a copy of the License at
++//
++// http://www.apache.org/licenses/LICENSE-2.0
++//
++// Unless required by applicable law or agreed to in writing, software
++// distributed under the License is distributed on an "AS IS" BASIS,
++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++// See the License for the specific language governing permissions and
++// limitations under the License.
++
++//go:build go1.18 && kvm_debug
++// +build go1.18,kvm_debug
++
++// //go:linkname directives type-checked by checklinkname. Any other
++// non-linkname assumptions outside the Go 1 compatibility guarantee should
++// have an accompanied vet check or version guard build tag.
++
++package kvm
++
++import (
++ "unsafe"
++
++ "golang.org/x/sys/unix"
++ "gvisor.dev/gvisor/pkg/sentry/arch"
++)
++
++//go:linkname throw runtime.throw
++func throw(s string)
++
++// vCPUPtr returns a vCPU pointer for the given address.
++func vCPUPtr(addr uintptr) *vCPU {
++ return (*vCPU)(unsafe.Pointer(addr))
++}
++
++// bytePtr returns a bytePtr for the given address.
++func bytePtr(addr uintptr) *byte {
++ return (*byte)(unsafe.Pointer(addr))
++}
++
++// uintptrValue returns a uintptr for the given address.
++func uintptrValue(addr *byte) uintptr {
++ return (uintptr)(unsafe.Pointer(addr))
++}
++
++// bluepillArchContext returns the UContext64.
++func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
++ return &((*arch.UContext64)(context).MContext)
++}
++
++// bluepillGuestExit is responsible for handling VM-Exit.
++func bluepillGuestExit(c *vCPU, context unsafe.Pointer) {
++ // Increment our counter.
++ c.guestExits.Add(1)
++
++ // Copy out registers.
++ bluepillArchExit(c, bluepillArchContext(context))
++
++ // Return to the vCPUReady state; notify any waiters.
++ user := c.state.Load() & vCPUUser
++ switch c.state.Swap(user) {
++ case user | vCPUGuest: // Expected case.
++ case user | vCPUGuest | vCPUWaiter:
++ c.notify()
++ default:
++ throw("invalid state")
++ }
++}
++
++var hexSyms = []byte("0123456789abcdef")
++
++func printHex(title []byte, val uint64) {
++ var str [18]byte
++ for i := 0; i < 16; i++ {
++ str[16-i] = hexSyms[val&0xf]
++ val = val >> 4
++ }
++ str[0] = ' '
++ str[17] = '\n'
++ unix.RawSyscall(unix.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&title[0])), uintptr(len(title)))
++ unix.RawSyscall(unix.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&str)), 18)
++}
++
++// bluepillHandler is called from the signal stub.
++//
++// The world may be stopped while this is executing, and it executes on the
++// signal stack. It should only execute raw system calls and functions that are
++// explicitly marked go:nosplit.
++//
++// Ideally, this function should switch to gsignal, as runtime.sigtramp does,
++// but that is tedious given all the runtime internals. That said, using
++// gsignal inside a signal handler is not _required_, provided we avoid stack
++// splits and allocations. Note that calling any splittable function here will
++// be flaky; if the signal stack is below the G stack then we will trigger a
++// split and crash. If above, we won't trigger a split.
++//
++// +checkescape:all
++func bluepillHandler(context unsafe.Pointer) {
++ // Sanitize the registers; interrupts must always be disabled.
++ c := bluepillArchEnter(bluepillArchContext(context))
++
++ // Mark this as guest mode.
++ switch c.state.Swap(vCPUGuest | vCPUUser) {
++ case vCPUUser: // Expected case.
++ case vCPUUser | vCPUWaiter:
++ c.notify()
++ default:
++ throw("invalid state")
++ }
++
++ for {
++ hostExitCounter.Increment()
++ _, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0) // escapes: no.
++ switch errno {
++ case 0: // Expected case.
++ case unix.EINTR:
++ interruptCounter.Increment()
++ // First, we process whatever pending signal
++ // interrupted KVM. Since we're in a signal handler
++ // currently, all signals are masked and the signal
++ // must have been delivered directly to this thread.
++ timeout := unix.Timespec{}
++ sig, _, errno := unix.RawSyscall6( // escapes: no.
++ unix.SYS_RT_SIGTIMEDWAIT,
++ uintptr(unsafe.Pointer(&bounceSignalMask)),
++ 0, // siginfo.
++ uintptr(unsafe.Pointer(&timeout)), // timeout.
++ 8, // sigset size.
++ 0, 0)
++ if errno == unix.EAGAIN {
++ continue
++ }
++ if errno != 0 {
++ throw("error waiting for pending signal")
++ }
++ if sig != uintptr(bounceSignal) {
++ throw("unexpected signal")
++ }
++
++ // Check whether the current state of the vCPU is ready
++ // for interrupt injection. Because we don't have a
++ // PIC, we can't inject an interrupt while they are
++ // masked. We need to request a window if it's not
++ // ready.
++ if bluepillReadyStopGuest(c) {
++ // Force injection below; the vCPU is ready.
++ c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN
++ } else {
++ c.runData.requestInterruptWindow = 1
++ continue // Rerun vCPU.
++ }
++ case unix.EFAULT:
++ // If a fault is not serviceable due to the host
++ // backing pages having page permissions, instead of an
++ // MMIO exit we receive EFAULT from the run ioctl. We
++ // always inject an NMI here since we may be in kernel
++ // mode and have interrupts disabled.
++ bluepillSigBus(c)
++ continue // Rerun vCPU.
++ case unix.ENOSYS:
++ bluepillHandleEnosys(c)
++ continue
++ default:
++ throw("run failed")
++ }
++
++ switch c.runData.exitReason {
++ case _KVM_EXIT_EXCEPTION:
++ c.die(bluepillArchContext(context), "exception")
++ return
++ case _KVM_EXIT_IO:
++ c.die(bluepillArchContext(context), "I/O")
++ return
++ case _KVM_EXIT_INTERNAL_ERROR:
++ // An internal error is typically thrown when emulation
++ // fails. This can occur via the MMIO path below (and
++ // it might fail because we have multiple regions that
++ // are not mapped). We would actually prefer that no
++ // emulation occur, and don't mind at all if it fails.
++ case _KVM_EXIT_HYPERCALL:
++ c.die(bluepillArchContext(context), "hypercall")
++ return
++ case _KVM_EXIT_DEBUG:
++ c.die(bluepillArchContext(context), "debug")
++ return
++ case _KVM_EXIT_HLT:
++ c.hltSanityCheck()
++ bluepillGuestExit(c, context)
++ return
++ case _KVM_EXIT_MMIO:
++ physical := uintptr(c.runData.data[0])
++ if getHypercallID(physical) == _KVM_HYPERCALL_VMEXIT {
++ bluepillGuestExit(c, context)
++ return
++ }
++
++ c.die(bluepillArchContext(context), "exit_mmio")
++ return
++ case _KVM_EXIT_IRQ_WINDOW_OPEN:
++ bluepillStopGuest(c)
++ case _KVM_EXIT_SHUTDOWN:
++ c.die(bluepillArchContext(context), "shutdown")
++ return
++ case _KVM_EXIT_FAIL_ENTRY:
++ c.die(bluepillArchContext(context), "entry failed")
++ return
++ default:
++ bluepillArchHandleExit(c, context)
++ return
++ }
++ }
++}
+diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
+index 81bd9f814..ad8b966e7 100644
+--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
++++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
+@@ -12,8 +12,8 @@
+ // See the License for the specific language governing permissions and
+ // limitations under the License.
+
+-//go:build go1.18
+-// +build go1.18
++//go:build go1.18 && !kvm_debug
++// +build go1.18,!kvm_debug
+
+ // //go:linkname directives type-checked by checklinkname. Any other
+ // non-linkname assumptions outside the Go 1 compatibility guarantee should
+diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
+index f39bf1f06..4f0264db7 100644
+--- a/pkg/sentry/platform/kvm/machine.go
++++ b/pkg/sentry/platform/kvm/machine.go
+@@ -12,6 +12,9 @@
+ // See the License for the specific language governing permissions and
+ // limitations under the License.
+
++//go:build !kvm_debug
++// +build !kvm_debug
++
+ package kvm
+
+ import (
+diff --git a/pkg/sentry/platform/kvm/machine_debug.go b/pkg/sentry/platform/kvm/machine_debug.go
+new file mode 100644
+index 000000000..0a4735d2d
+--- /dev/null
++++ b/pkg/sentry/platform/kvm/machine_debug.go
+@@ -0,0 +1,826 @@
++// Copyright 2018 The gVisor Authors.
++//
++// Licensed under the Apache License, Version 2.0 (the "License");
++// you may not use this file except in compliance with the License.
++// You may obtain a copy of the License at
++//
++// http://www.apache.org/licenses/LICENSE-2.0
++//
++// Unless required by applicable law or agreed to in writing, software
++// distributed under the License is distributed on an "AS IS" BASIS,
++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++// See the License for the specific language governing permissions and
++// limitations under the License.
++
++//go:build kvm_debug
++// +build kvm_debug
++
++package kvm
++
++import (
++ "fmt"
++ "runtime"
++ gosync "sync"
++ "sync/atomic"
++ "time"
++
++ "golang.org/x/sys/unix"
++ "gvisor.dev/gvisor/pkg/abi/linux"
++ "gvisor.dev/gvisor/pkg/atomicbitops"
++ "gvisor.dev/gvisor/pkg/hostarch"
++ "gvisor.dev/gvisor/pkg/hosttid"
++ "gvisor.dev/gvisor/pkg/log"
++ "gvisor.dev/gvisor/pkg/metric"
++ "gvisor.dev/gvisor/pkg/ring0"
++ "gvisor.dev/gvisor/pkg/ring0/pagetables"
++ "gvisor.dev/gvisor/pkg/seccomp"
++ ktime "gvisor.dev/gvisor/pkg/sentry/time"
++ "gvisor.dev/gvisor/pkg/sighandling"
++ "gvisor.dev/gvisor/pkg/sync"
++)
++
++// machine contains state associated with the VM as a whole.
++type machine struct {
++ // fd is the vm fd.
++ fd int
++
++ // machinePoolIndex is the index in the machinePool array.
++ machinePoolIndex uint32
++
++ // nextSlot is the next slot for setMemoryRegion.
++ //
++ // If nextSlot is ^uint32(0), then slots are currently being updated, and the
++ // caller should retry.
++ nextSlot atomicbitops.Uint32
++
++ // upperSharedPageTables tracks the read-only shared upper of all the pagetables.
++ upperSharedPageTables *pagetables.PageTables
++
++ // kernel is the set of global structures.
++ kernel ring0.Kernel
++
++ // mu protects vCPUs.
++ mu sync.RWMutex
++
++ // available is notified when vCPUs are available.
++ available sync.Cond
++
++ // vCPUsByTID are the machine vCPUs.
++ //
++ // These are populated dynamically.
++ vCPUsByTID map[uint64]*vCPU
++
++ // vCPUsByID are the machine vCPUs, can be indexed by the vCPU's ID.
++ vCPUsByID []*vCPU
++
++ // usedVCPUs is the number of vCPUs that have been used from the
++ // vCPUsByID pool.
++ usedVCPUs int
++
++ // maxVCPUs is the maximum number of vCPUs supported by the machine.
++ maxVCPUs int
++
++ // maxSlots is the maximum number of memory slots supported by the machine.
++ maxSlots int
++
++ // tscControl checks whether cpu supports TSC scaling
++ tscControl bool
++
++ // usedSlots is the set of used physical addresses (not sorted).
++ usedSlots []uintptr
++}
++
++const (
++ // vCPUReady is an alias for all the below clear.
++ vCPUReady uint32 = 0
++
++ // vCPUUser indicates that the vCPU is in or about to enter user mode.
++ vCPUUser uint32 = 1 << 0
++
++ // vCPUGuest indicates the vCPU is in guest mode.
++ vCPUGuest uint32 = 1 << 1
++
++ // vCPUWaiter indicates that there is a waiter.
++ //
++ // If this is set, then notify must be called on any state transitions.
++ vCPUWaiter uint32 = 1 << 2
++)
++
++// Field values for the get_vcpu metric acquisition path used.
++var (
++ getVCPUAcquisitionFastReused = metric.FieldValue{"fast_reused"}
++ getVCPUAcquisitionReused = metric.FieldValue{"reused"}
++ getVCPUAcquisitionUnused = metric.FieldValue{"unused"}
++ getVCPUAcquisitionStolen = metric.FieldValue{"stolen"}
++)
++
++var (
++ // hostExitCounter is a metric that tracks how many times the sentry
++ // performed a host to guest world switch.
++ hostExitCounter = metric.MustCreateNewProfilingUint64Metric(
++ "/kvm/host_exits", false, "The number of times the sentry performed a host to guest world switch.")
++
++ // userExitCounter is a metric that tracks how many times the sentry has
++ // had an exit from userspace. Analogous to vCPU.userExits.
++ userExitCounter = metric.MustCreateNewProfilingUint64Metric(
++ "/kvm/user_exits", false, "The number of times the sentry has had an exit from userspace.")
++
++ // interruptCounter is a metric that tracks how many times execution returned
++ // to the KVM host to handle a pending signal.
++ interruptCounter = metric.MustCreateNewProfilingUint64Metric(
++ "/kvm/interrupts", false, "The number of times the signal handler was invoked.")
++
++ // mmapCallCounter is a metric that tracks how many times the function
++ // seccompMmapSyscall has been called.
++ mmapCallCounter = metric.MustCreateNewProfilingUint64Metric(
++ "/kvm/mmap_calls", false, "The number of times seccompMmapSyscall has been called.")
++
++ // getVCPUCounter is a metric that tracks how many times different paths of
++ // machine.Get() are triggered.
++ getVCPUCounter = metric.MustCreateNewProfilingUint64Metric(
++ "/kvm/get_vcpu", false, "The number of times that machine.Get() was called, split by path the function took.",
++ metric.NewField("acquisition_type", &getVCPUAcquisitionFastReused, &getVCPUAcquisitionReused, &getVCPUAcquisitionUnused, &getVCPUAcquisitionStolen))
++
++ // asInvalidateDuration are durations of calling addressSpace.invalidate().
++ asInvalidateDuration = metric.MustCreateNewProfilingTimerMetric("/kvm/address_space_invalidate",
++ metric.NewExponentialBucketer(15, uint64(time.Nanosecond*100), 1, 2),
++ "Duration of calling addressSpace.invalidate().")
++)
++
++// vCPU is a single KVM vCPU.
++type vCPU struct {
++ // CPU is the kernel CPU data.
++ //
++ // This must be the first element of this structure, it is referenced
++ // by the bluepill code (see bluepill_amd64.s).
++ ring0.CPU
++
++ // id is the vCPU id.
++ id int
++
++ // fd is the vCPU fd.
++ fd int
++
++ // tid is the last set tid.
++ tid atomicbitops.Uint64
++
++ // userExits is the count of user exits.
++ userExits atomicbitops.Uint64
++
++ // guestExits is the count of guest to host world switches.
++ guestExits atomicbitops.Uint64
++
++ // faults is a count of world faults (informational only).
++ faults uint32
++
++ // state is the vCPU state.
++ //
++ // This is a bitmask of the three fields (vCPU*) described above.
++ state atomicbitops.Uint32
++
++ // runData for this vCPU.
++ runData *runData
++
++ // machine associated with this vCPU.
++ machine *machine
++
++ // active is the current addressSpace: this is set and read atomically,
++ // it is used to elide unnecessary interrupts due to invalidations.
++ active atomicAddressSpace
++
++ // vCPUArchState is the architecture-specific state.
++ vCPUArchState
++
++ // dieState holds state related to vCPU death.
++ dieState dieState
++}
++
++type dieState struct {
++ // message is thrown from die.
++ message string
++
++ // guestRegs is used to store register state during vCPU.die() to prevent
++ // allocation inside nosplit function.
++ guestRegs userRegs
++}
++
++// createVCPU creates and returns a new vCPU.
++//
++// Precondition: mu must be held.
++func (m *machine) createVCPU(id int) *vCPU {
++ // Create the vCPU.
++ fd, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id))
++ if errno != 0 {
++ panic(fmt.Sprintf("error creating new vCPU: %v", errno))
++ }
++
++ c := &vCPU{
++ id: id,
++ fd: int(fd),
++ machine: m,
++ }
++ c.CPU.Init(&m.kernel, c.id, c)
++ m.vCPUsByID[c.id] = c
++
++ // Ensure the signal mask is correct.
++ if err := c.setSignalMask(); err != nil {
++ panic(fmt.Sprintf("error setting signal mask: %v", err))
++ }
++
++ // Map the run data.
++ runData, err := mapRunData(int(fd))
++ if err != nil {
++ panic(fmt.Sprintf("error mapping run data: %v", err))
++ }
++ c.runData = runData
++
++ // Initialize architecture state.
++ if err := c.initArchState(); err != nil {
++ panic(fmt.Sprintf("error initializing vCPU state: %v", err))
++ }
++
++ return c // Done.
++}
++
++// newMachine returns a new VM context.
++func newMachine(vm int) (*machine, error) {
++ // Create the machine.
++ m := &machine{fd: vm}
++ m.available.L = &m.mu
++
++ // Pull the maximum vCPUs.
++ m.getMaxVCPU()
++ log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
++ m.vCPUsByTID = make(map[uint64]*vCPU)
++ m.vCPUsByID = make([]*vCPU, m.maxVCPUs)
++ m.kernel.Init(m.maxVCPUs)
++
++ // Pull the maximum slots.
++ maxSlots, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_MEMSLOTS)
++ if errno != 0 {
++ m.maxSlots = _KVM_NR_MEMSLOTS
++ } else {
++ m.maxSlots = int(maxSlots)
++ }
++ log.Debugf("The maximum number of slots is %d.", m.maxSlots)
++ m.usedSlots = make([]uintptr, m.maxSlots)
++
++ // Check TSC Scaling
++ hasTSCControl, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_TSC_CONTROL)
++ m.tscControl = errno == 0 && hasTSCControl == 1
++ log.Debugf("TSC scaling support: %t.", m.tscControl)
++
++ // Create the upper shared pagetables and kernel(sentry) pagetables.
++ m.upperSharedPageTables = pagetables.New(newAllocator())
++ m.mapUpperHalf(m.upperSharedPageTables)
++ m.upperSharedPageTables.Allocator.(*allocator).base.Drain()
++ m.upperSharedPageTables.MarkReadOnlyShared()
++ m.kernel.PageTables = pagetables.NewWithUpper(newAllocator(), m.upperSharedPageTables, ring0.KernelStartAddress)
++
++ // Install seccomp rules to trap runtime mmap system calls. They will
++ // be handled by seccompMmapHandler.
++ seccompMmapRules(m)
++
++ // Apply the physical mappings. Note that these mappings may point to
++ // guest physical addresses that are not actually available. These
++ // physical pages are mapped on demand, see kernel_unsafe.go.
++ applyPhysicalRegions(func(pr physicalRegion) bool {
++ // Map everything in the lower half.
++ m.kernel.PageTables.Map(
++ hostarch.Addr(pr.virtual),
++ pr.length,
++ pagetables.MapOpts{AccessType: hostarch.ReadWrite},
++ pr.physical)
++
++ return true // Keep iterating.
++ })
++
++ // Ensure that the currently mapped virtual regions are actually
++ // available in the VM. Note that this doesn't guarantee no future
++ // faults, however it should guarantee that everything is available to
++ // ensure successful vCPU entry.
++ mapRegion := func(vr virtualRegion, flags uint32) {
++ for virtual := vr.virtual; virtual < vr.virtual+vr.length; {
++ physical, length, ok := translateToPhysical(virtual)
++ if !ok {
++ // This must be an invalid region that was
++ // knocked out by creation of the physical map.
++ return
++ }
++ if virtual+length > vr.virtual+vr.length {
++ // Cap the length to the end of the area.
++ length = vr.virtual + vr.length - virtual
++ }
++ // Update page tables for executable mappings.
++ if vr.accessType.Execute {
++ if vr.accessType.Write {
++ panic(fmt.Sprintf("executable mapping can't be writable: %#v", vr))
++ }
++ m.kernel.PageTables.Map(
++ hostarch.Addr(virtual),
++ length,
++ pagetables.MapOpts{AccessType: vr.accessType},
++ physical)
++ }
++
++ // Ensure the physical range is mapped.
++ m.mapPhysical(physical, length, physicalRegions)
++ virtual += length
++ }
++ }
++
++ // handleBluepillFault takes the slot spinlock and it is called from
++ // seccompMmapHandler, so here we have to guarantee that mmap is not
++ // called while we hold the slot spinlock.
++ disableAsyncPreemption()
++ applyVirtualRegions(func(vr virtualRegion) {
++ if excludeVirtualRegion(vr) {
++ return // skip region.
++ }
++ // Take into account that the stack can grow down.
++ if vr.filename == "[stack]" {
++ vr.virtual -= 1 << 20
++ vr.length += 1 << 20
++ }
++
++ mapRegion(vr, 0)
++
++ })
++ enableAsyncPreemption()
++
++ // Initialize architecture state.
++ if err := m.initArchState(); err != nil {
++ m.Destroy()
++ return nil, err
++ }
++
++ // Ensure the machine is cleaned up properly.
++ runtime.SetFinalizer(m, (*machine).Destroy)
++ return m, nil
++}
++
++// hasSlot returns true if the given address is mapped.
++//
++// This must be done via a linear scan.
++//
++//go:nosplit
++func (m *machine) hasSlot(physical uintptr) bool {
++ slotLen := int(m.nextSlot.Load())
++ // When slots are being updated, nextSlot is ^uint32(0). As this situation
++ // is unlikely to happen, we just set slotLen to m.maxSlots and scan
++ // the whole usedSlots array.
++ if slotLen == int(^uint32(0)) {
++ slotLen = m.maxSlots
++ }
++ for i := 0; i < slotLen; i++ {
++ if p := atomic.LoadUintptr(&m.usedSlots[i]); p == physical {
++ return true
++ }
++ }
++ return false
++}
++
++// mapPhysical checks for the mapping of a physical range, and installs one if
++// not available. This attempts to be efficient for calls in the hot path.
++//
++// This throws on error.
++func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalRegion) {
++ for end := physical + length; physical < end; {
++ _, physicalStart, length, pr := calculateBluepillFault(physical, phyRegions)
++ if pr == nil {
++ // Should never happen.
++ throw("mapPhysical on unknown physical address")
++ }
++
++ // Is this already mapped? Check the usedSlots.
++ if !m.hasSlot(physicalStart) {
++ if _, ok := handleBluepillFault(m, physical, phyRegions); !ok {
++ throw("handleBluepillFault failed")
++ }
++ }
++
++ // Move to the next chunk.
++ physical = physicalStart + length
++ }
++}
++
++// Destroy frees associated resources.
++//
++// Destroy should only be called once all active users of the machine are gone.
++// The machine object should not be used after calling Destroy.
++//
++// Precondition: all vCPUs must be returned to the machine.
++func (m *machine) Destroy() {
++ runtime.SetFinalizer(m, nil)
++
++ // Destroy vCPUs.
++ for _, c := range m.vCPUsByID {
++ if c == nil {
++ continue
++ }
++
++ // Ensure the vCPU is not still running in guest mode. This is
++ // possible iff teardown has been done by other threads, and
++ // somehow a single thread has not executed any system calls.
++ c.BounceToHost()
++
++ // Note that the runData may not be mapped if an error occurs
++ // during the middle of initialization.
++ if c.runData != nil {
++ if err := unmapRunData(c.runData); err != nil {
++ panic(fmt.Sprintf("error unmapping rundata: %v", err))
++ }
++ }
++ if err := unix.Close(int(c.fd)); err != nil {
++ panic(fmt.Sprintf("error closing vCPU fd: %v", err))
++ }
++ }
++
++ machinePool[m.machinePoolIndex].Store(nil)
++ seccompMmapSync()
++
++ // vCPUs are gone: teardown machine state.
++ if err := unix.Close(m.fd); err != nil {
++ panic(fmt.Sprintf("error closing VM fd: %v", err))
++ }
++}
++
++// Get gets an available vCPU.
++//
++// This will return with the OS thread locked.
++//
++// It is guaranteed that if any OS thread TID is in guest, m.vCPUs[TID] points
++// to the vCPU in which the OS thread TID is running. So if Get() returns with
++// the current context in guest mode, its vCPU must be the same one that
++// Get() returns.
++func (m *machine) Get() *vCPU {
++ m.mu.RLock()
++ runtime.LockOSThread()
++ tid := hosttid.Current()
++
++ // Check for an exact match.
++ if c := m.vCPUsByTID[tid]; c != nil {
++ c.lock()
++ m.mu.RUnlock()
++ getVCPUCounter.Increment(&getVCPUAcquisitionFastReused)
++ return c
++ }
++
++ // The happy path failed. We now proceed to acquire an exclusive lock
++ // (because the vCPU map may change), and scan all available vCPUs.
++ // In this case, we first unlock the OS thread. Otherwise, if mu is
++ // not available, the current system thread will be parked and a new
++ // system thread spawned. We avoid this situation by simply refreshing
++ // tid after relocking the system thread.
++ m.mu.RUnlock()
++ runtime.UnlockOSThread()
++ m.mu.Lock()
++ runtime.LockOSThread()
++ tid = hosttid.Current()
++
++ // Recheck for an exact match.
++ if c := m.vCPUsByTID[tid]; c != nil {
++ c.lock()
++ m.mu.Unlock()
++ getVCPUCounter.Increment(&getVCPUAcquisitionReused)
++ return c
++ }
++
++ for {
++ // Get vCPU from the m.vCPUsByID pool.
++ if m.usedVCPUs < m.maxVCPUs {
++ c := m.vCPUsByID[m.usedVCPUs]
++ m.usedVCPUs++
++ c.lock()
++ m.vCPUsByTID[tid] = c
++ m.mu.Unlock()
++ c.loadSegments(tid)
++ getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
++ return c
++ }
++
++ // Scan for an available vCPU.
++ for origTID, c := range m.vCPUsByTID {
++ if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
++ delete(m.vCPUsByTID, origTID)
++ m.vCPUsByTID[tid] = c
++ m.mu.Unlock()
++ c.loadSegments(tid)
++ getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
++ return c
++ }
++ }
++
++ // Scan for something not in user mode.
++ for origTID, c := range m.vCPUsByTID {
++ if !c.state.CompareAndSwap(vCPUGuest, vCPUGuest|vCPUWaiter) {
++ continue
++ }
++
++ // The vCPU is not able to transition to
++ // vCPUGuest|vCPUWaiter or to vCPUUser because that
++ // transition requires holding the machine mutex, as we
++ // do now. There is no path to register a waiter on
++ // just the vCPUReady state.
++ for {
++ c.waitUntilNot(vCPUGuest | vCPUWaiter)
++ if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
++ break
++ }
++ }
++
++ // Steal the vCPU.
++ delete(m.vCPUsByTID, origTID)
++ m.vCPUsByTID[tid] = c
++ m.mu.Unlock()
++ c.loadSegments(tid)
++ getVCPUCounter.Increment(&getVCPUAcquisitionStolen)
++ return c
++ }
++
++ // Everything is executing in user mode. Wait until something
++ // is available. Note that signaling the condition variable
++ // will have the extra effect of kicking the vCPUs out of guest
++ // mode if that's where they were.
++ m.available.Wait()
++ }
++}
++
++// Put puts the current vCPU.
++func (m *machine) Put(c *vCPU) {
++ c.unlock()
++ runtime.UnlockOSThread()
++
++ m.mu.RLock()
++ m.available.Signal()
++ m.mu.RUnlock()
++}
++
++// newDirtySet returns a new dirty set.
++func (m *machine) newDirtySet() *dirtySet {
++ return &dirtySet{
++ vCPUMasks: make([]atomicbitops.Uint64,
++ (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64),
++ }
++}
++
++// dropPageTables drops cached page table entries.
++func (m *machine) dropPageTables(pt *pagetables.PageTables) {
++ m.mu.Lock()
++ defer m.mu.Unlock()
++
++ // Clear from all PCIDs.
++ for _, c := range m.vCPUsByID {
++ if c != nil && c.PCIDs != nil {
++ c.PCIDs.Drop(pt)
++ }
++ }
++}
++
++// lock marks the vCPU as in user mode.
++//
++// This should only be called directly when known to be safe, i.e. when
++// the vCPU is owned by the current TID with no chance of theft.
++//
++//go:nosplit
++func (c *vCPU) lock() {
++ atomicbitops.OrUint32(&c.state, vCPUUser)
++}
++
++// unlock clears the vCPUUser bit.
++//
++//go:nosplit
++func (c *vCPU) unlock() {
++ origState := atomicbitops.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest)
++ if origState == vCPUUser|vCPUGuest {
++ // Happy path: no exits are forced, and we can continue
++ // executing on our merry way with a single atomic access.
++ return
++ }
++
++ // Clear the lock.
++ for {
++ state := atomicbitops.CompareAndSwapUint32(&c.state, origState, origState&^vCPUUser)
++ if state == origState {
++ break
++ }
++ origState = state
++ }
++ switch origState {
++ case vCPUUser:
++ // Normal state.
++ case vCPUUser | vCPUGuest | vCPUWaiter:
++ // Force a transition: this must trigger a notification when we
++ // return from guest mode. We must clear vCPUWaiter here
++ // anyway, because BounceToKernel will force a transition only
++ // from ring3 to ring0, which will not clear this bit. Halt may
++ // work around the issue, but if there is no exception or
++ // syscall in this period, BounceToKernel will hang.
++ atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
++ c.notify()
++ case vCPUUser | vCPUWaiter:
++ // Waiting for the lock to be released; the responsibility is
++ // on us to notify the waiter and clear the associated bit.
++ atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
++ c.notify()
++ default:
++ panic("invalid state")
++ }
++}
++
++// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
++//
++//go:nosplit
++func (c *vCPU) NotifyInterrupt() {
++ c.BounceToKernel()
++}
++
++// pid is used below in bounce.
++var pid = unix.Getpid()
++
++// bounce forces a return to the kernel or to host mode.
++//
++// This effectively unwinds the state machine.
++func (c *vCPU) bounce(forceGuestExit bool) {
++ origGuestExits := c.guestExits.Load()
++ origUserExits := c.userExits.Load()
++ for {
++ switch state := c.state.Load(); state {
++ case vCPUReady, vCPUWaiter:
++ // There is nothing to be done, we're already in the
++ // kernel pre-acquisition. The Bounce criteria have
++ // been satisfied.
++ return
++ case vCPUUser:
++ // We need to register a waiter for the actual guest
++ // transition. When the transition takes place, then we
++ // can inject an interrupt to ensure a return to host
++ // mode.
++ c.state.CompareAndSwap(state, state|vCPUWaiter)
++ case vCPUUser | vCPUWaiter:
++ // Wait for the transition to guest mode. This should
++ // come from the bluepill handler.
++ c.waitUntilNot(state)
++ case vCPUGuest, vCPUUser | vCPUGuest:
++ if state == vCPUGuest && !forceGuestExit {
++ // The vCPU is already not acquired, so there's
++ // no need to do a fresh injection here.
++ return
++ }
++ // The vCPU is in user or kernel mode. Attempt to
++ // register a notification on change.
++ if !c.state.CompareAndSwap(state, state|vCPUWaiter) {
++ break // Retry.
++ }
++ for {
++ // We need to spin here until the signal is
++ // delivered, because Tgkill can return EAGAIN
++ // under memory pressure. Since we already
++ // marked ourselves as a waiter, we need to
++ // ensure that a signal is actually delivered.
++ if err := unix.Tgkill(pid, int(c.tid.Load()), bounceSignal); err == nil {
++ break
++ } else if err.(unix.Errno) == unix.EAGAIN {
++ continue
++ } else {
++ // Nothing else should be returned by tgkill.
++ panic(fmt.Sprintf("unexpected tgkill error: %v", err))
++ }
++ }
++ case vCPUGuest | vCPUWaiter, vCPUUser | vCPUGuest | vCPUWaiter:
++ if state == vCPUGuest|vCPUWaiter && !forceGuestExit {
++ // See above.
++ return
++ }
++ // Wait for the transition. This again should happen
++ // from the bluepill handler, but on the way out.
++ c.waitUntilNot(state)
++ default:
++ // Should not happen: the above is exhaustive.
++ panic("invalid state")
++ }
++
++ // Check if we've missed the state transition, but
++ // we can safely return at this point in time.
++ newGuestExits := c.guestExits.Load()
++ newUserExits := c.userExits.Load()
++ if newUserExits != origUserExits && (!forceGuestExit || newGuestExits != origGuestExits) {
++ return
++ }
++ }
++}
++
++// BounceToKernel ensures that the vCPU bounces back to the kernel.
++//
++//go:nosplit
++func (c *vCPU) BounceToKernel() {
++ c.bounce(false)
++}
++
++// BounceToHost ensures that the vCPU is in host mode.
++//
++//go:nosplit
++func (c *vCPU) BounceToHost() {
++ c.bounce(true)
++}
++
++// setSystemTimeLegacy calibrates and sets an approximate system time.
++func (c *vCPU) setSystemTimeLegacy() error {
++ const minIterations = 10
++ minimum := uint64(0)
++ for iter := 0; ; iter++ {
++ // Try to set the TSC to an estimate of where it will be
++ // on the host during a "fast" system call iteration.
++ start := uint64(ktime.Rdtsc())
++ if err := c.setTSC(start + (minimum / 2)); err != nil {
++ return err
++ }
++ // See if this is our new minimum call time. Note that this
++ // serves two functions: one, we make sure that we are
++ // accurately predicting the offset we need to set. Second, we
++ // don't want to do the final set on a slow call, which could
++ // produce a really bad result.
++ end := uint64(ktime.Rdtsc())
++ if end < start {
++ continue // Totally bogus: unstable TSC?
++ }
++ current := end - start
++ if current < minimum || iter == 0 {
++ minimum = current // Set our new minimum.
++ }
++ // Is this past minIterations and within ~10% of minimum?
++ upperThreshold := (((minimum << 3) + minimum) >> 3)
++ if iter >= minIterations && current <= upperThreshold {
++ return nil
++ }
++ }
++}
++
++const machinePoolSize = 16
++
++// machinePool is enumerated from the seccompMmapHandler signal handler
++var (
++ machinePool [machinePoolSize]machineAtomicPtr
++ machinePoolLen atomicbitops.Uint32
++ machinePoolMu sync.Mutex
++ seccompMmapRulesOnce gosync.Once
++)
++
++func sigsysHandler()
++func addrOfSigsysHandler() uintptr
++
++// seccompMmapRules adds seccomp rules to trap mmap system calls that will be
++// handled in seccompMmapHandler.
++func seccompMmapRules(m *machine) {
++ seccompMmapRulesOnce.Do(func() {
++ // Install the handler.
++ if err := sighandling.ReplaceSignalHandler(unix.SIGSYS, addrOfSigsysHandler(), &savedSigsysHandler); err != nil {
++ panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err))
++ }
++ rules := []seccomp.RuleSet{}
++ rules = append(rules, []seccomp.RuleSet{
++ // Trap mmap system calls and handle them in sigsysGoHandler
++ {
++ Rules: seccomp.SyscallRules{
++ unix.SYS_MMAP: {
++ {
++ seccomp.MatchAny{},
++ seccomp.MatchAny{},
++ seccomp.MaskedEqual(unix.PROT_EXEC, 0),
++ /* MAP_DENYWRITE is ignored and used only for filtering. */
++ seccomp.MaskedEqual(unix.MAP_DENYWRITE, 0),
++ },
++ },
++ },
++ Action: linux.SECCOMP_RET_TRAP,
++ },
++ }...)
++ instrs, err := seccomp.BuildProgram(rules, linux.SECCOMP_RET_ALLOW, linux.SECCOMP_RET_ALLOW)
++ if err != nil {
++ panic(fmt.Sprintf("failed to build rules: %v", err))
++ }
++ // Perform the actual installation.
++ if err := seccomp.SetFilter(instrs); err != nil {
++ panic(fmt.Sprintf("failed to set filter: %v", err))
++ }
++ })
++
++ machinePoolMu.Lock()
++ n := machinePoolLen.Load()
++ i := uint32(0)
++ for ; i < n; i++ {
++ if machinePool[i].Load() == nil {
++ break
++ }
++ }
++ if i == n {
++ if i == machinePoolSize {
++ machinePoolMu.Unlock()
++ panic("machinePool is full")
++ }
++ machinePoolLen.Add(1)
++ }
++ machinePool[i].Store(m)
++ m.machinePoolIndex = i
++ machinePoolMu.Unlock()
++}
+--
+2.41.0
+
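
A side note on the dirty-set logic that address_space_debug.go carries over
from the non-debug file: each vCPU gets one bit in an array of 64-bit words,
marking is a set-bit-if-clear, and invalidation drains each word with an
atomic swap. A standalone sketch of the same pattern built only on the
standard library (the vCPU type and names are illustrative stand-ins, not
gVisor's):

    package main

    import (
    	"fmt"
    	"sync/atomic"
    )

    // vCPU is a stand-in for gVisor's vCPU type; only the id matters here.
    type vCPU struct{ id int }

    // dirtySet tracks dirty vCPUs with one bit per vCPU.
    type dirtySet struct {
    	masks []atomic.Uint64
    }

    // mark sets the bit for c and reports whether it was previously clear,
    // i.e. whether the caller needs to flush on entry.
    func (ds *dirtySet) mark(c *vCPU) bool {
    	word, bit := c.id/64, uint64(1)<<uint(c.id%64)
    	for {
    		old := ds.masks[word].Load()
    		if old&bit != 0 {
    			return false // already dirty
    		}
    		if ds.masks[word].CompareAndSwap(old, old|bit) {
    			return true // previously clean
    		}
    	}
    }

    // forEach atomically drains every word and invokes fn for each set bit.
    func (ds *dirtySet) forEach(byID []*vCPU, fn func(*vCPU)) {
    	for w := range ds.masks {
    		mask := ds.masks[w].Swap(0)
    		for bit := 0; mask != 0; bit++ {
    			if mask&(1<<uint(bit)) != 0 {
    				fn(byID[64*w+bit])
    				mask &^= 1 << uint(bit)
    			}
    		}
    	}
    }

    func main() {
    	cpus := []*vCPU{{id: 0}, {id: 1}, {id: 2}}
    	ds := &dirtySet{masks: make([]atomic.Uint64, 1)}
    	fmt.Println(ds.mark(cpus[1])) // true: was clean, flush needed
    	fmt.Println(ds.mark(cpus[1])) // false: already marked
    	ds.forEach(cpus, func(c *vCPU) { fmt.Println("invalidate vCPU", c.id) })
    }

Swapping each word to zero while iterating lets forEach enumerate and clear
the set in a single pass, mirroring what the patched forEach does.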