| 1 | From eb22b742839180a0bdb3953c061da15ba822d56d Mon Sep 17 00:00:00 2001 |
| 2 | From: Tim Windelschmidt <tim@monogon.tech> |
| 3 | Date: Tue, 12 Sep 2023 15:06:49 +0200 |
| 4 | Subject: [PATCH] fix debug builds |
| 5 | |
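| | Build address_space.go, bluepill_unsafe.go and machine.go only when the |
| | kvm_debug build tag is not set, and add kvm_debug-tagged variants |
| | (address_space_debug.go, bluepill_debug_unsafe.go, machine_debug.go) so |
| | that the KVM platform also builds and works in debug configurations. |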
| 6 | --- |
| 7 | pkg/sentry/platform/kvm/address_space.go | 3 + |
| 8 | .../platform/kvm/address_space_debug.go | 242 +++++ |
| 9 | .../platform/kvm/bluepill_debug_unsafe.go | 215 +++++ |
| 10 | pkg/sentry/platform/kvm/bluepill_unsafe.go | 4 +- |
| 11 | pkg/sentry/platform/kvm/machine.go | 3 + |
| 12 | pkg/sentry/platform/kvm/machine_debug.go | 826 ++++++++++++++++++ |
| 13 | 6 files changed, 1291 insertions(+), 2 deletions(-) |
| 14 | create mode 100644 pkg/sentry/platform/kvm/address_space_debug.go |
| 15 | create mode 100644 pkg/sentry/platform/kvm/bluepill_debug_unsafe.go |
| 16 | create mode 100644 pkg/sentry/platform/kvm/machine_debug.go |
| 17 | |
| 18 | diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go |
| 19 | index 79ccbea35..7e30d0365 100644 |
| 20 | --- a/pkg/sentry/platform/kvm/address_space.go |
| 21 | +++ b/pkg/sentry/platform/kvm/address_space.go |
| 22 | @@ -12,6 +12,9 @@ |
| 23 | // See the License for the specific language governing permissions and |
| 24 | // limitations under the License. |
| 25 | |
| 26 | +//go:build !kvm_debug |
| 27 | +// +build !kvm_debug |
| 28 | + |
| 29 | package kvm |
| 30 | |
| 31 | import ( |
| 32 | diff --git a/pkg/sentry/platform/kvm/address_space_debug.go b/pkg/sentry/platform/kvm/address_space_debug.go |
| 33 | new file mode 100644 |
| 34 | index 000000000..69aeba45a |
| 35 | --- /dev/null |
| 36 | +++ b/pkg/sentry/platform/kvm/address_space_debug.go |
| 37 | @@ -0,0 +1,242 @@ |
| 38 | +// Copyright 2018 The gVisor Authors. |
| 39 | +// |
| 40 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 41 | +// you may not use this file except in compliance with the License. |
| 42 | +// You may obtain a copy of the License at |
| 43 | +// |
| 44 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 45 | +// |
| 46 | +// Unless required by applicable law or agreed to in writing, software |
| 47 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 48 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 49 | +// See the License for the specific language governing permissions and |
| 50 | +// limitations under the License. |
| 51 | + |
| 52 | +//go:build kvm_debug |
| 53 | +// +build kvm_debug |
| 54 | + |
| 55 | +package kvm |
| 56 | + |
| 57 | +import ( |
| 58 | + "gvisor.dev/gvisor/pkg/atomicbitops" |
| 59 | + "gvisor.dev/gvisor/pkg/hostarch" |
| 60 | + "gvisor.dev/gvisor/pkg/ring0/pagetables" |
| 61 | + "gvisor.dev/gvisor/pkg/sentry/memmap" |
| 62 | + "gvisor.dev/gvisor/pkg/sentry/platform" |
| 63 | + "gvisor.dev/gvisor/pkg/sync" |
| 64 | +) |
| 65 | + |
| 66 | +// dirtySet tracks vCPUs for invalidation. |
| 67 | +type dirtySet struct { |
| 68 | + vCPUMasks []atomicbitops.Uint64 |
| 69 | +} |
| 70 | + |
| 71 | +// forEach iterates over all CPUs in the dirty set. |
| 72 | +func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) { |
| 73 | + for index := range ds.vCPUMasks { |
| 74 | + mask := ds.vCPUMasks[index].Swap(0) |
| 75 | + if mask != 0 { |
| 76 | + for bit := 0; bit < 64; bit++ { |
| 77 | + if mask&(1<<uint64(bit)) == 0 { |
| 78 | + continue |
| 79 | + } |
| 80 | + id := 64*index + bit |
| 81 | + fn(m.vCPUsByID[id]) |
| 82 | + } |
| 83 | + } |
| 84 | + } |
| 85 | +} |
| 86 | + |
| 87 | +// mark marks the given vCPU as dirty and returns whether it was previously |
| 88 | +// clean. Being previously clean implies that a flush is needed on entry. |
| 89 | +func (ds *dirtySet) mark(c *vCPU) bool { |
| 90 | + index := uint64(c.id) / 64 |
| 91 | + bit := uint64(1) << uint(c.id%64) |
| 92 | + |
| 93 | + oldValue := ds.vCPUMasks[index].Load() |
| 94 | + if oldValue&bit != 0 { |
| 95 | + return false // Not clean. |
| 96 | + } |
| 97 | + |
| 98 | + // Set the bit unilaterally, and ensure that a flush takes place. Note |
| 99 | + // that it's possible for races to occur here, but since the flush is |
| 100 | + // taking place long after these lines there's no race in practice. |
| 101 | + atomicbitops.OrUint64(&ds.vCPUMasks[index], bit) |
| 102 | + return true // Previously clean. |
| 103 | +} |
| 104 | + |
| 105 | +// addressSpace is a wrapper for PageTables. |
| 106 | +type addressSpace struct { |
| 107 | + platform.NoAddressSpaceIO |
| 108 | + |
| 109 | + // mu is the lock for modifications to the address space. |
| 110 | + // |
| 111 | + // Note that the page tables themselves are not locked. |
| 112 | + mu sync.Mutex |
| 113 | + |
| 114 | + // machine is the underlying machine. |
| 115 | + machine *machine |
| 116 | + |
| 117 | + // pageTables are for this particular address space. |
| 118 | + pageTables *pagetables.PageTables |
| 119 | + |
| 120 | + // dirtySet is the set of dirty vCPUs. |
| 121 | + dirtySet *dirtySet |
| 122 | +} |
| 123 | + |
| 124 | +// Invalidate interrupts all dirty contexts. |
| 125 | +func (as *addressSpace) Invalidate() { |
| 126 | + as.mu.Lock() |
| 127 | + defer as.mu.Unlock() |
| 128 | + as.invalidate() |
| 129 | +} |
| 130 | + |
| 131 | +// Touch adds the given vCPU to the dirty list. |
| 132 | +// |
| 133 | +// The return value indicates whether a flush is required. |
| 134 | +func (as *addressSpace) Touch(c *vCPU) bool { |
| 135 | + return as.dirtySet.mark(c) |
| 136 | +} |
| 137 | + |
| 138 | +type hostMapEntry struct { |
| 139 | + addr uintptr |
| 140 | + length uintptr |
| 141 | +} |
| 142 | + |
| 143 | +// mapLocked maps the given host entry. |
| 144 | +// |
| 145 | +// +checkescape:hard,stack |
| 146 | +func (as *addressSpace) mapLocked(addr hostarch.Addr, m hostMapEntry, at hostarch.AccessType) (inv bool) { |
| 147 | + for m.length > 0 { |
| 148 | + physical, length, ok := translateToPhysical(m.addr) |
| 149 | + if !ok { |
| 150 | + panic("unable to translate segment") |
| 151 | + } |
| 152 | + if length > m.length { |
| 153 | + length = m.length |
| 154 | + } |
| 155 | + |
| 156 | + // Ensure that this map has physical mappings. If the page does |
| 157 | + // not have physical mappings, the KVM module may inject |
| 158 | + // spurious exceptions when emulation fails (i.e. it tries to |
| 159 | + // emulate because the RIP is pointed at those pages). |
| 160 | + as.machine.mapPhysical(physical, length, physicalRegions) |
| 161 | + |
| 162 | + // Install the page table mappings. Note that the ordering is |
| 163 | + // important; if the pagetable mappings were installed before |
| 164 | + // ensuring the physical pages were available, then some other |
| 165 | + // thread could theoretically access them. |
| 166 | + inv = as.pageTables.Map(addr, length, pagetables.MapOpts{ |
| 167 | + AccessType: at, |
| 168 | + User: true, |
| 169 | + }, physical) || inv |
| 170 | + m.addr += length |
| 171 | + m.length -= length |
| 172 | + addr += hostarch.Addr(length) |
| 173 | + } |
| 174 | + |
| 175 | + return inv |
| 176 | +} |
| 177 | + |
| 178 | +// MapFile implements platform.AddressSpace.MapFile. |
| 179 | +func (as *addressSpace) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error { |
| 180 | + as.mu.Lock() |
| 181 | + defer as.mu.Unlock() |
| 182 | + |
| 183 | + // Get mappings in the sentry's address space, which are guaranteed to be |
| 184 | + // valid as long as a reference is held on the mapped pages (which is in |
| 185 | + // turn required by AddressSpace.MapFile precondition). |
| 186 | + // |
| 187 | + // If precommit is true, we will touch mappings to commit them, so ensure |
| 188 | + // that mappings are readable from sentry context. |
| 189 | + // |
| 190 | + // We don't execute from application file-mapped memory, and guest page |
| 191 | + // tables don't care if we have execute permission (but they do need pages |
| 192 | + // to be readable). |
| 193 | + bs, err := f.MapInternal(fr, hostarch.AccessType{ |
| 194 | + Read: at.Read || at.Execute || precommit, |
| 195 | + Write: at.Write, |
| 196 | + }) |
| 197 | + if err != nil { |
| 198 | + return err |
| 199 | + } |
| 200 | + |
| 201 | + // See block in mapLocked. |
| 202 | + as.pageTables.Allocator.(*allocator).cpu = as.machine.Get() |
| 203 | + defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu) |
| 204 | + |
| 205 | + // Map the mappings in the sentry's address space (guest physical memory) |
| 206 | + // into the application's address space (guest virtual memory). |
| 207 | + inv := false |
| 208 | + for !bs.IsEmpty() { |
| 209 | + b := bs.Head() |
| 210 | + bs = bs.Tail() |
| 211 | + // Since fr was page-aligned, b should also be page-aligned. We do the |
| 212 | + // lookup in our host page tables for this translation. |
| 213 | + if precommit { |
| 214 | + s := b.ToSlice() |
| 215 | + for i := 0; i < len(s); i += hostarch.PageSize { |
| 216 | + _ = s[i] // Touch to commit. |
| 217 | + } |
| 218 | + } |
| 219 | + |
| 220 | + // See bluepill_allocator.go. |
| 221 | + bluepill(as.pageTables.Allocator.(*allocator).cpu) |
| 222 | + |
| 223 | + // Perform the mapping. |
| 224 | + prev := as.mapLocked(addr, hostMapEntry{ |
| 225 | + addr: b.Addr(), |
| 226 | + length: uintptr(b.Len()), |
| 227 | + }, at) |
| 228 | + inv = inv || prev |
| 229 | + addr += hostarch.Addr(b.Len()) |
| 230 | + } |
| 231 | + if inv { |
| 232 | + as.invalidate() |
| 233 | + } |
| 234 | + |
| 235 | + return nil |
| 236 | +} |
| 237 | + |
| 238 | +// unmapLocked is an escape-checked wrapper around Unmap. |
| 239 | +// |
| 240 | +// +checkescape:hard,stack |
| 241 | +func (as *addressSpace) unmapLocked(addr hostarch.Addr, length uint64) bool { |
| 242 | + return as.pageTables.Unmap(addr, uintptr(length)) |
| 243 | +} |
| 244 | + |
| 245 | +// Unmap unmaps the given range by calling pagetables.PageTables.Unmap. |
| 246 | +func (as *addressSpace) Unmap(addr hostarch.Addr, length uint64) { |
| 247 | + as.mu.Lock() |
| 248 | + defer as.mu.Unlock() |
| 249 | + |
| 250 | + // See above & bluepill_allocator.go. |
| 251 | + as.pageTables.Allocator.(*allocator).cpu = as.machine.Get() |
| 252 | + defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu) |
| 253 | + bluepill(as.pageTables.Allocator.(*allocator).cpu) |
| 254 | + |
| 255 | + if prev := as.unmapLocked(addr, length); prev { |
| 256 | + // Invalidate all active vCPUs. |
| 257 | + as.invalidate() |
| 258 | + |
| 259 | + // Recycle any freed intermediate pages. |
| 260 | + as.pageTables.Allocator.Recycle() |
| 261 | + } |
| 262 | +} |
| 263 | + |
| 264 | +// Release releases the page tables. |
| 265 | +func (as *addressSpace) Release() { |
| 266 | + as.Unmap(0, ^uint64(0)) |
| 267 | + |
| 268 | + // Free all pages from the allocator. |
| 269 | + as.pageTables.Allocator.(*allocator).base.Drain() |
| 270 | + |
| 271 | + // Drop all cached machine references. |
| 272 | + as.machine.dropPageTables(as.pageTables) |
| 273 | +} |
| 274 | + |
| 275 | +// PreFork implements platform.AddressSpace.PreFork. |
| 276 | +func (as *addressSpace) PreFork() {} |
| 277 | + |
| 278 | +// PostFork implements platform.AddressSpace.PostFork. |
| 279 | +func (as *addressSpace) PostFork() {} |
| 280 | diff --git a/pkg/sentry/platform/kvm/bluepill_debug_unsafe.go b/pkg/sentry/platform/kvm/bluepill_debug_unsafe.go |
| 281 | new file mode 100644 |
| 282 | index 000000000..5feb45c19 |
| 283 | --- /dev/null |
| 284 | +++ b/pkg/sentry/platform/kvm/bluepill_debug_unsafe.go |
| 285 | @@ -0,0 +1,215 @@ |
| 286 | +// Copyright 2018 The gVisor Authors. |
| 287 | +// |
| 288 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 289 | +// you may not use this file except in compliance with the License. |
| 290 | +// You may obtain a copy of the License at |
| 291 | +// |
| 292 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 293 | +// |
| 294 | +// Unless required by applicable law or agreed to in writing, software |
| 295 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 296 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 297 | +// See the License for the specific language governing permissions and |
| 298 | +// limitations under the License. |
| 299 | + |
| 300 | +//go:build go1.18 && kvm_debug |
| 301 | +// +build go1.18,kvm_debug |
| 302 | + |
| 303 | +// //go:linkname directives type-checked by checklinkname. Any other |
| 304 | +// non-linkname assumptions outside the Go 1 compatibility guarantee should |
| 305 | +// have an accompanying vet check or version guard build tag. |
| 306 | + |
| 307 | +package kvm |
| 308 | + |
| 309 | +import ( |
| 310 | + "unsafe" |
| 311 | + |
| 312 | + "golang.org/x/sys/unix" |
| 313 | + "gvisor.dev/gvisor/pkg/sentry/arch" |
| 314 | +) |
| 315 | + |
| 316 | +//go:linkname throw runtime.throw |
| 317 | +func throw(s string) |
| 318 | + |
| 319 | +// vCPUPtr returns a CPU for the given address. |
| 320 | +func vCPUPtr(addr uintptr) *vCPU { |
| 321 | + return (*vCPU)(unsafe.Pointer(addr)) |
| 322 | +} |
| 323 | + |
| 324 | +// bytePtr returns a bytePtr for the given address. |
| 325 | +func bytePtr(addr uintptr) *byte { |
| 326 | + return (*byte)(unsafe.Pointer(addr)) |
| 327 | +} |
| 328 | + |
| 329 | +// uintptrValue returns a uintptr for the given address. |
| 330 | +func uintptrValue(addr *byte) uintptr { |
| 331 | + return (uintptr)(unsafe.Pointer(addr)) |
| 332 | +} |
| 333 | + |
| 334 | +// bluepillArchContext returns the UContext64. |
| 335 | +func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 { |
| 336 | + return &((*arch.UContext64)(context).MContext) |
| 337 | +} |
| 338 | + |
| 339 | +// bluepillGuestExit is responsible for handling VM-Exit. |
| 340 | +func bluepillGuestExit(c *vCPU, context unsafe.Pointer) { |
| 341 | + // Increment our counter. |
| 342 | + c.guestExits.Add(1) |
| 343 | + |
| 344 | + // Copy out registers. |
| 345 | + bluepillArchExit(c, bluepillArchContext(context)) |
| 346 | + |
| 347 | + // Return to the vCPUReady state; notify any waiters. |
| 348 | + user := c.state.Load() & vCPUUser |
| 349 | + switch c.state.Swap(user) { |
| 350 | + case user | vCPUGuest: // Expected case. |
| 351 | + case user | vCPUGuest | vCPUWaiter: |
| 352 | + c.notify() |
| 353 | + default: |
| 354 | + throw("invalid state") |
| 355 | + } |
| 356 | +} |
| 357 | + |
| 358 | +var hexSyms = []byte("0123456789abcdef") |
| 359 | + |
| 360 | +func printHex(title []byte, val uint64) { |
| 361 | + var str [18]byte |
| 362 | + for i := 0; i < 16; i++ { |
| 363 | + str[16-i] = hexSyms[val&0xf] |
| 364 | + val = val >> 4 |
| 365 | + } |
| 366 | + str[0] = ' ' |
| 367 | + str[17] = '\n' |
| 368 | + unix.RawSyscall(unix.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&title[0])), uintptr(len(title))) |
| 369 | + unix.RawSyscall(unix.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&str)), 18) |
| 370 | +} |
| 371 | + |
| 372 | +// bluepillHandler is called from the signal stub. |
| 373 | +// |
| 374 | +// The world may be stopped while this is executing, and it executes on the |
| 375 | +// signal stack. It should only execute raw system calls and functions that are |
| 376 | +// explicitly marked go:nosplit. |
| 377 | +// |
| 378 | +// Ideally, this function should switch to gsignal, as runtime.sigtramp does, |
| 379 | +// but that is tedious given all the runtime internals. That said, using |
| 380 | +// gsignal inside a signal handler is not _required_, provided we avoid stack |
| 381 | +// splits and allocations. Note that calling any splittable function here will |
| 382 | +// be flaky; if the signal stack is below the G stack then we will trigger a |
| 383 | +// split and crash. If above, we won't trigger a split. |
| 384 | +// |
| 385 | +// +checkescape:all |
| 386 | +func bluepillHandler(context unsafe.Pointer) { |
| 387 | + // Sanitize the registers; interrupts must always be disabled. |
| 388 | + c := bluepillArchEnter(bluepillArchContext(context)) |
| 389 | + |
| 390 | + // Mark this as guest mode. |
| 391 | + switch c.state.Swap(vCPUGuest | vCPUUser) { |
| 392 | + case vCPUUser: // Expected case. |
| 393 | + case vCPUUser | vCPUWaiter: |
| 394 | + c.notify() |
| 395 | + default: |
| 396 | + throw("invalid state") |
| 397 | + } |
| 398 | + |
| 399 | + for { |
| 400 | + hostExitCounter.Increment() |
| 401 | + _, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0) // escapes: no. |
| 402 | + switch errno { |
| 403 | + case 0: // Expected case. |
| 404 | + case unix.EINTR: |
| 405 | + interruptCounter.Increment() |
| 406 | + // First, we process whatever pending signal |
| 407 | + // interrupted KVM. Since we're in a signal handler |
| 408 | + // currently, all signals are masked and the signal |
| 409 | + // must have been delivered directly to this thread. |
| 410 | + timeout := unix.Timespec{} |
| 411 | + sig, _, errno := unix.RawSyscall6( // escapes: no. |
| 412 | + unix.SYS_RT_SIGTIMEDWAIT, |
| 413 | + uintptr(unsafe.Pointer(&bounceSignalMask)), |
| 414 | + 0, // siginfo. |
| 415 | + uintptr(unsafe.Pointer(&timeout)), // timeout. |
| 416 | + 8, // sigset size. |
| 417 | + 0, 0) |
| 418 | + if errno == unix.EAGAIN { |
| 419 | + continue |
| 420 | + } |
| 421 | + if errno != 0 { |
| 422 | + throw("error waiting for pending signal") |
| 423 | + } |
| 424 | + if sig != uintptr(bounceSignal) { |
| 425 | + throw("unexpected signal") |
| 426 | + } |
| 427 | + |
| 428 | + // Check whether the current state of the vCPU is ready |
| 429 | + // for interrupt injection. Because we don't have a |
| 430 | + // PIC, we can't inject an interrupt while they are |
| 431 | + // masked. We need to request a window if it's not |
| 432 | + // ready. |
| 433 | + if bluepillReadyStopGuest(c) { |
| 434 | + // Force injection below; the vCPU is ready. |
| 435 | + c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN |
| 436 | + } else { |
| 437 | + c.runData.requestInterruptWindow = 1 |
| 438 | + continue // Rerun vCPU. |
| 439 | + } |
| 440 | + case unix.EFAULT: |
| 441 | + // If a fault is not serviceable due to the host |
| 442 | + // backing pages having page permissions, instead of an |
| 443 | + // MMIO exit we receive EFAULT from the run ioctl. We |
| 444 | + // always inject an NMI here since we may be in kernel |
| 445 | + // mode and have interrupts disabled. |
| 446 | + bluepillSigBus(c) |
| 447 | + continue // Rerun vCPU. |
| 448 | + case unix.ENOSYS: |
| 449 | + bluepillHandleEnosys(c) |
| 450 | + continue |
| 451 | + default: |
| 452 | + throw("run failed") |
| 453 | + } |
| 454 | + |
| 455 | + switch c.runData.exitReason { |
| 456 | + case _KVM_EXIT_EXCEPTION: |
| 457 | + c.die(bluepillArchContext(context), "exception") |
| 458 | + return |
| 459 | + case _KVM_EXIT_IO: |
| 460 | + c.die(bluepillArchContext(context), "I/O") |
| 461 | + return |
| 462 | + case _KVM_EXIT_INTERNAL_ERROR: |
| 463 | + // An internal error is typically thrown when emulation |
| 464 | + // fails. This can occur via the MMIO path below (and |
| 465 | + // it might fail because we have multiple regions that |
| 466 | + // are not mapped). We would actually prefer that no |
| 467 | + // emulation occur, and don't mind at all if it fails. |
| 468 | + case _KVM_EXIT_HYPERCALL: |
| 469 | + c.die(bluepillArchContext(context), "hypercall") |
| 470 | + return |
| 471 | + case _KVM_EXIT_DEBUG: |
| 472 | + c.die(bluepillArchContext(context), "debug") |
| 473 | + return |
| 474 | + case _KVM_EXIT_HLT: |
| 475 | + c.hltSanityCheck() |
| 476 | + bluepillGuestExit(c, context) |
| 477 | + return |
| 478 | + case _KVM_EXIT_MMIO: |
| 479 | + physical := uintptr(c.runData.data[0]) |
| 480 | + if getHypercallID(physical) == _KVM_HYPERCALL_VMEXIT { |
| 481 | + bluepillGuestExit(c, context) |
| 482 | + return |
| 483 | + } |
| 484 | + |
| 485 | + c.die(bluepillArchContext(context), "exit_mmio") |
| 486 | + return |
| 487 | + case _KVM_EXIT_IRQ_WINDOW_OPEN: |
| 488 | + bluepillStopGuest(c) |
| 489 | + case _KVM_EXIT_SHUTDOWN: |
| 490 | + c.die(bluepillArchContext(context), "shutdown") |
| 491 | + return |
| 492 | + case _KVM_EXIT_FAIL_ENTRY: |
| 493 | + c.die(bluepillArchContext(context), "entry failed") |
| 494 | + return |
| 495 | + default: |
| 496 | + bluepillArchHandleExit(c, context) |
| 497 | + return |
| 498 | + } |
| 499 | + } |
| 500 | +} |
| 501 | diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go |
| 502 | index 81bd9f814..ad8b966e7 100644 |
| 503 | --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go |
| 504 | +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go |
| 505 | @@ -12,8 +12,8 @@ |
| 506 | // See the License for the specific language governing permissions and |
| 507 | // limitations under the License. |
| 508 | |
| 509 | -//go:build go1.18 |
| 510 | -// +build go1.18 |
| 511 | +//go:build go1.18 && !kvm_debug |
| 512 | +// +build go1.18,!kvm_debug |
| 513 | |
| 514 | // //go:linkname directives type-checked by checklinkname. Any other |
| 515 | // non-linkname assumptions outside the Go 1 compatibility guarantee should |
| 516 | diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go |
| 517 | index f39bf1f06..4f0264db7 100644 |
| 518 | --- a/pkg/sentry/platform/kvm/machine.go |
| 519 | +++ b/pkg/sentry/platform/kvm/machine.go |
| 520 | @@ -12,6 +12,9 @@ |
| 521 | // See the License for the specific language governing permissions and |
| 522 | // limitations under the License. |
| 523 | |
| 524 | +//go:build !kvm_debug |
| 525 | +// +build !kvm_debug |
| 526 | + |
| 527 | package kvm |
| 528 | |
| 529 | import ( |
| 530 | diff --git a/pkg/sentry/platform/kvm/machine_debug.go b/pkg/sentry/platform/kvm/machine_debug.go |
| 531 | new file mode 100644 |
| 532 | index 000000000..0a4735d2d |
| 533 | --- /dev/null |
| 534 | +++ b/pkg/sentry/platform/kvm/machine_debug.go |
| 535 | @@ -0,0 +1,826 @@ |
| 536 | +// Copyright 2018 The gVisor Authors. |
| 537 | +// |
| 538 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 539 | +// you may not use this file except in compliance with the License. |
| 540 | +// You may obtain a copy of the License at |
| 541 | +// |
| 542 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 543 | +// |
| 544 | +// Unless required by applicable law or agreed to in writing, software |
| 545 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 546 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 547 | +// See the License for the specific language governing permissions and |
| 548 | +// limitations under the License. |
| 549 | + |
| 550 | +//go:build kvm_debug |
| 551 | +// +build kvm_debug |
| 552 | + |
| 553 | +package kvm |
| 554 | + |
| 555 | +import ( |
| 556 | + "fmt" |
| 557 | + "runtime" |
| 558 | + gosync "sync" |
| 559 | + "sync/atomic" |
| 560 | + "time" |
| 561 | + |
| 562 | + "golang.org/x/sys/unix" |
| 563 | + "gvisor.dev/gvisor/pkg/abi/linux" |
| 564 | + "gvisor.dev/gvisor/pkg/atomicbitops" |
| 565 | + "gvisor.dev/gvisor/pkg/hostarch" |
| 566 | + "gvisor.dev/gvisor/pkg/hosttid" |
| 567 | + "gvisor.dev/gvisor/pkg/log" |
| 568 | + "gvisor.dev/gvisor/pkg/metric" |
| 569 | + "gvisor.dev/gvisor/pkg/ring0" |
| 570 | + "gvisor.dev/gvisor/pkg/ring0/pagetables" |
| 571 | + "gvisor.dev/gvisor/pkg/seccomp" |
| 572 | + ktime "gvisor.dev/gvisor/pkg/sentry/time" |
| 573 | + "gvisor.dev/gvisor/pkg/sighandling" |
| 574 | + "gvisor.dev/gvisor/pkg/sync" |
| 575 | +) |
| 576 | + |
| 577 | +// machine contains state associated with the VM as a whole. |
| 578 | +type machine struct { |
| 579 | + // fd is the vm fd. |
| 580 | + fd int |
| 581 | + |
| 582 | + // machinePoolIndex is the index in the machinePool array. |
| 583 | + machinePoolIndex uint32 |
| 584 | + |
| 585 | + // nextSlot is the next slot for setMemoryRegion. |
| 586 | + // |
| 587 | + // If nextSlot is ^uint32(0), then slots are currently being updated, and the |
| 588 | + // caller should retry. |
| 589 | + nextSlot atomicbitops.Uint32 |
| 590 | + |
| 591 | + // upperSharedPageTables tracks the read-only shared upper of all the pagetables. |
| 592 | + upperSharedPageTables *pagetables.PageTables |
| 593 | + |
| 594 | + // kernel is the set of global structures. |
| 595 | + kernel ring0.Kernel |
| 596 | + |
| 597 | + // mu protects vCPUs. |
| 598 | + mu sync.RWMutex |
| 599 | + |
| 600 | + // available is notified when vCPUs are available. |
| 601 | + available sync.Cond |
| 602 | + |
| 603 | + // vCPUsByTID are the machine vCPUs. |
| 604 | + // |
| 605 | + // These are populated dynamically. |
| 606 | + vCPUsByTID map[uint64]*vCPU |
| 607 | + |
| 608 | + // vCPUsByID are the machine vCPUs, indexed by the vCPU's ID. |
| 609 | + vCPUsByID []*vCPU |
| 610 | + |
| 611 | + // usedVCPUs is the number of vCPUs that have been used from the |
| 612 | + // vCPUsByID pool. |
| 613 | + usedVCPUs int |
| 614 | + |
| 615 | + // maxVCPUs is the maximum number of vCPUs supported by the machine. |
| 616 | + maxVCPUs int |
| 617 | + |
| 618 | + // maxSlots is the maximum number of memory slots supported by the machine. |
| 619 | + maxSlots int |
| 620 | + |
| 621 | + // tscControl indicates whether the CPU supports TSC scaling. |
| 622 | + tscControl bool |
| 623 | + |
| 624 | + // usedSlots is the set of used physical addresses (not sorted). |
| 625 | + usedSlots []uintptr |
| 626 | +} |
| 627 | + |
| 628 | +const ( |
| 629 | + // vCPUReady is an alias for all of the below bits being clear. |
| 630 | + vCPUReady uint32 = 0 |
| 631 | + |
| 632 | + // vCPUUser indicates that the vCPU is in or about to enter user mode. |
| 633 | + vCPUUser uint32 = 1 << 0 |
| 634 | + |
| 635 | + // vCPUGuest indicates the vCPU is in guest mode. |
| 636 | + vCPUGuest uint32 = 1 << 1 |
| 637 | + |
| 638 | + // vCPUWaiter indicates that there is a waiter. |
| 639 | + // |
| 640 | + // If this is set, then notify must be called on any state transitions. |
| 641 | + vCPUWaiter uint32 = 1 << 2 |
| 642 | +) |
| 643 | + |
| 644 | +// Field values for the get_vcpu metric acquisition path used. |
| 645 | +var ( |
| 646 | + getVCPUAcquisitionFastReused = metric.FieldValue{"fast_reused"} |
| 647 | + getVCPUAcquisitionReused = metric.FieldValue{"reused"} |
| 648 | + getVCPUAcquisitionUnused = metric.FieldValue{"unused"} |
| 649 | + getVCPUAcquisitionStolen = metric.FieldValue{"stolen"} |
| 650 | +) |
| 651 | + |
| 652 | +var ( |
| 653 | + // hostExitCounter is a metric that tracks how many times the sentry |
| 654 | + // performed a host to guest world switch. |
| 655 | + hostExitCounter = metric.MustCreateNewProfilingUint64Metric( |
| 656 | + "/kvm/host_exits", false, "The number of times the sentry performed a host to guest world switch.") |
| 657 | + |
| 658 | + // userExitCounter is a metric that tracks how many times the sentry has |
| 659 | + // had an exit from userspace. Analogous to vCPU.userExits. |
| 660 | + userExitCounter = metric.MustCreateNewProfilingUint64Metric( |
| 661 | + "/kvm/user_exits", false, "The number of times the sentry has had an exit from userspace.") |
| 662 | + |
| 663 | + // interruptCounter is a metric that tracks how many times execution returned |
| 664 | + // to the KVM host to handle a pending signal. |
| 665 | + interruptCounter = metric.MustCreateNewProfilingUint64Metric( |
| 666 | + "/kvm/interrupts", false, "The number of times the signal handler was invoked.") |
| 667 | + |
| 668 | + // mmapCallCounter is a metric that tracks how many times the function |
| 669 | + // seccompMmapSyscall has been called. |
| 670 | + mmapCallCounter = metric.MustCreateNewProfilingUint64Metric( |
| 671 | + "/kvm/mmap_calls", false, "The number of times seccompMmapSyscall has been called.") |
| 672 | + |
| 673 | + // getVCPUCounter is a metric that tracks how many times different paths of |
| 674 | + // machine.Get() are triggered. |
| 675 | + getVCPUCounter = metric.MustCreateNewProfilingUint64Metric( |
| 676 | + "/kvm/get_vcpu", false, "The number of times that machine.Get() was called, split by path the function took.", |
| 677 | + metric.NewField("acquisition_type", &getVCPUAcquisitionFastReused, &getVCPUAcquisitionReused, &getVCPUAcquisitionUnused, &getVCPUAcquisitionStolen)) |
| 678 | + |
| 679 | + // asInvalidateDuration are durations of calling addressSpace.invalidate(). |
| 680 | + asInvalidateDuration = metric.MustCreateNewProfilingTimerMetric("/kvm/address_space_invalidate", |
| 681 | + metric.NewExponentialBucketer(15, uint64(time.Nanosecond*100), 1, 2), |
| 682 | + "Duration of calling addressSpace.invalidate().") |
| 683 | +) |
| 684 | + |
| 685 | +// vCPU is a single KVM vCPU. |
| 686 | +type vCPU struct { |
| 687 | + // CPU is the kernel CPU data. |
| 688 | + // |
| 689 | + // This must be the first element of this structure, it is referenced |
| 690 | + // by the bluepill code (see bluepill_amd64.s). |
| 691 | + ring0.CPU |
| 692 | + |
| 693 | + // id is the vCPU id. |
| 694 | + id int |
| 695 | + |
| 696 | + // fd is the vCPU fd. |
| 697 | + fd int |
| 698 | + |
| 699 | + // tid is the last set tid. |
| 700 | + tid atomicbitops.Uint64 |
| 701 | + |
| 702 | + // userExits is the count of user exits. |
| 703 | + userExits atomicbitops.Uint64 |
| 704 | + |
| 705 | + // guestExits is the count of guest to host world switches. |
| 706 | + guestExits atomicbitops.Uint64 |
| 707 | + |
| 708 | + // faults is a count of world faults (informational only). |
| 709 | + faults uint32 |
| 710 | + |
| 711 | + // state is the vCPU state. |
| 712 | + // |
| 713 | + // This is a bitmask of the three fields (vCPU*) described above. |
| 714 | + state atomicbitops.Uint32 |
| 715 | + |
| 716 | + // runData for this vCPU. |
| 717 | + runData *runData |
| 718 | + |
| 719 | + // machine associated with this vCPU. |
| 720 | + machine *machine |
| 721 | + |
| 722 | + // active is the current addressSpace: this is set and read atomically; |
| 723 | + // it is used to elide unnecessary interrupts due to invalidations. |
| 724 | + active atomicAddressSpace |
| 725 | + |
| 726 | + // vCPUArchState is the architecture-specific state. |
| 727 | + vCPUArchState |
| 728 | + |
| 729 | + // dieState holds state related to vCPU death. |
| 730 | + dieState dieState |
| 731 | +} |
| 732 | + |
| 733 | +type dieState struct { |
| 734 | + // message is thrown from die. |
| 735 | + message string |
| 736 | + |
| 737 | + // guestRegs is used to store register state during vCPU.die() to prevent |
| 738 | + // allocation inside nosplit function. |
| 739 | + guestRegs userRegs |
| 740 | +} |
| 741 | + |
| 742 | +// createVCPU creates and returns a new vCPU. |
| 743 | +// |
| 744 | +// Precondition: mu must be held. |
| 745 | +func (m *machine) createVCPU(id int) *vCPU { |
| 746 | + // Create the vCPU. |
| 747 | + fd, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id)) |
| 748 | + if errno != 0 { |
| 749 | + panic(fmt.Sprintf("error creating new vCPU: %v", errno)) |
| 750 | + } |
| 751 | + |
| 752 | + c := &vCPU{ |
| 753 | + id: id, |
| 754 | + fd: int(fd), |
| 755 | + machine: m, |
| 756 | + } |
| 757 | + c.CPU.Init(&m.kernel, c.id, c) |
| 758 | + m.vCPUsByID[c.id] = c |
| 759 | + |
| 760 | + // Ensure the signal mask is correct. |
| 761 | + if err := c.setSignalMask(); err != nil { |
| 762 | + panic(fmt.Sprintf("error setting signal mask: %v", err)) |
| 763 | + } |
| 764 | + |
| 765 | + // Map the run data. |
| 766 | + runData, err := mapRunData(int(fd)) |
| 767 | + if err != nil { |
| 768 | + panic(fmt.Sprintf("error mapping run data: %v", err)) |
| 769 | + } |
| 770 | + c.runData = runData |
| 771 | + |
| 772 | + // Initialize architecture state. |
| 773 | + if err := c.initArchState(); err != nil { |
| 774 | + panic(fmt.Sprintf("error initialization vCPU state: %v", err)) |
| 775 | + } |
| 776 | + |
| 777 | + return c // Done. |
| 778 | +} |
| 779 | + |
| 780 | +// newMachine returns a new VM context. |
| 781 | +func newMachine(vm int) (*machine, error) { |
| 782 | + // Create the machine. |
| 783 | + m := &machine{fd: vm} |
| 784 | + m.available.L = &m.mu |
| 785 | + |
| 786 | + // Pull the maximum vCPUs. |
| 787 | + m.getMaxVCPU() |
| 788 | + log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs) |
| 789 | + m.vCPUsByTID = make(map[uint64]*vCPU) |
| 790 | + m.vCPUsByID = make([]*vCPU, m.maxVCPUs) |
| 791 | + m.kernel.Init(m.maxVCPUs) |
| 792 | + |
| 793 | + // Pull the maximum slots. |
| 794 | + maxSlots, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_MEMSLOTS) |
| 795 | + if errno != 0 { |
| 796 | + m.maxSlots = _KVM_NR_MEMSLOTS |
| 797 | + } else { |
| 798 | + m.maxSlots = int(maxSlots) |
| 799 | + } |
| 800 | + log.Debugf("The maximum number of slots is %d.", m.maxSlots) |
| 801 | + m.usedSlots = make([]uintptr, m.maxSlots) |
| 802 | + |
| 803 | + // Check TSC Scaling |
| 804 | + hasTSCControl, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_TSC_CONTROL) |
| 805 | + m.tscControl = errno == 0 && hasTSCControl == 1 |
| 806 | + log.Debugf("TSC scaling support: %t.", m.tscControl) |
| 807 | + |
| 808 | + // Create the upper shared pagetables and kernel(sentry) pagetables. |
| 809 | + m.upperSharedPageTables = pagetables.New(newAllocator()) |
| 810 | + m.mapUpperHalf(m.upperSharedPageTables) |
| 811 | + m.upperSharedPageTables.Allocator.(*allocator).base.Drain() |
| 812 | + m.upperSharedPageTables.MarkReadOnlyShared() |
| 813 | + m.kernel.PageTables = pagetables.NewWithUpper(newAllocator(), m.upperSharedPageTables, ring0.KernelStartAddress) |
| 814 | + |
| 815 | + // Install seccomp rules to trap runtime mmap system calls. They will |
| 816 | + // be handled by seccompMmapHandler. |
| 817 | + seccompMmapRules(m) |
| 818 | + |
| 819 | + // Apply the physical mappings. Note that these mappings may point to |
| 820 | + // guest physical addresses that are not actually available. These |
| 821 | + // physical pages are mapped on demand, see kernel_unsafe.go. |
| 822 | + applyPhysicalRegions(func(pr physicalRegion) bool { |
| 823 | + // Map everything in the lower half. |
| 824 | + m.kernel.PageTables.Map( |
| 825 | + hostarch.Addr(pr.virtual), |
| 826 | + pr.length, |
| 827 | + pagetables.MapOpts{AccessType: hostarch.ReadWrite}, |
| 828 | + pr.physical) |
| 829 | + |
| 830 | + return true // Keep iterating. |
| 831 | + }) |
| 832 | + |
| 833 | + // Ensure that the currently mapped virtual regions are actually |
| 834 | + // available in the VM. Note that this doesn't guarantee no future |
| 835 | + // faults, however it should guarantee that everything is available to |
| 836 | + // ensure successful vCPU entry. |
| 837 | + mapRegion := func(vr virtualRegion, flags uint32) { |
| 838 | + for virtual := vr.virtual; virtual < vr.virtual+vr.length; { |
| 839 | + physical, length, ok := translateToPhysical(virtual) |
| 840 | + if !ok { |
| 841 | + // This must be an invalid region that was |
| 842 | + // knocked out by creation of the physical map. |
| 843 | + return |
| 844 | + } |
| 845 | + if virtual+length > vr.virtual+vr.length { |
| 846 | + // Cap the length to the end of the area. |
| 847 | + length = vr.virtual + vr.length - virtual |
| 848 | + } |
| 849 | + // Update page tables for executable mappings. |
| 850 | + if vr.accessType.Execute { |
| 851 | + if vr.accessType.Write { |
| 852 | + panic(fmt.Sprintf("executable mapping can't be writable: %#v", vr)) |
| 853 | + } |
| 854 | + m.kernel.PageTables.Map( |
| 855 | + hostarch.Addr(virtual), |
| 856 | + length, |
| 857 | + pagetables.MapOpts{AccessType: vr.accessType}, |
| 858 | + physical) |
| 859 | + } |
| 860 | + |
| 861 | + // Ensure the physical range is mapped. |
| 862 | + m.mapPhysical(physical, length, physicalRegions) |
| 863 | + virtual += length |
| 864 | + } |
| 865 | + } |
| 866 | + |
| 867 | + // handleBluepillFault takes the slot spinlock and it is called from |
| 868 | + // seccompMmapHandler, so here we have to guarantee that mmap is not |
| 869 | + // called while we hold the slot spinlock. |
| 870 | + disableAsyncPreemption() |
| 871 | + applyVirtualRegions(func(vr virtualRegion) { |
| 872 | + if excludeVirtualRegion(vr) { |
| 873 | + return // skip region. |
| 874 | + } |
| 875 | + // Take into account that the stack can grow down. |
| 876 | + if vr.filename == "[stack]" { |
| 877 | + vr.virtual -= 1 << 20 |
| 878 | + vr.length += 1 << 20 |
| 879 | + } |
| 880 | + |
| 881 | + mapRegion(vr, 0) |
| 882 | + |
| 883 | + }) |
| 884 | + enableAsyncPreemption() |
| 885 | + |
| 886 | + // Initialize architecture state. |
| 887 | + if err := m.initArchState(); err != nil { |
| 888 | + m.Destroy() |
| 889 | + return nil, err |
| 890 | + } |
| 891 | + |
| 892 | + // Ensure the machine is cleaned up properly. |
| 893 | + runtime.SetFinalizer(m, (*machine).Destroy) |
| 894 | + return m, nil |
| 895 | +} |
| 896 | + |
| 897 | +// hasSlot returns true if the given address is mapped. |
| 898 | +// |
| 899 | +// This must be done via a linear scan. |
| 900 | +// |
| 901 | +//go:nosplit |
| 902 | +func (m *machine) hasSlot(physical uintptr) bool { |
| 903 | + slotLen := int(m.nextSlot.Load()) |
| 904 | + // When slots are being updated, nextSlot is ^uint32(0). As this situation |
| 905 | + // is less likely to happen, we just set the slotLen to m.maxSlots, and scan |
| 906 | + // the whole usedSlots array. |
| 907 | + if slotLen == int(^uint32(0)) { |
| 908 | + slotLen = m.maxSlots |
| 909 | + } |
| 910 | + for i := 0; i < slotLen; i++ { |
| 911 | + if p := atomic.LoadUintptr(&m.usedSlots[i]); p == physical { |
| 912 | + return true |
| 913 | + } |
| 914 | + } |
| 915 | + return false |
| 916 | +} |
| 917 | + |
| 918 | +// mapPhysical checks for the mapping of a physical range, and installs one if |
| 919 | +// not available. This attempts to be efficient for calls in the hot path. |
| 920 | +// |
| 921 | +// This throws on error. |
| 922 | +func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalRegion) { |
| 923 | + for end := physical + length; physical < end; { |
| 924 | + _, physicalStart, length, pr := calculateBluepillFault(physical, phyRegions) |
| 925 | + if pr == nil { |
| 926 | + // Should never happen. |
| 927 | + throw("mapPhysical on unknown physical address") |
| 928 | + } |
| 929 | + |
| 930 | + // Is this already mapped? Check the usedSlots. |
| 931 | + if !m.hasSlot(physicalStart) { |
| 932 | + if _, ok := handleBluepillFault(m, physical, phyRegions); !ok { |
| 933 | + throw("handleBluepillFault failed") |
| 934 | + } |
| 935 | + } |
| 936 | + |
| 937 | + // Move to the next chunk. |
| 938 | + physical = physicalStart + length |
| 939 | + } |
| 940 | +} |
| 941 | + |
| 942 | +// Destroy frees associated resources. |
| 943 | +// |
| 944 | +// Destroy should only be called once all active users of the machine are gone. |
| 945 | +// The machine object should not be used after calling Destroy. |
| 946 | +// |
| 947 | +// Precondition: all vCPUs must be returned to the machine. |
| 948 | +func (m *machine) Destroy() { |
| 949 | + runtime.SetFinalizer(m, nil) |
| 950 | + |
| 951 | + // Destroy vCPUs. |
| 952 | + for _, c := range m.vCPUsByID { |
| 953 | + if c == nil { |
| 954 | + continue |
| 955 | + } |
| 956 | + |
| 957 | + // Ensure the vCPU is not still running in guest mode. This is |
| 958 | + // possible iff teardown has been done by other threads, and |
| 959 | + // somehow a single thread has not executed any system calls. |
| 960 | + c.BounceToHost() |
| 961 | + |
| 962 | + // Note that the runData may not be mapped if an error occurs |
| 963 | + // during the middle of initialization. |
| 964 | + if c.runData != nil { |
| 965 | + if err := unmapRunData(c.runData); err != nil { |
| 966 | + panic(fmt.Sprintf("error unmapping rundata: %v", err)) |
| 967 | + } |
| 968 | + } |
| 969 | + if err := unix.Close(int(c.fd)); err != nil { |
| 970 | + panic(fmt.Sprintf("error closing vCPU fd: %v", err)) |
| 971 | + } |
| 972 | + } |
| 973 | + |
| 974 | + machinePool[m.machinePoolIndex].Store(nil) |
| 975 | + seccompMmapSync() |
| 976 | + |
| 977 | + // vCPUs are gone: teardown machine state. |
| 978 | + if err := unix.Close(m.fd); err != nil { |
| 979 | + panic(fmt.Sprintf("error closing VM fd: %v", err)) |
| 980 | + } |
| 981 | +} |
| 982 | + |
| 983 | +// Get gets an available vCPU. |
| 984 | +// |
| 985 | +// This will return with the OS thread locked. |
| 986 | +// |
| 987 | +// It is guaranteed that if any OS thread TID is in guest mode, m.vCPUsByTID[TID] |
| 988 | +// points to the vCPU on which that OS thread is running. So if Get() returns |
| 989 | +// with the current context in guest mode, its vCPU must be the same as what |
| 990 | +// Get() returns. |
| 991 | +func (m *machine) Get() *vCPU { |
| 992 | + m.mu.RLock() |
| 993 | + runtime.LockOSThread() |
| 994 | + tid := hosttid.Current() |
| 995 | + |
| 996 | + // Check for an exact match. |
| 997 | + if c := m.vCPUsByTID[tid]; c != nil { |
| 998 | + c.lock() |
| 999 | + m.mu.RUnlock() |
| 1000 | + getVCPUCounter.Increment(&getVCPUAcquisitionFastReused) |
| 1001 | + return c |
| 1002 | + } |
| 1003 | + |
| 1004 | + // The happy path failed. We now proceed to acquire an exclusive lock |
| 1005 | + // (because the vCPU map may change), and scan all available vCPUs. |
| 1006 | + // In this case, we first unlock the OS thread. Otherwise, if mu is |
| 1007 | + // not available, the current system thread will be parked and a new |
| 1008 | + // system thread spawned. We avoid this situation by simply refreshing |
| 1009 | + // tid after relocking the system thread. |
| 1010 | + m.mu.RUnlock() |
| 1011 | + runtime.UnlockOSThread() |
| 1012 | + m.mu.Lock() |
| 1013 | + runtime.LockOSThread() |
| 1014 | + tid = hosttid.Current() |
| 1015 | + |
| 1016 | + // Recheck for an exact match. |
| 1017 | + if c := m.vCPUsByTID[tid]; c != nil { |
| 1018 | + c.lock() |
| 1019 | + m.mu.Unlock() |
| 1020 | + getVCPUCounter.Increment(&getVCPUAcquisitionReused) |
| 1021 | + return c |
| 1022 | + } |
| 1023 | + |
| 1024 | + for { |
| 1025 | + // Get vCPU from the m.vCPUsByID pool. |
| 1026 | + if m.usedVCPUs < m.maxVCPUs { |
| 1027 | + c := m.vCPUsByID[m.usedVCPUs] |
| 1028 | + m.usedVCPUs++ |
| 1029 | + c.lock() |
| 1030 | + m.vCPUsByTID[tid] = c |
| 1031 | + m.mu.Unlock() |
| 1032 | + c.loadSegments(tid) |
| 1033 | + getVCPUCounter.Increment(&getVCPUAcquisitionUnused) |
| 1034 | + return c |
| 1035 | + } |
| 1036 | + |
| 1037 | + // Scan for an available vCPU. |
| 1038 | + for origTID, c := range m.vCPUsByTID { |
| 1039 | + if c.state.CompareAndSwap(vCPUReady, vCPUUser) { |
| 1040 | + delete(m.vCPUsByTID, origTID) |
| 1041 | + m.vCPUsByTID[tid] = c |
| 1042 | + m.mu.Unlock() |
| 1043 | + c.loadSegments(tid) |
| 1044 | + getVCPUCounter.Increment(&getVCPUAcquisitionUnused) |
| 1045 | + return c |
| 1046 | + } |
| 1047 | + } |
| 1048 | + |
| 1049 | + // Scan for something not in user mode. |
| 1050 | + for origTID, c := range m.vCPUsByTID { |
| 1051 | + if !c.state.CompareAndSwap(vCPUGuest, vCPUGuest|vCPUWaiter) { |
| 1052 | + continue |
| 1053 | + } |
| 1054 | + |
| 1055 | + // The vCPU is not able to transition to |
| 1056 | + // vCPUGuest|vCPUWaiter or to vCPUUser because that |
| 1057 | + // transition requires holding the machine mutex, as we |
| 1058 | + // do now. There is no path to register a waiter on |
| 1059 | + // just the vCPUReady state. |
| 1060 | + for { |
| 1061 | + c.waitUntilNot(vCPUGuest | vCPUWaiter) |
| 1062 | + if c.state.CompareAndSwap(vCPUReady, vCPUUser) { |
| 1063 | + break |
| 1064 | + } |
| 1065 | + } |
| 1066 | + |
| 1067 | + // Steal the vCPU. |
| 1068 | + delete(m.vCPUsByTID, origTID) |
| 1069 | + m.vCPUsByTID[tid] = c |
| 1070 | + m.mu.Unlock() |
| 1071 | + c.loadSegments(tid) |
| 1072 | + getVCPUCounter.Increment(&getVCPUAcquisitionStolen) |
| 1073 | + return c |
| 1074 | + } |
| 1075 | + |
| 1076 | + // Everything is executing in user mode. Wait until something |
| 1077 | + // is available. Note that signaling the condition variable |
| 1078 | + // will have the extra effect of kicking the vCPUs out of guest |
| 1079 | + // mode if that's where they were. |
| 1080 | + m.available.Wait() |
| 1081 | + } |
| 1082 | +} |
| 1083 | + |
| 1084 | +// Put puts the current vCPU. |
| 1085 | +func (m *machine) Put(c *vCPU) { |
| 1086 | + c.unlock() |
| 1087 | + runtime.UnlockOSThread() |
| 1088 | + |
| 1089 | + m.mu.RLock() |
| 1090 | + m.available.Signal() |
| 1091 | + m.mu.RUnlock() |
| 1092 | +} |
| 1093 | + |
| 1094 | +// newDirtySet returns a new dirty set. |
| 1095 | +func (m *machine) newDirtySet() *dirtySet { |
| 1096 | + return &dirtySet{ |
| 1097 | + vCPUMasks: make([]atomicbitops.Uint64, |
| 1098 | + (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64), |
| 1099 | + } |
| 1100 | +} |
| 1101 | + |
| 1102 | +// dropPageTables drops cached page table entries. |
| 1103 | +func (m *machine) dropPageTables(pt *pagetables.PageTables) { |
| 1104 | + m.mu.Lock() |
| 1105 | + defer m.mu.Unlock() |
| 1106 | + |
| 1107 | + // Clear from all PCIDs. |
| 1108 | + for _, c := range m.vCPUsByID { |
| 1109 | + if c != nil && c.PCIDs != nil { |
| 1110 | + c.PCIDs.Drop(pt) |
| 1111 | + } |
| 1112 | + } |
| 1113 | +} |
| 1114 | + |
| 1115 | +// lock marks the vCPU as in user mode. |
| 1116 | +// |
| 1117 | +// This should only be called directly when known to be safe, i.e. when |
| 1118 | +// the vCPU is owned by the current TID with no chance of theft. |
| 1119 | +// |
| 1120 | +//go:nosplit |
| 1121 | +func (c *vCPU) lock() { |
| 1122 | + atomicbitops.OrUint32(&c.state, vCPUUser) |
| 1123 | +} |
| 1124 | + |
| 1125 | +// unlock clears the vCPUUser bit. |
| 1126 | +// |
| 1127 | +//go:nosplit |
| 1128 | +func (c *vCPU) unlock() { |
| 1129 | + origState := atomicbitops.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest) |
| 1130 | + if origState == vCPUUser|vCPUGuest { |
| 1131 | + // Happy path: no exits are forced, and we can continue |
| 1132 | + // executing on our merry way with a single atomic access. |
| 1133 | + return |
| 1134 | + } |
| 1135 | + |
| 1136 | + // Clear the lock. |
| 1137 | + for { |
| 1138 | + state := atomicbitops.CompareAndSwapUint32(&c.state, origState, origState&^vCPUUser) |
| 1139 | + if state == origState { |
| 1140 | + break |
| 1141 | + } |
| 1142 | + origState = state |
| 1143 | + } |
| 1144 | + switch origState { |
| 1145 | + case vCPUUser: |
| 1146 | + // Normal state. |
| 1147 | + case vCPUUser | vCPUGuest | vCPUWaiter: |
| 1148 | + // Force a transition: this must trigger a notification when we |
| 1149 | + // return from guest mode. We must clear vCPUWaiter here |
| 1150 | + // anyway, because BounceToKernel will force a transition only |
| 1151 | + // from ring3 to ring0, which will not clear this bit. Halt may |
| 1152 | + // work around the issue, but if there is no exception or |
| 1153 | + // syscall in this period, BounceToKernel will hang. |
| 1154 | + atomicbitops.AndUint32(&c.state, ^vCPUWaiter) |
| 1155 | + c.notify() |
| 1156 | + case vCPUUser | vCPUWaiter: |
| 1157 | + // Waiting for the lock to be released; the responsibility is |
| 1158 | + // on us to notify the waiter and clear the associated bit. |
| 1159 | + atomicbitops.AndUint32(&c.state, ^vCPUWaiter) |
| 1160 | + c.notify() |
| 1161 | + default: |
| 1162 | + panic("invalid state") |
| 1163 | + } |
| 1164 | +} |
| 1165 | + |
| 1166 | +// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt. |
| 1167 | +// |
| 1168 | +//go:nosplit |
| 1169 | +func (c *vCPU) NotifyInterrupt() { |
| 1170 | + c.BounceToKernel() |
| 1171 | +} |
| 1172 | + |
| 1173 | +// pid is used below in bounce. |
| 1174 | +var pid = unix.Getpid() |
| 1175 | + |
| 1176 | +// bounce forces a return to the kernel or to host mode. |
| 1177 | +// |
| 1178 | +// This effectively unwinds the state machine. |
| 1179 | +func (c *vCPU) bounce(forceGuestExit bool) { |
| 1180 | + origGuestExits := c.guestExits.Load() |
| 1181 | + origUserExits := c.userExits.Load() |
| 1182 | + for { |
| 1183 | + switch state := c.state.Load(); state { |
| 1184 | + case vCPUReady, vCPUWaiter: |
| 1185 | + // There is nothing to be done, we're already in the |
| 1186 | + // kernel pre-acquisition. The Bounce criteria have |
| 1187 | + // been satisfied. |
| 1188 | + return |
| 1189 | + case vCPUUser: |
| 1190 | + // We need to register a waiter for the actual guest |
| 1191 | + // transition. When the transition takes place, then we |
| 1192 | + // can inject an interrupt to ensure a return to host |
| 1193 | + // mode. |
| 1194 | + c.state.CompareAndSwap(state, state|vCPUWaiter) |
| 1195 | + case vCPUUser | vCPUWaiter: |
| 1196 | + // Wait for the transition to guest mode. This should |
| 1197 | + // come from the bluepill handler. |
| 1198 | + c.waitUntilNot(state) |
| 1199 | + case vCPUGuest, vCPUUser | vCPUGuest: |
| 1200 | + if state == vCPUGuest && !forceGuestExit { |
| 1201 | + // The vCPU is already not acquired, so there's |
| 1202 | + // no need to do a fresh injection here. |
| 1203 | + return |
| 1204 | + } |
| 1205 | + // The vCPU is in user or kernel mode. Attempt to |
| 1206 | + // register a notification on change. |
| 1207 | + if !c.state.CompareAndSwap(state, state|vCPUWaiter) { |
| 1208 | + break // Retry. |
| 1209 | + } |
| 1210 | + for { |
| 1211 | + // We need to spin here until the signal is |
| 1212 | + // delivered, because Tgkill can return EAGAIN |
| 1213 | + // under memory pressure. Since we already |
| 1214 | + // marked ourselves as a waiter, we need to |
| 1215 | + // ensure that a signal is actually delivered. |
| 1216 | + if err := unix.Tgkill(pid, int(c.tid.Load()), bounceSignal); err == nil { |
| 1217 | + break |
| 1218 | + } else if err.(unix.Errno) == unix.EAGAIN { |
| 1219 | + continue |
| 1220 | + } else { |
| 1221 | + // Nothing else should be returned by tgkill. |
| 1222 | + panic(fmt.Sprintf("unexpected tgkill error: %v", err)) |
| 1223 | + } |
| 1224 | + } |
| 1225 | + case vCPUGuest | vCPUWaiter, vCPUUser | vCPUGuest | vCPUWaiter: |
| 1226 | + if state == vCPUGuest|vCPUWaiter && !forceGuestExit { |
| 1227 | + // See above. |
| 1228 | + return |
| 1229 | + } |
| 1230 | + // Wait for the transition. This again should happen |
| 1231 | + // from the bluepill handler, but on the way out. |
| 1232 | + c.waitUntilNot(state) |
| 1233 | + default: |
| 1234 | + // Should not happen: the above is exhaustive. |
| 1235 | + panic("invalid state") |
| 1236 | + } |
| 1237 | + |
| 1238 | + // Check if we've missed the state transition, but |
| 1239 | + // we can safely return at this point in time. |
| 1240 | + newGuestExits := c.guestExits.Load() |
| 1241 | + newUserExits := c.userExits.Load() |
| 1242 | + if newUserExits != origUserExits && (!forceGuestExit || newGuestExits != origGuestExits) { |
| 1243 | + return |
| 1244 | + } |
| 1245 | + } |
| 1246 | +} |
| 1247 | + |
| 1248 | +// BounceToKernel ensures that the vCPU bounces back to the kernel. |
| 1249 | +// |
| 1250 | +//go:nosplit |
| 1251 | +func (c *vCPU) BounceToKernel() { |
| 1252 | + c.bounce(false) |
| 1253 | +} |
| 1254 | + |
| 1255 | +// BounceToHost ensures that the vCPU is in host mode. |
| 1256 | +// |
| 1257 | +//go:nosplit |
| 1258 | +func (c *vCPU) BounceToHost() { |
| 1259 | + c.bounce(true) |
| 1260 | +} |
| 1261 | + |
| 1262 | +// setSystemTimeLegacy calibrates and sets an approximate system time. |
| 1263 | +func (c *vCPU) setSystemTimeLegacy() error { |
| 1264 | + const minIterations = 10 |
| 1265 | + minimum := uint64(0) |
| 1266 | + for iter := 0; ; iter++ { |
| 1267 | + // Try to set the TSC to an estimate of where it will be |
| 1268 | + // on the host during a "fast" system call iteration. |
| 1269 | + start := uint64(ktime.Rdtsc()) |
| 1270 | + if err := c.setTSC(start + (minimum / 2)); err != nil { |
| 1271 | + return err |
| 1272 | + } |
| 1273 | + // See if this is our new minimum call time. Note that this |
| 1274 | + // serves two functions: first, we make sure that we are |
| 1275 | + // accurately predicting the offset we need to set. Second, we |
| 1276 | + // don't want to do the final set on a slow call, which could |
| 1277 | + // produce a really bad result. |
| 1278 | + end := uint64(ktime.Rdtsc()) |
| 1279 | + if end < start { |
| 1280 | + continue // Totally bogus: unstable TSC? |
| 1281 | + } |
| 1282 | + current := end - start |
| 1283 | + if current < minimum || iter == 0 { |
| 1284 | + minimum = current // Set our new minimum. |
| 1285 | + } |
| 1286 | + // Is this past minIterations and within ~10% of minimum? |
| 1287 | + upperThreshold := (((minimum << 3) + minimum) >> 3) |
| 1288 | + if iter >= minIterations && current <= upperThreshold { |
| 1289 | + return nil |
| 1290 | + } |
| 1291 | + } |
| 1292 | +} |
| 1293 | + |
| 1294 | +const machinePoolSize = 16 |
| 1295 | + |
| 1296 | +// machinePool is enumerated from the seccompMmapHandler signal handler |
| 1297 | +var ( |
| 1298 | + machinePool [machinePoolSize]machineAtomicPtr |
| 1299 | + machinePoolLen atomicbitops.Uint32 |
| 1300 | + machinePoolMu sync.Mutex |
| 1301 | + seccompMmapRulesOnce gosync.Once |
| 1302 | +) |
| 1303 | + |
| 1304 | +func sigsysHandler() |
| 1305 | +func addrOfSigsysHandler() uintptr |
| 1306 | + |
| 1307 | +// seccompMmapRules adds seccomp rules to trap mmap system calls that will be |
| 1308 | +// handled in seccompMmapHandler. |
| 1309 | +func seccompMmapRules(m *machine) { |
| 1310 | + seccompMmapRulesOnce.Do(func() { |
| 1311 | + // Install the handler. |
| 1312 | + if err := sighandling.ReplaceSignalHandler(unix.SIGSYS, addrOfSigsysHandler(), &savedSigsysHandler); err != nil { |
| 1313 | + panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err)) |
| 1314 | + } |
| 1315 | + rules := []seccomp.RuleSet{} |
| 1316 | + rules = append(rules, []seccomp.RuleSet{ |
| 1317 | + // Trap mmap system calls and handle them in sigsysGoHandler |
| 1318 | + { |
| 1319 | + Rules: seccomp.SyscallRules{ |
| 1320 | + unix.SYS_MMAP: { |
| 1321 | + { |
| 1322 | + seccomp.MatchAny{}, |
| 1323 | + seccomp.MatchAny{}, |
| 1324 | + seccomp.MaskedEqual(unix.PROT_EXEC, 0), |
| 1325 | + /* MAP_DENYWRITE is ignored and used only for filtering. */ |
| 1326 | + seccomp.MaskedEqual(unix.MAP_DENYWRITE, 0), |
| 1327 | + }, |
| 1328 | + }, |
| 1329 | + }, |
| 1330 | + Action: linux.SECCOMP_RET_TRAP, |
| 1331 | + }, |
| 1332 | + }...) |
| 1333 | + instrs, err := seccomp.BuildProgram(rules, linux.SECCOMP_RET_ALLOW, linux.SECCOMP_RET_ALLOW) |
| 1334 | + if err != nil { |
| 1335 | + panic(fmt.Sprintf("failed to build rules: %v", err)) |
| 1336 | + } |
| 1337 | + // Perform the actual installation. |
| 1338 | + if err := seccomp.SetFilter(instrs); err != nil { |
| 1339 | + panic(fmt.Sprintf("failed to set filter: %v", err)) |
| 1340 | + } |
| 1341 | + }) |
| 1342 | + |
| 1343 | + machinePoolMu.Lock() |
| 1344 | + n := machinePoolLen.Load() |
| 1345 | + i := uint32(0) |
| 1346 | + for ; i < n; i++ { |
| 1347 | + if machinePool[i].Load() == nil { |
| 1348 | + break |
| 1349 | + } |
| 1350 | + } |
| 1351 | + if i == n { |
| 1352 | + if i == machinePoolSize { |
| 1353 | + machinePoolMu.Unlock() |
| 1354 | + panic("machinePool is full") |
| 1355 | + } |
| 1356 | + machinePoolLen.Add(1) |
| 1357 | + } |
| 1358 | + machinePool[i].Store(m) |
| 1359 | + m.machinePoolIndex = i |
| 1360 | + machinePoolMu.Unlock() |
| 1361 | +} |
| 1362 | -- |
| 1363 | 2.41.0 |
| 1364 | |