1From eb22b742839180a0bdb3953c061da15ba822d56d Mon Sep 17 00:00:00 2001
2From: Tim Windelschmidt <tim@monogon.tech>
3Date: Tue, 12 Sep 2023 15:06:49 +0200
4Subject: [PATCH] fix debug builds
5
6---
7 pkg/sentry/platform/kvm/address_space.go | 3 +
8 .../platform/kvm/address_space_debug.go | 242 +++++
9 .../platform/kvm/bluepill_debug_unsafe.go | 215 +++++
10 pkg/sentry/platform/kvm/bluepill_unsafe.go | 4 +-
11 pkg/sentry/platform/kvm/machine.go | 3 +
12 pkg/sentry/platform/kvm/machine_debug.go | 826 ++++++++++++++++++
13 6 files changed, 1291 insertions(+), 2 deletions(-)
14 create mode 100644 pkg/sentry/platform/kvm/address_space_debug.go
15 create mode 100644 pkg/sentry/platform/kvm/bluepill_debug_unsafe.go
16 create mode 100644 pkg/sentry/platform/kvm/machine_debug.go
17
18diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go
19index 79ccbea35..7e30d0365 100644
20--- a/pkg/sentry/platform/kvm/address_space.go
21+++ b/pkg/sentry/platform/kvm/address_space.go
22@@ -12,6 +12,9 @@
23 // See the License for the specific language governing permissions and
24 // limitations under the License.
25
26+//go:build !kvm_debug
27+// +build !kvm_debug
28+
29 package kvm
30
31 import (
32diff --git a/pkg/sentry/platform/kvm/address_space_debug.go b/pkg/sentry/platform/kvm/address_space_debug.go
33new file mode 100644
34index 000000000..69aeba45a
35--- /dev/null
36+++ b/pkg/sentry/platform/kvm/address_space_debug.go
37@@ -0,0 +1,242 @@
38+// Copyright 2018 The gVisor Authors.
39+//
40+// Licensed under the Apache License, Version 2.0 (the "License");
41+// you may not use this file except in compliance with the License.
42+// You may obtain a copy of the License at
43+//
44+// http://www.apache.org/licenses/LICENSE-2.0
45+//
46+// Unless required by applicable law or agreed to in writing, software
47+// distributed under the License is distributed on an "AS IS" BASIS,
48+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
49+// See the License for the specific language governing permissions and
50+// limitations under the License.
51+
52+//go:build kvm_debug
53+// +build kvm_debug
54+
55+package kvm
56+
57+import (
58+ "gvisor.dev/gvisor/pkg/atomicbitops"
59+ "gvisor.dev/gvisor/pkg/hostarch"
60+ "gvisor.dev/gvisor/pkg/ring0/pagetables"
61+ "gvisor.dev/gvisor/pkg/sentry/memmap"
62+ "gvisor.dev/gvisor/pkg/sentry/platform"
63+ "gvisor.dev/gvisor/pkg/sync"
64+)
65+
66+// dirtySet tracks vCPUs for invalidation.
67+type dirtySet struct {
68+ vCPUMasks []atomicbitops.Uint64
69+}
70+
71+// forEach iterates over all CPUs in the dirty set.
72+func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) {
73+ for index := range ds.vCPUMasks {
74+ mask := ds.vCPUMasks[index].Swap(0)
75+ if mask != 0 {
76+ for bit := 0; bit < 64; bit++ {
77+ if mask&(1<<uint64(bit)) == 0 {
78+ continue
79+ }
80+ id := 64*index + bit
81+ fn(m.vCPUsByID[id])
82+ }
83+ }
84+ }
85+}
86+
87+// mark marks the given vCPU as dirty and returns whether it was previously
88+// clean. Being previously clean implies that a flush is needed on entry.
89+func (ds *dirtySet) mark(c *vCPU) bool {
90+ index := uint64(c.id) / 64
91+ bit := uint64(1) << uint(c.id%64)
92+
93+ oldValue := ds.vCPUMasks[index].Load()
94+ if oldValue&bit != 0 {
95+ return false // Not clean.
96+ }
97+
98+ // Set the bit unilaterally, and ensure that a flush takes place. Note
99+ // that it's possible for races to occur here, but since the flush is
100+ // taking place long after these lines there's no race in practice.
101+ atomicbitops.OrUint64(&ds.vCPUMasks[index], bit)
102+ return true // Previously clean.
103+}
104+
105+// addressSpace is a wrapper for PageTables.
106+type addressSpace struct {
107+ platform.NoAddressSpaceIO
108+
109+ // mu is the lock for modifications to the address space.
110+ //
111+ // Note that the page tables themselves are not locked.
112+ mu sync.Mutex
113+
114+ // machine is the underlying machine.
115+ machine *machine
116+
117+ // pageTables are for this particular address space.
118+ pageTables *pagetables.PageTables
119+
120+ // dirtySet is the set of dirty vCPUs.
121+ dirtySet *dirtySet
122+}
123+
124+// Invalidate interrupts all dirty contexts.
125+func (as *addressSpace) Invalidate() {
126+ as.mu.Lock()
127+ defer as.mu.Unlock()
128+ as.invalidate()
129+}
130+
131+// Touch adds the given vCPU to the dirty list.
132+//
133+// The return value indicates whether a flush is required.
134+func (as *addressSpace) Touch(c *vCPU) bool {
135+ return as.dirtySet.mark(c)
136+}
137+
138+type hostMapEntry struct {
139+ addr uintptr
140+ length uintptr
141+}
142+
143+// mapLocked maps the given host entry.
144+//
145+// +checkescape:hard,stack
146+func (as *addressSpace) mapLocked(addr hostarch.Addr, m hostMapEntry, at hostarch.AccessType) (inv bool) {
147+ for m.length > 0 {
148+ physical, length, ok := translateToPhysical(m.addr)
149+ if !ok {
150+ panic("unable to translate segment")
151+ }
152+ if length > m.length {
153+ length = m.length
154+ }
155+
156+ // Ensure that this map has physical mappings. If the page does
157+ // not have physical mappings, the KVM module may inject
158+ // spurious exceptions when emulation fails (i.e. it tries to
159+ // emulate because the RIP is pointed at those pages).
160+ as.machine.mapPhysical(physical, length, physicalRegions)
161+
162+ // Install the page table mappings. Note that the ordering is
163+ // important; if the pagetable mappings were installed before
164+ // ensuring the physical pages were available, then some other
165+ // thread could theoretically access them.
166+ inv = as.pageTables.Map(addr, length, pagetables.MapOpts{
167+ AccessType: at,
168+ User: true,
169+ }, physical) || inv
170+ m.addr += length
171+ m.length -= length
172+ addr += hostarch.Addr(length)
173+ }
174+
175+ return inv
176+}
177+
178+// MapFile implements platform.AddressSpace.MapFile.
179+func (as *addressSpace) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error {
180+ as.mu.Lock()
181+ defer as.mu.Unlock()
182+
183+ // Get mappings in the sentry's address space, which are guaranteed to be
184+ // valid as long as a reference is held on the mapped pages (which is in
185+ // turn required by AddressSpace.MapFile precondition).
186+ //
187+ // If precommit is true, we will touch mappings to commit them, so ensure
188+ // that mappings are readable from sentry context.
189+ //
190+ // We don't execute from application file-mapped memory, and guest page
191+ // tables don't care if we have execute permission (but they do need pages
192+ // to be readable).
193+ bs, err := f.MapInternal(fr, hostarch.AccessType{
194+ Read: at.Read || at.Execute || precommit,
195+ Write: at.Write,
196+ })
197+ if err != nil {
198+ return err
199+ }
200+
201+ // See block in mapLocked.
202+ as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
203+ defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)
204+
205+ // Map the mappings in the sentry's address space (guest physical memory)
206+ // into the application's address space (guest virtual memory).
207+ inv := false
208+ for !bs.IsEmpty() {
209+ b := bs.Head()
210+ bs = bs.Tail()
211+ // Since fr was page-aligned, b should also be page-aligned. We do the
212+ // lookup in our host page tables for this translation.
213+ if precommit {
214+ s := b.ToSlice()
215+ for i := 0; i < len(s); i += hostarch.PageSize {
216+ _ = s[i] // Touch to commit.
217+ }
218+ }
219+
220+ // See bluepill_allocator.go.
221+ bluepill(as.pageTables.Allocator.(*allocator).cpu)
222+
223+ // Perform the mapping.
224+ prev := as.mapLocked(addr, hostMapEntry{
225+ addr: b.Addr(),
226+ length: uintptr(b.Len()),
227+ }, at)
228+ inv = inv || prev
229+ addr += hostarch.Addr(b.Len())
230+ }
231+ if inv {
232+ as.invalidate()
233+ }
234+
235+ return nil
236+}
237+
238+// unmapLocked is an escape-checked wrapper around Unmap.
239+//
240+// +checkescape:hard,stack
241+func (as *addressSpace) unmapLocked(addr hostarch.Addr, length uint64) bool {
242+ return as.pageTables.Unmap(addr, uintptr(length))
243+}
244+
245+// Unmap unmaps the given range by calling pagetables.PageTables.Unmap.
246+func (as *addressSpace) Unmap(addr hostarch.Addr, length uint64) {
247+ as.mu.Lock()
248+ defer as.mu.Unlock()
249+
250+ // See above & bluepill_allocator.go.
251+ as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
252+ defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)
253+ bluepill(as.pageTables.Allocator.(*allocator).cpu)
254+
255+ if prev := as.unmapLocked(addr, length); prev {
256+ // Invalidate all active vCPUs.
257+ as.invalidate()
258+
259+ // Recycle any freed intermediate pages.
260+ as.pageTables.Allocator.Recycle()
261+ }
262+}
263+
264+// Release releases the page tables.
265+func (as *addressSpace) Release() {
266+ as.Unmap(0, ^uint64(0))
267+
268+ // Free all pages from the allocator.
269+ as.pageTables.Allocator.(*allocator).base.Drain()
270+
271+ // Drop all cached machine references.
272+ as.machine.dropPageTables(as.pageTables)
273+}
274+
275+// PreFork implements platform.AddressSpace.PreFork.
276+func (as *addressSpace) PreFork() {}
277+
278+// PostFork implements platform.AddressSpace.PostFork.
279+func (as *addressSpace) PostFork() {}
280diff --git a/pkg/sentry/platform/kvm/bluepill_debug_unsafe.go b/pkg/sentry/platform/kvm/bluepill_debug_unsafe.go
281new file mode 100644
282index 000000000..5feb45c19
283--- /dev/null
284+++ b/pkg/sentry/platform/kvm/bluepill_debug_unsafe.go
285@@ -0,0 +1,215 @@
286+// Copyright 2018 The gVisor Authors.
287+//
288+// Licensed under the Apache License, Version 2.0 (the "License");
289+// you may not use this file except in compliance with the License.
290+// You may obtain a copy of the License at
291+//
292+// http://www.apache.org/licenses/LICENSE-2.0
293+//
294+// Unless required by applicable law or agreed to in writing, software
295+// distributed under the License is distributed on an "AS IS" BASIS,
296+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
297+// See the License for the specific language governing permissions and
298+// limitations under the License.
299+
300+//go:build go1.18 && kvm_debug
301+// +build go1.18,kvm_debug
302+
303+// //go:linkname directives type-checked by checklinkname. Any other
304+// non-linkname assumptions outside the Go 1 compatibility guarantee should
305+// have an accompanying vet check or version guard build tag.
306+
307+package kvm
308+
309+import (
310+ "unsafe"
311+
312+ "golang.org/x/sys/unix"
313+ "gvisor.dev/gvisor/pkg/sentry/arch"
314+)
315+
316+//go:linkname throw runtime.throw
317+func throw(s string)
318+
319+// vCPUPtr returns a CPU for the given address.
320+func vCPUPtr(addr uintptr) *vCPU {
321+ return (*vCPU)(unsafe.Pointer(addr))
322+}
323+
324+// bytePtr returns a bytePtr for the given address.
325+func bytePtr(addr uintptr) *byte {
326+ return (*byte)(unsafe.Pointer(addr))
327+}
328+
329+// uintptrValue returns a uintptr for the given address.
330+func uintptrValue(addr *byte) uintptr {
331+ return (uintptr)(unsafe.Pointer(addr))
332+}
333+
334+// bluepillArchContext returns the UContext64.
335+func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
336+ return &((*arch.UContext64)(context).MContext)
337+}
338+
339+// bluepillGuestExit is responsible for handling a VM-Exit.
340+func bluepillGuestExit(c *vCPU, context unsafe.Pointer) {
341+ // Increment our counter.
342+ c.guestExits.Add(1)
343+
344+ // Copy out registers.
345+ bluepillArchExit(c, bluepillArchContext(context))
346+
347+ // Return to the vCPUReady state; notify any waiters.
348+ user := c.state.Load() & vCPUUser
349+ switch c.state.Swap(user) {
350+ case user | vCPUGuest: // Expected case.
351+ case user | vCPUGuest | vCPUWaiter:
352+ c.notify()
353+ default:
354+ throw("invalid state")
355+ }
356+}
357+
358+var hexSyms = []byte("0123456789abcdef")
359+
360+func printHex(title []byte, val uint64) {
361+ var str [18]byte
362+ for i := 0; i < 16; i++ {
363+ str[16-i] = hexSyms[val&0xf]
364+ val = val >> 4
365+ }
366+ str[0] = ' '
367+ str[17] = '\n'
368+ unix.RawSyscall(unix.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&title[0])), uintptr(len(title)))
369+ unix.RawSyscall(unix.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&str)), 18)
370+}
371+
372+// bluepillHandler is called from the signal stub.
373+//
374+// The world may be stopped while this is executing, and it executes on the
375+// signal stack. It should only execute raw system calls and functions that are
376+// explicitly marked go:nosplit.
377+//
378+// Ideally, this function should switch to gsignal, as runtime.sigtramp does,
379+// but that is tedious given all the runtime internals. That said, using
380+// gsignal inside a signal handler is not _required_, provided we avoid stack
381+// splits and allocations. Note that calling any splittable function here will
382+// be flaky; if the signal stack is below the G stack then we will trigger a
383+// split and crash. If above, we won't trigger a split.
384+//
385+// +checkescape:all
386+func bluepillHandler(context unsafe.Pointer) {
387+ // Sanitize the registers; interrupts must always be disabled.
388+ c := bluepillArchEnter(bluepillArchContext(context))
389+
390+ // Mark this as guest mode.
391+ switch c.state.Swap(vCPUGuest | vCPUUser) {
392+ case vCPUUser: // Expected case.
393+ case vCPUUser | vCPUWaiter:
394+ c.notify()
395+ default:
396+ throw("invalid state")
397+ }
398+
399+ for {
400+ hostExitCounter.Increment()
401+ _, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0) // escapes: no.
402+ switch errno {
403+ case 0: // Expected case.
404+ case unix.EINTR:
405+ interruptCounter.Increment()
406+ // First, we process whatever pending signal
407+ // interrupted KVM. Since we're in a signal handler
408+ // currently, all signals are masked and the signal
409+ // must have been delivered directly to this thread.
410+ timeout := unix.Timespec{}
411+ sig, _, errno := unix.RawSyscall6( // escapes: no.
412+ unix.SYS_RT_SIGTIMEDWAIT,
413+ uintptr(unsafe.Pointer(&bounceSignalMask)),
414+ 0, // siginfo.
415+ uintptr(unsafe.Pointer(&timeout)), // timeout.
416+ 8, // sigset size.
417+ 0, 0)
418+ if errno == unix.EAGAIN {
419+ continue
420+ }
421+ if errno != 0 {
422+ throw("error waiting for pending signal")
423+ }
424+ if sig != uintptr(bounceSignal) {
425+ throw("unexpected signal")
426+ }
427+
428+ // Check whether the current state of the vCPU is ready
429+ // for interrupt injection. Because we don't have a
430+ // PIC, we can't inject an interrupt while they are
431+ // masked. We need to request a window if it's not
432+ // ready.
433+ if bluepillReadyStopGuest(c) {
434+ // Force injection below; the vCPU is ready.
435+ c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN
436+ } else {
437+ c.runData.requestInterruptWindow = 1
438+ continue // Rerun vCPU.
439+ }
440+ case unix.EFAULT:
441+ // If a fault is not serviceable due to the host
442+ // backing pages having page permissions, instead of an
443+ // MMIO exit we receive EFAULT from the run ioctl. We
444+ // always inject an NMI here since we may be in kernel
445+ // mode and have interrupts disabled.
446+ bluepillSigBus(c)
447+ continue // Rerun vCPU.
448+ case unix.ENOSYS:
449+ bluepillHandleEnosys(c)
450+ continue
451+ default:
452+ throw("run failed")
453+ }
454+
455+ switch c.runData.exitReason {
456+ case _KVM_EXIT_EXCEPTION:
457+ c.die(bluepillArchContext(context), "exception")
458+ return
459+ case _KVM_EXIT_IO:
460+ c.die(bluepillArchContext(context), "I/O")
461+ return
462+ case _KVM_EXIT_INTERNAL_ERROR:
463+ // An internal error is typically thrown when emulation
464+ // fails. This can occur via the MMIO path below (and
465+ // it might fail because we have multiple regions that
466+ // are not mapped). We would actually prefer that no
467+ // emulation occur, and don't mind at all if it fails.
468+ case _KVM_EXIT_HYPERCALL:
469+ c.die(bluepillArchContext(context), "hypercall")
470+ return
471+ case _KVM_EXIT_DEBUG:
472+ c.die(bluepillArchContext(context), "debug")
473+ return
474+ case _KVM_EXIT_HLT:
475+ c.hltSanityCheck()
476+ bluepillGuestExit(c, context)
477+ return
478+ case _KVM_EXIT_MMIO:
479+ physical := uintptr(c.runData.data[0])
480+ if getHypercallID(physical) == _KVM_HYPERCALL_VMEXIT {
481+ bluepillGuestExit(c, context)
482+ return
483+ }
484+
485+ c.die(bluepillArchContext(context), "exit_mmio")
486+ return
487+ case _KVM_EXIT_IRQ_WINDOW_OPEN:
488+ bluepillStopGuest(c)
489+ case _KVM_EXIT_SHUTDOWN:
490+ c.die(bluepillArchContext(context), "shutdown")
491+ return
492+ case _KVM_EXIT_FAIL_ENTRY:
493+ c.die(bluepillArchContext(context), "entry failed")
494+ return
495+ default:
496+ bluepillArchHandleExit(c, context)
497+ return
498+ }
499+ }
500+}
501diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
502index 81bd9f814..ad8b966e7 100644
503--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
504+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
505@@ -12,8 +12,8 @@
506 // See the License for the specific language governing permissions and
507 // limitations under the License.
508
509-//go:build go1.18
510-// +build go1.18
511+//go:build go1.18 && !kvm_debug
512+// +build go1.18,!kvm_debug
513
514 // //go:linkname directives type-checked by checklinkname. Any other
515 // non-linkname assumptions outside the Go 1 compatibility guarantee should
516diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
517index f39bf1f06..4f0264db7 100644
518--- a/pkg/sentry/platform/kvm/machine.go
519+++ b/pkg/sentry/platform/kvm/machine.go
520@@ -12,6 +12,9 @@
521 // See the License for the specific language governing permissions and
522 // limitations under the License.
523
524+//go:build !kvm_debug
525+// +build !kvm_debug
526+
527 package kvm
528
529 import (
530diff --git a/pkg/sentry/platform/kvm/machine_debug.go b/pkg/sentry/platform/kvm/machine_debug.go
531new file mode 100644
532index 000000000..0a4735d2d
533--- /dev/null
534+++ b/pkg/sentry/platform/kvm/machine_debug.go
535@@ -0,0 +1,826 @@
536+// Copyright 2018 The gVisor Authors.
537+//
538+// Licensed under the Apache License, Version 2.0 (the "License");
539+// you may not use this file except in compliance with the License.
540+// You may obtain a copy of the License at
541+//
542+// http://www.apache.org/licenses/LICENSE-2.0
543+//
544+// Unless required by applicable law or agreed to in writing, software
545+// distributed under the License is distributed on an "AS IS" BASIS,
546+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
547+// See the License for the specific language governing permissions and
548+// limitations under the License.
549+
550+//go:build kvm_debug
551+// +build kvm_debug
552+
553+package kvm
554+
555+import (
556+ "fmt"
557+ "runtime"
558+ gosync "sync"
559+ "sync/atomic"
560+ "time"
561+
562+ "golang.org/x/sys/unix"
563+ "gvisor.dev/gvisor/pkg/abi/linux"
564+ "gvisor.dev/gvisor/pkg/atomicbitops"
565+ "gvisor.dev/gvisor/pkg/hostarch"
566+ "gvisor.dev/gvisor/pkg/hosttid"
567+ "gvisor.dev/gvisor/pkg/log"
568+ "gvisor.dev/gvisor/pkg/metric"
569+ "gvisor.dev/gvisor/pkg/ring0"
570+ "gvisor.dev/gvisor/pkg/ring0/pagetables"
571+ "gvisor.dev/gvisor/pkg/seccomp"
572+ ktime "gvisor.dev/gvisor/pkg/sentry/time"
573+ "gvisor.dev/gvisor/pkg/sighandling"
574+ "gvisor.dev/gvisor/pkg/sync"
575+)
576+
577+// machine contains state associated with the VM as a whole.
578+type machine struct {
579+ // fd is the vm fd.
580+ fd int
581+
582+ // machinePoolIndex is the index in the machinePool array.
583+ machinePoolIndex uint32
584+
585+ // nextSlot is the next slot for setMemoryRegion.
586+ //
587+ // If nextSlot is ^uint32(0), then slots are currently being updated, and the
588+ // caller should retry.
589+ nextSlot atomicbitops.Uint32
590+
591+ // upperSharedPageTables tracks the read-only shared upper of all the pagetables.
592+ upperSharedPageTables *pagetables.PageTables
593+
594+ // kernel is the set of global structures.
595+ kernel ring0.Kernel
596+
597+ // mu protects vCPUs.
598+ mu sync.RWMutex
599+
600+ // available is notified when vCPUs are available.
601+ available sync.Cond
602+
603+ // vCPUsByTID are the machine vCPUs.
604+ //
605+ // These are populated dynamically.
606+ vCPUsByTID map[uint64]*vCPU
607+
608+ // vCPUsByID are the machine vCPUs, can be indexed by the vCPU's ID.
609+ vCPUsByID []*vCPU
610+
611+ // usedVCPUs is the number of vCPUs that have been used from the
612+ // vCPUsByID pool.
613+ usedVCPUs int
614+
615+ // maxVCPUs is the maximum number of vCPUs supported by the machine.
616+ maxVCPUs int
617+
618+ // maxSlots is the maximum number of memory slots supported by the machine.
619+ maxSlots int
620+
621+ // tscControl is true if the CPU supports TSC scaling.
622+ tscControl bool
623+
624+ // usedSlots is the set of used physical addresses (not sorted).
625+ usedSlots []uintptr
626+}
627+
628+const (
629+ // vCPUReady is an alias for all of the below bits being clear.
630+ vCPUReady uint32 = 0
631+
632+ // vCPUUser indicates that the vCPU is in or about to enter user mode.
633+ vCPUUser uint32 = 1 << 0
634+
635+ // vCPUGuest indicates the vCPU is in guest mode.
636+ vCPUGuest uint32 = 1 << 1
637+
638+ // vCPUWaiter indicates that there is a waiter.
639+ //
640+ // If this is set, then notify must be called on any state transitions.
641+ vCPUWaiter uint32 = 1 << 2
642+)
643+
644+// Field values for the acquisition path used by the get_vcpu metric.
645+var (
646+ getVCPUAcquisitionFastReused = metric.FieldValue{"fast_reused"}
647+ getVCPUAcquisitionReused = metric.FieldValue{"reused"}
648+ getVCPUAcquisitionUnused = metric.FieldValue{"unused"}
649+ getVCPUAcquisitionStolen = metric.FieldValue{"stolen"}
650+)
651+
652+var (
653+ // hostExitCounter is a metric that tracks how many times the sentry
654+ // performed a host to guest world switch.
655+ hostExitCounter = metric.MustCreateNewProfilingUint64Metric(
656+ "/kvm/host_exits", false, "The number of times the sentry performed a host to guest world switch.")
657+
658+ // userExitCounter is a metric that tracks how many times the sentry has
659+ // had an exit from userspace. Analogous to vCPU.userExits.
660+ userExitCounter = metric.MustCreateNewProfilingUint64Metric(
661+ "/kvm/user_exits", false, "The number of times the sentry has had an exit from userspace.")
662+
663+ // interruptCounter is a metric that tracks how many times execution returned
664+ // to the KVM host to handle a pending signal.
665+ interruptCounter = metric.MustCreateNewProfilingUint64Metric(
666+ "/kvm/interrupts", false, "The number of times the signal handler was invoked.")
667+
668+ // mmapCallCounter is a metric that tracks how many times the function
669+ // seccompMmapSyscall has been called.
670+ mmapCallCounter = metric.MustCreateNewProfilingUint64Metric(
671+ "/kvm/mmap_calls", false, "The number of times seccompMmapSyscall has been called.")
672+
673+ // getVCPUCounter is a metric that tracks how many times different paths of
674+ // machine.Get() are triggered.
675+ getVCPUCounter = metric.MustCreateNewProfilingUint64Metric(
676+ "/kvm/get_vcpu", false, "The number of times that machine.Get() was called, split by path the function took.",
677+ metric.NewField("acquisition_type", &getVCPUAcquisitionFastReused, &getVCPUAcquisitionReused, &getVCPUAcquisitionUnused, &getVCPUAcquisitionStolen))
678+
679+ // asInvalidateDuration are durations of calling addressSpace.invalidate().
680+ asInvalidateDuration = metric.MustCreateNewProfilingTimerMetric("/kvm/address_space_invalidate",
681+ metric.NewExponentialBucketer(15, uint64(time.Nanosecond*100), 1, 2),
682+ "Duration of calling addressSpace.invalidate().")
683+)
684+
685+// vCPU is a single KVM vCPU.
686+type vCPU struct {
687+ // CPU is the kernel CPU data.
688+ //
689+ // This must be the first element of this structure, it is referenced
690+ // by the bluepill code (see bluepill_amd64.s).
691+ ring0.CPU
692+
693+ // id is the vCPU id.
694+ id int
695+
696+ // fd is the vCPU fd.
697+ fd int
698+
699+ // tid is the last set tid.
700+ tid atomicbitops.Uint64
701+
702+ // userExits is the count of user exits.
703+ userExits atomicbitops.Uint64
704+
705+ // guestExits is the count of guest to host world switches.
706+ guestExits atomicbitops.Uint64
707+
708+ // faults is a count of world faults (informational only).
709+ faults uint32
710+
711+ // state is the vCPU state.
712+ //
713+ // This is a bitmask of the three fields (vCPU*) described above.
714+ state atomicbitops.Uint32
715+
716+ // runData for this vCPU.
717+ runData *runData
718+
719+ // machine associated with this vCPU.
720+ machine *machine
721+
722+ // active is the current addressSpace: this is set and read atomically,
723+ // it is used to elide unnecessary interrupts due to invalidations.
724+ active atomicAddressSpace
725+
726+ // vCPUArchState is the architecture-specific state.
727+ vCPUArchState
728+
729+ // dieState holds state related to vCPU death.
730+ dieState dieState
731+}
732+
733+type dieState struct {
734+ // message is thrown from die.
735+ message string
736+
737+ // guestRegs is used to store register state during vCPU.die() to prevent
738+ // allocation inside nosplit function.
739+ guestRegs userRegs
740+}
741+
742+// createVCPU creates and returns a new vCPU.
743+//
744+// Precondition: mu must be held.
745+func (m *machine) createVCPU(id int) *vCPU {
746+ // Create the vCPU.
747+ fd, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id))
748+ if errno != 0 {
749+ panic(fmt.Sprintf("error creating new vCPU: %v", errno))
750+ }
751+
752+ c := &vCPU{
753+ id: id,
754+ fd: int(fd),
755+ machine: m,
756+ }
757+ c.CPU.Init(&m.kernel, c.id, c)
758+ m.vCPUsByID[c.id] = c
759+
760+ // Ensure the signal mask is correct.
761+ if err := c.setSignalMask(); err != nil {
762+ panic(fmt.Sprintf("error setting signal mask: %v", err))
763+ }
764+
765+ // Map the run data.
766+ runData, err := mapRunData(int(fd))
767+ if err != nil {
768+ panic(fmt.Sprintf("error mapping run data: %v", err))
769+ }
770+ c.runData = runData
771+
772+ // Initialize architecture state.
773+ if err := c.initArchState(); err != nil {
774+ panic(fmt.Sprintf("error initialization vCPU state: %v", err))
775+ }
776+
777+ return c // Done.
778+}
779+
780+// newMachine returns a new VM context.
781+func newMachine(vm int) (*machine, error) {
782+ // Create the machine.
783+ m := &machine{fd: vm}
784+ m.available.L = &m.mu
785+
786+ // Pull the maximum vCPUs.
787+ m.getMaxVCPU()
788+ log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
789+ m.vCPUsByTID = make(map[uint64]*vCPU)
790+ m.vCPUsByID = make([]*vCPU, m.maxVCPUs)
791+ m.kernel.Init(m.maxVCPUs)
792+
793+ // Pull the maximum slots.
794+ maxSlots, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_MEMSLOTS)
795+ if errno != 0 {
796+ m.maxSlots = _KVM_NR_MEMSLOTS
797+ } else {
798+ m.maxSlots = int(maxSlots)
799+ }
800+ log.Debugf("The maximum number of slots is %d.", m.maxSlots)
801+ m.usedSlots = make([]uintptr, m.maxSlots)
802+
803+ // Check TSC Scaling
804+ hasTSCControl, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_TSC_CONTROL)
805+ m.tscControl = errno == 0 && hasTSCControl == 1
806+ log.Debugf("TSC scaling support: %t.", m.tscControl)
807+
808+ // Create the upper shared pagetables and kernel(sentry) pagetables.
809+ m.upperSharedPageTables = pagetables.New(newAllocator())
810+ m.mapUpperHalf(m.upperSharedPageTables)
811+ m.upperSharedPageTables.Allocator.(*allocator).base.Drain()
812+ m.upperSharedPageTables.MarkReadOnlyShared()
813+ m.kernel.PageTables = pagetables.NewWithUpper(newAllocator(), m.upperSharedPageTables, ring0.KernelStartAddress)
814+
815+ // Install seccomp rules to trap runtime mmap system calls. They will
816+ // be handled by seccompMmapHandler.
817+ seccompMmapRules(m)
818+
819+ // Apply the physical mappings. Note that these mappings may point to
820+ // guest physical addresses that are not actually available. These
821+ // physical pages are mapped on demand, see kernel_unsafe.go.
822+ applyPhysicalRegions(func(pr physicalRegion) bool {
823+ // Map everything in the lower half.
824+ m.kernel.PageTables.Map(
825+ hostarch.Addr(pr.virtual),
826+ pr.length,
827+ pagetables.MapOpts{AccessType: hostarch.ReadWrite},
828+ pr.physical)
829+
830+ return true // Keep iterating.
831+ })
832+
833+ // Ensure that the currently mapped virtual regions are actually
834+ // available in the VM. Note that this doesn't guarantee no future
835+ // faults, however it should guarantee that everything is available to
836+ // ensure successful vCPU entry.
837+ mapRegion := func(vr virtualRegion, flags uint32) {
838+ for virtual := vr.virtual; virtual < vr.virtual+vr.length; {
839+ physical, length, ok := translateToPhysical(virtual)
840+ if !ok {
841+ // This must be an invalid region that was
842+ // knocked out by creation of the physical map.
843+ return
844+ }
845+ if virtual+length > vr.virtual+vr.length {
846+ // Cap the length to the end of the area.
847+ length = vr.virtual + vr.length - virtual
848+ }
849+ // Update page tables for executable mappings.
850+ if vr.accessType.Execute {
851+ if vr.accessType.Write {
852+ panic(fmt.Sprintf("executable mapping can't be writable: %#v", vr))
853+ }
854+ m.kernel.PageTables.Map(
855+ hostarch.Addr(virtual),
856+ length,
857+ pagetables.MapOpts{AccessType: vr.accessType},
858+ physical)
859+ }
860+
861+ // Ensure the physical range is mapped.
862+ m.mapPhysical(physical, length, physicalRegions)
863+ virtual += length
864+ }
865+ }
866+
867+ // handleBluepillFault takes the slot spinlock and it is called from
868+ // seccompMmapHandler, so here we have to guarantee that mmap is not
869+ // called while we hold the slot spinlock.
870+ disableAsyncPreemption()
871+ applyVirtualRegions(func(vr virtualRegion) {
872+ if excludeVirtualRegion(vr) {
873+ return // skip region.
874+ }
875+ // Take into account that the stack can grow down.
876+ if vr.filename == "[stack]" {
877+ vr.virtual -= 1 << 20
878+ vr.length += 1 << 20
879+ }
880+
881+ mapRegion(vr, 0)
882+
883+ })
884+ enableAsyncPreemption()
885+
886+ // Initialize architecture state.
887+ if err := m.initArchState(); err != nil {
888+ m.Destroy()
889+ return nil, err
890+ }
891+
892+ // Ensure the machine is cleaned up properly.
893+ runtime.SetFinalizer(m, (*machine).Destroy)
894+ return m, nil
895+}
896+
897+// hasSlot returns true if the given address is mapped.
898+//
899+// This must be done via a linear scan.
900+//
901+//go:nosplit
902+func (m *machine) hasSlot(physical uintptr) bool {
903+ slotLen := int(m.nextSlot.Load())
904+ // When slots are being updated, nextSlot is ^uint32(0). As this situation
905+ // is unlikely to happen, we just set slotLen to m.maxSlots and scan
906+ // the whole usedSlots array.
907+ if slotLen == int(^uint32(0)) {
908+ slotLen = m.maxSlots
909+ }
910+ for i := 0; i < slotLen; i++ {
911+ if p := atomic.LoadUintptr(&m.usedSlots[i]); p == physical {
912+ return true
913+ }
914+ }
915+ return false
916+}
917+
918+// mapPhysical checks for the mapping of a physical range, and installs one if
919+// not available. This attempts to be efficient for calls in the hot path.
920+//
921+// This throws on error.
922+func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalRegion) {
923+ for end := physical + length; physical < end; {
924+ _, physicalStart, length, pr := calculateBluepillFault(physical, phyRegions)
925+ if pr == nil {
926+ // Should never happen.
927+ throw("mapPhysical on unknown physical address")
928+ }
929+
930+ // Is this already mapped? Check the usedSlots.
931+ if !m.hasSlot(physicalStart) {
932+ if _, ok := handleBluepillFault(m, physical, phyRegions); !ok {
933+ throw("handleBluepillFault failed")
934+ }
935+ }
936+
937+ // Move to the next chunk.
938+ physical = physicalStart + length
939+ }
940+}
941+
942+// Destroy frees associated resources.
943+//
944+// Destroy should only be called once all active users of the machine are gone.
945+// The machine object should not be used after calling Destroy.
946+//
947+// Precondition: all vCPUs must be returned to the machine.
948+func (m *machine) Destroy() {
949+ runtime.SetFinalizer(m, nil)
950+
951+ // Destroy vCPUs.
952+ for _, c := range m.vCPUsByID {
953+ if c == nil {
954+ continue
955+ }
956+
957+ // Ensure the vCPU is not still running in guest mode. This is
958+ // possible iff teardown has been done by other threads, and
959+ // somehow a single thread has not executed any system calls.
960+ c.BounceToHost()
961+
962+ // Note that the runData may not be mapped if an error occurs
963+ // during the middle of initialization.
964+ if c.runData != nil {
965+ if err := unmapRunData(c.runData); err != nil {
966+ panic(fmt.Sprintf("error unmapping rundata: %v", err))
967+ }
968+ }
969+ if err := unix.Close(int(c.fd)); err != nil {
970+ panic(fmt.Sprintf("error closing vCPU fd: %v", err))
971+ }
972+ }
973+
974+ machinePool[m.machinePoolIndex].Store(nil)
975+ seccompMmapSync()
976+
977+ // vCPUs are gone: teardown machine state.
978+ if err := unix.Close(m.fd); err != nil {
979+ panic(fmt.Sprintf("error closing VM fd: %v", err))
980+ }
981+}
982+
983+// Get gets an available vCPU.
984+//
985+// This will return with the OS thread locked.
986+//
987+// It is guaranteed that if any OS thread TID is in guest mode, m.vCPUsByTID[TID]
988+// points to the vCPU on which that OS thread is running. So if Get() returns
989+// with the current context in guest mode, its vCPU must be the same as what
990+// Get() returns.
991+func (m *machine) Get() *vCPU {
992+ m.mu.RLock()
993+ runtime.LockOSThread()
994+ tid := hosttid.Current()
995+
996+ // Check for an exact match.
997+ if c := m.vCPUsByTID[tid]; c != nil {
998+ c.lock()
999+ m.mu.RUnlock()
1000+ getVCPUCounter.Increment(&getVCPUAcquisitionFastReused)
1001+ return c
1002+ }
1003+
1004+ // The happy path failed. We now proceed to acquire an exclusive lock
1005+ // (because the vCPU map may change), and scan all available vCPUs.
1006+ // In this case, we first unlock the OS thread. Otherwise, if mu is
1007+ // not available, the current system thread will be parked and a new
1008+ // system thread spawned. We avoid this situation by simply refreshing
1009+ // tid after relocking the system thread.
1010+ m.mu.RUnlock()
1011+ runtime.UnlockOSThread()
1012+ m.mu.Lock()
1013+ runtime.LockOSThread()
1014+ tid = hosttid.Current()
1015+
1016+ // Recheck for an exact match.
1017+ if c := m.vCPUsByTID[tid]; c != nil {
1018+ c.lock()
1019+ m.mu.Unlock()
1020+ getVCPUCounter.Increment(&getVCPUAcquisitionReused)
1021+ return c
1022+ }
1023+
1024+ for {
1025+ // Get vCPU from the m.vCPUsByID pool.
1026+ if m.usedVCPUs < m.maxVCPUs {
1027+ c := m.vCPUsByID[m.usedVCPUs]
1028+ m.usedVCPUs++
1029+ c.lock()
1030+ m.vCPUsByTID[tid] = c
1031+ m.mu.Unlock()
1032+ c.loadSegments(tid)
1033+ getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
1034+ return c
1035+ }
1036+
1037+ // Scan for an available vCPU.
1038+ for origTID, c := range m.vCPUsByTID {
1039+ if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
1040+ delete(m.vCPUsByTID, origTID)
1041+ m.vCPUsByTID[tid] = c
1042+ m.mu.Unlock()
1043+ c.loadSegments(tid)
1044+ getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
1045+ return c
1046+ }
1047+ }
1048+
1049+ // Scan for something not in user mode.
1050+ for origTID, c := range m.vCPUsByTID {
1051+ if !c.state.CompareAndSwap(vCPUGuest, vCPUGuest|vCPUWaiter) {
1052+ continue
1053+ }
1054+
1055+ // The vCPU is not able to transition to
1056+ // vCPUGuest|vCPUWaiter or to vCPUUser because that
1057+ // transition requires holding the machine mutex, as we
1058+ // do now. There is no path to register a waiter on
1059+ // just the vCPUReady state.
1060+ for {
1061+ c.waitUntilNot(vCPUGuest | vCPUWaiter)
1062+ if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
1063+ break
1064+ }
1065+ }
1066+
1067+ // Steal the vCPU.
1068+ delete(m.vCPUsByTID, origTID)
1069+ m.vCPUsByTID[tid] = c
1070+ m.mu.Unlock()
1071+ c.loadSegments(tid)
1072+ getVCPUCounter.Increment(&getVCPUAcquisitionStolen)
1073+ return c
1074+ }
1075+
1076+ // Everything is executing in user mode. Wait until something
1077+ // is available. Note that signaling the condition variable
1078+ // will have the extra effect of kicking the vCPUs out of guest
1079+ // mode if that's where they were.
1080+ m.available.Wait()
1081+ }
1082+}
1083+
1084+// Put puts the current vCPU.
1085+func (m *machine) Put(c *vCPU) {
1086+ c.unlock()
1087+ runtime.UnlockOSThread()
1088+
1089+ m.mu.RLock()
1090+ m.available.Signal()
1091+ m.mu.RUnlock()
1092+}
1093+
1094+// newDirtySet returns a new dirty set.
1095+func (m *machine) newDirtySet() *dirtySet {
1096+ return &dirtySet{
1097+ vCPUMasks: make([]atomicbitops.Uint64,
1098+ (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64),
1099+ }
1100+}
1101+
1102+// dropPageTables drops cached page table entries.
1103+func (m *machine) dropPageTables(pt *pagetables.PageTables) {
1104+ m.mu.Lock()
1105+ defer m.mu.Unlock()
1106+
1107+ // Clear from all PCIDs.
1108+ for _, c := range m.vCPUsByID {
1109+ if c != nil && c.PCIDs != nil {
1110+ c.PCIDs.Drop(pt)
1111+ }
1112+ }
1113+}
1114+
1115+// lock marks the vCPU as in user mode.
1116+//
1117+// This should only be called directly when known to be safe, i.e. when
1118+// the vCPU is owned by the current TID with no chance of theft.
1119+//
1120+//go:nosplit
1121+func (c *vCPU) lock() {
1122+ atomicbitops.OrUint32(&c.state, vCPUUser)
1123+}
1124+
1125+// unlock clears the vCPUUser bit.
1126+//
1127+//go:nosplit
1128+func (c *vCPU) unlock() {
1129+ origState := atomicbitops.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest)
1130+ if origState == vCPUUser|vCPUGuest {
1131+ // Happy path: no exits are forced, and we can continue
1132+ // executing on our merry way with a single atomic access.
1133+ return
1134+ }
1135+
1136+ // Clear the lock.
1137+ for {
1138+ state := atomicbitops.CompareAndSwapUint32(&c.state, origState, origState&^vCPUUser)
1139+ if state == origState {
1140+ break
1141+ }
1142+ origState = state
1143+ }
1144+ switch origState {
1145+ case vCPUUser:
1146+ // Normal state.
1147+ case vCPUUser | vCPUGuest | vCPUWaiter:
1148+ // Force a transition: this must trigger a notification when we
1149+ // return from guest mode. We must clear vCPUWaiter here
1150+ // anyway, because BounceToKernel will force a transition only
1151+ // from ring3 to ring0, which will not clear this bit. Halt may
1152+ // work around the issue, but if there is no exception or
1153+ // syscall in this period, BounceToKernel will hang.
1154+ atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
1155+ c.notify()
1156+ case vCPUUser | vCPUWaiter:
1157+ // Waiting for the lock to be released; the responsibility is
1158+ // on us to notify the waiter and clear the associated bit.
1159+ atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
1160+ c.notify()
1161+ default:
1162+ panic("invalid state")
1163+ }
1164+}
1165+
1166+// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
1167+//
1168+//go:nosplit
1169+func (c *vCPU) NotifyInterrupt() {
1170+ c.BounceToKernel()
1171+}
1172+
1173+// pid is used below in bounce.
1174+var pid = unix.Getpid()
1175+
1176+// bounce forces a return to the kernel or to host mode.
1177+//
1178+// This effectively unwinds the state machine.
1179+func (c *vCPU) bounce(forceGuestExit bool) {
1180+ origGuestExits := c.guestExits.Load()
1181+ origUserExits := c.userExits.Load()
1182+ for {
1183+ switch state := c.state.Load(); state {
1184+ case vCPUReady, vCPUWaiter:
1185+ // There is nothing to be done, we're already in the
1186+ // kernel pre-acquisition. The Bounce criteria have
1187+ // been satisfied.
1188+ return
1189+ case vCPUUser:
1190+ // We need to register a waiter for the actual guest
1191+ // transition. When the transition takes place, then we
1192+ // can inject an interrupt to ensure a return to host
1193+ // mode.
1194+ c.state.CompareAndSwap(state, state|vCPUWaiter)
1195+ case vCPUUser | vCPUWaiter:
1196+ // Wait for the transition to guest mode. This should
1197+ // come from the bluepill handler.
1198+ c.waitUntilNot(state)
1199+ case vCPUGuest, vCPUUser | vCPUGuest:
1200+ if state == vCPUGuest && !forceGuestExit {
1201+ // The vCPU is already not acquired, so there's
1202+ // no need to do a fresh injection here.
1203+ return
1204+ }
1205+ // The vCPU is in user or kernel mode. Attempt to
1206+ // register a notification on change.
1207+ if !c.state.CompareAndSwap(state, state|vCPUWaiter) {
1208+ break // Retry.
1209+ }
1210+ for {
1211+ // We need to spin here until the signal is
1212+ // delivered, because Tgkill can return EAGAIN
1213+ // under memory pressure. Since we already
1214+ // marked ourselves as a waiter, we need to
1215+ // ensure that a signal is actually delivered.
1216+ if err := unix.Tgkill(pid, int(c.tid.Load()), bounceSignal); err == nil {
1217+ break
1218+ } else if err.(unix.Errno) == unix.EAGAIN {
1219+ continue
1220+ } else {
1221+ // Nothing else should be returned by tgkill.
1222+ panic(fmt.Sprintf("unexpected tgkill error: %v", err))
1223+ }
1224+ }
1225+ case vCPUGuest | vCPUWaiter, vCPUUser | vCPUGuest | vCPUWaiter:
1226+ if state == vCPUGuest|vCPUWaiter && !forceGuestExit {
1227+ // See above.
1228+ return
1229+ }
1230+ // Wait for the transition. This again should happen
1231+ // from the bluepill handler, but on the way out.
1232+ c.waitUntilNot(state)
1233+ default:
1234+ // Should not happen: the above is exhaustive.
1235+ panic("invalid state")
1236+ }
1237+
1238+ // Check if we've missed the state transition, but
1239+ // we can safely return at this point in time.
1240+ newGuestExits := c.guestExits.Load()
1241+ newUserExits := c.userExits.Load()
1242+ if newUserExits != origUserExits && (!forceGuestExit || newGuestExits != origGuestExits) {
1243+ return
1244+ }
1245+ }
1246+}
1247+
1248+// BounceToKernel ensures that the vCPU bounces back to the kernel.
1249+//
1250+//go:nosplit
1251+func (c *vCPU) BounceToKernel() {
1252+ c.bounce(false)
1253+}
1254+
1255+// BounceToHost ensures that the vCPU is in host mode.
1256+//
1257+//go:nosplit
1258+func (c *vCPU) BounceToHost() {
1259+ c.bounce(true)
1260+}
1261+
1262+// setSystemTimeLegacy calibrates and sets an approximate system time.
1263+func (c *vCPU) setSystemTimeLegacy() error {
1264+ const minIterations = 10
1265+ minimum := uint64(0)
1266+ for iter := 0; ; iter++ {
1267+ // Try to set the TSC to an estimate of where it will be
1268+ // on the host during a "fast" system call iteration.
1269+ start := uint64(ktime.Rdtsc())
1270+ if err := c.setTSC(start + (minimum / 2)); err != nil {
1271+ return err
1272+ }
1273+ // See if this is our new minimum call time. Note that this
1274+ // serves two functions: one, we make sure that we are
1275+ // accurately predicting the offset we need to set. Second, we
1276+ // don't want to do the final set on a slow call, which could
1277+ // produce a really bad result.
1278+ end := uint64(ktime.Rdtsc())
1279+ if end < start {
1280+ continue // Totally bogus: unstable TSC?
1281+ }
1282+ current := end - start
1283+ if current < minimum || iter == 0 {
1284+ minimum = current // Set our new minimum.
1285+ }
1286+ // Is this past minIterations and within ~10% of minimum?
1287+ upperThreshold := (((minimum << 3) + minimum) >> 3)
1288+ if iter >= minIterations && current <= upperThreshold {
1289+ return nil
1290+ }
1291+ }
1292+}
1293+
1294+const machinePoolSize = 16
1295+
1296+// machinePool is enumerated from the seccompMmapHandler signal handler
1297+var (
1298+ machinePool [machinePoolSize]machineAtomicPtr
1299+ machinePoolLen atomicbitops.Uint32
1300+ machinePoolMu sync.Mutex
1301+ seccompMmapRulesOnce gosync.Once
1302+)
1303+
1304+func sigsysHandler()
1305+func addrOfSigsysHandler() uintptr
1306+
1307+// seccompMmapRules adds seccomp rules to trap mmap system calls that will be
1308+// handled in seccompMmapHandler.
1309+func seccompMmapRules(m *machine) {
1310+ seccompMmapRulesOnce.Do(func() {
1311+ // Install the handler.
1312+ if err := sighandling.ReplaceSignalHandler(unix.SIGSYS, addrOfSigsysHandler(), &savedSigsysHandler); err != nil {
1313+ panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err))
1314+ }
1315+ rules := []seccomp.RuleSet{}
1316+ rules = append(rules, []seccomp.RuleSet{
1317+ // Trap mmap system calls and handle them in sigsysGoHandler
1318+ {
1319+ Rules: seccomp.SyscallRules{
1320+ unix.SYS_MMAP: {
1321+ {
1322+ seccomp.MatchAny{},
1323+ seccomp.MatchAny{},
1324+ seccomp.MaskedEqual(unix.PROT_EXEC, 0),
1325+ /* MAP_DENYWRITE is ignored and used only for filtering. */
1326+ seccomp.MaskedEqual(unix.MAP_DENYWRITE, 0),
1327+ },
1328+ },
1329+ },
1330+ Action: linux.SECCOMP_RET_TRAP,
1331+ },
1332+ }...)
1333+ instrs, err := seccomp.BuildProgram(rules, linux.SECCOMP_RET_ALLOW, linux.SECCOMP_RET_ALLOW)
1334+ if err != nil {
1335+ panic(fmt.Sprintf("failed to build rules: %v", err))
1336+ }
1337+ // Perform the actual installation.
1338+ if err := seccomp.SetFilter(instrs); err != nil {
1339+ panic(fmt.Sprintf("failed to set filter: %v", err))
1340+ }
1341+ })
1342+
1343+ machinePoolMu.Lock()
1344+ n := machinePoolLen.Load()
1345+ i := uint32(0)
1346+ for ; i < n; i++ {
1347+ if machinePool[i].Load() == nil {
1348+ break
1349+ }
1350+ }
1351+ if i == n {
1352+ if i == machinePoolSize {
1353+ machinePoolMu.Unlock()
1354+ panic("machinePool is full")
1355+ }
1356+ machinePoolLen.Add(1)
1357+ }
1358+ machinePool[i].Store(m)
1359+ m.machinePoolIndex = i
1360+ machinePoolMu.Unlock()
1361+}
1362--
13632.41.0
1364