| 1 | From eb22b742839180a0bdb3953c061da15ba822d56d Mon Sep 17 00:00:00 2001 |
| 2 | From: Tim Windelschmidt <tim@monogon.tech> |
| 3 | Date: Tue, 12 Sep 2023 15:06:49 +0200 |
| 4 | Subject: [PATCH] fix debug builds |
| 5 | |
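| | Build address_space.go, bluepill_unsafe.go and machine.go only when the |
| | kvm_debug build tag is not set, and add kvm_debug-tagged variants |
| | (address_space_debug.go, bluepill_debug_unsafe.go, machine_debug.go) so |
| | that the KVM platform also builds and works in debug configurations. |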
| 6 | --- |
| 7 | pkg/sentry/platform/kvm/address_space.go | 3 + |
| 8 | .../platform/kvm/address_space_debug.go | 242 +++++ |
| 9 | .../platform/kvm/bluepill_debug_unsafe.go | 215 +++++ |
| 10 | pkg/sentry/platform/kvm/bluepill_unsafe.go | 4 +- |
| 11 | pkg/sentry/platform/kvm/machine.go | 3 + |
| 12 | pkg/sentry/platform/kvm/machine_debug.go | 826 ++++++++++++++++++ |
| 13 | 6 files changed, 1291 insertions(+), 2 deletions(-) |
| 14 | create mode 100644 pkg/sentry/platform/kvm/address_space_debug.go |
| 15 | create mode 100644 pkg/sentry/platform/kvm/bluepill_debug_unsafe.go |
| 16 | create mode 100644 pkg/sentry/platform/kvm/machine_debug.go |
| 17 | |
| 18 | diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go |
| 19 | index 79ccbea35..7e30d0365 100644 |
| 20 | --- a/pkg/sentry/platform/kvm/address_space.go |
| 21 | +++ b/pkg/sentry/platform/kvm/address_space.go |
| 22 | @@ -12,6 +12,9 @@ |
| 23 | // See the License for the specific language governing permissions and |
| 24 | // limitations under the License. |
| 25 | |
| 26 | +//go:build !kvm_debug |
| 27 | +// +build !kvm_debug |
| 28 | + |
| 29 | package kvm |
| 30 | |
| 31 | import ( |
| 32 | diff --git a/pkg/sentry/platform/kvm/address_space_debug.go b/pkg/sentry/platform/kvm/address_space_debug.go |
| 33 | new file mode 100644 |
| 34 | index 000000000..69aeba45a |
| 35 | --- /dev/null |
| 36 | +++ b/pkg/sentry/platform/kvm/address_space_debug.go |
| 37 | @@ -0,0 +1,242 @@ |
| 38 | +// Copyright 2018 The gVisor Authors. |
| 39 | +// |
| 40 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 41 | +// you may not use this file except in compliance with the License. |
| 42 | +// You may obtain a copy of the License at |
| 43 | +// |
| 44 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 45 | +// |
| 46 | +// Unless required by applicable law or agreed to in writing, software |
| 47 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 48 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 49 | +// See the License for the specific language governing permissions and |
| 50 | +// limitations under the License. |
| 51 | + |
| 52 | +//go:build kvm_debug |
| 53 | +// +build kvm_debug |
| 54 | + |
| 55 | +package kvm |
| 56 | + |
| 57 | +import ( |
| 58 | + "gvisor.dev/gvisor/pkg/atomicbitops" |
| 59 | + "gvisor.dev/gvisor/pkg/hostarch" |
| 60 | + "gvisor.dev/gvisor/pkg/ring0/pagetables" |
| 61 | + "gvisor.dev/gvisor/pkg/sentry/memmap" |
| 62 | + "gvisor.dev/gvisor/pkg/sentry/platform" |
| 63 | + "gvisor.dev/gvisor/pkg/sync" |
| 64 | +) |
| 65 | + |
| 66 | +// dirtySet tracks vCPUs for invalidation. |
| 67 | +type dirtySet struct { |
| 68 | + vCPUMasks []atomicbitops.Uint64 |
| 69 | +} |
| 70 | + |
| 71 | +// forEach iterates over all CPUs in the dirty set. |
| 72 | +func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) { |
| 73 | + for index := range ds.vCPUMasks { |
| 74 | + mask := ds.vCPUMasks[index].Swap(0) |
| 75 | + if mask != 0 { |
| 76 | + for bit := 0; bit < 64; bit++ { |
| 77 | + if mask&(1<<uint64(bit)) == 0 { |
| 78 | + continue |
| 79 | + } |
| 80 | + id := 64*index + bit |
| 81 | + fn(m.vCPUsByID[id]) |
| 82 | + } |
| 83 | + } |
| 84 | + } |
| 85 | +} |
| 86 | + |
| 87 | +// mark marks the given vCPU as dirty and returns whether it was previously |
| 88 | +// clean. Being previously clean implies that a flush is needed on entry. |
| 89 | +func (ds *dirtySet) mark(c *vCPU) bool { |
| 90 | + index := uint64(c.id) / 64 |
| 91 | + bit := uint64(1) << uint(c.id%64) |
| 92 | + |
| 93 | + oldValue := ds.vCPUMasks[index].Load() |
| 94 | + if oldValue&bit != 0 { |
| 95 | + return false // Not clean. |
| 96 | + } |
| 97 | + |
| 98 | + // Set the bit unilaterally, and ensure that a flush takes place. Note |
| 99 | + // that it's possible for races to occur here, but since the flush is |
| 100 | + // taking place long after these lines there's no race in practice. |
| 101 | + atomicbitops.OrUint64(&ds.vCPUMasks[index], bit) |
| 102 | + return true // Previously clean. |
| 103 | +} |
| 104 | + |
| 105 | +// addressSpace is a wrapper for PageTables. |
| 106 | +type addressSpace struct { |
| 107 | + platform.NoAddressSpaceIO |
| 108 | + |
| 109 | + // mu is the lock for modifications to the address space. |
| 110 | + // |
| 111 | + // Note that the page tables themselves are not locked. |
| 112 | + mu sync.Mutex |
| 113 | + |
| 114 | + // machine is the underlying machine. |
| 115 | + machine *machine |
| 116 | + |
| 117 | + // pageTables are for this particular address space. |
| 118 | + pageTables *pagetables.PageTables |
| 119 | + |
| 120 | + // dirtySet is the set of dirty vCPUs. |
| 121 | + dirtySet *dirtySet |
| 122 | +} |
| 123 | + |
| 124 | +// Invalidate interrupts all dirty contexts. |
| 125 | +func (as *addressSpace) Invalidate() { |
| 126 | + as.mu.Lock() |
| 127 | + defer as.mu.Unlock() |
| 128 | + as.invalidate() |
| 129 | +} |
| 130 | + |
| 131 | +// Touch adds the given vCPU to the dirty list. |
| 132 | +// |
| 133 | +// The return value indicates whether a flush is required. |
| 134 | +func (as *addressSpace) Touch(c *vCPU) bool { |
| 135 | + return as.dirtySet.mark(c) |
| 136 | +} |
| 137 | + |
| 138 | +type hostMapEntry struct { |
| 139 | + addr uintptr |
| 140 | + length uintptr |
| 141 | +} |
| 142 | + |
| 143 | +// mapLocked maps the given host entry. |
| 144 | +// |
| 145 | +// +checkescape:hard,stack |
| 146 | +func (as *addressSpace) mapLocked(addr hostarch.Addr, m hostMapEntry, at hostarch.AccessType) (inv bool) { |
| 147 | + for m.length > 0 { |
| 148 | + physical, length, ok := translateToPhysical(m.addr) |
| 149 | + if !ok { |
| 150 | + panic("unable to translate segment") |
| 151 | + } |
| 152 | + if length > m.length { |
| 153 | + length = m.length |
| 154 | + } |
| 155 | + |
| 156 | + // Ensure that this map has physical mappings. If the page does |
| 157 | + // not have physical mappings, the KVM module may inject |
| 158 | + // spurious exceptions when emulation fails (i.e. it tries to |
| 159 | + // emulate because the RIP is pointed at those pages). |
| 160 | + as.machine.mapPhysical(physical, length, physicalRegions) |
| 161 | + |
| 162 | + // Install the page table mappings. Note that the ordering is |
| 163 | + // important; if the pagetable mappings were installed before |
| 164 | + // ensuring the physical pages were available, then some other |
| 165 | + // thread could theoretically access them. |
| 166 | + inv = as.pageTables.Map(addr, length, pagetables.MapOpts{ |
| 167 | + AccessType: at, |
| 168 | + User: true, |
| 169 | + }, physical) || inv |
| 170 | + m.addr += length |
| 171 | + m.length -= length |
| 172 | + addr += hostarch.Addr(length) |
| 173 | + } |
| 174 | + |
| 175 | + return inv |
| 176 | +} |
| 177 | + |
| 178 | +// MapFile implements platform.AddressSpace.MapFile. |
| 179 | +func (as *addressSpace) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error { |
| 180 | + as.mu.Lock() |
| 181 | + defer as.mu.Unlock() |
| 182 | + |
| 183 | + // Get mappings in the sentry's address space, which are guaranteed to be |
| 184 | + // valid as long as a reference is held on the mapped pages (which is in |
| 185 | + // turn required by AddressSpace.MapFile precondition). |
| 186 | + // |
| 187 | + // If precommit is true, we will touch mappings to commit them, so ensure |
| 188 | + // that mappings are readable from sentry context. |
| 189 | + // |
| 190 | + // We don't execute from application file-mapped memory, and guest page |
| 191 | + // tables don't care if we have execute permission (but they do need pages |
| 192 | + // to be readable). |
| 193 | + bs, err := f.MapInternal(fr, hostarch.AccessType{ |
| 194 | + Read: at.Read || at.Execute || precommit, |
| 195 | + Write: at.Write, |
| 196 | + }) |
| 197 | + if err != nil { |
| 198 | + return err |
| 199 | + } |
| 200 | + |
| 201 | + // See block in mapLocked. |
| 202 | + as.pageTables.Allocator.(*allocator).cpu = as.machine.Get() |
| 203 | + defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu) |
| 204 | + |
| 205 | + // Map the mappings in the sentry's address space (guest physical memory) |
| 206 | + // into the application's address space (guest virtual memory). |
| 207 | + inv := false |
| 208 | + for !bs.IsEmpty() { |
| 209 | + b := bs.Head() |
| 210 | + bs = bs.Tail() |
| 211 | + // Since fr was page-aligned, b should also be page-aligned. We do the |
| 212 | + // lookup in our host page tables for this translation. |
| 213 | + if precommit { |
| 214 | + s := b.ToSlice() |
| 215 | + for i := 0; i < len(s); i += hostarch.PageSize { |
| 216 | + _ = s[i] // Touch to commit. |
| 217 | + } |
| 218 | + } |
| 219 | + |
| 220 | + // See bluepill_allocator.go. |
| 221 | + bluepill(as.pageTables.Allocator.(*allocator).cpu) |
| 222 | + |
| 223 | + // Perform the mapping. |
| 224 | + prev := as.mapLocked(addr, hostMapEntry{ |
| 225 | + addr: b.Addr(), |
| 226 | + length: uintptr(b.Len()), |
| 227 | + }, at) |
| 228 | + inv = inv || prev |
| 229 | + addr += hostarch.Addr(b.Len()) |
| 230 | + } |
| 231 | + if inv { |
| 232 | + as.invalidate() |
| 233 | + } |
| 234 | + |
| 235 | + return nil |
| 236 | +} |
| 237 | + |
| 238 | +// unmapLocked is an escape-checked wrapper around Unmap. |
| 239 | +// |
| 240 | +// +checkescape:hard,stack |
| 241 | +func (as *addressSpace) unmapLocked(addr hostarch.Addr, length uint64) bool { |
| 242 | + return as.pageTables.Unmap(addr, uintptr(length)) |
| 243 | +} |
| 244 | + |
| 245 | +// Unmap unmaps the given range by calling pagetables.PageTables.Unmap. |
| 246 | +func (as *addressSpace) Unmap(addr hostarch.Addr, length uint64) { |
| 247 | + as.mu.Lock() |
| 248 | + defer as.mu.Unlock() |
| 249 | + |
| 250 | + // See above & bluepill_allocator.go. |
| 251 | + as.pageTables.Allocator.(*allocator).cpu = as.machine.Get() |
| 252 | + defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu) |
| 253 | + bluepill(as.pageTables.Allocator.(*allocator).cpu) |
| 254 | + |
| 255 | + if prev := as.unmapLocked(addr, length); prev { |
| 256 | + // Invalidate all active vCPUs. |
| 257 | + as.invalidate() |
| 258 | + |
| 259 | + // Recycle any freed intermediate pages. |
| 260 | + as.pageTables.Allocator.Recycle() |
| 261 | + } |
| 262 | +} |
| 263 | + |
| 264 | +// Release releases the page tables. |
| 265 | +func (as *addressSpace) Release() { |
| 266 | + as.Unmap(0, ^uint64(0)) |
| 267 | + |
| 268 | + // Free all pages from the allocator. |
| 269 | + as.pageTables.Allocator.(*allocator).base.Drain() |
| 270 | + |
| 271 | + // Drop all cached machine references. |
| 272 | + as.machine.dropPageTables(as.pageTables) |
| 273 | +} |
| 274 | + |
| 275 | +// PreFork implements platform.AddressSpace.PreFork. |
| 276 | +func (as *addressSpace) PreFork() {} |
| 277 | + |
| 278 | +// PostFork implements platform.AddressSpace.PostFork. |
| 279 | +func (as *addressSpace) PostFork() {} |
| 280 | diff --git a/pkg/sentry/platform/kvm/bluepill_debug_unsafe.go b/pkg/sentry/platform/kvm/bluepill_debug_unsafe.go |
| 281 | new file mode 100644 |
| 282 | index 000000000..5feb45c19 |
| 283 | --- /dev/null |
| 284 | +++ b/pkg/sentry/platform/kvm/bluepill_debug_unsafe.go |
| 285 | @@ -0,0 +1,215 @@ |
| 286 | +// Copyright 2018 The gVisor Authors. |
| 287 | +// |
| 288 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 289 | +// you may not use this file except in compliance with the License. |
| 290 | +// You may obtain a copy of the License at |
| 291 | +// |
| 292 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 293 | +// |
| 294 | +// Unless required by applicable law or agreed to in writing, software |
| 295 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 296 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 297 | +// See the License for the specific language governing permissions and |
| 298 | +// limitations under the License. |
| 299 | + |
| 300 | +//go:build go1.18 && kvm_debug |
| 301 | +// +build go1.18,kvm_debug |
| 302 | + |
| 303 | +// //go:linkname directives type-checked by checklinkname. Any other |
| 304 | +// non-linkname assumptions outside the Go 1 compatibility guarantee should |
| 305 | +// have an accompanying vet check or version guard build tag. |
| 306 | + |
| 307 | +package kvm |
| 308 | + |
| 309 | +import ( |
| 310 | + "unsafe" |
| 311 | + |
| 312 | + "golang.org/x/sys/unix" |
| 313 | + "gvisor.dev/gvisor/pkg/sentry/arch" |
| 314 | +) |
| 315 | + |
| 316 | +//go:linkname throw runtime.throw |
| 317 | +func throw(s string) |
| 318 | + |
| 319 | +// vCPUPtr returns a CPU for the given address. |
| 320 | +func vCPUPtr(addr uintptr) *vCPU { |
| 321 | + return (*vCPU)(unsafe.Pointer(addr)) |
| 322 | +} |
| 323 | + |
| 324 | +// bytePtr returns a bytePtr for the given address. |
| 325 | +func bytePtr(addr uintptr) *byte { |
| 326 | + return (*byte)(unsafe.Pointer(addr)) |
| 327 | +} |
| 328 | + |
| 329 | +// uintptrValue returns a uintptr for the given address. |
| 330 | +func uintptrValue(addr *byte) uintptr { |
| 331 | + return (uintptr)(unsafe.Pointer(addr)) |
| 332 | +} |
| 333 | + |
| 334 | +// bluepillArchContext returns the UContext64. |
| 335 | +func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 { |
| 336 | + return &((*arch.UContext64)(context).MContext) |
| 337 | +} |
| 338 | + |
| 339 | +// bluepillGuestExit is responsible for handling VM-Exit. |
| 340 | +func bluepillGuestExit(c *vCPU, context unsafe.Pointer) { |
| 341 | + // Increment our counter. |
| 342 | + c.guestExits.Add(1) |
| 343 | + |
| 344 | + // Copy out registers. |
| 345 | + bluepillArchExit(c, bluepillArchContext(context)) |
| 346 | + |
| 347 | + // Return to the vCPUReady state; notify any waiters. |
| 348 | + user := c.state.Load() & vCPUUser |
| 349 | + switch c.state.Swap(user) { |
| 350 | + case user | vCPUGuest: // Expected case. |
| 351 | + case user | vCPUGuest | vCPUWaiter: |
| 352 | + c.notify() |
| 353 | + default: |
| 354 | + throw("invalid state") |
| 355 | + } |
| 356 | +} |
| 357 | + |
| 358 | +var hexSyms = []byte("0123456789abcdef") |
| 359 | + |
| 360 | +func printHex(title []byte, val uint64) { |
| 361 | + var str [18]byte |
| 362 | + for i := 0; i < 16; i++ { |
| 363 | + str[16-i] = hexSyms[val&0xf] |
| 364 | + val = val >> 4 |
| 365 | + } |
| 366 | + str[0] = ' ' |
| 367 | + str[17] = '\n' |
| 368 | + unix.RawSyscall(unix.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&title[0])), uintptr(len(title))) |
| 369 | + unix.RawSyscall(unix.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&str)), 18) |
| 370 | +} |
| 371 | + |
| 372 | +// bluepillHandler is called from the signal stub. |
| 373 | +// |
| 374 | +// The world may be stopped while this is executing, and it executes on the |
| 375 | +// signal stack. It should only execute raw system calls and functions that are |
| 376 | +// explicitly marked go:nosplit. |
| 377 | +// |
| 378 | +// Ideally, this function should switch to gsignal, as runtime.sigtramp does, |
| 379 | +// but that is tedious given all the runtime internals. That said, using |
| 380 | +// gsignal inside a signal handler is not _required_, provided we avoid stack |
| 381 | +// splits and allocations. Note that calling any splittable function here will |
| 382 | +// be flaky; if the signal stack is below the G stack then we will trigger a |
| 383 | +// split and crash. If above, we won't trigger a split. |
| 384 | +// |
| 385 | +// +checkescape:all |
| 386 | +func bluepillHandler(context unsafe.Pointer) { |
| 387 | + // Sanitize the registers; interrupts must always be disabled. |
| 388 | + c := bluepillArchEnter(bluepillArchContext(context)) |
| 389 | + |
| 390 | + // Mark this as guest mode. |
| 391 | + switch c.state.Swap(vCPUGuest | vCPUUser) { |
| 392 | + case vCPUUser: // Expected case. |
| 393 | + case vCPUUser | vCPUWaiter: |
| 394 | + c.notify() |
| 395 | + default: |
| 396 | + throw("invalid state") |
| 397 | + } |
| 398 | + |
| 399 | + for { |
| 400 | + hostExitCounter.Increment() |
| 401 | + _, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0) // escapes: no. |
| 402 | + switch errno { |
| 403 | + case 0: // Expected case. |
| 404 | + case unix.EINTR: |
| 405 | + interruptCounter.Increment() |
| 406 | + // First, we process whatever pending signal |
| 407 | + // interrupted KVM. Since we're in a signal handler |
| 408 | + // currently, all signals are masked and the signal |
| 409 | + // must have been delivered directly to this thread. |
| 410 | + timeout := unix.Timespec{} |
| 411 | + sig, _, errno := unix.RawSyscall6( // escapes: no. |
| 412 | + unix.SYS_RT_SIGTIMEDWAIT, |
| 413 | + uintptr(unsafe.Pointer(&bounceSignalMask)), |
| 414 | + 0, // siginfo. |
| 415 | + uintptr(unsafe.Pointer(&timeout)), // timeout. |
| 416 | + 8, // sigset size. |
| 417 | + 0, 0) |
| 418 | + if errno == unix.EAGAIN { |
| 419 | + continue |
| 420 | + } |
| 421 | + if errno != 0 { |
| 422 | + throw("error waiting for pending signal") |
| 423 | + } |
| 424 | + if sig != uintptr(bounceSignal) { |
| 425 | + throw("unexpected signal") |
| 426 | + } |
| 427 | + |
| 428 | + // Check whether the current state of the vCPU is ready |
| 429 | + // for interrupt injection. Because we don't have a |
| 430 | + // PIC, we can't inject an interrupt while they are |
| 431 | + // masked. We need to request a window if it's not |
| 432 | + // ready. |
| 433 | + if bluepillReadyStopGuest(c) { |
| 434 | + // Force injection below; the vCPU is ready. |
| 435 | + c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN |
| 436 | + } else { |
| 437 | + c.runData.requestInterruptWindow = 1 |
| 438 | + continue // Rerun vCPU. |
| 439 | + } |
| 440 | + case unix.EFAULT: |
| 441 | + // If a fault is not serviceable due to the host |
| 442 | + // backing pages having page permissions, instead of an |
| 443 | + // MMIO exit we receive EFAULT from the run ioctl. We |
| 444 | + // always inject an NMI here since we may be in kernel |
| 445 | + // mode and have interrupts disabled. |
| 446 | + bluepillSigBus(c) |
| 447 | + continue // Rerun vCPU. |
| 448 | + case unix.ENOSYS: |
| 449 | + bluepillHandleEnosys(c) |
| 450 | + continue |
| 451 | + default: |
| 452 | + throw("run failed") |
| 453 | + } |
| 454 | + |
| 455 | + switch c.runData.exitReason { |
| 456 | + case _KVM_EXIT_EXCEPTION: |
| 457 | + c.die(bluepillArchContext(context), "exception") |
| 458 | + return |
| 459 | + case _KVM_EXIT_IO: |
| 460 | + c.die(bluepillArchContext(context), "I/O") |
| 461 | + return |
| 462 | + case _KVM_EXIT_INTERNAL_ERROR: |
| 463 | + // An internal error is typically thrown when emulation |
| 464 | + // fails. This can occur via the MMIO path below (and |
| 465 | + // it might fail because we have multiple regions that |
| 466 | + // are not mapped). We would actually prefer that no |
| 467 | + // emulation occur, and don't mind at all if it fails. |
| 468 | + case _KVM_EXIT_HYPERCALL: |
| 469 | + c.die(bluepillArchContext(context), "hypercall") |
| 470 | + return |
| 471 | + case _KVM_EXIT_DEBUG: |
| 472 | + c.die(bluepillArchContext(context), "debug") |
| 473 | + return |
| 474 | + case _KVM_EXIT_HLT: |
| 475 | + c.hltSanityCheck() |
| 476 | + bluepillGuestExit(c, context) |
| 477 | + return |
| 478 | + case _KVM_EXIT_MMIO: |
| 479 | + physical := uintptr(c.runData.data[0]) |
| 480 | + if getHypercallID(physical) == _KVM_HYPERCALL_VMEXIT { |
| 481 | + bluepillGuestExit(c, context) |
| 482 | + return |
| 483 | + } |
| 484 | + |
| 485 | + c.die(bluepillArchContext(context), "exit_mmio") |
| 486 | + return |
| 487 | + case _KVM_EXIT_IRQ_WINDOW_OPEN: |
| 488 | + bluepillStopGuest(c) |
| 489 | + case _KVM_EXIT_SHUTDOWN: |
| 490 | + c.die(bluepillArchContext(context), "shutdown") |
| 491 | + return |
| 492 | + case _KVM_EXIT_FAIL_ENTRY: |
| 493 | + c.die(bluepillArchContext(context), "entry failed") |
| 494 | + return |
| 495 | + default: |
| 496 | + bluepillArchHandleExit(c, context) |
| 497 | + return |
| 498 | + } |
| 499 | + } |
| 500 | +} |
| 501 | diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go |
| 502 | index 81bd9f814..ad8b966e7 100644 |
| 503 | --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go |
| 504 | +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go |
| 505 | @@ -12,8 +12,8 @@ |
| 506 | // See the License for the specific language governing permissions and |
| 507 | // limitations under the License. |
| 508 | |
| 509 | -//go:build go1.18 |
| 510 | -// +build go1.18 |
| 511 | +//go:build go1.18 && !kvm_debug |
| 512 | +// +build go1.18,!kvm_debug |
| 513 | |
| 514 | // //go:linkname directives type-checked by checklinkname. Any other |
| 515 | // non-linkname assumptions outside the Go 1 compatibility guarantee should |
| 516 | diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go |
| 517 | index f39bf1f06..4f0264db7 100644 |
| 518 | --- a/pkg/sentry/platform/kvm/machine.go |
| 519 | +++ b/pkg/sentry/platform/kvm/machine.go |
| 520 | @@ -12,6 +12,9 @@ |
| 521 | // See the License for the specific language governing permissions and |
| 522 | // limitations under the License. |
| 523 | |
| 524 | +//go:build !kvm_debug |
| 525 | +// +build !kvm_debug |
| 526 | + |
| 527 | package kvm |
| 528 | |
| 529 | import ( |
| 530 | diff --git a/pkg/sentry/platform/kvm/machine_debug.go b/pkg/sentry/platform/kvm/machine_debug.go |
| 531 | new file mode 100644 |
| 532 | index 000000000..0a4735d2d |
| 533 | --- /dev/null |
| 534 | +++ b/pkg/sentry/platform/kvm/machine_debug.go |
| 535 | @@ -0,0 +1,826 @@ |
| 536 | +// Copyright 2018 The gVisor Authors. |
| 537 | +// |
| 538 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 539 | +// you may not use this file except in compliance with the License. |
| 540 | +// You may obtain a copy of the License at |
| 541 | +// |
| 542 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 543 | +// |
| 544 | +// Unless required by applicable law or agreed to in writing, software |
| 545 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 546 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 547 | +// See the License for the specific language governing permissions and |
| 548 | +// limitations under the License. |
| 549 | + |
| 550 | +//go:build kvm_debug |
| 551 | +// +build kvm_debug |
| 552 | + |
| 553 | +package kvm |
| 554 | + |
| 555 | +import ( |
| 556 | + "fmt" |
| 557 | + "runtime" |
| 558 | + gosync "sync" |
| 559 | + "sync/atomic" |
| 560 | + "time" |
| 561 | + |
| 562 | + "golang.org/x/sys/unix" |
| 563 | + "gvisor.dev/gvisor/pkg/abi/linux" |
| 564 | + "gvisor.dev/gvisor/pkg/atomicbitops" |
| 565 | + "gvisor.dev/gvisor/pkg/hostarch" |
| 566 | + "gvisor.dev/gvisor/pkg/hosttid" |
| 567 | + "gvisor.dev/gvisor/pkg/log" |
| 568 | + "gvisor.dev/gvisor/pkg/metric" |
| 569 | + "gvisor.dev/gvisor/pkg/ring0" |
| 570 | + "gvisor.dev/gvisor/pkg/ring0/pagetables" |
| 571 | + "gvisor.dev/gvisor/pkg/seccomp" |
| 572 | + ktime "gvisor.dev/gvisor/pkg/sentry/time" |
| 573 | + "gvisor.dev/gvisor/pkg/sighandling" |
| 574 | + "gvisor.dev/gvisor/pkg/sync" |
| 575 | +) |
| 576 | + |
| 577 | +// machine contains state associated with the VM as a whole. |
| 578 | +type machine struct { |
| 579 | + // fd is the vm fd. |
| 580 | + fd int |
| 581 | + |
| 582 | + // machinePoolIndex is the index in the machinePool array. |
| 583 | + machinePoolIndex uint32 |
| 584 | + |
| 585 | + // nextSlot is the next slot for setMemoryRegion. |
| 586 | + // |
| 587 | + // If nextSlot is ^uint32(0), then slots are currently being updated, and the |
| 588 | + // caller should retry. |
| 589 | + nextSlot atomicbitops.Uint32 |
| 590 | + |
| 591 | + // upperSharedPageTables tracks the read-only shared upper of all the pagetables. |
| 592 | + upperSharedPageTables *pagetables.PageTables |
| 593 | + |
| 594 | + // kernel is the set of global structures. |
| 595 | + kernel ring0.Kernel |
| 596 | + |
| 597 | + // mu protects vCPUs. |
| 598 | + mu sync.RWMutex |
| 599 | + |
| 600 | + // available is notified when vCPUs are available. |
| 601 | + available sync.Cond |
| 602 | + |
| 603 | + // vCPUsByTID are the machine vCPUs. |
| 604 | + // |
| 605 | + // These are populated dynamically. |
| 606 | + vCPUsByTID map[uint64]*vCPU |
| 607 | + |
| 608 | + // vCPUsByID are the machine vCPUs, indexed by the vCPU's ID. |
| 609 | + vCPUsByID []*vCPU |
| 610 | + |
| 611 | + // usedVCPUs is the number of vCPUs that have been used from the |
| 612 | + // vCPUsByID pool. |
| 613 | + usedVCPUs int |
| 614 | + |
| 615 | + // maxVCPUs is the maximum number of vCPUs supported by the machine. |
| 616 | + maxVCPUs int |
| 617 | + |
| 618 | + // maxSlots is the maximum number of memory slots supported by the machine. |
| 619 | + maxSlots int |
| 620 | + |
| 621 | + // tscControl indicates whether the CPU supports TSC scaling. |
| 622 | + tscControl bool |
| 623 | + |
| 624 | + // usedSlots is the set of used physical addresses (not sorted). |
| 625 | + usedSlots []uintptr |
| 626 | +} |
| 627 | + |
| 628 | +const ( |
| 629 | + // vCPUReady is an alias for all of the below bits being clear. |
| 630 | + vCPUReady uint32 = 0 |
| 631 | + |
| 632 | + // vCPUUser indicates that the vCPU is in or about to enter user mode. |
| 633 | + vCPUUser uint32 = 1 << 0 |
| 634 | + |
| 635 | + // vCPUGuest indicates the vCPU is in guest mode. |
| 636 | + vCPUGuest uint32 = 1 << 1 |
| 637 | + |
| 638 | + // vCPUWaiter indicates that there is a waiter. |
| 639 | + // |
| 640 | + // If this is set, then notify must be called on any state transitions. |
| 641 | + vCPUWaiter uint32 = 1 << 2 |
| 642 | +) |
| 643 | + |
| 644 | +// Field values for the get_vcpu metric acquisition path used. |
| 645 | +var ( |
| 646 | + getVCPUAcquisitionFastReused = metric.FieldValue{"fast_reused"} |
| 647 | + getVCPUAcquisitionReused = metric.FieldValue{"reused"} |
| 648 | + getVCPUAcquisitionUnused = metric.FieldValue{"unused"} |
| 649 | + getVCPUAcquisitionStolen = metric.FieldValue{"stolen"} |
| 650 | +) |
| 651 | + |
| 652 | +var ( |
| 653 | + // hostExitCounter is a metric that tracks how many times the sentry |
| 654 | + // performed a host to guest world switch. |
| 655 | + hostExitCounter = metric.MustCreateNewProfilingUint64Metric( |
| 656 | + "/kvm/host_exits", false, "The number of times the sentry performed a host to guest world switch.") |
| 657 | + |
| 658 | + // userExitCounter is a metric that tracks how many times the sentry has |
| 659 | + // had an exit from userspace. Analogous to vCPU.userExits. |
| 660 | + userExitCounter = metric.MustCreateNewProfilingUint64Metric( |
| 661 | + "/kvm/user_exits", false, "The number of times the sentry has had an exit from userspace.") |
| 662 | + |
| 663 | + // interruptCounter is a metric that tracks how many times execution returned |
| 664 | + // to the KVM host to handle a pending signal. |
| 665 | + interruptCounter = metric.MustCreateNewProfilingUint64Metric( |
| 666 | + "/kvm/interrupts", false, "The number of times the signal handler was invoked.") |
| 667 | + |
| 668 | + // mmapCallCounter is a metric that tracks how many times the function |
| 669 | + // seccompMmapSyscall has been called. |
| 670 | + mmapCallCounter = metric.MustCreateNewProfilingUint64Metric( |
| 671 | + "/kvm/mmap_calls", false, "The number of times seccompMmapSyscall has been called.") |
| 672 | + |
| 673 | + // getVCPUCounter is a metric that tracks how many times different paths of |
| 674 | + // machine.Get() are triggered. |
| 675 | + getVCPUCounter = metric.MustCreateNewProfilingUint64Metric( |
| 676 | + "/kvm/get_vcpu", false, "The number of times that machine.Get() was called, split by path the function took.", |
| 677 | + metric.NewField("acquisition_type", &getVCPUAcquisitionFastReused, &getVCPUAcquisitionReused, &getVCPUAcquisitionUnused, &getVCPUAcquisitionStolen)) |
| 678 | + |
| 679 | + // asInvalidateDuration are durations of calling addressSpace.invalidate(). |
| 680 | + asInvalidateDuration = metric.MustCreateNewProfilingTimerMetric("/kvm/address_space_invalidate", |
| 681 | + metric.NewExponentialBucketer(15, uint64(time.Nanosecond*100), 1, 2), |
| 682 | + "Duration of calling addressSpace.invalidate().") |
| 683 | +) |
| 684 | + |
| 685 | +// vCPU is a single KVM vCPU. |
| 686 | +type vCPU struct { |
| 687 | + // CPU is the kernel CPU data. |
| 688 | + // |
| 689 | + // This must be the first element of this structure, it is referenced |
| 690 | + // by the bluepill code (see bluepill_amd64.s). |
| 691 | + ring0.CPU |
| 692 | + |
| 693 | + // id is the vCPU id. |
| 694 | + id int |
| 695 | + |
| 696 | + // fd is the vCPU fd. |
| 697 | + fd int |
| 698 | + |
| 699 | + // tid is the last set tid. |
| 700 | + tid atomicbitops.Uint64 |
| 701 | + |
| 702 | + // userExits is the count of user exits. |
| 703 | + userExits atomicbitops.Uint64 |
| 704 | + |
| 705 | + // guestExits is the count of guest to host world switches. |
| 706 | + guestExits atomicbitops.Uint64 |
| 707 | + |
| 708 | + // faults is a count of world faults (informational only). |
| 709 | + faults uint32 |
| 710 | + |
| 711 | + // state is the vCPU state. |
| 712 | + // |
| 713 | + // This is a bitmask of the three fields (vCPU*) described above. |
| 714 | + state atomicbitops.Uint32 |
| 715 | + |
| 716 | + // runData for this vCPU. |
| 717 | + runData *runData |
| 718 | + |
| 719 | + // machine associated with this vCPU. |
| 720 | + machine *machine |
| 721 | + |
| 722 | + // active is the current addressSpace: this is set and read atomically; |
| 723 | + // it is used to elide unnecessary interrupts due to invalidations. |
| 724 | + active atomicAddressSpace |
| 725 | + |
| 726 | + // vCPUArchState is the architecture-specific state. |
| 727 | + vCPUArchState |
| 728 | + |
| 729 | + // dieState holds state related to vCPU death. |
| 730 | + dieState dieState |
| 731 | +} |
| 732 | + |
| 733 | +type dieState struct { |
| 734 | + // message is thrown from die. |
| 735 | + message string |
| 736 | + |
| 737 | + // guestRegs is used to store register state during vCPU.die() to prevent |
| 738 | + // allocation inside nosplit function. |
| 739 | + guestRegs userRegs |
| 740 | +} |
| 741 | + |
| 742 | +// createVCPU creates and returns a new vCPU. |
| 743 | +// |
| 744 | +// Precondition: mu must be held. |
| 745 | +func (m *machine) createVCPU(id int) *vCPU { |
| 746 | + // Create the vCPU. |
| 747 | + fd, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id)) |
| 748 | + if errno != 0 { |
| 749 | + panic(fmt.Sprintf("error creating new vCPU: %v", errno)) |
| 750 | + } |
| 751 | + |
| 752 | + c := &vCPU{ |
| 753 | + id: id, |
| 754 | + fd: int(fd), |
| 755 | + machine: m, |
| 756 | + } |
| 757 | + c.CPU.Init(&m.kernel, c.id, c) |
| 758 | + m.vCPUsByID[c.id] = c |
| 759 | + |
| 760 | + // Ensure the signal mask is correct. |
| 761 | + if err := c.setSignalMask(); err != nil { |
| 762 | + panic(fmt.Sprintf("error setting signal mask: %v", err)) |
| 763 | + } |
| 764 | + |
| 765 | + // Map the run data. |
| 766 | + runData, err := mapRunData(int(fd)) |
| 767 | + if err != nil { |
| 768 | + panic(fmt.Sprintf("error mapping run data: %v", err)) |
| 769 | + } |
| 770 | + c.runData = runData |
| 771 | + |
| 772 | + // Initialize architecture state. |
| 773 | + if err := c.initArchState(); err != nil { |
| 774 | + panic(fmt.Sprintf("error initialization vCPU state: %v", err)) |
| 775 | + } |
| 776 | + |
| 777 | + return c // Done. |
| 778 | +} |
| 779 | + |
| 780 | +// newMachine returns a new VM context. |
| 781 | +func newMachine(vm int) (*machine, error) { |
| 782 | + // Create the machine. |
| 783 | + m := &machine{fd: vm} |
| 784 | + m.available.L = &m.mu |
| 785 | + |
| 786 | + // Pull the maximum vCPUs. |
| 787 | + m.getMaxVCPU() |
| 788 | + log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs) |
| 789 | + m.vCPUsByTID = make(map[uint64]*vCPU) |
| 790 | + m.vCPUsByID = make([]*vCPU, m.maxVCPUs) |
| 791 | + m.kernel.Init(m.maxVCPUs) |
| 792 | + |
| 793 | + // Pull the maximum slots. |
| 794 | + maxSlots, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_MEMSLOTS) |
| 795 | + if errno != 0 { |
| 796 | + m.maxSlots = _KVM_NR_MEMSLOTS |
| 797 | + } else { |
| 798 | + m.maxSlots = int(maxSlots) |
| 799 | + } |
| 800 | + log.Debugf("The maximum number of slots is %d.", m.maxSlots) |
| 801 | + m.usedSlots = make([]uintptr, m.maxSlots) |
| 802 | + |
| 803 | + // Check TSC Scaling |
| 804 | + hasTSCControl, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_TSC_CONTROL) |
| 805 | + m.tscControl = errno == 0 && hasTSCControl == 1 |
| 806 | + log.Debugf("TSC scaling support: %t.", m.tscControl) |
| 807 | + |
| 808 | + // Create the upper shared pagetables and kernel(sentry) pagetables. |
| 809 | + m.upperSharedPageTables = pagetables.New(newAllocator()) |
| 810 | + m.mapUpperHalf(m.upperSharedPageTables) |
| 811 | + m.upperSharedPageTables.Allocator.(*allocator).base.Drain() |
| 812 | + m.upperSharedPageTables.MarkReadOnlyShared() |
| 813 | + m.kernel.PageTables = pagetables.NewWithUpper(newAllocator(), m.upperSharedPageTables, ring0.KernelStartAddress) |
| 814 | + |
| 815 | + // Install seccomp rules to trap runtime mmap system calls. They will |
| 816 | + // be handled by seccompMmapHandler. |
| 817 | + seccompMmapRules(m) |
| 818 | + |
| 819 | + // Apply the physical mappings. Note that these mappings may point to |
| 820 | + // guest physical addresses that are not actually available. These |
| 821 | + // physical pages are mapped on demand, see kernel_unsafe.go. |
| 822 | + applyPhysicalRegions(func(pr physicalRegion) bool { |
| 823 | + // Map everything in the lower half. |
| 824 | + m.kernel.PageTables.Map( |
| 825 | + hostarch.Addr(pr.virtual), |
| 826 | + pr.length, |
| 827 | + pagetables.MapOpts{AccessType: hostarch.ReadWrite}, |
| 828 | + pr.physical) |
| 829 | + |
| 830 | + return true // Keep iterating. |
| 831 | + }) |
| 832 | + |
| 833 | + // Ensure that the currently mapped virtual regions are actually |
| 834 | + // available in the VM. Note that this doesn't guarantee no future |
| 835 | + // faults, however it should guarantee that everything is available to |
| 836 | + // ensure successful vCPU entry. |
| 837 | + mapRegion := func(vr virtualRegion, flags uint32) { |
| 838 | + for virtual := vr.virtual; virtual < vr.virtual+vr.length; { |
| 839 | + physical, length, ok := translateToPhysical(virtual) |
| 840 | + if !ok { |
| 841 | + // This must be an invalid region that was |
| 842 | + // knocked out by creation of the physical map. |
| 843 | + return |
| 844 | + } |
| 845 | + if virtual+length > vr.virtual+vr.length { |
| 846 | + // Cap the length to the end of the area. |
| 847 | + length = vr.virtual + vr.length - virtual |
| 848 | + } |
| 849 | + // Update page tables for executable mappings. |
| 850 | + if vr.accessType.Execute { |
| 851 | + if vr.accessType.Write { |
| 852 | + panic(fmt.Sprintf("executable mapping can't be writable: %#v", vr)) |
| 853 | + } |
| 854 | + m.kernel.PageTables.Map( |
| 855 | + hostarch.Addr(virtual), |
| 856 | + length, |
| 857 | + pagetables.MapOpts{AccessType: vr.accessType}, |
| 858 | + physical) |
| 859 | + } |
| 860 | + |
| 861 | + // Ensure the physical range is mapped. |
| 862 | + m.mapPhysical(physical, length, physicalRegions) |
| 863 | + virtual += length |
| 864 | + } |
| 865 | + } |
| 866 | + |
| 867 | + // handleBluepillFault takes the slot spinlock and it is called from |
| 868 | + // seccompMmapHandler, so here we have to guarantee that mmap is not |
| 869 | + // called while we hold the slot spinlock. |
| 870 | + disableAsyncPreemption() |
| 871 | + applyVirtualRegions(func(vr virtualRegion) { |
| 872 | + if excludeVirtualRegion(vr) { |
| 873 | + return // skip region. |
| 874 | + } |
| 875 | + // Take into account that the stack can grow down. |
| 876 | + if vr.filename == "[stack]" { |
| 877 | + vr.virtual -= 1 << 20 |
| 878 | + vr.length += 1 << 20 |
| 879 | + } |
| 880 | + |
| 881 | + mapRegion(vr, 0) |
| 882 | + |
| 883 | + }) |
| 884 | + enableAsyncPreemption() |
| 885 | + |
| 886 | + // Initialize architecture state. |
| 887 | + if err := m.initArchState(); err != nil { |
| 888 | + m.Destroy() |
| 889 | + return nil, err |
| 890 | + } |
| 891 | + |
| 892 | + // Ensure the machine is cleaned up properly. |
| 893 | + runtime.SetFinalizer(m, (*machine).Destroy) |
| 894 | + return m, nil |
| 895 | +} |
| 896 | + |
| 897 | +// hasSlot returns true if the given address is mapped. |
| 898 | +// |
| 899 | +// This must be done via a linear scan. |
| 900 | +// |
| 901 | +//go:nosplit |
| 902 | +func (m *machine) hasSlot(physical uintptr) bool { |
| 903 | + slotLen := int(m.nextSlot.Load()) |
| 904 | + // When slots are being updated, nextSlot is ^uint32(0). As this situation |
| 905 | + // is less likely to happen, we just set the slotLen to m.maxSlots, and scan |
| 906 | + // the whole usedSlots array. |
| 907 | + if slotLen == int(^uint32(0)) { |
| 908 | + slotLen = m.maxSlots |
| 909 | + } |
| 910 | + for i := 0; i < slotLen; i++ { |
| 911 | + if p := atomic.LoadUintptr(&m.usedSlots[i]); p == physical { |
| 912 | + return true |
| 913 | + } |
| 914 | + } |
| 915 | + return false |
| 916 | +} |
| 917 | + |
| 918 | +// mapPhysical checks for the mapping of a physical range, and installs one if |
| 919 | +// not available. This attempts to be efficient for calls in the hot path. |
| 920 | +// |
| 921 | +// This throws on error. |
| 922 | +func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalRegion) { |
| 923 | + for end := physical + length; physical < end; { |
| 924 | + _, physicalStart, length, pr := calculateBluepillFault(physical, phyRegions) |
| 925 | + if pr == nil { |
| 926 | + // Should never happen. |
| 927 | + throw("mapPhysical on unknown physical address") |
| 928 | + } |
| 929 | + |
| 930 | + // Is this already mapped? Check the usedSlots. |
| 931 | + if !m.hasSlot(physicalStart) { |
| 932 | + if _, ok := handleBluepillFault(m, physical, phyRegions); !ok { |
| 933 | + throw("handleBluepillFault failed") |
| 934 | + } |
| 935 | + } |
| 936 | + |
| 937 | + // Move to the next chunk. |
| 938 | + physical = physicalStart + length |
| 939 | + } |
| 940 | +} |
| 941 | + |
| 942 | +// Destroy frees associated resources. |
| 943 | +// |
| 944 | +// Destroy should only be called once all active users of the machine are gone. |
| 945 | +// The machine object should not be used after calling Destroy. |
| 946 | +// |
| 947 | +// Precondition: all vCPUs must be returned to the machine. |
| 948 | +func (m *machine) Destroy() { |
| 949 | + runtime.SetFinalizer(m, nil) |
| 950 | + |
| 951 | + // Destroy vCPUs. |
| 952 | + for _, c := range m.vCPUsByID { |
| 953 | + if c == nil { |
| 954 | + continue |
| 955 | + } |
| 956 | + |
| 957 | + // Ensure the vCPU is not still running in guest mode. This is |
| 958 | + // possible iff teardown has been done by other threads, and |
| 959 | + // somehow a single thread has not executed any system calls. |
| 960 | + c.BounceToHost() |
| 961 | + |
| 962 | + // Note that the runData may not be mapped if an error occurs |
| 963 | + // during the middle of initialization. |
| 964 | + if c.runData != nil { |
| 965 | + if err := unmapRunData(c.runData); err != nil { |
| 966 | + panic(fmt.Sprintf("error unmapping rundata: %v", err)) |
| 967 | + } |
| 968 | + } |
| 969 | + if err := unix.Close(int(c.fd)); err != nil { |
| 970 | + panic(fmt.Sprintf("error closing vCPU fd: %v", err)) |
| 971 | + } |
| 972 | + } |
| 973 | + |
| 974 | + machinePool[m.machinePoolIndex].Store(nil) |
| 975 | + seccompMmapSync() |
| 976 | + |
| 977 | + // vCPUs are gone: teardown machine state. |
| 978 | + if err := unix.Close(m.fd); err != nil { |
| 979 | + panic(fmt.Sprintf("error closing VM fd: %v", err)) |
| 980 | + } |
| 981 | +} |
| 982 | + |
| 983 | +// Get gets an available vCPU. |
| 984 | +// |
| 985 | +// This will return with the OS thread locked. |
| 986 | +// |
| 987 | +// It is guaranteed that if any OS thread TID is in guest mode, m.vCPUsByTID[TID] |
| 988 | +// points to the vCPU on which that OS thread is running. So if Get() returns |
| 989 | +// with the current context in guest mode, its vCPU must be the same as what |
| 990 | +// Get() returns. |
| 991 | +func (m *machine) Get() *vCPU { |
| 992 | + m.mu.RLock() |
| 993 | + runtime.LockOSThread() |
| 994 | + tid := hosttid.Current() |
| 995 | + |
| 996 | + // Check for an exact match. |
| 997 | + if c := m.vCPUsByTID[tid]; c != nil { |
| 998 | + c.lock() |
| 999 | + m.mu.RUnlock() |
| 1000 | + getVCPUCounter.Increment(&getVCPUAcquisitionFastReused) |
| 1001 | + return c |
| 1002 | + } |
| 1003 | + |
| 1004 | + // The happy path failed. We now proceed to acquire an exclusive lock |
| 1005 | + // (because the vCPU map may change), and scan all available vCPUs. |
| 1006 | + // In this case, we first unlock the OS thread. Otherwise, if mu is |
| 1007 | + // not available, the current system thread will be parked and a new |
| 1008 | + // system thread spawned. We avoid this situation by simply refreshing |
| 1009 | + // tid after relocking the system thread. |
| 1010 | + m.mu.RUnlock() |
| 1011 | + runtime.UnlockOSThread() |
| 1012 | + m.mu.Lock() |
| 1013 | + runtime.LockOSThread() |
| 1014 | + tid = hosttid.Current() |
| 1015 | + |
| 1016 | + // Recheck for an exact match. |
| 1017 | + if c := m.vCPUsByTID[tid]; c != nil { |
| 1018 | + c.lock() |
| 1019 | + m.mu.Unlock() |
| 1020 | + getVCPUCounter.Increment(&getVCPUAcquisitionReused) |
| 1021 | + return c |
| 1022 | + } |
| 1023 | + |
| 1024 | + for { |
| 1025 | + // Get vCPU from the m.vCPUsByID pool. |
| 1026 | + if m.usedVCPUs < m.maxVCPUs { |
| 1027 | + c := m.vCPUsByID[m.usedVCPUs] |
| 1028 | + m.usedVCPUs++ |
| 1029 | + c.lock() |
| 1030 | + m.vCPUsByTID[tid] = c |
| 1031 | + m.mu.Unlock() |
| 1032 | + c.loadSegments(tid) |
| 1033 | + getVCPUCounter.Increment(&getVCPUAcquisitionUnused) |
| 1034 | + return c |
| 1035 | + } |
| 1036 | + |
| 1037 | + // Scan for an available vCPU. |
| 1038 | + for origTID, c := range m.vCPUsByTID { |
| 1039 | + if c.state.CompareAndSwap(vCPUReady, vCPUUser) { |
| 1040 | + delete(m.vCPUsByTID, origTID) |
| 1041 | + m.vCPUsByTID[tid] = c |
| 1042 | + m.mu.Unlock() |
| 1043 | + c.loadSegments(tid) |
| 1044 | + getVCPUCounter.Increment(&getVCPUAcquisitionUnused) |
| 1045 | + return c |
| 1046 | + } |
| 1047 | + } |
| 1048 | + |
| 1049 | + // Scan for something not in user mode. |
| 1050 | + for origTID, c := range m.vCPUsByTID { |
| 1051 | + if !c.state.CompareAndSwap(vCPUGuest, vCPUGuest|vCPUWaiter) { |
| 1052 | + continue |
| 1053 | + } |
| 1054 | + |
| 1055 | + // The vCPU is not able to transition to |
| 1056 | + // vCPUGuest|vCPUWaiter or to vCPUUser because that |
| 1057 | + // transition requires holding the machine mutex, as we |
| 1058 | + // do now. There is no path to register a waiter on |
| 1059 | + // just the vCPUReady state. |
| 1060 | + for { |
| 1061 | + c.waitUntilNot(vCPUGuest | vCPUWaiter) |
| 1062 | + if c.state.CompareAndSwap(vCPUReady, vCPUUser) { |
| 1063 | + break |
| 1064 | + } |
| 1065 | + } |
| 1066 | + |
| 1067 | + // Steal the vCPU. |
| 1068 | + delete(m.vCPUsByTID, origTID) |
| 1069 | + m.vCPUsByTID[tid] = c |
| 1070 | + m.mu.Unlock() |
| 1071 | + c.loadSegments(tid) |
| 1072 | + getVCPUCounter.Increment(&getVCPUAcquisitionStolen) |
| 1073 | + return c |
| 1074 | + } |
| 1075 | + |
| 1076 | + // Everything is executing in user mode. Wait until something |
| 1077 | + // is available. Note that signaling the condition variable |
| 1078 | + // will have the extra effect of kicking the vCPUs out of guest |
| 1079 | + // mode if that's where they were. |
| 1080 | + m.available.Wait() |
| 1081 | + } |
| 1082 | +} |
| 1083 | + |
| 1084 | +// Put puts the current vCPU. |
| 1085 | +func (m *machine) Put(c *vCPU) { |
| 1086 | + c.unlock() |
| 1087 | + runtime.UnlockOSThread() |
| 1088 | + |
| 1089 | + m.mu.RLock() |
| 1090 | + m.available.Signal() |
| 1091 | + m.mu.RUnlock() |
| 1092 | +} |
| 1093 | + |
| 1094 | +// newDirtySet returns a new dirty set. |
| 1095 | +func (m *machine) newDirtySet() *dirtySet { |
| 1096 | + return &dirtySet{ |
| 1097 | + vCPUMasks: make([]atomicbitops.Uint64, |
| 1098 | + (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64), |
| 1099 | + } |
| 1100 | +} |
| 1101 | + |
| 1102 | +// dropPageTables drops cached page table entries. |
| 1103 | +func (m *machine) dropPageTables(pt *pagetables.PageTables) { |
| 1104 | + m.mu.Lock() |
| 1105 | + defer m.mu.Unlock() |
| 1106 | + |
| 1107 | + // Clear from all PCIDs. |
| 1108 | + for _, c := range m.vCPUsByID { |
| 1109 | + if c != nil && c.PCIDs != nil { |
| 1110 | + c.PCIDs.Drop(pt) |
| 1111 | + } |
| 1112 | + } |
| 1113 | +} |
| 1114 | + |
| 1115 | +// lock marks the vCPU as in user mode. |
| 1116 | +// |
| 1117 | +// This should only be called directly when known to be safe, i.e. when |
| 1118 | +// the vCPU is owned by the current TID with no chance of theft. |
| 1119 | +// |
| 1120 | +//go:nosplit |
| 1121 | +func (c *vCPU) lock() { |
| 1122 | + atomicbitops.OrUint32(&c.state, vCPUUser) |
| 1123 | +} |
| 1124 | + |
| 1125 | +// unlock clears the vCPUUser bit. |
| 1126 | +// |
| 1127 | +//go:nosplit |
| 1128 | +func (c *vCPU) unlock() { |
| 1129 | + origState := atomicbitops.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest) |
| 1130 | + if origState == vCPUUser|vCPUGuest { |
| 1131 | + // Happy path: no exits are forced, and we can continue |
| 1132 | + // executing on our merry way with a single atomic access. |
| 1133 | + return |
| 1134 | + } |
| 1135 | + |
| 1136 | + // Clear the lock. |
| 1137 | + for { |
| 1138 | + state := atomicbitops.CompareAndSwapUint32(&c.state, origState, origState&^vCPUUser) |
| 1139 | + if state == origState { |
| 1140 | + break |
| 1141 | + } |
| 1142 | + origState = state |
| 1143 | + } |
| 1144 | + switch origState { |
| 1145 | + case vCPUUser: |
| 1146 | + // Normal state. |
| 1147 | + case vCPUUser | vCPUGuest | vCPUWaiter: |
| 1148 | + // Force a transition: this must trigger a notification when we |
| 1149 | + // return from guest mode. We must clear vCPUWaiter here |
| 1150 | + // anyway, because BounceToKernel will force a transition only |
| 1151 | + // from ring3 to ring0, which will not clear this bit. Halt may |
| 1152 | + // work around the issue, but if there is no exception or |
| 1153 | + // syscall in this period, BounceToKernel will hang. |
| 1154 | + atomicbitops.AndUint32(&c.state, ^vCPUWaiter) |
| 1155 | + c.notify() |
| 1156 | + case vCPUUser | vCPUWaiter: |
| 1157 | + // Waiting for the lock to be released; the responsibility is |
| 1158 | + // on us to notify the waiter and clear the associated bit. |
| 1159 | + atomicbitops.AndUint32(&c.state, ^vCPUWaiter) |
| 1160 | + c.notify() |
| 1161 | + default: |
| 1162 | + panic("invalid state") |
| 1163 | + } |
| 1164 | +} |
| 1165 | + |
| 1166 | +// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt. |
| 1167 | +// |
| 1168 | +//go:nosplit |
| 1169 | +func (c *vCPU) NotifyInterrupt() { |
| 1170 | + c.BounceToKernel() |
| 1171 | +} |
| 1172 | + |
| 1173 | +// pid is used below in bounce. |
| 1174 | +var pid = unix.Getpid() |
| 1175 | + |
| 1176 | +// bounce forces a return to the kernel or to host mode. |
| 1177 | +// |
| 1178 | +// This effectively unwinds the state machine. |
| 1179 | +func (c *vCPU) bounce(forceGuestExit bool) { |
| 1180 | + origGuestExits := c.guestExits.Load() |
| 1181 | + origUserExits := c.userExits.Load() |
| 1182 | + for { |
| 1183 | + switch state := c.state.Load(); state { |
| 1184 | + case vCPUReady, vCPUWaiter: |
| 1185 | + // There is nothing to be done, we're already in the |
| 1186 | + // kernel pre-acquisition. The Bounce criteria have |
| 1187 | + // been satisfied. |
| 1188 | + return |
| 1189 | + case vCPUUser: |
| 1190 | + // We need to register a waiter for the actual guest |
| 1191 | + // transition. When the transition takes place, then we |
| 1192 | + // can inject an interrupt to ensure a return to host |
| 1193 | + // mode. |
| 1194 | + c.state.CompareAndSwap(state, state|vCPUWaiter) |
| 1195 | + case vCPUUser | vCPUWaiter: |
| 1196 | + // Wait for the transition to guest mode. This should |
| 1197 | + // come from the bluepill handler. |
| 1198 | + c.waitUntilNot(state) |
| 1199 | + case vCPUGuest, vCPUUser | vCPUGuest: |
| 1200 | + if state == vCPUGuest && !forceGuestExit { |
| 1201 | + // The vCPU is already not acquired, so there's |
| 1202 | + // no need to do a fresh injection here. |
| 1203 | + return |
| 1204 | + } |
| 1205 | + // The vCPU is in user or kernel mode. Attempt to |
| 1206 | + // register a notification on change. |
| 1207 | + if !c.state.CompareAndSwap(state, state|vCPUWaiter) { |
| 1208 | + break // Retry. |
| 1209 | + } |
| 1210 | + for { |
| 1211 | + // We need to spin here until the signal is |
| 1212 | + // delivered, because Tgkill can return EAGAIN |
| 1213 | + // under memory pressure. Since we already |
| 1214 | + // marked ourselves as a waiter, we need to |
| 1215 | + // ensure that a signal is actually delivered. |
| 1216 | + if err := unix.Tgkill(pid, int(c.tid.Load()), bounceSignal); err == nil { |
| 1217 | + break |
| 1218 | + } else if err.(unix.Errno) == unix.EAGAIN { |
| 1219 | + continue |
| 1220 | + } else { |
| 1221 | + // Nothing else should be returned by tgkill. |
| 1222 | + panic(fmt.Sprintf("unexpected tgkill error: %v", err)) |
| 1223 | + } |
| 1224 | + } |
| 1225 | + case vCPUGuest | vCPUWaiter, vCPUUser | vCPUGuest | vCPUWaiter: |
| 1226 | + if state == vCPUGuest|vCPUWaiter && !forceGuestExit { |
| 1227 | + // See above. |
| 1228 | + return |
| 1229 | + } |
| 1230 | + // Wait for the transition. This again should happen |
| 1231 | + // from the bluepill handler, but on the way out. |
| 1232 | + c.waitUntilNot(state) |
| 1233 | + default: |
| 1234 | + // Should not happen: the above is exhaustive. |
| 1235 | + panic("invalid state") |
| 1236 | + } |
| 1237 | + |
| 1238 | + // Check if we've missed the state transition, but |
| 1239 | + // we can safely return at this point in time. |
| 1240 | + newGuestExits := c.guestExits.Load() |
| 1241 | + newUserExits := c.userExits.Load() |
| 1242 | + if newUserExits != origUserExits && (!forceGuestExit || newGuestExits != origGuestExits) { |
| 1243 | + return |
| 1244 | + } |
| 1245 | + } |
| 1246 | +} |
| 1247 | + |
| 1248 | +// BounceToKernel ensures that the vCPU bounces back to the kernel. |
| 1249 | +// |
| 1250 | +//go:nosplit |
| 1251 | +func (c *vCPU) BounceToKernel() { |
| 1252 | + c.bounce(false) |
| 1253 | +} |
| 1254 | + |
| 1255 | +// BounceToHost ensures that the vCPU is in host mode. |
| 1256 | +// |
| 1257 | +//go:nosplit |
| 1258 | +func (c *vCPU) BounceToHost() { |
| 1259 | + c.bounce(true) |
| 1260 | +} |
| 1261 | + |
| 1262 | +// setSystemTimeLegacy calibrates and sets an approximate system time. |
| 1263 | +func (c *vCPU) setSystemTimeLegacy() error { |
| 1264 | + const minIterations = 10 |
| 1265 | + minimum := uint64(0) |
| 1266 | + for iter := 0; ; iter++ { |
| 1267 | + // Try to set the TSC to an estimate of where it will be |
| 1268 | + // on the host during a "fast" system call iteration. |
| 1269 | + start := uint64(ktime.Rdtsc()) |
| 1270 | + if err := c.setTSC(start + (minimum / 2)); err != nil { |
| 1271 | + return err |
| 1272 | + } |
| 1273 | + // See if this is our new minimum call time. Note that this |
| 1274 | + // serves two functions: first, we make sure that we are |
| 1275 | + // accurately predicting the offset we need to set. Second, we |
| 1276 | + // don't want to do the final set on a slow call, which could |
| 1277 | + // produce a really bad result. |
| 1278 | + end := uint64(ktime.Rdtsc()) |
| 1279 | + if end < start { |
| 1280 | + continue // Totally bogus: unstable TSC? |
| 1281 | + } |
| 1282 | + current := end - start |
| 1283 | + if current < minimum || iter == 0 { |
| 1284 | + minimum = current // Set our new minimum. |
| 1285 | + } |
| 1286 | + // Is this past minIterations and within ~10% of minimum? |
| 1287 | + upperThreshold := (((minimum << 3) + minimum) >> 3) |
| 1288 | + if iter >= minIterations && current <= upperThreshold { |
| 1289 | + return nil |
| 1290 | + } |
| 1291 | + } |
| 1292 | +} |
| 1293 | + |
| 1294 | +const machinePoolSize = 16 |
| 1295 | + |
| 1296 | +// machinePool is enumerated from the seccompMmapHandler signal handler |
| 1297 | +var ( |
| 1298 | + machinePool [machinePoolSize]machineAtomicPtr |
| 1299 | + machinePoolLen atomicbitops.Uint32 |
| 1300 | + machinePoolMu sync.Mutex |
| 1301 | + seccompMmapRulesOnce gosync.Once |
| 1302 | +) |
| 1303 | + |
| 1304 | +func sigsysHandler() |
| 1305 | +func addrOfSigsysHandler() uintptr |
| 1306 | + |
| 1307 | +// seccompMmapRules adds seccomp rules to trap mmap system calls that will be |
| 1308 | +// handled in seccompMmapHandler. |
| 1309 | +func seccompMmapRules(m *machine) { |
| 1310 | + seccompMmapRulesOnce.Do(func() { |
| 1311 | + // Install the handler. |
| 1312 | + if err := sighandling.ReplaceSignalHandler(unix.SIGSYS, addrOfSigsysHandler(), &savedSigsysHandler); err != nil { |
| 1313 | + panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err)) |
| 1314 | + } |
| 1315 | + rules := []seccomp.RuleSet{} |
| 1316 | + rules = append(rules, []seccomp.RuleSet{ |
| 1317 | + // Trap mmap system calls and handle them in sigsysGoHandler |
| 1318 | + { |
| 1319 | + Rules: seccomp.SyscallRules{ |
| 1320 | + unix.SYS_MMAP: { |
| 1321 | + { |
| 1322 | + seccomp.MatchAny{}, |
| 1323 | + seccomp.MatchAny{}, |
| 1324 | + seccomp.MaskedEqual(unix.PROT_EXEC, 0), |
| 1325 | + /* MAP_DENYWRITE is ignored and used only for filtering. */ |
| 1326 | + seccomp.MaskedEqual(unix.MAP_DENYWRITE, 0), |
| 1327 | + }, |
| 1328 | + }, |
| 1329 | + }, |
| 1330 | + Action: linux.SECCOMP_RET_TRAP, |
| 1331 | + }, |
| 1332 | + }...) |
| 1333 | + instrs, err := seccomp.BuildProgram(rules, linux.SECCOMP_RET_ALLOW, linux.SECCOMP_RET_ALLOW) |
| 1334 | + if err != nil { |
| 1335 | + panic(fmt.Sprintf("failed to build rules: %v", err)) |
| 1336 | + } |
| 1337 | + // Perform the actual installation. |
| 1338 | + if err := seccomp.SetFilter(instrs); err != nil { |
| 1339 | + panic(fmt.Sprintf("failed to set filter: %v", err)) |
| 1340 | + } |
| 1341 | + }) |
| 1342 | + |
| 1343 | + machinePoolMu.Lock() |
| 1344 | + n := machinePoolLen.Load() |
| 1345 | + i := uint32(0) |
| 1346 | + for ; i < n; i++ { |
| 1347 | + if machinePool[i].Load() == nil { |
| 1348 | + break |
| 1349 | + } |
| 1350 | + } |
| 1351 | + if i == n { |
| 1352 | + if i == machinePoolSize { |
| 1353 | + machinePoolMu.Unlock() |
| 1354 | + panic("machinePool is full") |
| 1355 | + } |
| 1356 | + machinePoolLen.Add(1) |
| 1357 | + } |
| 1358 | + machinePool[i].Store(m) |
| 1359 | + m.machinePoolIndex = i |
| 1360 | + machinePoolMu.Unlock() |
| 1361 | +} |
| 1362 | -- |
| 1363 | 2.41.0 |
| 1364 | |