pkg/nvme: add NVMe package

This adds an NVMe package for performing various low-level operations on
NVMe devices. Only the most important (to us) calls are implemented as
NVMe has a vast API surface.

Change-Id: I532894c3c2eb780309993a1688226c92c91cdedf
Reviewed-on: https://review.monogon.dev/c/monogon/+/999
Reviewed-by: Mateusz Zalega <mateusz@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/pkg/nvme/BUILD.bazel b/metropolis/pkg/nvme/BUILD.bazel
new file mode 100644
index 0000000..6a2438b
--- /dev/null
+++ b/metropolis/pkg/nvme/BUILD.bazel
@@ -0,0 +1,33 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+go_library(
+    name = "nvme",
+    srcs = [
+        "cmd_linux.go",
+        "cmd_unsupported.go",
+        "error.go",
+        "format.go",
+        "health.go",
+        "identify.go",
+        "nvme.go",
+        "selftest.go",
+        "uint128le.go",
+    ],
+    importpath = "source.monogon.dev/metropolis/pkg/nvme",
+    visibility = ["//visibility:public"],
+    deps = select({
+        "@io_bazel_rules_go//go/platform:android": [
+            "@org_golang_x_sys//unix",
+        ],
+        "@io_bazel_rules_go//go/platform:linux": [
+            "@org_golang_x_sys//unix",
+        ],
+        "//conditions:default": [],
+    }),
+)
+
+go_test(
+    name = "nvme_test",
+    srcs = ["struct_test.go"],
+    embed = [":nvme"],
+)
diff --git a/metropolis/pkg/nvme/cmd_linux.go b/metropolis/pkg/nvme/cmd_linux.go
new file mode 100644
index 0000000..e4353cc
--- /dev/null
+++ b/metropolis/pkg/nvme/cmd_linux.go
@@ -0,0 +1,117 @@
+//go:build linux
+
+package nvme
+
+import (
+	"errors"
+	"fmt"
+	"math"
+	"runtime"
+	"unsafe"
+
+	"golang.org/x/sys/unix"
+)
+
+// From @linux//include/uapi/linux/nvme_ioctl.h
+const (
+	nvmeIoctlAdminCmd = 0xC0484E41 // _IOWR('N', 0x41, sizeof cmd)
+)
+
+// From @linux//include/uapi/linux/nvme_ioctl.h
+type passthruCmd struct {
+	// Corresponding to Figure 88
+	opcode      uint8
+	flags       uint8
+	rsvd1       uint16
+	nsid        uint32
+	cdw2        uint32
+	cdw3        uint32
+	metadata    uint64
+	addr        uint64
+	metadataLen uint32
+	dataLen     uint32
+	cdw10       uint32
+	cdw11       uint32
+	cdw12       uint32
+	cdw13       uint32
+	cdw14       uint32
+	cdw15       uint32
+
+	// Linux ioctl-specific
+	timeoutMs uint32
+	result    uint32
+}
+
+// RawCommand runs a raw admin command on the NVMe device. It returns an Error
+// if the device reports a non-success status. Depending on the payload this
+// can be very dangerous and can cause data loss or even firmware issues.
+func (d *Device) RawCommand(cmd *Command) error {
+	conn, err := d.fd.SyscallConn()
+	if err != nil {
+		return fmt.Errorf("unable to get RawConn: %w", err)
+	}
+	cmdRaw := passthruCmd{
+		opcode:    cmd.Opcode,
+		flags:     cmd.Flags,
+		nsid:      cmd.NamespaceID,
+		cdw2:      cmd.CDW2,
+		cdw3:      cmd.CDW3,
+		cdw10:     cmd.CDW10,
+		cdw11:     cmd.CDW11,
+		cdw12:     cmd.CDW12,
+		cdw13:     cmd.CDW13,
+		cdw14:     cmd.CDW14,
+		cdw15:     cmd.CDW15,
+		timeoutMs: uint32(cmd.Timeout.Milliseconds()),
+	}
+	// NOTE: Currently this is safe (even if the documentation says otherwise)
+	// as the runtime.KeepAlive call below ensures that the GC cannot clean up
+	// the memory segments passed as data and metadata. This is sufficient as
+	// Go's runtime currently does not use a moving GC, meaning that these
+	// pointers do not get invalidated as long as they are considered alive.
+	// In case Go introduces a moving GC, which they might want to do this will
+	// no longer be safe as a GC-initiated move can happen while the syscall is
+	// running, causing the kernel to overwrite random memory of the calling
+	// process. To avoid this, these data structures need to be pinned. But Go
+	// doesn't have a pinning API yet [1], so all I can do is note this here.
+	// [1] https://github.com/golang/go/issues/46787
+	if len(cmd.Data) > 0 {
+		if uint64(len(cmd.Data)) > math.MaxUint32 {
+			return errors.New("data buffer larger than uint32, this is unsupported")
+		}
+		cmdRaw.dataLen = uint32(len(cmd.Data))
+		cmdRaw.addr = uint64(uintptr(unsafe.Pointer(&cmd.Data[0])))
+	}
+	if len(cmd.Metadata) > 0 {
+		if uint64(len(cmd.Metadata)) > math.MaxUint32 {
+			return errors.New("metadata buffer larger than uint32, this is unsupported")
+		}
+		cmdRaw.metadataLen = uint32(len(cmd.Metadata))
+		cmdRaw.metadata = uint64(uintptr(unsafe.Pointer(&cmd.Metadata[0])))
+	}
+	var errno unix.Errno
+	var status uintptr
+	err = conn.Control(func(fd uintptr) {
+		status, _, errno = unix.Syscall(unix.SYS_IOCTL, fd, nvmeIoctlAdminCmd, uintptr(unsafe.Pointer(&cmdRaw)))
+	})
+	runtime.KeepAlive(cmdRaw)
+	runtime.KeepAlive(cmd.Data)
+	runtime.KeepAlive(cmd.Metadata)
+	if err != nil {
+		return fmt.Errorf("unable to get fd: %w", err)
+	}
+	if errno != 0 {
+		return errno
+	}
+	var commandErr Error
+	// The ioctl returns the CQE status field shifted right by one (no phase tag).
+	commandErr.DoNotRetry = status&(1<<14) != 0            // CQE DW3 bit 31
+	commandErr.More = status&(1<<13) != 0                  // CQE DW3 bit 30
+	commandErr.StatusCodeType = uint8((status >> 8) & 0x7) // CQE DW3 bits 27:25
+	commandErr.StatusCode = uint8(status & 0xff)           // CQE DW3 bits 24:17
+	// The only success status is in the generic status code set with value 0
+	if commandErr.StatusCodeType != StatusCodeTypeGeneric || commandErr.StatusCode != 0 {
+		return commandErr
+	}
+	return nil
+}
diff --git a/metropolis/pkg/nvme/cmd_unsupported.go b/metropolis/pkg/nvme/cmd_unsupported.go
new file mode 100644
index 0000000..747a33d
--- /dev/null
+++ b/metropolis/pkg/nvme/cmd_unsupported.go
@@ -0,0 +1,12 @@
+//go:build !linux
+
+package nvme
+
+import (
+	"fmt"
+	"runtime"
+)
+
+func (d *Device) RawCommand(cmd *Command) error {
+	return fmt.Errorf("NVMe command interface unimplemented for %v", runtime.GOOS)
+}
diff --git a/metropolis/pkg/nvme/error.go b/metropolis/pkg/nvme/error.go
new file mode 100644
index 0000000..8c4a207
--- /dev/null
+++ b/metropolis/pkg/nvme/error.go
@@ -0,0 +1,136 @@
+package nvme
+
+import "fmt"
+
+// Figure 31 in the spec
+var genericStatusCodeDesc = map[uint8]string{
+	0x00: "successful completion",
+	0x01: "invalid command opcode",
+	0x02: "invalid field in command",
+	0x03: "command ID conflict",
+	0x04: "data transfer error",
+	0x05: "command aborted due power loss notification",
+	0x06: "internal error",
+	0x07: "command abort requested",
+	0x08: "command abort due to SQ deletion",
+	0x09: "command abort due to failed fused command",
+	0x0a: "command abort due to missing fused command",
+	0x0b: "invalid namespace or format",
+	0x0c: "command sequence error",
+	0x0d: "invalid SGL segment descriptor",
+	0x0e: "invalid number of SGL descriptors",
+	0x0f: "data SGL length invalid",
+	0x10: "metadata SGL length invalid",
+	0x11: "SGL descriptor type invalid",
+	0x12: "invalid use of controller memory buffer",
+	0x13: "PRP offset invalid",
+	0x14: "atomic write unit exceeded",
+	0x15: "operation denied",
+	0x16: "SGL offset invalid",
+	0x18: "host identifer inconsistent format",
+	0x19: "keep alive timeout expired",
+	0x1a: "keep alive timeout invalid",
+	0x1b: "command aborted due to preempt and abort",
+	0x1c: "sanitize failed",
+	0x1d: "sanitize in progress",
+	0x1e: "SGL data block granularity invalid",
+	0x1f: "command not supported for queue in CMB",
+
+	// Figure 32
+	0x80: "LBA out of range",
+	0x81: "capacity exceeded",
+	0x82: "namespace not ready",
+	0x83: "reservation conflict",
+	0x84: "format in progress",
+}
+
+// Figure 33 in the spec
+var commandSpecificStatusCodeDesc = map[uint8]string{
+	0x00: "completion queue invalid",
+	0x01: "invalid queue identifier",
+	0x02: "invalid queue size",
+	0x03: "abort command limit exceeded",
+	0x05: "asynchronous event request limit exceeded",
+	0x06: "invalid firmware slot",
+	0x07: "invalid firmware image",
+	0x08: "invalid interrupt vector",
+	0x09: "invalid log page",
+	0x0a: "invalid format",
+	0x0b: "firmware activation requires conventional reset",
+	0x0c: "invalid queue deletion",
+	0x0d: "feature identifier not saveable",
+	0x0e: "feature not changeable",
+	0x0f: "feature not namespace-specific",
+	0x10: "firmware activation requires NVM subsystem reset",
+	0x11: "firmware activation requires reset",
+	0x12: "firmware activation requires maximum time violation",
+	0x13: "firmware activation prohibited",
+	0x14: "overlapping range",
+	0x15: "namespace insufficient capacity",
+	0x16: "namespace identifier unavailable",
+	0x18: "namespace already attached",
+	0x19: "namespace is private",
+	0x1a: "namespace is not attached",
+	0x1b: "thin provisioning not supported",
+	0x1c: "controller list invalid",
+	0x1d: "device self-test in progress",
+	0x1e: "boot partition write prohibited",
+	0x1f: "invalid controller identifier",
+	0x20: "invalid secondary controller state",
+	0x21: "invalid number of controller resources",
+	0x22: "invalid resource identifier",
+
+	// Figure 34
+	0x80: "conflicting attributes",
+	0x81: "invalid protection information",
+	0x82: "attempted to write to read-only range",
+}
+
+// Figure 36
+var mediaAndDataIntegrityStatusCodeDesc = map[uint8]string{
+	0x80: "write fault",
+	0x81: "unrecovered read error",
+	0x82: "end-to-end guard check error",
+	0x83: "end-to-end application tag check error",
+	0x84: "end-to-end reference tag check error",
+	0x85: "compare failure",
+	0x86: "access denied",
+	0x87: "deallocated or unwritten logical block",
+}
+
+const (
+	StatusCodeTypeGeneric               = 0x0
+	StatusCodeTypeCommandSpecific       = 0x1
+	StatusCodeTypeMediaAndDataIntegrity = 0x2
+)
+
+// Error represents an error returned by the NVMe device in the form of a
+// NVMe Status Field (see also Figure 29 in the spec).
+type Error struct {
+	DoNotRetry     bool
+	More           bool
+	StatusCodeType uint8
+	StatusCode     uint8
+}
+
+func (e Error) Error() string {
+	switch e.StatusCodeType {
+	case StatusCodeTypeGeneric:
+		if errStr, ok := genericStatusCodeDesc[e.StatusCode]; ok {
+			return errStr
+		}
+		return fmt.Sprintf("unknown error with generic code 0x%x", e.StatusCode)
+	case StatusCodeTypeCommandSpecific:
+		if errStr, ok := commandSpecificStatusCodeDesc[e.StatusCode]; ok {
+			return errStr
+		}
+		return fmt.Sprintf("unknown error with command-specific code 0x%x", e.StatusCode)
+	case StatusCodeTypeMediaAndDataIntegrity:
+		if errStr, ok := mediaAndDataIntegrityStatusCodeDesc[e.StatusCode]; ok {
+			return errStr
+		}
+		return fmt.Sprintf("unknown error with media and data integrity code 0x%x", e.StatusCode)
+	default:
+		return fmt.Sprintf("unknown error with unknown type 0x%x and code 0x%x", e.StatusCodeType, e.StatusCode)
+	}
+}
diff --git a/metropolis/pkg/nvme/format.go b/metropolis/pkg/nvme/format.go
new file mode 100644
index 0000000..8bde44a
--- /dev/null
+++ b/metropolis/pkg/nvme/format.go
@@ -0,0 +1,75 @@
+package nvme
+
+// SecureEraseType specifies what type of secure erase should be performed
+// by the controller. The zero value requests no secure erase.
+type SecureEraseType uint8
+
+const (
+	// SecureEraseTypeNone specifies that no secure erase operation is
+	// requested.
+	SecureEraseTypeNone SecureEraseType = 0
+	// SecureEraseTypeUserData specifies that all user data should be securely
+	// erased. The controller is allowed to perform a cryptographic erase
+	// instead.
+	SecureEraseTypeUserData SecureEraseType = 1
+	// SecureEraseTypeCryptographic specifies that the encryption key for user
+	// data should be erased. This in turn causes all current user data to
+	// become unreadable.
+	SecureEraseTypeCryptographic SecureEraseType = 2
+)
+
+// ProtectionInformationType selects the type of end-to-end protection tags to
+// use. NVMe supports the same types as T10 DIF (SCSI).
+type ProtectionInformationType uint8
+
+const (
+	ProtectionInformationTypeNone ProtectionInformationType = 0
+	ProtectionInformationType1    ProtectionInformationType = 1
+	ProtectionInformationType2    ProtectionInformationType = 2
+	ProtectionInformationType3    ProtectionInformationType = 3
+)
+
+type FormatRequest struct {
+	// NamespaceID contains the ID of the namespace to format.
+	// NamespaceGlobal formats all namespaces.
+	NamespaceID uint32
+	// SecureEraseSettings specifies the type of secure erase to perform.
+	SecureEraseSettings SecureEraseType
+	// ProtectionInformationLocation selects where protection information is
+	// transmitted. If true, it is transmitted as the first 8 bytes of metadata.
+	// If false, it is transmitted as the last 8 bytes of metadata.
+	ProtectionInformationLocation bool
+	// ProtectionInformation specifies the type of T10 DIF Protection
+	// Information to use.
+	ProtectionInformation ProtectionInformationType
+	// MetadataInline selects whether metadata is transferred as part of an
+	// extended data LBA. If false, metadata is returned in a separate buffer.
+	// If true, metadata is appended to the data buffer.
+	MetadataInline bool
+	// LBAFormat specifies the LBA format to use. This needs to be selected
+	// from the list of supported LBA formats in the Identify response.
+	LBAFormat uint8
+}
+
+// Format performs a low-level format of the NVM media. This is used for
+// changing the block and/or metadata size. This command causes all data
+// on the specified namespace to be lost. By setting SecureEraseSettings
+// to the appropriate value it can also be used to securely erase data.
+// See also the Sanitize command for just wiping the device.
+func (d *Device) Format(req *FormatRequest) error {
+	var cdw10 uint32
+	cdw10 |= uint32(req.SecureEraseSettings&0x7) << 9   // SES, bits 11:09
+	cdw10 |= uint32(req.ProtectionInformation&0x7) << 5 // PI, bits 07:05
+	cdw10 |= uint32(req.LBAFormat & 0xf)                // LBAF, bits 03:00
+	if req.ProtectionInformationLocation {
+		cdw10 |= 1 << 8 // PIL, bit 08
+	}
+	if req.MetadataInline {
+		cdw10 |= 1 << 4 // MSET, bit 04
+	}
+	return d.RawCommand(&Command{
+		Opcode:      0x80,
+		NamespaceID: req.NamespaceID,
+		CDW10:       cdw10,
+	})
+}
diff --git a/metropolis/pkg/nvme/health.go b/metropolis/pkg/nvme/health.go
new file mode 100644
index 0000000..775742f
--- /dev/null
+++ b/metropolis/pkg/nvme/health.go
@@ -0,0 +1,196 @@
+package nvme
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"math/big"
+	"time"
+)
+
+// healthPage represents the raw data from a NVMe Health/SMART page.
+// See Figure 93 in the spec.
+type healthPage struct {
+	CriticalWarning         uint8
+	CompositeTemperature    uint16
+	AvailableSpare          uint8
+	AvailableSpareThreshold uint8
+	PercentageUsed          uint8
+
+	_ [26]byte
+
+	DataUnitsRead               uint128le
+	DataUnitsWritten            uint128le
+	HostReadCommands            uint128le
+	HostWriteCommands           uint128le
+	ControllerBusyTime          uint128le
+	PowerCycles                 uint128le
+	PowerOnHours                uint128le
+	UnsafeSHutdowns             uint128le
+	MediaAndDataIntegrityErrors uint128le
+	ErrorInformationLogEntries  uint128le
+
+	WarningCompositeTemperatureTime  uint32
+	CriticalCompositeTemperatureTime uint32
+
+	TemperatureSensors [8]uint16
+
+	ThermalMgmtTemperature1TransitionCount uint32
+	ThermalMgmtTemperature2TransitionCount uint32
+
+	TotalTimeForThermalMgmtTemperature1 uint32
+	TotalTimeForThermalMgmtTemperature2 uint32
+
+	_ [280]byte // Reserved; pads the struct to the full 512-byte log page.
+}
+
+// HealthInfo contains information related to the health of the NVMe device.
+//
+// Note that some values might be clamped under highly abnormal circumstances
+// as they are reported as 128-bit integers which Go doesn't support.
+// For easier handling values which are very unlikely to exceed 64 bits are
+// exposed as 64 bit integers.
+type HealthInfo struct {
+	// AvailableSpareSpaceCritical is set if the available spare capacity has
+	// fallen below the critical threshold.
+	AvailableSpareSpaceCritical bool
+	// TemperatureCritical is set if a temperature is outside the acceptable
+	// operating thresholds.
+	TemperatureCritical bool
+	// MediaCritical is set if significant media or internal issues affect the
+	// operation of the device.
+	MediaCritical bool
+	// ForcedReadOnly is set if the device is forced into read-only mode due
+	// to an error.
+	ForcedReadOnly bool
+	// VolatileMemoryBackupFailed is set if the volatile memory backup device
+	// has failed.
+	VolatileMemoryBackupFailed bool
+	// CompositeTemperatureKelvin contains a derived value representing the
+	// composite state of controller and namespace/flash temperature.
+	// The exact mechanism used to derive it is vendor-specific.
+	CompositeTemperatureKelvin uint16
+	// AvailableSpare represents the relative amount (0-1) of spare capacity
+	// still unused.
+	AvailableSpare float32
+	// AvailableSpareThreshold represents the vendor-defined threshold which
+	// AvailableSpare should not fall under.
+	AvailableSpareThreshold float32
+	// LifeUsed represents vendor-defined relative estimate of the life of
+	// the device which has been used up. It is allowed to exceed 1 and will
+	// be clamped by the device somewhere between 1.0 and 2.55.
+	LifeUsed float32
+	// BytesRead contains the number of bytes read from the device.
+	// This value is only updated in increments of 512,000 bytes.
+	BytesRead *big.Int
+	// BytesWritten contains the number of bytes written to the device.
+	// This value is only updated in increments of 512,000 bytes.
+	BytesWritten *big.Int
+	// HostReadCommands contains the number of read commands completed by the
+	// controller.
+	HostReadCommands *big.Int
+	// HostWriteCommands contains the number of write commands completed by the
+	// controller.
+	HostWriteCommands *big.Int
+	// ControllerBusyTime contains the cumulative amount of time the controller
+	// has spent being busy (i.e. having at least one command outstanding on an
+	// I/O queue). This value is only updated in 1m increments.
+	ControllerBusyTime time.Duration
+	// PowerCycles contains the number of power cycles.
+	PowerCycles uint64
+	// PowerOnHours contains the number of hours the controller has been
+	// powered on. Depending on the vendor implementation it may or may
+	// not contain time spent in a non-operational power state.
+	PowerOnHours uint64
+	// UnsafeShutdowns contains the number of power loss events without
+	// a prior shutdown notification from the host.
+	UnsafeShutdowns uint64
+	// MediaAndDataIntegrityErrors contains the number of occurrences where the
+	// controller detected an unrecovered data integrity error.
+	MediaAndDataIntegrityErrors uint64
+	// ErrorInformationLogEntriesCount contains the number of Error
+	// Information log entries over the life of the controller.
+	ErrorInformationLogEntriesCount uint64
+	// WarningCompositeTemperatureTime contains the amount of time the
+	// controller is operational while the composite temperature is greater
+	// than the warning composite threshold.
+	WarningCompositeTemperatureTime time.Duration
+	// CriticalCompositeTemperatureTime contains the amount of time the
+	// controller is operational while the composite temperature is greater
+	// than the critical composite threshold.
+	CriticalCompositeTemperatureTime time.Duration
+	// TemperatureSensorValues contains the current temperature in Kelvin as
+	// reported by up to 8 sensors on the device. A value of zero means that
+	// the given sensor is not available.
+	TemperatureSensorValues [8]uint16
+	// ThermalMgmtTemperature1TransitionCount contains the number of times the
+	// controller transitioned to lower power active power states or performed
+	// vendor-specific thermal management actions to reduce temperature.
+	ThermalMgmtTemperature1TransitionCount uint32
+	// ThermalMgmtTemperature2TransitionCount is the same as above, but
+	// for "heavier" thermal management actions including heavy throttling.
+	// The actual difference is vendor-specific.
+	ThermalMgmtTemperature2TransitionCount uint32
+	// TotalTimeForThermalMgmtTemperature1 contains the total time the
+	// controller spent under "light" thermal management.
+	TotalTimeForThermalMgmtTemperature1 time.Duration
+	// TotalTimeForThermalMgmtTemperature2 contains the total time the
+	// controller spent under "heavy" thermal management.
+	TotalTimeForThermalMgmtTemperature2 time.Duration
+}
+
+// HasCriticalWarning returns true if any of the critical warnings
+// (AvailableSpareSpaceCritical, TemperatureCritical, MediaCritical,
+// ForcedReadOnly, VolatileMemoryBackupFailed) are active.
+// If this returns true the NVMe medium has reason to believe that
+// data availability or integrity is endangered.
+func (h *HealthInfo) HasCriticalWarning() bool {
+	return h.AvailableSpareSpaceCritical || h.TemperatureCritical || h.MediaCritical || h.ForcedReadOnly || h.VolatileMemoryBackupFailed
+}
+
+// See Figure 93 Data Units Read
+var dataUnit = big.NewInt(512 * 1000)
+
+const (
+	healthLogPage = 0x02
+)
+
+// GetHealthInfo gets health information from the NVMe device's health log page.
+func (d *Device) GetHealthInfo() (*HealthInfo, error) {
+	var buf [512]byte
+	if err := d.GetLogPage(GlobalNamespace, healthLogPage, 0, 0, buf[:]); err != nil {
+		return nil, fmt.Errorf("unable to get health log page: %w", err)
+	}
+	var page healthPage
+	if err := binary.Read(bytes.NewReader(buf[:]), binary.LittleEndian, &page); err != nil {
+		return nil, fmt.Errorf("unable to decode health log page: %w", err)
+	}
+	var res HealthInfo
+	res.AvailableSpareSpaceCritical = page.CriticalWarning&(1<<0) != 0
+	res.TemperatureCritical = page.CriticalWarning&(1<<1) != 0
+	res.MediaCritical = page.CriticalWarning&(1<<2) != 0
+	res.ForcedReadOnly = page.CriticalWarning&(1<<3) != 0
+	res.VolatileMemoryBackupFailed = page.CriticalWarning&(1<<4) != 0
+	res.CompositeTemperatureKelvin = page.CompositeTemperature
+	res.AvailableSpare = float32(page.AvailableSpare) / 100.
+	res.AvailableSpareThreshold = float32(page.AvailableSpareThreshold) / 100.
+	res.LifeUsed = float32(page.PercentageUsed) / 100.
+	res.BytesRead = new(big.Int).Mul(page.DataUnitsRead.BigInt(), dataUnit)
+	res.BytesWritten = new(big.Int).Mul(page.DataUnitsWritten.BigInt(), dataUnit)
+	res.HostReadCommands = page.HostReadCommands.BigInt()
+	res.HostWriteCommands = page.HostWriteCommands.BigInt()
+	res.ControllerBusyTime = time.Duration(page.ControllerBusyTime.Uint64()) * time.Minute
+	res.PowerCycles = page.PowerCycles.Uint64()
+	res.PowerOnHours = page.PowerOnHours.Uint64()
+	res.UnsafeShutdowns = page.UnsafeSHutdowns.Uint64()
+	res.MediaAndDataIntegrityErrors = page.MediaAndDataIntegrityErrors.Uint64()
+	res.ErrorInformationLogEntriesCount = page.ErrorInformationLogEntries.Uint64()
+	res.WarningCompositeTemperatureTime = time.Duration(page.WarningCompositeTemperatureTime) * time.Minute
+	res.CriticalCompositeTemperatureTime = time.Duration(page.CriticalCompositeTemperatureTime) * time.Minute
+	res.TemperatureSensorValues = page.TemperatureSensors
+	res.ThermalMgmtTemperature1TransitionCount = page.ThermalMgmtTemperature1TransitionCount
+	res.ThermalMgmtTemperature2TransitionCount = page.ThermalMgmtTemperature2TransitionCount
+	res.TotalTimeForThermalMgmtTemperature1 = time.Duration(page.TotalTimeForThermalMgmtTemperature1) * time.Second
+	res.TotalTimeForThermalMgmtTemperature2 = time.Duration(page.TotalTimeForThermalMgmtTemperature2) * time.Second
+	return &res, nil
+}
diff --git a/metropolis/pkg/nvme/identify.go b/metropolis/pkg/nvme/identify.go
new file mode 100644
index 0000000..218d089
--- /dev/null
+++ b/metropolis/pkg/nvme/identify.go
@@ -0,0 +1,193 @@
+package nvme
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"math/big"
+)
+
+// Figure 109
+type identifyData struct {
+	// Controller Capabilities and Features
+	PCIVendorID                 uint16
+	PCISubsystemVendorID        uint16
+	SerialNumber                [20]byte
+	ModelNumber                 [40]byte
+	FirmwareRevision            [8]byte
+	RecommendedArbitrationBurst uint8
+	IEEEOUI                     [3]byte
+	CMIC                        uint8
+	MaximumDataTransferSize     uint8
+	ControllerID                uint16
+	Version                     uint32
+	RuntimeD3ResumeLatency      uint32
+	RuntimeD3EntryLatency       uint32
+	OAES                        uint32
+	CTRATT                      uint32
+	_                           [12]byte
+	FRUGUID                     [16]byte
+	_                           [128]byte
+	// Admin Command Set Attributes & Optional Controller Capabilities
+	OACS                                uint16
+	AbortCommandLimit                   uint8
+	AsynchronousEventRequestLimit       uint8
+	FRMW                                uint8
+	LPA                                 uint8
+	ErrorLogPageEntries                 uint8
+	NumberOfPowerStatesSupport          uint8
+	AdminVendorSpecificCmdConfig        uint8
+	AutonomousPowerStateTransitionAttrs uint8
+	WarningCompositeTempThreshold       uint16
+	CriticalCompositeTempThreshold      uint16
+	MaximumTimeForFirmwareActivation    uint16
+	HostMemoryBufferPreferredSize       uint32
+	HostMemoryBufferMinimumSize         uint32
+	TotalNVMCapacity                    uint128le
+	UnallocatedNVMCapacity              uint128le
+	ReplyProtectedMemoryBlockSupport    uint32
+	ExtendedDeviceSelfTestTime          uint16
+	DeviceSelfTestOptions               uint8
+	FirmwareUpdateGranularity           uint8
+	KeepAliveSupport                    uint16
+	HostControlledThermalMgmtAttrs      uint16
+	MinimumThermalMgmntTemp             uint16
+	MaximumThermalMgmntTemp             uint16
+	SanitizeCapabilities                uint32
+	_                                   [180]byte
+	// NVM Command Set Attributes
+	SubmissionQueueEntrySize       uint8
+	CompletionQueueEntrySize       uint8
+	MaximumOutstandingCommands     uint16
+	NumberOfNamespaces             uint32
+	OptionalNVMCommandSupport      uint16
+	FusedOperationSupport          uint16
+	FormatNVMAttributes            uint8
+	VolatileWriteCache             uint8
+	AtomicWriteUnitNormal          uint16
+	AtomicWriteUnitPowerFail       uint16
+	NVMVendorSepcificCommandConfig uint8
+	AtomicCompareAndWriteUnit      uint16
+	_                              [2]byte
+	SGLSupport                     uint32
+	_                              [228]byte
+	NVMSubsystemNVMeQualifiedName  [256]byte
+	_                              [1024]byte
+	// Power State Descriptors
+	PowerStateDescriptors [32][32]byte
+}
+
+// IdentifyData contains various identifying information about a NVMe
+// controller. Because the actual data structure is very large, currently not
+// all fields are exposed as properly-typed individual fields. If you need
+// a new field, please add it to this structure.
+type IdentifyData struct {
+	// PCIVendorID contains the company vendor identifier assigned by the PCI
+	// SIG.
+	PCIVendorID uint16
+	// PCISubsystemVendorID contains the company vendor identifier that is
+	// assigned by the PCI SIG for the subsystem.
+	PCISubsystemVendorID uint16
+	// SerialNumber contains the serial number for the NVM subsystem that is
+	// assigned by the vendor.
+	SerialNumber string
+	// ModelNumber contains the model number for the NVM subsystem that is
+	// assigned by the vendor.
+	ModelNumber string
+	// FirmwareRevision contains the currently active firmware revision for the
+	// NVM subsystem.
+	FirmwareRevision string
+	// IEEEOUI contains the Organization Unique Identifier for the controller
+	// vendor as assigned by the IEEE.
+	IEEEOUI [3]byte
+
+	// IsPCIVirtualFunction indicates if the controller is a virtual controller
+	// as part of a PCI virtual function.
+	IsPCIVirtualFunction bool
+
+	// SpecVersionMajor/Minor contain the version of the NVMe specification the
+	// controller supports. Only mandatory from spec version 1.2 onwards.
+	SpecVersionMajor uint16
+	SpecVersionMinor uint8
+
+	// FRUGloballyUniqueIdentifier contains a 128-bit value that is globally
+	// unique for a given Field Replaceable Unit (FRU). Contains all-zeroes if
+	// unavailable.
+	FRUGloballyUniqueIdentifier [16]byte
+	// VirtualizationManagementSupported indicates if the controller
+	// supports the Virtualization Management command.
+	VirtualizationManagementSupported bool
+	// NVMeMISupported indicates if the controller supports the NVMe-MI
+	// Send and Receive commands.
+	NVMeMISupported bool
+	// DirectivesSupported indicates if the controller supports the
+	// Directive Send and Receive commands.
+	DirectivesSupported bool
+	// SelfTestSupported indicates if the controller supports the Device Self-
+	// test command.
+	SelfTestSupported bool
+	// NamespaceManagementSupported indicates if the controller supports the
+	// Namespace Management and Attachment commands.
+	NamespaceManagementSupported bool
+	// FirmwareUpdateSupported indicates if the controller supports the
+	// Firmware Commit and Image Download commands.
+	FirmwareUpdateSupported bool
+	// FormattingSupported indicates if the controller supports the Format
+	// command.
+	FormattingSupported bool
+	// SecuritySupported indicates if the controller supports the Security Send
+	// and Receive commands.
+	SecuritySupported bool
+
+	// TotalNVMCapacity contains the total NVM capacity in bytes in the NVM
+	// subsystem. This can be 0 on devices without NamespaceManagementSupported.
+	TotalNVMCapacity *big.Int
+	// UnallocatedNVMCapacity contains the unallocated NVM capacity in bytes in
+	// the NVM subsystem. This can be 0 on devices without
+	// NamespaceManagementSupported.
+	UnallocatedNVMCapacity *big.Int
+
+	// MaximumNumberOfNamespaces defines the maximum number of namespaces
+	// supported by the controller.
+	MaximumNumberOfNamespaces uint32
+}
+
+// Identify sends an Identify Controller command and decodes the response.
+func (d *Device) Identify() (*IdentifyData, error) {
+	var resp [4096]byte
+	if err := d.RawCommand(&Command{
+		Opcode: 0x06,
+		Data:   resp[:],
+		CDW10:  1,
+	}); err != nil {
+		return nil, fmt.Errorf("Identify command failed: %w", err)
+	}
+	var raw identifyData
+	if err := binary.Read(bytes.NewReader(resp[:]), binary.LittleEndian, &raw); err != nil {
+		return nil, fmt.Errorf("unable to decode Identify response: %w", err)
+	}
+	var res IdentifyData
+	res.PCIVendorID = raw.PCIVendorID
+	res.PCISubsystemVendorID = raw.PCISubsystemVendorID
+	res.SerialNumber = string(bytes.TrimRight(raw.SerialNumber[:], " "))
+	res.ModelNumber = string(bytes.TrimRight(raw.ModelNumber[:], " "))
+	res.FirmwareRevision = string(bytes.TrimRight(raw.FirmwareRevision[:], " "))
+	// OUIs are traditionally big-endian, but NVMe exposes them in little-endian
+	res.IEEEOUI[0], res.IEEEOUI[1], res.IEEEOUI[2] = raw.IEEEOUI[2], raw.IEEEOUI[1], raw.IEEEOUI[0]
+	res.IsPCIVirtualFunction = raw.CMIC&(1<<2) != 0
+	res.SpecVersionMajor = uint16(raw.Version >> 16)
+	res.SpecVersionMinor = uint8((raw.Version >> 8) & 0xFF)
+	res.FRUGloballyUniqueIdentifier = raw.FRUGUID
+	res.VirtualizationManagementSupported = raw.OACS&(1<<7) != 0
+	res.NVMeMISupported = raw.OACS&(1<<6) != 0
+	res.DirectivesSupported = raw.OACS&(1<<5) != 0
+	res.SelfTestSupported = raw.OACS&(1<<4) != 0
+	res.NamespaceManagementSupported = raw.OACS&(1<<3) != 0
+	res.FirmwareUpdateSupported = raw.OACS&(1<<2) != 0
+	res.FormattingSupported = raw.OACS&(1<<1) != 0
+	res.SecuritySupported = raw.OACS&(1<<0) != 0
+	res.TotalNVMCapacity = raw.TotalNVMCapacity.BigInt()
+	res.UnallocatedNVMCapacity = raw.UnallocatedNVMCapacity.BigInt()
+	res.MaximumNumberOfNamespaces = raw.NumberOfNamespaces
+	return &res, nil
+}
diff --git a/metropolis/pkg/nvme/nvme.go b/metropolis/pkg/nvme/nvme.go
new file mode 100644
index 0000000..f46546d
--- /dev/null
+++ b/metropolis/pkg/nvme/nvme.go
@@ -0,0 +1,79 @@
+// Package nvme provides methods and data structures for issuing commands to
+// devices speaking the NVMe protocol.
+// This package is written against the NVMe Specification Revision 1.3 and
+// all references to figures or other parts of the spec refer to this version.
+package nvme
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"syscall"
+	"time"
+)
+
+// Device is a handle for an NVMe device.
+type Device struct {
+	fd syscall.Conn // underlying system handle; an *os.File when created by Open
+}
+
+// NewFromFd wraps an existing system handle in an NVMe device handle. The
+// returned error is currently always nil, as there is no good way to check
+// that fd really refers to an NVMe device; it exists so that callers are
+// already prepared for such validation being added in the future.
+func NewFromFd(fd syscall.Conn) (*Device, error) {
+	dev := Device{fd: fd}
+	return &dev, nil
+}
+
+// Open returns a handle for the NVMe device node at path (like /dev/nvme0).
+func Open(path string) (*Device, error) {
+	devFile, openErr := os.Open(path)
+	if openErr != nil {
+		return nil, fmt.Errorf("unable to open path: %w", openErr)
+	}
+	return NewFromFd(devFile)
+}
+
+// Close closes the underlying file if the handle wraps an *os.File, which is
+// the case for every handle created by Open. For any other kind of handle
+// passed to NewFromFd an error is returned; close that one yourself.
+func (d *Device) Close() error {
+	file, isFile := d.fd.(*os.File)
+	if !isFile {
+		return errors.New("unable to close device not opened via Open, please close it yourself")
+	}
+	return file.Close()
+}
+
+const (
+	// GlobalNamespace is the namespace ID to pass for operations that do
+	// not target one specific namespace.
+	GlobalNamespace = 0xffffffff
+)
+
+// Command is a generic NVMe command for RawCommand. Only use this if the
+// command you need is not already wrapped by this library.
+type Command struct {
+	Opcode                                   uint8 // command opcode, e.g. 0x06 (Identify) or 0x02 (Get Log Page)
+	Flags                                    uint8
+	NamespaceID                              uint32
+	CDW2, CDW3                               uint32
+	Metadata                                 []byte
+	Data                                     []byte // data buffer; Identify and GetLogPage receive their response in it
+	CDW10, CDW11, CDW12, CDW13, CDW14, CDW15 uint32
+	Timeout                                  time.Duration
+}
+
+// GetLogPage fills pageBuf (>= 4 bytes) from the given log page and offset.
+func (d *Device) GetLogPage(ns uint32, logPageIdentifier uint8, logSpecificField uint8, logPageOffset uint64, pageBuf []byte) error {
+	if len(pageBuf) < 4 {
+		return errors.New("log page buffer must hold at least one dword (4 bytes)")
+	}
+	numd := uint32(len(pageBuf)/4 - 1) // NUMDL/NUMDU form a 0's based dword count
+	return d.RawCommand(&Command{
+		Opcode: 0x02, NamespaceID: ns, Data: pageBuf,
+		CDW10:  uint32(logPageIdentifier) | uint32(logSpecificField&0xF)<<8 | numd<<16, // TODO: RAE
+		CDW11:  numd >> 16, CDW12: uint32(logPageOffset), CDW13: uint32(logPageOffset >> 32),
+	})
+}
diff --git a/metropolis/pkg/nvme/selftest.go b/metropolis/pkg/nvme/selftest.go
new file mode 100644
index 0000000..8f46995
--- /dev/null
+++ b/metropolis/pkg/nvme/selftest.go
@@ -0,0 +1,96 @@
+package nvme
+
+import (
+	"bytes"
+	"encoding/binary"
+)
+
+type SelfTestOp uint8 // device self-test operation code
+
+const (
+	SelfTestNone     SelfTestOp = 0x0 // no self-test in progress (reported as CurrentOp)
+	SelfTestShort    SelfTestOp = 0x1 // short self-test
+	SelfTestExtended SelfTestOp = 0x2 // extended self-test
+	SelfTestAbort    SelfTestOp = 0xF // abort a running self-test
+)
+
+// StartSelfTest sends a Device Self-test command (opcode 0x14) for the
+// namespace ns. action selects what to do; only its low four bits are put
+// into the command.
+func (d *Device) StartSelfTest(ns uint32, action SelfTestOp) error {
+	cmd := Command{Opcode: 0x14, NamespaceID: ns, CDW10: uint32(action & 0xF)}
+	return d.RawCommand(&cmd)
+}
+
+// selfTestResult is the raw 28-byte self-test result entry (Figure 99).
+type selfTestResult struct {
+	SelfTestStatus             uint8 // bits 7:4 self-test type, bits 3:0 result
+	SegmentNumber              uint8
+	ValidDiagnosticInformation uint8
+	_                          byte
+	PowerOnHours               uint64
+	NamespaceID                uint32
+	FailingLBA                 uint64
+	StatusCodeType             uint8
+	StatusCode                 uint8
+	VendorSpecific             [2]byte
+}
+
+// selfTestLogPage is the raw 564-byte device self-test log page (Figure 98).
+type selfTestLogPage struct {
+	CurrentSelfTestOp         uint8 // bits 3:0 hold the operation in progress
+	CurrentSelfTestCompletion uint8 // bits 6:0 hold the completion percentage
+	_                         [2]byte
+	SelfTestResults           [20]selfTestResult
+}
+
+// SelfTestResult is one decoded entry of the device self-test log.
+type SelfTestResult struct {
+	Op            SelfTestOp // self-test type this entry belongs to
+	Result        uint8      // low 4 bits of the raw status byte
+	SegmentNumber uint8
+	PowerOnHours  uint64
+	NamespaceID   uint32
+	FailingLBA    uint64
+	Error         Error
+}
+
+// SelfTestResults is the decoded log page returned by GetSelfTestResults.
+type SelfTestResults struct {
+	// CurrentOp is the self-test currently in progress, or SelfTestNone.
+	CurrentOp SelfTestOp
+	// CurrentSelfTestCompletion contains the progress from 0 to 1 of the
+	// in-progress self-test. Only valid if CurrentOp is not SelfTestNone.
+	CurrentSelfTestCompletion float32
+	// PastResults contains a list of up to 20 previous self test results,
+	// sorted from the most recent to the oldest.
+	PastResults []SelfTestResult
+}
+
+// GetSelfTestResults reads and decodes the device self-test log (page 0x06).
+func (d *Device) GetSelfTestResults(ns uint32) (*SelfTestResults, error) {
+	var buf [564]byte
+	if err := d.GetLogPage(ns, 0x06, 0, 0, buf[:]); err != nil {
+		return nil, err
+	}
+	var page selfTestLogPage
+	if err := binary.Read(bytes.NewReader(buf[:]), binary.LittleEndian, &page); err != nil {
+		return nil, err
+	}
+	var res SelfTestResults
+	res.CurrentOp = SelfTestOp(page.CurrentSelfTestOp & 0xF)
+	res.CurrentSelfTestCompletion = float32(page.CurrentSelfTestCompletion&0x7F) / 100.
+	for _, r := range page.SelfTestResults {
+		t := SelfTestResult{Op: SelfTestOp(r.SelfTestStatus >> 4), Result: r.SelfTestStatus & 0xF}
+		if t.Result == 0xF { // 0xF marks a log entry that holds no result
+			continue
+		}
+		t.SegmentNumber = r.SegmentNumber
+		t.PowerOnHours = r.PowerOnHours
+		t.NamespaceID = r.NamespaceID
+		t.FailingLBA = r.FailingLBA
+		t.Error.StatusCode, t.Error.StatusCodeType = r.StatusCode, r.StatusCodeType
+		res.PastResults = append(res.PastResults, t)
+	}
+	return &res, nil
+}
diff --git a/metropolis/pkg/nvme/struct_test.go b/metropolis/pkg/nvme/struct_test.go
new file mode 100644
index 0000000..b26a48e
--- /dev/null
+++ b/metropolis/pkg/nvme/struct_test.go
@@ -0,0 +1,15 @@
+package nvme
+
+import (
+	"encoding/binary"
+	"testing"
+)
+
+// TestStruct tests if the struct passed to Linux's ioctl has the ABI-specified
+// size.
+func TestStruct(t *testing.T) {
+	passthruCmdSize := binary.Size(passthruCmd{})
+	if passthruCmdSize != 72 {
+		t.Errorf("passthroughCmd is %d bytes, expected 72", passthruCmdSize)
+	}
+}
diff --git a/metropolis/pkg/nvme/uint128le.go b/metropolis/pkg/nvme/uint128le.go
new file mode 100644
index 0000000..a25adb7
--- /dev/null
+++ b/metropolis/pkg/nvme/uint128le.go
@@ -0,0 +1,29 @@
+package nvme
+
+import (
+	"math"
+	"math/big"
+)
+
+// uint128le is a little-endian uint128 stored as two uint64 halves (low
+// half first) so that binary.Read can decode it. It is an auxiliary type to
+// simplify structures with uint128s (of which NVMe has quite a few).
+type uint128le struct {
+	Lo, Hi uint64
+}
+
+// BigInt converts u into an arbitrary-precision integer (Hi<<64 | Lo).
+func (u uint128le) BigInt() *big.Int {
+	hi := new(big.Int).SetUint64(u.Hi)
+	lo := new(big.Int).SetUint64(u.Lo)
+	hi.Lsh(hi, 64)
+	return hi.Or(hi, lo)
+}
+
+// Uint64 returns u as a uint64, clamped to math.MaxUint64 on overflow.
+func (u uint128le) Uint64() uint64 {
+	if u.Hi == 0 {
+		return u.Lo
+	}
+	return math.MaxUint64
+}