pkg/nvme: add NVMe package
This adds an NVMe package for performing various low-level operations on
NVMe devices. Only the calls most important to us are implemented, as
NVMe has a vast API surface.
Change-Id: I532894c3c2eb780309993a1688226c92c91cdedf
Reviewed-on: https://review.monogon.dev/c/monogon/+/999
Reviewed-by: Mateusz Zalega <mateusz@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/pkg/nvme/BUILD.bazel b/metropolis/pkg/nvme/BUILD.bazel
new file mode 100644
index 0000000..6a2438b
--- /dev/null
+++ b/metropolis/pkg/nvme/BUILD.bazel
@@ -0,0 +1,33 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+go_library(
+ name = "nvme",
+ srcs = [
+ "cmd_linux.go",
+ "cmd_unsupported.go",
+ "error.go",
+ "format.go",
+ "health.go",
+ "identify.go",
+ "nvme.go",
+ "selftest.go",
+ "uint128le.go",
+ ],
+ importpath = "source.monogon.dev/metropolis/pkg/nvme",
+ visibility = ["//visibility:public"],
+ deps = select({
+ "@io_bazel_rules_go//go/platform:android": [
+ "@org_golang_x_sys//unix",
+ ],
+ "@io_bazel_rules_go//go/platform:linux": [
+ "@org_golang_x_sys//unix",
+ ],
+ "//conditions:default": [],
+ }),
+)
+
+go_test(
+ name = "nvme_test",
+ srcs = ["struct_test.go"],
+ embed = [":nvme"],
+)
diff --git a/metropolis/pkg/nvme/cmd_linux.go b/metropolis/pkg/nvme/cmd_linux.go
new file mode 100644
index 0000000..e4353cc
--- /dev/null
+++ b/metropolis/pkg/nvme/cmd_linux.go
@@ -0,0 +1,117 @@
+//go:build linux
+
+package nvme
+
+import (
+ "errors"
+ "fmt"
+ "math"
+ "runtime"
+ "unsafe"
+
+ "golang.org/x/sys/unix"
+)
+
+// From @linux//include/uapi/linux/nvme_ioctl.h
+const (
+ nvmeIoctlAdminCmd = 0xC0484E41 // _IOWR('N', 0x41, sizeof cmd)
+)
+
+// passthruCmd is the argument of the NVMe admin command ioctl. Its layout comes from @linux//include/uapi/linux/nvme_ioctl.h
+type passthruCmd struct {
+ // Corresponding to Figure 88
+ opcode uint8
+ flags uint8
+ rsvd1 uint16
+ nsid uint32
+ cdw2 uint32
+ cdw3 uint32
+ metadata uint64 // address of the metadata buffer, left zero if there is none
+ addr uint64 // address of the data buffer, left zero if there is none
+ metadataLen uint32 // length of the metadata buffer in bytes
+ dataLen uint32 // length of the data buffer in bytes
+ cdw10 uint32
+ cdw11 uint32
+ cdw12 uint32
+ cdw13 uint32
+ cdw14 uint32
+ cdw15 uint32
+
+ // Linux ioctl-specific
+ timeoutMs uint32 // command timeout in milliseconds
+ result uint32 // NOTE(review): presumably filled in by the kernel; not read by this package - confirm
+}
+
+// RawCommand runs a raw command on the NVMe device via the admin ioctl. It
+// returns an Error if the device reports a non-success status. Depending on
+// the payload this can be very dangerous (data loss, firmware issues).
+func (d *Device) RawCommand(cmd *Command) error {
+	conn, err := d.fd.SyscallConn()
+	if err != nil {
+		return fmt.Errorf("unable to get RawConn: %w", err)
+	}
+	cmdRaw := passthruCmd{
+		opcode:    cmd.Opcode,
+		flags:     cmd.Flags,
+		nsid:      cmd.NamespaceID,
+		cdw2:      cmd.CDW2,
+		cdw3:      cmd.CDW3,
+		cdw10:     cmd.CDW10,
+		cdw11:     cmd.CDW11,
+		cdw12:     cmd.CDW12,
+		cdw13:     cmd.CDW13,
+		cdw14:     cmd.CDW14,
+		cdw15:     cmd.CDW15,
+		timeoutMs: uint32(cmd.Timeout.Milliseconds()),
+	}
+	// NOTE: Storing the buffer addresses as integers is currently safe (even
+	// if the unsafe documentation says otherwise) because the runtime.KeepAlive
+	// calls below keep the data and metadata buffers reachable and Go's GC
+	// does not move heap objects, so the addresses stay valid while the
+	// buffers are alive. Should Go ever introduce a moving GC, a move during
+	// the syscall would let the kernel overwrite unrelated memory of this
+	// process. The buffers would then need to be pinned, but Go had no
+	// pinning API when this was written [1], so this is only noted here.
+	// [1] https://github.com/golang/go/issues/46787
+	// Empty buffers are treated like nil ones, as &buf[0] would panic on them.
+	if len(cmd.Data) > 0 {
+		if uint64(len(cmd.Data)) > math.MaxUint32 {
+			return errors.New("data buffer larger than uint32, this is unsupported")
+		}
+		cmdRaw.dataLen = uint32(len(cmd.Data))
+		cmdRaw.addr = uint64(uintptr(unsafe.Pointer(&cmd.Data[0])))
+	}
+	if len(cmd.Metadata) > 0 {
+		if uint64(len(cmd.Metadata)) > math.MaxUint32 {
+			return errors.New("metadata buffer larger than uint32, this is unsupported")
+		}
+		cmdRaw.metadataLen = uint32(len(cmd.Metadata))
+		cmdRaw.metadata = uint64(uintptr(unsafe.Pointer(&cmd.Metadata[0])))
+	}
+	var errno unix.Errno
+	var status uintptr
+	err = conn.Control(func(fd uintptr) {
+		status, _, errno = unix.Syscall(unix.SYS_IOCTL, fd, nvmeIoctlAdminCmd, uintptr(unsafe.Pointer(&cmdRaw)))
+	})
+	runtime.KeepAlive(cmdRaw)
+	runtime.KeepAlive(cmd.Data)
+	runtime.KeepAlive(cmd.Metadata)
+	if err != nil {
+		return fmt.Errorf("unable to get fd: %w", err)
+	}
+	if errno != 0 {
+		return errno
+	}
+	// status is the Status Field (Figure 29) shifted down so bit 17 becomes bit 0.
+	var commandErr Error
+	commandErr.DoNotRetry = status&(1<<14) != 0            // Bit 31
+	commandErr.More = status&(1<<13) != 0                  // Bit 30
+	commandErr.StatusCodeType = uint8((status >> 8) & 0x7) // Bits 27:25
+	commandErr.StatusCode = uint8(status & 0xff)           // Bits 24:17
+	// The only success status is in the generic status code set with value 0
+	if commandErr.StatusCodeType != StatusCodeTypeGeneric ||
+		commandErr.StatusCode != 0 {
+		return commandErr
+	}
+	return nil
+}
diff --git a/metropolis/pkg/nvme/cmd_unsupported.go b/metropolis/pkg/nvme/cmd_unsupported.go
new file mode 100644
index 0000000..747a33d
--- /dev/null
+++ b/metropolis/pkg/nvme/cmd_unsupported.go
@@ -0,0 +1,12 @@
+//go:build !linux
+
+package nvme
+
+import (
+ "fmt"
+ "runtime"
+)
+
+func (d *Device) RawCommand(cmd *Command) error {
+ return fmt.Errorf("NVMe command interface unimplemented for %v", runtime.GOOS)
+}
diff --git a/metropolis/pkg/nvme/error.go b/metropolis/pkg/nvme/error.go
new file mode 100644
index 0000000..8c4a207
--- /dev/null
+++ b/metropolis/pkg/nvme/error.go
@@ -0,0 +1,136 @@
+package nvme
+
+import "fmt"
+
+// genericStatusCodeDesc describes the generic command status codes, see Figure 31 in the spec
+var genericStatusCodeDesc = map[uint8]string{
+	0x00: "successful completion",
+	0x01: "invalid command opcode",
+	0x02: "invalid field in command",
+	0x03: "command ID conflict",
+	0x04: "data transfer error",
+	0x05: "command aborted due to power loss notification",
+	0x06: "internal error",
+	0x07: "command abort requested",
+	0x08: "command abort due to SQ deletion",
+	0x09: "command abort due to failed fused command",
+	0x0a: "command abort due to missing fused command",
+	0x0b: "invalid namespace or format",
+	0x0c: "command sequence error",
+	0x0d: "invalid SGL segment descriptor",
+	0x0e: "invalid number of SGL descriptors",
+	0x0f: "data SGL length invalid",
+	0x10: "metadata SGL length invalid",
+	0x11: "SGL descriptor type invalid",
+	0x12: "invalid use of controller memory buffer",
+	0x13: "PRP offset invalid",
+	0x14: "atomic write unit exceeded",
+	0x15: "operation denied",
+	0x16: "SGL offset invalid",
+	0x18: "host identifier inconsistent format",
+	0x19: "keep alive timeout expired",
+	0x1a: "keep alive timeout invalid",
+	0x1b: "command aborted due to preempt and abort",
+	0x1c: "sanitize failed",
+	0x1d: "sanitize in progress",
+	0x1e: "SGL data block granularity invalid",
+	0x1f: "command not supported for queue in CMB",
+
+	// Figure 32
+	0x80: "LBA out of range",
+	0x81: "capacity exceeded",
+	0x82: "namespace not ready",
+	0x83: "reservation conflict",
+	0x84: "format in progress",
+}
+
+// Figure 33 in the spec
+var commandSpecificStatusCodeDesc = map[uint8]string{
+ 0x00: "completion queue invalid",
+ 0x01: "invalid queue identifier",
+ 0x02: "invalid queue size",
+ 0x03: "abort command limit exceeded",
+ 0x05: "asynchronous event request limit exceeded",
+ 0x06: "invalid firmware slot",
+ 0x07: "invalid firmware image",
+ 0x08: "invalid interrupt vector",
+ 0x09: "invalid log page",
+ 0x0a: "invalid format",
+ 0x0b: "firmware activation requires conventional reset",
+ 0x0c: "invalid queue deletion",
+ 0x0d: "feature identifier not saveable",
+ 0x0e: "feature not changeable",
+ 0x0f: "feature not namespace-specific",
+ 0x10: "firmware activation requires NVM subsystem reset",
+ 0x11: "firmware activation requires reset",
+ 0x12: "firmware activation requires maximum time violation",
+ 0x13: "firmware activation prohibited",
+ 0x14: "overlapping range",
+ 0x15: "namespace insufficient capacity",
+ 0x16: "namespace identifier unavailable",
+ 0x18: "namespace already attached",
+ 0x19: "namespace is private",
+ 0x1a: "namespace is not attached",
+ 0x1b: "thin provisioning not supported",
+ 0x1c: "controller list invalid",
+ 0x1d: "device self-test in progress",
+ 0x1e: "boot partition write prohibited",
+ 0x1f: "invalid controller identifier",
+ 0x20: "invalid secondary controller state",
+ 0x21: "invalid number of controller resources",
+ 0x22: "invalid resource identifier",
+
+ // Figure 34
+ 0x80: "conflicting attributes",
+ 0x81: "invalid protection information",
+ 0x82: "attempted to write to read-only range",
+}
+
+// Figure 36
+var mediaAndDataIntegrityStatusCodeDesc = map[uint8]string{
+ 0x80: "write fault",
+ 0x81: "unrecovered read error",
+ 0x82: "end-to-end guard check error",
+ 0x83: "end-to-end application tag check error",
+ 0x84: "end-to-end reference tag check error",
+ 0x85: "compare failure",
+ 0x86: "access denied",
+ 0x87: "deallocated or unwritten logical block",
+}
+
+const (
+ StatusCodeTypeGeneric = 0x0
+ StatusCodeTypeCommandSpecific = 0x1
+ StatusCodeTypeMediaAndDataIntegrity = 0x2
+)
+
+// Error represents an error returned by the NVMe device in the form of a
+// NVMe Status Field (see also Figure 29 in the spec).
+type Error struct {
+ DoNotRetry bool // Do Not Retry bit of the status field
+ More bool // More bit of the status field
+ StatusCodeType uint8 // selects the status code set, see the StatusCodeType* constants
+ StatusCode uint8 // status code, interpreted within the set chosen by StatusCodeType
+}
+
+func (e Error) Error() string {
+ switch e.StatusCodeType {
+ case StatusCodeTypeGeneric:
+ if errStr, ok := genericStatusCodeDesc[e.StatusCode]; ok {
+ return errStr
+ }
+ return fmt.Sprintf("unknown error with generic code 0x%x", e.StatusCode)
+ case StatusCodeTypeCommandSpecific:
+ if errStr, ok := commandSpecificStatusCodeDesc[e.StatusCode]; ok {
+ return errStr
+ }
+ return fmt.Sprintf("unknown error with command-specific code 0x%x", e.StatusCode)
+ case StatusCodeTypeMediaAndDataIntegrity:
+ if errStr, ok := mediaAndDataIntegrityStatusCodeDesc[e.StatusCode]; ok {
+ return errStr
+ }
+ return fmt.Sprintf("unknown error with media and data integrity code 0x%x", e.StatusCode)
+ default:
+ return fmt.Sprintf("unknown error with unknown type 0x%x and code 0x%x", e.StatusCodeType, e.StatusCode)
+ }
+}
diff --git a/metropolis/pkg/nvme/format.go b/metropolis/pkg/nvme/format.go
new file mode 100644
index 0000000..8bde44a
--- /dev/null
+++ b/metropolis/pkg/nvme/format.go
@@ -0,0 +1,75 @@
+package nvme
+
+// SecureEraseType specifies what type of secure erase should be performed by
+// the controller. The zero value requests no secure erase.
+type SecureEraseType uint8
+
+const (
+ // SecureEraseTypeNone specifies that no secure erase operation is
+ // requested.
+ SecureEraseTypeNone SecureEraseType = 0
+ // SecureEraseTypeUserData specifies that all user data should be securely
+ // erased. The controller is allowed to perform a cryptographic erase
+ // instead.
+ SecureEraseTypeUserData SecureEraseType = 1
+ // SecureEraseTypeCryptographic specifies that the encryption key for user
+ // data should be erased. This in turn causes all current user data to
+ // become unreadable.
+ SecureEraseTypeCryptographic SecureEraseType = 2
+)
+
+// ProtectionInformationType selects the type of end-to-end protection tags to
+// use. NVMe supports the same types as T10 DIF (SCSI).
+type ProtectionInformationType uint8
+
+const (
+ ProtectionInformationTypeNone ProtectionInformationType = 0
+ ProtectionInformationType1 ProtectionInformationType = 1
+ ProtectionInformationType2 ProtectionInformationType = 2
+ ProtectionInformationType3 ProtectionInformationType = 3
+)
+
+// FormatRequest contains the parameters of a Format operation.
+type FormatRequest struct {
+	// NamespaceID contains the ID of the namespace to format.
+	// GlobalNamespace formats all namespaces.
+	NamespaceID uint32
+	// SecureEraseSettings specifies the type of secure erase to perform.
+	SecureEraseSettings SecureEraseType
+	// ProtectionInformationLocation selects where protection information is
+	// transmitted. If true, it is transmitted as the first 8 bytes of metadata.
+	// If false, it is transmitted as the last 8 bytes of metadata.
+	ProtectionInformationLocation bool
+	// ProtectionInformation specifies the type of T10 DIF Protection
+	// Information to use.
+	ProtectionInformation ProtectionInformationType
+	// MetadataInline selects whether metadata is transferred as part of an
+	// extended data LBA. If false, metadata is returned in a separate buffer.
+	// If true, metadata is appended to the data buffer.
+	MetadataInline bool
+	// LBAFormat specifies the LBA format to use. This needs to be selected
+	// from the list of supported LBA formats in the Identify response.
+	LBAFormat uint8
+}
+
+// Format performs a low-level format of the NVM media. This is used for
+// changing the block and/or metadata size. This command causes all data
+// on the specified namespace to be lost. By setting SecureEraseSettings
+// to the appropriate value it can also be used to securely erase data.
+// See also the Sanitize command for just wiping the device.
+func (d *Device) Format(req *FormatRequest) error {
+	cdw10 := uint32(req.SecureEraseSettings&0x7)<<9 | // SES, bits 11:09
+		uint32(req.ProtectionInformation&0x7)<<5 | // PI, bits 07:05
+		uint32(req.LBAFormat&0xF) // LBAF, bits 03:00 (a four-bit field)
+	if req.ProtectionInformationLocation {
+		cdw10 |= 1 << 8 // PIL
+	}
+	if req.MetadataInline {
+		cdw10 |= 1 << 4 // MSET
+	}
+	return d.RawCommand(&Command{
+		Opcode:      0x80,
+		NamespaceID: req.NamespaceID,
+		CDW10:       cdw10,
+	})
+}
diff --git a/metropolis/pkg/nvme/health.go b/metropolis/pkg/nvme/health.go
new file mode 100644
index 0000000..775742f
--- /dev/null
+++ b/metropolis/pkg/nvme/health.go
@@ -0,0 +1,196 @@
+package nvme
+
+import (
+ "bytes"
+ "encoding/binary"
+ "fmt"
+ "math/big"
+ "time"
+)
+
+// healthPage represents the raw data from a NVMe Health/SMART page.
+// See Figure 93 in the spec.
+type healthPage struct {
+	CriticalWarning         uint8
+	CompositeTemperature    uint16
+	AvailableSpare          uint8
+	AvailableSpareThreshold uint8
+	PercentageUsed          uint8
+
+	_ [26]byte
+
+	DataUnitsRead               uint128le
+	DataUnitsWritten            uint128le
+	HostReadCommands            uint128le
+	HostWriteCommands           uint128le
+	ControllerBusyTime          uint128le
+	PowerCycles                 uint128le
+	PowerOnHours                uint128le
+	UnsafeSHutdowns             uint128le
+	MediaAndDataIntegrityErrors uint128le
+	ErrorInformationLogEntries  uint128le
+
+	WarningCompositeTemperatureTime  uint32
+	CriticalCompositeTemperatureTime uint32
+
+	TemperatureSensors [8]uint16
+
+	ThermalMgmtTemperature1TransitionCount uint32
+	ThermalMgmtTemperature2TransitionCount uint32
+
+	// Bytes 224-231: the total times directly follow the transition counts,
+	// without padding. Bytes 232-511 are reserved and not mapped here.
+	TotalTimeForThermalMgmtTemperature1 uint32
+	TotalTimeForThermalMgmtTemperature2 uint32
+}
+
+// HealthInfo contains information related to the health of the NVMe device.
+//
+// Note that some values might be clamped under highly abnormal circumstances
+// as they are reported as 128-bit integers which Go doesn't support.
+// For easier handling values which are very unlikely to exceed 64 bits are
+// exposed as 64 bit integers.
+type HealthInfo struct {
+ // AvailableSpareSpaceCritical is set if the available spare space has
+ // fallen below the critical threshold.
+ AvailableSpareSpaceCritical bool
+ // TemperatureCritical is set if a temperature is outside the acceptable
+ // operating thresholds.
+ TemperatureCritical bool
+ // MediaCritical is set if significant media or internal issues affect the
+ // operation of the device.
+ MediaCritical bool
+ // ForcedReadOnly is set if the device is forced into read-only mode due
+ // to an error.
+ ForcedReadOnly bool
+ // VolatileMemoryBackupFailed is set if the volatile memory backup device
+ // has failed.
+ VolatileMemoryBackupFailed bool
+ // CompositeTemperatureKelvin contains a derived value representing the
+ // composite state of controller and namespace/flash temperature.
+ // The exact mechanism used to derive it is vendor-specific.
+ CompositeTemperatureKelvin uint16
+ // AvailableSpare represents the relative amount (0-1) of spare capacity
+ // still unused.
+ AvailableSpare float32
+ // AvailableSpareThreshold represents the vendor-defined threshold which
+ // AvailableSpare should not fall under.
+ AvailableSpareThreshold float32
+ // LifeUsed represents vendor-defined relative estimate of the life of
+ // the device which has been used up. It is allowed to exceed 1 and will
+ // be clamped by the device somewhere between 1.0 and 2.55.
+ LifeUsed float32
+ // BytesRead contains the number of bytes read from the device.
+ // This value is only updated in increments of 512,000 bytes.
+ BytesRead *big.Int
+ // BytesWritten contains the number of bytes written to the device.
+ // This value is only updated in increments of 512,000 bytes.
+ BytesWritten *big.Int
+ // HostReadCommands contains the number of read commands completed by the
+ // controller.
+ HostReadCommands *big.Int
+ // HostWriteCommands contains the number of write commands completed by the
+ // controller.
+ HostWriteCommands *big.Int
+ // ControllerBusyTime contains the cumulative amount of time the controller
+ // has spent being busy (i.e. having at least one command outstanding on an
+ // I/O queue). This value is only updated in one-minute increments.
+ ControllerBusyTime time.Duration
+ // PowerCycles contains the number of power cycles.
+ PowerCycles uint64
+ // PowerOnHours contains the number of hours the controller has been
+ // powered on. Depending on the vendor implementation it may or may
+ // not contain time spent in a non-operational power state.
+ PowerOnHours uint64
+ // UnsafeShutdowns contains the number of power loss events without
+ // a prior shutdown notification from the host.
+ UnsafeShutdowns uint64
+ // MediaAndDataIntegrityErrors contains the number of occurrences where the
+ // controller detected an unrecovered data integrity error.
+ MediaAndDataIntegrityErrors uint64
+ // ErrorInformationLogEntriesCount contains the number of Error
+ // Information log entries over the life of the controller.
+ ErrorInformationLogEntriesCount uint64
+ // WarningCompositeTemperatureTime contains the amount of time the
+ // controller is operational while the composite temperature is greater
+ // than the warning composite threshold.
+ WarningCompositeTemperatureTime time.Duration
+ // CriticalCompositeTemperatureTime contains the amount of time the
+ // controller is operational while the composite temperature is greater
+ // than the critical composite threshold.
+ CriticalCompositeTemperatureTime time.Duration
+ // TemperatureSensorValues contains the current temperature in Kelvin as
+ // reported by up to 8 sensors on the device. A value of zero means that
+ // the given sensor is not available.
+ TemperatureSensorValues [8]uint16
+ // ThermalMgmtTemperature1TransitionCount contains the number of times the
+ // controller transitioned to lower power active power states or performed
+ // vendor-specific thermal management actions to reduce temperature.
+ ThermalMgmtTemperature1TransitionCount uint32
+ // ThermalMgmtTemperature2TransitionCount is the same as above, but
+ // for "heavier" thermal management actions including heavy throttling.
+ // The actual difference is vendor-specific.
+ ThermalMgmtTemperature2TransitionCount uint32
+ // TotalTimeForThermalMgmtTemperature1 contains the total time the
+ // controller spent under "light" thermal management.
+ TotalTimeForThermalMgmtTemperature1 time.Duration
+ // TotalTimeForThermalMgmtTemperature2 contains the total time the
+ // controller spent under "heavy" thermal management.
+ TotalTimeForThermalMgmtTemperature2 time.Duration
+}
+
+// HasCriticalWarning returns true if any of the critical warnings
+// (AvailableSpareSpaceCritical, TemperatureCritical, MediaCritical,
+// ForcedReadOnly, VolatileMemoryBackupFailed) are active.
+// If this returns true the NVMe medium has reason to believe that
+// data availability or integrity is endangered.
+func (h *HealthInfo) HasCriticalWarning() bool {
+ return h.AvailableSpareSpaceCritical || h.TemperatureCritical || h.MediaCritical || h.ForcedReadOnly || h.VolatileMemoryBackupFailed
+}
+
+// See Figure 93 Data Units Read
+var dataUnit = big.NewInt(512 * 1000)
+
+const (
+ healthLogPage = 0x02
+)
+
+// GetHealthInfo reads the NVMe device's health log page and returns its decoded contents.
+func (d *Device) GetHealthInfo() (*HealthInfo, error) {
+	var buf [512]byte
+	if err := d.GetLogPage(GlobalNamespace, healthLogPage, 0, 0, buf[:]); err != nil {
+		return nil, fmt.Errorf("unable to get health log page: %w", err)
+	}
+	var page healthPage
+	if err := binary.Read(bytes.NewReader(buf[:]), binary.LittleEndian, &page); err != nil {
+		return nil, fmt.Errorf("unable to parse health log page: %w", err)
+	}
+	var res HealthInfo
+	res.AvailableSpareSpaceCritical = page.CriticalWarning&(1<<0) != 0
+	res.TemperatureCritical = page.CriticalWarning&(1<<1) != 0
+	res.MediaCritical = page.CriticalWarning&(1<<2) != 0
+	res.ForcedReadOnly = page.CriticalWarning&(1<<3) != 0
+	res.VolatileMemoryBackupFailed = page.CriticalWarning&(1<<4) != 0
+	res.CompositeTemperatureKelvin = page.CompositeTemperature
+	res.AvailableSpare = float32(page.AvailableSpare) / 100.
+	res.AvailableSpareThreshold = float32(page.AvailableSpareThreshold) / 100.
+	res.LifeUsed = float32(page.PercentageUsed) / 100.
+	res.BytesRead = new(big.Int).Mul(page.DataUnitsRead.BigInt(), dataUnit)
+	res.BytesWritten = new(big.Int).Mul(page.DataUnitsWritten.BigInt(), dataUnit)
+	res.HostReadCommands = page.HostReadCommands.BigInt()
+	res.HostWriteCommands = page.HostWriteCommands.BigInt()
+	res.ControllerBusyTime = time.Duration(page.ControllerBusyTime.Uint64()) * time.Minute
+	res.PowerCycles = page.PowerCycles.Uint64()
+	res.PowerOnHours = page.PowerOnHours.Uint64()
+	res.UnsafeShutdowns = page.UnsafeSHutdowns.Uint64()
+	res.MediaAndDataIntegrityErrors = page.MediaAndDataIntegrityErrors.Uint64()
+	res.ErrorInformationLogEntriesCount = page.ErrorInformationLogEntries.Uint64()
+	res.WarningCompositeTemperatureTime = time.Duration(page.WarningCompositeTemperatureTime) * time.Minute
+	res.CriticalCompositeTemperatureTime = time.Duration(page.CriticalCompositeTemperatureTime) * time.Minute
+	res.TemperatureSensorValues = page.TemperatureSensors
+	res.ThermalMgmtTemperature1TransitionCount = page.ThermalMgmtTemperature1TransitionCount
+	res.ThermalMgmtTemperature2TransitionCount = page.ThermalMgmtTemperature2TransitionCount
+	res.TotalTimeForThermalMgmtTemperature1 = time.Duration(page.TotalTimeForThermalMgmtTemperature1) * time.Second
+	res.TotalTimeForThermalMgmtTemperature2 = time.Duration(page.TotalTimeForThermalMgmtTemperature2) * time.Second
+	return &res, nil
+}
diff --git a/metropolis/pkg/nvme/identify.go b/metropolis/pkg/nvme/identify.go
new file mode 100644
index 0000000..218d089
--- /dev/null
+++ b/metropolis/pkg/nvme/identify.go
@@ -0,0 +1,193 @@
+package nvme
+
+import (
+ "bytes"
+ "encoding/binary"
+ "fmt"
+ "math/big"
+)
+
+// identifyData mirrors the raw Identify Controller data structure, see Figure 109
+type identifyData struct {
+	// Controller Capabilities and Features
+	PCIVendorID                 uint16
+	PCISubsystemVendorID        uint16
+	SerialNumber                [20]byte
+	ModelNumber                 [40]byte
+	FirmwareRevision            [8]byte
+	RecommendedArbitrationBurst uint8
+	IEEEOUI                     [3]byte
+	CMIC                        uint8
+	MaximumDataTransferSize     uint8
+	ControllerID                uint16
+	Version                     uint32
+	RuntimeD3ResumeLatency      uint32
+	RuntimeD3EntryLatency       uint32
+	OAES                        uint32
+	CTRATT                      uint32
+	_                           [12]byte
+	FRUGUID                     [16]byte
+	_                           [128]byte
+	// Admin Command Set Attributes & Optional Controller Capabilities
+	OACS                                uint16
+	AbortCommandLimit                   uint8
+	AsynchronousEventRequestLimit       uint8
+	FRMW                                uint8
+	LPA                                 uint8
+	ErrorLogPageEntries                 uint8
+	NumberOfPowerStatesSupport          uint8
+	AdminVendorSpecificCmdConfig        uint8
+	AutonomousPowerStateTransitionAttrs uint8
+	WarningCompositeTempThreshold       uint16
+	CriticalCompositeTempThreshold      uint16
+	MaximumTimeForFirmwareActivation    uint16
+	HostMemoryBufferPreferredSize       uint32
+	HostMemoryBufferMinimumSize         uint32
+	TotalNVMCapacity                    uint128le
+	UnallocatedNVMCapacity              uint128le
+	ReplyProtectedMemoryBlockSupport    uint32
+	ExtendedDeviceSelfTestTime          uint16
+	DeviceSelfTestOptions               uint8
+	FirmwareUpdateGranularity           uint8
+	KeepAliveSupport                    uint16
+	HostControlledThermalMgmtAttrs      uint16
+	MinimumThermalMgmntTemp             uint16
+	MaximumThermalMgmntTemp             uint16
+	SanitizeCapabilities                uint32
+	_                                   [180]byte
+	// NVM Command Set Attributes
+	SubmissionQueueEntrySize       uint8
+	CompletionQueueEntrySize       uint8
+	MaximumOutstandingCommands     uint16
+	NumberOfNamespaces             uint32
+	OptionalNVMCommandSupport      uint16
+	FusedOperationSupport          uint16
+	FormatNVMAttributes            uint8
+	VolatileWriteCache             uint8
+	AtomicWriteUnitNormal          uint16
+	AtomicWriteUnitPowerFail       uint16
+	NVMVendorSepcificCommandConfig uint8
+	_                              byte // reserved byte 531, keeps the following fields at their spec offsets
+	AtomicCompareAndWriteUnit      uint16
+	_                              [2]byte
+	SGLSupport                     uint32
+	_                              [228]byte
+	NVMSubsystemNVMeQualifiedName  [256]byte
+	_                              [1024]byte
+	PowerStateDescriptors          [32][32]byte // 32 power state descriptors of 32 bytes each
+}
+
+// IdentifyData contains various identifying information about a NVMe
+// controller. Because the actual data structure is very large, currently not
+// all fields are exposed as properly-typed individual fields. If you need
+// a new field, please add it to this structure.
+type IdentifyData struct {
+ // PCIVendorID contains the company vendor identifier assigned by the PCI
+ // SIG.
+ PCIVendorID uint16
+ // PCISubsystemVendorID contains the company vendor identifier that is
+ // assigned by the PCI SIG for the subsystem.
+ PCISubsystemVendorID uint16
+ // SerialNumber contains the serial number for the NVM subsystem that is
+ // assigned by the vendor.
+ SerialNumber string
+ // ModelNumber contains the model number for the NVM subsystem that is
+ // assigned by the vendor.
+ ModelNumber string
+ // FirmwareRevision contains the currently active firmware revision for the
+ // NVM subsystem.
+ FirmwareRevision string
+ // IEEEOUI contains the Organization Unique Identifier for the controller
+ // vendor as assigned by the IEEE.
+ IEEEOUI [3]byte
+
+ // IsPCIVirtualFunction indicates if the controller is a virtual controller
+ // as part of a PCI virtual function.
+ IsPCIVirtualFunction bool
+
+ // SpecVersionMajor/Minor contain the version of the NVMe specification the
+ // controller supports. Only mandatory from spec version 1.2 onwards.
+ SpecVersionMajor uint16
+ SpecVersionMinor uint8
+
+ // FRUGloballyUniqueIdentifier contains a 128-bit value that is globally
+ // unique for a given Field Replaceable Unit (FRU). Contains all-zeroes if
+ // unavailable.
+ FRUGloballyUniqueIdentifier [16]byte
+ // VirtualizationManagementSupported indicates if the controller
+ // supports the Virtualization Management command.
+ VirtualizationManagementSupported bool
+ // NVMeMISupported indicates if the controller supports the NVMe-MI
+ // Send and Receive commands.
+ NVMeMISupported bool
+ // DirectivesSupported indicates if the controller supports the
+ // Directive Send and Receive commands.
+ DirectivesSupported bool
+ // SelfTestSupported indicates if the controller supports the Device Self-
+ // test command.
+ SelfTestSupported bool
+ // NamespaceManagementSupported indicates if the controller supports the
+ // Namespace Management and Attachment commands.
+ NamespaceManagementSupported bool
+ // FirmwareUpdateSupported indicates if the controller supports the
+ // Firmware Commit and Image Download commands.
+ FirmwareUpdateSupported bool
+ // FormattingSupported indicates if the controller supports the Format
+ // command.
+ FormattingSupported bool
+ // SecuritySupported indicates if the controller supports the Security Send
+ // and Receive commands.
+ SecuritySupported bool
+
+ // TotalNVMCapacity contains the total NVM capacity in bytes in the NVM
+ // subsystem. This can be 0 on devices without NamespaceManagementSupported.
+ TotalNVMCapacity *big.Int
+ // UnallocatedNVMCapacity contains the unallocated NVM capacity in bytes in
+ // the NVM subsystem. This can be 0 on devices without
+ // NamespaceManagementSupported.
+ UnallocatedNVMCapacity *big.Int
+
+ // MaximumNumberOfNamespaces defines the maximum number of namespaces
+ // supported by the controller.
+ MaximumNumberOfNamespaces uint32
+}
+
+// Identify issues an Identify Controller command and returns the decoded response.
+func (d *Device) Identify() (*IdentifyData, error) {
+	var resp [4096]byte
+	if err := d.RawCommand(&Command{
+		Opcode: 0x06,
+		Data:   resp[:],
+		CDW10:  1,
+	}); err != nil {
+		return nil, fmt.Errorf("Identify command failed: %w", err)
+	}
+	var raw identifyData
+	if err := binary.Read(bytes.NewReader(resp[:]), binary.LittleEndian, &raw); err != nil {
+		return nil, fmt.Errorf("unable to parse Identify response: %w", err)
+	}
+	var res IdentifyData
+	res.PCIVendorID = raw.PCIVendorID
+	res.PCISubsystemVendorID = raw.PCISubsystemVendorID
+	res.SerialNumber = string(bytes.TrimRight(raw.SerialNumber[:], " "))
+	res.ModelNumber = string(bytes.TrimRight(raw.ModelNumber[:], " "))
+	res.FirmwareRevision = string(bytes.TrimRight(raw.FirmwareRevision[:], " "))
+	// OUIs are traditionally big-endian, but NVMe exposes them in little-endian
+	res.IEEEOUI[0], res.IEEEOUI[1], res.IEEEOUI[2] = raw.IEEEOUI[2], raw.IEEEOUI[1], raw.IEEEOUI[0]
+	res.IsPCIVirtualFunction = raw.CMIC&(1<<2) != 0
+	res.SpecVersionMajor = uint16(raw.Version >> 16)
+	res.SpecVersionMinor = uint8((raw.Version >> 8) & 0xFF)
+	res.FRUGloballyUniqueIdentifier = raw.FRUGUID
+	res.VirtualizationManagementSupported = raw.OACS&(1<<7) != 0
+	res.NVMeMISupported = raw.OACS&(1<<6) != 0
+	res.DirectivesSupported = raw.OACS&(1<<5) != 0
+	res.SelfTestSupported = raw.OACS&(1<<4) != 0
+	res.NamespaceManagementSupported = raw.OACS&(1<<3) != 0
+	res.FirmwareUpdateSupported = raw.OACS&(1<<2) != 0
+	res.FormattingSupported = raw.OACS&(1<<1) != 0
+	res.SecuritySupported = raw.OACS&(1<<0) != 0
+	res.TotalNVMCapacity = raw.TotalNVMCapacity.BigInt()
+	res.UnallocatedNVMCapacity = raw.UnallocatedNVMCapacity.BigInt()
+	res.MaximumNumberOfNamespaces = raw.NumberOfNamespaces
+	return &res, nil
+}
diff --git a/metropolis/pkg/nvme/nvme.go b/metropolis/pkg/nvme/nvme.go
new file mode 100644
index 0000000..f46546d
--- /dev/null
+++ b/metropolis/pkg/nvme/nvme.go
@@ -0,0 +1,79 @@
+// Package nvme provides methods and data structures for issuing commands to
+// devices speaking the NVMe protocol.
+// This package is written against the NVMe Specification Revision 1.3 and
+// all references to figures or other parts of the spec refer to this version.
+package nvme
+
+import (
+ "errors"
+ "fmt"
+ "os"
+ "syscall"
+ "time"
+)
+
+// Device is a handle for a NVMe device.
+type Device struct {
+ fd syscall.Conn
+}
+
+// NewFromFd creates a new NVMe device handle from a system handle.
+func NewFromFd(fd syscall.Conn) (*Device, error) {
+ d := &Device{fd: fd}
+ // There is no good way to validate that a file descriptor indeed points to
+ // a NVMe device. For future compatibility let this return an error so that
+ // code is already prepared to handle it.
+ return d, nil
+}
+
+// Open opens a new NVMe device handle from a device path (like /dev/nvme0).
+func Open(path string) (*Device, error) {
+ f, err := os.Open(path)
+ if err != nil {
+ return nil, fmt.Errorf("unable to open path: %w", err)
+ }
+ return NewFromFd(f)
+}
+
+// Close closes the underlying handle if it is an *os.File, which is what Open
+// creates. For any other kind of handle passed to NewFromFd it returns an
+// error; the caller has to close such a handle itself.
+func (d *Device) Close() error {
+	f, ok := d.fd.(*os.File)
+	if !ok {
+		return errors.New("unable to close device not opened via Open, please close it yourself")
+	}
+	return f.Close()
+}
+
+const (
+ // GlobalNamespace is the namespace ID for operations not on a specific
+ // namespace.
+ GlobalNamespace = 0xffffffff
+)
+
+// Command represents a generic NVMe command. Only use this if the command
+// you need is not already wrapped by this library.
+type Command struct {
+ Opcode uint8
+ Flags uint8
+ NamespaceID uint32
+ CDW2, CDW3 uint32
+ Metadata []byte
+ Data []byte
+ CDW10, CDW11, CDW12, CDW13, CDW14, CDW15 uint32
+ Timeout time.Duration
+}
+
+// GetLogPage reads log page logPageIdentifier of namespace ns, starting at logPageOffset, into pageBuf.
+func (d *Device) GetLogPage(ns uint32, logPageIdentifier uint8, logSpecificField uint8, logPageOffset uint64, pageBuf []byte) error {
+	if len(pageBuf) < 4 {
+		return errors.New("log page buffer needs to hold at least one dword")
+	}
+	numd := uint32(len(pageBuf)/4 - 1) // NUMD is a 0's based dword count, split over CDW10 and CDW11
+	return d.RawCommand(&Command{
+		Opcode: 0x02, NamespaceID: ns, Data: pageBuf,
+		CDW10: uint32(logPageIdentifier) | uint32(logSpecificField&0xF)<<8 | numd<<16, // TODO: RAE
+		CDW11: numd >> 16, CDW12: uint32(logPageOffset), CDW13: uint32(logPageOffset >> 32),
+	})
+}
diff --git a/metropolis/pkg/nvme/selftest.go b/metropolis/pkg/nvme/selftest.go
new file mode 100644
index 0000000..8f46995
--- /dev/null
+++ b/metropolis/pkg/nvme/selftest.go
@@ -0,0 +1,96 @@
+package nvme
+
+import (
+ "bytes"
+ "encoding/binary"
+)
+
+ // SelfTestOp identifies a device self-test operation. StartSelfTest transmits
+ // only its low four bits.
+ type SelfTestOp uint8
+
+ // Self-test operations known to this package.
+ const (
+ // SelfTestNone means that no self-test is in progress.
+ SelfTestNone = SelfTestOp(0x0)
+ // SelfTestShort selects the short device self-test.
+ SelfTestShort = SelfTestOp(0x1)
+ // SelfTestExtended selects the extended device self-test.
+ SelfTestExtended = SelfTestOp(0x2)
+ // SelfTestAbort aborts a running device self-test.
+ SelfTestAbort = SelfTestOp(0xF)
+ )
+
+ // StartSelfTest submits a device self-test command (opcode 0x14) addressed to
+ // namespace ns. The low four bits of action are placed in CDW10. The error
+ // returned by RawCommand is passed through unchanged.
+ func (d *Device) StartSelfTest(ns uint32, action SelfTestOp) error {
+ cmd := Command{
+ Opcode: 0x14,
+ NamespaceID: ns,
+ CDW10: uint32(action) & 0xF,
+ }
+ return d.RawCommand(&cmd)
+ }
+
+ // Figure 99: one 28-byte entry of the self-test log, decoded little endian.
+ type selfTestResult struct {
+ SelfTestStatus uint8 // high nibble: operation, low nibble: result
+ SegmentNumber uint8
+ ValidDiagnosticInformation uint8 // not evaluated by GetSelfTestResults
+ _ byte // skipped when decoding
+ PowerOnHours uint64
+ NamespaceID uint32
+ FailingLBA uint64
+ StatusCodeType uint8
+ StatusCode uint8
+ VendorSpecific [2]byte // not evaluated by GetSelfTestResults
+ }
+
+ // Figure 98: self-test log page, a 4-byte header followed by 20 result entries.
+ type selfTestLogPage struct {
+ CurrentSelfTestOp uint8 // low nibble: operation in progress
+ CurrentSelfTestCompletion uint8 // low 7 bits: completion in percent
+ _ [2]byte // skipped when decoding
+ SelfTestResults [20]selfTestResult
+ }
+
+ type SelfTestResult struct { // one decoded entry of the self-test log
+ // Op contains the self test type (high nibble of the raw status byte).
+ Op SelfTestOp
+ Result uint8 // low nibble of the raw status byte; 0xF entries are never reported
+ SegmentNumber uint8
+ PowerOnHours uint64
+ NamespaceID uint32
+ FailingLBA uint64
+ Error Error // GetSelfTestResults fills in StatusCode and StatusCodeType
+ }
+
+ type SelfTestResults struct {
+ // CurrentOp contains the currently in-progress self test type (or
+ // SelfTestNone if no self test is in progress).
+ CurrentOp SelfTestOp
+ // CurrentSelfTestCompletion contains the progress from 0 to 1 of the
+ // in-progress self-test. Only valid if CurrentOp is not SelfTestNone.
+ CurrentSelfTestCompletion float32
+ // PastResults contains up to 20 previous self test results in the order
+ // the device reports them (expected: most recent first).
+ PastResults []SelfTestResult
+ }
+
+ // GetSelfTestResults reads the device self-test log page (log identifier
+ // 0x06) for namespace ns and decodes it into a SelfTestResults.
+ //
+ // The result carries the operation currently in progress together with its
+ // completion as a fraction, plus every log entry whose result nibble is not
+ // 0xF, in the order the device reports them. An error is returned if
+ // fetching the log page fails or if the page cannot be decoded.
+ func (d *Device) GetSelfTestResults(ns uint32) (*SelfTestResults, error) {
+ // 4 header bytes followed by 20 result entries of 28 bytes each.
+ var buf [564]byte
+ if err := d.GetLogPage(ns, 0x06, 0, 0, buf[:]); err != nil {
+ return nil, err
+ }
+ var page selfTestLogPage
+ // Report a decoding failure instead of presenting a partially filled
+ // page as valid data.
+ if err := binary.Read(bytes.NewReader(buf[:]), binary.LittleEndian, &page); err != nil {
+ return nil, err
+ }
+ var res SelfTestResults
+ res.CurrentOp = SelfTestOp(page.CurrentSelfTestOp & 0xF)
+ // The device reports completion in percent; expose it as 0 to 1.
+ res.CurrentSelfTestCompletion = float32(page.CurrentSelfTestCompletion&0x7F) / 100.
+ for _, r := range page.SelfTestResults {
+ var t SelfTestResult
+ // The status byte packs the operation (high nibble) and its result
+ // (low nibble).
+ t.Op = SelfTestOp((r.SelfTestStatus >> 4) & 0xF)
+ t.Result = r.SelfTestStatus & 0xF
+ if t.Result == 0xF {
+ // Entries with result 0xF are left out of PastResults.
+ continue
+ }
+ t.SegmentNumber = r.SegmentNumber
+ t.PowerOnHours = r.PowerOnHours
+ t.NamespaceID = r.NamespaceID
+ t.FailingLBA = r.FailingLBA
+ t.Error.StatusCode = r.StatusCode
+ t.Error.StatusCodeType = r.StatusCodeType
+ res.PastResults = append(res.PastResults, t)
+ }
+ return &res, nil
+ }
diff --git a/metropolis/pkg/nvme/struct_test.go b/metropolis/pkg/nvme/struct_test.go
new file mode 100644
index 0000000..b26a48e
--- /dev/null
+++ b/metropolis/pkg/nvme/struct_test.go
@@ -0,0 +1,15 @@
+package nvme
+
+import (
+ "encoding/binary"
+ "testing"
+)
+
+ // TestStruct verifies that passthruCmd, the struct handed to Linux's ioctl,
+ // encodes to the size mandated by the ABI.
+ func TestStruct(t *testing.T) {
+ if got := binary.Size(passthruCmd{}); got != 72 {
+ t.Errorf("passthroughCmd is %d bytes, expected 72", got)
+ }
+ }
diff --git a/metropolis/pkg/nvme/uint128le.go b/metropolis/pkg/nvme/uint128le.go
new file mode 100644
index 0000000..a25adb7
--- /dev/null
+++ b/metropolis/pkg/nvme/uint128le.go
@@ -0,0 +1,29 @@
+package nvme
+
+import (
+ "math"
+ "math/big"
+)
+
+ // uint128le is a little endian 128-bit unsigned integer made of two uint64
+ // halves so that binary.Read can decode it directly. It keeps structures
+ // with 128-bit fields (of which NVMe has quite a few) simple.
+ type uint128le struct {
+ Lo, Hi uint64 // Lo comes first and holds the low 64 bits, Hi the high 64 bits
+ }
+
+ // BigInt returns the full 128-bit value of u as a newly allocated *big.Int,
+ // computed as Hi shifted left by 64 bits combined with Lo.
+ func (u uint128le) BigInt() *big.Int {
+ hi := new(big.Int).SetUint64(u.Hi)
+ lo := new(big.Int).SetUint64(u.Lo)
+ return hi.Or(hi.Lsh(hi, 64), lo)
+ }
+
+ // Uint64 returns u as a uint64, clamped to math.MaxUint64 whenever the value
+ // does not fit, i.e. when any bit of the high half is set.
+ func (u uint128le) Uint64() uint64 {
+ if u.Hi == 0 {
+ return u.Lo
+ }
+ return math.MaxUint64
+ }