pkg/nvme: add NVMe package

This adds an NVMe package for performing various low-level operations on
NVMe devices. Only the calls most important to us are implemented, as
NVMe has a vast API surface.

Change-Id: I532894c3c2eb780309993a1688226c92c91cdedf
Reviewed-on: https://review.monogon.dev/c/monogon/+/999
Reviewed-by: Mateusz Zalega <mateusz@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/pkg/nvme/cmd_linux.go b/metropolis/pkg/nvme/cmd_linux.go
new file mode 100644
index 0000000..e4353cc
--- /dev/null
+++ b/metropolis/pkg/nvme/cmd_linux.go
@@ -0,0 +1,117 @@
+//go:build linux
+
+package nvme
+
+import (
+	"errors"
+	"fmt"
+	"math"
+	"runtime"
+	"unsafe"
+
+	"golang.org/x/sys/unix"
+)
+
+// ioctl request numbers, from @linux//include/uapi/linux/nvme_ioctl.h
+const (
+	nvmeIoctlAdminCmd = 0xC0484E41 // _IOWR('N', 0x41, sizeof cmd), where sizeof cmd is 0x48 (72) bytes
+)
+
+// passthruCmd is the ioctl argument, from @linux//include/uapi/linux/nvme_ioctl.h
+type passthruCmd struct {
+	// Corresponding to Figure 88 (presumably of the NVMe base specification)
+	opcode      uint8
+	flags       uint8
+	rsvd1       uint16
+	nsid        uint32 // namespace ID
+	cdw2        uint32
+	cdw3        uint32
+	metadata    uint64 // address of the metadata buffer, as an integer
+	addr        uint64 // address of the data buffer, as an integer
+	metadataLen uint32 // length of the metadata buffer
+	dataLen     uint32 // length of the data buffer
+	cdw10       uint32
+	cdw11       uint32
+	cdw12       uint32
+	cdw13       uint32
+	cdw14       uint32
+	cdw15       uint32
+
+	// Linux ioctl-specific; layout must stay in sync with the kernel struct
+	timeoutMs uint32 // command timeout in milliseconds
+	result    uint32 // not used by RawCommand; presumably the completion result, TODO confirm
+}
+
+// RawCommand submits cmd to the device through the Linux NVMe admin command
+// passthrough ioctl. Non-empty cmd.Data and cmd.Metadata are handed to the
+// kernel as data and metadata buffers; empty ones are treated like nil. It
+// returns an Error if the device reports a non-success status, or the errno
+// if the ioctl itself fails.
+// Please note that depending on the payload this can be very dangerous and can
+// cause data loss or even firmware issues.
+func (d *Device) RawCommand(cmd *Command) error {
+	conn, err := d.fd.SyscallConn()
+	if err != nil {
+		return fmt.Errorf("unable to get RawConn: %w", err)
+	}
+	cmdRaw := passthruCmd{
+		opcode:    cmd.Opcode,
+		flags:     cmd.Flags,
+		nsid:      cmd.NamespaceID,
+		cdw2:      cmd.CDW2,
+		cdw3:      cmd.CDW3,
+		cdw10:     cmd.CDW10,
+		cdw11:     cmd.CDW11,
+		cdw12:     cmd.CDW12,
+		cdw13:     cmd.CDW13,
+		cdw14:     cmd.CDW14,
+		cdw15:     cmd.CDW15,
+		timeoutMs: uint32(cmd.Timeout.Milliseconds()),
+	}
+	// NOTE: Currently this is safe (even if the documentation says otherwise)
+	// as the runtime.KeepAlive calls below ensure that the GC cannot clean up
+	// the data and metadata buffers, and Go's runtime does not currently use
+	// a moving GC, so the pointers stay valid while considered alive. Should
+	// Go introduce a moving GC, a move during the syscall would make the
+	// kernel overwrite random memory; the buffers then need to be pinned [1].
+	// [1] https://github.com/golang/go/issues/46787
+	if len(cmd.Data) > 0 {
+		if uint64(len(cmd.Data)) > math.MaxUint32 {
+			return errors.New("data buffer larger than uint32, this is unsupported")
+		}
+		cmdRaw.dataLen = uint32(len(cmd.Data))
+		cmdRaw.addr = uint64(uintptr(unsafe.Pointer(&cmd.Data[0])))
+	}
+	if len(cmd.Metadata) > 0 {
+		if uint64(len(cmd.Metadata)) > math.MaxUint32 {
+			return errors.New("metadata buffer larger than uint32, this is unsupported")
+		}
+		cmdRaw.metadataLen = uint32(len(cmd.Metadata))
+		cmdRaw.metadata = uint64(uintptr(unsafe.Pointer(&cmd.Metadata[0])))
+	}
+	var errno unix.Errno
+	var status uintptr
+	err = conn.Control(func(fd uintptr) {
+		status, _, errno = unix.Syscall(unix.SYS_IOCTL, fd, nvmeIoctlAdminCmd, uintptr(unsafe.Pointer(&cmdRaw)))
+	})
+	runtime.KeepAlive(cmdRaw)
+	runtime.KeepAlive(cmd.Data)
+	runtime.KeepAlive(cmd.Metadata)
+	if err != nil {
+		return fmt.Errorf("unable to get fd: %w", err)
+	}
+	if errno != 0 {
+		return errno
+	}
+	// The ioctl result has the phase tag stripped: CQE DW3 bit N is bit N-17.
+	var commandErr Error
+	commandErr.DoNotRetry = status&(1<<14) != 0            // Bit 31
+	commandErr.More = status&(1<<13) != 0                  // Bit 30
+	commandErr.StatusCodeType = uint8((status >> 8) & 0x7) // Bits 27:25
+	commandErr.StatusCode = uint8(status & 0xff)           // Bits 24:17
+	// The only success status is in the generic status code set with value 0
+	if commandErr.StatusCodeType != StatusCodeTypeGeneric || commandErr.StatusCode != 0 {
+		return commandErr
+	}
+	return nil
+}