m/n/core/localstorage/crypt: read partition data from uevent

Previously we only checked the blockdevices itself,
but in the real-world the minor-id is not always the partition offset.
This scans all blockdevs that are partitions and creates them correctly

Change-Id: I8f3d99761e9e883783b398496ec8b35f28f3557d
Reviewed-on: https://review.monogon.dev/c/monogon/+/1813
Reviewed-by: Serge Bazanski <serge@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/node/core/localstorage/crypt/blockdev.go b/metropolis/node/core/localstorage/crypt/blockdev.go
index 8379180..353164a 100644
--- a/metropolis/node/core/localstorage/crypt/blockdev.go
+++ b/metropolis/node/core/localstorage/crypt/blockdev.go
@@ -22,6 +22,7 @@
 	"os"
 	"path/filepath"
 	"strconv"
+	"strings"
 	"unsafe"
 
 	"github.com/google/uuid"
@@ -42,6 +43,18 @@
 	NodeDataRawPath = "/dev/data-raw"
 )
 
+// nodePathForPartitionType returns the device node path
+// for a given partition type.
+func nodePathForPartitionType(t uuid.UUID) string {
+	switch t {
+	case gpt.PartitionTypeEFISystem:
+		return ESPDevicePath
+	case NodeDataPartitionType:
+		return NodeDataRawPath
+	}
+	return ""
+}
+
 // MakeBlockDevices looks for the ESP and the node data partition and maps them
 // to ESPDevicePath and NodeDataCryptPath respectively. This doesn't fail if it
 // doesn't find the partitions, only if something goes catastrophically wrong.
@@ -50,79 +63,223 @@
 	if err != nil {
 		supervisor.Logger(ctx).Warningf("No EFI variable for the loader device partition UUID present")
 	}
-	blockdevNames, err := os.ReadDir("/sys/class/block")
+
+	blockDevs, err := os.ReadDir("/sys/class/block")
 	if err != nil {
 		return fmt.Errorf("failed to read sysfs block class: %w", err)
 	}
-	for _, blockdevName := range blockdevNames {
-		ueventData, err := sysfs.ReadUevents(filepath.Join("/sys/class/block", blockdevName.Name(), "uevent"))
-		if err != nil {
-			return fmt.Errorf("failed to read uevent for block device %v: %w", blockdevName.Name(), err)
+
+	for _, blockDev := range blockDevs {
+		if err := handleBlockDevice(blockDev.Name(), blockDevs, espUUID); err != nil {
+			supervisor.Logger(ctx).Errorf("Failed to create block device %s: %w", blockDev.Name(), err)
 		}
-		if ueventData["DEVTYPE"] == "disk" {
-			majorDev, err := strconv.Atoi(ueventData["MAJOR"])
-			if err != nil {
-				return fmt.Errorf("failed to convert uevent: %w", err)
-			}
-			devNodeName := fmt.Sprintf("/dev/%v", ueventData["DEVNAME"])
-			// TODO(lorenz): This extraction code is all a bit hairy, will get
-			// replaced by blockdev shortly.
-			blkdev, err := os.Open(devNodeName)
-			if err != nil {
-				return fmt.Errorf("failed to open block device %v: %w", devNodeName, err)
-			}
-			defer blkdev.Close()
-			blockSize, err := unix.IoctlGetUint32(int(blkdev.Fd()), unix.BLKSSZGET)
-			if err != nil {
-				continue // This is not a regular block device
-			}
-			var sizeBytes uint64
-			_, _, err = unix.Syscall(unix.SYS_IOCTL, blkdev.Fd(), unix.BLKGETSIZE64, uintptr(unsafe.Pointer(&sizeBytes)))
-			if err != unix.Errno(0) {
-				return fmt.Errorf("failed to get device size: %w", err)
-			}
-			blkdev.Seek(int64(blockSize), 0)
-			table, err := gpt.Read(blkdev, int64(blockSize), int64(sizeBytes)/int64(blockSize))
-			if err != nil {
-				// Probably just not a GPT-partitioned disk
-				continue
-			}
-			skipDisk := false
-			if espUUID != uuid.Nil {
-				// If we know where we booted from, ignore all disks which do
-				// not contain this partition.
-				skipDisk = true
-				for _, part := range table.Partitions {
-					if part.ID == espUUID {
-						skipDisk = false
-						break
-					}
-				}
-			}
-			if skipDisk {
-				continue
-			}
-			seenTypes := make(map[uuid.UUID]bool)
-			for partNumber, part := range table.Partitions {
-				if seenTypes[part.Type] {
-					return fmt.Errorf("failed to create device node for %s (%s): node for this type already created/multiple partitions found", part.ID.String(), part.Type.String())
-				}
-				if part.Type == gpt.PartitionTypeEFISystem {
-					seenTypes[part.Type] = true
-					err := unix.Mknod(ESPDevicePath, 0600|unix.S_IFBLK, int(unix.Mkdev(uint32(majorDev), uint32(partNumber+1))))
-					if err != nil && !os.IsExist(err) {
-						return fmt.Errorf("failed to create device node for ESP partition: %w", err)
-					}
-				}
-				if part.Type == NodeDataPartitionType {
-					seenTypes[part.Type] = true
-					err := unix.Mknod(NodeDataRawPath, 0600|unix.S_IFBLK, int(unix.Mkdev(uint32(majorDev), uint32(partNumber+1))))
-					if err != nil && !os.IsExist(err) {
-						return fmt.Errorf("failed to create device node for Metropolis node encrypted data partition: %w", err)
-					}
-				}
+	}
+
+	return nil
+}
+
+// handleBlockDevice reads the uevent data and continues to iterate over all
+// partitions to create all required device nodes.
+func handleBlockDevice(diskBlockDev string, blockDevs []os.DirEntry, espUUID uuid.UUID) error {
+	data, err := readUEvent(diskBlockDev)
+	if err != nil {
+		return err
+	}
+
+	// We only care about disks, skip all other dev types.
+	if data["DEVTYPE"] != "disk" {
+		return nil
+	}
+
+	table, err := data.readPartitionTable()
+	if err != nil {
+		return fmt.Errorf("when reading disk info: %w", err)
+	}
+
+	// Not a normal block device or not a gpt table.
+	if table == nil {
+		return nil
+	}
+
+	skipDisk := false
+	if espUUID != uuid.Nil {
+		// If we know where we booted from, ignore all disks which do
+		// not contain this partition.
+		skipDisk = true
+		for _, part := range table.Partitions {
+			if part.ID == espUUID {
+				skipDisk = false
+				break
 			}
 		}
 	}
+	if skipDisk {
+		return nil
+	}
+
+	seenTypes := make(map[uuid.UUID]bool)
+	for _, dev := range blockDevs {
+		if err := handlePartition(diskBlockDev, dev.Name(), table, seenTypes); err != nil {
+			return fmt.Errorf("when creating partition %s: %w", dev.Name(), err)
+		}
+	}
+
 	return nil
 }
+
+func handlePartition(diskBlockDev string, partBlockDev string, table *gpt.Table, seenTypes map[uuid.UUID]bool) error {
+	// Skip all blockdev that dont share the same name/prefix,
+	// also skip the blockdev itself.
+	if !strings.HasPrefix(partBlockDev, diskBlockDev) || partBlockDev == diskBlockDev {
+		return nil
+	}
+
+	data, err := readUEvent(partBlockDev)
+	if err != nil {
+		return err
+	}
+
+	// We only care about partitions, skip all other dev types.
+	if data["DEVTYPE"] != "partition" {
+		return nil
+	}
+
+	pi, err := data.readPartitionInfo()
+	if err != nil {
+		return err
+	}
+
+	// TODO(tim): Is this safe? Are we actually using the partition number for the slice index?
+	part := table.Partitions[pi.partNumber-1]
+
+	nodePath := nodePathForPartitionType(part.Type)
+	if nodePath == "" {
+		// Ignore partitions with an unknown type.
+		return nil
+	}
+
+	if seenTypes[part.Type] {
+		return fmt.Errorf("node for this type (%s) already created/multiple partitions found", part.Type.String())
+	}
+	seenTypes[part.Type] = true
+
+	if err := pi.makeDeviceNode(nodePath); err != nil {
+		return fmt.Errorf("when creating partition node: %w", err)
+	}
+
+	return nil
+}
+
+type partInfo struct {
+	major, minor, partNumber int
+}
+
+// validateDeviceNode tries to open a device node and validates that it
+// has the expected major and minor device numbers. If the path does non exist,
+// no error will be returned.
+func (pi partInfo) validateDeviceNode(path string) error {
+	var s unix.Stat_t
+	if err := unix.Stat(path, &s); err != nil {
+		if os.IsNotExist(err) {
+			return nil
+		}
+
+		return fmt.Errorf("inspecting device node %q: %w", path, err)
+	}
+
+	if unix.Major(s.Rdev) != uint32(pi.major) || unix.Minor(s.Rdev) != uint32(pi.minor) {
+		return fmt.Errorf("device node %q exists for different device %d:%d", path, unix.Major(s.Rdev), unix.Minor(s.Rdev))
+	}
+
+	return nil
+}
+
+// makeDeviceNode creates the device node at the given path based on the
+// major and minor device number. If the device node already exists and points
+// to the same device, no error will be returned.
+func (pi partInfo) makeDeviceNode(path string) error {
+	if err := pi.validateDeviceNode(path); err != nil {
+		return err
+	}
+
+	err := unix.Mknod(path, 0600|unix.S_IFBLK, int(unix.Mkdev(uint32(pi.major), uint32(pi.minor))))
+	if err != nil {
+		return fmt.Errorf("create device node %q: %w", path, err)
+	}
+	return nil
+}
+
+func readUEvent(blockName string) (blockUEvent, error) {
+	data, err := sysfs.ReadUevents(filepath.Join("/sys/class/block", blockName, "uevent"))
+	if err != nil {
+		return nil, fmt.Errorf("when reading uevent: %w", err)
+	}
+	return data, nil
+}
+
+type blockUEvent map[string]string
+
+func (b blockUEvent) readUdevKeyInteger(key string) (int, error) {
+	if _, ok := b[key]; !ok {
+		return 0, fmt.Errorf("missing udev value %s", key)
+	}
+
+	v, err := strconv.Atoi(b[key])
+	if err != nil {
+		return 0, fmt.Errorf("invalid %s: %w", key, err)
+	}
+
+	return v, nil
+}
+
+// readPartitionInfo parses all fields for partInfo from a blockUEvent.
+func (b blockUEvent) readPartitionInfo() (pi partInfo, err error) {
+	pi.major, err = b.readUdevKeyInteger("MAJOR")
+	if err != nil {
+		return
+	}
+
+	pi.minor, err = b.readUdevKeyInteger("MINOR")
+	if err != nil {
+		return
+	}
+
+	pi.partNumber, err = b.readUdevKeyInteger("PARTN")
+	if err != nil {
+		return
+	}
+
+	return
+}
+
+// readPartitionTable tries to read a GPT partition table based on the blockUEvent
+// data. It returns nil when either the block device is not a regular device
+// or it fails to parse the GPT table.
+func (b blockUEvent) readPartitionTable() (*gpt.Table, error) {
+	// TODO(lorenz): This extraction code is all a bit hairy, will get
+	// replaced by blockdev shortly.
+	blkdev, err := os.Open(fmt.Sprintf("/dev/%v", b["DEVNAME"]))
+	if err != nil {
+		return nil, fmt.Errorf("failed to open block device: %w", err)
+	}
+	defer blkdev.Close()
+
+	blockSize, err := unix.IoctlGetUint32(int(blkdev.Fd()), unix.BLKSSZGET)
+	if err != nil {
+		return nil, nil // This is not a regular block device
+	}
+
+	var sizeBytes uint64
+	_, _, err = unix.Syscall(unix.SYS_IOCTL, blkdev.Fd(), unix.BLKGETSIZE64, uintptr(unsafe.Pointer(&sizeBytes)))
+	if err != unix.Errno(0) {
+		return nil, fmt.Errorf("failed to get device size: %w", err)
+	}
+
+	blkdev.Seek(int64(blockSize), 0)
+	table, err := gpt.Read(blkdev, int64(blockSize), int64(sizeBytes)/int64(blockSize))
+	if err != nil {
+		return nil, nil // Probably just not a GPT-partitioned disk
+	}
+
+	return table, nil
+}