m/p/gpt: add GPT package
This introduces our own GPT package. It will be used for provisioning
and Metropolis images.
Change-Id: I905cd5d540673fd4b69c01d8975f98b88e24edd4
Reviewed-on: https://review.monogon.dev/c/monogon/+/956
Tested-by: Jenkins CI
Reviewed-by: Sergiusz Bazanski <serge@monogon.tech>
diff --git a/metropolis/pkg/gpt/BUILD.bazel b/metropolis/pkg/gpt/BUILD.bazel
new file mode 100644
index 0000000..25b6b7b
--- /dev/null
+++ b/metropolis/pkg/gpt/BUILD.bazel
@@ -0,0 +1,33 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//metropolis/test/ktest:ktest.bzl", "ktest")
+
+go_library(
+ name = "gpt",
+ srcs = [
+ "gpt.go",
+ "mbr.go",
+ ],
+ importpath = "source.monogon.dev/metropolis/pkg/gpt",
+ visibility = ["//visibility:public"],
+ deps = ["@com_github_google_uuid//:uuid"],
+)
+
+go_test(
+ name = "gpt_test",
+ srcs = [
+ "gpt_test.go",
+ "linux_test.go",
+ "mbr_test.go",
+ "structs_test.go",
+ ],
+ embed = [":gpt"],
+ deps = [
+ "@com_github_google_uuid//:uuid",
+ "@org_golang_x_sys//unix",
+ ],
+)
+
+ktest(
+ cmdline = "ramdisk_size=4096",
+ tester = ":gpt_test",
+)
diff --git a/metropolis/pkg/gpt/gpt.go b/metropolis/pkg/gpt/gpt.go
new file mode 100644
index 0000000..5660237
--- /dev/null
+++ b/metropolis/pkg/gpt/gpt.go
@@ -0,0 +1,727 @@
+// Package gpt implements reading and writing GUID Partition Tables as specified
+// in the UEFI Specification. It only implements up to 128 partitions per table
+// (same as most other implementations) as more would require a dynamic table
+// size, significantly complicating the code for little gain.
+package gpt
+
+import (
+ "bytes"
+ "encoding/binary"
+ "errors"
+ "fmt"
+ "hash/crc32"
+ "io"
+ "sort"
+ "strings"
+ "unicode/utf16"
+
+ "github.com/google/uuid"
+)
+
+var gptSignature = [8]byte{'E', 'F', 'I', ' ', 'P', 'A', 'R', 'T'}
+var gptRevision uint32 = 0x00010000 // First 2 bytes major, second 2 bytes minor
+
+// See UEFI Specification 2.9 Table 5-5
+type header struct {
+ Signature [8]byte
+ Revision uint32
+ HeaderSize uint32
+ HeaderCRC32 uint32
+ _ [4]byte
+
+ HeaderBlock uint64
+ AlternateHeaderBlock uint64
+ FirstUsableBlock uint64
+ LastUsableBlock uint64
+
+ ID [16]byte
+
+ PartitionEntriesStartBlock uint64
+ PartitionEntryCount uint32
+ PartitionEntrySize uint32
+ PartitionEntriesCRC32 uint32
+}
+
+// See UEFI Specification 2.9 Table 5-6
+type partition struct {
+ Type [16]byte
+ ID [16]byte
+ FirstBlock uint64
+ LastBlock uint64
+ Attributes uint64
+ Name [36]uint16
+}
+
+var (
+ PartitionTypeEFISystem = uuid.MustParse("C12A7328-F81F-11D2-BA4B-00A0C93EC93B")
+)
+
+var zeroUUID = [16]byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
+
+// Attribute is a bitfield of attributes set on a partition. Bits 0 to 47 are
+// reserved for UEFI specification use and all current assignments are in the
+// following const block. Bits 48 to 63 are available for per-Type use by
+// the organization controlling the partition Type.
+type Attribute uint64
+
+const (
+ // AttrRequiredPartition indicates that this partition is required for the
+ // platform to function. Mostly used by vendors to mark things like recovery
+ // partitions.
+ AttrRequiredPartition = 1 << 0
+ // AttrNoBlockIOProto indicates that EFI firmware must not provide an EFI
+ // block device (EFI_BLOCK_IO_PROTOCOL) for this partition.
+ AttrNoBlockIOProto = 1 << 1
+ // AttrLegacyBIOSBootable indicates to special-purpose software outside of
+ // UEFI that this partition can be booted using a traditional PC BIOS.
+ // Don't use this unless you know that you need it specifically.
+ AttrLegacyBIOSBootable = 1 << 2
+)
+
+// PerTypeAttrs returns the top 16 bits (bits 48-63) which are reserved for
+// custom per-Type attributes. The top 16 bits of the returned uint32 are 0.
+func (a Attribute) PerTypeAttrs() uint32 {
+	return uint32(a >> 48)
+}
+
+// SetPerTypeAttrs sets the top 16 bits (bits 48-63) which are reserved for
+// custom per-Type attributes. It does not touch the lower 48 bits which are
+// specified by the UEFI specification. The top 16 bits of v are discarded.
+func (a *Attribute) SetPerTypeAttrs(v uint32) {
+	*a &= 0x0000FFFF_FFFFFFFF
+	*a |= Attribute(v) << 48
+}
+
+type Partition struct {
+ // Name of the partition, will be truncated if it expands to more than 36
+ // UTF-16 code points. Not all systems can display non-BMP code points.
+ Name string
+	// Type is the partition type. It can either be one of the constants
+	// predefined by the UEFI specification or a custom type identifier.
+ // Note that the all-zero UUID denotes an empty partition slot, so this
+ // MUST be set to something, otherwise it is not treated as a partition.
+ Type uuid.UUID
+ // ID is a unique identifier for this specific partition. It should be
+ // changed when cloning the partition.
+ ID uuid.UUID
+ // The first logical block of the partition (inclusive)
+ FirstBlock uint64
+ // The last logical block of the partition (inclusive)
+ LastBlock uint64
+ // Bitset of attributes of this partition.
+ Attributes Attribute
+}
+
+// SizeBlocks returns the number of blocks the partition occupies.
+func (p *Partition) SizeBlocks() uint64 {
+	return p.LastBlock - p.FirstBlock + 1
+}
+
+// IsUnused reports whether the partition slot is unused, which is the case
+// when the pointer is nil or the partition type is the all-zero UUID.
+func (p *Partition) IsUnused() bool {
+	if p != nil {
+		return p.Type == zeroUUID
+	}
+	return true
+}
+
+type Table struct {
+ // ID is the unique identifier of this specific disk / GPT.
+ // If this is left uninitialized/all-zeroes a new random ID is automatically
+ // generated when writing.
+ ID uuid.UUID
+
+ // Data put at the start of the very first block. Gets loaded and executed
+ // by a legacy BIOS bootloader. This can be used to make GPT-partitioned
+ // disks bootable by legacy systems or display a nice error message.
+ // Maximum length is 440 bytes, if that is exceeded Write returns an error.
+ // Should be left empty if the device is not bootable and/or compatibility
+ // with BIOS booting is not required. Only useful on x86 systems.
+ BootCode []byte
+
+ // BlockSize contains the logical block size of the block device. It must
+ // be a power of two equal to or larger than 512.
+ BlockSize int64
+ // BlockCount contains the number of logical blocks on the block device.
+ // BlockCount times BlockSize is the size of the block device in bytes.
+ BlockCount int64
+
+ // Partitions contains the list of partitions in this table. This is
+ // artificially limited to 128 partitions.
+ Partitions []*Partition
+}
+type addOptions struct {
+ preferEnd bool
+ keepEmptyEntries bool
+ alignment int64
+}
+
+// AddOption is a functional option modifying the behavior of AddPartition.
+type AddOption func(*addOptions)
+
+// WithPreferEnd places the partition as close to the end of the disk as
+// possible instead of as close to the start.
+func WithPreferEnd() AddOption {
+	return func(o *addOptions) {
+		o.preferEnd = true
+	}
+}
+
+// WithKeepEmptyEntries always appends the partition after the last used
+// entry instead of filling the first empty entry, leaving any empty
+// entries before it untouched.
+func WithKeepEmptyEntries() AddOption {
+	return func(o *addOptions) {
+		o.keepEmptyEntries = true
+	}
+}
+
+// WithAlignment allows aligning the partition start block to a non-default
+// value. By default, these are aligned to 1MiB.
+// Only use this flag if you are certain you need it, it can cause quite severe
+// performance degradation under certain conditions.
+func WithAlignment(alignment int64) AddOption {
+	return func(options *addOptions) {
+		options.alignment = alignment
+	}
+}
+
+// AddPartition takes a pointer to a partition and adds it, placing it into
+// the first (or last using WithPreferEnd) continuous free space which fits it.
+// It writes the placement information (FirstBlock, LastBlock) back to p.
+// By default, AddPartition aligns FirstBlock to 1MiB boundaries, but this can
+// be overridden using WithAlignment.
+func (g *Table) AddPartition(p *Partition, size int64, options ...AddOption) error {
+	if g.BlockSize < 512 {
+		return errors.New("block size is smaller than 512 bytes, this is unsupported")
+	}
+	var opts addOptions
+	// Align to 1MiB or the block size, whichever is bigger
+	opts.alignment = 1 * 1024 * 1024
+	if g.BlockSize > opts.alignment {
+		opts.alignment = g.BlockSize
+	}
+	for _, o := range options {
+		o(&opts)
+	}
+	if opts.alignment%g.BlockSize != 0 {
+		return fmt.Errorf("requested alignment (%d bytes) is not an integer multiple of the block size (%d), unable to align", opts.alignment, g.BlockSize)
+	}
+	// Number of blocks the partition should occupy, rounded up.
+	blocks := (size + g.BlockSize - 1) / g.BlockSize
+	alignBlocks := opts.alignment / g.BlockSize
+	fs, _, err := g.GetFreeSpaces()
+	if err != nil {
+		return fmt.Errorf("unable to determine free space: %v", err)
+	}
+	if opts.preferEnd {
+		// Reverse fs slice to start iteration at the end
+		for i, j := 0, len(fs)-1; i < j; i, j = i+1, j-1 {
+			fs[i], fs[j] = fs[j], fs[i]
+		}
+	}
+	var maxFreeBlocks int64
+	for _, freeInt := range fs {
+		start := freeInt[0]
+		end := freeInt[1]
+		freeBlocks := end - start
+		// Round start up to the next aligned block. Note that Go's % can be
+		// negative, so the naive (-start) % alignBlocks would move start back.
+		paddingBlocks := (alignBlocks - start%alignBlocks) % alignBlocks
+		freeBlocks -= paddingBlocks
+		start += paddingBlocks
+		if maxFreeBlocks < freeBlocks {
+			maxFreeBlocks = freeBlocks
+		}
+		if freeBlocks >= blocks {
+			if !opts.preferEnd {
+				p.FirstBlock = uint64(start)
+				p.LastBlock = uint64(start + blocks - 1)
+			} else {
+				// Align FirstBlock down; this fits as freeBlocks >= blocks.
+				moveLeft := (end - blocks) % alignBlocks
+				p.FirstBlock = uint64(end - blocks - moveLeft)
+				p.LastBlock = uint64(end - 1 - moveLeft)
+			}
+			newPartPos := -1
+			if !opts.keepEmptyEntries {
+				for i, part := range g.Partitions {
+					if part.IsUnused() {
+						newPartPos = i
+						break
+					}
+				}
+			}
+			if newPartPos == -1 {
+				g.Partitions = append(g.Partitions, p)
+			} else {
+				g.Partitions[newPartPos] = p
+			}
+			return nil
+		}
+	}
+
+	return fmt.Errorf("no space for partition of %d blocks, largest continuous free space after alignment is %d blocks", blocks, maxFreeBlocks)
+}
+
+// FirstUsableBlock returns the first block usable for partition data; the
+// blocks before it hold the protective MBR, GPT header and entry area.
+func (g *Table) FirstUsableBlock() int64 {
+	entryBlocks := (16384 + g.BlockSize - 1) / g.BlockSize
+	return entryBlocks + 2
+}
+
+// LastUsableBlock returns the last block (inclusive) usable for partition
+// data; the blocks after it hold the alternate entry area and GPT header.
+func (g *Table) LastUsableBlock() int64 {
+	entryBlocks := (16384 + g.BlockSize - 1) / g.BlockSize
+	return g.BlockCount - (entryBlocks + 2)
+}
+
+// GetFreeSpaces returns a slice of tuples, each containing a half-closed
+// interval of logical blocks not occupied by the GPT itself or any partition.
+// The returned intervals are always in ascending order as well as
+// non-overlapping. It also returns if it detected any overlaps between
+// partitions or partitions and the GPT. It returns an error if and only if any
+// partition has its LastBlock before its FirstBlock or exceeds the amount of
+// blocks on the block device.
+//
+// Note that the most common use cases for this function are covered by
+// AddPartition, you're encouraged to use it instead.
+func (g *Table) GetFreeSpaces() ([][2]int64, bool, error) {
+ // This implements an efficient algorithm for finding free intervals given
+ // a set of potentially overlapping occupying intervals. It uses O(n*log n)
+ // time for n being the amount of intervals, i.e. partitions. It uses O(n)
+ // additional memory. This makes it de facto infinitely scalable in the
+ // context of partition tables as the size of the block device is not part
+ // of its cyclomatic complexity and O(n*log n) is tiny for even very big
+ // partition tables.
+
+ // startBlocks contains the start blocks (inclusive) of all occupied
+ // intervals.
+ var startBlocks []int64
+ // endBlocks contains the end blocks (exclusive!) of all occupied intervals.
+ // The interval at index i is given by [startBlock[i], endBlock[i]).
+ var endBlocks []int64
+
+ // Reserve the primary GPT interval including the protective MBR.
+ startBlocks = append(startBlocks, 0)
+ endBlocks = append(endBlocks, g.FirstUsableBlock())
+
+ // Reserve the alternate GPT interval (needs +1 for exclusive interval)
+ startBlocks = append(startBlocks, g.LastUsableBlock()+1)
+ endBlocks = append(endBlocks, g.BlockCount)
+
+ for i, part := range g.Partitions {
+ if part.IsUnused() {
+ continue
+ }
+		// Bail if partition does not contain a valid interval. These are closed
+		// intervals, thus part.FirstBlock == part.LastBlock denotes a valid
+		// partition with a size of one block.
+ if part.FirstBlock > part.LastBlock {
+ return nil, false, fmt.Errorf("partition %d has a LastBlock smaller than its FirstBlock, its interval is [%d, %d]", i, part.FirstBlock, part.LastBlock)
+ }
+ if part.FirstBlock >= uint64(g.BlockCount) || part.LastBlock >= uint64(g.BlockCount) {
+ return nil, false, fmt.Errorf("partition %d exceeds the block count of the block device", i)
+ }
+ startBlocks = append(startBlocks, int64(part.FirstBlock))
+ // Algorithm needs open-closed intervals, thus add +1 to the end.
+ endBlocks = append(endBlocks, int64(part.LastBlock)+1)
+ }
+ // Sort both sets of blocks independently in ascending order. Note that it
+ // is now no longer possible to extract the original intervals. Integers
+ // have no identity thus it doesn't matter if the sort is stable or not.
+ sort.Slice(startBlocks, func(i, j int) bool { return startBlocks[i] < startBlocks[j] })
+ sort.Slice(endBlocks, func(i, j int) bool { return endBlocks[i] < endBlocks[j] })
+
+ var freeSpaces [][2]int64
+
+ // currentIntervals contains the number of intervals which contain the
+ // position currently being iterated over. If currentIntervals is ever
+ // bigger than 1, there is overlap within the given intervals.
+ currentIntervals := 0
+ var hasOverlap bool
+
+ // Iterate for as long as there are interval boundaries to be processed.
+ for len(startBlocks) != 0 || len(endBlocks) != 0 {
+ // Short-circuit boundary processing. If an interval ends at x and the
+ // next one starts at x (this is using half-open intervals), it would
+ // otherwise perform useless processing as well as create an empty free
+ // interval which would then need to be filtered back out.
+ if len(startBlocks) != 0 && len(endBlocks) != 0 && startBlocks[0] == endBlocks[0] {
+ startBlocks = startBlocks[1:]
+ endBlocks = endBlocks[1:]
+ continue
+ }
+ // Pick the lowest boundary from either startBlocks or endBlocks,
+ // preferring endBlocks if they are equal. Don't try to pick from empty
+ // slices.
+ if (len(startBlocks) != 0 && len(endBlocks) != 0 && startBlocks[0] < endBlocks[0]) || len(endBlocks) == 0 {
+ // If currentIntervals == 0 a free space region ends here.
+ // Since this algorithm creates the free space interval at the end
+ // of an occupied interval, for the first interval there is no free
+ // space entry. But in this case it's fine to just ignore it as the
+ // first interval always starts at 0 because of the GPT.
+ if currentIntervals == 0 && len(freeSpaces) != 0 {
+ freeSpaces[len(freeSpaces)-1][1] = startBlocks[0]
+ }
+ // This is the start of an interval, increase the number of active
+ // intervals.
+ currentIntervals++
+ hasOverlap = hasOverlap || currentIntervals > 1
+ // Drop processed startBlock from slice.
+ startBlocks = startBlocks[1:]
+ } else {
+ // This is the end of an interval, decrease the number of active
+ // intervals.
+ currentIntervals--
+ // If currentIntervals == 0 a free space region starts here.
+ // Same as with the startBlocks, ignore a potential free block after
+ // the final range as the GPT occupies the last blocks anyway.
+ if currentIntervals == 0 && len(startBlocks) != 0 {
+ freeSpaces = append(freeSpaces, [2]int64{endBlocks[0], 0})
+ }
+ endBlocks = endBlocks[1:]
+ }
+ }
+ return freeSpaces, hasOverlap, nil
+}
+
+// Overhead returns the number of blocks consumed by the GPT data structures
+// themselves, i.e. blocks not usable for user data.
+func Overhead(blockSize int64) int64 {
+	// Protective MBR + 2 headers + 2 copies of the 16384-byte entry area.
+	entryBlocks := (16384 + blockSize - 1) / blockSize
+	return 2*entryBlocks + 3
+}
+
+// Write writes a list of GPT partitions with a protective MBR to the given
+// WriteSeeker. It must have a defined end, i.e. w.Seek(-x, io.SeekEnd) must
+// seek to x bytes before the end of the disk. If gpt.ID or any of the
+// partition IDs are the all-zero UUID, a new random one is generated and
+// written back. If the output is supposed to be reproducible, generate the
+// UUIDs beforehand.
+func Write(w io.WriteSeeker, gpt *Table) error {
+	if gpt.BlockSize < 512 {
+		return errors.New("block size is smaller than 512 bytes, this is unsupported")
+	}
+	// Layout looks as follows:
+	// Block 0: Protective MBR
+	// Block 1: GPT Header
+	// Block 2-(16384 bytes): GPT partition entries
+	// Block (16384 bytes)-n: GPT partition entries alternate copy
+	// Block n: GPT Header alternate copy
+	if len(gpt.Partitions) > 128 {
+		return errors.New("bigger-than-default GPTs (>128 partitions) are unimplemented")
+	}
+
+	partitionEntryBlocks := (16384 + gpt.BlockSize - 1) / gpt.BlockSize
+	if gpt.BlockCount < 3+(2*partitionEntryBlocks) {
+		return errors.New("not enough blocks to write GPT")
+	}
+
+	if gpt.ID == zeroUUID {
+		gpt.ID = uuid.New()
+	}
+
+	partSize := binary.Size(partition{})
+	slotCount := 128
+
+	var partitionEntriesData bytes.Buffer
+	for i := 0; i < slotCount; i++ {
+		if len(gpt.Partitions) <= i || gpt.Partitions[i] == nil {
+			partitionEntriesData.Write(make([]byte, partSize))
+			continue
+		}
+		p := gpt.Partitions[i]
+		if p.ID == zeroUUID {
+			p.ID = uuid.New()
+		}
+		rawP := partition{
+			Type:       mangleUUID(p.Type),
+			ID:         mangleUUID(p.ID),
+			FirstBlock: p.FirstBlock,
+			LastBlock:  p.LastBlock,
+			Attributes: uint64(p.Attributes),
+		}
+		nameUTF16 := utf16.Encode([]rune(p.Name))
+		// copy will automatically truncate if target is too short
+		copy(rawP.Name[:], nameUTF16)
+		binary.Write(&partitionEntriesData, binary.LittleEndian, rawP)
+	}
+
+	hdr := header{
+		Signature:  gptSignature,
+		Revision:   gptRevision,
+		HeaderSize: uint32(binary.Size(&header{})),
+		ID:         mangleUUID(gpt.ID),
+
+		PartitionEntryCount: uint32(slotCount),
+		PartitionEntrySize:  uint32(partSize),
+
+		FirstUsableBlock: uint64(2 + partitionEntryBlocks),
+		LastUsableBlock:  uint64(gpt.BlockCount - (2 + partitionEntryBlocks)),
+	}
+	hdr.PartitionEntriesCRC32 = crc32.ChecksumIEEE(partitionEntriesData.Bytes())
+
+	hdrChecksum := crc32.NewIEEE()
+
+	// Write alternate header first, as otherwise resizes are unsafe. If the
+	// alternate is currently not at the end of the block device, it cannot
+	// be found. Thus if the write operation is aborted abnormally, the
+	// primary GPT is corrupted and the alternate cannot be found because it
+	// is not at its canonical location. Rewriting the alternate first avoids
+	// this problem.
+
+	// Alternate header
+	if _, err := w.Seek((gpt.LastUsableBlock()+1)*gpt.BlockSize, io.SeekStart); err != nil {
+		return fmt.Errorf("failed to seek to end of block device: %w", err)
+	}
+	hdr.HeaderBlock = uint64(gpt.BlockCount - 1)
+	hdr.AlternateHeaderBlock = 1
+	hdr.PartitionEntriesStartBlock = uint64(gpt.BlockCount - (1 + partitionEntryBlocks))
+
+	hdrChecksum.Reset()
+	hdr.HeaderCRC32 = 0
+	binary.Write(hdrChecksum, binary.LittleEndian, &hdr)
+	hdr.HeaderCRC32 = hdrChecksum.Sum32()
+
+	if _, err := w.Write(partitionEntriesData.Bytes()); err != nil {
+		return fmt.Errorf("failed to write alternate partition entries: %w", err)
+	}
+	if _, err := w.Seek((gpt.BlockCount-1)*gpt.BlockSize, io.SeekStart); err != nil {
+		return fmt.Errorf("failed to seek to end of block device: %w", err)
+	}
+
+	if err := binary.Write(w, binary.LittleEndian, &hdr); err != nil {
+		return fmt.Errorf("failed to write alternate header: %w", err)
+	}
+	if _, err := w.Write(make([]byte, gpt.BlockSize-int64(binary.Size(hdr)))); err != nil {
+		return fmt.Errorf("failed to write padding: %w", err)
+	}
+
+	// Primary header
+	if _, err := w.Seek(0, io.SeekStart); err != nil {
+		return fmt.Errorf("failed to seek to start of block device: %w", err)
+	}
+	hdr.HeaderBlock = 1
+	hdr.AlternateHeaderBlock = uint64(gpt.BlockCount - 1)
+	hdr.PartitionEntriesStartBlock = 2
+
+	hdrChecksum.Reset()
+	hdr.HeaderCRC32 = 0
+	binary.Write(hdrChecksum, binary.LittleEndian, &hdr)
+	hdr.HeaderCRC32 = hdrChecksum.Sum32()
+
+	if err := makeProtectiveMBR(w, gpt.BlockCount, gpt.BootCode); err != nil {
+		return fmt.Errorf("failed to write first block: %w", err)
+	}
+
+	if err := binary.Write(w, binary.LittleEndian, &hdr); err != nil {
+		return fmt.Errorf("failed to write primary header: %w", err)
+	}
+	if _, err := w.Write(make([]byte, gpt.BlockSize-int64(binary.Size(hdr)))); err != nil {
+		return fmt.Errorf("failed to write padding: %w", err)
+	}
+	if _, err := w.Write(partitionEntriesData.Bytes()); err != nil {
+		return fmt.Errorf("failed to write primary partition entries: %w", err)
+	}
+	return nil
+}
+
+// Read reads a Table from a block device given its block size and count.
+func Read(r io.ReadSeeker, blockSize int64, blockCount int64) (*Table, error) {
+	if Overhead(blockSize) > blockCount {
+		return nil, errors.New("disk cannot contain a GPT as the block count is too small to store one")
+	}
+	if _, err := r.Seek(0, io.SeekStart); err != nil {
+		return nil, fmt.Errorf("failed to seek to block 0: %w", err)
+	}
+	zeroBlock := make([]byte, blockSize)
+	if _, err := io.ReadFull(r, zeroBlock); err != nil {
+		return nil, fmt.Errorf("failed to read first block: %w", err)
+	}
+
+	var m mbr
+	if err := binary.Read(bytes.NewReader(zeroBlock[:512]), binary.LittleEndian, &m); err != nil {
+		panic(err) // Read is from memory and with enough data
+	}
+	// The UEFI standard says that the only acceptable MBR for a GPT-partitioned
+	// device is a pure protective MBR with one partition of type 0xEE covering
+	// the entire disk. But reality is sadly not so simple. People have come up
+	// with hacks like Hybrid MBR which is basically a way to expose partitions
+	// as both GPT partitions and MBR partitions. There are also GPTs without
+	// any MBR at all.
+	// Following the standard strictly when reading means that this library
+	// would fail to read valid GPT disks where such schemes are employed.
+	// On the other hand just looking at the GPT signature is also dangerous
+	// as not all tools clear the second block where the GPT resides when
+	// writing an MBR, which results in reading a wrong/obsolete GPT.
+	// As a pragmatic solution this library treats any disk as GPT-formatted if
+	// the first block does not contain an MBR signature or at least one MBR
+	// partition has type 0xEE (GPT). It does however not care in which slot
+	// this partition is or if it begins at the start of the disk.
+	//
+	// Note that the block signatures for MBR and FAT are shared. This is a
+	// historical artifact from DOS. It is not reliably possible to
+	// differentiate the two as either has boot code where the other has meta-
+	// data and both lack any checksums. Because the MBR partition table is at
+	// the very end of the FAT bootcode section the following code always
+	// assumes that it is dealing with an MBR. This is both more likely and
+	// the 0xEE marker is rarer and thus more specific than FATs 0x00, 0x80 and
+	// 0x02.
+	var bootCode []byte
+	hasDOSBootSig := m.Signature == mbrSignature
+	if hasDOSBootSig {
+		var isGPT bool
+		for _, p := range m.PartitionRecords {
+			if p.Type == 0xEE {
+				isGPT = true
+			}
+		}
+		// Note that there is a small but non-zero chance that isGPT is true
+		// for a raw FAT filesystem if the bootcode contains a "valid" MBR.
+		// The next error message mentions that possibility.
+		if !isGPT {
+			return nil, errors.New("block device contains an MBR table without a GPT marker or a raw FAT filesystem")
+		}
+		// Trim right zeroes away as they are padded back when writing. This
+		// makes BootCode empty when it is all-zeros, making it easier to work
+		// with while still round-tripping correctly.
+		bootCode = bytes.TrimRight(m.BootCode[:], "\x00")
+	}
+	// Read the primary GPT. If it is damaged and/or broken, read the alternate.
+	primaryGPT, err := readSingleGPT(r, blockSize, blockCount, 1)
+	if err != nil {
+		alternateGPT, err2 := readSingleGPT(r, blockSize, blockCount, blockCount-1)
+		if err2 != nil {
+			return nil, fmt.Errorf("failed to read both GPTs: primary GPT (%v), alternate GPT (%v)", err, err2)
+		}
+		alternateGPT.BootCode = bootCode
+		return alternateGPT, nil
+	}
+	primaryGPT.BootCode = bootCode
+	return primaryGPT, nil
+}
+
+func readSingleGPT(r io.ReadSeeker, blockSize int64, blockCount int64, headerBlockPos int64) (*Table, error) {
+ if _, err := r.Seek(blockSize*headerBlockPos, io.SeekStart); err != nil {
+ return nil, fmt.Errorf("failed to seek to block %d: %w", headerBlockPos, err)
+ }
+ hdrBlock := make([]byte, blockSize)
+ if _, err := io.ReadFull(r, hdrBlock); err != nil {
+ return nil, fmt.Errorf("failed to read GPT header block: %w", err)
+ }
+ hdrBlockReader := bytes.NewReader(hdrBlock)
+ var hdr header
+ if err := binary.Read(hdrBlockReader, binary.LittleEndian, &hdr); err != nil {
+ panic(err) // Read from memory with enough bytes, should not fail
+ }
+ if hdr.Signature != gptSignature {
+ return nil, errors.New("no GPT signature found")
+ }
+ if hdr.HeaderSize < uint32(binary.Size(hdr)) {
+ return nil, fmt.Errorf("GPT header size is too small, likely corrupted")
+ }
+ if int64(hdr.HeaderSize) > blockSize {
+ return nil, fmt.Errorf("GPT header size is bigger than block size, likely corrupted")
+ }
+ // Use reserved bytes to hash, but do not expose them to the user.
+ // If someone has a need to process them, they should extend this library
+ // with whatever an updated UEFI specification contains.
+ // It has been considered to store these in the user-exposed GPT struct to
+ // be able to round-trip them cleanly, but there is significant complexity
+ // and risk involved in doing so.
+ reservedBytes := hdrBlock[binary.Size(hdr):hdr.HeaderSize]
+ hdrExpectedCRC := hdr.HeaderCRC32
+ hdr.HeaderCRC32 = 0
+ hdrCRC := crc32.NewIEEE()
+ binary.Write(hdrCRC, binary.LittleEndian, &hdr)
+ hdrCRC.Write(reservedBytes)
+ if hdrCRC.Sum32() != hdrExpectedCRC {
+ return nil, fmt.Errorf("GPT header checksum mismatch, probably corrupted")
+ }
+ if hdr.HeaderBlock != uint64(headerBlockPos) {
+ return nil, errors.New("GPT header indicates wrong block")
+ }
+ if hdr.PartitionEntrySize < uint32(binary.Size(partition{})) {
+ return nil, errors.New("partition entry size too small")
+ }
+ if hdr.PartitionEntriesStartBlock > uint64(blockCount) {
+ return nil, errors.New("partition entry start block is out of range")
+ }
+ // Sanity-check total size of the partition entry area. Otherwise, this is a
+ // trivial DoS as it could cause allocation of gigabytes of memory.
+ // 4MiB is equivalent to around 45k partitions at the current size.
+ // I know of no operating system which would handle even a fraction of this.
+ if uint64(hdr.PartitionEntryCount)*uint64(hdr.PartitionEntrySize) > 4*1024*1024 {
+ return nil, errors.New("partition entry area bigger than 4MiB, refusing to read")
+ }
+ partitionEntryData := make([]byte, hdr.PartitionEntrySize*hdr.PartitionEntryCount)
+ if _, err := r.Seek(blockSize*int64(hdr.PartitionEntriesStartBlock), io.SeekStart); err != nil {
+ return nil, fmt.Errorf("failed to seek to partition entry start block: %w", err)
+ }
+ if _, err := io.ReadFull(r, partitionEntryData); err != nil {
+ return nil, fmt.Errorf("failed to read partition entries: %w", err)
+ }
+ if crc32.ChecksumIEEE(partitionEntryData) != hdr.PartitionEntriesCRC32 {
+ return nil, errors.New("GPT partition entry table checksum mismatch")
+ }
+ var g Table
+ g.ID = unmangleUUID(hdr.ID)
+ g.BlockSize = blockSize
+ g.BlockCount = blockCount
+ for i := uint32(0); i < hdr.PartitionEntryCount; i++ {
+ entryReader := bytes.NewReader(partitionEntryData[i*hdr.PartitionEntrySize : (i+1)*hdr.PartitionEntrySize])
+ var part partition
+ if err := binary.Read(entryReader, binary.LittleEndian, &part); err != nil {
+ panic(err) // Should not happen
+ }
+ // If the partition type is the all-zero UUID, this slot counts as
+ // unused.
+ if part.Type == zeroUUID {
+ g.Partitions = append(g.Partitions, nil)
+ continue
+ }
+ g.Partitions = append(g.Partitions, &Partition{
+ ID: unmangleUUID(part.ID),
+ Type: unmangleUUID(part.Type),
+ Name: strings.TrimRight(string(utf16.Decode(part.Name[:])), "\x00"),
+ FirstBlock: part.FirstBlock,
+ LastBlock: part.LastBlock,
+ Attributes: Attribute(part.Attributes),
+ })
+ }
+ // Remove long list of nils at the end as it's inconvenient to work with
+ // (append doesn't work, debug prints are very long) and it round-trips
+ // correctly even without it as it gets zero-padded when writing anyway.
+ var maxValidPartition int
+ for i, p := range g.Partitions {
+ if !p.IsUnused() {
+ maxValidPartition = i
+ }
+ }
+ g.Partitions = g.Partitions[:maxValidPartition+1]
+ return &g, nil
+}
+
+var mixedEndianTranspose = []int{3, 2, 1, 0, 5, 4, 7, 6, 8, 9, 10, 11, 12, 13, 14, 15}
+
+// mangleUUID converts a standard big-endian UUID into the "mixed-endian"
+// on-disk layout used by GPT (the first three groups are byte-swapped).
+// Thank Microsoft for this idiosyncrasy.
+func mangleUUID(u uuid.UUID) (o [16]byte) {
+	for i, src := range mixedEndianTranspose {
+		o[i] = u[src]
+	}
+	return
+}
+
+// unmangleUUID does the inverse transformation of mangleUUID.
+func unmangleUUID(i [16]byte) (o uuid.UUID) {
+	for src, dst := range mixedEndianTranspose {
+		o[dst] = i[src]
+	}
+	return
+}
diff --git a/metropolis/pkg/gpt/gpt_test.go b/metropolis/pkg/gpt/gpt_test.go
new file mode 100644
index 0000000..df2970b
--- /dev/null
+++ b/metropolis/pkg/gpt/gpt_test.go
@@ -0,0 +1,152 @@
+package gpt
+
+import (
+ "bytes"
+ "crypto/sha256"
+ "io"
+ "os"
+ "testing"
+
+ "github.com/google/uuid"
+)
+
+func TestUUIDTranspose(t *testing.T) {
+ testUUID := uuid.MustParse("00112233-4455-6677-c899-aabbccddeeff")
+ mixedEndianUUID := mangleUUID(testUUID)
+ expectedMixedEndianUUID := [16]byte{0x33, 0x22, 0x11, 0x00, 0x55, 0x44, 0x77, 0x66, 0xc8, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}
+ if mixedEndianUUID != expectedMixedEndianUUID {
+ t.Errorf("mangleUUID(%s) = %x, expected %x", testUUID, mixedEndianUUID, expectedMixedEndianUUID)
+ }
+ roundTrippedUUID := unmangleUUID(mixedEndianUUID)
+ if testUUID != roundTrippedUUID {
+ t.Errorf("unmangleUUID(mangleUUID(%s)) = %s, expected input", testUUID, roundTrippedUUID)
+ }
+}
+
// TestFreeSpaces checks Table.GetFreeSpaces against hand-computed expectations
// on a 2048-block table. Returned free ranges are half-open [start, end)
// block intervals. The expected values imply that blocks 34..2014 are
// allocatable on such a disk (the blocks before/after presumably hold the
// primary and backup GPT — confirm against gpt.go).
func TestFreeSpaces(t *testing.T) {
	cases := []struct {
		name            string
		parts           []*Partition  // partition layout under test
		expected        [][2]int64    // expected free ranges, [start, end)
		expectedOverlap bool          // whether overlapping partitions should be reported
	}{
		{"Empty", []*Partition{}, [][2]int64{{34, 2015}}, false},
		{"OnePart", []*Partition{
			{Type: PartitionTypeEFISystem, FirstBlock: 200, LastBlock: 1499},
		}, [][2]int64{
			{34, 200},
			{1500, 2015},
		}, false},
		{"TwoOverlappingParts", []*Partition{
			{Type: PartitionTypeEFISystem, FirstBlock: 200, LastBlock: 1499},
			{Type: PartitionTypeEFISystem, FirstBlock: 1000, LastBlock: 1999},
		}, [][2]int64{
			{34, 200},
			{2000, 2015},
		}, true},
		{"Full", []*Partition{
			{Type: PartitionTypeEFISystem, FirstBlock: 34, LastBlock: 999},
			{Type: PartitionTypeEFISystem, FirstBlock: 1000, LastBlock: 2014},
		}, [][2]int64{}, false},
		{"TwoSpacedParts", []*Partition{
			{Type: PartitionTypeEFISystem, FirstBlock: 500, LastBlock: 899},
			{Type: PartitionTypeEFISystem, FirstBlock: 1200, LastBlock: 1799},
		}, [][2]int64{
			{34, 500},
			{900, 1200},
			{1800, 2015},
		}, false},
	}

	// Partitions are created manually as AddPartition calls FreeSpaces itself,
	// which makes the test unreliable as well as making failures very hard to
	// debug.
	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			g := Table{
				BlockSize:  512,
				BlockCount: 2048, // 0.5KiB * 2048 = 1MiB
				Partitions: c.parts,
			}
			fs, overlap, err := g.GetFreeSpaces()
			if err != nil {
				t.Fatal(err)
			}
			if overlap != c.expectedOverlap {
				t.Errorf("expected overlap %v, got %v", c.expectedOverlap, overlap)
			}
			// Length mismatch is fatal so the element-wise comparison below
			// cannot index out of range.
			if len(fs) != len(c.expected) {
				t.Fatalf("expected %v, got %v", c.expected, fs)
			}
			for i := range fs {
				if fs[i] != c.expected[i] {
					t.Errorf("free space mismatch at pos %d: got [%d, %d), expected [%d, %d)", i, fs[i][0], fs[i][1], c.expected[i][0], c.expected[i][1])
				}
			}
		})
	}
}
+
+func TestRoundTrip(t *testing.T) {
+ if os.Getenv("IN_KTEST") == "true" {
+ t.Skip("In ktest")
+ }
+ g := Table{
+ ID: uuid.NewSHA1(zeroUUID, []byte("test")),
+ BlockSize: 512,
+ BlockCount: 2048,
+ BootCode: []byte("just some test code"),
+ Partitions: []*Partition{
+ nil,
+ // This emoji is very complex and exercises UTF16 surrogate encoding
+ // and composing.
+ {Name: "Test 🏃♂️", FirstBlock: 10, LastBlock: 19, Type: PartitionTypeEFISystem, ID: uuid.NewSHA1(zeroUUID, []byte("test1")), Attributes: AttrNoBlockIOProto},
+ nil,
+ {Name: "Test2", FirstBlock: 20, LastBlock: 49, Type: PartitionTypeEFISystem, ID: uuid.NewSHA1(zeroUUID, []byte("test2")), Attributes: 0},
+ },
+ }
+ f, err := os.CreateTemp("", "")
+ if err != nil {
+ t.Fatalf("Failed to create temporary file: %v", err)
+ }
+ defer os.Remove(f.Name())
+ if err := Write(f, &g); err != nil {
+ t.Fatalf("Error while wrinting Table: %v", err)
+ }
+ f.Seek(0, io.SeekStart)
+ originalHash := sha256.New()
+ if _, err := io.Copy(originalHash, f); err != nil {
+ panic(err)
+ }
+
+ g2, err := Read(f, 512, 2048)
+ if err != nil {
+ t.Fatalf("Failed to read back GPT: %v", err)
+ }
+ if g2.ID != g.ID {
+ t.Errorf("Disk UUID changed when reading back: %v", err)
+ }
+ // Destroy primary GPT
+ f.Seek(1*g.BlockSize, io.SeekStart)
+ f.Write(make([]byte, 512))
+ f.Seek(0, io.SeekStart)
+ g3, err := Read(f, 512, 2048)
+ if err != nil {
+ t.Fatalf("Failed to read back GPT with primary GPT destroyed: %v", err)
+ }
+ if g3.ID != g.ID {
+ t.Errorf("Disk UUID changed when reading back: %v", err)
+ }
+ f.Seek(0, io.SeekStart)
+ if err := Write(f, g3); err != nil {
+ t.Fatalf("Error while writing back GPT: %v", err)
+ }
+ f.Seek(0, io.SeekStart)
+ rewrittenHash := sha256.New()
+ if _, err := io.Copy(rewrittenHash, f); err != nil {
+ panic(err)
+ }
+ if !bytes.Equal(originalHash.Sum([]byte{}), rewrittenHash.Sum([]byte{})) {
+ t.Errorf("Write/Read/Write test was not reproducible: %x != %x", originalHash.Sum([]byte{}), rewrittenHash.Sum([]byte{}))
+ }
+}
diff --git a/metropolis/pkg/gpt/linux_test.go b/metropolis/pkg/gpt/linux_test.go
new file mode 100644
index 0000000..19ac519
--- /dev/null
+++ b/metropolis/pkg/gpt/linux_test.go
@@ -0,0 +1,51 @@
+package gpt
+
+import (
+ "os"
+ "testing"
+
+ "github.com/google/uuid"
+ "golang.org/x/sys/unix"
+)
+
// testUUID is a fixed namespace UUID used to derive deterministic test UUIDs
// via uuid.NewSHA1 in the tests below.
var testUUID = uuid.MustParse("85c0b60f-caf9-40dd-86fa-f8797e26238d")
+
+func TestKernelInterop(t *testing.T) {
+ if os.Getenv("IN_KTEST") != "true" {
+ t.Skip("Not in ktest")
+ }
+ ram0, err := os.OpenFile("/dev/ram0", os.O_RDWR, 0)
+ if err != nil {
+ panic(err)
+ }
+ g := Table{
+ ID: uuid.NewSHA1(testUUID, []byte("test")),
+ BlockSize: 512,
+ BlockCount: 2048,
+ BootCode: []byte("just some test code"),
+ Partitions: []*Partition{
+ nil,
+ // This emoji is very complex and exercises UTF16 surrogate encoding
+ // and composing.
+ {Name: "Test 🏃♂️", FirstBlock: 10, LastBlock: 19, Type: PartitionTypeEFISystem, ID: uuid.NewSHA1(testUUID, []byte("test1")), Attributes: AttrNoBlockIOProto},
+ nil,
+ {Name: "Test2", FirstBlock: 20, LastBlock: 49, Type: PartitionTypeEFISystem, ID: uuid.NewSHA1(testUUID, []byte("test2")), Attributes: 0},
+ },
+ }
+ if err := Write(ram0, &g); err != nil {
+ t.Fatalf("Failed to write GPT: %v", err)
+ }
+
+ if err := unix.IoctlSetInt(int(ram0.Fd()), unix.BLKRRPART, 0); err != nil {
+ t.Fatalf("Failed to reread partition table: %v", err)
+ }
+ if _, err := os.Stat("/sys/block/ram0/ram0p2"); err != nil {
+ t.Errorf("Expected ram0p2 to exist, got %v", err)
+ }
+ if _, err := os.Stat("/sys/block/ram0/ram0p4"); err != nil {
+ t.Errorf("Expected ram0p4 to exist, got %v", err)
+ }
+ if _, err := os.Stat("/sys/block/ram0/ram0p1"); err == nil {
+ t.Error("Expected ram0p1 not to exist, but it exists")
+ }
+}
diff --git a/metropolis/pkg/gpt/mbr.go b/metropolis/pkg/gpt/mbr.go
new file mode 100644
index 0000000..46f561f
--- /dev/null
+++ b/metropolis/pkg/gpt/mbr.go
@@ -0,0 +1,82 @@
+package gpt
+
+import (
+ "encoding/binary"
+ "fmt"
+ "io"
+ "math"
+)
+
// mbr is the on-disk layout of a legacy Master Boot Record block, used here
// to emit the protective MBR in front of a GPT.
// See UEFI Specification 2.9 Table 5-3
type mbr struct {
	BootCode [440]byte // legacy x86 boot code, zero-padded
	DiskSignature [4]byte // disk signature; written as all-zero by this package
	_ [2]byte // unknown/reserved padding
	PartitionRecords [4]mbrPartitionRecord // the four primary partition records
	Signature [2]byte // boot signature, set to mbrSignature (0x55 0xAA)
}
+
// mbrPartitionRecord is a single partition entry in the MBR partition table.
// See UEFI Specification 2.9 Table 5-4
type mbrPartitionRecord struct {
	BootIndicator byte // boot indicator flag; left zero by this package
	StartingCHS [3]byte // legacy CHS address of the first block, see toCHS
	Type byte // partition type; 0xEE marks a GPT protective partition
	EndingCHS [3]byte // legacy CHS address of the last block, see toCHS
	StartingBlock uint32 // first LBA of the partition
	SizeInBlocks uint32 // partition length in blocks
}

// mbrSignature is the 2-byte boot signature terminating every valid MBR.
var mbrSignature = [2]byte{0x55, 0xaa}
+
+func makeProtectiveMBR(w io.Writer, blockCount int64, bootCode []byte) error {
+ var representedBlockCount = uint32(math.MaxUint32)
+ if blockCount < math.MaxUint32 {
+ representedBlockCount = uint32(blockCount)
+ }
+ m := mbr{
+ DiskSignature: [4]byte{0, 0, 0, 0},
+ PartitionRecords: [4]mbrPartitionRecord{
+ {
+ StartingCHS: toCHS(1),
+ Type: 0xEE, // Table/Protective MBR
+ StartingBlock: 1,
+ SizeInBlocks: representedBlockCount,
+ EndingCHS: toCHS(blockCount + 1),
+ },
+ {},
+ {},
+ {},
+ },
+ Signature: mbrSignature,
+ }
+ if len(bootCode) > len(m.BootCode) {
+ return fmt.Errorf("BootCode is %d bytes, can only store %d", len(bootCode), len(m.BootCode))
+ }
+ copy(m.BootCode[:], bootCode)
+ if err := binary.Write(w, binary.LittleEndian, &m); err != nil {
+ return fmt.Errorf("failed to write MBR: %w", err)
+ }
+ return nil
+}
+
// toCHS converts a LBA to a "logical" CHS, i.e. what a legacy BIOS 13h
// interface would use. This has nothing to do with the actual CHS geometry
// which depends on the disk and interface used.
func toCHS(lba int64) (chs [3]byte) {
	const maxCylinders = (1 << 10) - 1
	const maxHeadsPerCylinder = (1 << 8) - 1
	const maxSectorsPerTrack = (1 << 6) - 2 // Sector is 1-based
	c := lba / (maxHeadsPerCylinder * maxSectorsPerTrack)
	h := (lba / maxSectorsPerTrack) % maxHeadsPerCylinder
	s := lba%maxSectorsPerTrack + 1
	if c > maxCylinders {
		// Address exceeds what logical CHS can encode; saturate every
		// component to its maximum value.
		c, h, s = maxCylinders, maxHeadsPerCylinder, maxSectorsPerTrack+1
	}
	chs[0] = uint8(h)
	// The sector byte carries the two high cylinder bits in its top two bits.
	chs[1] = uint8(s) | (uint8(c>>2) & 0xc0)
	chs[2] = uint8(c)
	return
}
diff --git a/metropolis/pkg/gpt/mbr_test.go b/metropolis/pkg/gpt/mbr_test.go
new file mode 100644
index 0000000..2ed4ed1
--- /dev/null
+++ b/metropolis/pkg/gpt/mbr_test.go
@@ -0,0 +1,30 @@
+package gpt
+
+import "testing"
+
+func TestToCHS(t *testing.T) {
+ cases := []struct {
+ name string
+ lba int64
+ expectedCHS [3]byte
+ }{
+ { // See UEFI Specification 2.9 Table 5-4 StartingCHS
+ name: "One",
+ lba: 1,
+ expectedCHS: [3]byte{0x00, 0x02, 0x00},
+ },
+ {
+ name: "TooBig",
+ lba: (1023 * 255 * 63) + 1,
+ expectedCHS: [3]byte{0xff, 0xff, 0xff},
+ },
+ }
+ for _, c := range cases {
+ t.Run(c.name, func(t *testing.T) {
+ chs := toCHS(c.lba)
+ if chs != c.expectedCHS {
+ t.Errorf("expected %x, got %x", c.expectedCHS, chs)
+ }
+ })
+ }
+}
diff --git a/metropolis/pkg/gpt/structs_test.go b/metropolis/pkg/gpt/structs_test.go
new file mode 100644
index 0000000..dc53635
--- /dev/null
+++ b/metropolis/pkg/gpt/structs_test.go
@@ -0,0 +1,27 @@
+package gpt
+
+import (
+ "encoding/binary"
+ "reflect"
+ "testing"
+)
+
+func TestStructureSizes(t *testing.T) {
+ cases := []struct {
+ StructInstance interface{}
+ ExpectedSize int
+ }{
+ {mbr{}, 512},
+ {mbrPartitionRecord{}, 16},
+ {header{}, 92},
+ {partition{}, 128},
+ }
+ for _, c := range cases {
+ t.Run(reflect.TypeOf(c.StructInstance).String(), func(t *testing.T) {
+ actualSize := binary.Size(c.StructInstance)
+ if actualSize != c.ExpectedSize {
+ t.Errorf("Expected %d bytes, got %d", c.ExpectedSize, actualSize)
+ }
+ })
+ }
+}