treewide: port everything to blockdev

This gets rid of most ad-hoc block device code, using blockdev for
everything. It also gets rid of diskfs for everything but tests. This
enables Metropolis to be installed on disks whose block size is not
512 bytes.

Change-Id: I644b5b68bb7bed8106585df3179674789031687a
Reviewed-on: https://review.monogon.dev/c/monogon/+/1873
Tested-by: Jenkins CI
Reviewed-by: Serge Bazanski <serge@monogon.tech>
diff --git a/metropolis/pkg/gpt/gpt.go b/metropolis/pkg/gpt/gpt.go
index 81c50f7..f5c2a907 100644
--- a/metropolis/pkg/gpt/gpt.go
+++ b/metropolis/pkg/gpt/gpt.go
@@ -10,13 +10,13 @@
 	"errors"
 	"fmt"
 	"hash/crc32"
-	"io"
 	"sort"
 	"strings"
 	"unicode/utf16"
 
 	"github.com/google/uuid"
 
+	"source.monogon.dev/metropolis/pkg/blockdev"
 	"source.monogon.dev/metropolis/pkg/msguid"
 )
 
@@ -58,8 +58,6 @@
 	PartitionTypeEFISystem = uuid.MustParse("C12A7328-F81F-11D2-BA4B-00A0C93EC93B")
 )
 
-var zeroUUID = [16]byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
-
 // Attribute is a bitfield of attributes set on a partition. Bits 0 to 47 are
 // reserved for UEFI specification use and all current assignments are in the
 // following const block. Bits 48 to 64 are available for per-Type use by
@@ -112,6 +110,8 @@
 	LastBlock uint64
 	// Bitset of attributes of this partition.
 	Attributes Attribute
+
+	*blockdev.Section
 }
 
 // SizeBlocks returns the size of the partition in blocks
@@ -125,7 +125,16 @@
 	if p == nil {
 		return true
 	}
-	return p.Type == zeroUUID
+	return p.Type == uuid.Nil
+}
+
+// New returns an empty table on the given block device.
+// It does not read any existing GPT on the disk (use Read for that), nor does
+// it write anything until Write is called.
+func New(b blockdev.BlockDev) (*Table, error) {
+	return &Table{
+		b: b,
+	}, nil
 }
 
 type Table struct {
@@ -142,17 +151,13 @@
 	// with BIOS booting is not required. Only useful on x86 systems.
 	BootCode []byte
 
-	// BlockSize contains the logical block size of the block device. It must
-	// be a power of two equal to or larger than 512.
-	BlockSize int64
-	// BlockCount contains the number of logical blocks on the block device.
-	// BlockCount times BlockSize is the size of the block device in bytes.
-	BlockCount int64
-
 	// Partitions contains the list of partitions in this table. This is
 	// artificially limited to 128 partitions.
 	Partitions []*Partition
+
+	b blockdev.BlockDev
 }
+
 type addOptions struct {
 	preferEnd        bool
 	keepEmptyEntries bool
@@ -195,23 +200,23 @@
 // By default, AddPartition aligns FirstBlock to 1MiB boundaries, but this can
 // be overridden using WithAlignment.
 func (g *Table) AddPartition(p *Partition, size int64, options ...AddOption) error {
-	if g.BlockSize < 512 {
-		return errors.New("block size is smaller than 512 bytes, this is unsupported")
-	}
+	blockSize := g.b.BlockSize()
 	var opts addOptions
 	// Align to 1MiB or the block size, whichever is bigger
 	opts.alignment = 1 * 1024 * 1024
-	if g.BlockSize > opts.alignment {
-		opts.alignment = g.BlockSize
+	if blockSize > opts.alignment {
+		opts.alignment = blockSize
 	}
 	for _, o := range options {
 		o(&opts)
 	}
-	if opts.alignment%g.BlockSize != 0 {
-		return fmt.Errorf("requested alignment (%d bytes) is not an integer multiple of the block size (%d), unable to align", opts.alignment, g.BlockSize)
+	if opts.alignment%blockSize != 0 {
+		return fmt.Errorf("requested alignment (%d bytes) is not an integer multiple of the block size (%d), unable to align", opts.alignment, blockSize)
 	}
-	// Number of blocks the partition should occupy, rounded up.
-	blocks := (size + g.BlockSize - 1) / g.BlockSize
+	if p.ID == uuid.Nil {
+		p.ID = uuid.New()
+	}
+
 	fs, _, err := g.GetFreeSpaces()
 	if err != nil {
 		return fmt.Errorf("unable to determine free space: %v", err)
@@ -222,13 +227,28 @@
 			fs[i], fs[j] = fs[j], fs[i]
 		}
 	}
+	// Number of blocks the partition should occupy, rounded up.
+	blocks := (size + blockSize - 1) / blockSize
+	if size == -1 {
+		var largestFreeSpace int64
+		for _, freeInt := range fs {
+			intSz := freeInt[1] - freeInt[0]
+			if intSz > largestFreeSpace {
+				largestFreeSpace = intSz
+			}
+		}
+		blocks = largestFreeSpace
+	}
 	var maxFreeBlocks int64
 	for _, freeInt := range fs {
 		start := freeInt[0]
 		end := freeInt[1]
 		freeBlocks := end - start
 		// Align start properly
-		paddingBlocks := (-start) % (opts.alignment / g.BlockSize)
+		alignTo := (opts.alignment / blockSize)
+		// Go doesn't implement the euclidean modulus, thus this construction
+		// is necessary.
+		paddingBlocks := ((alignTo - start) % alignTo) % alignTo
 		freeBlocks -= paddingBlocks
 		start += paddingBlocks
 		if maxFreeBlocks < freeBlocks {
@@ -237,13 +257,13 @@
 		if freeBlocks >= blocks {
 			if !opts.preferEnd {
 				p.FirstBlock = uint64(start)
-				p.LastBlock = uint64(start + blocks)
+				p.LastBlock = uint64(start + blocks - 1)
 			} else {
 				// Realign FirstBlock. This will always succeed as
 				// there is enough space to align to the start.
-				moveLeft := (end - blocks - 1) % (opts.alignment / g.BlockSize)
+				moveLeft := (end - blocks - 1) % (opts.alignment / blockSize)
 				p.FirstBlock = uint64(end - (blocks + 1 + moveLeft))
-				p.LastBlock = uint64(end - (1 + moveLeft))
+				p.LastBlock = uint64(end - (2 + moveLeft))
 			}
 			newPartPos := -1
 			if !opts.keepEmptyEntries {
@@ -259,6 +279,7 @@
 			} else {
 				g.Partitions[newPartPos] = p
 			}
+			p.Section = blockdev.NewSection(g.b, int64(p.FirstBlock), int64(p.LastBlock)+1)
 			return nil
 		}
 	}
@@ -269,15 +290,17 @@
 // FirstUsableBlock returns the first usable (i.e. a partition can start there)
 // block.
 func (g *Table) FirstUsableBlock() int64 {
-	partitionEntryBlocks := (16384 + g.BlockSize - 1) / g.BlockSize
+	blockSize := g.b.BlockSize()
+	partitionEntryBlocks := (16384 + blockSize - 1) / blockSize
 	return 2 + partitionEntryBlocks
 }
 
 // LastUsableBlock returns the last usable (i.e. a partition can end there)
 // block. This block is inclusive.
 func (g *Table) LastUsableBlock() int64 {
-	partitionEntryBlocks := (16384 + g.BlockSize - 1) / g.BlockSize
-	return g.BlockCount - (2 + partitionEntryBlocks)
+	blockSize := g.b.BlockSize()
+	partitionEntryBlocks := (16384 + blockSize - 1) / blockSize
+	return g.b.BlockCount() - (2 + partitionEntryBlocks)
 }
 
 // GetFreeSpaces returns a slice of tuples, each containing a half-closed
@@ -299,6 +322,8 @@
 	// of its cyclomatic complexity and O(n*log n) is tiny for even very big
 	// partition tables.
 
+	blockCount := g.b.BlockCount()
+
 	// startBlocks contains the start blocks (inclusive) of all occupied
 	// intervals.
 	var startBlocks []int64
@@ -312,7 +337,7 @@
 
 	// Reserve the alternate GPT interval (needs +1 for exclusive interval)
 	startBlocks = append(startBlocks, g.LastUsableBlock()+1)
-	endBlocks = append(endBlocks, g.BlockCount)
+	endBlocks = append(endBlocks, blockCount)
 
 	for i, part := range g.Partitions {
 		if part.IsUnused() {
@@ -324,7 +349,7 @@
 		if part.FirstBlock > part.LastBlock {
 			return nil, false, fmt.Errorf("partition %d has a LastBlock smaller than its FirstBlock, its interval is [%d, %d]", i, part.FirstBlock, part.LastBlock)
 		}
-		if part.FirstBlock >= uint64(g.BlockCount) || part.LastBlock >= uint64(g.BlockCount) {
+		if part.FirstBlock >= uint64(blockCount) || part.LastBlock >= uint64(blockCount) {
 			return nil, false, fmt.Errorf("partition %d exceeds the block count of the block device", i)
 		}
 		startBlocks = append(startBlocks, int64(part.FirstBlock))
@@ -398,14 +423,14 @@
 	return 3 + (2 * partitionEntryBlocks)
 }
 
-// Write writes a list of GPT partitions with a protective MBR to the given
-// WriteSeeker. It must have a defined end, i.e. w.Seek(-x, io.SeekEnd) must
-// seek to x bytes before the end of the disk. If gpt.ID or any of the
-// partition IDs are the all-zero UUID, a new random one is generated and
-// written back. If the output is supposed to be reproducible, generate the
-// UUIDs beforehand.
-func Write(w io.WriteSeeker, gpt *Table) error {
-	if gpt.BlockSize < 512 {
+// Write writes the two GPTs, first the alternate, then the primary, to the
+// block device. If gpt.ID or any of the partition IDs are the all-zero UUID,
+// new random ones are generated and written back. If the output is supposed
+// to be reproducible, generate the UUIDs beforehand.
+func (gpt *Table) Write() error {
+	blockSize := gpt.b.BlockSize()
+	blockCount := gpt.b.BlockCount()
+	if blockSize < 512 {
 		return errors.New("block size is smaller than 512 bytes, this is unsupported")
 	}
 	// Layout looks as follows:
@@ -414,30 +439,30 @@
 	// Block 2-(16384 bytes): GPT partition entries
 	// Block (16384 bytes)-n: GPT partition entries alternate copy
 	// Block n: GPT Header alternate copy
-	if len(gpt.Partitions) > 128 {
-		return errors.New("Bigger-than default GPTs (>128 partitions) are unimplemented")
+	partitionEntryCount := 128
+	if len(gpt.Partitions) > partitionEntryCount {
+		return errors.New("bigger-than default GPTs (>128 partitions) are unimplemented")
 	}
 
-	partitionEntryBlocks := (16384 + gpt.BlockSize - 1) / gpt.BlockSize
-	if gpt.BlockCount < 3+(2*partitionEntryBlocks) {
+	partitionEntryBlocks := (16384 + blockSize - 1) / blockSize
+	if blockCount < 3+(2*partitionEntryBlocks) {
 		return errors.New("not enough blocks to write GPT")
 	}
 
-	if gpt.ID == zeroUUID {
+	if gpt.ID == uuid.Nil {
 		gpt.ID = uuid.New()
 	}
 
 	partSize := binary.Size(partition{})
-	slotCount := 128
-
 	var partitionEntriesData bytes.Buffer
-	for i := 0; i < slotCount; i++ {
+	for i := 0; i < partitionEntryCount; i++ {
 		if len(gpt.Partitions) <= i || gpt.Partitions[i] == nil {
+			// Write an empty entry
 			partitionEntriesData.Write(make([]byte, partSize))
 			continue
 		}
 		p := gpt.Partitions[i]
-		if p.ID == zeroUUID {
+		if p.ID == uuid.Nil {
 			p.ID = uuid.New()
 		}
 		rawP := partition{
@@ -459,11 +484,11 @@
 		HeaderSize: uint32(binary.Size(&header{})),
 		ID:         msguid.From(gpt.ID),
 
-		PartitionEntryCount: uint32(slotCount),
+		PartitionEntryCount: uint32(partitionEntryCount),
 		PartitionEntrySize:  uint32(partSize),
 
 		FirstUsableBlock: uint64(2 + partitionEntryBlocks),
-		LastUsableBlock:  uint64(gpt.BlockCount - (2 + partitionEntryBlocks)),
+		LastUsableBlock:  uint64(blockCount - (2 + partitionEntryBlocks)),
 	}
 	hdr.PartitionEntriesCRC32 = crc32.ChecksumIEEE(partitionEntriesData.Bytes())
 
@@ -477,38 +502,36 @@
 	// this problem.
 
 	// Alternate header
-	if _, err := w.Seek((gpt.LastUsableBlock()+1)*gpt.BlockSize, io.SeekStart); err != nil {
-		return fmt.Errorf("failed to seek to end of block device: %w", err)
-	}
-	hdr.HeaderBlock = uint64(gpt.BlockCount - 1)
+	hdr.HeaderBlock = uint64(blockCount - 1)
 	hdr.AlternateHeaderBlock = 1
-	hdr.PartitionEntriesStartBlock = uint64(gpt.BlockCount - (1 + partitionEntryBlocks))
+	hdr.PartitionEntriesStartBlock = uint64(blockCount - (1 + partitionEntryBlocks))
 
 	hdrChecksum.Reset()
 	hdr.HeaderCRC32 = 0
 	binary.Write(hdrChecksum, binary.LittleEndian, &hdr)
 	hdr.HeaderCRC32 = hdrChecksum.Sum32()
 
-	if _, err := w.Write(partitionEntriesData.Bytes()); err != nil {
+	for partitionEntriesData.Len()%int(blockSize) != 0 {
+		partitionEntriesData.WriteByte(0x00)
+	}
+	if _, err := gpt.b.WriteAt(partitionEntriesData.Bytes(), int64(hdr.PartitionEntriesStartBlock)*blockSize); err != nil {
 		return fmt.Errorf("failed to write alternate partition entries: %w", err)
 	}
-	if _, err := w.Seek((gpt.BlockCount-1)*gpt.BlockSize, io.SeekStart); err != nil {
-		return fmt.Errorf("failed to seek to end of block device: %w", err)
-	}
 
-	if err := binary.Write(w, binary.LittleEndian, &hdr); err != nil {
-		return fmt.Errorf("failed to write alternate header: %w", err)
+	var hdrRaw bytes.Buffer
+	if err := binary.Write(&hdrRaw, binary.LittleEndian, &hdr); err != nil {
+		return fmt.Errorf("failed to encode alternate header: %w", err)
 	}
-	if _, err := w.Write(make([]byte, gpt.BlockSize-int64(binary.Size(hdr)))); err != nil {
-		return fmt.Errorf("failed to write padding: %v", err)
+	for hdrRaw.Len()%int(blockSize) != 0 {
+		hdrRaw.WriteByte(0x00)
+	}
+	if _, err := gpt.b.WriteAt(hdrRaw.Bytes(), (blockCount-1)*blockSize); err != nil {
+		return fmt.Errorf("failed to write alternate header: %v", err)
 	}
 
 	// Primary header
-	if _, err := w.Seek(0, io.SeekStart); err != nil {
-		return fmt.Errorf("failed to seek to end of block device: %w", err)
-	}
 	hdr.HeaderBlock = 1
-	hdr.AlternateHeaderBlock = uint64(gpt.BlockCount - 1)
+	hdr.AlternateHeaderBlock = uint64(blockCount - 1)
 	hdr.PartitionEntriesStartBlock = 2
 
 	hdrChecksum.Reset()
@@ -516,33 +539,39 @@
 	binary.Write(hdrChecksum, binary.LittleEndian, &hdr)
 	hdr.HeaderCRC32 = hdrChecksum.Sum32()
 
-	if err := makeProtectiveMBR(w, gpt.BlockCount, gpt.BootCode); err != nil {
-		return fmt.Errorf("failed to write first block: %w", err)
+	hdrRaw.Reset()
+
+	if err := makeProtectiveMBR(&hdrRaw, blockCount, gpt.BootCode); err != nil {
+		return fmt.Errorf("failed creating protective MBR: %w", err)
+	}
+	for hdrRaw.Len()%int(blockSize) != 0 {
+		hdrRaw.WriteByte(0x00)
+	}
+	if err := binary.Write(&hdrRaw, binary.LittleEndian, &hdr); err != nil {
+		panic(err)
+	}
+	for hdrRaw.Len()%int(blockSize) != 0 {
+		hdrRaw.WriteByte(0x00)
+	}
+	hdrRaw.Write(partitionEntriesData.Bytes())
+	for hdrRaw.Len()%int(blockSize) != 0 {
+		hdrRaw.WriteByte(0x00)
 	}
 
-	if err := binary.Write(w, binary.LittleEndian, &hdr); err != nil {
-		return fmt.Errorf("failed to write primary header: %w", err)
-	}
-	if _, err := w.Write(make([]byte, gpt.BlockSize-int64(binary.Size(hdr)))); err != nil {
-		return fmt.Errorf("failed to write padding: %v", err)
-	}
-	if _, err := w.Write(partitionEntriesData.Bytes()); err != nil {
-		return fmt.Errorf("failed to write primary partition entries: %w", err)
+	if _, err := gpt.b.WriteAt(hdrRaw.Bytes(), 0); err != nil {
+		return fmt.Errorf("failed to write primary GPT: %w", err)
 	}
 	return nil
 }
 
-// Read reads a Table from a block device given its block size and count.
-func Read(r io.ReadSeeker, blockSize int64, blockCount int64) (*Table, error) {
-	if Overhead(blockSize) > blockCount {
+// Read reads a Table from a block device.
+func Read(r blockdev.BlockDev) (*Table, error) {
+	if Overhead(r.BlockSize()) > r.BlockCount() {
 		return nil, errors.New("disk cannot contain a GPT as the block count is too small to store one")
 	}
-	if _, err := r.Seek(0, io.SeekStart); err != nil {
-		return nil, fmt.Errorf("failed to seek to block 0: %w", err)
-	}
-	zeroBlock := make([]byte, blockSize)
-	if _, err := io.ReadFull(r, zeroBlock); err != nil {
-		return nil, fmt.Errorf("failed to read first two blocks: %w", err)
+	zeroBlock := make([]byte, r.BlockSize())
+	if _, err := r.ReadAt(zeroBlock, 0); err != nil {
+		return nil, fmt.Errorf("failed to read first block: %w", err)
 	}
 
 	var m mbr
@@ -594,9 +623,9 @@
 		bootCode = bytes.TrimRight(m.BootCode[:], "\x00")
 	}
 	// Read the primary GPT. If it is damaged and/or broken, read the alternate.
-	primaryGPT, err := readSingleGPT(r, blockSize, blockCount, 1)
+	primaryGPT, err := readSingleGPT(r, 1)
 	if err != nil {
-		alternateGPT, err2 := readSingleGPT(r, blockSize, blockCount, blockCount-1)
+		alternateGPT, err2 := readSingleGPT(r, r.BlockCount()-1)
 		if err2 != nil {
 			return nil, fmt.Errorf("failed to read both GPTs: primary GPT (%v), secondary GPT (%v)", err, err2)
 		}
@@ -607,12 +636,9 @@
 	return primaryGPT, nil
 }
 
-func readSingleGPT(r io.ReadSeeker, blockSize int64, blockCount int64, headerBlockPos int64) (*Table, error) {
-	if _, err := r.Seek(blockSize*headerBlockPos, io.SeekStart); err != nil {
-		return nil, fmt.Errorf("failed to seek to block %d: %w", headerBlockPos, err)
-	}
-	hdrBlock := make([]byte, blockSize)
-	if _, err := io.ReadFull(r, hdrBlock); err != nil {
+func readSingleGPT(r blockdev.BlockDev, headerBlockPos int64) (*Table, error) {
+	hdrBlock := make([]byte, r.BlockSize())
+	if _, err := r.ReadAt(hdrBlock, r.BlockSize()*headerBlockPos); err != nil {
 		return nil, fmt.Errorf("failed to read GPT header block: %w", err)
 	}
 	hdrBlockReader := bytes.NewReader(hdrBlock)
@@ -626,7 +652,7 @@
 	if hdr.HeaderSize < uint32(binary.Size(hdr)) {
 		return nil, fmt.Errorf("GPT header size is too small, likely corrupted")
 	}
-	if int64(hdr.HeaderSize) > blockSize {
+	if int64(hdr.HeaderSize) > r.BlockSize() {
 		return nil, fmt.Errorf("GPT header size is bigger than block size, likely corrupted")
 	}
 	// Use reserved bytes to hash, but do not expose them to the user.
@@ -650,7 +676,7 @@
 	if hdr.PartitionEntrySize < uint32(binary.Size(partition{})) {
 		return nil, errors.New("partition entry size too small")
 	}
-	if hdr.PartitionEntriesStartBlock > uint64(blockCount) {
+	if hdr.PartitionEntriesStartBlock > uint64(r.BlockCount()) {
 		return nil, errors.New("partition entry start block is out of range")
 	}
 	// Sanity-check total size of the partition entry area. Otherwise, this is a
@@ -661,10 +687,7 @@
 		return nil, errors.New("partition entry area bigger than 4MiB, refusing to read")
 	}
 	partitionEntryData := make([]byte, hdr.PartitionEntrySize*hdr.PartitionEntryCount)
-	if _, err := r.Seek(blockSize*int64(hdr.PartitionEntriesStartBlock), io.SeekStart); err != nil {
-		return nil, fmt.Errorf("failed to seek to partition entry start block: %w", err)
-	}
-	if _, err := io.ReadFull(r, partitionEntryData); err != nil {
+	if _, err := r.ReadAt(partitionEntryData, r.BlockSize()*int64(hdr.PartitionEntriesStartBlock)); err != nil {
 		return nil, fmt.Errorf("failed to read partition entries: %w", err)
 	}
 	if crc32.ChecksumIEEE(partitionEntryData) != hdr.PartitionEntriesCRC32 {
@@ -672,8 +695,6 @@
 	}
 	var g Table
 	g.ID = msguid.To(hdr.ID)
-	g.BlockSize = blockSize
-	g.BlockCount = blockCount
 	for i := uint32(0); i < hdr.PartitionEntryCount; i++ {
 		entryReader := bytes.NewReader(partitionEntryData[i*hdr.PartitionEntrySize : (i+1)*hdr.PartitionEntrySize])
 		var part partition
@@ -682,7 +703,7 @@
 		}
 		// If the partition type is the all-zero UUID, this slot counts as
 		// unused.
-		if part.Type == zeroUUID {
+		if part.Type == uuid.Nil {
 			g.Partitions = append(g.Partitions, nil)
 			continue
 		}
@@ -705,5 +726,6 @@
 		}
 	}
 	g.Partitions = g.Partitions[:maxValidPartition+1]
+	g.b = r
 	return &g, nil
 }