m/p/blockdev: init
Adds blockdev, a package providing a Go interface for generic block
devices, an implementation of it for Linux, and auxiliary types.
This will replace most ad-hoc block device handling in the monorepo.
Change-Id: I3a4e3b7c31a8344f7859210bbb4942977d1ad1d2
Reviewed-on: https://review.monogon.dev/c/monogon/+/1871
Tested-by: Jenkins CI
Reviewed-by: Serge Bazanski <serge@monogon.tech>
diff --git a/metropolis/pkg/blockdev/BUILD.bazel b/metropolis/pkg/blockdev/BUILD.bazel
new file mode 100644
index 0000000..c0c98d3
--- /dev/null
+++ b/metropolis/pkg/blockdev/BUILD.bazel
@@ -0,0 +1,21 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+go_library(
+ name = "blockdev",
+ srcs = [
+ "blockdev.go",
+ "blockdev_linux.go",
+ "memory.go",
+ ],
+ importpath = "source.monogon.dev/metropolis/pkg/blockdev",
+ visibility = ["//visibility:public"],
+ deps = select({
+ "@io_bazel_rules_go//go/platform:android": [
+ "@org_golang_x_sys//unix",
+ ],
+ "@io_bazel_rules_go//go/platform:linux": [
+ "@org_golang_x_sys//unix",
+ ],
+ "//conditions:default": [],
+ }),
+)
diff --git a/metropolis/pkg/blockdev/blockdev.go b/metropolis/pkg/blockdev/blockdev.go
new file mode 100644
index 0000000..1cb9551
--- /dev/null
+++ b/metropolis/pkg/blockdev/blockdev.go
@@ -0,0 +1,200 @@
+package blockdev
+
+import (
+ "errors"
+ "fmt"
+ "io"
+)
+
+// ErrUnsupported is returned when the underlying implementation does not
+// support a given operation. Replace with errors.ErrUnsupported once we
+// migrate to Go 1.21.
+var ErrUnsupported = errors.New("unsupported")
+
+// ErrNotBlockDevice is returned when the given file is not a block device.
+var ErrNotBlockDevice = errors.New("not a block device")
+
+// BlockDev represents a generic block device made up of equally-sized blocks.
+// All offsets and intervals are expressed in bytes and must be aligned to
+// BlockSize; aligning them to OptimalBlockSize is recommended where feasible.
+// Unless stated otherwise, intervals are inclusive-exclusive, i.e. the
+// start byte is included but the end byte is not.
+type BlockDev interface {
+ io.ReaderAt
+ io.WriterAt
+ // BlockSize returns the block size of the block device in bytes. This must
+ // be a power of two and is commonly (but not always) either 512 or 4096.
+ BlockSize() int64
+
+ // BlockCount returns the number of blocks on the block device or -1 if it
+ // is an image with an undefined size.
+ BlockCount() int64
+
+ // OptimalBlockSize returns the optimal block size in bytes for alignment
+ // and I/O. Operations on blocks smaller than this can incur
+ // read-modify-write overhead. This is the larger of the physical block
+ // size and a device-reported value, if available.
+ OptimalBlockSize() int64
+
+ // Discard discards a continuous set of blocks. Discarding means the
+ // underlying device gets notified that the data in these blocks is no
+ // longer needed. This can improve the performance of the device (as it
+ // no longer needs to preserve the unused data) as well as speed up bulk
+ // erase operations. This command is advisory and not all implementations
+ // support it. The contents of discarded blocks are implementation-defined.
+ Discard(startByte int64, endByte int64) error
+
+ // Zero zeroes a continuous set of blocks. On certain implementations this
+ // can be significantly faster than calling Write with zeroes.
+ Zero(startByte, endByte int64) error
+}
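+
+// For example, zeroing the first MiB of a device with block-aligned byte
+// offsets (an illustrative sketch using the in-memory implementation from
+// this package):
+//
+//	var b BlockDev = MustNewMemory(512, 4096)
+//	if err := b.Zero(0, 1024*1024); err != nil {
+//		// handle error
+//	}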
+
+// NewRWS returns a ReadWriteSeeker wrapping b, with its position initialized
+// to zero.
+func NewRWS(b BlockDev) *ReadWriteSeeker {
+ return &ReadWriteSeeker{b: b}
+}
+
+// ReadWriteSeeker is an adapter implementing io.ReadWriteSeeker on top of
+// a BlockDev.
+type ReadWriteSeeker struct {
+ b BlockDev
+ currPos int64
+}
+
+func (s *ReadWriteSeeker) Read(p []byte) (n int, err error) {
+ n, err = s.b.ReadAt(p, s.currPos)
+ s.currPos += int64(n)
+ return
+}
+
+func (s *ReadWriteSeeker) Write(p []byte) (n int, err error) {
+ n, err = s.b.WriteAt(p, s.currPos)
+ s.currPos += int64(n)
+ return
+}
+
+func (s *ReadWriteSeeker) Seek(offset int64, whence int) (int64, error) {
+ switch whence {
+ case io.SeekCurrent:
+ s.currPos += offset
+ case io.SeekStart:
+ s.currPos = offset
+ case io.SeekEnd:
+ s.currPos = (s.b.BlockCount() * s.b.BlockSize()) + offset
+ }
+ return s.currPos, nil
+}
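+
+// This lets a BlockDev be used with code expecting an io.Reader or io.Writer.
+// An illustrative sketch:
+//
+//	rws := NewRWS(MustNewMemory(512, 16))
+//	if _, err := io.Copy(io.Discard, rws); err != nil {
+//		// handle error
+//	}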
+
+// ErrOutOfBounds is returned when a write extends beyond the end of the
+// device or section.
+var ErrOutOfBounds = errors.New("write out of bounds")
+
+// NewSection returns a Section implementing BlockDev over the given subset
+// of blocks of b. The block interval is inclusive-exclusive.
+func NewSection(b BlockDev, startBlock, endBlock int64) *Section {
+ return &Section{
+ b: b,
+ startBlock: startBlock,
+ endBlock: endBlock,
+ }
+}
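+
+// An illustrative sketch, where parent is any existing BlockDev and buf a
+// block-aligned byte slice (both hypothetical):
+//
+//	// Expose blocks 2048 to 6144 of parent as a partition-like sub-device.
+//	part := NewSection(parent, 2048, 6144)
+//	// Offsets passed to part are relative to the start of the section.
+//	_, err := part.WriteAt(buf, 0)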
+
+// Section implements BlockDev on a slice of another BlockDev given a startBlock
+// and endBlock.
+type Section struct {
+ b BlockDev
+ startBlock, endBlock int64
+}
+
+func (s *Section) ReadAt(p []byte, off int64) (n int, err error) {
+ bOff := off + (s.startBlock * s.b.BlockSize())
+ bytesToEnd := (s.endBlock * s.b.BlockSize()) - bOff
+ if bytesToEnd <= 0 {
+ return 0, io.EOF
+ }
+ if bytesToEnd < int64(len(p)) {
+ n, err := s.b.ReadAt(p[:bytesToEnd], bOff)
+ if err != nil {
+ return n, err
+ }
+ // Even a successful short read must return an error per the
+ // io.ReaderAt contract.
+ return n, io.EOF
+ }
+ return s.b.ReadAt(p, bOff)
+}
+
+func (s *Section) WriteAt(p []byte, off int64) (n int, err error) {
+ bOff := off + (s.startBlock * s.b.BlockSize())
+ bytesToEnd := (s.endBlock * s.b.BlockSize()) - bOff
+ if bytesToEnd <= 0 {
+ return 0, ErrOutOfBounds
+ }
+ if bytesToEnd < int64(len(p)) {
+ n, err := s.b.WriteAt(p[:bytesToEnd], bOff)
+ if err != nil {
+ // If an error happened, prioritize that error
+ return n, err
+ }
+ // Otherwise, return ErrOutOfBounds as even short writes must return an
+ // error.
+ return n, ErrOutOfBounds
+ }
+ return s.b.WriteAt(p, bOff)
+}
+
+func (s *Section) BlockCount() int64 {
+ return s.endBlock - s.startBlock
+}
+
+func (s *Section) BlockSize() int64 {
+ return s.b.BlockSize()
+}
+
+func (s *Section) inRange(startByte, endByte int64) error {
+ if startByte > endByte {
+ return fmt.Errorf("invalid range: startByte (%d) bigger than endByte (%d)", startByte, endByte)
+ }
+ sectionLen := s.BlockCount() * s.BlockSize()
+ if startByte >= sectionLen {
+ return fmt.Errorf("startByte (%d) out of range (%d)", startByte, sectionLen)
+ }
+ if endByte > sectionLen {
+ return fmt.Errorf("endBlock (%d) out of range (%d)", endByte, sectionLen)
+ }
+ return nil
+}
+
+func (s *Section) Discard(startByte, endByte int64) error {
+ if err := s.inRange(startByte, endByte); err != nil {
+ return err
+ }
+ off := s.startBlock * s.b.BlockSize()
+ return s.b.Discard(off+startByte, off+endByte)
+}
+
+func (s *Section) OptimalBlockSize() int64 {
+ return s.b.OptimalBlockSize()
+}
+
+func (s *Section) Zero(startByte, endByte int64) error {
+ if err := s.inRange(startByte, endByte); err != nil {
+ return err
+ }
+ off := s.startBlock * s.b.BlockSize()
+ return s.b.Zero(off+startByte, off+endByte)
+}
+
+// GenericZero implements software-based zeroing. This can be used to implement
+// Zero when no acceleration is available or desired.
+func GenericZero(b BlockDev, startByte, endByte int64) error {
+ if startByte%b.BlockSize() != 0 {
+ return fmt.Errorf("startByte (%d) needs to be aligned to block size (%d)", startByte, b.BlockSize())
+ }
+ if endByte%b.BlockSize() != 0 {
+ return fmt.Errorf("endByte (%d) needs to be aligned to block size (%d)", endByte, b.BlockSize())
+ }
+ // Choose a buffer size close to 16 MiB, or the size of the range to be
+ // zeroed, whichever is smaller.
+ bufSizeTarget := int64(16 * 1024 * 1024)
+ if endByte-startByte < bufSizeTarget {
+ bufSizeTarget = endByte - startByte
+ }
+ bufSize := (bufSizeTarget / b.BlockSize()) * b.BlockSize()
+ buf := make([]byte, bufSize)
+ for i := startByte; i < endByte; i += bufSize {
+ if endByte-i < bufSize {
+ buf = buf[:endByte-i]
+ }
+ if _, err := b.WriteAt(buf, i); err != nil {
+ return fmt.Errorf("while writing zeroes: %w", err)
+ }
+ }
+ return nil
+}
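+
+// A hedged sketch of delegating to it from a custom implementation; myDev is
+// a hypothetical type, not part of this package:
+//
+//	func (d *myDev) Zero(startByte, endByte int64) error {
+//		return GenericZero(d, startByte, endByte)
+//	}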
diff --git a/metropolis/pkg/blockdev/blockdev_linux.go b/metropolis/pkg/blockdev/blockdev_linux.go
new file mode 100644
index 0000000..a8e6a8a
--- /dev/null
+++ b/metropolis/pkg/blockdev/blockdev_linux.go
@@ -0,0 +1,245 @@
+//go:build linux
+
+package blockdev
+
+import (
+ "errors"
+ "fmt"
+ "math/bits"
+ "os"
+ "syscall"
+ "unsafe"
+
+ "golang.org/x/sys/unix"
+)
+
+type Device struct {
+ backend *os.File
+ rawConn syscall.RawConn
+ blockSize int64
+ blockCount int64
+}
+
+func (d *Device) ReadAt(p []byte, off int64) (n int, err error) {
+ return d.backend.ReadAt(p, off)
+}
+
+func (d *Device) WriteAt(p []byte, off int64) (n int, err error) {
+ return d.backend.WriteAt(p, off)
+}
+
+func (d *Device) Close() error {
+ return d.backend.Close()
+}
+
+func (d *Device) BlockCount() int64 {
+ return d.blockCount
+}
+
+func (d *Device) BlockSize() int64 {
+ return d.blockSize
+}
+
+func (d *Device) Discard(startByte int64, endByte int64) error {
+ // BLKDISCARD takes a start offset and a length in bytes.
+ var args [2]uint64
+ var err unix.Errno
+ args[0] = uint64(startByte)
+ args[1] = uint64(endByte - startByte)
+ if ctrlErr := d.rawConn.Control(func(fd uintptr) {
+ _, _, err = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKDISCARD, uintptr(unsafe.Pointer(&args[0])))
+ }); ctrlErr != nil {
+ return ctrlErr
+ }
+ if err == unix.EOPNOTSUPP {
+ return ErrUnsupported
+ }
+ if err != unix.Errno(0) {
+ return fmt.Errorf("failed to discard: %w", err)
+ }
+ return nil
+}
+
+func (d *Device) OptimalBlockSize() int64 {
+ return d.blockSize
+}
+
+func (d *Device) Zero(startByte int64, endByte int64) error {
+ // BLKZEROOUT takes a start offset and a length in bytes.
+ var args [2]uint64
+ var err error
+ args[0] = uint64(startByte)
+ args[1] = uint64(endByte - startByte)
+ if ctrlErr := d.rawConn.Control(func(fd uintptr) {
+ // Try fallocate first: it can leverage discard guarantees to provide
+ // extremely quick metadata-only zeroing.
+ err = unix.Fallocate(int(fd), unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, startByte, endByte-startByte)
+ if errors.Is(err, unix.EOPNOTSUPP) {
+ // Fall back to BLKZEROOUT, which tries Write Same and friends
+ // before resorting to writing zeroes.
+ _, _, errno := unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKZEROOUT, uintptr(unsafe.Pointer(&args[0])))
+ err = nil
+ if errno != 0 {
+ err = errno
+ }
+ }
+ }); ctrlErr != nil {
+ return ctrlErr
+ }
+ if err != nil {
+ return fmt.Errorf("failed to zero out: %w", err)
+ }
+ return nil
+}
+
+// RefreshPartitionTable refreshes the kernel's view of the partition table
+// after changes made from userspace.
+func (d *Device) RefreshPartitionTable() error {
+ var err unix.Errno
+ if ctrlErr := d.rawConn.Control(func(fd uintptr) {
+ _, _, err = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKRRPART, 0)
+ }); ctrlErr != nil {
+ return ctrlErr
+ }
+ if err != unix.Errno(0) {
+ return fmt.Errorf("ioctl(BLKRRPART): %w", err)
+ }
+ return nil
+}
+
+// Open opens a block device given a path to its inode.
+// TODO: exclusive, O_DIRECT
+func Open(path string) (*Device, error) {
+ outFile, err := os.OpenFile(path, os.O_RDWR, 0640)
+ if err != nil {
+ return nil, fmt.Errorf("failed to open block device: %w", err)
+ }
+ return FromFileHandle(outFile)
+}
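+
+// An illustrative sketch (the device path is an example):
+//
+//	dev, err := Open("/dev/vdb")
+//	if err != nil {
+//		// handle error, e.g. ErrNotBlockDevice
+//	}
+//	defer dev.Close()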
+
+// FromFileHandle creates a blockdev from a device handle. The device handle is
+// not duplicated; closing the returned Device will close it. If the handle is
+// not a block device, i.e. does not implement block device ioctls, an error is
+// returned.
+func FromFileHandle(handle *os.File) (*Device, error) {
+ outFileC, err := handle.SyscallConn()
+ if err != nil {
+ return nil, fmt.Errorf("error getting SyscallConn: %w", err)
+ }
+ var blockSize uint32
+ outFileC.Control(func(fd uintptr) {
+ blockSize, err = unix.IoctlGetUint32(int(fd), unix.BLKSSZGET)
+ })
+ if errors.Is(err, unix.ENOTTY) || errors.Is(err, unix.EINVAL) {
+ return nil, ErrNotBlockDevice
+ } else if err != nil {
+ return nil, fmt.Errorf("when querying disk block size: %w", err)
+ }
+
+ var sizeBytes uint64
+ var getSizeErr unix.Errno
+ outFileC.Control(func(fd uintptr) {
+ _, _, getSizeErr = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKGETSIZE64, uintptr(unsafe.Pointer(&sizeBytes)))
+ })
+
+ if getSizeErr != unix.Errno(0) {
+ return nil, fmt.Errorf("when querying disk size: %w", getSizeErr)
+ }
+ if sizeBytes%uint64(blockSize) != 0 {
+ return nil, fmt.Errorf("block device size is not an integer multiple of its block size (%d %% %d = %d)", sizeBytes, blockSize, sizeBytes%uint64(blockSize))
+ }
+ return &Device{
+ backend: handle,
+ rawConn: outFileC,
+ blockSize: int64(blockSize),
+ blockCount: int64(sizeBytes) / int64(blockSize),
+ }, nil
+}
+
+type File struct {
+ backend *os.File
+ rawConn syscall.RawConn
+ blockSize int64
+ blockCount int64
+}
+
+func CreateFile(name string, blockSize int64, blockCount int64) (*File, error) {
+ if blockSize < 512 {
+ return nil, fmt.Errorf("blockSize must be bigger than 512 bytes")
+ }
+ if bits.OnesCount64(uint64(blockSize)) != 1 {
+ return nil, fmt.Errorf("blockSize must be a power of two")
+ }
+ out, err := os.Create(name)
+ if err != nil {
+ return nil, fmt.Errorf("when creating backing file: %w", err)
+ }
+ rawConn, err := out.SyscallConn()
+ if err != nil {
+ return nil, fmt.Errorf("unable to get SyscallConn: %w", err)
+ }
+ return &File{
+ backend: out,
+ blockSize: blockSize,
+ rawConn: rawConn,
+ blockCount: blockCount,
+ }, nil
+}
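+
+// For example, a 1 MiB image backed by a file (name and geometry are
+// illustrative):
+//
+//	f, err := CreateFile("disk.img", 512, 2048)
+//	if err != nil {
+//		// handle error
+//	}
+//	defer f.Close()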
+
+func (d *File) ReadAt(p []byte, off int64) (n int, err error) {
+ return d.backend.ReadAt(p, off)
+}
+
+func (d *File) WriteAt(p []byte, off int64) (n int, err error) {
+ return d.backend.WriteAt(p, off)
+}
+
+func (d *File) Close() error {
+ return d.backend.Close()
+}
+
+func (d *File) BlockCount() int64 {
+ return d.blockCount
+}
+
+func (d *File) BlockSize() int64 {
+ return d.blockSize
+}
+
+func (d *File) Discard(startByte int64, endByte int64) error {
+ var err error
+ if ctrlErr := d.rawConn.Control(func(fd uintptr) {
+ // There is FALLOC_FL_NO_HIDE_STALE, but it's not implemented by
+ // any filesystem right now, so let's not attempt it for the time being.
+ err = unix.Fallocate(int(fd), unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, startByte, endByte-startByte)
+ }); ctrlErr != nil {
+ return ctrlErr
+ }
+ if errors.Is(err, unix.EOPNOTSUPP) {
+ return ErrUnsupported
+ }
+ if err != nil {
+ return fmt.Errorf("failed to discard: %w", err)
+ }
+ return nil
+}
+
+func (d *File) OptimalBlockSize() int64 {
+ return d.blockSize
+}
+
+func (d *File) Zero(startByte int64, endByte int64) error {
+ var err error
+ if ctrlErr := d.rawConn.Control(func(fd uintptr) {
+ // Tell the filesystem to punch out the given blocks.
+ err = unix.Fallocate(int(fd), unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, startByte, endByte-startByte)
+ }); ctrlErr != nil {
+ return ctrlErr
+ }
+ // If unsupported, or if the syscall is not available (for example in a
+ // sandbox), fall back to the generic software implementation.
+ if errors.Is(err, unix.EOPNOTSUPP) || errors.Is(err, unix.ENOSYS) {
+ return GenericZero(d, startByte, endByte)
+ }
+ if err != nil {
+ return fmt.Errorf("failed to zero out: %w", err)
+ }
+ return nil
+}
diff --git a/metropolis/pkg/blockdev/memory.go b/metropolis/pkg/blockdev/memory.go
new file mode 100644
index 0000000..193f93c
--- /dev/null
+++ b/metropolis/pkg/blockdev/memory.go
@@ -0,0 +1,128 @@
+package blockdev
+
+import (
+ "errors"
+ "fmt"
+ "io"
+ "math/bits"
+)
+
+// Memory is a memory-backed implementation of BlockDev. It is well-suited
+// to testing and temporary use, as it is fast and platform-independent.
+type Memory struct {
+ blockSize int64
+ blockCount int64
+ data []byte
+}
+
+// NewMemory returns a new memory-backed block device with the given geometry.
+func NewMemory(blockSize, blockCount int64) (*Memory, error) {
+ if blockSize <= 0 {
+ return nil, errors.New("block size cannot be zero or negative")
+ }
+ if bits.OnesCount64(uint64(blockSize)) > 1 {
+ return nil, fmt.Errorf("block size must be a power of two (got %d)", blockSize)
+ }
+ if blockCount < 0 {
+ return nil, errors.New("block count cannot be negative")
+ }
+ return &Memory{
+ blockSize: blockSize,
+ blockCount: blockCount,
+ data: make([]byte, blockSize*blockCount),
+ }, nil
+}
+
+// MustNewMemory works exactly like NewMemory, but panics when NewMemory would
+// return an error. Intended for use in tests.
+func MustNewMemory(blockSize, blockCount int64) *Memory {
+ m, err := NewMemory(blockSize, blockCount)
+ if err != nil {
+ panic(err)
+ }
+ return m
+}
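+
+// An illustrative test usage (t is the enclosing test's *testing.T):
+//
+//	m := MustNewMemory(512, 2048) // 1 MiB in-memory device
+//	if err := m.Zero(0, 4096); err != nil {
+//		t.Fatal(err)
+//	}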
+
+func (m *Memory) ReadAt(p []byte, off int64) (int, error) {
+ devSize := m.blockSize * m.blockCount
+ if off > devSize {
+ return 0, io.EOF
+ }
+ // TODO: Alignment checks?
+ copy(p, m.data[off:])
+ n := len(m.data[off:])
+ if n < len(p) {
+ return n, io.EOF
+ }
+ return len(p), nil
+}
+
+func (m *Memory) WriteAt(p []byte, off int64) (int, error) {
+ devSize := m.blockSize * m.blockCount
+ if off > devSize {
+ return 0, ErrOutOfBounds
+ }
+ // TODO: Alignment checks?
+ copy(m.data[off:], p)
+ n := len(m.data[off:])
+ if n < len(p) {
+ // Short writes must return an error; io.EOF is reserved for reads.
+ return n, ErrOutOfBounds
+ }
+ return len(p), nil
+}
+
+func (m *Memory) BlockSize() int64 {
+ return m.blockSize
+}
+
+func (m *Memory) BlockCount() int64 {
+ return m.blockCount
+}
+
+func (m *Memory) OptimalBlockSize() int64 {
+ return m.blockSize
+}
+
+func (m *Memory) validRange(startByte, endByte int64) error {
+ if startByte > endByte {
+ return fmt.Errorf("startByte (%d) larger than endByte (%d), invalid interval", startByte, endByte)
+ }
+ devSize := m.blockSize * m.blockCount
+ if startByte >= devSize || startByte < 0 {
+ return fmt.Errorf("startByte (%d) out of range (0-%d)", endByte, devSize)
+ }
+ if endByte > devSize || endByte < 0 {
+ return fmt.Errorf("endByte (%d) out of range (0-%d)", endByte, devSize)
+ }
+ // Alignment check works for powers of two by looking at every bit below
+ // the bit set in the block size.
+ if startByte&(m.blockSize-1) != 0 {
+ return fmt.Errorf("startByte (%d) is not aligned to blockSize (%d)", startByte, m.blockSize)
+ }
+ if endByte&(m.blockSize-1) != 0 {
+ return fmt.Errorf("endByte (%d) is not aligned to blockSize (%d)", startByte, m.blockSize)
+ }
+ return nil
+}
+
+func (m *Memory) Discard(startByte, endByte int64) error {
+ if err := m.validRange(startByte, endByte); err != nil {
+ return err
+ }
+ for i := startByte; i < endByte; i++ {
+ // Intentionally don't set to zero as Discard doesn't guarantee
+ // any specific contents. Call Zero if you need this.
+ m.data[i] = 0xaa
+ }
+ return nil
+}
+
+func (m *Memory) Zero(startByte, endByte int64) error {
+ if err := m.validRange(startByte, endByte); err != nil {
+ return err
+ }
+ for i := startByte; i < endByte; i++ {
+ m.data[i] = 0x00
+ }
+ return nil
+}