m/p/blockdev: init
Adds blockdev, a package providing a Go interface for generic block
devices, an implementation of it for Linux, and auxiliary types.
This will replace most ad-hoc block device handling in the monorepo.
Change-Id: I3a4e3b7c31a8344f7859210bbb4942977d1ad1d2
Reviewed-on: https://review.monogon.dev/c/monogon/+/1871
Tested-by: Jenkins CI
Reviewed-by: Serge Bazanski <serge@monogon.tech>
diff --git a/metropolis/pkg/blockdev/BUILD.bazel b/metropolis/pkg/blockdev/BUILD.bazel
new file mode 100644
index 0000000..c0c98d3
--- /dev/null
+++ b/metropolis/pkg/blockdev/BUILD.bazel
@@ -0,0 +1,21 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+go_library(
+ name = "blockdev",
+ srcs = [
+ "blockdev.go",
+ "blockdev_linux.go",
+ "memory.go",
+ ],
+ importpath = "source.monogon.dev/metropolis/pkg/blockdev",
+ visibility = ["//visibility:public"],
+ deps = select({
+ "@io_bazel_rules_go//go/platform:android": [
+ "@org_golang_x_sys//unix",
+ ],
+ "@io_bazel_rules_go//go/platform:linux": [
+ "@org_golang_x_sys//unix",
+ ],
+ "//conditions:default": [],
+ }),
+)
diff --git a/metropolis/pkg/blockdev/blockdev.go b/metropolis/pkg/blockdev/blockdev.go
new file mode 100644
index 0000000..1cb9551
--- /dev/null
+++ b/metropolis/pkg/blockdev/blockdev.go
@@ -0,0 +1,200 @@
+package blockdev
+
+import (
+ "errors"
+ "fmt"
+ "io"
+)
+
+// ErrUnsupported is returned when the underlying implementation does not
+// support a given operation. Replace with errors.ErrUnsupported once we
+// migrate to Go 1.21.
+var ErrUnsupported = errors.New("unsupported")
+
+// ErrNotBlockDevice is returned when the given file is not a block device.
+var ErrNotBlockDevice = errors.New("not a block device")
+
+// BlockDev represents a generic block device made up of equally-sized blocks.
+// All offsets and intervals are expressed in bytes and must be aligned to
+// BlockSize; aligning them to OptimalBlockSize is recommended where feasible.
+// Unless stated otherwise, intervals are inclusive-exclusive, i.e. the
+// start byte is included but the end byte is not.
+type BlockDev interface {
+ io.ReaderAt
+ io.WriterAt
+ // BlockSize returns the block size of the block device in bytes. This must
+ // be a power of two and is commonly (but not always) either 512 or 4096.
+ BlockSize() int64
+
+ // BlockCount returns the number of blocks on the block device or -1 if it
+ // is an image with an undefined size.
+ BlockCount() int64
+
+ // OptimalBlockSize returns the optimal block size in bytes for alignment
+ // and I/O. Operations on blocks smaller than this can incur
+ // read-modify-write overhead. This is the larger of the physical block
+ // size and a device-reported value, if available.
+ OptimalBlockSize() int64
+
+ // Discard discards a continuous set of blocks. Discarding means the
+ // underlying device gets notified that the data in these blocks is no
+ // longer needed. This can improve the performance of the device (as it
+ // no longer needs to preserve the unused data) as well as speed up bulk
+ // erase operations. This command is advisory and not all implementations
+ // support it. The contents of discarded blocks are implementation-defined.
+ Discard(startByte int64, endByte int64) error
+
+ // Zero zeroes a continuous set of blocks. On certain implementations this
+ // can be significantly faster than calling Write with zeroes.
+ Zero(startByte, endByte int64) error
+}
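+
+// For example, zeroing the first MiB of a device with block-aligned byte
+// offsets (an illustrative sketch using the in-memory implementation from
+// this package):
+//
+//	var b BlockDev = MustNewMemory(512, 4096)
+//	if err := b.Zero(0, 1024*1024); err != nil {
+//		// handle error
+//	}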
+
+// NewRWS returns a ReadWriteSeeker wrapping b, with its position initialized
+// to zero.
+func NewRWS(b BlockDev) *ReadWriteSeeker {
+ return &ReadWriteSeeker{b: b}
+}
+
+// ReadWriteSeeker is an adapter implementing io.ReadWriteSeeker on top of
+// a BlockDev.
+type ReadWriteSeeker struct {
+ b BlockDev
+ currPos int64
+}
+
+func (s *ReadWriteSeeker) Read(p []byte) (n int, err error) {
+ n, err = s.b.ReadAt(p, s.currPos)
+ s.currPos += int64(n)
+ return
+}
+
+func (s *ReadWriteSeeker) Write(p []byte) (n int, err error) {
+ n, err = s.b.WriteAt(p, s.currPos)
+ s.currPos += int64(n)
+ return
+}
+
+func (s *ReadWriteSeeker) Seek(offset int64, whence int) (int64, error) {
+ switch whence {
+ case io.SeekCurrent:
+ s.currPos += offset
+ case io.SeekStart:
+ s.currPos = offset
+ case io.SeekEnd:
+ s.currPos = (s.b.BlockCount() * s.b.BlockSize()) + offset
+ }
+ return s.currPos, nil
+}
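+
+// This lets a BlockDev be used with code expecting an io.Reader or io.Writer.
+// An illustrative sketch:
+//
+//	rws := NewRWS(MustNewMemory(512, 16))
+//	if _, err := io.Copy(io.Discard, rws); err != nil {
+//		// handle error
+//	}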
+
+// ErrOutOfBounds is returned when a write extends beyond the end of the
+// device or section.
+var ErrOutOfBounds = errors.New("write out of bounds")
+
+// NewSection returns a Section implementing BlockDev over the given subset
+// of blocks of b. The block interval is inclusive-exclusive.
+func NewSection(b BlockDev, startBlock, endBlock int64) *Section {
+ return &Section{
+ b: b,
+ startBlock: startBlock,
+ endBlock: endBlock,
+ }
+}
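+
+// An illustrative sketch, where parent is any existing BlockDev and buf a
+// block-aligned byte slice (both hypothetical):
+//
+//	// Expose blocks 2048 to 6144 of parent as a partition-like sub-device.
+//	part := NewSection(parent, 2048, 6144)
+//	// Offsets passed to part are relative to the start of the section.
+//	_, err := part.WriteAt(buf, 0)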
+
+// Section implements BlockDev on a slice of another BlockDev given a startBlock
+// and endBlock.
+type Section struct {
+ b BlockDev
+ startBlock, endBlock int64
+}
+
+func (s *Section) ReadAt(p []byte, off int64) (n int, err error) {
+ bOff := off + (s.startBlock * s.b.BlockSize())
+ bytesToEnd := (s.endBlock * s.b.BlockSize()) - bOff
+ if bytesToEnd <= 0 {
+ return 0, io.EOF
+ }
+ if bytesToEnd < int64(len(p)) {
+ n, err := s.b.ReadAt(p[:bytesToEnd], bOff)
+ if err != nil {
+ return n, err
+ }
+ // Even a successful short read must return an error per the
+ // io.ReaderAt contract.
+ return n, io.EOF
+ }
+ return s.b.ReadAt(p, bOff)
+}
+
+func (s *Section) WriteAt(p []byte, off int64) (n int, err error) {
+ bOff := off + (s.startBlock * s.b.BlockSize())
+ bytesToEnd := (s.endBlock * s.b.BlockSize()) - bOff
+ if bytesToEnd <= 0 {
+ return 0, ErrOutOfBounds
+ }
+ if bytesToEnd < int64(len(p)) {
+ n, err := s.b.WriteAt(p[:bytesToEnd], bOff)
+ if err != nil {
+ // If an error happened, prioritize that error
+ return n, err
+ }
+ // Otherwise, return ErrOutOfBounds as even short writes must return an
+ // error.
+ return n, ErrOutOfBounds
+ }
+ return s.b.WriteAt(p, bOff)
+}
+
+func (s *Section) BlockCount() int64 {
+ return s.endBlock - s.startBlock
+}
+
+func (s *Section) BlockSize() int64 {
+ return s.b.BlockSize()
+}
+
+func (s *Section) inRange(startByte, endByte int64) error {
+ if startByte > endByte {
+ return fmt.Errorf("invalid range: startByte (%d) bigger than endByte (%d)", startByte, endByte)
+ }
+ sectionLen := s.BlockCount() * s.BlockSize()
+ if startByte >= sectionLen {
+ return fmt.Errorf("startByte (%d) out of range (%d)", startByte, sectionLen)
+ }
+ if endByte > sectionLen {
+ return fmt.Errorf("endBlock (%d) out of range (%d)", endByte, sectionLen)
+ }
+ return nil
+}
+
+func (s *Section) Discard(startByte, endByte int64) error {
+ if err := s.inRange(startByte, endByte); err != nil {
+ return err
+ }
+ off := s.startBlock * s.b.BlockSize()
+ return s.b.Discard(off+startByte, off+endByte)
+}
+
+func (s *Section) OptimalBlockSize() int64 {
+ return s.b.OptimalBlockSize()
+}
+
+func (s *Section) Zero(startByte, endByte int64) error {
+ if err := s.inRange(startByte, endByte); err != nil {
+ return err
+ }
+ off := s.startBlock * s.b.BlockSize()
+ return s.b.Zero(off+startByte, off+endByte)
+}
+
+// GenericZero implements software-based zeroing. This can be used to implement
+// Zero when no acceleration is available or desired.
+func GenericZero(b BlockDev, startByte, endByte int64) error {
+ if startByte%b.BlockSize() != 0 {
+ return fmt.Errorf("startByte (%d) needs to be aligned to block size (%d)", startByte, b.BlockSize())
+ }
+ if endByte%b.BlockSize() != 0 {
+ return fmt.Errorf("endByte (%d) needs to be aligned to block size (%d)", endByte, b.BlockSize())
+ }
+ // Choose a buffer size close to 16 MiB, or the size of the range to be
+ // zeroed, whichever is smaller.
+ bufSizeTarget := int64(16 * 1024 * 1024)
+ if endByte-startByte < bufSizeTarget {
+ bufSizeTarget = endByte - startByte
+ }
+ bufSize := (bufSizeTarget / b.BlockSize()) * b.BlockSize()
+ buf := make([]byte, bufSize)
+ for i := startByte; i < endByte; i += bufSize {
+ if endByte-i < bufSize {
+ buf = buf[:endByte-i]
+ }
+ if _, err := b.WriteAt(buf, i); err != nil {
+ return fmt.Errorf("while writing zeroes: %w", err)
+ }
+ }
+ return nil
+}
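+
+// A hedged sketch of delegating to it from a custom implementation; myDev is
+// a hypothetical type, not part of this package:
+//
+//	func (d *myDev) Zero(startByte, endByte int64) error {
+//		return GenericZero(d, startByte, endByte)
+//	}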
diff --git a/metropolis/pkg/blockdev/blockdev_linux.go b/metropolis/pkg/blockdev/blockdev_linux.go
new file mode 100644
index 0000000..a8e6a8a
--- /dev/null
+++ b/metropolis/pkg/blockdev/blockdev_linux.go
@@ -0,0 +1,245 @@
+//go:build linux
+
+package blockdev
+
+import (
+ "errors"
+ "fmt"
+ "math/bits"
+ "os"
+ "syscall"
+ "unsafe"
+
+ "golang.org/x/sys/unix"
+)
+
+type Device struct {
+ backend *os.File
+ rawConn syscall.RawConn
+ blockSize int64
+ blockCount int64
+}
+
+func (d *Device) ReadAt(p []byte, off int64) (n int, err error) {
+ return d.backend.ReadAt(p, off)
+}
+
+func (d *Device) WriteAt(p []byte, off int64) (n int, err error) {
+ return d.backend.WriteAt(p, off)
+}
+
+func (d *Device) Close() error {
+ return d.backend.Close()
+}
+
+func (d *Device) BlockCount() int64 {
+ return d.blockCount
+}
+
+func (d *Device) BlockSize() int64 {
+ return d.blockSize
+}
+
+func (d *Device) Discard(startByte int64, endByte int64) error {
+ // BLKDISCARD takes a start offset and a length in bytes.
+ var args [2]uint64
+ var err unix.Errno
+ args[0] = uint64(startByte)
+ args[1] = uint64(endByte - startByte)
+ if ctrlErr := d.rawConn.Control(func(fd uintptr) {
+ _, _, err = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKDISCARD, uintptr(unsafe.Pointer(&args[0])))
+ }); ctrlErr != nil {
+ return ctrlErr
+ }
+ if err == unix.EOPNOTSUPP {
+ return ErrUnsupported
+ }
+ if err != unix.Errno(0) {
+ return fmt.Errorf("failed to discard: %w", err)
+ }
+ return nil
+}
+
+func (d *Device) OptimalBlockSize() int64 {
+ return d.blockSize
+}
+
+func (d *Device) Zero(startByte int64, endByte int64) error {
+ // BLKZEROOUT takes a start offset and a length in bytes.
+ var args [2]uint64
+ var err error
+ args[0] = uint64(startByte)
+ args[1] = uint64(endByte - startByte)
+ if ctrlErr := d.rawConn.Control(func(fd uintptr) {
+ // Try fallocate first: it can leverage discard guarantees to provide
+ // extremely quick metadata-only zeroing.
+ err = unix.Fallocate(int(fd), unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, startByte, endByte-startByte)
+ if errors.Is(err, unix.EOPNOTSUPP) {
+ // Fall back to BLKZEROOUT, which tries Write Same and friends
+ // before resorting to writing zeroes.
+ _, _, errno := unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKZEROOUT, uintptr(unsafe.Pointer(&args[0])))
+ err = nil
+ if errno != 0 {
+ err = errno
+ }
+ }
+ }); ctrlErr != nil {
+ return ctrlErr
+ }
+ if err != nil {
+ return fmt.Errorf("failed to zero out: %w", err)
+ }
+ return nil
+}
+
+// RefreshPartitionTable refreshes the kernel's view of the partition table
+// after changes made from userspace.
+func (d *Device) RefreshPartitionTable() error {
+ var err unix.Errno
+ if ctrlErr := d.rawConn.Control(func(fd uintptr) {
+ _, _, err = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKRRPART, 0)
+ }); ctrlErr != nil {
+ return ctrlErr
+ }
+ if err != unix.Errno(0) {
+ return fmt.Errorf("ioctl(BLKRRPART): %w", err)
+ }
+ return nil
+}
+
+// Open opens a block device given a path to its inode.
+// TODO: exclusive, O_DIRECT
+func Open(path string) (*Device, error) {
+ outFile, err := os.OpenFile(path, os.O_RDWR, 0640)
+ if err != nil {
+ return nil, fmt.Errorf("failed to open block device: %w", err)
+ }
+ return FromFileHandle(outFile)
+}
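+
+// An illustrative sketch (the device path is an example):
+//
+//	dev, err := Open("/dev/vdb")
+//	if err != nil {
+//		// handle error, e.g. ErrNotBlockDevice
+//	}
+//	defer dev.Close()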
+
+// FromFileHandle creates a blockdev from a device handle. The device handle is
+// not duplicated; closing the returned Device will close it. If the handle is
+// not a block device, i.e. does not implement block device ioctls, an error is
+// returned.
+func FromFileHandle(handle *os.File) (*Device, error) {
+ outFileC, err := handle.SyscallConn()
+ if err != nil {
+ return nil, fmt.Errorf("error getting SyscallConn: %w", err)
+ }
+ var blockSize uint32
+ outFileC.Control(func(fd uintptr) {
+ blockSize, err = unix.IoctlGetUint32(int(fd), unix.BLKSSZGET)
+ })
+ if errors.Is(err, unix.ENOTTY) || errors.Is(err, unix.EINVAL) {
+ return nil, ErrNotBlockDevice
+ } else if err != nil {
+ return nil, fmt.Errorf("when querying disk block size: %w", err)
+ }
+
+ var sizeBytes uint64
+ var getSizeErr unix.Errno
+ outFileC.Control(func(fd uintptr) {
+ _, _, getSizeErr = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKGETSIZE64, uintptr(unsafe.Pointer(&sizeBytes)))
+ })
+
+ if getSizeErr != unix.Errno(0) {
+ return nil, fmt.Errorf("when querying disk size: %w", getSizeErr)
+ }
+ if sizeBytes%uint64(blockSize) != 0 {
+ return nil, fmt.Errorf("block device size is not an integer multiple of its block size (%d %% %d = %d)", sizeBytes, blockSize, sizeBytes%uint64(blockSize))
+ }
+ return &Device{
+ backend: handle,
+ rawConn: outFileC,
+ blockSize: int64(blockSize),
+ blockCount: int64(sizeBytes) / int64(blockSize),
+ }, nil
+}
+
+type File struct {
+ backend *os.File
+ rawConn syscall.RawConn
+ blockSize int64
+ blockCount int64
+}
+
+func CreateFile(name string, blockSize int64, blockCount int64) (*File, error) {
+ if blockSize < 512 {
+ return nil, fmt.Errorf("blockSize must be bigger than 512 bytes")
+ }
+ if bits.OnesCount64(uint64(blockSize)) != 1 {
+ return nil, fmt.Errorf("blockSize must be a power of two")
+ }
+ out, err := os.Create(name)
+ if err != nil {
+ return nil, fmt.Errorf("when creating backing file: %w", err)
+ }
+ rawConn, err := out.SyscallConn()
+ if err != nil {
+ return nil, fmt.Errorf("unable to get SyscallConn: %w", err)
+ }
+ return &File{
+ backend: out,
+ blockSize: blockSize,
+ rawConn: rawConn,
+ blockCount: blockCount,
+ }, nil
+}
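+
+// For example, a 1 MiB image backed by a file (name and geometry are
+// illustrative):
+//
+//	f, err := CreateFile("disk.img", 512, 2048)
+//	if err != nil {
+//		// handle error
+//	}
+//	defer f.Close()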
+
+func (d *File) ReadAt(p []byte, off int64) (n int, err error) {
+ return d.backend.ReadAt(p, off)
+}
+
+func (d *File) WriteAt(p []byte, off int64) (n int, err error) {
+ return d.backend.WriteAt(p, off)
+}
+
+func (d *File) Close() error {
+ return d.backend.Close()
+}
+
+func (d *File) BlockCount() int64 {
+ return d.blockCount
+}
+
+func (d *File) BlockSize() int64 {
+ return d.blockSize
+}
+
+func (d *File) Discard(startByte int64, endByte int64) error {
+ var err error
+ if ctrlErr := d.rawConn.Control(func(fd uintptr) {
+ // There is FALLOC_FL_NO_HIDE_STALE, but it's not implemented by
+ // any filesystem right now, so let's not attempt it for the time being.
+ err = unix.Fallocate(int(fd), unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, startByte, endByte-startByte)
+ }); ctrlErr != nil {
+ return ctrlErr
+ }
+ if errors.Is(err, unix.EOPNOTSUPP) {
+ return ErrUnsupported
+ }
+ if err != nil {
+ return fmt.Errorf("failed to discard: %w", err)
+ }
+ return nil
+}
+
+func (d *File) OptimalBlockSize() int64 {
+ return d.blockSize
+}
+
+func (d *File) Zero(startByte int64, endByte int64) error {
+ var err error
+ if ctrlErr := d.rawConn.Control(func(fd uintptr) {
+ // Tell the filesystem to punch out the given blocks.
+ err = unix.Fallocate(int(fd), unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, startByte, endByte-startByte)
+ }); ctrlErr != nil {
+ return ctrlErr
+ }
+ // If unsupported, or if the syscall is not available (for example in a
+ // sandbox), fall back to the generic software implementation.
+ if errors.Is(err, unix.EOPNOTSUPP) || errors.Is(err, unix.ENOSYS) {
+ return GenericZero(d, startByte, endByte)
+ }
+ if err != nil {
+ return fmt.Errorf("failed to zero out: %w", err)
+ }
+ return nil
+}
diff --git a/metropolis/pkg/blockdev/memory.go b/metropolis/pkg/blockdev/memory.go
new file mode 100644
index 0000000..193f93c
--- /dev/null
+++ b/metropolis/pkg/blockdev/memory.go
@@ -0,0 +1,128 @@
+package blockdev
+
+import (
+ "errors"
+ "fmt"
+ "io"
+ "math/bits"
+)
+
+// Memory is a memory-backed implementation of BlockDev. It is well-suited
+// to testing and temporary use, as it is fast and platform-independent.
+type Memory struct {
+ blockSize int64
+ blockCount int64
+ data []byte
+}
+
+// NewMemory returns a new memory-backed block device with the given geometry.
+func NewMemory(blockSize, blockCount int64) (*Memory, error) {
+ if blockSize <= 0 {
+ return nil, errors.New("block size cannot be zero or negative")
+ }
+ if bits.OnesCount64(uint64(blockSize)) > 1 {
+ return nil, fmt.Errorf("block size must be a power of two (got %d)", blockSize)
+ }
+ if blockCount < 0 {
+ return nil, errors.New("block count cannot be negative")
+ }
+ return &Memory{
+ blockSize: blockSize,
+ blockCount: blockCount,
+ data: make([]byte, blockSize*blockCount),
+ }, nil
+}
+
+// MustNewMemory works exactly like NewMemory, but panics when NewMemory would
+// return an error. Intended for use in tests.
+func MustNewMemory(blockSize, blockCount int64) *Memory {
+ m, err := NewMemory(blockSize, blockCount)
+ if err != nil {
+ panic(err)
+ }
+ return m
+}
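+
+// An illustrative test usage (t is the enclosing test's *testing.T):
+//
+//	m := MustNewMemory(512, 2048) // 1 MiB in-memory device
+//	if err := m.Zero(0, 4096); err != nil {
+//		t.Fatal(err)
+//	}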
+
+func (m *Memory) ReadAt(p []byte, off int64) (int, error) {
+ devSize := m.blockSize * m.blockCount
+ if off > devSize {
+ return 0, io.EOF
+ }
+ // TODO: Alignment checks?
+ copy(p, m.data[off:])
+ n := len(m.data[off:])
+ if n < len(p) {
+ return n, io.EOF
+ }
+ return len(p), nil
+}
+
+func (m *Memory) WriteAt(p []byte, off int64) (int, error) {
+ devSize := m.blockSize * m.blockCount
+ if off > devSize {
+ return 0, ErrOutOfBounds
+ }
+ // TODO: Alignment checks?
+ copy(m.data[off:], p)
+ n := len(m.data[off:])
+ if n < len(p) {
+ // Short writes must return an error; io.EOF is reserved for reads.
+ return n, ErrOutOfBounds
+ }
+ return len(p), nil
+}
+
+func (m *Memory) BlockSize() int64 {
+ return m.blockSize
+}
+
+func (m *Memory) BlockCount() int64 {
+ return m.blockCount
+}
+
+func (m *Memory) OptimalBlockSize() int64 {
+ return m.blockSize
+}
+
+func (m *Memory) validRange(startByte, endByte int64) error {
+ if startByte > endByte {
+ return fmt.Errorf("startByte (%d) larger than endByte (%d), invalid interval", startByte, endByte)
+ }
+ devSize := m.blockSize * m.blockCount
+ if startByte >= devSize || startByte < 0 {
+ return fmt.Errorf("startByte (%d) out of range (0-%d)", endByte, devSize)
+ }
+ if endByte > devSize || endByte < 0 {
+ return fmt.Errorf("endByte (%d) out of range (0-%d)", endByte, devSize)
+ }
+ // Alignment check works for powers of two by looking at every bit below
+ // the bit set in the block size.
+ if startByte&(m.blockSize-1) != 0 {
+ return fmt.Errorf("startByte (%d) is not aligned to blockSize (%d)", startByte, m.blockSize)
+ }
+ if endByte&(m.blockSize-1) != 0 {
+ return fmt.Errorf("endByte (%d) is not aligned to blockSize (%d)", startByte, m.blockSize)
+ }
+ return nil
+}
+
+func (m *Memory) Discard(startByte, endByte int64) error {
+ if err := m.validRange(startByte, endByte); err != nil {
+ return err
+ }
+ for i := startByte; i < endByte; i++ {
+ // Intentionally don't set to zero as Discard doesn't guarantee
+ // any specific contents. Call Zero if you need this.
+ m.data[i] = 0xaa
+ }
+ return nil
+}
+
+func (m *Memory) Zero(startByte, endByte int64) error {
+ if err := m.validRange(startByte, endByte); err != nil {
+ return err
+ }
+ for i := startByte; i < endByte; i++ {
+ m.data[i] = 0x00
+ }
+ return nil
+}