m/p/blockdev: init
Adds blockdev, a package providing a Go interface for generic block
devices as well as an implementation of it for Linux and auxiliary
types.
This will replace most ad-hoc block device handling in the monorepo.
Change-Id: I3a4e3b7c31a8344f7859210bbb4942977d1ad1d2
Reviewed-on: https://review.monogon.dev/c/monogon/+/1871
Tested-by: Jenkins CI
Reviewed-by: Serge Bazanski <serge@monogon.tech>
diff --git a/metropolis/pkg/blockdev/blockdev_linux.go b/metropolis/pkg/blockdev/blockdev_linux.go
new file mode 100644
index 0000000..a8e6a8a
--- /dev/null
+++ b/metropolis/pkg/blockdev/blockdev_linux.go
@@ -0,0 +1,245 @@
+//go:build linux
+
+package blockdev
+
+import (
+ "errors"
+ "fmt"
+ "math/bits"
+ "os"
+ "syscall"
+ "unsafe"
+
+ "golang.org/x/sys/unix"
+)
+
+// Device is a handle to a Linux block device, as produced by Open or
+// FromFileHandle.
+type Device struct {
+ // backend is the open file that ReadAt, WriteAt and Close delegate to.
+ backend *os.File
+ // rawConn gives access to backend's file descriptor for ioctl and
+ // fallocate calls.
+ rawConn syscall.RawConn
+ // blockSize is the value returned by BlockSize and OptimalBlockSize;
+ // FromFileHandle fills it from the BLKSSZGET ioctl.
+ blockSize int64
+ // blockCount is the value returned by BlockCount; FromFileHandle computes
+ // it as the BLKGETSIZE64 result divided by blockSize.
+ blockCount int64
+}
+
+// ReadAt reads into p starting at byte offset off by delegating to the
+// underlying file handle.
+func (d *Device) ReadAt(p []byte, off int64) (n int, err error) {
+	n, err = d.backend.ReadAt(p, off)
+	return n, err
+}
+
+// WriteAt writes p starting at byte offset off by delegating to the
+// underlying file handle.
+func (d *Device) WriteAt(p []byte, off int64) (n int, err error) {
+	n, err = d.backend.WriteAt(p, off)
+	return n, err
+}
+
+// Close closes the underlying file handle and returns its error, if any.
+func (d *Device) Close() error {
+	closeErr := d.backend.Close()
+	return closeErr
+}
+
+// BlockCount returns the number of blocks stored for this device.
+func (d *Device) BlockCount() int64 {
+	count := d.blockCount
+	return count
+}
+
+// BlockSize returns the block size in bytes stored for this device.
+func (d *Device) BlockSize() int64 {
+	size := d.blockSize
+	return size
+}
+
+// Discard issues a BLKDISCARD ioctl for the byte range starting at startByte
+// and ending just before endByte.
+//
+// It returns ErrUnsupported if the ioctl fails with EOPNOTSUPP, an error if
+// the range is negative or inverted, the error from the raw connection if the
+// file descriptor cannot be accessed, and a wrapped errno for any other ioctl
+// failure.
+func (d *Device) Discard(startByte int64, endByte int64) error {
+	if startByte < 0 || endByte < startByte {
+		return fmt.Errorf("invalid discard range: startByte %d, endByte %d", startByte, endByte)
+	}
+	// BLKDISCARD expects {start offset, length}, not {start offset, end
+	// offset}, so the end has to be converted into a length.
+	args := [2]uint64{uint64(startByte), uint64(endByte - startByte)}
+	var err unix.Errno
+	if ctrlErr := d.rawConn.Control(func(fd uintptr) {
+		_, _, err = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKDISCARD, uintptr(unsafe.Pointer(&args[0])))
+	}); ctrlErr != nil {
+		return ctrlErr
+	}
+	if err == unix.EOPNOTSUPP {
+		return ErrUnsupported
+	}
+	if err != unix.Errno(0) {
+		return fmt.Errorf("failed to discard: %w", err)
+	}
+	return nil
+}
+
+// OptimalBlockSize returns the stored block size in bytes, i.e. the same
+// value as BlockSize.
+func (d *Device) OptimalBlockSize() int64 {
+	optimal := d.blockSize
+	return optimal
+}
+
+// Zero zeroes the byte range starting at startByte and ending just before
+// endByte.
+//
+// It first tries fallocate with FALLOC_FL_PUNCH_HOLE; if that reports
+// EOPNOTSUPP it falls back to the BLKZEROOUT ioctl. It returns an error if
+// the range is negative or inverted, the error from the raw connection if the
+// file descriptor cannot be accessed, and a wrapped error if zeroing fails.
+func (d *Device) Zero(startByte int64, endByte int64) error {
+	if startByte < 0 || endByte < startByte {
+		return fmt.Errorf("invalid zero range: startByte %d, endByte %d", startByte, endByte)
+	}
+	length := endByte - startByte
+	// BLKZEROOUT expects {start offset, length}, not {start offset, end
+	// offset}.
+	args := [2]uint64{uint64(startByte), uint64(length)}
+	var err error
+	if ctrlErr := d.rawConn.Control(func(fd uintptr) {
+		// Attempts to leverage discard guarantees to provide extremely quick
+		// metadata-only zeroing.
+		err = unix.Fallocate(int(fd), unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, startByte, length)
+		if err != unix.EOPNOTSUPP {
+			return
+		}
+		// Tries Write Same and friends and then just falls back to writing
+		// zeroes.
+		_, _, errno := unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKZEROOUT, uintptr(unsafe.Pointer(&args[0])))
+		if errno != 0 {
+			err = errno
+			return
+		}
+		// A zero Errno stored in an error interface would compare non-nil,
+		// so success is recorded as a literal nil.
+		err = nil
+	}); ctrlErr != nil {
+		return ctrlErr
+	}
+	if err != nil {
+		return fmt.Errorf("failed to zero out: %w", err)
+	}
+	return nil
+}
+
+// RefreshPartitionTable issues the BLKRRPART ioctl on the device so that the
+// kernel re-reads the partition table after it was modified from userspace.
+// It returns the raw connection's error if the file descriptor cannot be
+// accessed, or a wrapped errno if the ioctl fails.
+func (d *Device) RefreshPartitionTable() error {
+	var errno unix.Errno
+	ctrlErr := d.rawConn.Control(func(fd uintptr) {
+		_, _, errno = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKRRPART, 0)
+	})
+	switch {
+	case ctrlErr != nil:
+		return ctrlErr
+	case errno != 0:
+		return fmt.Errorf("ioctl(BLKRRPART): %w", errno)
+	}
+	return nil
+}
+
+// Open opens the block device at path for reading and writing and wraps it
+// in a Device via FromFileHandle. If the file cannot be opened or is rejected
+// by FromFileHandle, an error is returned and no file handle is left open.
+// TODO: exclusive, O_DIRECT
+func Open(path string) (*Device, error) {
+	outFile, err := os.OpenFile(path, os.O_RDWR, 0640)
+	if err != nil {
+		return nil, fmt.Errorf("failed to open block device: %w", err)
+	}
+	dev, err := FromFileHandle(outFile)
+	if err != nil {
+		// No Device took ownership of the handle, so it must be released
+		// here. The close error is dropped in favor of the more useful
+		// original failure.
+		outFile.Close()
+		return nil, err
+	}
+	return dev, nil
+}
+
+// FromFileHandle creates a blockdev from a device handle. The device handle is
+// not duplicated, closing the returned Device will close it. If the handle is
+// not a block device, i.e. does not implement block device ioctls,
+// ErrNotBlockDevice is returned. On any error the handle is left open and
+// remains owned by the caller.
+func FromFileHandle(handle *os.File) (*Device, error) {
+	outFileC, err := handle.SyscallConn()
+	if err != nil {
+		return nil, fmt.Errorf("error getting SyscallConn: %w", err)
+	}
+
+	var blockSize uint32
+	if ctrlErr := outFileC.Control(func(fd uintptr) {
+		blockSize, err = unix.IoctlGetUint32(int(fd), unix.BLKSSZGET)
+	}); ctrlErr != nil {
+		return nil, fmt.Errorf("when querying disk block size: %w", ctrlErr)
+	}
+	if errors.Is(err, unix.ENOTTY) || errors.Is(err, unix.EINVAL) {
+		return nil, ErrNotBlockDevice
+	}
+	if err != nil {
+		return nil, fmt.Errorf("when querying disk block size: %w", err)
+	}
+	// Guards the modulo and division below against a division by zero.
+	if blockSize == 0 {
+		return nil, fmt.Errorf("block device reported a block size of zero")
+	}
+
+	var sizeBytes uint64
+	var getSizeErr unix.Errno
+	if ctrlErr := outFileC.Control(func(fd uintptr) {
+		_, _, getSizeErr = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKGETSIZE64, uintptr(unsafe.Pointer(&sizeBytes)))
+	}); ctrlErr != nil {
+		return nil, fmt.Errorf("when querying disk block count: %w", ctrlErr)
+	}
+	if getSizeErr != 0 {
+		return nil, fmt.Errorf("when querying disk block count: %w", getSizeErr)
+	}
+	if sizeBytes%uint64(blockSize) != 0 {
+		return nil, fmt.Errorf("block device size is not an integer multiple of its block size (%d %% %d = %d)", sizeBytes, blockSize, sizeBytes%uint64(blockSize))
+	}
+	return &Device{
+		backend:    handle,
+		rawConn:    outFileC,
+		blockSize:  int64(blockSize),
+		blockCount: int64(sizeBytes) / int64(blockSize),
+	}, nil
+}
+
+// File is a block-device-like wrapper around a regular file, as produced by
+// CreateFile.
+type File struct {
+ // backend is the open file that ReadAt, WriteAt and Close delegate to.
+ backend *os.File
+ // rawConn gives access to backend's file descriptor for fallocate calls.
+ rawConn syscall.RawConn
+ // blockSize is the value returned by BlockSize and OptimalBlockSize, as
+ // passed to CreateFile.
+ blockSize int64
+ // blockCount is the value returned by BlockCount, as passed to CreateFile.
+ blockCount int64
+}
+
+// CreateFile creates (or truncates) the file at name and returns a File
+// backed by it which reports the given blockSize and blockCount.
+//
+// blockSize must be a power of two and at least 512 bytes. An error is
+// returned if blockSize is invalid, if the file cannot be created, or if its
+// raw connection cannot be obtained; in the last case the created file handle
+// is closed again.
+func CreateFile(name string, blockSize int64, blockCount int64) (*File, error) {
+	if blockSize < 512 {
+		return nil, fmt.Errorf("blockSize must be at least 512 bytes")
+	}
+	if bits.OnesCount64(uint64(blockSize)) != 1 {
+		return nil, fmt.Errorf("blockSize must be a power of two")
+	}
+	out, err := os.Create(name)
+	if err != nil {
+		return nil, fmt.Errorf("when creating backing file: %w", err)
+	}
+	rawConn, err := out.SyscallConn()
+	if err != nil {
+		// No File took ownership of the handle, so release it here instead
+		// of leaking the descriptor.
+		out.Close()
+		return nil, fmt.Errorf("unable to get SyscallConn: %w", err)
+	}
+	return &File{
+		backend:    out,
+		blockSize:  blockSize,
+		rawConn:    rawConn,
+		blockCount: blockCount,
+	}, nil
+}
+
+// ReadAt reads into p starting at byte offset off by delegating to the
+// backing file.
+func (d *File) ReadAt(p []byte, off int64) (n int, err error) {
+	n, err = d.backend.ReadAt(p, off)
+	return n, err
+}
+
+// WriteAt writes p starting at byte offset off by delegating to the backing
+// file.
+func (d *File) WriteAt(p []byte, off int64) (n int, err error) {
+	n, err = d.backend.WriteAt(p, off)
+	return n, err
+}
+
+// Close closes the backing file and returns its error, if any.
+func (d *File) Close() error {
+	closeErr := d.backend.Close()
+	return closeErr
+}
+
+// BlockCount returns the number of blocks this File was created with.
+func (d *File) BlockCount() int64 {
+	count := d.blockCount
+	return count
+}
+
+// BlockSize returns the block size in bytes this File was created with.
+func (d *File) BlockSize() int64 {
+	size := d.blockSize
+	return size
+}
+
+// Discard punches a hole into the backing file for the byte range starting
+// at startByte and ending just before endByte, keeping the file size
+// unchanged.
+//
+// It returns ErrUnsupported if fallocate reports EOPNOTSUPP, the raw
+// connection's error if the file descriptor cannot be accessed, and a wrapped
+// error for any other fallocate failure.
+func (d *File) Discard(startByte int64, endByte int64) error {
+	var err error
+	if ctrlErr := d.rawConn.Control(func(fd uintptr) {
+		// There is FALLOC_FL_NO_HIDE_STALE, but it's not implemented by
+		// any filesystem right now, so let's not attempt it for the time being.
+		err = unix.Fallocate(int(fd), unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, startByte, endByte-startByte)
+	}); ctrlErr != nil {
+		return ctrlErr
+	}
+	if errors.Is(err, unix.EOPNOTSUPP) {
+		return ErrUnsupported
+	}
+	// Fallocate reports success as a nil error. Comparing against
+	// unix.Errno(0) here would treat that nil as a failure, so check for nil.
+	if err != nil {
+		return fmt.Errorf("failed to discard: %w", err)
+	}
+	return nil
+}
+
+// OptimalBlockSize returns the stored block size in bytes, i.e. the same
+// value as BlockSize.
+func (d *File) OptimalBlockSize() int64 {
+	optimal := d.blockSize
+	return optimal
+}
+
+// Zero zeroes the byte range starting at startByte and ending just before
+// endByte. It asks the filesystem to punch a hole for the range and, when
+// that is unsupported or the syscall is unavailable, hands the range to
+// GenericZero instead.
+func (d *File) Zero(startByte int64, endByte int64) error {
+	var punchErr error
+	ctrlErr := d.rawConn.Control(func(fd uintptr) {
+		// Ask the filesystem to punch out the given range without changing
+		// the file size.
+		const mode = unix.FALLOC_FL_PUNCH_HOLE | unix.FALLOC_FL_KEEP_SIZE
+		punchErr = unix.Fallocate(int(fd), mode, startByte, endByte-startByte)
+	})
+	if ctrlErr != nil {
+		return ctrlErr
+	}
+	switch {
+	case punchErr == nil:
+		return nil
+	case errors.Is(punchErr, unix.EOPNOTSUPP), errors.Is(punchErr, unix.ENOSYS):
+		// Unsupported, or the syscall is not available (for example in a
+		// sandbox): use the generic software implementation.
+		return GenericZero(d, startByte, endByte)
+	default:
+		return fmt.Errorf("failed to zero out: %w", punchErr)
+	}
+}