// Copyright The Monogon Project Authors.
// SPDX-License-Identifier: Apache-2.0

//go:build linux

package blockdev

import (
	"errors"
	"fmt"
	"io"
	"math/bits"
	"os"
	"runtime"
	"syscall"
	"unsafe"

	"golang.org/x/sys/unix"
)

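// Device is a Linux block device accessed through a device node. Reads and
// writes go through the wrapped file handle; block device ioctls are used for
// geometry queries, discarding and zeroing.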
type Device struct {
	backend    *os.File
	rawConn    syscall.RawConn
	blockSize  int64
	blockCount int64
}

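// ReadAt implements io.ReaderAt. Reads starting beyond the end of the device
// return io.EOF; reads extending past the end are truncated and also return
// io.EOF together with the bytes read.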
func (d *Device) ReadAt(p []byte, off int64) (n int, err error) {
	size := d.blockSize * d.blockCount
	if off > size {
		return 0, io.EOF
	}
	if int64(len(p)) > size-off {
		n, err = d.backend.ReadAt(p[:size-off], off)
		if err == nil {
			err = io.EOF
		}
		return
	}
	return d.backend.ReadAt(p, off)
}

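// WriteAt implements io.WriterAt. Writes starting beyond the end of the device
// return ErrOutOfBounds; writes extending past the end are truncated and also
// return ErrOutOfBounds together with the number of bytes written.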
func (d *Device) WriteAt(p []byte, off int64) (n int, err error) {
	size := d.blockSize * d.blockCount
	if off > size {
		return 0, ErrOutOfBounds
	}
	if int64(len(p)) > size-off {
		n, err = d.backend.WriteAt(p[:size-off], off)
		if err == nil {
			err = ErrOutOfBounds
		}
		return
	}
	return d.backend.WriteAt(p, off)
}

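// Close closes the underlying device handle.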
func (d *Device) Close() error {
	return d.backend.Close()
}

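// BlockCount returns the number of logical blocks on the device.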
func (d *Device) BlockCount() int64 {
	return d.blockCount
}

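// BlockSize returns the logical block size of the device in bytes.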
func (d *Device) BlockSize() int64 {
	return d.blockSize
}

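// OptimalBlockSize returns the block size in bytes for optimal I/O, which is
// currently the logical block size.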
func (d *Device) OptimalBlockSize() int64 {
	return d.blockSize
}

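// Discard discards the given byte range via the BLKDISCARD ioctl, informing
// the device that the data in that range is no longer needed. The range must
// be block-aligned and within bounds. If the device does not support discard,
// errors.ErrUnsupported is returned.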
func (d *Device) Discard(startByte int64, endByte int64) error {
	if err := validAlignedRange(d, startByte, endByte); err != nil {
		return err
	}
	if startByte == endByte {
		return nil
	}
	var args [2]uint64
	var err unix.Errno
	args[0] = uint64(startByte)
	args[1] = uint64(endByte - startByte)
	if ctrlErr := d.rawConn.Control(func(fd uintptr) {
		_, _, err = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKDISCARD, uintptr(unsafe.Pointer(&args[0])))
	}); ctrlErr != nil {
		return ctrlErr
	}
	if err == unix.EOPNOTSUPP {
		return errors.ErrUnsupported
	}
	if err != unix.Errno(0) {
		return fmt.Errorf("failed to discard: %w", err)
	}
	return nil
}

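// Zero zeroes the given block-aligned byte range. It first attempts
// metadata-only zeroing via fallocate hole punching and, if that is not
// supported, falls back to the BLKZEROOUT ioctl.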
func (d *Device) Zero(startByte int64, endByte int64) error {
	if err := validAlignedRange(d, startByte, endByte); err != nil {
		return err
	}
	if startByte == endByte {
		return nil
	}
	var args [2]uint64
	var err error
	args[0] = uint64(startByte)
	args[1] = uint64(endByte - startByte)
	ctrlErr := d.rawConn.Control(func(fd uintptr) {
		// Attempt to leverage discard guarantees to provide extremely quick
		// metadata-only zeroing.
		err = unix.Fallocate(int(fd), unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, startByte, endByte-startByte)
		if errors.Is(err, unix.EOPNOTSUPP) {
			// Fall back to BLKZEROOUT, which tries Write Same and similar
			// commands and otherwise writes zeroes from the kernel.
			_, _, errNo := unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKZEROOUT, uintptr(unsafe.Pointer(&args[0])))
			if errNo == unix.Errno(0) {
				err = nil
			} else {
				err = errNo
			}
		}
	})
	if ctrlErr != nil {
		return ctrlErr
	}
	if err != nil {
		return fmt.Errorf("failed to zero out: %w", err)
	}
	return nil
}

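// Sync commits all written data to stable storage by calling fsync on the
// device.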
func (d *Device) Sync() error {
	return d.backend.Sync()
}

// RefreshPartitionTable refreshes the kernel's view of the partition table
// after changes made from userspace.
func (d *Device) RefreshPartitionTable() error {
	var err unix.Errno
	if ctrlErr := d.rawConn.Control(func(fd uintptr) {
		_, _, err = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKRRPART, 0)
	}); ctrlErr != nil {
		return ctrlErr
	}
	if err != unix.Errno(0) {
		return fmt.Errorf("ioctl(BLKRRPART): %w", err)
	}
	return nil
}

// ResizePartition updates the start and length of one partition in the kernel.
// This can be used as an alternative to RefreshPartitionTable, which cannot
// be used if any partition on this device is currently mounted.
func (d *Device) ResizePartition(partitionNo int32, startByte, lengthBytes int64) error {
	var ioctlPins runtime.Pinner
	defer ioctlPins.Unpin()

	partition := unix.BlkpgPartition{
		Start:  startByte,
		Length: lengthBytes,
		Pno:    partitionNo,
	}
	ioctlPins.Pin(&partition)
	arg := unix.BlkpgIoctlArg{
		Op:      unix.BLKPG_RESIZE_PARTITION,
		Datalen: int32(unsafe.Sizeof(partition)),
		Data:    (*byte)(unsafe.Pointer(&partition)),
	}

	var err unix.Errno
	if ctrlErr := d.rawConn.Control(func(fd uintptr) {
		_, _, err = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKPG, uintptr(unsafe.Pointer(&arg)))
	}); ctrlErr != nil {
		return ctrlErr
	}
	if err != unix.Errno(0) {
		return fmt.Errorf("ioctl(BLKPG): %w", err)
	}
	return nil
}

// Open opens a block device given a path to its inode.
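//
// A minimal usage sketch (the device path here is just an example):
//
//	dev, err := blockdev.Open("/dev/sdb")
//	if err != nil {
//		// handle error
//	}
//	defer dev.Close()
//	buf := make([]byte, dev.BlockSize())
//	if _, err := dev.ReadAt(buf, 0); err != nil {
//		// handle error
//	}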
func Open(path string, opts ...Option) (*Device, error) {
	var o options
	o.collect(opts)
	flags := o.genericFlags()
	if o.direct {
		flags |= unix.O_DIRECT
	}
	if o.exclusive {
		flags |= unix.O_EXCL
	}

	outFile, err := os.OpenFile(path, flags, 0640)
	if err != nil {
		return nil, fmt.Errorf("failed to open block device: %w", err)
	}
	return FromFileHandle(outFile)
}

// FromFileHandle creates a blockdev from a device handle. The device handle is
// not duplicated; closing the returned Device will close it. If the handle is
// not a block device, i.e. does not implement block device ioctls,
// ErrNotBlockDevice is returned.
func FromFileHandle(handle *os.File) (*Device, error) {
	outFileC, err := handle.SyscallConn()
	if err != nil {
		return nil, fmt.Errorf("error getting SyscallConn: %w", err)
	}
	var blockSize uint32
	if ctrlErr := outFileC.Control(func(fd uintptr) {
		blockSize, err = unix.IoctlGetUint32(int(fd), unix.BLKSSZGET)
	}); ctrlErr != nil {
		return nil, ctrlErr
	}
	if errors.Is(err, unix.ENOTTY) || errors.Is(err, unix.EINVAL) {
		return nil, ErrNotBlockDevice
	} else if err != nil {
		return nil, fmt.Errorf("when querying disk block size: %w", err)
	}

	var sizeBytes uint64
	var getSizeErr unix.Errno
	if ctrlErr := outFileC.Control(func(fd uintptr) {
		_, _, getSizeErr = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKGETSIZE64, uintptr(unsafe.Pointer(&sizeBytes)))
	}); ctrlErr != nil {
		return nil, ctrlErr
	}

	if getSizeErr != unix.Errno(0) {
		return nil, fmt.Errorf("when querying disk size: %w", getSizeErr)
	}
	if sizeBytes%uint64(blockSize) != 0 {
		return nil, fmt.Errorf("block device size is not an integer multiple of its block size (%d %% %d = %d)", sizeBytes, blockSize, sizeBytes%uint64(blockSize))
	}
	return &Device{
		backend:    handle,
		rawConn:    outFileC,
		blockSize:  int64(blockSize),
		blockCount: int64(sizeBytes) / int64(blockSize),
	}, nil
}

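// File emulates a block device with a fixed block size and block count on top
// of a regular file.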
type File struct {
	backend    *os.File
	rawConn    syscall.RawConn
	blockSize  int64
	blockCount int64
}

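// CreateFile creates a new file at the given path (truncating it if it already
// exists) and returns a File treating it as a block device with the given
// block size and block count. The block size must be a power of two and at
// least 512 bytes.
//
// A minimal usage sketch (path and geometry are just example values):
//
//	f, err := blockdev.CreateFile("/tmp/disk.img", 512, 2048)
//	if err != nil {
//		// handle error
//	}
//	defer f.Close()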
func CreateFile(name string, blockSize int64, blockCount int64) (*File, error) {
	if blockSize < 512 {
		return nil, fmt.Errorf("blockSize must be at least 512 bytes")
	}
	if bits.OnesCount64(uint64(blockSize)) != 1 {
		return nil, fmt.Errorf("blockSize must be a power of two")
	}
	out, err := os.Create(name)
	if err != nil {
		return nil, fmt.Errorf("when creating backing file: %w", err)
	}
	rawConn, err := out.SyscallConn()
	if err != nil {
		return nil, fmt.Errorf("unable to get SyscallConn: %w", err)
	}
	return &File{
		backend:    out,
		blockSize:  blockSize,
		rawConn:    rawConn,
		blockCount: blockCount,
	}, nil
}

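// ReadAt implements io.ReaderAt with the same bounds semantics as
// [Device.ReadAt].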
func (d *File) ReadAt(p []byte, off int64) (n int, err error) {
	size := d.blockSize * d.blockCount
	if off > size {
		return 0, io.EOF
	}
	if int64(len(p)) > size-off {
		n, err = d.backend.ReadAt(p[:size-off], off)
		if err == nil {
			err = io.EOF
		}
		return
	}
	return d.backend.ReadAt(p, off)
}

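// WriteAt implements io.WriterAt with the same bounds semantics as
// [Device.WriteAt].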
func (d *File) WriteAt(p []byte, off int64) (n int, err error) {
	size := d.blockSize * d.blockCount
	if off > size {
		return 0, ErrOutOfBounds
	}
	if int64(len(p)) > size-off {
		n, err = d.backend.WriteAt(p[:size-off], off)
		if err == nil {
			err = ErrOutOfBounds
		}
		return
	}
	return d.backend.WriteAt(p, off)
}

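// ReadFromAt copies data from r into the device, starting at byte offset off,
// until r is exhausted or the device is full. The copy uses the
// copy_file_range syscall when r is an *os.File (optionally wrapped in an
// io.LimitedReader), falling back to a generic copy otherwise. If r still has
// data left once the end of the device is reached, ErrOutOfBounds is returned.
//
// A minimal usage sketch, assuming f is a *File created earlier (the source
// path is just an example):
//
//	src, err := os.Open("/tmp/image.raw")
//	if err != nil {
//		// handle error
//	}
//	defer src.Close()
//	if _, err := f.ReadFromAt(src, 0); err != nil {
//		// handle error
//	}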
func (d *File) ReadFromAt(r io.Reader, off int64) (n int64, err error) {
	size := d.blockSize * d.blockCount
	if off > size || off < 0 {
		return 0, ErrOutOfBounds
	}
	limit := size - off
	ur := r
	lr, lrOK := r.(*io.LimitedReader)
	if lrOK {
		ur = lr.R
		limit = min(limit, lr.N)
	}
	n, handled, err := d.doCopyFileRange(ur, off, limit)
	if lrOK {
		lr.N -= n
	}
	off += n
	if !handled {
		var fallbackN int64
		fallbackN, err = genericReadFromAt(d, r, off)
		n += fallbackN
		return
	}
	if err == nil && off == size {
		// The device is full. Return an error if the reader has more data,
		// i.e. we have not reached its EOF.
		moreN, moreErr := io.CopyN(io.Discard, r, 1)
		if moreN != 0 {
			err = ErrOutOfBounds
		} else if moreErr != io.EOF {
			err = moreErr
		}
	}
	return
}

// Copied from Go src/internal/poll/copy_file_range_linux.go
const maxCopyFileRangeRound = 0x7ffff000

// doCopyFileRange attempts to copy using the copy_file_range syscall.
//
// This is only implemented for [File] because Linux does not support this
// syscall on block devices.
func (d *File) doCopyFileRange(r io.Reader, off int64, remain int64) (written int64, handled bool, err error) {
	if remain <= 0 {
		handled = true
		return
	}
	// Note: We should also check for os.fileWithoutWriteTo, but that type isn't
	// exported. This means that this optimization won't work if the top-level
	// copy is io.Copy, but it does work with io.CopyN and w.ReadFrom(r).
	src, srcOK := r.(*os.File)
	if !srcOK {
		return
	}
	srcConn, err := src.SyscallConn()
	if err != nil {
		return
	}
	// We need a read lock of src, because its file offset is used and updated.
	// We don't need a lock of dest, because its file offset is not used.
	readErr := srcConn.Read(func(srcFD uintptr) bool {
		controlErr := d.rawConn.Control(func(destFD uintptr) {
			handled = true
			for remain > 0 {
				n := int(min(remain, maxCopyFileRangeRound))
				n, err = unix.CopyFileRange(int(srcFD), nil, int(destFD), &off, n, 0)
				if n > 0 {
					remain -= int64(n)
					written += int64(n)
					// The kernel adds n to off.
				}
				// See handleCopyFileRangeErr in
				// src/internal/poll/copy_file_range_linux.go
				if err != nil {
					if errors.Is(err, unix.ENOSYS) || errors.Is(err, unix.EXDEV) ||
						errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EIO) ||
						errors.Is(err, unix.EOPNOTSUPP) || errors.Is(err, unix.EPERM) {
						handled = false
					}
					break
				} else if n == 0 {
					if written == 0 {
						handled = false
					}
					break
				}
			}
		})
		if err == nil {
			err = controlErr
		}
		return true
	})
	if err == nil {
		err = readErr
	}
	return
}

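// Close closes the backing file.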
func (d *File) Close() error {
	return d.backend.Close()
}

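// BlockCount returns the number of blocks of the emulated device.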
func (d *File) BlockCount() int64 {
	return d.blockCount
}

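// BlockSize returns the block size of the emulated device in bytes.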
func (d *File) BlockSize() int64 {
	return d.blockSize
}

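// OptimalBlockSize returns the block size in bytes for optimal I/O, which is
// currently the emulated block size.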
func (d *File) OptimalBlockSize() int64 {
	return d.blockSize
}

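// Discard deallocates the given block-aligned byte range in the backing file
// by punching a hole into it. If the filesystem does not support hole
// punching, errors.ErrUnsupported is returned.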
func (d *File) Discard(startByte int64, endByte int64) error {
	if err := validAlignedRange(d, startByte, endByte); err != nil {
		return err
	}
	if startByte == endByte {
		return nil
	}
	var err error
	if ctrlErr := d.rawConn.Control(func(fd uintptr) {
		// There is FALLOC_FL_NO_HIDE_STALE, but it's not implemented by
		// any filesystem right now, so let's not attempt it for the time being.
		err = unix.Fallocate(int(fd), unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, startByte, endByte-startByte)
	}); ctrlErr != nil {
		return ctrlErr
	}
	if errors.Is(err, unix.EOPNOTSUPP) {
		return errors.ErrUnsupported
	}
	if err != nil {
		return fmt.Errorf("failed to discard: %w", err)
	}
	return nil
}

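// Zero zeroes the given block-aligned byte range by punching a hole into the
// backing file, falling back to GenericZero if hole punching is not available.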
func (d *File) Zero(startByte int64, endByte int64) error {
	if err := validAlignedRange(d, startByte, endByte); err != nil {
		return err
	}
	if startByte == endByte {
		return nil
	}
	var err error
	if ctrlErr := d.rawConn.Control(func(fd uintptr) {
		// Tell the filesystem to punch out the given blocks.
		err = unix.Fallocate(int(fd), unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, startByte, endByte-startByte)
	}); ctrlErr != nil {
		return ctrlErr
	}
	// If hole punching is unsupported, or the syscall is not available (for
	// example in a sandbox), fall back to the generic software implementation.
	if errors.Is(err, unix.EOPNOTSUPP) || errors.Is(err, unix.ENOSYS) {
		return GenericZero(d, startByte, endByte)
	}
	if err != nil {
		return fmt.Errorf("failed to zero out: %w", err)
	}
	return nil
}

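// Sync commits the backing file to stable storage by calling fsync on it.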
func (d *File) Sync() error {
	return d.backend.Sync()
}