blob: ab9c96b5769c3f5b351c35ea27ae7002e7a6e02d [file] [log] [blame]
Tim Windelschmidt6d33a432025-02-04 14:34:25 +01001// Copyright The Monogon Project Authors.
2// SPDX-License-Identifier: Apache-2.0
3
Lorenz Brun1e0e3a42023-06-28 16:40:18 +02004//go:build linux
5
6package blockdev
7
8import (
9 "errors"
10 "fmt"
Jan Schära6da1712024-08-21 15:12:11 +020011 "io"
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020012 "math/bits"
13 "os"
Jan Schär5c82e0d2024-08-26 17:06:13 +020014 "runtime"
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020015 "syscall"
16 "unsafe"
17
18 "golang.org/x/sys/unix"
19)
20
// Device is a handle to a Linux block device.
//
// All I/O goes through backend; rawConn gives access to the same file
// descriptor for the ioctl and fallocate calls issued by the methods of this
// type.
type Device struct {
	backend *os.File
	rawConn syscall.RawConn

	// Geometry of the device. blockSize * blockCount is the size in bytes
	// that ReadAt and WriteAt treat as the end of the device.
	blockSize, blockCount int64
}
27
28func (d *Device) ReadAt(p []byte, off int64) (n int, err error) {
Jan Schära6da1712024-08-21 15:12:11 +020029 size := d.blockSize * d.blockCount
30 if off > size {
31 return 0, io.EOF
32 }
33 if int64(len(p)) > size-off {
34 n, err = d.backend.ReadAt(p[:size-off], off)
35 if err == nil {
36 err = io.EOF
37 }
38 return
39 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020040 return d.backend.ReadAt(p, off)
41}
42
43func (d *Device) WriteAt(p []byte, off int64) (n int, err error) {
Jan Schära6da1712024-08-21 15:12:11 +020044 size := d.blockSize * d.blockCount
45 if off > size {
46 return 0, ErrOutOfBounds
47 }
48 if int64(len(p)) > size-off {
49 n, err = d.backend.WriteAt(p[:size-off], off)
50 if err == nil {
51 err = ErrOutOfBounds
52 }
53 return
54 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020055 return d.backend.WriteAt(p, off)
56}
57
58func (d *Device) Close() error {
59 return d.backend.Close()
60}
61
62func (d *Device) BlockCount() int64 {
63 return d.blockCount
64}
65
66func (d *Device) BlockSize() int64 {
67 return d.blockSize
68}
69
Jan Schära6da1712024-08-21 15:12:11 +020070func (d *Device) OptimalBlockSize() int64 {
71 return d.blockSize
72}
73
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020074func (d *Device) Discard(startByte int64, endByte int64) error {
Jan Schära6da1712024-08-21 15:12:11 +020075 if err := validAlignedRange(d, startByte, endByte); err != nil {
76 return err
77 }
78 if startByte == endByte {
79 return nil
80 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020081 var args [2]uint64
82 var err unix.Errno
83 args[0] = uint64(startByte)
Jan Schär0ea961c2024-04-11 13:41:40 +020084 args[1] = uint64(endByte - startByte)
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020085 if ctrlErr := d.rawConn.Control(func(fd uintptr) {
86 _, _, err = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKDISCARD, uintptr(unsafe.Pointer(&args[0])))
87 }); ctrlErr != nil {
88 return ctrlErr
89 }
90 if err == unix.EOPNOTSUPP {
Lorenz Brun65b1c682023-09-14 15:49:39 +020091 return errors.ErrUnsupported
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020092 }
93 if err != unix.Errno(0) {
94 return fmt.Errorf("failed to discard: %w", err)
95 }
96 return nil
97}
98
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020099func (d *Device) Zero(startByte int64, endByte int64) error {
Jan Schära6da1712024-08-21 15:12:11 +0200100 if err := validAlignedRange(d, startByte, endByte); err != nil {
101 return err
102 }
103 if startByte == endByte {
104 return nil
105 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200106 var args [2]uint64
107 var err error
108 args[0] = uint64(startByte)
Jan Schär0ea961c2024-04-11 13:41:40 +0200109 args[1] = uint64(endByte - startByte)
Tim Windelschmidt06c19642024-04-23 15:07:40 +0200110 ctrlErr := d.rawConn.Control(func(fd uintptr) {
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200111 // Attempts to leverage discard guarantees to provide extremely quick
112 // metadata-only zeroing.
113 err = unix.Fallocate(int(fd), unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, startByte, endByte-startByte)
Tim Windelschmidtd5f851b2024-04-23 14:59:37 +0200114 if errors.Is(err, unix.EOPNOTSUPP) {
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200115 // Tries Write Same and friends and then just falls back to writing
116 // zeroes.
Tim Windelschmidt06c19642024-04-23 15:07:40 +0200117 _, _, errNo := unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKZEROOUT, uintptr(unsafe.Pointer(&args[0])))
118 if errNo == unix.Errno(0) {
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200119 err = nil
Tim Windelschmidt06c19642024-04-23 15:07:40 +0200120 } else {
121 err = errNo
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200122 }
123 }
Tim Windelschmidt06c19642024-04-23 15:07:40 +0200124 })
125 if ctrlErr != nil {
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200126 return ctrlErr
127 }
128 if err != nil {
129 return fmt.Errorf("failed to zero out: %w", err)
130 }
131 return nil
132}
133
Jan Schära6da1712024-08-21 15:12:11 +0200134func (d *Device) Sync() error {
135 return d.backend.Sync()
136}
137
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200138// RefreshPartitionTable refreshes the kernel's view of the partition table
139// after changes made from userspace.
140func (d *Device) RefreshPartitionTable() error {
141 var err unix.Errno
142 if ctrlErr := d.rawConn.Control(func(fd uintptr) {
143 _, _, err = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKRRPART, 0)
144 }); ctrlErr != nil {
145 return ctrlErr
146 }
147 if err != unix.Errno(0) {
148 return fmt.Errorf("ioctl(BLKRRPART): %w", err)
149 }
150 return nil
151}
152
Jan Schär5c82e0d2024-08-26 17:06:13 +0200153// ResizePartition updates the start and length of one partition in the kernel.
154// This can be used as an alternative to RefreshPartitionTable, which cannot
155// be used if any partition on this device is currently mounted.
156func (d *Device) ResizePartition(partitionNo int32, startByte, lengthBytes int64) error {
157 var ioctlPins runtime.Pinner
158 defer ioctlPins.Unpin()
159
160 partition := unix.BlkpgPartition{
161 Start: startByte,
162 Length: lengthBytes,
163 Pno: partitionNo,
164 }
165 ioctlPins.Pin(&partition)
166 arg := unix.BlkpgIoctlArg{
167 Op: unix.BLKPG_RESIZE_PARTITION,
168 Datalen: int32(unsafe.Sizeof(partition)),
169 Data: (*byte)(unsafe.Pointer(&partition)),
170 }
171
172 var err unix.Errno
173 if ctrlErr := d.rawConn.Control(func(fd uintptr) {
174 _, _, err = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKPG, uintptr(unsafe.Pointer(&arg)))
175 }); ctrlErr != nil {
176 return ctrlErr
177 }
178 if err != unix.Errno(0) {
179 return fmt.Errorf("ioctl(BLKPG): %w", err)
180 }
181 return nil
182}
183
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200184// Open opens a block device given a path to its inode.
Lorenz Brun8eb02442025-02-25 16:57:52 +0100185func Open(path string, opts ...Option) (*Device, error) {
186 var o options
187 o.collect(opts)
188 flags := o.genericFlags()
189 if o.direct {
190 flags |= unix.O_DIRECT
191 }
192 if o.exclusive {
193 flags |= unix.O_EXCL
194 }
195
196 outFile, err := os.OpenFile(path, flags, 0640)
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200197 if err != nil {
198 return nil, fmt.Errorf("failed to open block device: %w", err)
199 }
200 return FromFileHandle(outFile)
201}
202
203// FromFileHandle creates a blockdev from a device handle. The device handle is
204// not duplicated, closing the returned Device will close it. If the handle is
205// not a block device, i.e does not implement block device ioctls, an error is
206// returned.
207func FromFileHandle(handle *os.File) (*Device, error) {
208 outFileC, err := handle.SyscallConn()
209 if err != nil {
210 return nil, fmt.Errorf("error getting SyscallConn: %w", err)
211 }
212 var blockSize uint32
213 outFileC.Control(func(fd uintptr) {
214 blockSize, err = unix.IoctlGetUint32(int(fd), unix.BLKSSZGET)
215 })
216 if errors.Is(err, unix.ENOTTY) || errors.Is(err, unix.EINVAL) {
217 return nil, ErrNotBlockDevice
218 } else if err != nil {
219 return nil, fmt.Errorf("when querying disk block size: %w", err)
220 }
221
222 var sizeBytes uint64
Tim Windelschmidt06c19642024-04-23 15:07:40 +0200223 var getSizeErr syscall.Errno
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200224 outFileC.Control(func(fd uintptr) {
225 _, _, getSizeErr = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKGETSIZE64, uintptr(unsafe.Pointer(&sizeBytes)))
226 })
227
228 if getSizeErr != unix.Errno(0) {
229 return nil, fmt.Errorf("when querying disk block count: %w", err)
230 }
231 if sizeBytes%uint64(blockSize) != 0 {
232 return nil, fmt.Errorf("block device size is not an integer multiple of its block size (%d %% %d = %d)", sizeBytes, blockSize, sizeBytes%uint64(blockSize))
233 }
234 return &Device{
235 backend: handle,
236 rawConn: outFileC,
237 blockSize: int64(blockSize),
238 blockCount: int64(sizeBytes) / int64(blockSize),
239 }, nil
240}
241
// File is a block-device-like object backed by a regular file.
//
// blockSize * blockCount is the logical size in bytes enforced by the methods
// of this type; rawConn gives access to the file descriptor of backend for
// fallocate and copy_file_range calls.
type File struct {
	backend    *os.File
	rawConn    syscall.RawConn
	blockSize  int64
	blockCount int64
}

// CreateFile creates the backing file name (via os.Create, which truncates an
// existing file) and returns a File with the given geometry.
//
// blockSize must be a power of two and at least 512 bytes. blockCount must
// not be negative, and blockSize * blockCount must fit into an int64. All
// arguments are validated before the file is created, so an invalid call
// leaves the filesystem untouched.
//
// The backing file is not pre-sized here; blockSize and blockCount only
// define the logical bounds used by the File methods.
func CreateFile(name string, blockSize int64, blockCount int64) (*File, error) {
	if blockSize < 512 {
		return nil, fmt.Errorf("blockSize must be at least 512 bytes")
	}
	if bits.OnesCount64(uint64(blockSize)) != 1 {
		return nil, fmt.Errorf("blockSize must be a power of two")
	}
	if blockCount < 0 {
		return nil, fmt.Errorf("blockCount must not be negative")
	}
	// 1<<63 - 1 is the largest int64; every bounds check in this type
	// computes blockSize * blockCount and would silently wrap otherwise.
	if blockCount > (1<<63-1)/blockSize {
		return nil, fmt.Errorf("blockSize * blockCount overflows int64")
	}
	out, err := os.Create(name)
	if err != nil {
		return nil, fmt.Errorf("when creating backing file: %w", err)
	}
	rawConn, err := out.SyscallConn()
	if err != nil {
		// Nobody else holds the handle yet, so release it here. The close
		// error is dropped in favour of the more relevant one.
		_ = out.Close()
		return nil, fmt.Errorf("unable to get SyscallConn: %w", err)
	}
	return &File{
		backend:    out,
		blockSize:  blockSize,
		rawConn:    rawConn,
		blockCount: blockCount,
	}, nil
}
271
272func (d *File) ReadAt(p []byte, off int64) (n int, err error) {
Jan Schära6da1712024-08-21 15:12:11 +0200273 size := d.blockSize * d.blockCount
274 if off > size {
275 return 0, io.EOF
276 }
277 if int64(len(p)) > size-off {
278 n, err = d.backend.ReadAt(p[:size-off], off)
279 if err == nil {
280 err = io.EOF
281 }
282 return
283 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200284 return d.backend.ReadAt(p, off)
285}
286
287func (d *File) WriteAt(p []byte, off int64) (n int, err error) {
Jan Schära6da1712024-08-21 15:12:11 +0200288 size := d.blockSize * d.blockCount
289 if off > size {
290 return 0, ErrOutOfBounds
291 }
292 if int64(len(p)) > size-off {
293 n, err = d.backend.WriteAt(p[:size-off], off)
294 if err == nil {
295 err = ErrOutOfBounds
296 }
297 return
298 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200299 return d.backend.WriteAt(p, off)
300}
301
Jan Schäre0db72c2025-06-18 18:14:07 +0000302func (d *File) ReadFromAt(r io.Reader, off int64) (n int64, err error) {
303 size := d.blockSize * d.blockCount
304 if off > size || off < 0 {
305 return 0, ErrOutOfBounds
306 }
307 limit := size - off
308 ur := r
309 lr, lrOK := r.(*io.LimitedReader)
310 if lrOK {
311 ur = lr.R
312 limit = min(limit, lr.N)
313 }
314 n, handled, err := d.doCopyFileRange(ur, off, limit)
315 if lrOK {
316 lr.N -= n
317 }
318 off += n
319 if !handled {
320 var fallbackN int64
321 fallbackN, err = genericReadFromAt(d, r, off)
322 n += fallbackN
323 return
324 }
325 if err == nil && off == size {
326 // Return an error if we have not reached EOF.
327 moreN, moreErr := io.CopyN(io.Discard, r, 1)
328 if moreN != 0 {
329 err = ErrOutOfBounds
330 } else if moreErr != io.EOF {
331 err = moreErr
332 }
333 }
334 return
335}
336
// maxCopyFileRangeRound is the largest number of bytes requested from a
// single copy_file_range call in doCopyFileRange.
//
// The value is taken from Go's src/internal/poll/copy_file_range_linux.go. It
// is written here as the largest signed 32-bit value rounded down to a
// multiple of 4096, which evaluates to 0x7ffff000.
const maxCopyFileRangeRound = (1<<31 - 1) &^ (1<<12 - 1)
339
340// doCopyFileRange attempts to copy using the copy_file_range syscall.
341//
342// This is only implemented for [File] because Linux does not support this
343// syscall on block devices.
344func (d *File) doCopyFileRange(r io.Reader, off int64, remain int64) (written int64, handled bool, err error) {
345 if remain <= 0 {
346 handled = true
347 return
348 }
349 // Note: We should also check for os.fileWithoutWriteTo, but that type isn't
350 // exported. This means that this optimization won't work if the top-level
351 // copy is io.Copy, but it does work with io.CopyN and w.ReadFrom(r).
352 src, srcOK := r.(*os.File)
353 if !srcOK {
354 return
355 }
356 srcConn, err := src.SyscallConn()
357 if err != nil {
358 return
359 }
360 // We need a read lock of src, because its file offset is used and updated.
361 // We don't need a lock of dest, because its file offset is not used.
362 readErr := srcConn.Read(func(srcFD uintptr) bool {
363 controlErr := d.rawConn.Control(func(destFD uintptr) {
364 handled = true
365 for remain > 0 {
366 n := int(min(remain, maxCopyFileRangeRound))
367 n, err = unix.CopyFileRange(int(srcFD), nil, int(destFD), &off, n, 0)
368 if n > 0 {
369 remain -= int64(n)
370 written += int64(n)
371 // The kernel adds n to off.
372 }
373 // See handleCopyFileRangeErr in
374 // src/internal/poll/copy_file_range_linux.go
375 if err != nil {
376 if errors.Is(err, unix.ENOSYS) || errors.Is(err, unix.EXDEV) ||
377 errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EIO) ||
378 errors.Is(err, unix.EOPNOTSUPP) || errors.Is(err, unix.EPERM) {
379 handled = false
380 }
381 break
382 } else if n == 0 {
383 if written == 0 {
384 handled = false
385 }
386 break
387 }
388 }
389 })
390 if err == nil {
391 err = controlErr
392 }
393 return true
394 })
395 if err == nil {
396 err = readErr
397 }
398 return
399}
400
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200401func (d *File) Close() error {
402 return d.backend.Close()
403}
404
405func (d *File) BlockCount() int64 {
406 return d.blockCount
407}
408
409func (d *File) BlockSize() int64 {
410 return d.blockSize
411}
412
Jan Schära6da1712024-08-21 15:12:11 +0200413func (d *File) OptimalBlockSize() int64 {
414 return d.blockSize
415}
416
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200417func (d *File) Discard(startByte int64, endByte int64) error {
Jan Schära6da1712024-08-21 15:12:11 +0200418 if err := validAlignedRange(d, startByte, endByte); err != nil {
419 return err
420 }
421 if startByte == endByte {
422 return nil
423 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200424 var err error
425 if ctrlErr := d.rawConn.Control(func(fd uintptr) {
426 // There is FALLOC_FL_NO_HIDE_STALE, but it's not implemented by
427 // any filesystem right now, so let's not attempt it for the time being.
428 err = unix.Fallocate(int(fd), unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, startByte, endByte-startByte)
429 }); ctrlErr != nil {
430 return ctrlErr
431 }
432 if errors.Is(err, unix.EOPNOTSUPP) {
Lorenz Brun65b1c682023-09-14 15:49:39 +0200433 return errors.ErrUnsupported
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200434 }
Jan Schär0ea961c2024-04-11 13:41:40 +0200435 if err != nil {
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200436 return fmt.Errorf("failed to discard: %w", err)
437 }
438 return nil
439}
440
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200441func (d *File) Zero(startByte int64, endByte int64) error {
Jan Schära6da1712024-08-21 15:12:11 +0200442 if err := validAlignedRange(d, startByte, endByte); err != nil {
443 return err
444 }
445 if startByte == endByte {
446 return nil
447 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200448 var err error
449 if ctrlErr := d.rawConn.Control(func(fd uintptr) {
450 // Tell the filesystem to punch out the given blocks.
451 err = unix.Fallocate(int(fd), unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, startByte, endByte-startByte)
452 }); ctrlErr != nil {
453 return ctrlErr
454 }
455 // If unsupported or the syscall is not available (for example in a sandbox)
456 // fall back to the generic software implementation.
457 if errors.Is(err, unix.EOPNOTSUPP) || errors.Is(err, unix.ENOSYS) {
458 return GenericZero(d, startByte, endByte)
459 }
460 if err != nil {
461 return fmt.Errorf("failed to zero out: %w", err)
462 }
463 return nil
464}
Jan Schära6da1712024-08-21 15:12:11 +0200465
466func (d *File) Sync() error {
467 return d.backend.Sync()
468}