blob: fbcbf5b84062965e3f9b2fa159f561882576120c [file] [log] [blame]
Tim Windelschmidt6d33a432025-02-04 14:34:25 +01001// Copyright The Monogon Project Authors.
2// SPDX-License-Identifier: Apache-2.0
3
Lorenz Brun1e0e3a42023-06-28 16:40:18 +02004//go:build linux
5
6package blockdev
7
8import (
9 "errors"
10 "fmt"
Jan Schära6da1712024-08-21 15:12:11 +020011 "io"
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020012 "math/bits"
13 "os"
Jan Schär5c82e0d2024-08-26 17:06:13 +020014 "runtime"
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020015 "syscall"
16 "unsafe"
17
18 "golang.org/x/sys/unix"
19)
20
// Device is a handle to an opened block device. Reads and writes go through
// backend, while rawConn is used to issue ioctls against the same file.
type Device struct {
	// backend is the opened device file; Close closes it.
	backend *os.File
	// rawConn is the syscall.RawConn obtained from backend.
	rawConn syscall.RawConn
	// blockSize is the size of one block in bytes and blockCount the number
	// of such blocks; their product is the device size in bytes.
	blockSize, blockCount int64
}
27
28func (d *Device) ReadAt(p []byte, off int64) (n int, err error) {
Jan Schära6da1712024-08-21 15:12:11 +020029 size := d.blockSize * d.blockCount
30 if off > size {
31 return 0, io.EOF
32 }
33 if int64(len(p)) > size-off {
34 n, err = d.backend.ReadAt(p[:size-off], off)
35 if err == nil {
36 err = io.EOF
37 }
38 return
39 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020040 return d.backend.ReadAt(p, off)
41}
42
43func (d *Device) WriteAt(p []byte, off int64) (n int, err error) {
Jan Schära6da1712024-08-21 15:12:11 +020044 size := d.blockSize * d.blockCount
45 if off > size {
46 return 0, ErrOutOfBounds
47 }
48 if int64(len(p)) > size-off {
49 n, err = d.backend.WriteAt(p[:size-off], off)
50 if err == nil {
51 err = ErrOutOfBounds
52 }
53 return
54 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020055 return d.backend.WriteAt(p, off)
56}
57
58func (d *Device) Close() error {
59 return d.backend.Close()
60}
61
62func (d *Device) BlockCount() int64 {
63 return d.blockCount
64}
65
66func (d *Device) BlockSize() int64 {
67 return d.blockSize
68}
69
Jan Schära6da1712024-08-21 15:12:11 +020070func (d *Device) OptimalBlockSize() int64 {
71 return d.blockSize
72}
73
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020074func (d *Device) Discard(startByte int64, endByte int64) error {
Jan Schära6da1712024-08-21 15:12:11 +020075 if err := validAlignedRange(d, startByte, endByte); err != nil {
76 return err
77 }
78 if startByte == endByte {
79 return nil
80 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020081 var args [2]uint64
82 var err unix.Errno
83 args[0] = uint64(startByte)
Jan Schär0ea961c2024-04-11 13:41:40 +020084 args[1] = uint64(endByte - startByte)
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020085 if ctrlErr := d.rawConn.Control(func(fd uintptr) {
86 _, _, err = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKDISCARD, uintptr(unsafe.Pointer(&args[0])))
87 }); ctrlErr != nil {
88 return ctrlErr
89 }
90 if err == unix.EOPNOTSUPP {
Lorenz Brun65b1c682023-09-14 15:49:39 +020091 return errors.ErrUnsupported
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020092 }
93 if err != unix.Errno(0) {
94 return fmt.Errorf("failed to discard: %w", err)
95 }
96 return nil
97}
98
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020099func (d *Device) Zero(startByte int64, endByte int64) error {
Jan Schära6da1712024-08-21 15:12:11 +0200100 if err := validAlignedRange(d, startByte, endByte); err != nil {
101 return err
102 }
103 if startByte == endByte {
104 return nil
105 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200106 var args [2]uint64
107 var err error
108 args[0] = uint64(startByte)
Jan Schär0ea961c2024-04-11 13:41:40 +0200109 args[1] = uint64(endByte - startByte)
Tim Windelschmidt06c19642024-04-23 15:07:40 +0200110 ctrlErr := d.rawConn.Control(func(fd uintptr) {
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200111 // Attempts to leverage discard guarantees to provide extremely quick
112 // metadata-only zeroing.
113 err = unix.Fallocate(int(fd), unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, startByte, endByte-startByte)
Tim Windelschmidtd5f851b2024-04-23 14:59:37 +0200114 if errors.Is(err, unix.EOPNOTSUPP) {
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200115 // Tries Write Same and friends and then just falls back to writing
116 // zeroes.
Tim Windelschmidt06c19642024-04-23 15:07:40 +0200117 _, _, errNo := unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKZEROOUT, uintptr(unsafe.Pointer(&args[0])))
118 if errNo == unix.Errno(0) {
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200119 err = nil
Tim Windelschmidt06c19642024-04-23 15:07:40 +0200120 } else {
121 err = errNo
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200122 }
123 }
Tim Windelschmidt06c19642024-04-23 15:07:40 +0200124 })
125 if ctrlErr != nil {
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200126 return ctrlErr
127 }
128 if err != nil {
129 return fmt.Errorf("failed to zero out: %w", err)
130 }
131 return nil
132}
133
Jan Schära6da1712024-08-21 15:12:11 +0200134func (d *Device) Sync() error {
135 return d.backend.Sync()
136}
137
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200138// RefreshPartitionTable refreshes the kernel's view of the partition table
139// after changes made from userspace.
140func (d *Device) RefreshPartitionTable() error {
141 var err unix.Errno
142 if ctrlErr := d.rawConn.Control(func(fd uintptr) {
143 _, _, err = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKRRPART, 0)
144 }); ctrlErr != nil {
145 return ctrlErr
146 }
147 if err != unix.Errno(0) {
148 return fmt.Errorf("ioctl(BLKRRPART): %w", err)
149 }
150 return nil
151}
152
Jan Schär5c82e0d2024-08-26 17:06:13 +0200153// ResizePartition updates the start and length of one partition in the kernel.
154// This can be used as an alternative to RefreshPartitionTable, which cannot
155// be used if any partition on this device is currently mounted.
156func (d *Device) ResizePartition(partitionNo int32, startByte, lengthBytes int64) error {
157 var ioctlPins runtime.Pinner
158 defer ioctlPins.Unpin()
159
160 partition := unix.BlkpgPartition{
161 Start: startByte,
162 Length: lengthBytes,
163 Pno: partitionNo,
164 }
165 ioctlPins.Pin(&partition)
166 arg := unix.BlkpgIoctlArg{
167 Op: unix.BLKPG_RESIZE_PARTITION,
168 Datalen: int32(unsafe.Sizeof(partition)),
169 Data: (*byte)(unsafe.Pointer(&partition)),
170 }
171
172 var err unix.Errno
173 if ctrlErr := d.rawConn.Control(func(fd uintptr) {
174 _, _, err = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKPG, uintptr(unsafe.Pointer(&arg)))
175 }); ctrlErr != nil {
176 return ctrlErr
177 }
178 if err != unix.Errno(0) {
179 return fmt.Errorf("ioctl(BLKPG): %w", err)
180 }
181 return nil
182}
183
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200184// Open opens a block device given a path to its inode.
Lorenz Brun8eb02442025-02-25 16:57:52 +0100185func Open(path string, opts ...Option) (*Device, error) {
186 var o options
187 o.collect(opts)
188 flags := o.genericFlags()
189 if o.direct {
190 flags |= unix.O_DIRECT
191 }
192 if o.exclusive {
193 flags |= unix.O_EXCL
194 }
195
196 outFile, err := os.OpenFile(path, flags, 0640)
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200197 if err != nil {
198 return nil, fmt.Errorf("failed to open block device: %w", err)
199 }
200 return FromFileHandle(outFile)
201}
202
203// FromFileHandle creates a blockdev from a device handle. The device handle is
204// not duplicated, closing the returned Device will close it. If the handle is
205// not a block device, i.e does not implement block device ioctls, an error is
206// returned.
207func FromFileHandle(handle *os.File) (*Device, error) {
208 outFileC, err := handle.SyscallConn()
209 if err != nil {
210 return nil, fmt.Errorf("error getting SyscallConn: %w", err)
211 }
212 var blockSize uint32
213 outFileC.Control(func(fd uintptr) {
214 blockSize, err = unix.IoctlGetUint32(int(fd), unix.BLKSSZGET)
215 })
216 if errors.Is(err, unix.ENOTTY) || errors.Is(err, unix.EINVAL) {
217 return nil, ErrNotBlockDevice
218 } else if err != nil {
219 return nil, fmt.Errorf("when querying disk block size: %w", err)
220 }
221
222 var sizeBytes uint64
Tim Windelschmidt06c19642024-04-23 15:07:40 +0200223 var getSizeErr syscall.Errno
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200224 outFileC.Control(func(fd uintptr) {
225 _, _, getSizeErr = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKGETSIZE64, uintptr(unsafe.Pointer(&sizeBytes)))
226 })
227
228 if getSizeErr != unix.Errno(0) {
229 return nil, fmt.Errorf("when querying disk block count: %w", err)
230 }
231 if sizeBytes%uint64(blockSize) != 0 {
232 return nil, fmt.Errorf("block device size is not an integer multiple of its block size (%d %% %d = %d)", sizeBytes, blockSize, sizeBytes%uint64(blockSize))
233 }
234 return &Device{
235 backend: handle,
236 rawConn: outFileC,
237 blockSize: int64(blockSize),
238 blockCount: int64(sizeBytes) / int64(blockSize),
239 }, nil
240}
241
// File is a block device backed by a regular file. Reads and writes go
// through backend, while rawConn is used for fallocate calls on the same
// file.
type File struct {
	// backend is the backing file; Close closes it.
	backend *os.File
	// rawConn is the syscall.RawConn obtained from backend.
	rawConn syscall.RawConn
	// blockSize is the size of one block in bytes and blockCount the number
	// of such blocks; their product is the size exposed to callers.
	blockSize, blockCount int64
}
248
249func CreateFile(name string, blockSize int64, blockCount int64) (*File, error) {
250 if blockSize < 512 {
Jan Schära6da1712024-08-21 15:12:11 +0200251 return nil, fmt.Errorf("blockSize must be at least 512 bytes")
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200252 }
253 if bits.OnesCount64(uint64(blockSize)) != 1 {
254 return nil, fmt.Errorf("blockSize must be a power of two")
255 }
256 out, err := os.Create(name)
257 if err != nil {
258 return nil, fmt.Errorf("when creating backing file: %w", err)
259 }
260 rawConn, err := out.SyscallConn()
261 if err != nil {
262 return nil, fmt.Errorf("unable to get SyscallConn: %w", err)
263 }
264 return &File{
265 backend: out,
266 blockSize: blockSize,
267 rawConn: rawConn,
268 blockCount: blockCount,
269 }, nil
270}
271
272func (d *File) ReadAt(p []byte, off int64) (n int, err error) {
Jan Schära6da1712024-08-21 15:12:11 +0200273 size := d.blockSize * d.blockCount
274 if off > size {
275 return 0, io.EOF
276 }
277 if int64(len(p)) > size-off {
278 n, err = d.backend.ReadAt(p[:size-off], off)
279 if err == nil {
280 err = io.EOF
281 }
282 return
283 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200284 return d.backend.ReadAt(p, off)
285}
286
287func (d *File) WriteAt(p []byte, off int64) (n int, err error) {
Jan Schära6da1712024-08-21 15:12:11 +0200288 size := d.blockSize * d.blockCount
289 if off > size {
290 return 0, ErrOutOfBounds
291 }
292 if int64(len(p)) > size-off {
293 n, err = d.backend.WriteAt(p[:size-off], off)
294 if err == nil {
295 err = ErrOutOfBounds
296 }
297 return
298 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200299 return d.backend.WriteAt(p, off)
300}
301
302func (d *File) Close() error {
303 return d.backend.Close()
304}
305
306func (d *File) BlockCount() int64 {
307 return d.blockCount
308}
309
310func (d *File) BlockSize() int64 {
311 return d.blockSize
312}
313
Jan Schära6da1712024-08-21 15:12:11 +0200314func (d *File) OptimalBlockSize() int64 {
315 return d.blockSize
316}
317
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200318func (d *File) Discard(startByte int64, endByte int64) error {
Jan Schära6da1712024-08-21 15:12:11 +0200319 if err := validAlignedRange(d, startByte, endByte); err != nil {
320 return err
321 }
322 if startByte == endByte {
323 return nil
324 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200325 var err error
326 if ctrlErr := d.rawConn.Control(func(fd uintptr) {
327 // There is FALLOC_FL_NO_HIDE_STALE, but it's not implemented by
328 // any filesystem right now, so let's not attempt it for the time being.
329 err = unix.Fallocate(int(fd), unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, startByte, endByte-startByte)
330 }); ctrlErr != nil {
331 return ctrlErr
332 }
333 if errors.Is(err, unix.EOPNOTSUPP) {
Lorenz Brun65b1c682023-09-14 15:49:39 +0200334 return errors.ErrUnsupported
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200335 }
Jan Schär0ea961c2024-04-11 13:41:40 +0200336 if err != nil {
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200337 return fmt.Errorf("failed to discard: %w", err)
338 }
339 return nil
340}
341
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200342func (d *File) Zero(startByte int64, endByte int64) error {
Jan Schära6da1712024-08-21 15:12:11 +0200343 if err := validAlignedRange(d, startByte, endByte); err != nil {
344 return err
345 }
346 if startByte == endByte {
347 return nil
348 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200349 var err error
350 if ctrlErr := d.rawConn.Control(func(fd uintptr) {
351 // Tell the filesystem to punch out the given blocks.
352 err = unix.Fallocate(int(fd), unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, startByte, endByte-startByte)
353 }); ctrlErr != nil {
354 return ctrlErr
355 }
356 // If unsupported or the syscall is not available (for example in a sandbox)
357 // fall back to the generic software implementation.
358 if errors.Is(err, unix.EOPNOTSUPP) || errors.Is(err, unix.ENOSYS) {
359 return GenericZero(d, startByte, endByte)
360 }
361 if err != nil {
362 return fmt.Errorf("failed to zero out: %w", err)
363 }
364 return nil
365}
Jan Schära6da1712024-08-21 15:12:11 +0200366
367func (d *File) Sync() error {
368 return d.backend.Sync()
369}