blob: f6d5b4c5ac82aacfb76ea69bad7452b38bde14bf [file] [log] [blame]
Lorenz Brun1e0e3a42023-06-28 16:40:18 +02001//go:build linux
2
3package blockdev
4
5import (
6 "errors"
7 "fmt"
Jan Schära6da1712024-08-21 15:12:11 +02008 "io"
Lorenz Brun1e0e3a42023-06-28 16:40:18 +02009 "math/bits"
10 "os"
11 "syscall"
12 "unsafe"
13
14 "golang.org/x/sys/unix"
15)
16
// Device is a handle to a Linux block device. It bundles the open file with
// its raw syscall connection and the block geometry of the device.
type Device struct {
	// backend is the open file used for reads, writes, sync and close.
	backend *os.File
	// rawConn gives access to the file descriptor for ioctl and fallocate
	// calls.
	rawConn syscall.RawConn
	// blockSize is the size of one block in bytes; blockCount is the number
	// of such blocks. Their product is the addressable size in bytes.
	blockSize, blockCount int64
}
23
24func (d *Device) ReadAt(p []byte, off int64) (n int, err error) {
Jan Schära6da1712024-08-21 15:12:11 +020025 size := d.blockSize * d.blockCount
26 if off > size {
27 return 0, io.EOF
28 }
29 if int64(len(p)) > size-off {
30 n, err = d.backend.ReadAt(p[:size-off], off)
31 if err == nil {
32 err = io.EOF
33 }
34 return
35 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020036 return d.backend.ReadAt(p, off)
37}
38
39func (d *Device) WriteAt(p []byte, off int64) (n int, err error) {
Jan Schära6da1712024-08-21 15:12:11 +020040 size := d.blockSize * d.blockCount
41 if off > size {
42 return 0, ErrOutOfBounds
43 }
44 if int64(len(p)) > size-off {
45 n, err = d.backend.WriteAt(p[:size-off], off)
46 if err == nil {
47 err = ErrOutOfBounds
48 }
49 return
50 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020051 return d.backend.WriteAt(p, off)
52}
53
54func (d *Device) Close() error {
55 return d.backend.Close()
56}
57
58func (d *Device) BlockCount() int64 {
59 return d.blockCount
60}
61
62func (d *Device) BlockSize() int64 {
63 return d.blockSize
64}
65
Jan Schära6da1712024-08-21 15:12:11 +020066func (d *Device) OptimalBlockSize() int64 {
67 return d.blockSize
68}
69
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020070func (d *Device) Discard(startByte int64, endByte int64) error {
Jan Schära6da1712024-08-21 15:12:11 +020071 if err := validAlignedRange(d, startByte, endByte); err != nil {
72 return err
73 }
74 if startByte == endByte {
75 return nil
76 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020077 var args [2]uint64
78 var err unix.Errno
79 args[0] = uint64(startByte)
Jan Schär0ea961c2024-04-11 13:41:40 +020080 args[1] = uint64(endByte - startByte)
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020081 if ctrlErr := d.rawConn.Control(func(fd uintptr) {
82 _, _, err = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKDISCARD, uintptr(unsafe.Pointer(&args[0])))
83 }); ctrlErr != nil {
84 return ctrlErr
85 }
86 if err == unix.EOPNOTSUPP {
Lorenz Brun65b1c682023-09-14 15:49:39 +020087 return errors.ErrUnsupported
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020088 }
89 if err != unix.Errno(0) {
90 return fmt.Errorf("failed to discard: %w", err)
91 }
92 return nil
93}
94
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020095func (d *Device) Zero(startByte int64, endByte int64) error {
Jan Schära6da1712024-08-21 15:12:11 +020096 if err := validAlignedRange(d, startByte, endByte); err != nil {
97 return err
98 }
99 if startByte == endByte {
100 return nil
101 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200102 var args [2]uint64
103 var err error
104 args[0] = uint64(startByte)
Jan Schär0ea961c2024-04-11 13:41:40 +0200105 args[1] = uint64(endByte - startByte)
Tim Windelschmidt06c19642024-04-23 15:07:40 +0200106 ctrlErr := d.rawConn.Control(func(fd uintptr) {
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200107 // Attempts to leverage discard guarantees to provide extremely quick
108 // metadata-only zeroing.
109 err = unix.Fallocate(int(fd), unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, startByte, endByte-startByte)
Tim Windelschmidtd5f851b2024-04-23 14:59:37 +0200110 if errors.Is(err, unix.EOPNOTSUPP) {
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200111 // Tries Write Same and friends and then just falls back to writing
112 // zeroes.
Tim Windelschmidt06c19642024-04-23 15:07:40 +0200113 _, _, errNo := unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKZEROOUT, uintptr(unsafe.Pointer(&args[0])))
114 if errNo == unix.Errno(0) {
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200115 err = nil
Tim Windelschmidt06c19642024-04-23 15:07:40 +0200116 } else {
117 err = errNo
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200118 }
119 }
Tim Windelschmidt06c19642024-04-23 15:07:40 +0200120 })
121 if ctrlErr != nil {
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200122 return ctrlErr
123 }
124 if err != nil {
125 return fmt.Errorf("failed to zero out: %w", err)
126 }
127 return nil
128}
129
Jan Schära6da1712024-08-21 15:12:11 +0200130func (d *Device) Sync() error {
131 return d.backend.Sync()
132}
133
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200134// RefreshPartitionTable refreshes the kernel's view of the partition table
135// after changes made from userspace.
136func (d *Device) RefreshPartitionTable() error {
137 var err unix.Errno
138 if ctrlErr := d.rawConn.Control(func(fd uintptr) {
139 _, _, err = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKRRPART, 0)
140 }); ctrlErr != nil {
141 return ctrlErr
142 }
143 if err != unix.Errno(0) {
144 return fmt.Errorf("ioctl(BLKRRPART): %w", err)
145 }
146 return nil
147}
148
149// Open opens a block device given a path to its inode.
150// TODO: exclusive, O_DIRECT
151func Open(path string) (*Device, error) {
152 outFile, err := os.OpenFile(path, os.O_RDWR, 0640)
153 if err != nil {
154 return nil, fmt.Errorf("failed to open block device: %w", err)
155 }
156 return FromFileHandle(outFile)
157}
158
159// FromFileHandle creates a blockdev from a device handle. The device handle is
160// not duplicated, closing the returned Device will close it. If the handle is
161// not a block device, i.e does not implement block device ioctls, an error is
162// returned.
163func FromFileHandle(handle *os.File) (*Device, error) {
164 outFileC, err := handle.SyscallConn()
165 if err != nil {
166 return nil, fmt.Errorf("error getting SyscallConn: %w", err)
167 }
168 var blockSize uint32
169 outFileC.Control(func(fd uintptr) {
170 blockSize, err = unix.IoctlGetUint32(int(fd), unix.BLKSSZGET)
171 })
172 if errors.Is(err, unix.ENOTTY) || errors.Is(err, unix.EINVAL) {
173 return nil, ErrNotBlockDevice
174 } else if err != nil {
175 return nil, fmt.Errorf("when querying disk block size: %w", err)
176 }
177
178 var sizeBytes uint64
Tim Windelschmidt06c19642024-04-23 15:07:40 +0200179 var getSizeErr syscall.Errno
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200180 outFileC.Control(func(fd uintptr) {
181 _, _, getSizeErr = unix.Syscall(unix.SYS_IOCTL, fd, unix.BLKGETSIZE64, uintptr(unsafe.Pointer(&sizeBytes)))
182 })
183
184 if getSizeErr != unix.Errno(0) {
185 return nil, fmt.Errorf("when querying disk block count: %w", err)
186 }
187 if sizeBytes%uint64(blockSize) != 0 {
188 return nil, fmt.Errorf("block device size is not an integer multiple of its block size (%d %% %d = %d)", sizeBytes, blockSize, sizeBytes%uint64(blockSize))
189 }
190 return &Device{
191 backend: handle,
192 rawConn: outFileC,
193 blockSize: int64(blockSize),
194 blockCount: int64(sizeBytes) / int64(blockSize),
195 }, nil
196}
197
// File is a block device emulated on top of a regular file. It bundles the
// open file with its raw syscall connection and the block geometry it was
// created with.
type File struct {
	// backend is the open file used for reads, writes, sync and close.
	backend *os.File
	// rawConn gives access to the file descriptor for fallocate calls.
	rawConn syscall.RawConn
	// blockSize is the size of one block in bytes; blockCount is the number
	// of such blocks. Their product is the addressable size in bytes.
	blockSize, blockCount int64
}
204
205func CreateFile(name string, blockSize int64, blockCount int64) (*File, error) {
206 if blockSize < 512 {
Jan Schära6da1712024-08-21 15:12:11 +0200207 return nil, fmt.Errorf("blockSize must be at least 512 bytes")
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200208 }
209 if bits.OnesCount64(uint64(blockSize)) != 1 {
210 return nil, fmt.Errorf("blockSize must be a power of two")
211 }
212 out, err := os.Create(name)
213 if err != nil {
214 return nil, fmt.Errorf("when creating backing file: %w", err)
215 }
216 rawConn, err := out.SyscallConn()
217 if err != nil {
218 return nil, fmt.Errorf("unable to get SyscallConn: %w", err)
219 }
220 return &File{
221 backend: out,
222 blockSize: blockSize,
223 rawConn: rawConn,
224 blockCount: blockCount,
225 }, nil
226}
227
228func (d *File) ReadAt(p []byte, off int64) (n int, err error) {
Jan Schära6da1712024-08-21 15:12:11 +0200229 size := d.blockSize * d.blockCount
230 if off > size {
231 return 0, io.EOF
232 }
233 if int64(len(p)) > size-off {
234 n, err = d.backend.ReadAt(p[:size-off], off)
235 if err == nil {
236 err = io.EOF
237 }
238 return
239 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200240 return d.backend.ReadAt(p, off)
241}
242
243func (d *File) WriteAt(p []byte, off int64) (n int, err error) {
Jan Schära6da1712024-08-21 15:12:11 +0200244 size := d.blockSize * d.blockCount
245 if off > size {
246 return 0, ErrOutOfBounds
247 }
248 if int64(len(p)) > size-off {
249 n, err = d.backend.WriteAt(p[:size-off], off)
250 if err == nil {
251 err = ErrOutOfBounds
252 }
253 return
254 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200255 return d.backend.WriteAt(p, off)
256}
257
258func (d *File) Close() error {
259 return d.backend.Close()
260}
261
262func (d *File) BlockCount() int64 {
263 return d.blockCount
264}
265
266func (d *File) BlockSize() int64 {
267 return d.blockSize
268}
269
Jan Schära6da1712024-08-21 15:12:11 +0200270func (d *File) OptimalBlockSize() int64 {
271 return d.blockSize
272}
273
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200274func (d *File) Discard(startByte int64, endByte int64) error {
Jan Schära6da1712024-08-21 15:12:11 +0200275 if err := validAlignedRange(d, startByte, endByte); err != nil {
276 return err
277 }
278 if startByte == endByte {
279 return nil
280 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200281 var err error
282 if ctrlErr := d.rawConn.Control(func(fd uintptr) {
283 // There is FALLOC_FL_NO_HIDE_STALE, but it's not implemented by
284 // any filesystem right now, so let's not attempt it for the time being.
285 err = unix.Fallocate(int(fd), unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, startByte, endByte-startByte)
286 }); ctrlErr != nil {
287 return ctrlErr
288 }
289 if errors.Is(err, unix.EOPNOTSUPP) {
Lorenz Brun65b1c682023-09-14 15:49:39 +0200290 return errors.ErrUnsupported
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200291 }
Jan Schär0ea961c2024-04-11 13:41:40 +0200292 if err != nil {
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200293 return fmt.Errorf("failed to discard: %w", err)
294 }
295 return nil
296}
297
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200298func (d *File) Zero(startByte int64, endByte int64) error {
Jan Schära6da1712024-08-21 15:12:11 +0200299 if err := validAlignedRange(d, startByte, endByte); err != nil {
300 return err
301 }
302 if startByte == endByte {
303 return nil
304 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200305 var err error
306 if ctrlErr := d.rawConn.Control(func(fd uintptr) {
307 // Tell the filesystem to punch out the given blocks.
308 err = unix.Fallocate(int(fd), unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, startByte, endByte-startByte)
309 }); ctrlErr != nil {
310 return ctrlErr
311 }
312 // If unsupported or the syscall is not available (for example in a sandbox)
313 // fall back to the generic software implementation.
314 if errors.Is(err, unix.EOPNOTSUPP) || errors.Is(err, unix.ENOSYS) {
315 return GenericZero(d, startByte, endByte)
316 }
317 if err != nil {
318 return fmt.Errorf("failed to zero out: %w", err)
319 }
320 return nil
321}
Jan Schära6da1712024-08-21 15:12:11 +0200322
323func (d *File) Sync() error {
324 return d.backend.Sync()
325}