blob: 9877186267d69f7ac0cc63fea6fc86060035ff3d [file] [log] [blame]
// Copyright The Monogon Project Authors.
// SPDX-License-Identifier: Apache-2.0
3
Lorenz Brun1e0e3a42023-06-28 16:40:18 +02004package blockdev
5
6import (
7 "errors"
8 "fmt"
9 "io"
Lorenz Brun8eb02442025-02-25 16:57:52 +010010 "os"
Lorenz Brun1e0e3a42023-06-28 16:40:18 +020011)
12
// ErrNotBlockDevice is a sentinel error with the message "not a block device".
// Nothing in this part of the file returns it; presumably the platform-specific
// open code does (verify there). Match it with errors.Is.
var ErrNotBlockDevice = errors.New("not a block device")
14
// options is the single, platform-independent collection of every open
// option. Defining the options per platform instead would force each call
// site to be split into several per-platform files just to select the ones
// that apply.
//
// readOnly is set by WithReadonly, direct by WithDirect and exclusive by
// WithExclusive.
type options struct {
	readOnly, direct, exclusive bool
}
23
24func (o *options) collect(opts []Option) {
25 for _, f := range opts {
26 f(o)
27 }
28}
29
30func (o *options) genericFlags() int {
31 if o.readOnly {
32 return os.O_RDONLY
33 } else {
34 return os.O_RDWR
35 }
36}
37
38type Option func(*options)
39
40// WithReadonly opens the block device read-only. Any write calls will fail.
41// Passed as an option to Open.
42func WithReadonly(o *options) {
43 o.readOnly = true
44}
45
46// WithDirect opens the block device bypassing any caching by the kernel.
47// Note that additional alignment requirements might be imposed by the
48// underlying device.
49// Unsupported on non-Linux currently, will return an error.
50func WithDirect(o *options) {
51 o.direct = true
52}
53
54// WithExclusive tries to acquire a pseudo-exclusive lock (only with other
55// exclusive FDs) over the block device.
56// Unsupported on non-Linux currently, will return an error.
57func WithExclusive(o *options) {
58 o.exclusive = true
59}
60
// BlockDev is a generic block device consisting of equally-sized blocks.
//
// Every offset and interval is given in bytes. They must be aligned to
// BlockSize, and aligning them to OptimalBlockSize is recommended where
// feasible. Unless stated otherwise, intervals are inclusive-exclusive: the
// start byte is part of the interval, the end byte is not.
type BlockDev interface {
	io.ReaderAt
	io.WriterAt

	// BlockCount returns how many blocks the block device has, or -1 if it is
	// an image with an undefined size.
	BlockCount() int64

	// BlockSize returns the size of one block in bytes. It must be a power of
	// two; 512 and 4096 are the common values, but others are possible.
	BlockSize() int64

	// OptimalBlockSize returns the block size, in bytes, that is optimal both
	// for alignment and for issuing I/O. I/O operations using smaller block
	// sizes might incur read-write overhead. The value is the larger of the
	// physical block size and a device-reported value, if one is available.
	OptimalBlockSize() int64

	// Discard discards a continuous set of blocks, which notifies the
	// underlying device that the data in these blocks is no longer needed.
	// This can improve the performance of the device (it no longer has to
	// preserve the unused data) and of bulk erase operations. The command is
	// advisory and not every implementation supports it. What discarded
	// blocks contain afterwards is implementation-defined.
	Discard(startByte, endByte int64) error

	// Zero zeroes a continuous set of blocks. Some implementations can do
	// this significantly faster than writing zeroes with Write.
	Zero(startByte, endByte int64) error

	// Sync commits the current contents to stable storage.
	Sync() error
}
99
// ReaderFromAt is the positional counterpart of [io.ReaderFrom]: data read
// from r is written starting at the explicit offset off rather than at the
// file offset.
type ReaderFromAt interface {
	ReadFromAt(r io.Reader, off int64) (n int64, err error)
}
105
// writerOnly exposes nothing but the Write method of the [io.Writer] it
// wraps. Any other method of the wrapped value (for example ReadFrom) is
// hidden, so io.Copy cannot pick it up.
type writerOnly struct{ io.Writer }
111
112// genericReadFromAt is a generic implementation which does not use b.ReadFromAt
113// to prevent recursive calls.
114func genericReadFromAt(b BlockDev, r io.Reader, off int64) (int64, error) {
115 w := &writerOnly{Writer: &ReadWriteSeeker{b: b, currPos: off}}
116 return io.Copy(w, r)
117}
118
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200119func NewRWS(b BlockDev) *ReadWriteSeeker {
120 return &ReadWriteSeeker{b: b}
121}
122
123// ReadWriteSeeker provides an adapter implementing ReadWriteSeeker on top of
124// a blockdev.
125type ReadWriteSeeker struct {
126 b BlockDev
127 currPos int64
128}
129
130func (s *ReadWriteSeeker) Read(p []byte) (n int, err error) {
131 n, err = s.b.ReadAt(p, s.currPos)
132 s.currPos += int64(n)
133 return
134}
135
136func (s *ReadWriteSeeker) Write(p []byte) (n int, err error) {
137 n, err = s.b.WriteAt(p, s.currPos)
138 s.currPos += int64(n)
139 return
140}
141
Jan Schäre0db72c2025-06-18 18:14:07 +0000142func (s *ReadWriteSeeker) ReadFrom(r io.Reader) (n int64, err error) {
143 rfa, rfaOK := s.b.(ReaderFromAt)
144 if !rfaOK {
145 w := &writerOnly{Writer: s}
146 return io.Copy(w, r)
147 }
148 n, err = rfa.ReadFromAt(r, s.currPos)
149 s.currPos += n
150 return
151}
152
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200153func (s *ReadWriteSeeker) Seek(offset int64, whence int) (int64, error) {
154 switch whence {
Jan Schära6da1712024-08-21 15:12:11 +0200155 default:
156 return 0, errors.New("Seek: invalid whence")
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200157 case io.SeekStart:
Jan Schära6da1712024-08-21 15:12:11 +0200158 case io.SeekCurrent:
159 offset += s.currPos
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200160 case io.SeekEnd:
Jan Schära6da1712024-08-21 15:12:11 +0200161 offset += s.b.BlockCount() * s.b.BlockSize()
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200162 }
Jan Schära6da1712024-08-21 15:12:11 +0200163 if offset < 0 {
164 return 0, errors.New("Seek: invalid offset")
165 }
166 s.currPos = offset
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200167 return s.currPos, nil
168}
169
// ErrOutOfBounds is returned by Section's WriteAt and ReadFromAt when the
// write starts outside the section or would extend past its end. Match it
// with errors.Is.
var ErrOutOfBounds = errors.New("write out of bounds")
171
172// NewSection returns a new Section, implementing BlockDev over that subset
173// of blocks. The interval is inclusive-exclusive.
Jan Schära6da1712024-08-21 15:12:11 +0200174func NewSection(b BlockDev, startBlock, endBlock int64) (*Section, error) {
175 if startBlock < 0 {
176 return nil, fmt.Errorf("invalid range: startBlock (%d) negative", startBlock)
177 }
178 if startBlock > endBlock {
179 return nil, fmt.Errorf("invalid range: startBlock (%d) bigger than endBlock (%d)", startBlock, endBlock)
180 }
181 if endBlock > b.BlockCount() {
182 return nil, fmt.Errorf("endBlock (%d) out of range (%d)", endBlock, b.BlockCount())
183 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200184 return &Section{
185 b: b,
186 startBlock: startBlock,
187 endBlock: endBlock,
Jan Schära6da1712024-08-21 15:12:11 +0200188 }, nil
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200189}
190
191// Section implements BlockDev on a slice of another BlockDev given a startBlock
192// and endBlock.
193type Section struct {
194 b BlockDev
195 startBlock, endBlock int64
196}
197
198func (s *Section) ReadAt(p []byte, off int64) (n int, err error) {
Jan Schära6da1712024-08-21 15:12:11 +0200199 if off < 0 {
200 return 0, errors.New("blockdev.Section.ReadAt: negative offset")
201 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200202 bOff := off + (s.startBlock * s.b.BlockSize())
203 bytesToEnd := (s.endBlock * s.b.BlockSize()) - bOff
Jan Schära6da1712024-08-21 15:12:11 +0200204 if bytesToEnd < 0 {
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200205 return 0, io.EOF
206 }
207 if bytesToEnd < int64(len(p)) {
Jan Schära6da1712024-08-21 15:12:11 +0200208 n, err := s.b.ReadAt(p[:bytesToEnd], bOff)
209 if err == nil {
210 err = io.EOF
211 }
212 return n, err
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200213 }
214 return s.b.ReadAt(p, bOff)
215}
216
217func (s *Section) WriteAt(p []byte, off int64) (n int, err error) {
218 bOff := off + (s.startBlock * s.b.BlockSize())
219 bytesToEnd := (s.endBlock * s.b.BlockSize()) - bOff
Jan Schära6da1712024-08-21 15:12:11 +0200220 if off < 0 || bytesToEnd < 0 {
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200221 return 0, ErrOutOfBounds
222 }
223 if bytesToEnd < int64(len(p)) {
Jan Schära6da1712024-08-21 15:12:11 +0200224 n, err := s.b.WriteAt(p[:bytesToEnd], bOff)
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200225 if err != nil {
226 // If an error happened, prioritize that error
227 return n, err
228 }
229 // Otherwise, return ErrOutOfBounds as even short writes must return an
230 // error.
231 return n, ErrOutOfBounds
232 }
Jan Schära6da1712024-08-21 15:12:11 +0200233 return s.b.WriteAt(p, bOff)
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200234}
235
Jan Schäre0db72c2025-06-18 18:14:07 +0000236func (s *Section) ReadFromAt(r io.Reader, off int64) (n int64, err error) {
237 rfa, rfaOK := s.b.(ReaderFromAt)
238 if !rfaOK {
239 return genericReadFromAt(s, r, off)
240 }
241 bOff := off + (s.startBlock * s.b.BlockSize())
242 bytesToEnd := (s.endBlock * s.b.BlockSize()) - bOff
243 if off < 0 || bytesToEnd < 0 {
244 return 0, ErrOutOfBounds
245 }
246 ur := r
247 lr, lrOK := r.(*io.LimitedReader)
248 if lrOK {
249 if bytesToEnd >= lr.N {
250 return rfa.ReadFromAt(r, bOff)
251 }
252 ur = lr.R
253 }
254 n, err = rfa.ReadFromAt(io.LimitReader(ur, bytesToEnd), bOff)
255 if lrOK {
256 lr.N -= n
257 }
258 if err == nil && n == bytesToEnd {
259 // Return an error if we have not reached EOF.
260 moreN, moreErr := io.CopyN(io.Discard, r, 1)
261 if moreN != 0 {
262 err = ErrOutOfBounds
263 } else if moreErr != io.EOF {
264 err = moreErr
265 }
266 }
267 return
268}
269
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200270func (s *Section) BlockCount() int64 {
271 return s.endBlock - s.startBlock
272}
273
274func (s *Section) BlockSize() int64 {
275 return s.b.BlockSize()
276}
277
Jan Schära6da1712024-08-21 15:12:11 +0200278func (s *Section) OptimalBlockSize() int64 {
279 return s.b.OptimalBlockSize()
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200280}
281
282func (s *Section) Discard(startByte, endByte int64) error {
Jan Schära6da1712024-08-21 15:12:11 +0200283 if err := validAlignedRange(s, startByte, endByte); err != nil {
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200284 return err
285 }
Jan Schär0ea961c2024-04-11 13:41:40 +0200286 offset := s.startBlock * s.b.BlockSize()
287 return s.b.Discard(offset+startByte, offset+endByte)
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200288}
289
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200290func (s *Section) Zero(startByte, endByte int64) error {
Jan Schära6da1712024-08-21 15:12:11 +0200291 if err := validAlignedRange(s, startByte, endByte); err != nil {
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200292 return err
293 }
Jan Schär0ea961c2024-04-11 13:41:40 +0200294 offset := s.startBlock * s.b.BlockSize()
295 return s.b.Zero(offset+startByte, offset+endByte)
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200296}
297
Jan Schära6da1712024-08-21 15:12:11 +0200298func (s *Section) Sync() error {
299 return s.b.Sync()
300}
301
302func validAlignedRange(b BlockDev, startByte, endByte int64) error {
303 if startByte < 0 {
304 return fmt.Errorf("invalid range: startByte (%d) negative", startByte)
305 }
306 if startByte > endByte {
307 return fmt.Errorf("invalid range: startByte (%d) bigger than endByte (%d)", startByte, endByte)
308 }
309 devLen := b.BlockCount() * b.BlockSize()
310 if endByte > devLen {
311 return fmt.Errorf("endByte (%d) out of range (%d)", endByte, devLen)
312 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200313 if startByte%b.BlockSize() != 0 {
314 return fmt.Errorf("startByte (%d) needs to be aligned to block size (%d)", startByte, b.BlockSize())
315 }
316 if endByte%b.BlockSize() != 0 {
317 return fmt.Errorf("endByte (%d) needs to be aligned to block size (%d)", endByte, b.BlockSize())
318 }
Jan Schära6da1712024-08-21 15:12:11 +0200319 return nil
320}
321
322// GenericZero implements software-based zeroing. This can be used to implement
323// Zero when no acceleration is available or desired.
324func GenericZero(b BlockDev, startByte, endByte int64) error {
325 if err := validAlignedRange(b, startByte, endByte); err != nil {
326 return err
327 }
Lorenz Brun1e0e3a42023-06-28 16:40:18 +0200328 // Choose buffer size close to 16MiB or the range to be zeroed, whatever
329 // is smaller.
330 bufSizeTarget := int64(16 * 1024 * 1024)
331 if endByte-startByte < bufSizeTarget {
332 bufSizeTarget = endByte - startByte
333 }
334 bufSize := (bufSizeTarget / b.BlockSize()) * b.BlockSize()
335 buf := make([]byte, bufSize)
336 for i := startByte; i < endByte; i += bufSize {
337 if endByte-i < bufSize {
338 buf = buf[:endByte-i]
339 }
340 if _, err := b.WriteAt(buf, i); err != nil {
341 return fmt.Errorf("while writing zeroes: %w", err)
342 }
343 }
344 return nil
345}