blob: df7825f5db2cb6940c37ee7cd351fabbb53f1066 [file] [log] [blame]
Serge Bazanskia42844a2024-05-16 18:06:19 +02001// Package qcow2 implements generating QEMU copy-on-write image files.
2package qcow2
3
4// QCOW2 is a peculiar file format, as it's cluster-based (read: block-based) and
5// that blocks are a concept both exposed to the guest (user data storage is
6// expressed in clusters) and used to describe the layout of metadata structures.
7//
8// Most notably, the 'refcount' (reference count) mechanism for clusters is not
9// only for user-visible clusters but also for clusters within the image that
10// contain metadata. This means that users of the format can use the same
11// allocation system for clusters both for allocating clusters exposed to the
12// user and clusters for metadata/housekeeping.
13//
14// We only support a small subset of QCOW2 functionality: writing, qcow2 (no v1
15// or v3) and no extensions. This keeps the codebase quite lean.
16//
17// QCOW2 images are made out of:
18//
19// 1. A file header (occupies first cluster);
20// 2. Optionally, a backing file name (right after header, within the first
21// cluster);
22// 3. A refcount table (made out of two levels, L1 is continuous and pointed to
23// by header, L2 is pointed to by L1, both must be cluster-aligned and occupy a
24// multiple of clusters each). The refcount table is indexed by cluster address
25// and returns whether a cluster is free or in use (by metadata or user data);
26// 4. A cluster mapping table (made out of two levels similarly to the refcount
27// table, but the cluster mapping L1 table does not have to occupy an entire
28// cluster as the file header contains information on how many entries are
29// there). The cluster mapping table is indexed by user address and returns
30// the file cluster address.
31//
32// Our generated files contain a header/backing file name, a refcount table which
33// contains enough entries to cover the metadata of the file itself, and a
34// minimum L1 table (enough to cover the whole user data address range, but with
35// no cluster mapped). The users of the image file will be able to use the
36// cluster mapping L1 table as is, and will likely need to remap the refcount
37// table to fit more file cluster addresses as the image gets written. This is
38// the same kind of layout qemu-img uses.
39//
40// Reference: https://github.com/qemu/qemu/blob/master/docs/interop/qcow2.txt
41
42import (
43 "bytes"
44 "encoding/binary"
45 "fmt"
46 "io"
47 "os"
48)
49
50// builder is the internal structure defining the requested parameters for image
51// generation.
52type builder struct {
53 // size of the image in bytes. Note that this is the size visible to the user,
54 // not the actual size in bytes of the container (which will be in most cases
55 // significantly smaller).
56 size uint64
57 // backingFile is, if non-zero, the backing file for a given qcow2 image. This
58 // makes the qcow2 image function as an overlay on top of the backing file (ie.
59 // offers snapshot functionality).
60 backingFile string
61 // clusterBits is the number of bits that a cluster has. This is usually 16, for
62 // 64KiB clusters.
63 clusterBits uint32
64}
65
66type qcow2Header struct {
67 Magic [4]byte
68 Version uint32
69 BackingFileOffset uint64
70 BackingFileSize uint32
71 ClusterBits uint32
72 Size uint64
73 CryptMethod uint32
74 L1Size uint32
75 L1TableOffset uint64
76 RefcountTableOffset uint64
77 RefcountTableClusters uint32
78 NumberSnapshots uint32
79 SnapshotsOffset uint64
80}
81
82// divRoundUp divides a by b and returns the result rounded up. This is useful to
83// answer the question 'how many b-sized elements do I need to contain a'.
84func divRoundUp(a, b uint64) uint64 {
85 res := a / b
86 if a%b != 0 {
87 res += 1
88 }
89 return res
90}
91
92// clusterSize returns the cluster size for this builder.
93func (b *builder) clusterSize() uint64 {
94 return uint64(1 << b.clusterBits)
95}
96
97// calculateMappingEntries returns the details of the 'cluster mapping'
98// structures (sometimes called 'L1'). This is calculated based on the cluster
99// size and target image size.
100//
101// The values returned are the count of L1 entries needed to cover the entirety
102// of the user data, and the size of such an entry within its table. The latter
103// is constant.
104func (b *builder) calculateMappingEntries() (mappingL1Count uint64, mappingL1EntrySize uint64) {
105 clusterSize := b.clusterSize()
106
107 mappingL2EntrySize := uint64(8)
108 // Each L2 table is always one cluster.
109 mappingL2EntryCount := clusterSize / mappingL2EntrySize
110 // How much user data each L2 table covers.
111 mappingL2Covering := mappingL2EntryCount * clusterSize
112 // How many L1 entries do we need.
113 mappingL1Count = divRoundUp(b.size, mappingL2Covering)
114 // Each L1 entry is always 64 bits.
115 mappingL1EntrySize = uint64(8)
116
117 return
118}
119
120// refcountTableProperties contains the properties of a L1 or L2 refcount table
121// calculated from the cluster size of a qcow2 image.
122type refcountTableProperties struct {
123 // bytesCoveredByCluster describes how many bytes of {meta,}data are addressed by
124 // a cluster full of metadata entries of this refcount table.
125 bytesCoveredByCluster uint64
126 // entriesPerCluster describes how many entries of this refcount table fit in a
127 // full cluster of metadata.
128 entriesPerCluster uint64
129}
130
131// calculateRefcountProperties returns recountTableProperties for L1 and L2
132// refcount tables for the qcow2 image being built. These are static properties
133// calculated based on the cluster size, not on the target size of the image.
134func (b *builder) calculateRefcountProperties() (l1 refcountTableProperties, l2 refcountTableProperties) {
135 clusterSize := b.clusterSize()
136
137 // Size of refcount l2 structure entry (16 bits):
138 refcountL2EntrySize := uint64(2)
139 // Number of entries in refcount l2 structure:
140 refcountL2EntryCount := clusterSize / refcountL2EntrySize
141 // Storage size covered by every refcount l2 structure (each entry points towards
142 // a cluster):
143 refcountL2Covering := refcountL2EntryCount * clusterSize
144
145 // Size of refcount l1 structure entry (64 bits):
146 refcountL1EntrySize := uint64(8)
147 // Number of entries in refcount l1 structure:
148 refcountL1EntryCount := clusterSize / refcountL1EntrySize
149 // Storage size covered by every refcount l1 structure:
150 refcountL1Covering := refcountL1EntryCount * refcountL2Covering
151
152 l1.bytesCoveredByCluster = refcountL1Covering
153 l1.entriesPerCluster = refcountL1EntryCount
154
155 l2.bytesCoveredByCluster = refcountL2Covering
156 l2.entriesPerCluster = refcountL2EntryCount
157 return
158}
159
160// build writes out a qcow2 image (based on the builder's configuration) to a
161// Writer.
162func (b *builder) build(w io.Writer) error {
163 clusterSize := b.clusterSize()
164 // Minimum supported by QEMU and this codebase.
165 if b.clusterSize() < 512 {
166 return fmt.Errorf("cluster size too small")
167 }
168
169 // Size of a serialized qcow2Header.
170 headerSize := 72
171 // And however long the backing file name is (we stuff it after the header,
172 // within the first cluster).
173 headerSize += len(b.backingFile)
174 // Make sure the above fits.
175 if uint64(headerSize) > clusterSize {
176 return fmt.Errorf("cluster size too small")
177 }
178
179 // Cluster mapping structures are just based on the backing storage size and
180 // don't need to cover metadata. This makes it easy to calculate:
181 mappingL1Count, mappingL1EntrySize := b.calculateMappingEntries()
182 mappingL1Clusters := divRoundUp(mappingL1Count*mappingL1EntrySize, clusterSize)
183
184 // For refcount structures we need enough to cover just the metadata of our
185 // image. Unfortunately that includes the refcount structures themselves. We
186 // choose a naïve iterative approach where we start out with zero metadata
187 // structures and keep adding more until we cover everything that's necessary.
188
189 // Coverage, in bytes, for an L1 and L2 refcount cluster full of entries.
190 refcountL1, refcountL2 := b.calculateRefcountProperties()
191
192 // How many clusters of metadata to put in the final image. We start at zero and
193 // iterate upwards.
194 refcountL1Clusters := uint64(0)
195 refcountL2Clusters := uint64(0)
196
197 // How many bytes the current metadata covers. This is also zero at first.
198 maximumL2Covering := refcountL2Clusters * refcountL2.bytesCoveredByCluster
199 maximumL1Covering := refcountL1Clusters * refcountL1.bytesCoveredByCluster
200
201 // The size of our metadata in bytes.
202 sizeMetadata := (1 + mappingL1Clusters + refcountL1Clusters + refcountL2Clusters) * clusterSize
203
204 // Keep adding L1/L2 metadata clusters until we can cover all of our own
205 // metadata. This is an iterative approach to the aforementioned problem. Even
206 // with large images (100TB) at the default cluster size this loop only takes one
207 // iteration to stabilize. Smaller cluster sizes might take longer.
208 for {
209 changed := false
210
211 // Keep adding L2 metadata clusters until we cover all the metadata we know so far.
212 for maximumL2Covering < sizeMetadata {
213 refcountL2Clusters += 1
214 // Recalculate.
215 maximumL2Covering = refcountL2Clusters * refcountL2.bytesCoveredByCluster
216 sizeMetadata += clusterSize
217 changed = true
218 }
219 // Keep adding L1 metadata clusters until we cover all the metadata we know so far.
220 for maximumL1Covering < sizeMetadata {
221 refcountL1Clusters += 1
222 // Recalculate.
223 maximumL1Covering = refcountL1Clusters * refcountL1.bytesCoveredByCluster
224 sizeMetadata += clusterSize
225 changed = true
226 }
227
228 // If no changes were introduced, it means we stabilized at some l1/l2 metadata
229 // cluster count. Exit.
230 if !changed {
231 break
232 }
233 }
234
235 // Now that we have calculated everything, start writing out the header.
236 h := qcow2Header{
237 Version: 2,
238 ClusterBits: b.clusterBits,
239 Size: b.size,
240 // Unencrypted.
241 CryptMethod: 0,
242
243 L1Size: uint32(mappingL1Count),
244 // L1 table starts after the header and refcount tables.
245 L1TableOffset: (1 + refcountL1Clusters + refcountL2Clusters) * clusterSize,
246
247 // Refcount table starts right after header cluster.
248 RefcountTableOffset: 1 * clusterSize,
249 RefcountTableClusters: uint32(refcountL1Clusters),
250
251 // No snapshots.
252 NumberSnapshots: 0,
253 SnapshotsOffset: 0,
254 }
255 copy(h.Magic[:], "QFI\xfb")
256 if b.backingFile != "" {
257 // Backing file path is right after header.
258 h.BackingFileOffset = 72
259 h.BackingFileSize = uint32(len(b.backingFile))
260 }
261 if err := binary.Write(w, binary.BigEndian, &h); err != nil {
262 return err
263 }
264
265 // Write out backing file name right after the header.
266 if b.backingFile != "" {
267 if _, err := w.Write([]byte(b.backingFile)); err != nil {
268 return err
269 }
270 }
271
272 // Align to next cluster with zeroes.
273 if _, err := w.Write(bytes.Repeat([]byte{0}, int(clusterSize)-headerSize)); err != nil {
274 return err
275 }
276
277 // Write L1 refcount table.
278 for i := uint64(0); i < refcountL1Clusters; i++ {
279 for j := uint64(0); j < refcountL1.entriesPerCluster; j++ {
280 // Index of corresponding l2 table.
281 ix := j + i*refcountL1.entriesPerCluster
282
283 var data uint64
284 if ix < refcountL2Clusters {
285 // If this is an allocated L2 table, write its offset. Otherwise write zero.
286 data = (1 + refcountL1Clusters + ix) * clusterSize
287 }
288 if err := binary.Write(w, binary.BigEndian, &data); err != nil {
289 return err
290 }
291 }
292 }
293 // Write L2 refcount table.
294 for i := uint64(0); i < refcountL2Clusters; i++ {
295 for j := uint64(0); j < refcountL2.entriesPerCluster; j++ {
296 // Index of corresponding cluster.
297 ix := j + i*refcountL2.entriesPerCluster
298
299 var data uint16
300 if ix < (1 + refcountL1Clusters + refcountL2Clusters + mappingL1Clusters) {
301 // If this is an in-use cluster, mark it as such.
302 data = 1
303 }
304 if err := binary.Write(w, binary.BigEndian, &data); err != nil {
305 return err
306 }
307 }
308 }
309 // Write L1 mapping table.
310 for i := uint64(0); i < mappingL1Count; i++ {
311 // No user data yet allocated.
312 data := uint64(0)
313 if err := binary.Write(w, binary.BigEndian, &data); err != nil {
314 return err
315 }
316 }
317
318 return nil
319}
320
321// GenerateOption structures are passed to the Generate call.
322type GenerateOption struct {
323 fileSize *uint64
324 backingFilePath *string
325}
326
327// GenerateWithFileSize will generate an image with a given user size in bytes.
328func GenerateWithFileSize(fileSize uint64) *GenerateOption {
329 return &GenerateOption{
330 fileSize: &fileSize,
331 }
332}
333
334// GenerateWithBackingFile will generate an image size backed by a given file
335// path. If GenerateWithFileSize is not given, the path will also be checked at
336// generation time and used to determine the size of the user image.
337func GenerateWithBackingFile(path string) *GenerateOption {
338 return &GenerateOption{
339 backingFilePath: &path,
340 }
341}
342
343// Generate builds a QCOW2 image and writes it to the given Writer.
344//
345// At least one of GenerateWithFileSize or GenerateWithBackingFile must be given.
346func Generate(w io.Writer, opts ...*GenerateOption) error {
347 var size uint64
348 var haveSize bool
349
350 var backingFile string
351 var haveBackingFile bool
352
353 for _, opt := range opts {
354 if opt.fileSize != nil {
355 if haveSize {
356 return fmt.Errorf("cannot specify GenerateWithFileSize twice")
357 }
358 haveSize = true
359
360 size = *opt.fileSize
361 }
362 if opt.backingFilePath != nil {
363 if haveBackingFile {
364 return fmt.Errorf("cannot specify GenerateWithBackingFile twice")
365 }
366 haveBackingFile = true
367
368 backingFile = *opt.backingFilePath
369 }
370 }
371
372 if !haveSize && !haveBackingFile {
373 return fmt.Errorf("must be called with GenerateWithFileSize or GenerateWithBackingFileAndSize")
374 }
375
376 if haveBackingFile && !haveSize {
377 st, err := os.Stat(backingFile)
378 if err != nil {
379 return fmt.Errorf("cannot read backing file: %w", err)
380 }
381 size = uint64(st.Size())
382 }
383
384 b := builder{
385 size: size,
386 backingFile: backingFile,
387 // 64k clusters (standard generated by qemu-img).
388 clusterBits: uint32(0x10),
389 }
390 return b.build(w)
391}