blob: f10b4eb894a9041104c077df8cf5420579f1659a [file] [log] [blame]
Tim Windelschmidt6d33a432025-02-04 14:34:25 +01001// Copyright The Monogon Project Authors.
2// SPDX-License-Identifier: Apache-2.0
3
Serge Bazanskia42844a2024-05-16 18:06:19 +02004// Package qcow2 implements generating QEMU copy-on-write image files.
5package qcow2
6
7// QCOW2 is a peculiar file format, as it's cluster-based (read: block-based) and
8// that blocks are a concept both exposed to the guest (user data storage is
9// expressed in clusters) and used to describe the layout of metadata structures.
10//
11// Most notably, the 'refcount' (reference count) mechanism for clusters is not
12// only for user-visible clusters but also for clusters within the image that
13// contain metadata. This means that users of the format can use the same
14// allocation system for clusters both for allocating clusters exposed to the
15// user and clusters for metadata/housekeeping.
16//
17// We only support a small subset of QCOW2 functionality: writing, qcow2 (no v1
18// or v3) and no extensions. This keeps the codebase quite lean.
19//
20// QCOW2 images are made out of:
21//
22// 1. A file header (occupies first cluster);
23// 2. Optionally, a backing file name (right after header, within the first
24// cluster);
25// 3. A refcount table (made out of two levels, L1 is continuous and pointed to
26// by header, L2 is pointed to by L1, both must be cluster-aligned and occupy a
27// multiple of clusters each). The refcount table is indexed by cluster address
28// and returns whether a cluster is free or in use (by metadata or user data);
29// 4. A cluster mapping table (made out of two levels similarly to the refcount
30// table, but the cluster mapping L1 table does not have to occupy an entire
31// cluster as the file header contains information on how many entries are
32// there). The cluster mapping table is indexed by user address and returns
33// the file cluster address.
34//
35// Our generated files contain a header/backing file name, a refcount table which
36// contains enough entries to cover the metadata of the file itself, and a
37// minimum L1 table (enough to cover the whole user data address range, but with
38// no cluster mapped). The users of the image file will be able to use the
39// cluster mapping L1 table as is, and will likely need to remap the refcount
40// table to fit more file cluster addresses as the image gets written. This is
41// the same kind of layout qemu-img uses.
42//
43// Reference: https://github.com/qemu/qemu/blob/master/docs/interop/qcow2.txt
44
45import (
46 "bytes"
47 "encoding/binary"
48 "fmt"
49 "io"
50 "os"
51)
52
53// builder is the internal structure defining the requested parameters for image
54// generation.
55type builder struct {
56 // size of the image in bytes. Note that this is the size visible to the user,
57 // not the actual size in bytes of the container (which will be in most cases
58 // significantly smaller).
59 size uint64
60 // backingFile is, if non-zero, the backing file for a given qcow2 image. This
61 // makes the qcow2 image function as an overlay on top of the backing file (ie.
62 // offers snapshot functionality).
63 backingFile string
64 // clusterBits is the number of bits that a cluster has. This is usually 16, for
65 // 64KiB clusters.
66 clusterBits uint32
67}
68
69type qcow2Header struct {
70 Magic [4]byte
71 Version uint32
72 BackingFileOffset uint64
73 BackingFileSize uint32
74 ClusterBits uint32
75 Size uint64
76 CryptMethod uint32
77 L1Size uint32
78 L1TableOffset uint64
79 RefcountTableOffset uint64
80 RefcountTableClusters uint32
81 NumberSnapshots uint32
82 SnapshotsOffset uint64
83}
84
85// divRoundUp divides a by b and returns the result rounded up. This is useful to
86// answer the question 'how many b-sized elements do I need to contain a'.
87func divRoundUp(a, b uint64) uint64 {
88 res := a / b
89 if a%b != 0 {
90 res += 1
91 }
92 return res
93}
94
95// clusterSize returns the cluster size for this builder.
96func (b *builder) clusterSize() uint64 {
97 return uint64(1 << b.clusterBits)
98}
99
100// calculateMappingEntries returns the details of the 'cluster mapping'
101// structures (sometimes called 'L1'). This is calculated based on the cluster
102// size and target image size.
103//
104// The values returned are the count of L1 entries needed to cover the entirety
105// of the user data, and the size of such an entry within its table. The latter
106// is constant.
107func (b *builder) calculateMappingEntries() (mappingL1Count uint64, mappingL1EntrySize uint64) {
108 clusterSize := b.clusterSize()
109
110 mappingL2EntrySize := uint64(8)
111 // Each L2 table is always one cluster.
112 mappingL2EntryCount := clusterSize / mappingL2EntrySize
113 // How much user data each L2 table covers.
114 mappingL2Covering := mappingL2EntryCount * clusterSize
115 // How many L1 entries do we need.
116 mappingL1Count = divRoundUp(b.size, mappingL2Covering)
117 // Each L1 entry is always 64 bits.
118 mappingL1EntrySize = uint64(8)
119
120 return
121}
122
123// refcountTableProperties contains the properties of a L1 or L2 refcount table
124// calculated from the cluster size of a qcow2 image.
125type refcountTableProperties struct {
126 // bytesCoveredByCluster describes how many bytes of {meta,}data are addressed by
127 // a cluster full of metadata entries of this refcount table.
128 bytesCoveredByCluster uint64
129 // entriesPerCluster describes how many entries of this refcount table fit in a
130 // full cluster of metadata.
131 entriesPerCluster uint64
132}
133
134// calculateRefcountProperties returns recountTableProperties for L1 and L2
135// refcount tables for the qcow2 image being built. These are static properties
136// calculated based on the cluster size, not on the target size of the image.
137func (b *builder) calculateRefcountProperties() (l1 refcountTableProperties, l2 refcountTableProperties) {
138 clusterSize := b.clusterSize()
139
140 // Size of refcount l2 structure entry (16 bits):
141 refcountL2EntrySize := uint64(2)
142 // Number of entries in refcount l2 structure:
143 refcountL2EntryCount := clusterSize / refcountL2EntrySize
144 // Storage size covered by every refcount l2 structure (each entry points towards
145 // a cluster):
146 refcountL2Covering := refcountL2EntryCount * clusterSize
147
148 // Size of refcount l1 structure entry (64 bits):
149 refcountL1EntrySize := uint64(8)
150 // Number of entries in refcount l1 structure:
151 refcountL1EntryCount := clusterSize / refcountL1EntrySize
152 // Storage size covered by every refcount l1 structure:
153 refcountL1Covering := refcountL1EntryCount * refcountL2Covering
154
155 l1.bytesCoveredByCluster = refcountL1Covering
156 l1.entriesPerCluster = refcountL1EntryCount
157
158 l2.bytesCoveredByCluster = refcountL2Covering
159 l2.entriesPerCluster = refcountL2EntryCount
160 return
161}
162
163// build writes out a qcow2 image (based on the builder's configuration) to a
164// Writer.
165func (b *builder) build(w io.Writer) error {
166 clusterSize := b.clusterSize()
167 // Minimum supported by QEMU and this codebase.
168 if b.clusterSize() < 512 {
169 return fmt.Errorf("cluster size too small")
170 }
171
172 // Size of a serialized qcow2Header.
173 headerSize := 72
174 // And however long the backing file name is (we stuff it after the header,
175 // within the first cluster).
176 headerSize += len(b.backingFile)
177 // Make sure the above fits.
178 if uint64(headerSize) > clusterSize {
179 return fmt.Errorf("cluster size too small")
180 }
181
182 // Cluster mapping structures are just based on the backing storage size and
183 // don't need to cover metadata. This makes it easy to calculate:
184 mappingL1Count, mappingL1EntrySize := b.calculateMappingEntries()
185 mappingL1Clusters := divRoundUp(mappingL1Count*mappingL1EntrySize, clusterSize)
186
187 // For refcount structures we need enough to cover just the metadata of our
188 // image. Unfortunately that includes the refcount structures themselves. We
189 // choose a naïve iterative approach where we start out with zero metadata
190 // structures and keep adding more until we cover everything that's necessary.
191
192 // Coverage, in bytes, for an L1 and L2 refcount cluster full of entries.
193 refcountL1, refcountL2 := b.calculateRefcountProperties()
194
195 // How many clusters of metadata to put in the final image. We start at zero and
196 // iterate upwards.
197 refcountL1Clusters := uint64(0)
198 refcountL2Clusters := uint64(0)
199
200 // How many bytes the current metadata covers. This is also zero at first.
201 maximumL2Covering := refcountL2Clusters * refcountL2.bytesCoveredByCluster
202 maximumL1Covering := refcountL1Clusters * refcountL1.bytesCoveredByCluster
203
204 // The size of our metadata in bytes.
205 sizeMetadata := (1 + mappingL1Clusters + refcountL1Clusters + refcountL2Clusters) * clusterSize
206
207 // Keep adding L1/L2 metadata clusters until we can cover all of our own
208 // metadata. This is an iterative approach to the aforementioned problem. Even
209 // with large images (100TB) at the default cluster size this loop only takes one
210 // iteration to stabilize. Smaller cluster sizes might take longer.
211 for {
212 changed := false
213
214 // Keep adding L2 metadata clusters until we cover all the metadata we know so far.
215 for maximumL2Covering < sizeMetadata {
216 refcountL2Clusters += 1
217 // Recalculate.
218 maximumL2Covering = refcountL2Clusters * refcountL2.bytesCoveredByCluster
219 sizeMetadata += clusterSize
220 changed = true
221 }
222 // Keep adding L1 metadata clusters until we cover all the metadata we know so far.
223 for maximumL1Covering < sizeMetadata {
224 refcountL1Clusters += 1
225 // Recalculate.
226 maximumL1Covering = refcountL1Clusters * refcountL1.bytesCoveredByCluster
227 sizeMetadata += clusterSize
228 changed = true
229 }
230
231 // If no changes were introduced, it means we stabilized at some l1/l2 metadata
232 // cluster count. Exit.
233 if !changed {
234 break
235 }
236 }
237
238 // Now that we have calculated everything, start writing out the header.
239 h := qcow2Header{
240 Version: 2,
241 ClusterBits: b.clusterBits,
242 Size: b.size,
243 // Unencrypted.
244 CryptMethod: 0,
245
246 L1Size: uint32(mappingL1Count),
247 // L1 table starts after the header and refcount tables.
248 L1TableOffset: (1 + refcountL1Clusters + refcountL2Clusters) * clusterSize,
249
250 // Refcount table starts right after header cluster.
251 RefcountTableOffset: 1 * clusterSize,
252 RefcountTableClusters: uint32(refcountL1Clusters),
253
254 // No snapshots.
255 NumberSnapshots: 0,
256 SnapshotsOffset: 0,
257 }
258 copy(h.Magic[:], "QFI\xfb")
259 if b.backingFile != "" {
260 // Backing file path is right after header.
261 h.BackingFileOffset = 72
262 h.BackingFileSize = uint32(len(b.backingFile))
263 }
264 if err := binary.Write(w, binary.BigEndian, &h); err != nil {
265 return err
266 }
267
268 // Write out backing file name right after the header.
269 if b.backingFile != "" {
270 if _, err := w.Write([]byte(b.backingFile)); err != nil {
271 return err
272 }
273 }
274
275 // Align to next cluster with zeroes.
276 if _, err := w.Write(bytes.Repeat([]byte{0}, int(clusterSize)-headerSize)); err != nil {
277 return err
278 }
279
280 // Write L1 refcount table.
281 for i := uint64(0); i < refcountL1Clusters; i++ {
282 for j := uint64(0); j < refcountL1.entriesPerCluster; j++ {
283 // Index of corresponding l2 table.
284 ix := j + i*refcountL1.entriesPerCluster
285
286 var data uint64
287 if ix < refcountL2Clusters {
288 // If this is an allocated L2 table, write its offset. Otherwise write zero.
289 data = (1 + refcountL1Clusters + ix) * clusterSize
290 }
291 if err := binary.Write(w, binary.BigEndian, &data); err != nil {
292 return err
293 }
294 }
295 }
296 // Write L2 refcount table.
297 for i := uint64(0); i < refcountL2Clusters; i++ {
298 for j := uint64(0); j < refcountL2.entriesPerCluster; j++ {
299 // Index of corresponding cluster.
300 ix := j + i*refcountL2.entriesPerCluster
301
302 var data uint16
303 if ix < (1 + refcountL1Clusters + refcountL2Clusters + mappingL1Clusters) {
304 // If this is an in-use cluster, mark it as such.
305 data = 1
306 }
307 if err := binary.Write(w, binary.BigEndian, &data); err != nil {
308 return err
309 }
310 }
311 }
312 // Write L1 mapping table.
313 for i := uint64(0); i < mappingL1Count; i++ {
314 // No user data yet allocated.
315 data := uint64(0)
316 if err := binary.Write(w, binary.BigEndian, &data); err != nil {
317 return err
318 }
319 }
320
321 return nil
322}
323
324// GenerateOption structures are passed to the Generate call.
325type GenerateOption struct {
326 fileSize *uint64
327 backingFilePath *string
328}
329
330// GenerateWithFileSize will generate an image with a given user size in bytes.
331func GenerateWithFileSize(fileSize uint64) *GenerateOption {
332 return &GenerateOption{
333 fileSize: &fileSize,
334 }
335}
336
337// GenerateWithBackingFile will generate an image size backed by a given file
338// path. If GenerateWithFileSize is not given, the path will also be checked at
339// generation time and used to determine the size of the user image.
340func GenerateWithBackingFile(path string) *GenerateOption {
341 return &GenerateOption{
342 backingFilePath: &path,
343 }
344}
345
346// Generate builds a QCOW2 image and writes it to the given Writer.
347//
348// At least one of GenerateWithFileSize or GenerateWithBackingFile must be given.
349func Generate(w io.Writer, opts ...*GenerateOption) error {
350 var size uint64
351 var haveSize bool
352
353 var backingFile string
354 var haveBackingFile bool
355
356 for _, opt := range opts {
357 if opt.fileSize != nil {
358 if haveSize {
359 return fmt.Errorf("cannot specify GenerateWithFileSize twice")
360 }
361 haveSize = true
362
363 size = *opt.fileSize
364 }
365 if opt.backingFilePath != nil {
366 if haveBackingFile {
367 return fmt.Errorf("cannot specify GenerateWithBackingFile twice")
368 }
369 haveBackingFile = true
370
371 backingFile = *opt.backingFilePath
372 }
373 }
374
375 if !haveSize && !haveBackingFile {
376 return fmt.Errorf("must be called with GenerateWithFileSize or GenerateWithBackingFileAndSize")
377 }
378
379 if haveBackingFile && !haveSize {
380 st, err := os.Stat(backingFile)
381 if err != nil {
382 return fmt.Errorf("cannot read backing file: %w", err)
383 }
384 size = uint64(st.Size())
385 }
386
387 b := builder{
388 size: size,
389 backingFile: backingFile,
390 // 64k clusters (standard generated by qemu-img).
391 clusterBits: uint32(0x10),
392 }
393 return b.build(w)
394}