| Tim Windelschmidt | 6d33a43 | 2025-02-04 14:34:25 +0100 | [diff] [blame] | 1 | // Copyright The Monogon Project Authors. |
| 2 | // SPDX-License-Identifier: Apache-2.0 |
| 3 | |
| Serge Bazanski | a42844a | 2024-05-16 18:06:19 +0200 | [diff] [blame] | 4 | // Package qcow2 implements generating QEMU copy-on-write image files. |
| 5 | package qcow2 |
| 6 | |
| 7 | // QCOW2 is a peculiar file format, as it's cluster-based (read: block-based) and |
| 8 | // that blocks are a concept both exposed to the guest (user data storage is |
| 9 | // expressed in clusters) and used to describe the layout of metadata structures. |
| 10 | // |
| 11 | // Most notably, the 'refcount' (reference count) mechanism for clusters is not |
| 12 | // only for user-visible clusters but also for clusters within the image that |
| 13 | // contain metadata. This means that users of the format can use the same |
| 14 | // allocation system for clusters both for allocating clusters exposed to the |
| 15 | // user and clusters for metadata/housekeeping. |
| 16 | // |
| 17 | // We only support a small subset of QCOW2 functionality: writing, qcow2 (no v1 |
| 18 | // or v3) and no extensions. This keeps the codebase quite lean. |
| 19 | // |
| 20 | // QCOW2 images are made out of: |
| 21 | // |
| 22 | // 1. A file header (occupies first cluster); |
| 23 | // 2. Optionally, a backing file name (right after header, within the first |
| 24 | // cluster); |
| 25 | // 3. A refcount table (made out of two levels, L1 is continuous and pointed to |
| 26 | // by header, L2 is pointed to by L1, both must be cluster-aligned and occupy a |
| 27 | // multiple of clusters each). The refcount table is indexed by cluster address |
| 28 | // and returns whether a cluster is free or in use (by metadata or user data); |
| 29 | // 4. A cluster mapping table (made out of two levels similarly to the refcount |
| 30 | // table, but the cluster mapping L1 table does not have to occupy an entire |
| 31 | // cluster as the file header contains information on how many entries are |
| 32 | // there). The cluster mapping table is indexed by user address and returns |
| 33 | // the file cluster address. |
| 34 | // |
| 35 | // Our generated files contain a header/backing file name, a refcount table which |
| 36 | // contains enough entries to cover the metadata of the file itself, and a |
| 37 | // minimum L1 table (enough to cover the whole user data address range, but with |
| 38 | // no cluster mapped). The users of the image file will be able to use the |
| 39 | // cluster mapping L1 table as is, and will likely need to remap the refcount |
| 40 | // table to fit more file cluster addresses as the image gets written. This is |
| 41 | // the same kind of layout qemu-img uses. |
| 42 | // |
| 43 | // Reference: https://github.com/qemu/qemu/blob/master/docs/interop/qcow2.txt |
| 44 | |
| 45 | import ( |
| 46 | "bytes" |
| 47 | "encoding/binary" |
| 48 | "fmt" |
| 49 | "io" |
| 50 | "os" |
| 51 | ) |
| 52 | |
| 53 | // builder is the internal structure defining the requested parameters for image |
| 54 | // generation. |
| 55 | type builder struct { |
| 56 | // size of the image in bytes. Note that this is the size visible to the user, |
| 57 | // not the actual size in bytes of the container (which will be in most cases |
| 58 | // significantly smaller). |
| 59 | size uint64 |
| 60 | // backingFile is, if non-zero, the backing file for a given qcow2 image. This |
| 61 | // makes the qcow2 image function as an overlay on top of the backing file (ie. |
| 62 | // offers snapshot functionality). |
| 63 | backingFile string |
| 64 | // clusterBits is the number of bits that a cluster has. This is usually 16, for |
| 65 | // 64KiB clusters. |
| 66 | clusterBits uint32 |
| 67 | } |
| 68 | |
| 69 | type qcow2Header struct { |
| 70 | Magic [4]byte |
| 71 | Version uint32 |
| 72 | BackingFileOffset uint64 |
| 73 | BackingFileSize uint32 |
| 74 | ClusterBits uint32 |
| 75 | Size uint64 |
| 76 | CryptMethod uint32 |
| 77 | L1Size uint32 |
| 78 | L1TableOffset uint64 |
| 79 | RefcountTableOffset uint64 |
| 80 | RefcountTableClusters uint32 |
| 81 | NumberSnapshots uint32 |
| 82 | SnapshotsOffset uint64 |
| 83 | } |
| 84 | |
| 85 | // divRoundUp divides a by b and returns the result rounded up. This is useful to |
| 86 | // answer the question 'how many b-sized elements do I need to contain a'. |
| 87 | func divRoundUp(a, b uint64) uint64 { |
| 88 | res := a / b |
| 89 | if a%b != 0 { |
| 90 | res += 1 |
| 91 | } |
| 92 | return res |
| 93 | } |
| 94 | |
| 95 | // clusterSize returns the cluster size for this builder. |
| 96 | func (b *builder) clusterSize() uint64 { |
| 97 | return uint64(1 << b.clusterBits) |
| 98 | } |
| 99 | |
| 100 | // calculateMappingEntries returns the details of the 'cluster mapping' |
| 101 | // structures (sometimes called 'L1'). This is calculated based on the cluster |
| 102 | // size and target image size. |
| 103 | // |
| 104 | // The values returned are the count of L1 entries needed to cover the entirety |
| 105 | // of the user data, and the size of such an entry within its table. The latter |
| 106 | // is constant. |
| 107 | func (b *builder) calculateMappingEntries() (mappingL1Count uint64, mappingL1EntrySize uint64) { |
| 108 | clusterSize := b.clusterSize() |
| 109 | |
| 110 | mappingL2EntrySize := uint64(8) |
| 111 | // Each L2 table is always one cluster. |
| 112 | mappingL2EntryCount := clusterSize / mappingL2EntrySize |
| 113 | // How much user data each L2 table covers. |
| 114 | mappingL2Covering := mappingL2EntryCount * clusterSize |
| 115 | // How many L1 entries do we need. |
| 116 | mappingL1Count = divRoundUp(b.size, mappingL2Covering) |
| 117 | // Each L1 entry is always 64 bits. |
| 118 | mappingL1EntrySize = uint64(8) |
| 119 | |
| 120 | return |
| 121 | } |
| 122 | |
| 123 | // refcountTableProperties contains the properties of a L1 or L2 refcount table |
| 124 | // calculated from the cluster size of a qcow2 image. |
| 125 | type refcountTableProperties struct { |
| 126 | // bytesCoveredByCluster describes how many bytes of {meta,}data are addressed by |
| 127 | // a cluster full of metadata entries of this refcount table. |
| 128 | bytesCoveredByCluster uint64 |
| 129 | // entriesPerCluster describes how many entries of this refcount table fit in a |
| 130 | // full cluster of metadata. |
| 131 | entriesPerCluster uint64 |
| 132 | } |
| 133 | |
| 134 | // calculateRefcountProperties returns recountTableProperties for L1 and L2 |
| 135 | // refcount tables for the qcow2 image being built. These are static properties |
| 136 | // calculated based on the cluster size, not on the target size of the image. |
| 137 | func (b *builder) calculateRefcountProperties() (l1 refcountTableProperties, l2 refcountTableProperties) { |
| 138 | clusterSize := b.clusterSize() |
| 139 | |
| 140 | // Size of refcount l2 structure entry (16 bits): |
| 141 | refcountL2EntrySize := uint64(2) |
| 142 | // Number of entries in refcount l2 structure: |
| 143 | refcountL2EntryCount := clusterSize / refcountL2EntrySize |
| 144 | // Storage size covered by every refcount l2 structure (each entry points towards |
| 145 | // a cluster): |
| 146 | refcountL2Covering := refcountL2EntryCount * clusterSize |
| 147 | |
| 148 | // Size of refcount l1 structure entry (64 bits): |
| 149 | refcountL1EntrySize := uint64(8) |
| 150 | // Number of entries in refcount l1 structure: |
| 151 | refcountL1EntryCount := clusterSize / refcountL1EntrySize |
| 152 | // Storage size covered by every refcount l1 structure: |
| 153 | refcountL1Covering := refcountL1EntryCount * refcountL2Covering |
| 154 | |
| 155 | l1.bytesCoveredByCluster = refcountL1Covering |
| 156 | l1.entriesPerCluster = refcountL1EntryCount |
| 157 | |
| 158 | l2.bytesCoveredByCluster = refcountL2Covering |
| 159 | l2.entriesPerCluster = refcountL2EntryCount |
| 160 | return |
| 161 | } |
| 162 | |
| 163 | // build writes out a qcow2 image (based on the builder's configuration) to a |
| 164 | // Writer. |
| 165 | func (b *builder) build(w io.Writer) error { |
| 166 | clusterSize := b.clusterSize() |
| 167 | // Minimum supported by QEMU and this codebase. |
| 168 | if b.clusterSize() < 512 { |
| 169 | return fmt.Errorf("cluster size too small") |
| 170 | } |
| 171 | |
| 172 | // Size of a serialized qcow2Header. |
| 173 | headerSize := 72 |
| 174 | // And however long the backing file name is (we stuff it after the header, |
| 175 | // within the first cluster). |
| 176 | headerSize += len(b.backingFile) |
| 177 | // Make sure the above fits. |
| 178 | if uint64(headerSize) > clusterSize { |
| 179 | return fmt.Errorf("cluster size too small") |
| 180 | } |
| 181 | |
| 182 | // Cluster mapping structures are just based on the backing storage size and |
| 183 | // don't need to cover metadata. This makes it easy to calculate: |
| 184 | mappingL1Count, mappingL1EntrySize := b.calculateMappingEntries() |
| 185 | mappingL1Clusters := divRoundUp(mappingL1Count*mappingL1EntrySize, clusterSize) |
| 186 | |
| 187 | // For refcount structures we need enough to cover just the metadata of our |
| 188 | // image. Unfortunately that includes the refcount structures themselves. We |
| 189 | // choose a naïve iterative approach where we start out with zero metadata |
| 190 | // structures and keep adding more until we cover everything that's necessary. |
| 191 | |
| 192 | // Coverage, in bytes, for an L1 and L2 refcount cluster full of entries. |
| 193 | refcountL1, refcountL2 := b.calculateRefcountProperties() |
| 194 | |
| 195 | // How many clusters of metadata to put in the final image. We start at zero and |
| 196 | // iterate upwards. |
| 197 | refcountL1Clusters := uint64(0) |
| 198 | refcountL2Clusters := uint64(0) |
| 199 | |
| 200 | // How many bytes the current metadata covers. This is also zero at first. |
| 201 | maximumL2Covering := refcountL2Clusters * refcountL2.bytesCoveredByCluster |
| 202 | maximumL1Covering := refcountL1Clusters * refcountL1.bytesCoveredByCluster |
| 203 | |
| 204 | // The size of our metadata in bytes. |
| 205 | sizeMetadata := (1 + mappingL1Clusters + refcountL1Clusters + refcountL2Clusters) * clusterSize |
| 206 | |
| 207 | // Keep adding L1/L2 metadata clusters until we can cover all of our own |
| 208 | // metadata. This is an iterative approach to the aforementioned problem. Even |
| 209 | // with large images (100TB) at the default cluster size this loop only takes one |
| 210 | // iteration to stabilize. Smaller cluster sizes might take longer. |
| 211 | for { |
| 212 | changed := false |
| 213 | |
| 214 | // Keep adding L2 metadata clusters until we cover all the metadata we know so far. |
| 215 | for maximumL2Covering < sizeMetadata { |
| 216 | refcountL2Clusters += 1 |
| 217 | // Recalculate. |
| 218 | maximumL2Covering = refcountL2Clusters * refcountL2.bytesCoveredByCluster |
| 219 | sizeMetadata += clusterSize |
| 220 | changed = true |
| 221 | } |
| 222 | // Keep adding L1 metadata clusters until we cover all the metadata we know so far. |
| 223 | for maximumL1Covering < sizeMetadata { |
| 224 | refcountL1Clusters += 1 |
| 225 | // Recalculate. |
| 226 | maximumL1Covering = refcountL1Clusters * refcountL1.bytesCoveredByCluster |
| 227 | sizeMetadata += clusterSize |
| 228 | changed = true |
| 229 | } |
| 230 | |
| 231 | // If no changes were introduced, it means we stabilized at some l1/l2 metadata |
| 232 | // cluster count. Exit. |
| 233 | if !changed { |
| 234 | break |
| 235 | } |
| 236 | } |
| 237 | |
| 238 | // Now that we have calculated everything, start writing out the header. |
| 239 | h := qcow2Header{ |
| 240 | Version: 2, |
| 241 | ClusterBits: b.clusterBits, |
| 242 | Size: b.size, |
| 243 | // Unencrypted. |
| 244 | CryptMethod: 0, |
| 245 | |
| 246 | L1Size: uint32(mappingL1Count), |
| 247 | // L1 table starts after the header and refcount tables. |
| 248 | L1TableOffset: (1 + refcountL1Clusters + refcountL2Clusters) * clusterSize, |
| 249 | |
| 250 | // Refcount table starts right after header cluster. |
| 251 | RefcountTableOffset: 1 * clusterSize, |
| 252 | RefcountTableClusters: uint32(refcountL1Clusters), |
| 253 | |
| 254 | // No snapshots. |
| 255 | NumberSnapshots: 0, |
| 256 | SnapshotsOffset: 0, |
| 257 | } |
| 258 | copy(h.Magic[:], "QFI\xfb") |
| 259 | if b.backingFile != "" { |
| 260 | // Backing file path is right after header. |
| 261 | h.BackingFileOffset = 72 |
| 262 | h.BackingFileSize = uint32(len(b.backingFile)) |
| 263 | } |
| 264 | if err := binary.Write(w, binary.BigEndian, &h); err != nil { |
| 265 | return err |
| 266 | } |
| 267 | |
| 268 | // Write out backing file name right after the header. |
| 269 | if b.backingFile != "" { |
| 270 | if _, err := w.Write([]byte(b.backingFile)); err != nil { |
| 271 | return err |
| 272 | } |
| 273 | } |
| 274 | |
| 275 | // Align to next cluster with zeroes. |
| 276 | if _, err := w.Write(bytes.Repeat([]byte{0}, int(clusterSize)-headerSize)); err != nil { |
| 277 | return err |
| 278 | } |
| 279 | |
| 280 | // Write L1 refcount table. |
| 281 | for i := uint64(0); i < refcountL1Clusters; i++ { |
| 282 | for j := uint64(0); j < refcountL1.entriesPerCluster; j++ { |
| 283 | // Index of corresponding l2 table. |
| 284 | ix := j + i*refcountL1.entriesPerCluster |
| 285 | |
| 286 | var data uint64 |
| 287 | if ix < refcountL2Clusters { |
| 288 | // If this is an allocated L2 table, write its offset. Otherwise write zero. |
| 289 | data = (1 + refcountL1Clusters + ix) * clusterSize |
| 290 | } |
| 291 | if err := binary.Write(w, binary.BigEndian, &data); err != nil { |
| 292 | return err |
| 293 | } |
| 294 | } |
| 295 | } |
| 296 | // Write L2 refcount table. |
| 297 | for i := uint64(0); i < refcountL2Clusters; i++ { |
| 298 | for j := uint64(0); j < refcountL2.entriesPerCluster; j++ { |
| 299 | // Index of corresponding cluster. |
| 300 | ix := j + i*refcountL2.entriesPerCluster |
| 301 | |
| 302 | var data uint16 |
| 303 | if ix < (1 + refcountL1Clusters + refcountL2Clusters + mappingL1Clusters) { |
| 304 | // If this is an in-use cluster, mark it as such. |
| 305 | data = 1 |
| 306 | } |
| 307 | if err := binary.Write(w, binary.BigEndian, &data); err != nil { |
| 308 | return err |
| 309 | } |
| 310 | } |
| 311 | } |
| 312 | // Write L1 mapping table. |
| 313 | for i := uint64(0); i < mappingL1Count; i++ { |
| 314 | // No user data yet allocated. |
| 315 | data := uint64(0) |
| 316 | if err := binary.Write(w, binary.BigEndian, &data); err != nil { |
| 317 | return err |
| 318 | } |
| 319 | } |
| 320 | |
| 321 | return nil |
| 322 | } |
| 323 | |
| 324 | // GenerateOption structures are passed to the Generate call. |
| 325 | type GenerateOption struct { |
| 326 | fileSize *uint64 |
| 327 | backingFilePath *string |
| 328 | } |
| 329 | |
| 330 | // GenerateWithFileSize will generate an image with a given user size in bytes. |
| 331 | func GenerateWithFileSize(fileSize uint64) *GenerateOption { |
| 332 | return &GenerateOption{ |
| 333 | fileSize: &fileSize, |
| 334 | } |
| 335 | } |
| 336 | |
| 337 | // GenerateWithBackingFile will generate an image size backed by a given file |
| 338 | // path. If GenerateWithFileSize is not given, the path will also be checked at |
| 339 | // generation time and used to determine the size of the user image. |
| 340 | func GenerateWithBackingFile(path string) *GenerateOption { |
| 341 | return &GenerateOption{ |
| 342 | backingFilePath: &path, |
| 343 | } |
| 344 | } |
| 345 | |
| 346 | // Generate builds a QCOW2 image and writes it to the given Writer. |
| 347 | // |
| 348 | // At least one of GenerateWithFileSize or GenerateWithBackingFile must be given. |
| 349 | func Generate(w io.Writer, opts ...*GenerateOption) error { |
| 350 | var size uint64 |
| 351 | var haveSize bool |
| 352 | |
| 353 | var backingFile string |
| 354 | var haveBackingFile bool |
| 355 | |
| 356 | for _, opt := range opts { |
| 357 | if opt.fileSize != nil { |
| 358 | if haveSize { |
| 359 | return fmt.Errorf("cannot specify GenerateWithFileSize twice") |
| 360 | } |
| 361 | haveSize = true |
| 362 | |
| 363 | size = *opt.fileSize |
| 364 | } |
| 365 | if opt.backingFilePath != nil { |
| 366 | if haveBackingFile { |
| 367 | return fmt.Errorf("cannot specify GenerateWithBackingFile twice") |
| 368 | } |
| 369 | haveBackingFile = true |
| 370 | |
| 371 | backingFile = *opt.backingFilePath |
| 372 | } |
| 373 | } |
| 374 | |
| 375 | if !haveSize && !haveBackingFile { |
| 376 | return fmt.Errorf("must be called with GenerateWithFileSize or GenerateWithBackingFileAndSize") |
| 377 | } |
| 378 | |
| 379 | if haveBackingFile && !haveSize { |
| 380 | st, err := os.Stat(backingFile) |
| 381 | if err != nil { |
| 382 | return fmt.Errorf("cannot read backing file: %w", err) |
| 383 | } |
| 384 | size = uint64(st.Size()) |
| 385 | } |
| 386 | |
| 387 | b := builder{ |
| 388 | size: size, |
| 389 | backingFile: backingFile, |
| 390 | // 64k clusters (standard generated by qemu-img). |
| 391 | clusterBits: uint32(0x10), |
| 392 | } |
| 393 | return b.build(w) |
| 394 | } |