Blame - go/qcow2/qcow2.go - monogon

blob: f10b4eb894a9041104c077df8cf5420579f1659a [file] [log] [blame]

Tim Windelschmidt	6d33a43	2025-02-04 14:34:25 +0100	[diff] [blame]	1	// Copyright The Monogon Project Authors.
				2	// SPDX-License-Identifier: Apache-2.0
				3
Serge Bazanski	a42844a	2024-05-16 18:06:19 +0200	[diff] [blame]	4	// Package qcow2 implements generating QEMU copy-on-write image files.
				5	package qcow2
				6
				7	// QCOW2 is a peculiar file format, as it's cluster-based (read: block-based) and
				8	// that blocks are a concept both exposed to the guest (user data storage is
				9	// expressed in clusters) and used to describe the layout of metadata structures.
				10	//
				11	// Most notably, the 'refcount' (reference count) mechanism for clusters is not
				12	// only for user-visible clusters but also for clusters within the image that
				13	// contain metadata. This means that users of the format can use the same
				14	// allocation system for clusters both for allocating clusters exposed to the
				15	// user and clusters for metadata/housekeeping.
				16	//
				17	// We only support a small subset of QCOW2 functionality: writing, qcow2 (no v1
				18	// or v3) and no extensions. This keeps the codebase quite lean.
				19	//
				20	// QCOW2 images are made out of:
				21	//
				22	// 1. A file header (occupies first cluster);
				23	// 2. Optionally, a backing file name (right after header, within the first
				24	// cluster);
				25	// 3. A refcount table (made out of two levels, L1 is continuous and pointed to
				26	// by header, L2 is pointed to by L1, both must be cluster-aligned and occupy a
				27	// multiple of clusters each). The refcount table is indexed by cluster address
				28	// and returns whether a cluster is free or in use (by metadata or user data);
				29	// 4. A cluster mapping table (made out of two levels similarly to the refcount
				30	// table, but the cluster mapping L1 table does not have to occupy an entire
				31	// cluster as the file header contains information on how many entries are
				32	// there). The cluster mapping table is indexed by user address and returns
				33	// the file cluster address.
				34	//
				35	// Our generated files contain a header/backing file name, a refcount table which
				36	// contains enough entries to cover the metadata of the file itself, and a
				37	// minimum L1 table (enough to cover the whole user data address range, but with
				38	// no cluster mapped). The users of the image file will be able to use the
				39	// cluster mapping L1 table as is, and will likely need to remap the refcount
				40	// table to fit more file cluster addresses as the image gets written. This is
				41	// the same kind of layout qemu-img uses.
				42	//
				43	// Reference: https://github.com/qemu/qemu/blob/master/docs/interop/qcow2.txt
				44
				45	import (
				46	"bytes"
				47	"encoding/binary"
				48	"fmt"
				49	"io"
				50	"os"
				51	)
				52
				53	// builder is the internal structure defining the requested parameters for image
				54	// generation.
				55	type builder struct {
				56	// size of the image in bytes. Note that this is the size visible to the user,
				57	// not the actual size in bytes of the container (which will be in most cases
				58	// significantly smaller).
				59	size uint64
				60	// backingFile is, if non-zero, the backing file for a given qcow2 image. This
				61	// makes the qcow2 image function as an overlay on top of the backing file (ie.
				62	// offers snapshot functionality).
				63	backingFile string
				64	// clusterBits is the number of bits that a cluster has. This is usually 16, for
				65	// 64KiB clusters.
				66	clusterBits uint32
				67	}
				68
				69	type qcow2Header struct {
				70	Magic [4]byte
				71	Version uint32
				72	BackingFileOffset uint64
				73	BackingFileSize uint32
				74	ClusterBits uint32
				75	Size uint64
				76	CryptMethod uint32
				77	L1Size uint32
				78	L1TableOffset uint64
				79	RefcountTableOffset uint64
				80	RefcountTableClusters uint32
				81	NumberSnapshots uint32
				82	SnapshotsOffset uint64
				83	}
				84
				85	// divRoundUp divides a by b and returns the result rounded up. This is useful to
				86	// answer the question 'how many b-sized elements do I need to contain a'.
				87	func divRoundUp(a, b uint64) uint64 {
				88	res := a / b
				89	if a%b != 0 {
				90	res += 1
				91	}
				92	return res
				93	}
				94
				95	// clusterSize returns the cluster size for this builder.
				96	func (b *builder) clusterSize() uint64 {
				97	return uint64(1 << b.clusterBits)
				98	}
				99
				100	// calculateMappingEntries returns the details of the 'cluster mapping'
				101	// structures (sometimes called 'L1'). This is calculated based on the cluster
				102	// size and target image size.
				103	//
				104	// The values returned are the count of L1 entries needed to cover the entirety
				105	// of the user data, and the size of such an entry within its table. The latter
				106	// is constant.
				107	func (b *builder) calculateMappingEntries() (mappingL1Count uint64, mappingL1EntrySize uint64) {
				108	clusterSize := b.clusterSize()
				109
				110	mappingL2EntrySize := uint64(8)
				111	// Each L2 table is always one cluster.
				112	mappingL2EntryCount := clusterSize / mappingL2EntrySize
				113	// How much user data each L2 table covers.
				114	mappingL2Covering := mappingL2EntryCount * clusterSize
				115	// How many L1 entries do we need.
				116	mappingL1Count = divRoundUp(b.size, mappingL2Covering)
				117	// Each L1 entry is always 64 bits.
				118	mappingL1EntrySize = uint64(8)
				119
				120	return
				121	}
				122
				123	// refcountTableProperties contains the properties of a L1 or L2 refcount table
				124	// calculated from the cluster size of a qcow2 image.
				125	type refcountTableProperties struct {
				126	// bytesCoveredByCluster describes how many bytes of {meta,}data are addressed by
				127	// a cluster full of metadata entries of this refcount table.
				128	bytesCoveredByCluster uint64
				129	// entriesPerCluster describes how many entries of this refcount table fit in a
				130	// full cluster of metadata.
				131	entriesPerCluster uint64
				132	}
				133
				134	// calculateRefcountProperties returns recountTableProperties for L1 and L2
				135	// refcount tables for the qcow2 image being built. These are static properties
				136	// calculated based on the cluster size, not on the target size of the image.
				137	func (b *builder) calculateRefcountProperties() (l1 refcountTableProperties, l2 refcountTableProperties) {
				138	clusterSize := b.clusterSize()
				139
				140	// Size of refcount l2 structure entry (16 bits):
				141	refcountL2EntrySize := uint64(2)
				142	// Number of entries in refcount l2 structure:
				143	refcountL2EntryCount := clusterSize / refcountL2EntrySize
				144	// Storage size covered by every refcount l2 structure (each entry points towards
				145	// a cluster):
				146	refcountL2Covering := refcountL2EntryCount * clusterSize
				147
				148	// Size of refcount l1 structure entry (64 bits):
				149	refcountL1EntrySize := uint64(8)
				150	// Number of entries in refcount l1 structure:
				151	refcountL1EntryCount := clusterSize / refcountL1EntrySize
				152	// Storage size covered by every refcount l1 structure:
				153	refcountL1Covering := refcountL1EntryCount * refcountL2Covering
				154
				155	l1.bytesCoveredByCluster = refcountL1Covering
				156	l1.entriesPerCluster = refcountL1EntryCount
				157
				158	l2.bytesCoveredByCluster = refcountL2Covering
				159	l2.entriesPerCluster = refcountL2EntryCount
				160	return
				161	}
				162
				163	// build writes out a qcow2 image (based on the builder's configuration) to a
				164	// Writer.
				165	func (b *builder) build(w io.Writer) error {
				166	clusterSize := b.clusterSize()
				167	// Minimum supported by QEMU and this codebase.
				168	if b.clusterSize() < 512 {
				169	return fmt.Errorf("cluster size too small")
				170	}
				171
				172	// Size of a serialized qcow2Header.
				173	headerSize := 72
				174	// And however long the backing file name is (we stuff it after the header,
				175	// within the first cluster).
				176	headerSize += len(b.backingFile)
				177	// Make sure the above fits.
				178	if uint64(headerSize) > clusterSize {
				179	return fmt.Errorf("cluster size too small")
				180	}
				181
				182	// Cluster mapping structures are just based on the backing storage size and
				183	// don't need to cover metadata. This makes it easy to calculate:
				184	mappingL1Count, mappingL1EntrySize := b.calculateMappingEntries()
				185	mappingL1Clusters := divRoundUp(mappingL1Count*mappingL1EntrySize, clusterSize)
				186
				187	// For refcount structures we need enough to cover just the metadata of our
				188	// image. Unfortunately that includes the refcount structures themselves. We
				189	// choose a naïve iterative approach where we start out with zero metadata
				190	// structures and keep adding more until we cover everything that's necessary.
				191
				192	// Coverage, in bytes, for an L1 and L2 refcount cluster full of entries.
				193	refcountL1, refcountL2 := b.calculateRefcountProperties()
				194
				195	// How many clusters of metadata to put in the final image. We start at zero and
				196	// iterate upwards.
				197	refcountL1Clusters := uint64(0)
				198	refcountL2Clusters := uint64(0)
				199
				200	// How many bytes the current metadata covers. This is also zero at first.
				201	maximumL2Covering := refcountL2Clusters * refcountL2.bytesCoveredByCluster
				202	maximumL1Covering := refcountL1Clusters * refcountL1.bytesCoveredByCluster
				203
				204	// The size of our metadata in bytes.
				205	sizeMetadata := (1 + mappingL1Clusters + refcountL1Clusters + refcountL2Clusters) * clusterSize
				206
				207	// Keep adding L1/L2 metadata clusters until we can cover all of our own
				208	// metadata. This is an iterative approach to the aforementioned problem. Even
				209	// with large images (100TB) at the default cluster size this loop only takes one
				210	// iteration to stabilize. Smaller cluster sizes might take longer.
				211	for {
				212	changed := false
				213
				214	// Keep adding L2 metadata clusters until we cover all the metadata we know so far.
				215	for maximumL2Covering < sizeMetadata {
				216	refcountL2Clusters += 1
				217	// Recalculate.
				218	maximumL2Covering = refcountL2Clusters * refcountL2.bytesCoveredByCluster
				219	sizeMetadata += clusterSize
				220	changed = true
				221	}
				222	// Keep adding L1 metadata clusters until we cover all the metadata we know so far.
				223	for maximumL1Covering < sizeMetadata {
				224	refcountL1Clusters += 1
				225	// Recalculate.
				226	maximumL1Covering = refcountL1Clusters * refcountL1.bytesCoveredByCluster
				227	sizeMetadata += clusterSize
				228	changed = true
				229	}
				230
				231	// If no changes were introduced, it means we stabilized at some l1/l2 metadata
				232	// cluster count. Exit.
				233	if !changed {
				234	break
				235	}
				236	}
				237
				238	// Now that we have calculated everything, start writing out the header.
				239	h := qcow2Header{
				240	Version: 2,
				241	ClusterBits: b.clusterBits,
				242	Size: b.size,
				243	// Unencrypted.
				244	CryptMethod: 0,
				245
				246	L1Size: uint32(mappingL1Count),
				247	// L1 table starts after the header and refcount tables.
				248	L1TableOffset: (1 + refcountL1Clusters + refcountL2Clusters) * clusterSize,
				249
				250	// Refcount table starts right after header cluster.
				251	RefcountTableOffset: 1 * clusterSize,
				252	RefcountTableClusters: uint32(refcountL1Clusters),
				253
				254	// No snapshots.
				255	NumberSnapshots: 0,
				256	SnapshotsOffset: 0,
				257	}
				258	copy(h.Magic[:], "QFI\xfb")
				259	if b.backingFile != "" {
				260	// Backing file path is right after header.
				261	h.BackingFileOffset = 72
				262	h.BackingFileSize = uint32(len(b.backingFile))
				263	}
				264	if err := binary.Write(w, binary.BigEndian, &h); err != nil {
				265	return err
				266	}
				267
				268	// Write out backing file name right after the header.
				269	if b.backingFile != "" {
				270	if _, err := w.Write([]byte(b.backingFile)); err != nil {
				271	return err
				272	}
				273	}
				274
				275	// Align to next cluster with zeroes.
				276	if _, err := w.Write(bytes.Repeat([]byte{0}, int(clusterSize)-headerSize)); err != nil {
				277	return err
				278	}
				279
				280	// Write L1 refcount table.
				281	for i := uint64(0); i < refcountL1Clusters; i++ {
				282	for j := uint64(0); j < refcountL1.entriesPerCluster; j++ {
				283	// Index of corresponding l2 table.
				284	ix := j + i*refcountL1.entriesPerCluster
				285
				286	var data uint64
				287	if ix < refcountL2Clusters {
				288	// If this is an allocated L2 table, write its offset. Otherwise write zero.
				289	data = (1 + refcountL1Clusters + ix) * clusterSize
				290	}
				291	if err := binary.Write(w, binary.BigEndian, &data); err != nil {
				292	return err
				293	}
				294	}
				295	}
				296	// Write L2 refcount table.
				297	for i := uint64(0); i < refcountL2Clusters; i++ {
				298	for j := uint64(0); j < refcountL2.entriesPerCluster; j++ {
				299	// Index of corresponding cluster.
				300	ix := j + i*refcountL2.entriesPerCluster
				301
				302	var data uint16
				303	if ix < (1 + refcountL1Clusters + refcountL2Clusters + mappingL1Clusters) {
				304	// If this is an in-use cluster, mark it as such.
				305	data = 1
				306	}
				307	if err := binary.Write(w, binary.BigEndian, &data); err != nil {
				308	return err
				309	}
				310	}
				311	}
				312	// Write L1 mapping table.
				313	for i := uint64(0); i < mappingL1Count; i++ {
				314	// No user data yet allocated.
				315	data := uint64(0)
				316	if err := binary.Write(w, binary.BigEndian, &data); err != nil {
				317	return err
				318	}
				319	}
				320
				321	return nil
				322	}
				323
				324	// GenerateOption structures are passed to the Generate call.
				325	type GenerateOption struct {
				326	fileSize *uint64
				327	backingFilePath *string
				328	}
				329
				330	// GenerateWithFileSize will generate an image with a given user size in bytes.
				331	func GenerateWithFileSize(fileSize uint64) *GenerateOption {
				332	return &GenerateOption{
				333	fileSize: &fileSize,
				334	}
				335	}
				336
				337	// GenerateWithBackingFile will generate an image size backed by a given file
				338	// path. If GenerateWithFileSize is not given, the path will also be checked at
				339	// generation time and used to determine the size of the user image.
				340	func GenerateWithBackingFile(path string) *GenerateOption {
				341	return &GenerateOption{
				342	backingFilePath: &path,
				343	}
				344	}
				345
				346	// Generate builds a QCOW2 image and writes it to the given Writer.
				347	//
				348	// At least one of GenerateWithFileSize or GenerateWithBackingFile must be given.
				349	func Generate(w io.Writer, opts ...*GenerateOption) error {
				350	var size uint64
				351	var haveSize bool
				352
				353	var backingFile string
				354	var haveBackingFile bool
				355
				356	for _, opt := range opts {
				357	if opt.fileSize != nil {
				358	if haveSize {
				359	return fmt.Errorf("cannot specify GenerateWithFileSize twice")
				360	}
				361	haveSize = true
				362
				363	size = *opt.fileSize
				364	}
				365	if opt.backingFilePath != nil {
				366	if haveBackingFile {
				367	return fmt.Errorf("cannot specify GenerateWithBackingFile twice")
				368	}
				369	haveBackingFile = true
				370
				371	backingFile = *opt.backingFilePath
				372	}
				373	}
				374
				375	if !haveSize && !haveBackingFile {
				376	return fmt.Errorf("must be called with GenerateWithFileSize or GenerateWithBackingFileAndSize")
				377	}
				378
				379	if haveBackingFile && !haveSize {
				380	st, err := os.Stat(backingFile)
				381	if err != nil {
				382	return fmt.Errorf("cannot read backing file: %w", err)
				383	}
				384	size = uint64(st.Size())
				385	}
				386
				387	b := builder{
				388	size: size,
				389	backingFile: backingFile,
				390	// 64k clusters (standard generated by qemu-img).
				391	clusterBits: uint32(0x10),
				392	}
				393	return b.build(w)
				394	}