go/qcow2: implement simple image generator

This is a write-at-once, write-only library for qcow2 images. We will
use it in tests to generate copy-on-write, snapshot-style images
(instead of wasting disk space and iops copying the node image every
time).

Change-Id: I3c8eaeb0c6fa1da44255eb7fc577168d6066d8e3
Reviewed-on: https://review.monogon.dev/c/monogon/+/3092
Tested-by: Jenkins CI
Reviewed-by: Jan Schär <jan@monogon.tech>
diff --git a/go/qcow2/qcow2.go b/go/qcow2/qcow2.go
new file mode 100644
index 0000000..df7825f
--- /dev/null
+++ b/go/qcow2/qcow2.go
@@ -0,0 +1,391 @@
+// Package qcow2 implements generating QEMU copy-on-write image files.
+package qcow2
+
+// QCOW2 is a peculiar file format, as it's cluster-based (read: block-based) and
+// that blocks are a concept both exposed to the guest (user data storage is
+// expressed in clusters) and used to describe the layout of metadata structures.
+//
+// Most notably, the 'refcount' (reference count) mechanism for clusters is not
+// only for user-visible clusters but also for clusters within the image that
+// contain metadata. This means that users of the format can use the same
+// allocation system for clusters both for allocating clusters exposed to the
+// user and clusters for metadata/housekeeping.
+//
+// We only support a small subset of QCOW2 functionality: writing, qcow2 (no v1
+// or v3) and no extensions. This keeps the codebase quite lean.
+//
+// QCOW2 images are made out of:
+//
+//  1. A file header (occupies first cluster);
+//  2. Optionally, a backing file name (right after header, within the first
+//     cluster);
+//  3. A refcount table (made out of two levels, L1 is continuous and pointed to
+//     by header, L2 is pointed to by L1, both must be cluster-aligned and occupy a
+//     multiple of clusters each). The refcount table is indexed by cluster address
+//     and returns whether a cluster is free or in use (by metadata or user data);
+//  4. A cluster mapping table (made out of two levels similarly to the refcount
+//     table, but the cluster mapping L1 table does not have to occupy an entire
+//     cluster as the file header contains information on how many entries are
+//     there). The cluster mapping table is indexed by user address and returns
+//     the file cluster address.
+//
+// Our generated files contain a header/backing file name, a refcount table which
+// contains enough entries to cover the metadata of the file itself, and a
+// minimum L1 table (enough to cover the whole user data address range, but with
+// no cluster mapped). The users of the image file will be able to use the
+// cluster mapping L1 table as is, and will likely need to remap the refcount
+// table to fit more file cluster addresses as the image gets written. This is
+// the same kind of layout qemu-img uses.
+//
+// Reference: https://github.com/qemu/qemu/blob/master/docs/interop/qcow2.txt
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"io"
+	"os"
+)
+
+// builder is the internal structure defining the requested parameters for image
+// generation.
+type builder struct {
+	// size of the image in bytes. Note that this is the size visible to the user,
+	// not the actual size in bytes of the container (which will be in most cases
+	// significantly smaller).
+	size uint64
+	// backingFile is, if non-zero, the backing file for a given qcow2 image. This
+	// makes the qcow2 image function as an overlay on top of the backing file (ie.
+	// offers snapshot functionality).
+	backingFile string
+	// clusterBits is the number of bits that a cluster has. This is usually 16, for
+	// 64KiB clusters.
+	clusterBits uint32
+}
+
+type qcow2Header struct {
+	Magic                 [4]byte
+	Version               uint32
+	BackingFileOffset     uint64
+	BackingFileSize       uint32
+	ClusterBits           uint32
+	Size                  uint64
+	CryptMethod           uint32
+	L1Size                uint32
+	L1TableOffset         uint64
+	RefcountTableOffset   uint64
+	RefcountTableClusters uint32
+	NumberSnapshots       uint32
+	SnapshotsOffset       uint64
+}
+
+// divRoundUp divides a by b and returns the result rounded up. This is useful to
+// answer the question 'how many b-sized elements do I need to contain a'.
+func divRoundUp(a, b uint64) uint64 {
+	res := a / b
+	if a%b != 0 {
+		res += 1
+	}
+	return res
+}
+
+// clusterSize returns the cluster size for this builder.
+func (b *builder) clusterSize() uint64 {
+	return uint64(1 << b.clusterBits)
+}
+
+// calculateMappingEntries returns the details of the 'cluster mapping'
+// structures (sometimes called 'L1'). This is calculated based on the cluster
+// size and target image size.
+//
+// The values returned are the count of L1 entries needed to cover the entirety
+// of the user data, and the size of such an entry within its table. The latter
+// is constant.
+func (b *builder) calculateMappingEntries() (mappingL1Count uint64, mappingL1EntrySize uint64) {
+	clusterSize := b.clusterSize()
+
+	mappingL2EntrySize := uint64(8)
+	// Each L2 table is always one cluster.
+	mappingL2EntryCount := clusterSize / mappingL2EntrySize
+	// How much user data each L2 table covers.
+	mappingL2Covering := mappingL2EntryCount * clusterSize
+	// How many L1 entries do we need.
+	mappingL1Count = divRoundUp(b.size, mappingL2Covering)
+	// Each L1 entry is always 64 bits.
+	mappingL1EntrySize = uint64(8)
+
+	return
+}
+
+// refcountTableProperties contains the properties of a L1 or L2 refcount table
+// calculated from the cluster size of a qcow2 image.
+type refcountTableProperties struct {
+	// bytesCoveredByCluster describes how many bytes of {meta,}data are addressed by
+	// a cluster full of metadata entries of this refcount table.
+	bytesCoveredByCluster uint64
+	// entriesPerCluster describes how many entries of this refcount table fit in a
+	// full cluster of metadata.
+	entriesPerCluster uint64
+}
+
+// calculateRefcountProperties returns recountTableProperties for L1 and L2
+// refcount tables for the qcow2 image being built. These are static properties
+// calculated based on the cluster size, not on the target size of the image.
+func (b *builder) calculateRefcountProperties() (l1 refcountTableProperties, l2 refcountTableProperties) {
+	clusterSize := b.clusterSize()
+
+	// Size of refcount l2 structure entry (16 bits):
+	refcountL2EntrySize := uint64(2)
+	// Number of entries in refcount l2 structure:
+	refcountL2EntryCount := clusterSize / refcountL2EntrySize
+	// Storage size covered by every refcount l2 structure (each entry points towards
+	// a cluster):
+	refcountL2Covering := refcountL2EntryCount * clusterSize
+
+	// Size of refcount l1 structure entry (64 bits):
+	refcountL1EntrySize := uint64(8)
+	// Number of entries in refcount l1 structure:
+	refcountL1EntryCount := clusterSize / refcountL1EntrySize
+	// Storage size covered by every refcount l1 structure:
+	refcountL1Covering := refcountL1EntryCount * refcountL2Covering
+
+	l1.bytesCoveredByCluster = refcountL1Covering
+	l1.entriesPerCluster = refcountL1EntryCount
+
+	l2.bytesCoveredByCluster = refcountL2Covering
+	l2.entriesPerCluster = refcountL2EntryCount
+	return
+}
+
+// build writes out a qcow2 image (based on the builder's configuration) to a
+// Writer.
+func (b *builder) build(w io.Writer) error {
+	clusterSize := b.clusterSize()
+	// Minimum supported by QEMU and this codebase.
+	if b.clusterSize() < 512 {
+		return fmt.Errorf("cluster size too small")
+	}
+
+	// Size of a serialized qcow2Header.
+	headerSize := 72
+	// And however long the backing file name is (we stuff it after the header,
+	// within the first cluster).
+	headerSize += len(b.backingFile)
+	// Make sure the above fits.
+	if uint64(headerSize) > clusterSize {
+		return fmt.Errorf("cluster size too small")
+	}
+
+	// Cluster mapping structures are just based on the backing storage size and
+	// don't need to cover metadata. This makes it easy to calculate:
+	mappingL1Count, mappingL1EntrySize := b.calculateMappingEntries()
+	mappingL1Clusters := divRoundUp(mappingL1Count*mappingL1EntrySize, clusterSize)
+
+	// For refcount structures we need enough to cover just the metadata of our
+	// image. Unfortunately that includes the refcount structures themselves. We
+	// choose a naïve iterative approach where we start out with zero metadata
+	// structures and keep adding more until we cover everything that's necessary.
+
+	// Coverage, in bytes, for an L1 and L2 refcount cluster full of entries.
+	refcountL1, refcountL2 := b.calculateRefcountProperties()
+
+	// How many clusters of metadata to put in the final image. We start at zero and
+	// iterate upwards.
+	refcountL1Clusters := uint64(0)
+	refcountL2Clusters := uint64(0)
+
+	// How many bytes the current metadata covers. This is also zero at first.
+	maximumL2Covering := refcountL2Clusters * refcountL2.bytesCoveredByCluster
+	maximumL1Covering := refcountL1Clusters * refcountL1.bytesCoveredByCluster
+
+	// The size of our metadata in bytes.
+	sizeMetadata := (1 + mappingL1Clusters + refcountL1Clusters + refcountL2Clusters) * clusterSize
+
+	// Keep adding L1/L2 metadata clusters until we can cover all of our own
+	// metadata. This is an iterative approach to the aforementioned problem. Even
+	// with large images (100TB) at the default cluster size this loop only takes one
+	// iteration to stabilize. Smaller cluster sizes might take longer.
+	for {
+		changed := false
+
+		// Keep adding L2 metadata clusters until we cover all the metadata we know so far.
+		for maximumL2Covering < sizeMetadata {
+			refcountL2Clusters += 1
+			// Recalculate.
+			maximumL2Covering = refcountL2Clusters * refcountL2.bytesCoveredByCluster
+			sizeMetadata += clusterSize
+			changed = true
+		}
+		// Keep adding L1 metadata clusters until we cover all the metadata we know so far.
+		for maximumL1Covering < sizeMetadata {
+			refcountL1Clusters += 1
+			// Recalculate.
+			maximumL1Covering = refcountL1Clusters * refcountL1.bytesCoveredByCluster
+			sizeMetadata += clusterSize
+			changed = true
+		}
+
+		// If no changes were introduced, it means we stabilized at some l1/l2 metadata
+		// cluster count. Exit.
+		if !changed {
+			break
+		}
+	}
+
+	// Now that we have calculated everything, start writing out the header.
+	h := qcow2Header{
+		Version:     2,
+		ClusterBits: b.clusterBits,
+		Size:        b.size,
+		// Unencrypted.
+		CryptMethod: 0,
+
+		L1Size: uint32(mappingL1Count),
+		// L1 table starts after the header and refcount tables.
+		L1TableOffset: (1 + refcountL1Clusters + refcountL2Clusters) * clusterSize,
+
+		// Refcount table starts right after header cluster.
+		RefcountTableOffset:   1 * clusterSize,
+		RefcountTableClusters: uint32(refcountL1Clusters),
+
+		// No snapshots.
+		NumberSnapshots: 0,
+		SnapshotsOffset: 0,
+	}
+	copy(h.Magic[:], "QFI\xfb")
+	if b.backingFile != "" {
+		// Backing file path is right after header.
+		h.BackingFileOffset = 72
+		h.BackingFileSize = uint32(len(b.backingFile))
+	}
+	if err := binary.Write(w, binary.BigEndian, &h); err != nil {
+		return err
+	}
+
+	// Write out backing file name right after the header.
+	if b.backingFile != "" {
+		if _, err := w.Write([]byte(b.backingFile)); err != nil {
+			return err
+		}
+	}
+
+	// Align to next cluster with zeroes.
+	if _, err := w.Write(bytes.Repeat([]byte{0}, int(clusterSize)-headerSize)); err != nil {
+		return err
+	}
+
+	// Write L1 refcount table.
+	for i := uint64(0); i < refcountL1Clusters; i++ {
+		for j := uint64(0); j < refcountL1.entriesPerCluster; j++ {
+			// Index of corresponding l2 table.
+			ix := j + i*refcountL1.entriesPerCluster
+
+			var data uint64
+			if ix < refcountL2Clusters {
+				// If this is an allocated L2 table, write its offset. Otherwise write zero.
+				data = (1 + refcountL1Clusters + ix) * clusterSize
+			}
+			if err := binary.Write(w, binary.BigEndian, &data); err != nil {
+				return err
+			}
+		}
+	}
+	// Write L2 refcount table.
+	for i := uint64(0); i < refcountL2Clusters; i++ {
+		for j := uint64(0); j < refcountL2.entriesPerCluster; j++ {
+			// Index of corresponding cluster.
+			ix := j + i*refcountL2.entriesPerCluster
+
+			var data uint16
+			if ix < (1 + refcountL1Clusters + refcountL2Clusters + mappingL1Clusters) {
+				// If this is an in-use cluster, mark it as such.
+				data = 1
+			}
+			if err := binary.Write(w, binary.BigEndian, &data); err != nil {
+				return err
+			}
+		}
+	}
+	// Write L1 mapping table.
+	for i := uint64(0); i < mappingL1Count; i++ {
+		// No user data yet allocated.
+		data := uint64(0)
+		if err := binary.Write(w, binary.BigEndian, &data); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// GenerateOption structures are passed to the Generate call.
+type GenerateOption struct {
+	fileSize        *uint64
+	backingFilePath *string
+}
+
+// GenerateWithFileSize will generate an image with a given user size in bytes.
+func GenerateWithFileSize(fileSize uint64) *GenerateOption {
+	return &GenerateOption{
+		fileSize: &fileSize,
+	}
+}
+
+// GenerateWithBackingFile will generate an image size backed by a given file
+// path. If GenerateWithFileSize is not given, the path will also be checked at
+// generation time and used to determine the size of the user image.
+func GenerateWithBackingFile(path string) *GenerateOption {
+	return &GenerateOption{
+		backingFilePath: &path,
+	}
+}
+
+// Generate builds a QCOW2 image and writes it to the given Writer.
+//
+// At least one of GenerateWithFileSize or GenerateWithBackingFile must be given.
+func Generate(w io.Writer, opts ...*GenerateOption) error {
+	var size uint64
+	var haveSize bool
+
+	var backingFile string
+	var haveBackingFile bool
+
+	for _, opt := range opts {
+		if opt.fileSize != nil {
+			if haveSize {
+				return fmt.Errorf("cannot specify GenerateWithFileSize twice")
+			}
+			haveSize = true
+
+			size = *opt.fileSize
+		}
+		if opt.backingFilePath != nil {
+			if haveBackingFile {
+				return fmt.Errorf("cannot specify GenerateWithBackingFile twice")
+			}
+			haveBackingFile = true
+
+			backingFile = *opt.backingFilePath
+		}
+	}
+
+	if !haveSize && !haveBackingFile {
+		return fmt.Errorf("must be called with GenerateWithFileSize or GenerateWithBackingFileAndSize")
+	}
+
+	if haveBackingFile && !haveSize {
+		st, err := os.Stat(backingFile)
+		if err != nil {
+			return fmt.Errorf("cannot read backing file: %w", err)
+		}
+		size = uint64(st.Size())
+	}
+
+	b := builder{
+		size:        size,
+		backingFile: backingFile,
+		// 64k clusters (standard generated by qemu-img).
+		clusterBits: uint32(0x10),
+	}
+	return b.build(w)
+}