blob: df7825f5db2cb6940c37ee7cd351fabbb53f1066 [file] [log] [blame]
// Package qcow2 implements generating QEMU copy-on-write image files.
package qcow2
// QCOW2 is a peculiar file format, as it's cluster-based (read: block-based) and
// that blocks are a concept both exposed to the guest (user data storage is
// expressed in clusters) and used to describe the layout of metadata structures.
//
// Most notably, the 'refcount' (reference count) mechanism for clusters is not
// only for user-visible clusters but also for clusters within the image that
// contain metadata. This means that users of the format can use the same
// allocation system for clusters both for allocating clusters exposed to the
// user and clusters for metadata/housekeeping.
//
// We only support a small subset of QCOW2 functionality: writing, qcow2 (no v1
// or v3) and no extensions. This keeps the codebase quite lean.
//
// QCOW2 images are made out of:
//
// 1. A file header (occupies first cluster);
// 2. Optionally, a backing file name (right after header, within the first
// cluster);
// 3. A refcount table (made out of two levels, L1 is continuous and pointed to
// by header, L2 is pointed to by L1, both must be cluster-aligned and occupy a
// multiple of clusters each). The refcount table is indexed by cluster address
// and returns whether a cluster is free or in use (by metadata or user data);
// 4. A cluster mapping table (made out of two levels similarly to the refcount
// table, but the cluster mapping L1 table does not have to occupy an entire
// cluster as the file header contains information on how many entries are
// there). The cluster mapping table is indexed by user address and returns
// the file cluster address.
//
// Our generated files contain a header/backing file name, a refcount table which
// contains enough entries to cover the metadata of the file itself, and a
// minimum L1 table (enough to cover the whole user data address range, but with
// no cluster mapped). The users of the image file will be able to use the
// cluster mapping L1 table as is, and will likely need to remap the refcount
// table to fit more file cluster addresses as the image gets written. This is
// the same kind of layout qemu-img uses.
//
// Reference: https://github.com/qemu/qemu/blob/master/docs/interop/qcow2.txt
import (
"bytes"
"encoding/binary"
"fmt"
"io"
"os"
)
// builder is the internal structure defining the requested parameters for image
// generation.
type builder struct {
// size of the image in bytes. Note that this is the size visible to the user,
// not the actual size in bytes of the container (which will be in most cases
// significantly smaller).
size uint64
// backingFile is, if non-zero, the backing file for a given qcow2 image. This
// makes the qcow2 image function as an overlay on top of the backing file (ie.
// offers snapshot functionality).
backingFile string
// clusterBits is the number of bits that a cluster has. This is usually 16, for
// 64KiB clusters.
clusterBits uint32
}
type qcow2Header struct {
Magic [4]byte
Version uint32
BackingFileOffset uint64
BackingFileSize uint32
ClusterBits uint32
Size uint64
CryptMethod uint32
L1Size uint32
L1TableOffset uint64
RefcountTableOffset uint64
RefcountTableClusters uint32
NumberSnapshots uint32
SnapshotsOffset uint64
}
// divRoundUp divides a by b and returns the result rounded up. This is useful to
// answer the question 'how many b-sized elements do I need to contain a'.
func divRoundUp(a, b uint64) uint64 {
res := a / b
if a%b != 0 {
res += 1
}
return res
}
// clusterSize returns the cluster size for this builder.
func (b *builder) clusterSize() uint64 {
return uint64(1 << b.clusterBits)
}
// calculateMappingEntries returns the details of the 'cluster mapping'
// structures (sometimes called 'L1'). This is calculated based on the cluster
// size and target image size.
//
// The values returned are the count of L1 entries needed to cover the entirety
// of the user data, and the size of such an entry within its table. The latter
// is constant.
func (b *builder) calculateMappingEntries() (mappingL1Count uint64, mappingL1EntrySize uint64) {
clusterSize := b.clusterSize()
mappingL2EntrySize := uint64(8)
// Each L2 table is always one cluster.
mappingL2EntryCount := clusterSize / mappingL2EntrySize
// How much user data each L2 table covers.
mappingL2Covering := mappingL2EntryCount * clusterSize
// How many L1 entries do we need.
mappingL1Count = divRoundUp(b.size, mappingL2Covering)
// Each L1 entry is always 64 bits.
mappingL1EntrySize = uint64(8)
return
}
// refcountTableProperties contains the properties of a L1 or L2 refcount table
// calculated from the cluster size of a qcow2 image.
type refcountTableProperties struct {
// bytesCoveredByCluster describes how many bytes of {meta,}data are addressed by
// a cluster full of metadata entries of this refcount table.
bytesCoveredByCluster uint64
// entriesPerCluster describes how many entries of this refcount table fit in a
// full cluster of metadata.
entriesPerCluster uint64
}
// calculateRefcountProperties returns recountTableProperties for L1 and L2
// refcount tables for the qcow2 image being built. These are static properties
// calculated based on the cluster size, not on the target size of the image.
func (b *builder) calculateRefcountProperties() (l1 refcountTableProperties, l2 refcountTableProperties) {
clusterSize := b.clusterSize()
// Size of refcount l2 structure entry (16 bits):
refcountL2EntrySize := uint64(2)
// Number of entries in refcount l2 structure:
refcountL2EntryCount := clusterSize / refcountL2EntrySize
// Storage size covered by every refcount l2 structure (each entry points towards
// a cluster):
refcountL2Covering := refcountL2EntryCount * clusterSize
// Size of refcount l1 structure entry (64 bits):
refcountL1EntrySize := uint64(8)
// Number of entries in refcount l1 structure:
refcountL1EntryCount := clusterSize / refcountL1EntrySize
// Storage size covered by every refcount l1 structure:
refcountL1Covering := refcountL1EntryCount * refcountL2Covering
l1.bytesCoveredByCluster = refcountL1Covering
l1.entriesPerCluster = refcountL1EntryCount
l2.bytesCoveredByCluster = refcountL2Covering
l2.entriesPerCluster = refcountL2EntryCount
return
}
// build writes out a qcow2 image (based on the builder's configuration) to a
// Writer.
func (b *builder) build(w io.Writer) error {
clusterSize := b.clusterSize()
// Minimum supported by QEMU and this codebase.
if b.clusterSize() < 512 {
return fmt.Errorf("cluster size too small")
}
// Size of a serialized qcow2Header.
headerSize := 72
// And however long the backing file name is (we stuff it after the header,
// within the first cluster).
headerSize += len(b.backingFile)
// Make sure the above fits.
if uint64(headerSize) > clusterSize {
return fmt.Errorf("cluster size too small")
}
// Cluster mapping structures are just based on the backing storage size and
// don't need to cover metadata. This makes it easy to calculate:
mappingL1Count, mappingL1EntrySize := b.calculateMappingEntries()
mappingL1Clusters := divRoundUp(mappingL1Count*mappingL1EntrySize, clusterSize)
// For refcount structures we need enough to cover just the metadata of our
// image. Unfortunately that includes the refcount structures themselves. We
// choose a naïve iterative approach where we start out with zero metadata
// structures and keep adding more until we cover everything that's necessary.
// Coverage, in bytes, for an L1 and L2 refcount cluster full of entries.
refcountL1, refcountL2 := b.calculateRefcountProperties()
// How many clusters of metadata to put in the final image. We start at zero and
// iterate upwards.
refcountL1Clusters := uint64(0)
refcountL2Clusters := uint64(0)
// How many bytes the current metadata covers. This is also zero at first.
maximumL2Covering := refcountL2Clusters * refcountL2.bytesCoveredByCluster
maximumL1Covering := refcountL1Clusters * refcountL1.bytesCoveredByCluster
// The size of our metadata in bytes.
sizeMetadata := (1 + mappingL1Clusters + refcountL1Clusters + refcountL2Clusters) * clusterSize
// Keep adding L1/L2 metadata clusters until we can cover all of our own
// metadata. This is an iterative approach to the aforementioned problem. Even
// with large images (100TB) at the default cluster size this loop only takes one
// iteration to stabilize. Smaller cluster sizes might take longer.
for {
changed := false
// Keep adding L2 metadata clusters until we cover all the metadata we know so far.
for maximumL2Covering < sizeMetadata {
refcountL2Clusters += 1
// Recalculate.
maximumL2Covering = refcountL2Clusters * refcountL2.bytesCoveredByCluster
sizeMetadata += clusterSize
changed = true
}
// Keep adding L1 metadata clusters until we cover all the metadata we know so far.
for maximumL1Covering < sizeMetadata {
refcountL1Clusters += 1
// Recalculate.
maximumL1Covering = refcountL1Clusters * refcountL1.bytesCoveredByCluster
sizeMetadata += clusterSize
changed = true
}
// If no changes were introduced, it means we stabilized at some l1/l2 metadata
// cluster count. Exit.
if !changed {
break
}
}
// Now that we have calculated everything, start writing out the header.
h := qcow2Header{
Version: 2,
ClusterBits: b.clusterBits,
Size: b.size,
// Unencrypted.
CryptMethod: 0,
L1Size: uint32(mappingL1Count),
// L1 table starts after the header and refcount tables.
L1TableOffset: (1 + refcountL1Clusters + refcountL2Clusters) * clusterSize,
// Refcount table starts right after header cluster.
RefcountTableOffset: 1 * clusterSize,
RefcountTableClusters: uint32(refcountL1Clusters),
// No snapshots.
NumberSnapshots: 0,
SnapshotsOffset: 0,
}
copy(h.Magic[:], "QFI\xfb")
if b.backingFile != "" {
// Backing file path is right after header.
h.BackingFileOffset = 72
h.BackingFileSize = uint32(len(b.backingFile))
}
if err := binary.Write(w, binary.BigEndian, &h); err != nil {
return err
}
// Write out backing file name right after the header.
if b.backingFile != "" {
if _, err := w.Write([]byte(b.backingFile)); err != nil {
return err
}
}
// Align to next cluster with zeroes.
if _, err := w.Write(bytes.Repeat([]byte{0}, int(clusterSize)-headerSize)); err != nil {
return err
}
// Write L1 refcount table.
for i := uint64(0); i < refcountL1Clusters; i++ {
for j := uint64(0); j < refcountL1.entriesPerCluster; j++ {
// Index of corresponding l2 table.
ix := j + i*refcountL1.entriesPerCluster
var data uint64
if ix < refcountL2Clusters {
// If this is an allocated L2 table, write its offset. Otherwise write zero.
data = (1 + refcountL1Clusters + ix) * clusterSize
}
if err := binary.Write(w, binary.BigEndian, &data); err != nil {
return err
}
}
}
// Write L2 refcount table.
for i := uint64(0); i < refcountL2Clusters; i++ {
for j := uint64(0); j < refcountL2.entriesPerCluster; j++ {
// Index of corresponding cluster.
ix := j + i*refcountL2.entriesPerCluster
var data uint16
if ix < (1 + refcountL1Clusters + refcountL2Clusters + mappingL1Clusters) {
// If this is an in-use cluster, mark it as such.
data = 1
}
if err := binary.Write(w, binary.BigEndian, &data); err != nil {
return err
}
}
}
// Write L1 mapping table.
for i := uint64(0); i < mappingL1Count; i++ {
// No user data yet allocated.
data := uint64(0)
if err := binary.Write(w, binary.BigEndian, &data); err != nil {
return err
}
}
return nil
}
// GenerateOption structures are passed to the Generate call.
type GenerateOption struct {
fileSize *uint64
backingFilePath *string
}
// GenerateWithFileSize will generate an image with a given user size in bytes.
func GenerateWithFileSize(fileSize uint64) *GenerateOption {
return &GenerateOption{
fileSize: &fileSize,
}
}
// GenerateWithBackingFile will generate an image size backed by a given file
// path. If GenerateWithFileSize is not given, the path will also be checked at
// generation time and used to determine the size of the user image.
func GenerateWithBackingFile(path string) *GenerateOption {
return &GenerateOption{
backingFilePath: &path,
}
}
// Generate builds a QCOW2 image and writes it to the given Writer.
//
// At least one of GenerateWithFileSize or GenerateWithBackingFile must be given.
func Generate(w io.Writer, opts ...*GenerateOption) error {
var size uint64
var haveSize bool
var backingFile string
var haveBackingFile bool
for _, opt := range opts {
if opt.fileSize != nil {
if haveSize {
return fmt.Errorf("cannot specify GenerateWithFileSize twice")
}
haveSize = true
size = *opt.fileSize
}
if opt.backingFilePath != nil {
if haveBackingFile {
return fmt.Errorf("cannot specify GenerateWithBackingFile twice")
}
haveBackingFile = true
backingFile = *opt.backingFilePath
}
}
if !haveSize && !haveBackingFile {
return fmt.Errorf("must be called with GenerateWithFileSize or GenerateWithBackingFileAndSize")
}
if haveBackingFile && !haveSize {
st, err := os.Stat(backingFile)
if err != nil {
return fmt.Errorf("cannot read backing file: %w", err)
}
size = uint64(st.Size())
}
b := builder{
size: size,
backingFile: backingFile,
// 64k clusters (standard generated by qemu-img).
clusterBits: uint32(0x10),
}
return b.build(w)
}