Add EROFS library

This adds a library to write EROFS filesystems. It supports most of the non-deprecated features the
filesystem supports other than extended inodes (which have no benefits for most use cases where EROFS would be
appropriate). EROFS's variable-length extent compression is partially implemented but it requires an LZ4
compressor with support for fixed-size output which Go's https://github.com/pierrec/lz4 doesn't have. This means
that VLE compression is currently not wired up.

This will be used later as a replacement for our current initramfs-based root filesystem.

Test Plan: Has both integration and some unit tests. Confirmed working for our whole rootfs.

X-Origin-Diff: phab/D692
GitOrigin-RevId: 8c52b45ea05c617c80047e99c04c2b63e1b60c7c
diff --git a/metropolis/pkg/erofs/erofs.go b/metropolis/pkg/erofs/erofs.go
new file mode 100644
index 0000000..af6ad1c
--- /dev/null
+++ b/metropolis/pkg/erofs/erofs.go
@@ -0,0 +1,264 @@
+// Copyright 2020 The Monogon Project Authors.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package erofs
+
+import (
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+	"path"
+
+	"golang.org/x/sys/unix"
+)
+
+// Writer writes a new EROFS filesystem.
+type Writer struct {
+	w io.WriteSeeker
+	// fixDirectoryEntry contains for each referenced path where it is referenced from. Since self-references
+	// are required anyways (for the "." and ".." entries) we let the user write files in any order and just
+	// point the directory entries to the right target nid and file type on Close().
+	fixDirectoryEntry map[string][]direntFixupLocation
+	pathInodeMeta     map[string]*uncompressedInodeMeta
+	// legacyInodeIndex stores the next legacy (32-bit) inode to be allocated. 64 bit inodes are automatically
+	// calculated by EROFS on mount.
+	legacyInodeIndex    uint32
+	blockAllocatorIndex uint32
+	metadataBlocksFree  metadataBlocksMeta
+}
+
+// NewWriter creates a new EROFS filesystem writer. The given WriteSeeker needs to be at the start.
+func NewWriter(w io.WriteSeeker) (*Writer, error) {
+	erofsWriter := &Writer{
+		w:                 w,
+		fixDirectoryEntry: make(map[string][]direntFixupLocation),
+		pathInodeMeta:     make(map[string]*uncompressedInodeMeta),
+	}
+	_, err := erofsWriter.allocateMetadata(1024+binary.Size(&superblock{}), 0)
+	if err != nil {
+		return nil, fmt.Errorf("cannot allocate first metadata block: %w", err)
+	}
+	if _, err := erofsWriter.w.Write(make([]byte, 1024)); err != nil { // Padding
+		return nil, fmt.Errorf("failed to write initial padding: %w", err)
+	}
+	if err := binary.Write(erofsWriter.w, binary.LittleEndian, &superblock{
+		Magic:          Magic,
+		BlockSizeBits:  blockSizeBits,
+		RootNodeNumber: 36, // 1024 (padding) + 128 (superblock) / 32, not eligible for fixup as different int size
+	}); err != nil {
+		return nil, fmt.Errorf("failed to write superblock: %w", err)
+	}
+	return erofsWriter, nil
+}
+
+// allocateMetadata allocates metadata space of size bytes with a given alignment and seeks to the first byte of the
+// newly-allocated metadata space. It also returns the position of that first byte.
+func (w *Writer) allocateMetadata(size int, alignment uint16) (int64, error) {
+	if size > BlockSize {
+		panic("cannot allocate a metadata object bigger than BlockSize bytes")
+	}
+	sizeU16 := uint16(size)
+	pos, ok := w.metadataBlocksFree.findBlock(sizeU16, 32)
+	if !ok {
+		blockNumber, err := w.allocateBlocks(1)
+		if err != nil {
+			return 0, fmt.Errorf("failed to allocate additional metadata space: %w", err)
+		}
+		w.metadataBlocksFree = append(w.metadataBlocksFree, metadataBlockMeta{blockNumber: blockNumber, freeBytes: BlockSize - sizeU16})
+		if _, err := w.w.Write(make([]byte, BlockSize)); err != nil {
+			return 0, fmt.Errorf("failed to write metadata: %w", err)
+		}
+		pos = int64(blockNumber) * BlockSize // Always aligned to BlockSize, bigger alignments are unsupported anyways
+	}
+	if _, err := w.w.Seek(pos, io.SeekStart); err != nil {
+		return 0, fmt.Errorf("cannot seek to existing metadata nid, likely misaligned meta write")
+	}
+	return pos, nil
+}
+
+// allocateBlocks allocates n new BlockSize-sized block and seeks to the beginning of the first newly-allocated block.
+// It also returns the first newly-allocated block number.  The caller is expected to write these blocks completely
+// before calling allocateBlocks again.
+func (w *Writer) allocateBlocks(n uint32) (uint32, error) {
+	if _, err := w.w.Seek(int64(w.blockAllocatorIndex)*BlockSize, io.SeekStart); err != nil {
+		return 0, fmt.Errorf("cannot seek to end of last block, check write alignment: %w", err)
+	}
+	firstBlock := w.blockAllocatorIndex
+	w.blockAllocatorIndex += n
+	return firstBlock, nil
+}
+
+func (w *Writer) create(pathname string, inode Inode) *uncompressedInodeWriter {
+	i := &uncompressedInodeWriter{
+		writer:            w,
+		inode:             *inode.inode(),
+		legacyInodeNumber: w.legacyInodeIndex,
+		pathname:          path.Clean(pathname),
+	}
+	w.legacyInodeIndex++
+	return i
+}
+
+// CreateFile adds a new file to the EROFS. It returns a WriteCloser to which the file contents should be written and
+// which then needs to be closed. The last writer obtained by calling CreateFile() needs to be closed first before
+// opening a new one. The given pathname needs to be referenced by a directory created using Create(), otherwise it will
+// not be accessible.
+func (w *Writer) CreateFile(pathname string, meta *FileMeta) io.WriteCloser {
+	return w.create(pathname, meta)
+}
+
+// Create adds a new non-file inode to the EROFS. This includes directories, device nodes, symlinks and FIFOs.
+// The first call to Create() needs to be with pathname "." and a directory inode.
+// The given pathname needs to be referenced by a directory, otherwise it will not be accessible (with the exception of
+// the directory ".").
+func (w *Writer) Create(pathname string, inode Inode) error {
+	iw := w.create(pathname, inode)
+	switch i := inode.(type) {
+	case *Directory:
+		if err := i.writeTo(iw); err != nil {
+			return fmt.Errorf("failed to write directory contents: %w", err)
+		}
+	case *SymbolicLink:
+		if err := i.writeTo(iw); err != nil {
+			return fmt.Errorf("failed to write symbolic link contents: %w", err)
+		}
+	}
+	return iw.Close()
+}
+
+// Close finishes writing an EROFS filesystem. Errors by this function need to be handled as they indicate if the
+// written filesystem is consistent (i.e. there are no directory entries pointing to nonexistent inodes).
+func (w *Writer) Close() error {
+	for targetPath, entries := range w.fixDirectoryEntry {
+		for _, entry := range entries {
+			targetMeta, ok := w.pathInodeMeta[targetPath]
+			if !ok {
+				return fmt.Errorf("failed to link filesystem tree: dangling reference to %v", targetPath)
+			}
+			if err := direntFixup(w.pathInodeMeta[entry.path], int64(entry.entryIndex), targetMeta); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+// uncompressedInodeMeta tracks enough metadata about a written inode to be able to point dirents to it and to provide
+// a WriteSeeker into the inode itself.
+type uncompressedInodeMeta struct {
+	nid   uint64
+	ftype uint8
+
+	// Physical placement metdata
+	blockStart   int64
+	blockLength  int64
+	inlineStart  int64
+	inlineLength int64
+
+	writer        *Writer
+	currentOffset int64
+}
+
+func (a *uncompressedInodeMeta) Seek(offset int64, whence int) (int64, error) {
+	switch whence {
+	case io.SeekCurrent:
+		break
+	case io.SeekStart:
+		a.currentOffset = 0
+	case io.SeekEnd:
+		a.currentOffset = a.blockLength + a.inlineLength
+	}
+	a.currentOffset += offset
+	return a.currentOffset, nil
+}
+
+func (a *uncompressedInodeMeta) Write(p []byte) (int, error) {
+	if a.currentOffset < a.blockLength {
+		// TODO(lorenz): Handle the special case where a directory inode is spread across multiple
+		// blocks (depending on other factors this occurs around ~200 direct children).
+		return 0, errors.New("relocating dirents in multi-block directory inodes is unimplemented")
+	}
+	if _, err := a.writer.w.Seek(a.inlineStart+a.currentOffset, io.SeekStart); err != nil {
+		return 0, err
+	}
+	a.currentOffset += int64(len(p))
+	return a.writer.w.Write(p)
+}
+
+type direntFixupLocation struct {
+	path       string
+	entryIndex uint16
+}
+
+// direntFixup overrides nid and file type from the path the dirent is pointing to. The given iw is expected to be at
+// the start of the dirent inode to be fixed up.
+func direntFixup(iw io.WriteSeeker, entryIndex int64, meta *uncompressedInodeMeta) error {
+	if _, err := iw.Seek(entryIndex*12, io.SeekStart); err != nil {
+		return fmt.Errorf("failed to seek to dirent: %w", err)
+	}
+	if err := binary.Write(iw, binary.LittleEndian, meta.nid); err != nil {
+		return fmt.Errorf("failed to write nid: %w", err)
+	}
+	if _, err := iw.Seek(2, io.SeekCurrent); err != nil { // Skip NameStartOffset
+		return fmt.Errorf("failed to seek to dirent: %w", err)
+	}
+	if err := binary.Write(iw, binary.LittleEndian, meta.ftype); err != nil {
+		return fmt.Errorf("failed to write ftype: %w", err)
+	}
+	return nil
+}
+
+type metadataBlockMeta struct {
+	blockNumber uint32
+	freeBytes   uint16
+}
+
+// metadataBlocksMeta contains metadata about all metadata blocks, most importantly the amount of free
+// bytes in each block. This is not a map for reproducibility (map ordering).
+type metadataBlocksMeta []metadataBlockMeta
+
+// findBlock returns the absolute position where `size` bytes with the specified alignment can still fit.
+// If there is not enough space in any metadata block it returns false as the second return value.
+func (m metadataBlocksMeta) findBlock(size uint16, alignment uint16) (int64, bool) {
+	for i, blockMeta := range m {
+		freeBytesAligned := blockMeta.freeBytes - (blockMeta.freeBytes % alignment)
+		if freeBytesAligned > size {
+			m[i] = metadataBlockMeta{
+				blockNumber: blockMeta.blockNumber,
+				freeBytes:   freeBytesAligned - size,
+			}
+			pos := int64(blockMeta.blockNumber+1)*BlockSize - int64(freeBytesAligned)
+			return pos, true
+		}
+	}
+	return 0, false
+}
+
+var unixModeToFTMap = map[uint16]uint8{
+	unix.S_IFREG:  fileTypeRegularFile,
+	unix.S_IFDIR:  fileTypeDirectory,
+	unix.S_IFCHR:  fileTypeCharacterDevice,
+	unix.S_IFBLK:  fileTypeBlockDevice,
+	unix.S_IFIFO:  fileTypeFIFO,
+	unix.S_IFSOCK: fileTypeSocket,
+	unix.S_IFLNK:  fileTypeSymbolicLink,
+}
+
+// unixModeToFT maps a Unix file type to an EROFS file type.
+func unixModeToFT(mode uint16) uint8 {
+	return unixModeToFTMap[mode&unix.S_IFMT]
+}