Add EROFS library

This adds a library for writing EROFS filesystems. It supports most of the filesystem's non-deprecated
features, with the exception of extended inodes (which have no benefit for most use cases where EROFS is
appropriate). EROFS's variable-length extent (VLE) compression is partially implemented, but it requires an
LZ4 compressor with support for fixed-size output, which Go's https://github.com/pierrec/lz4 does not provide.
VLE compression is therefore currently not wired up.

This will be used later as a replacement for our current initramfs-based root filesystem.

Test Plan: Has both integration tests and some unit tests. Confirmed working for our whole rootfs.

X-Origin-Diff: phab/D692
GitOrigin-RevId: 8c52b45ea05c617c80047e99c04c2b63e1b60c7c
diff --git a/metropolis/pkg/erofs/BUILD.bazel b/metropolis/pkg/erofs/BUILD.bazel
new file mode 100644
index 0000000..7014e87
--- /dev/null
+++ b/metropolis/pkg/erofs/BUILD.bazel
@@ -0,0 +1,39 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//metropolis/test/ktest:ktest.bzl", "ktest")
+
+go_library(
+    name = "go_default_library",
+    srcs = [
+        "compression.go",
+        "defs.go",
+        "erofs.go",
+        "inode_types.go",
+        "uncompressed_inode_writer.go",
+    ],
+    importpath = "source.monogon.dev/metropolis/pkg/erofs",
+    visibility = ["//visibility:public"],
+    deps = ["@org_golang_x_sys//unix:go_default_library"],
+)
+
+go_test(
+    name = "go_default_test",
+    srcs = [
+        "compression_test.go",
+        "defs_test.go",
+        "erofs_test.go",
+    ],
+    embed = [":go_default_library"],
+    pure = "on",  # keep
+    deps = [
+        "@com_github_stretchr_testify//assert:go_default_library",
+        "@com_github_stretchr_testify//require:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+ktest(
+    cmdline = "ramdisk_size=128",
+    initramfs_extra = "",
+    tester = ":go_default_test",
+    deps = [],
+)
diff --git a/metropolis/pkg/erofs/compression.go b/metropolis/pkg/erofs/compression.go
new file mode 100644
index 0000000..58b2f4b
--- /dev/null
+++ b/metropolis/pkg/erofs/compression.go
@@ -0,0 +1,59 @@
+// Copyright 2020 The Monogon Project Authors.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package erofs
+
+// This file contains compression-related functions.
+// TODO(lorenz): Fully implement compression. These are currently unused.
+
+import "encoding/binary"
+
+// mapHeader is a legacy but still-used advisory structure at the start of a compressed VLE block. It contains constant
+// values as annotated.
+type mapHeader struct {
+	Reserved      uint32 // 0
+	Advise        uint16 // 1
+	AlgorithmType uint8  // 0
+	ClusterBits   uint8  // 0
+}
+
+// encodeSmallVLEBlock encodes two VLE extents into an 8-byte block.
+func encodeSmallVLEBlock(vals [2]uint16, blkaddr uint32) [8]byte {
+	var out [8]byte
+	binary.LittleEndian.PutUint16(out[0:2], vals[0])
+	binary.LittleEndian.PutUint16(out[2:4], vals[1])
+	binary.LittleEndian.PutUint32(out[4:8], blkaddr)
+	return out
+}
+
+// encodeBigVLEBlock encodes 16 VLE extents into a 32-byte block.
+func encodeBigVLEBlock(vals [16]uint16, blkaddr uint32) [32]byte {
+	var out [32]byte
+	for i, val := range vals {
+		if val >= 1<<14 {
+			panic("value does not fit into 14 bits, cannot encode")
+		}
+		// Write packed 14-bit unsigned integers.
+		pos := i * 14
+		bitStartPos := pos % 8
+		byteStartPos := pos / 8
+		out[byteStartPos] = out[byteStartPos]&((1<<bitStartPos)-1) | uint8(val<<bitStartPos)
+		out[byteStartPos+1] = uint8(val >> (8 - bitStartPos))
+		out[byteStartPos+2] = uint8(val >> (16 - bitStartPos))
+	}
+	binary.LittleEndian.PutUint32(out[28:32], blkaddr)
+	return out
+}
diff --git a/metropolis/pkg/erofs/compression_test.go b/metropolis/pkg/erofs/compression_test.go
new file mode 100644
index 0000000..8d5d656
--- /dev/null
+++ b/metropolis/pkg/erofs/compression_test.go
@@ -0,0 +1,91 @@
+// Copyright 2020 The Monogon Project Authors.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package erofs
+
+import (
+	"reflect"
+	"testing"
+)
+
+func TestEncodeSmallVLEBlock(t *testing.T) {
+	type args struct {
+		vals    [2]uint16
+		blkaddr uint32
+	}
+	tests := []struct {
+		name string
+		args args
+		want [8]byte
+	}{
+		{
+			name: "Reference",
+			args: args{vals: [2]uint16{vleClusterTypeHead | 1527, vleClusterTypeNonhead | 1}, blkaddr: 1},
+			want: [8]byte{0xf7, 0x15, 0x01, 0x20, 0x01, 0x00, 0x00, 0x00},
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := encodeSmallVLEBlock(tt.args.vals, tt.args.blkaddr); !reflect.DeepEqual(got, tt.want) {
+				t.Errorf("encodeSmallVLEBlock() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
+
+func TestEncodeBigVLEBlock(t *testing.T) {
+	type args struct {
+		vals    [16]uint16
+		blkaddr uint32
+	}
+	tests := []struct {
+		name string
+		args args
+		want [32]byte
+	}{
+		{
+			name: "Reference",
+			args: args{
+				vals: [16]uint16{
+					vleClusterTypeNonhead | 2,
+					vleClusterTypeHead | 1460,
+					vleClusterTypeNonhead | 1,
+					vleClusterTypeNonhead | 2,
+					vleClusterTypeHead | 2751,
+					vleClusterTypeNonhead | 1,
+					vleClusterTypeNonhead | 2,
+					vleClusterTypeHead | 940,
+					vleClusterTypeNonhead | 1,
+					vleClusterTypeHead | 3142,
+					vleClusterTypeNonhead | 1,
+					vleClusterTypeNonhead | 2,
+					vleClusterTypeHead | 1750,
+					vleClusterTypeNonhead | 1,
+					vleClusterTypeNonhead | 2,
+					vleClusterTypeHead | 683,
+				},
+				blkaddr: 3,
+			},
+			want: [32]byte{0x02, 0x20, 0x6d, 0x15, 0x00, 0x0a, 0x80, 0xbf, 0x5a, 0x00, 0x28, 0x00, 0xb2, 0x4e, 0x01, 0xa0, 0x11, 0x17, 0x00, 0x0a, 0x80, 0xd6, 0x56, 0x00, 0x28, 0x00, 0xae, 0x4a, 0x03, 0x00, 0x00, 0x00}},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := encodeBigVLEBlock(tt.args.vals, tt.args.blkaddr); !reflect.DeepEqual(got, tt.want) {
+				t.Errorf("encodeBigVLEBlock() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
diff --git a/metropolis/pkg/erofs/defs.go b/metropolis/pkg/erofs/defs.go
new file mode 100644
index 0000000..b547867
--- /dev/null
+++ b/metropolis/pkg/erofs/defs.go
@@ -0,0 +1,97 @@
+// Copyright 2020 The Monogon Project Authors.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package erofs
+
+// This file contains definitions coming from the in-kernel implementation of the EROFS filesystem.
+// All definitions come from @linux//fs/erofs:erofs_fs.h unless stated otherwise.
+
+// Magic contains the 4 magic bytes starting at position 1024 identifying an EROFS filesystem.
+// Defined in @linux//include/uapi/linux/magic.h EROFS_SUPER_MAGIC_V1
+var Magic = [4]byte{0xe2, 0xe1, 0xf5, 0xe0}
+
+const blockSizeBits = 12
+const BlockSize = 1 << blockSizeBits
+
+// Defined in @linux//include/linux:fs_types.h starting at FT_UNKNOWN
+const (
+	fileTypeUnknown = iota
+	fileTypeRegularFile
+	fileTypeDirectory
+	fileTypeCharacterDevice
+	fileTypeBlockDevice
+	fileTypeFIFO
+	fileTypeSocket
+	fileTypeSymbolicLink
+)
+
+// Anonymous enum starting at EROFS_INODE_FLAT_PLAIN
+const (
+	inodeFlatPlain             = 0
+	inodeFlatCompressionLegacy = 1
+	inodeFlatInline            = 2
+	inodeFlatCompression       = 3
+)
+
+// struct erofs_dirent
+type directoryEntryRaw struct {
+	NodeNumber      uint64
+	NameStartOffset uint16
+	FileType        uint8
+	Reserved        uint8
+}
+
+// struct erofs_super_block
+type superblock struct {
+	Magic                [4]byte
+	Checksum             uint32
+	FeatureCompat        uint32
+	BlockSizeBits        uint8
+	Reserved0            uint8
+	RootNodeNumber       uint16
+	TotalInodes          uint64
+	BuildTimeSeconds     uint64
+	BuildTimeNanoseconds uint32
+	Blocks               uint32
+	MetaStartAddr        uint32
+	SharedXattrStartAddr uint32
+	UUID                 [16]byte
+	VolumeName           [16]byte
+	FeaturesIncompatible uint32
+	Reserved1            [44]byte
+}
+
+// struct erofs_inode_compact
+type inodeCompact struct {
+	Format         uint16
+	XattrCount     uint16
+	Mode           uint16
+	HardlinkCount  uint16
+	Size           uint32
+	Reserved0      uint32
+	Union          uint32
+	InodeNumCompat uint32
+	UID            uint16
+	GID            uint16
+	Reserved1      uint32
+}
+
+// Anonymous enum starting at Z_EROFS_VLE_CLUSTER_TYPE_PLAIN
+const (
+	vleClusterTypePlain = iota << 12
+	vleClusterTypeHead
+	vleClusterTypeNonhead
+)
diff --git a/metropolis/pkg/erofs/defs_test.go b/metropolis/pkg/erofs/defs_test.go
new file mode 100644
index 0000000..e32e155
--- /dev/null
+++ b/metropolis/pkg/erofs/defs_test.go
@@ -0,0 +1,52 @@
+// Copyright 2020 The Monogon Project Authors.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package erofs
+
+import (
+	"bytes"
+	"encoding/binary"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// These test that the specified structures serialize to the same number of bytes as the ones in the
+// EROFS kernel module.
+
+func TestSuperblockSize(t *testing.T) {
+	var buf bytes.Buffer
+	if err := binary.Write(&buf, binary.LittleEndian, &superblock{}); err != nil {
+		t.Fatalf("failed to write superblock: %v", err)
+	}
+	assert.Equal(t, 128, buf.Len())
+}
+
+func TestDirectoryEntrySize(t *testing.T) {
+	var buf bytes.Buffer
+	if err := binary.Write(&buf, binary.LittleEndian, &directoryEntryRaw{}); err != nil {
+		t.Fatalf("failed to write directory entry: %v", err)
+	}
+	assert.Equal(t, 12, buf.Len())
+}
+
+func TestInodeCompactSize(t *testing.T) {
+	var buf bytes.Buffer
+	if err := binary.Write(&buf, binary.LittleEndian, &inodeCompact{}); err != nil {
+		t.Fatalf("failed to write compact inode: %v", err)
+	}
+	assert.Equal(t, 32, buf.Len())
+}
diff --git a/metropolis/pkg/erofs/doc.md b/metropolis/pkg/erofs/doc.md
new file mode 100644
index 0000000..1d4c29e
--- /dev/null
+++ b/metropolis/pkg/erofs/doc.md
@@ -0,0 +1,68 @@
+# EROFS Primer
+EROFS is a relatively modern (Linux 5.3+) filesystem optimized for fast read-only use. Like squashfs and
+cramfs, EROFS filesystems have no write support in the kernel and can only be created by external tools.
+Both squashfs and cramfs are heavily optimized towards minimal size, to the detriment of performance.
+For modern server use both of them are unacceptably slow: they support limited concurrency, make inefficient
+use of the page cache and read in awkward block sizes. EROFS is designed to replace them on modern, fast
+hardware and generally exceeds Ext4 in read performance by leveraging the fact that it is read-only. It
+supports compression, but only in fixed disk-aligned chunks and only using LZ4, for maximum performance.
+
+Sadly the existing tooling for creating EROFS filesystems (erofs-utils' mkfs.erofs) can only pack up an
+existing directory tree. That does not work well in a build process: it would require root access to get file
+ownership and device nodes right, and it would need a complete copy of all relevant files, which sits on the
+critical path of the image build. Adapting mkfs.erofs to a spec-driven build process basically amounts to a
+rewrite, as its "library" part also reads directories directly and thus cannot be reused on its own.
+
+As reusing the old code proved to be more effort than it's worth, this library was born. Sadly upstream EROFS
+has basically no documentation describing how exactly the filesystem is constructed beyond a few trivial
+diagrams. This document holds most of the knowledge I pried out of `mkfs.erofs` and the kernel implementation
+and should help people understand the code.
+
+# Blocks
+An EROFS filesystem consists of individual blocks, each of them 4096 bytes (4K) long. Each block can either be
+a metadata or a data block (it is not possible to tell which just by looking at the block itself). The first
+block is always a metadata block: its first 1024 bytes are occupied by padding and the next 128 bytes by the
+superblock structure. The rest (2944 bytes) is available for normal metadata allocation. Blocks are numbered
+from zero.
+
+# Superblock
+As mentioned in the previous section on blocks, the superblock is not actually a block in EROFS, but a 128-byte
+structure located 1024 bytes into the first block. Most fields don't need to be set and don't matter. The
+`BuildTimeSeconds` and `BuildTimeNanoseconds` fields determine the ctime, atime and mtime of all inodes that use
+the compact structure. This library leaves them at zero, which results in all files having a timestamp of
+1970-01-01 (the Unix epoch). This is similar to what Bazel does for archives. The only fields which need to be
+filled out are the magic bytes, the block size in bits (only 12, for 1 << 12 = 4096, is supported) and the root
+`nid`, which points to the inode structure of the root inode (which needs to be a directory for the EROFS to be
+mountable). The root inode cannot be located arbitrarily far into the filesystem as this field is only 16 bits
+wide, whereas normal nid fields elsewhere are 64 bits. In practice the root directory immediately follows the
+superblock and thus has `nid` (1024 + 128) / 32 = 36.
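+
+Expressed as Go constants, the fixed layout above looks like this (a small illustration; the constant names are
+made up here, but the values match what `NewWriter` in this package writes):
+
+```go
+const (
+	superblockOffset = 1024 // fixed padding before the superblock
+	superblockSize   = 128  // size of the superblock structure
+	inodeAlignment   = 32   // inode structures (and therefore nids) are aligned to 32 bytes
+
+	rootNid = (superblockOffset + superblockSize) / inodeAlignment // = 36
+)
+```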
+
+# Inodes
+Inodes all share a common inode structure which exists in both a compact (32 bytes) and an extended (64 bytes)
+form. There is no fundamental difference; the extended form can store more metadata and has a bigger maximum file
+size (2^64 vs 2^32 bytes). All inode structures are aligned to 32 bytes. Through this alignment they are
+identifiable by a so-called `nid`, which is simply their offset in bytes from the start of the filesystem divided
+by their alignment (32). Certain inodes (inline, and compressed as a variant of inline) also store data
+immediately following the inode. The inode structure and its optional trailing data are allocated in metadata
+blocks. If there is no metadata block with enough free bytes to accommodate the inode, a new block is allocated
+and filled with that inode structure and its trailing data.
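+
+To make the nid arithmetic concrete, a hypothetical helper (not part of this package's API) would look like this:
+
+```go
+// nidForOffset converts the byte offset of an inode structure within the
+// filesystem image into its nid. Inode structures are always aligned to
+// 32 bytes, so the division is exact.
+func nidForOffset(offset int64) uint64 {
+	if offset%32 != 0 {
+		panic("inode structures must be 32-byte aligned")
+	}
+	return uint64(offset) / 32
+}
+```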
+
+EROFS has three on-disk inode layouts that are in use:
+
+## Plain Inodes
+These consist only of a single inode structure (compact, 32 bytes, or extended, 64 bytes) in the metadata area,
+and zero or more filled data blocks (empty inodes are always plain). All data blocks are consecutive and the
+`Union` value of the inode contains the block number of the first data block. The `Size` value contains the size
+of the content in bytes, not including the inode structure itself. The number of data blocks is determined by
+dividing the `Size` value by the block size and rounding up (see the following sections for why rounding up is
+necessary). The data blocks do not need to be adjacent to the metadata block the inode is in.
+
+## Inodes with inline data
+These are similar to plain inodes but also work for content sizes not evenly divisible by the block size. The
+leftover data at the end of the inode content that didn't fit into a whole data block is placed in the metadata
+area directly following the inode itself. How many bytes are appended to the inode is again determined from the
+inode's `Size` value, as the remainder when divided by the block size (4096 bytes); the number of full blocks is
+the integer division of the same two numbers. As with plain inodes, the full blocks don't need to be adjacent to
+the metadata block.
+
+An inline inode can thus occupy more than a whole metadata block (a 32-byte inode plus up to 4095 bytes of data
+that didn't fit into a full block). This special case is handled by detecting that the inode plus its inline data
+would exceed a full metadata block (4096 bytes) and converting it into a plain inode with an additional,
+zero-padded data block. This is done specifically when (inode_content_size % 4096) + inode_self_size > 4096, so
+whether this special case has occurred can also be determined from the inode's size value alone.
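+
+The layout decision can be written down as a small predicate (a sketch of the rule only, not this library's exact
+code path; see `uncompressedInodeWriter.Close` for the real implementation):
+
+```go
+// canInline reports whether an inode with the given content size can keep its
+// tail data inline in the metadata area. inodeSelfSize is 32 for compact and
+// 64 for extended inodes. If it returns false, the tail is spilled into an
+// additional zero-padded data block and the inode uses the plain layout.
+func canInline(contentSize, inodeSelfSize int64) bool {
+	tail := contentSize % 4096
+	return tail != 0 && tail+inodeSelfSize <= 4096
+}
+```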
+
+## Compressed inodes
+EROFS supports what it calls Variable-Length Extents (VLE). These are normal plain inodes or inodes with inline
+data, but instead of the data itself they contain a metadata structure beginning with a `MapHeader`, which is
+mostly there for legacy reasons and always contains the same data. It is followed by compressed VLE meta blocks,
+which contain either 2 or 16 packed 14-bit integers plus an on-disk block number. For alignment reasons the first
+6 VLE meta integers are always packed into the 2-integer structures. All following complete groups of 16 VLE meta
+integers are packed into 32-byte structures together with their starting block number. Anything left over is once
+again packed into 2-integer structures. Each integer in this sequence of 14-bit integers represents 4K of
+uncompressed data, so a file with an uncompressed size of 4 MiB needs 1024 of these integers, independent of how
+well it compresses.
+
+Note that VLE meta blocks are treated as content of plain or inline inodes. So if they exceed the maximum inline inode size there will be blocks allocated just for storing VLE meta blocks.
+
+These VLE meta integers are divided into 12 lower bits and 2 upper bits. The upper bits determine what the lower
+12 bits represent and also how the corresponding 4K block of uncompressed data is stored. There are three types:
+PLAIN, HEAD and NONHEAD. PLAIN means no compression; the block is stored as-is on disk. HEAD means this block is
+the start of a compressed cluster; its 12 lower bits contain the offset of the decompressed data relative to the
+uncompressed 4K block boundary. NONHEAD means this block decompresses from the same on-disk block as the last
+HEAD block; its lower 12 bits contain the number of blocks until the next HEAD or PLAIN block, unless it sits at
+the end of a VLE meta block (2 or 16 integers), in which case they contain the distance from the last HEAD or
+PLAIN block.
+
+Only PLAIN and HEAD blocks have actual on-disk blocks, holding uncompressed and compressed data respectively.
+NONHEAD blocks only exist to represent data that is expanded by decompression. Thus the on-disk block number of a
+PLAIN or HEAD block can be determined by looking at the on-disk block number of the VLE meta block and
+incrementing by one for each PLAIN or HEAD block in it. So all data blocks referenced inside one VLE meta block
+need to be consecutive (but not adjacent to the location of the VLE meta blocks themselves).
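+
+For reference, splitting one of these 14-bit integers back into its cluster type and 12-bit payload is plain bit
+masking (illustrative only; the `vleClusterType*` constants in `defs.go` already carry the type in bits 12-13):
+
+```go
+// splitVLEInteger decodes a VLE meta integer into its cluster type (one of
+// vleClusterTypePlain, vleClusterTypeHead or vleClusterTypeNonhead) and its
+// 12-bit payload (cluster offset for HEAD/PLAIN, block distance for NONHEAD).
+func splitVLEInteger(v uint16) (clusterType uint16, payload uint16) {
+	return v &^ 0xfff, v & 0xfff
+}
+```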
+
+# Unix file types and inode layout
+
+## Directories
+Directories are either plain or inline inodes. Their content consists of 12-byte dirent structures
+(directoryEntryRaw). Each dirent contains a `nid` (see Inodes) pointing to the inode structure of that child of
+the directory, a name offset and a file type. The file type is redundant (it is also stored in the child's inode)
+but needs to be set. Directly following the dirent structures are the names of all children. The names are
+neither terminated nor aligned. The name offset stored in a dirent is relative to the start of the inode content
+and marks the first byte of that child's name. The end of a name is given by the name offset of the next dirent,
+or by the total size of the inode for the last child.
+
+Directories always contain the `.` and `..` entries, which point to the directory itself and to its parent inode
+respectively, with the exception that the root directory's parent is defined to be itself. The individual dirents
+are always sorted by their name interpreted as bytes to allow for binary searching.
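+
+As a worked example, the sketch below reproduces the name-offset bookkeeping for a directory whose entries sort to
+`.`, `..` and `bin`; it mirrors what `Directory.writeTo` in this package computes:
+
+```go
+package main
+
+import "fmt"
+
+func main() {
+	// Dirent structures come first, 12 bytes each; the names follow immediately after.
+	children := []string{".", "..", "bin"} // already sorted byte-wise
+	nameStart := 12 * len(children)        // 36
+	for _, name := range children {
+		fmt.Printf("%q starts at content offset %d\n", name, nameStart)
+		nameStart += len(name)
+	}
+	fmt.Println("total directory content size:", nameStart) // 42 bytes
+}
+```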
+
+## Symbolic links
+Symbolic links are inline inodes. They have their literal target path as content.
+
+## Device nodes
+Device nodes are always plain inodes. They carry no content; instead, the inode's `Union` field holds a `dev_t`
+integer encoding the major and minor numbers. Whether a device inode is a block or character device is determined
+by the high bits of the `Mode` value, as in standard Unix.
+
+
+## Regular files
+Regular files can use any of the three layouts: plain, inline or compressed. The inode content is the content of
+the file.
+
+## Others
+FIFOs and sockets are plain inodes with no content and no special fields. They are seldom used in an EROFS anyway.
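+
+To tie the on-disk format back to this package's API, here is a minimal usage sketch (error handling is mostly
+elided, and the file name and contents are invented for the example):
+
+```go
+package main
+
+import (
+	"io"
+	"os"
+	"strings"
+
+	"source.monogon.dev/metropolis/pkg/erofs"
+)
+
+func main() {
+	img, _ := os.Create("rootfs.img")
+	defer img.Close()
+
+	w, _ := erofs.NewWriter(img)
+	// The first inode written must be the root directory ".".
+	_ = w.Create(".", &erofs.Directory{
+		Base:     erofs.Base{UID: 0, GID: 0, Permissions: 0755},
+		Children: []string{"hello.txt"},
+	})
+	f := w.CreateFile("hello.txt", &erofs.FileMeta{
+		Base: erofs.Base{UID: 0, GID: 0, Permissions: 0644},
+	})
+	_, _ = io.Copy(f, strings.NewReader("hello world\n"))
+	_ = f.Close()
+	// Close links all dirents to their inodes and reports dangling references.
+	if err := w.Close(); err != nil {
+		panic(err)
+	}
+}
+```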
diff --git a/metropolis/pkg/erofs/erofs.go b/metropolis/pkg/erofs/erofs.go
new file mode 100644
index 0000000..af6ad1c
--- /dev/null
+++ b/metropolis/pkg/erofs/erofs.go
@@ -0,0 +1,264 @@
+// Copyright 2020 The Monogon Project Authors.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package erofs
+
+import (
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+	"path"
+
+	"golang.org/x/sys/unix"
+)
+
+// Writer writes a new EROFS filesystem.
+type Writer struct {
+	w io.WriteSeeker
+	// fixDirectoryEntry records, for each referenced path, the locations it is referenced from. Since
+	// self-references are required anyway (for the "." and ".." entries), we let the user write files in any
+	// order and just point the directory entries to the right target nid and file type on Close().
+	fixDirectoryEntry map[string][]direntFixupLocation
+	pathInodeMeta     map[string]*uncompressedInodeMeta
+	// legacyInodeIndex stores the next legacy (32-bit) inode number to be allocated. 64-bit inode numbers are
+	// calculated automatically by EROFS on mount.
+	legacyInodeIndex    uint32
+	blockAllocatorIndex uint32
+	metadataBlocksFree  metadataBlocksMeta
+}
+
+// NewWriter creates a new EROFS filesystem writer. The given WriteSeeker needs to be at the start.
+func NewWriter(w io.WriteSeeker) (*Writer, error) {
+	erofsWriter := &Writer{
+		w:                 w,
+		fixDirectoryEntry: make(map[string][]direntFixupLocation),
+		pathInodeMeta:     make(map[string]*uncompressedInodeMeta),
+	}
+	_, err := erofsWriter.allocateMetadata(1024+binary.Size(&superblock{}), 0)
+	if err != nil {
+		return nil, fmt.Errorf("cannot allocate first metadata block: %w", err)
+	}
+	if _, err := erofsWriter.w.Write(make([]byte, 1024)); err != nil { // Padding
+		return nil, fmt.Errorf("failed to write initial padding: %w", err)
+	}
+	if err := binary.Write(erofsWriter.w, binary.LittleEndian, &superblock{
+		Magic:          Magic,
+		BlockSizeBits:  blockSizeBits,
+		RootNodeNumber: 36, // (1024 (padding) + 128 (superblock)) / 32; not eligible for fixup due to different integer size
+	}); err != nil {
+		return nil, fmt.Errorf("failed to write superblock: %w", err)
+	}
+	return erofsWriter, nil
+}
+
+// allocateMetadata allocates metadata space of size bytes with a given alignment and seeks to the first byte of the
+// newly-allocated metadata space. It also returns the position of that first byte.
+func (w *Writer) allocateMetadata(size int, alignment uint16) (int64, error) {
+	if size > BlockSize {
+		panic("cannot allocate a metadata object bigger than BlockSize bytes")
+	}
+	sizeU16 := uint16(size)
+	pos, ok := w.metadataBlocksFree.findBlock(sizeU16, 32)
+	if !ok {
+		blockNumber, err := w.allocateBlocks(1)
+		if err != nil {
+			return 0, fmt.Errorf("failed to allocate additional metadata space: %w", err)
+		}
+		w.metadataBlocksFree = append(w.metadataBlocksFree, metadataBlockMeta{blockNumber: blockNumber, freeBytes: BlockSize - sizeU16})
+		if _, err := w.w.Write(make([]byte, BlockSize)); err != nil {
+			return 0, fmt.Errorf("failed to write metadata: %w", err)
+		}
+		pos = int64(blockNumber) * BlockSize // Always aligned to BlockSize, bigger alignments are unsupported anyway
+	}
+	if _, err := w.w.Seek(pos, io.SeekStart); err != nil {
+		return 0, fmt.Errorf("cannot seek to existing metadata nid, likely misaligned meta write")
+	}
+	return pos, nil
+}
+
+// allocateBlocks allocates n new BlockSize-sized blocks and seeks to the beginning of the first newly-allocated block.
+// It also returns the first newly-allocated block number. The caller is expected to write these blocks completely
+// before calling allocateBlocks again.
+func (w *Writer) allocateBlocks(n uint32) (uint32, error) {
+	if _, err := w.w.Seek(int64(w.blockAllocatorIndex)*BlockSize, io.SeekStart); err != nil {
+		return 0, fmt.Errorf("cannot seek to end of last block, check write alignment: %w", err)
+	}
+	firstBlock := w.blockAllocatorIndex
+	w.blockAllocatorIndex += n
+	return firstBlock, nil
+}
+
+func (w *Writer) create(pathname string, inode Inode) *uncompressedInodeWriter {
+	i := &uncompressedInodeWriter{
+		writer:            w,
+		inode:             *inode.inode(),
+		legacyInodeNumber: w.legacyInodeIndex,
+		pathname:          path.Clean(pathname),
+	}
+	w.legacyInodeIndex++
+	return i
+}
+
+// CreateFile adds a new file to the EROFS. It returns a WriteCloser to which the file contents should be written and
+// which then needs to be closed. The writer obtained from the last CreateFile() call needs to be closed before a new
+// one is opened. The given pathname needs to be referenced by a directory created using Create(), otherwise it will
+// not be accessible.
+func (w *Writer) CreateFile(pathname string, meta *FileMeta) io.WriteCloser {
+	return w.create(pathname, meta)
+}
+
+// Create adds a new non-file inode to the EROFS. This includes directories, device nodes, symlinks and FIFOs.
+// The first call to Create() needs to be with pathname "." and a directory inode.
+// The given pathname needs to be referenced by a directory, otherwise it will not be accessible (with the exception of
+// the directory ".").
+func (w *Writer) Create(pathname string, inode Inode) error {
+	iw := w.create(pathname, inode)
+	switch i := inode.(type) {
+	case *Directory:
+		if err := i.writeTo(iw); err != nil {
+			return fmt.Errorf("failed to write directory contents: %w", err)
+		}
+	case *SymbolicLink:
+		if err := i.writeTo(iw); err != nil {
+			return fmt.Errorf("failed to write symbolic link contents: %w", err)
+		}
+	}
+	return iw.Close()
+}
+
+// Close finishes writing an EROFS filesystem. Errors from this function need to be handled, as they indicate whether
+// the written filesystem is consistent (i.e. that there are no directory entries pointing to nonexistent inodes).
+func (w *Writer) Close() error {
+	for targetPath, entries := range w.fixDirectoryEntry {
+		for _, entry := range entries {
+			targetMeta, ok := w.pathInodeMeta[targetPath]
+			if !ok {
+				return fmt.Errorf("failed to link filesystem tree: dangling reference to %v", targetPath)
+			}
+			if err := direntFixup(w.pathInodeMeta[entry.path], int64(entry.entryIndex), targetMeta); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+// uncompressedInodeMeta tracks enough metadata about a written inode to be able to point dirents to it and to provide
+// a WriteSeeker into the inode itself.
+type uncompressedInodeMeta struct {
+	nid   uint64
+	ftype uint8
+
+	// Physical placement metadata
+	blockStart   int64
+	blockLength  int64
+	inlineStart  int64
+	inlineLength int64
+
+	writer        *Writer
+	currentOffset int64
+}
+
+func (a *uncompressedInodeMeta) Seek(offset int64, whence int) (int64, error) {
+	switch whence {
+	case io.SeekCurrent:
+		break
+	case io.SeekStart:
+		a.currentOffset = 0
+	case io.SeekEnd:
+		a.currentOffset = a.blockLength + a.inlineLength
+	}
+	a.currentOffset += offset
+	return a.currentOffset, nil
+}
+
+func (a *uncompressedInodeMeta) Write(p []byte) (int, error) {
+	if a.currentOffset < a.blockLength {
+		// TODO(lorenz): Handle the special case where a directory inode is spread across multiple
+		// blocks (depending on other factors this occurs around ~200 direct children).
+		return 0, errors.New("relocating dirents in multi-block directory inodes is unimplemented")
+	}
+	if _, err := a.writer.w.Seek(a.inlineStart+a.currentOffset, io.SeekStart); err != nil {
+		return 0, err
+	}
+	a.currentOffset += int64(len(p))
+	return a.writer.w.Write(p)
+}
+
+type direntFixupLocation struct {
+	path       string
+	entryIndex uint16
+}
+
+// direntFixup overrides nid and file type from the path the dirent is pointing to. The given iw is expected to be at
+// the start of the dirent inode to be fixed up.
+func direntFixup(iw io.WriteSeeker, entryIndex int64, meta *uncompressedInodeMeta) error {
+	if _, err := iw.Seek(entryIndex*12, io.SeekStart); err != nil {
+		return fmt.Errorf("failed to seek to dirent: %w", err)
+	}
+	if err := binary.Write(iw, binary.LittleEndian, meta.nid); err != nil {
+		return fmt.Errorf("failed to write nid: %w", err)
+	}
+	if _, err := iw.Seek(2, io.SeekCurrent); err != nil { // Skip NameStartOffset
+		return fmt.Errorf("failed to seek to dirent: %w", err)
+	}
+	if err := binary.Write(iw, binary.LittleEndian, meta.ftype); err != nil {
+		return fmt.Errorf("failed to write ftype: %w", err)
+	}
+	return nil
+}
+
+type metadataBlockMeta struct {
+	blockNumber uint32
+	freeBytes   uint16
+}
+
+// metadataBlocksMeta contains metadata about all metadata blocks, most importantly the number of free
+// bytes in each block. It is a slice rather than a map to keep output reproducible (map iteration order is random).
+type metadataBlocksMeta []metadataBlockMeta
+
+// findBlock returns the absolute position where `size` bytes with the specified alignment can still fit.
+// If there is not enough space in any metadata block it returns false as the second return value.
+func (m metadataBlocksMeta) findBlock(size uint16, alignment uint16) (int64, bool) {
+	for i, blockMeta := range m {
+		freeBytesAligned := blockMeta.freeBytes - (blockMeta.freeBytes % alignment)
+		if freeBytesAligned > size {
+			m[i] = metadataBlockMeta{
+				blockNumber: blockMeta.blockNumber,
+				freeBytes:   freeBytesAligned - size,
+			}
+			pos := int64(blockMeta.blockNumber+1)*BlockSize - int64(freeBytesAligned)
+			return pos, true
+		}
+	}
+	return 0, false
+}
+
+var unixModeToFTMap = map[uint16]uint8{
+	unix.S_IFREG:  fileTypeRegularFile,
+	unix.S_IFDIR:  fileTypeDirectory,
+	unix.S_IFCHR:  fileTypeCharacterDevice,
+	unix.S_IFBLK:  fileTypeBlockDevice,
+	unix.S_IFIFO:  fileTypeFIFO,
+	unix.S_IFSOCK: fileTypeSocket,
+	unix.S_IFLNK:  fileTypeSymbolicLink,
+}
+
+// unixModeToFT maps a Unix file type to an EROFS file type.
+func unixModeToFT(mode uint16) uint8 {
+	return unixModeToFTMap[mode&unix.S_IFMT]
+}
diff --git a/metropolis/pkg/erofs/erofs_test.go b/metropolis/pkg/erofs/erofs_test.go
new file mode 100644
index 0000000..d02c2dd
--- /dev/null
+++ b/metropolis/pkg/erofs/erofs_test.go
@@ -0,0 +1,250 @@
+// Copyright 2020 The Monogon Project Authors.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package erofs
+
+import (
+	"io"
+	"io/ioutil"
+	"log"
+	"math/rand"
+	"os"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"golang.org/x/sys/unix"
+)
+
+func TestKernelInterop(t *testing.T) {
+	if os.Getenv("IN_KTEST") != "true" {
+		t.Skip("Not in ktest")
+	}
+
+	type testCase struct {
+		name     string
+		setup    func(w *Writer) error
+		validate func(t *testing.T) error
+	}
+
+	tests := []testCase{
+		{
+			name: "SimpleFolder",
+			setup: func(w *Writer) error {
+				return w.Create(".", &Directory{
+					Base:     Base{GID: 123, UID: 124, Permissions: 0753},
+					Children: []string{},
+				})
+			},
+			validate: func(t *testing.T) error {
+				var stat unix.Stat_t
+				if err := unix.Stat("/test", &stat); err != nil {
+					t.Errorf("failed to stat output: %v", err)
+				}
+				require.EqualValues(t, 124, stat.Uid, "wrong Uid")
+				require.EqualValues(t, 123, stat.Gid, "wrong Gid")
+				require.EqualValues(t, 0753, stat.Mode&^unix.S_IFMT, "wrong mode")
+				return nil
+			},
+		},
+		{
+			name: "FolderHierarchy",
+			setup: func(w *Writer) error {
+				if err := w.Create(".", &Directory{
+					Base:     Base{GID: 123, UID: 124, Permissions: 0753},
+					Children: []string{"subdir"},
+				}); err != nil {
+					return err
+				}
+				if err := w.Create("subdir", &Directory{
+					Base:     Base{GID: 123, UID: 124, Permissions: 0753},
+					Children: []string{},
+				}); err != nil {
+					return err
+				}
+				return nil
+			},
+			validate: func(t *testing.T) error {
+				dirInfo, err := ioutil.ReadDir("/test")
+				if err != nil {
+					t.Fatalf("Failed to read top-level directory: %v", err)
+				}
+				require.Len(t, dirInfo, 1, "more subdirs than expected")
+				require.Equal(t, "subdir", dirInfo[0].Name(), "unexpected subdir")
+				require.True(t, dirInfo[0].IsDir(), "subdir not a directory")
+				subdirInfo, err := ioutil.ReadDir("/test/subdir")
+				assert.NoError(t, err, "cannot read empty subdir")
+				require.Len(t, subdirInfo, 0, "unexpected subdirs in empty directory")
+				return nil
+			},
+		},
+		{
+			name: "SmallFile",
+			setup: func(w *Writer) error {
+				if err := w.Create(".", &Directory{
+					Base:     Base{GID: 123, UID: 123, Permissions: 0755},
+					Children: []string{"test.bin"},
+				}); err != nil {
+					return err
+				}
+				writer := w.CreateFile("test.bin", &FileMeta{
+					Base: Base{GID: 123, UID: 124, Permissions: 0644},
+				})
+				r := rand.New(rand.NewSource(0)) // Random but deterministic data
+				if _, err := io.CopyN(writer, r, 128); err != nil {
+					return err
+				}
+				if err := writer.Close(); err != nil {
+					return err
+				}
+				return nil
+			},
+			validate: func(t *testing.T) error {
+				var stat unix.Stat_t
+				err := unix.Stat("/test/test.bin", &stat)
+				assert.NoError(t, err, "failed to stat file")
+				require.EqualValues(t, 124, stat.Uid, "wrong Uid")
+				require.EqualValues(t, 123, stat.Gid, "wrong Gid")
+				require.EqualValues(t, 0644, stat.Mode&^unix.S_IFMT, "wrong mode")
+				file, err := os.Open("/test/test.bin")
+				assert.NoError(t, err, "failed to open test file")
+				defer file.Close()
+				r := io.LimitReader(rand.New(rand.NewSource(0)), 128) // Random but deterministic data
+				expected, _ := ioutil.ReadAll(r)
+				actual, err := ioutil.ReadAll(file)
+				assert.NoError(t, err, "failed to read test file")
+				assert.Equal(t, expected, actual, "content not identical")
+				return nil
+			},
+		},
+		{
+			name: "LargeFile",
+			setup: func(w *Writer) error {
+				if err := w.Create(".", &Directory{
+					Base:     Base{GID: 123, UID: 123, Permissions: 0755},
+					Children: []string{"test.bin"},
+				}); err != nil {
+					return err
+				}
+				writer := w.CreateFile("test.bin", &FileMeta{
+					Base: Base{GID: 123, UID: 124, Permissions: 0644},
+				})
+				r := rand.New(rand.NewSource(1)) // Random but deterministic data
+				if _, err := io.CopyN(writer, r, 6500); err != nil {
+					return err
+				}
+				if err := writer.Close(); err != nil {
+					return err
+				}
+				return nil
+			},
+			validate: func(t *testing.T) error {
+				var stat unix.Stat_t
+				rawContents, err := ioutil.ReadFile("/dev/ram0")
+				assert.NoError(t, err, "failed to read test data")
+				log.Printf("%x", rawContents)
+				err = unix.Stat("/test/test.bin", &stat)
+				assert.NoError(t, err, "failed to stat file")
+				require.EqualValues(t, 124, stat.Uid, "wrong Uid")
+				require.EqualValues(t, 123, stat.Gid, "wrong Gid")
+				require.EqualValues(t, 0644, stat.Mode&^unix.S_IFMT, "wrong mode")
+				require.EqualValues(t, 6500, stat.Size, "wrong size")
+				file, err := os.Open("/test/test.bin")
+				assert.NoError(t, err, "failed to open test file")
+				defer file.Close()
+				r := io.LimitReader(rand.New(rand.NewSource(1)), 6500) // Random but deterministic data
+				expected, _ := ioutil.ReadAll(r)
+				actual, err := ioutil.ReadAll(file)
+				assert.NoError(t, err, "failed to read test file")
+				assert.Equal(t, expected, actual, "content not identical")
+				return nil
+			},
+		},
+		{
+			name: "MultipleMetaBlocks",
+			setup: func(w *Writer) error {
+				testFileNames := []string{"test1.bin", "test2.bin", "test3.bin"}
+				if err := w.Create(".", &Directory{
+					Base:     Base{GID: 123, UID: 123, Permissions: 0755},
+					Children: testFileNames,
+				}); err != nil {
+					return err
+				}
+				for i, fileName := range testFileNames {
+					writer := w.CreateFile(fileName, &FileMeta{
+						Base: Base{GID: 123, UID: 124, Permissions: 0644},
+					})
+					r := rand.New(rand.NewSource(int64(i))) // Random but deterministic data
+					if _, err := io.CopyN(writer, r, 2053); err != nil {
+						return err
+					}
+					if err := writer.Close(); err != nil {
+						return err
+					}
+				}
+				return nil
+			},
+			validate: func(t *testing.T) error {
+				testFileNames := []string{"test1.bin", "test2.bin", "test3.bin"}
+				for i, fileName := range testFileNames {
+					file, err := os.Open("/test/" + fileName)
+					assert.NoError(t, err, "failed to open test file")
+					defer file.Close()
+					r := io.LimitReader(rand.New(rand.NewSource(int64(i))), 2053) // Random but deterministic data
+					expected, _ := ioutil.ReadAll(r)
+					actual, err := ioutil.ReadAll(file)
+					assert.NoError(t, err, "failed to read test file")
+					require.Equal(t, expected, actual, "content not identical")
+				}
+				return nil
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			file, err := os.OpenFile("/dev/ram0", os.O_WRONLY, 0644)
+			if err != nil {
+				t.Fatalf("failed to create test image: %v", err)
+			}
+			defer file.Close()
+			w, err := NewWriter(file)
+			if err != nil {
+				t.Fatalf("failed to initialize EROFS writer: %v", err)
+			}
+			if err := test.setup(w); err != nil {
+				t.Fatalf("setup failed: %v", err)
+			}
+			if err := w.Close(); err != nil {
+				t.Errorf("failed close: %v", err)
+			}
+			_ = file.Close()
+			if err := os.MkdirAll("/test", 0755); err != nil {
+				t.Error(err)
+			}
+			if err := unix.Mount("/dev/ram0", "/test", "erofs", unix.MS_NOEXEC|unix.MS_NODEV, ""); err != nil {
+				t.Fatal(err)
+			}
+			if err := test.validate(t); err != nil {
+				t.Errorf("validation failure: %v", err)
+			}
+			if err := unix.Unmount("/test", 0); err != nil {
+				t.Fatalf("failed to unmount: %v", err)
+			}
+		})
+
+	}
+}
diff --git a/metropolis/pkg/erofs/inode_types.go b/metropolis/pkg/erofs/inode_types.go
new file mode 100644
index 0000000..afef90e
--- /dev/null
+++ b/metropolis/pkg/erofs/inode_types.go
@@ -0,0 +1,163 @@
+// Copyright 2020 The Monogon Project Authors.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package erofs
+
+import (
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+	"math"
+	"path"
+	"sort"
+
+	"golang.org/x/sys/unix"
+)
+
+// Inode specifies an interface that all inodes that can be written to an EROFS filesystem implement.
+type Inode interface {
+	inode() *inodeCompact
+}
+
+// Base contains generic inode metadata independent from the specific inode type.
+type Base struct {
+	Permissions uint16
+	UID, GID    uint16
+}
+
+func (b *Base) baseInode(fileType uint16) *inodeCompact {
+	return &inodeCompact{
+		UID:  b.UID,
+		GID:  b.GID,
+		Mode: b.Permissions | fileType,
+	}
+}
+
+// Directory represents a directory inode. The Children property contains the directory's direct children (just the
+// names, not the full paths).
+type Directory struct {
+	Base
+	Children []string
+}
+
+func (d *Directory) inode() *inodeCompact {
+	return d.baseInode(unix.S_IFDIR)
+}
+
+func (d *Directory) writeTo(w *uncompressedInodeWriter) error {
+	d.Children = append(d.Children, ".", "..")
+	sort.Strings(d.Children)
+	var nameStartOffset = binary.Size(directoryEntryRaw{}) * len(d.Children)
+	var rawEntries []directoryEntryRaw
+	for _, ent := range d.Children {
+		if nameStartOffset > math.MaxUint16 {
+			return errors.New("directory name offset out of range, too many or too big entries")
+		}
+		var entData directoryEntryRaw
+		entData.NameStartOffset = uint16(nameStartOffset)
+		rawEntries = append(rawEntries, entData)
+		nameStartOffset += len(ent)
+	}
+	for i, ent := range rawEntries {
+		targetPath := path.Join(w.pathname, d.Children[i])
+		if targetPath == ".." {
+			targetPath = "."
+		}
+		w.writer.fixDirectoryEntry[targetPath] = append(w.writer.fixDirectoryEntry[targetPath], direntFixupLocation{
+			path:       w.pathname,
+			entryIndex: uint16(i),
+		})
+		if err := binary.Write(w, binary.LittleEndian, ent); err != nil {
+			return fmt.Errorf("failed to write dirent: %w", err)
+		}
+	}
+	for _, childName := range d.Children {
+		if _, err := w.Write([]byte(childName)); err != nil {
+			return fmt.Errorf("failed to write dirent name: %w", err)
+		}
+	}
+	return nil
+}
+
+// CharacterDevice represents a Unix character device inode with major and minor numbers.
+type CharacterDevice struct {
+	Base
+	Major uint32
+	Minor uint32
+}
+
+func (c *CharacterDevice) inode() *inodeCompact {
+	i := c.baseInode(unix.S_IFCHR)
+	i.Union = uint32(unix.Mkdev(c.Major, c.Minor))
+	return i
+}
+
+// BlockDevice represents a Unix block device inode with major and minor numbers.
+type BlockDevice struct {
+	Base
+	Major uint32
+	Minor uint32
+}
+
+func (b *BlockDevice) inode() *inodeCompact {
+	i := b.baseInode(unix.S_IFBLK)
+	i.Union = uint32(unix.Mkdev(b.Major, b.Minor))
+	return i
+}
+
+// FIFO represents a Unix FIFO inode.
+type FIFO struct {
+	Base
+}
+
+func (f *FIFO) inode() *inodeCompact {
+	return f.baseInode(unix.S_IFIFO)
+}
+
+// Socket represents a Unix socket inode.
+type Socket struct {
+	Base
+}
+
+func (s *Socket) inode() *inodeCompact {
+	return s.baseInode(unix.S_IFSOCK)
+}
+
+// SymbolicLink represents a symbolic link/symlink to another inode. Target is the literal string target of the symlink.
+type SymbolicLink struct {
+	Base
+	Target string
+}
+
+func (s *SymbolicLink) inode() *inodeCompact {
+	return s.baseInode(unix.S_IFLNK)
+}
+
+func (s *SymbolicLink) writeTo(w io.Writer) error {
+	_, err := w.Write([]byte(s.Target))
+	return err
+}
+
+// FileMeta represents the metadata of a regular file. The contents are written to the WriteCloser returned by the
+// CreateFile function on the EROFS Writer and are not included in this structure itself.
+type FileMeta struct {
+	Base
+}
+
+func (f *FileMeta) inode() *inodeCompact {
+	return f.baseInode(unix.S_IFREG)
+}
diff --git a/metropolis/pkg/erofs/uncompressed_inode_writer.go b/metropolis/pkg/erofs/uncompressed_inode_writer.go
new file mode 100644
index 0000000..df89fec
--- /dev/null
+++ b/metropolis/pkg/erofs/uncompressed_inode_writer.go
@@ -0,0 +1,125 @@
+// Copyright 2020 The Monogon Project Authors.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package erofs
+
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"math"
+)
+
+// uncompressedInodeWriter exposes an io.Writer-style interface for a single uncompressed inode. It splits the Write
+// calls into blocks and writes both the data blocks and the inode metadata. Close() must be called to ensure
+// everything is properly written out before writing another inode.
+type uncompressedInodeWriter struct {
+	buf               bytes.Buffer
+	writer            *Writer
+	inode             inodeCompact
+	baseBlock         uint32 // baseBlock == 0 implies this inode didn't allocate a block (yet).
+	writtenBytes      int
+	legacyInodeNumber uint32
+	pathname          string
+}
+
+func (i *uncompressedInodeWriter) allocateBlock() error {
+	bb, err := i.writer.allocateBlocks(1)
+	if err != nil {
+		return err
+	}
+	if i.baseBlock == 0 {
+		i.baseBlock = bb
+	}
+	return nil
+}
+
+func (i *uncompressedInodeWriter) flush(n int) error {
+	if err := i.allocateBlock(); err != nil {
+		return err
+	}
+	slice := i.buf.Next(n)
+	if _, err := i.writer.w.Write(slice); err != nil {
+		return err
+	}
+	// Always pad to BlockSize.
+	_, err := i.writer.w.Write(make([]byte, BlockSize-len(slice)))
+	return err
+}
+
+func (i *uncompressedInodeWriter) Write(b []byte) (int, error) {
+	i.writtenBytes += len(b)
+	if _, err := i.buf.Write(b); err != nil {
+		return 0, err
+	}
+	for i.buf.Len() >= BlockSize {
+		if err := i.flush(BlockSize); err != nil {
+			return 0, err
+		}
+	}
+	return len(b), nil
+}
+
+func (i *uncompressedInodeWriter) Close() error {
+	if i.buf.Len() > BlockSize {
+		panic("programming error")
+	}
+	inodeSize := binary.Size(i.inode)
+	if i.buf.Len()+inodeSize > BlockSize {
+		// Can't fit last part of data inline, write it in its own block.
+		if err := i.flush(i.buf.Len()); err != nil {
+			return err
+		}
+	}
+	if i.buf.Len() == 0 {
+		i.inode.Format = inodeFlatPlain << 1
+	} else {
+		// Colocate last part of data with inode.
+		i.inode.Format = inodeFlatInline << 1
+	}
+	if i.writtenBytes > math.MaxUint32 {
+		return errors.New("inodes bigger than 2^32 need the extended inode format which is unsupported by this library")
+	}
+	i.inode.Size = uint32(i.writtenBytes)
+	if i.baseBlock != 0 {
+		i.inode.Union = i.baseBlock
+	}
+	i.inode.HardlinkCount = 1
+	i.inode.InodeNumCompat = i.legacyInodeNumber
+	basePos, err := i.writer.allocateMetadata(inodeSize+i.buf.Len(), 32)
+	if err != nil {
+		return fmt.Errorf("failed to allocate metadata: %w", err)
+	}
+	i.writer.pathInodeMeta[i.pathname] = &uncompressedInodeMeta{
+		nid:          uint64(basePos) / 32,
+		ftype:        unixModeToFT(i.inode.Mode),
+		blockStart:   int64(i.baseBlock),
+		blockLength:  (int64(i.writtenBytes) / BlockSize) * BlockSize,
+		inlineStart:  basePos + 32,
+		inlineLength: int64(i.buf.Len()),
+		writer:       i.writer,
+	}
+	if err := binary.Write(i.writer.w, binary.LittleEndian, &i.inode); err != nil {
+		return err
+	}
+	if i.inode.Format&(inodeFlatInline<<1) != 0 {
+		// Data colocated in inode, if any.
+		_, err := i.writer.w.Write(i.buf.Bytes())
+		return err
+	}
+	return nil
+}