Add Loop Device package

This enables loop device support in our Linux kernel and adds a Go package for working with loop devices.
As a drive-by change, it also adds a pre-mounted tmpfs at /tmp to ktest, as that is quite useful in a lot of situations.

Test Plan: Comes with ktests.

X-Origin-Diff: phab/D745
GitOrigin-RevId: fa06bcdddc033efb136f56da3b4a91159273bf88
diff --git a/metropolis/pkg/loop/BUILD.bazel b/metropolis/pkg/loop/BUILD.bazel
new file mode 100644
index 0000000..1bf0722
--- /dev/null
+++ b/metropolis/pkg/loop/BUILD.bazel
@@ -0,0 +1,25 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//metropolis/test/ktest:ktest.bzl", "ktest")
+
+# Library target for the loop package (loop.go); its only declared dependency is golang.org/x/sys/unix.
+go_library(
+    name = "go_default_library",
+    srcs = ["loop.go"],
+    importpath = "source.monogon.dev/metropolis/pkg/loop",
+    visibility = ["//visibility:public"],
+    deps = ["@org_golang_x_sys//unix:go_default_library"],
+)
+
+# Test target for loop_test.go. It embeds the library so the tests are built as part of the same Go package.
+go_test(
+    name = "go_default_test",
+    srcs = ["loop_test.go"],
+    embed = [":go_default_library"],
+    deps = [
+        "@com_github_stretchr_testify//assert:go_default_library",
+        "@com_github_stretchr_testify//require:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+# Hands the Go test target above to the ktest macro loaded from //metropolis/test/ktest:ktest.bzl.
+ktest(
+    tester = ":go_default_test",
+)
diff --git a/metropolis/pkg/loop/loop.go b/metropolis/pkg/loop/loop.go
new file mode 100644
index 0000000..64b533b
--- /dev/null
+++ b/metropolis/pkg/loop/loop.go
@@ -0,0 +1,255 @@
+// Copyright 2020 The Monogon Project Authors.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package loop implements an interface to configure Linux loop devices.
+//
+// This package requires Linux 5.8 or higher because it uses the newer LOOP_CONFIGURE ioctl, which is better-behaved
+// and twice as fast as the old approach. It doesn't support all of the cryptloop functionality as it has been
+// superseded by dm-crypt and has known vulnerabilities. It also doesn't support on-the-fly reconfiguration of loop
+// devices as this is rather unusual, works only under very specific circumstances and would make the API less clean.
+package loop
+
+import (
+	"errors"
+	"fmt"
+	"io/ioutil"
+	"math/bits"
+	"os"
+	"sync"
+	"syscall"
+	"unsafe"
+
+	"golang.org/x/sys/unix"
+)
+
+// Lazily-initialized file descriptor for the control device /dev/loop-control (singleton)
+var (
+	// mutex is held by ensureFds while it checks and initializes loopControlFd.
+	mutex sync.Mutex
+	// loopControlFd is nil until ensureFds has successfully opened /dev/loop-control.
+	loopControlFd *os.File
+)
+
+// Kernel constants which this package defines itself instead of taking them from golang.org/x/sys/unix.
+const (
+	// LOOP_CONFIGURE from @linux//include/uapi/linux:loop.h
+	// Used by Create as the ioctl request which binds a backing file to a loop device.
+	loopConfigure = 0x4C0A
+	// LOOP_MAJOR from @linux//include/uapi/linux:major.h
+	// Used by Dev as the major number when building the device ID.
+	loopMajor = 7
+)
+
+// struct loop_config from @linux//include/uapi/linux:loop.h
+//
+// A pointer to this struct is passed directly to ioctl(LOOP_CONFIGURE) in Create, so field order and sizes must not
+// be changed. TestStructSize pins the expected total size.
+type loopConfig struct {
+	// File descriptor of the backing file.
+	fd        uint32
+	blockSize uint32 // Power of 2 between 512 and os.Getpagesize(), defaults reasonably
+	info      loopInfo64
+	_reserved [64]byte
+}
+
+// struct loop_info64 from @linux//include/uapi/linux:loop.h
+//
+// This struct is embedded in loopConfig and is also filled in by ioctl(LOOP_GET_STATUS64) in Open, so field order
+// and sizes must not be changed. Fields marked "used" are the ones this package sets when creating a device; Open
+// additionally reads number.
+type loopInfo64 struct {
+	device         uint64
+	inode          uint64
+	rdevice        uint64
+	offset         uint64 // used
+	sizeLimit      uint64 // used
+	number         uint32
+	encryptType    uint32
+	encryptKeySize uint32
+	flags          uint32   // Flags from Flag constant
+	filename       [64]byte // used
+	cryptname      [64]byte
+	encryptkey     [32]byte
+	init           [2]uint64
+}
+
+// Config contains the user-settable parameters of a loop device created with Create. The zero value is a valid
+// configuration.
+type Config struct {
+	// Block size of the loop device in bytes. Power of 2 between 512 and page size.
+	// Zero defaults to a reasonable block size.
+	BlockSize uint32
+	// Combination of flags from the Flag constants in this package.
+	Flags uint32
+	// Offset in bytes from the start of the file to the first byte of the device. Usually zero.
+	Offset uint64
+	// Maximum size of the loop device in bytes. Zero defaults to the whole file.
+	SizeLimit uint64
+}
+
+// validate checks the configuration before it is handed to the kernel, as kernel-side enforcement of these
+// constraints is inconsistent. A BlockSize of zero is always accepted.
+func (c *Config) validate() error {
+	if c.BlockSize == 0 {
+		return nil
+	}
+	pageSize := uint32(os.Getpagesize())
+	switch {
+	case c.BlockSize < 512, c.BlockSize > pageSize, bits.OnesCount32(c.BlockSize) != 1:
+		return errors.New("BlockSize needs to be a power of two between 512 bytes and the OS page size")
+	}
+	return nil
+}
+
+// ensureFds opens /dev/loop-control on first use and keeps it in loopControlFd. Later calls are no-ops once the
+// control device has been opened successfully.
+func ensureFds() (err error) {
+	mutex.Lock()
+	defer mutex.Unlock()
+	if loopControlFd == nil {
+		loopControlFd, err = os.Open("/dev/loop-control")
+	}
+	return err
+}
+
+// Device represents a loop device.
+type Device struct {
+	// Number of the loop device, i.e. the N in /dev/loopN.
+	num uint32
+	// Open file handle to the loop device node.
+	dev *os.File
+
+	// Set by Close. Most methods check it via ensureOpen and return an error once it is set.
+	closed bool
+}
+
+// Flags which can be combined and passed in Config.Flags.
+// All from @linux//include/uapi/linux:loop.h
+const (
+	// Makes the loop device read-only even if the backing file is read-write.
+	FlagReadOnly = 1
+	// Unbinds the backing file as soon as the last user is gone. Useful for unbinding after unmount.
+	FlagAutoclear = 4
+	// Enables kernel-side partition scanning on the loop device. Needed if you want to access specific partitions on
+	// a loop device.
+	FlagPartscan = 8
+	// Enables direct IO for the loop device, bypassing caches and buffer copying.
+	FlagDirectIO = 16
+)
+
+// Create creates a new loop device backed with the given file.
+//
+// The configuration is validated first, then a free loop device is requested from /dev/loop-control, opened
+// exclusively and bound to f with ioctl(LOOP_CONFIGURE). Because another process can grab the same free device
+// between these steps, the whole sequence is retried whenever the kernel reports EBUSY. The returned Device holds an
+// open file descriptor to the loop device; f itself is not closed and stays owned by the caller.
+func Create(f *os.File, c Config) (*Device, error) {
+	if err := c.validate(); err != nil {
+		return nil, err
+	}
+	if err := ensureFds(); err != nil {
+		return nil, fmt.Errorf("failed to access loop control device: %w", err)
+	}
+	for {
+		devNum, _, errno := syscall.Syscall(unix.SYS_IOCTL, loopControlFd.Fd(), unix.LOOP_CTL_GET_FREE, 0)
+		if errno != unix.Errno(0) {
+			return nil, fmt.Errorf("failed to allocate loop device: %w", os.NewSyscallError("ioctl(LOOP_CTL_GET_FREE)", errno))
+		}
+		dev, err := os.OpenFile(fmt.Sprintf("/dev/loop%v", devNum), os.O_RDWR|os.O_EXCL, 0)
+		if errors.Is(err, unix.EBUSY) {
+			// We have lost the race, get a new device
+			continue
+		}
+		if err != nil {
+			return nil, fmt.Errorf("failed to open newly-allocated loop device: %w", err)
+		}
+
+		var config loopConfig
+		config.fd = uint32(f.Fd())
+		config.blockSize = c.BlockSize
+		config.info.flags = c.Flags
+		config.info.offset = c.Offset
+		config.info.sizeLimit = c.SizeLimit
+
+		if _, _, errno := syscall.Syscall(unix.SYS_IOCTL, dev.Fd(), loopConfigure, uintptr(unsafe.Pointer(&config))); errno != 0 {
+			// The device was not configured, so its file descriptor is of no further use. Close it on both the
+			// retry and the error path to avoid leaking one descriptor per failed attempt.
+			dev.Close()
+			if errno == unix.EBUSY {
+				// We have lost the race, get a new device
+				continue
+			}
+			return nil, os.NewSyscallError("ioctl(LOOP_CONFIGURE)", errno)
+		}
+		return &Device{dev: dev, num: uint32(devNum)}, nil
+	}
+}
+
+// Open opens a loop device at the given path. It returns an error if the path is not a loop device.
+//
+// The path is opened read-only and probed with ioctl(LOOP_GET_STATUS64); the device number reported by the kernel is
+// stored in the returned Device. On any failure the file opened here is closed again before returning.
+func Open(path string) (*Device, error) {
+	potentialDevice, err := os.Open(path)
+	if err != nil {
+		return nil, fmt.Errorf("failed to open device: %w", err)
+	}
+	var loopInfo loopInfo64
+	_, _, errno := syscall.Syscall(unix.SYS_IOCTL, potentialDevice.Fd(), unix.LOOP_GET_STATUS64, uintptr(unsafe.Pointer(&loopInfo)))
+	switch errno {
+	case 0:
+		return &Device{dev: potentialDevice, num: loopInfo.number}, nil
+	case syscall.EINVAL, syscall.ENOTTY:
+		// ENOTTY is what ioctl returns when the request does not apply to the kind of object behind the file
+		// descriptor, which is the usual result for files that are not loop devices.
+		potentialDevice.Close()
+		return nil, errors.New("not a loop device")
+	default:
+		potentialDevice.Close()
+		return nil, fmt.Errorf("failed to determine state of potential loop device: %w", errno)
+	}
+}
+
+// ensureOpen returns an error if Close has already been called on this Device.
+func (d *Device) ensureOpen() error {
+	if !d.closed {
+		return nil
+	}
+	return errors.New("device is closed")
+}
+
+// DevPath returns the canonical path of this loop device in /dev. It fails if the Device has been closed.
+func (d *Device) DevPath() (string, error) {
+	err := d.ensureOpen()
+	if err != nil {
+		return "", err
+	}
+	path := fmt.Sprintf("/dev/loop%d", d.num)
+	return path, nil
+}
+
+// Dev returns the Linux device ID of the loop device, built from the loop major number and the device number. It
+// fails if the Device has been closed.
+func (d *Device) Dev() (uint64, error) {
+	err := d.ensureOpen()
+	if err != nil {
+		return 0, err
+	}
+	id := unix.Mkdev(loopMajor, d.num)
+	return id, nil
+}
+
+// BackingFilePath returns the path of the backing file as reported by the kernel in
+// /sys/block/loopN/loop/backing_file. It only uses the device number and not the device file descriptor, so it does
+// not check whether the Device has been closed.
+func (d *Device) BackingFilePath() (string, error) {
+	backingFile, err := ioutil.ReadFile(fmt.Sprintf("/sys/block/loop%d/loop/backing_file", d.num))
+	if err != nil {
+		return "", fmt.Errorf("failed to get backing file path: %w", err)
+	}
+	// The sysfs attribute is terminated by a single newline which is not part of the path. Only one newline is
+	// stripped so that a path which itself ends in a newline is still returned correctly.
+	if n := len(backingFile); n > 0 && backingFile[n-1] == '\n' {
+		backingFile = backingFile[:n-1]
+	}
+	return string(backingFile), nil
+}
+
+// RefreshSize recalculates the size of the loop device based on the config and the size of the backing file. It
+// issues ioctl(LOOP_SET_CAPACITY) on the device and fails if the Device has been closed.
+func (d *Device) RefreshSize() error {
+	err := d.ensureOpen()
+	if err != nil {
+		return err
+	}
+	fd := int(d.dev.Fd())
+	return unix.IoctlSetInt(fd, unix.LOOP_SET_CAPACITY, 0)
+}
+
+// Close closes all file descriptors open to the device. Does not remove the device itself or alter its configuration.
+// Calling Close on an already-closed Device returns an error.
+func (d *Device) Close() error {
+	err := d.ensureOpen()
+	if err != nil {
+		return err
+	}
+	// Mark the Device as closed even if closing the underlying file fails.
+	d.closed = true
+	err = d.dev.Close()
+	return err
+}
+
+// Remove removes the loop device.
+//
+// It unbinds the backing file with ioctl(LOOP_CLR_FD), closes the Device and then asks /dev/loop-control to remove
+// the device node with ioctl(LOOP_CTL_REMOVE). The Device is closed afterwards, unless unbinding the backing file
+// already failed. It returns an error if the Device has already been closed.
+func (d *Device) Remove() error {
+	if err := d.ensureOpen(); err != nil {
+		return err
+	}
+	// A Device obtained through Open may be removed without Create ever having been called in this process, in which
+	// case the control device has not been opened yet. Do this before touching the device so that a failure here
+	// leaves it untouched.
+	if err := ensureFds(); err != nil {
+		return fmt.Errorf("failed to access loop control device: %w", err)
+	}
+	if err := unix.IoctlSetInt(int(d.dev.Fd()), unix.LOOP_CLR_FD, 0); err != nil {
+		return err
+	}
+	if err := d.Close(); err != nil {
+		return fmt.Errorf("failed to close device: %w", err)
+	}
+	return unix.IoctlSetInt(int(loopControlFd.Fd()), unix.LOOP_CTL_REMOVE, int(d.num))
+}
diff --git a/metropolis/pkg/loop/loop_test.go b/metropolis/pkg/loop/loop_test.go
new file mode 100644
index 0000000..1ddb34f
--- /dev/null
+++ b/metropolis/pkg/loop/loop_test.go
@@ -0,0 +1,208 @@
+// Copyright 2020 The Monogon Project Authors.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package loop
+
+import (
+	"encoding/binary"
+	"io"
+	"io/ioutil"
+	"math"
+	"os"
+	"runtime"
+	"syscall"
+	"testing"
+	"unsafe"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"golang.org/x/sys/unix"
+)
+
+// Write a test file with a very specific pattern (increasing little-endian 16 bit unsigned integers) to detect offset
+// correctness. File is always 128KiB large (2^16 * 2 bytes).
+func makeTestFile() *os.File {
+	f, err := ioutil.TempFile("/tmp", "")
+	if err != nil {
+		panic(err)
+	}
+	for i := 0; i <= math.MaxUint16; i++ {
+		if err := binary.Write(f, binary.LittleEndian, uint16(i)); err != nil {
+			panic(err)
+		}
+	}
+	if _, err := f.Seek(0, io.SeekStart); err != nil {
+		panic(err)
+	}
+	return f
+}
+
+// getBlkdevSize returns the size in bytes of the block device behind f as reported by ioctl(BLKGETSIZE64). Panics if
+// the ioctl fails.
+func getBlkdevSize(f *os.File) (size uint64) {
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, f.Fd(), unix.BLKGETSIZE64, uintptr(unsafe.Pointer(&size)))
+	if errno != 0 {
+		panic(errno)
+	}
+	return size
+}
+
+// getOffsetFromContent reads the next 16 bit value from the device and converts it into the byte offset at which
+// makeTestFile wrote that value into the backing file. Panics if the read fails.
+func getOffsetFromContent(dev *Device) (firstIndex uint16) {
+	err := binary.Read(dev.dev, binary.LittleEndian, &firstIndex)
+	if err != nil {
+		panic(err)
+	}
+	// 2 bytes per index
+	return firstIndex * 2
+}
+
+// setupCreate creates a loop device with the given config on top of a fresh test file. It registers a cleanup which
+// removes both the device and the file, and aborts the test if the device could not be created.
+func setupCreate(t *testing.T, config Config) *Device {
+	backing := makeTestFile()
+	// The loop device keeps its own reference to the backing file, so our handle can go once we return.
+	defer backing.Close()
+
+	loopDev, err := Create(backing, config)
+	assert.NoError(t, err)
+	t.Cleanup(func() {
+		if loopDev != nil {
+			loopDev.Remove()
+		}
+		os.Remove(backing.Name())
+	})
+	if loopDev == nil {
+		t.FailNow()
+	}
+	return loopDev
+}
+
+// TestDeviceAccessors checks DevPath, Dev and BackingFilePath on a freshly created loop device. It expects that
+// device to be /dev/loop0.
+func TestDeviceAccessors(t *testing.T) {
+	if os.Getenv("IN_KTEST") != "true" {
+		t.Skip("Not in ktest")
+	}
+	const wantPath = "/dev/loop0"
+	loopDev := setupCreate(t, Config{})
+
+	gotPath, err := loopDev.DevPath()
+	assert.NoError(t, err)
+	require.Equal(t, wantPath, gotPath)
+
+	// The device ID has to agree with what stat reports for the device node.
+	var st unix.Stat_t
+	assert.NoError(t, unix.Stat(wantPath, &st))
+	gotDev, err := loopDev.Dev()
+	assert.NoError(t, err)
+	require.Equal(t, st.Rdev, gotDev)
+
+	backingPath, err := loopDev.BackingFilePath()
+	assert.NoError(t, err)
+	// The filename of the temporary file is not available in this context, but we know that the file
+	// needs to be in /tmp, which should be a good-enough test.
+	assert.Contains(t, backingPath, "/tmp/")
+}
+
+func TestCreate(t *testing.T) {
+	if os.Getenv("IN_KTEST") != "true" {
+		t.Skip("Not in ktest")
+	}
+	t.Parallel()
+	tests := []struct {
+		name     string
+		config   Config
+		validate func(t *testing.T, dev *Device)
+	}{
+		{"NoOpts", Config{}, func(t *testing.T, dev *Device) {
+			require.Equal(t, uint64(128*1024), getBlkdevSize(dev.dev))
+			require.Equal(t, uint16(0), getOffsetFromContent(dev))
+
+			_, err := dev.dev.WriteString("test")
+			assert.NoError(t, err)
+		}},
+		{"DirectIO", Config{Flags: FlagDirectIO}, func(t *testing.T, dev *Device) {
+			require.Equal(t, uint64(128*1024), getBlkdevSize(dev.dev))
+
+			_, err := dev.dev.WriteString("test")
+			assert.NoError(t, err)
+		}},
+		{"ReadOnly", Config{Flags: FlagReadOnly}, func(t *testing.T, dev *Device) {
+			_, err := dev.dev.WriteString("test")
+			assert.Error(t, err)
+		}},
+		{"Mapping", Config{BlockSize: 512, SizeLimit: 2048, Offset: 4096}, func(t *testing.T, dev *Device) {
+			assert.Equal(t, uint16(4096), getOffsetFromContent(dev))
+			assert.Equal(t, uint64(2048), getBlkdevSize(dev.dev))
+		}},
+	}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			dev := setupCreate(t, test.config)
+			test.validate(t, dev)
+			assert.NoError(t, dev.Remove())
+		})
+	}
+}
+
+// TestOpenBadDevice checks that Open rejects a device node which is not a loop device.
+func TestOpenBadDevice(t *testing.T) {
+	if os.Getenv("IN_KTEST") != "true" {
+		t.Skip("Not in ktest")
+	}
+	notLoop, err := Open("/dev/null")
+	// Prevent leaks in case this test fails
+	defer func() {
+		if notLoop != nil {
+			notLoop.Close()
+		}
+	}()
+	require.Error(t, err)
+}
+
+// TestOpen creates a loop device, closes the handle to it and checks that Open on its path returns a Device which
+// refers to the same loop device.
+func TestOpen(t *testing.T) {
+	if os.Getenv("IN_KTEST") != "true" {
+		t.Skip("Not in ktest")
+	}
+	f := makeTestFile()
+	defer os.Remove(f.Name())
+	defer f.Close()
+	dev, err := Create(f, Config{})
+	// dev is nil if Create fails, so stop here instead of panicking on the method calls below.
+	require.NoError(t, err)
+	path, err := dev.DevPath()
+	assert.NoError(t, err)
+	assert.NoError(t, dev.Close())
+	reopenedDev, err := Open(path)
+	// Same as above: reopenedDev is nil if Open fails.
+	require.NoError(t, err)
+	defer reopenedDev.Remove()
+	reopenedDevPath, err := reopenedDev.DevPath()
+	assert.NoError(t, err)
+	require.Equal(t, path, reopenedDevPath) // Still needs to be the same device
+}
+
+// TestResize grows the backing file of a loop device from 64KiB to 96KiB and checks that the device only reports the
+// new size after RefreshSize has been called.
+func TestResize(t *testing.T) {
+	if os.Getenv("IN_KTEST") != "true" {
+		t.Skip("Not in ktest")
+	}
+	f, err := ioutil.TempFile("/tmp", "")
+	// Nothing below can work without the backing file.
+	require.NoError(t, err)
+	// Deferred calls run in reverse order: remove the device, close the file, delete the file.
+	defer os.Remove(f.Name())
+	defer f.Close()
+	empty1K := make([]byte, 1024)
+	for i := 0; i < 64; i++ {
+		_, err := f.Write(empty1K)
+		assert.NoError(t, err)
+	}
+	dev, err := Create(f, Config{})
+	// dev is nil if Create fails, so stop here instead of panicking on dev.dev below.
+	require.NoError(t, err)
+	defer dev.Remove()
+	require.Equal(t, uint64(64*1024), getBlkdevSize(dev.dev))
+	for i := 0; i < 32; i++ {
+		_, err := f.Write(empty1K)
+		assert.NoError(t, err)
+	}
+	assert.NoError(t, f.Sync())
+	assert.NoError(t, dev.RefreshSize())
+	require.Equal(t, uint64(96*1024), getBlkdevSize(dev.dev))
+}
+
+// TestStructSize checks that loopConfig has the size the kernel expects for struct loop_config. The reference value
+// is only known for linux/amd64, so the test is skipped on every other OS or architecture.
+func TestStructSize(t *testing.T) {
+	// Skip unless both OS and architecture match the platform the reference value was taken from.
+	if runtime.GOOS != "linux" || runtime.GOARCH != "amd64" {
+		t.Skip("Reference value not available")
+	}
+	require.Equal(t, uintptr(304), unsafe.Sizeof(loopConfig{}))
+}
diff --git a/metropolis/test/ktest/init/main.go b/metropolis/test/ktest/init/main.go
index f6049db..0236531 100644
--- a/metropolis/test/ktest/init/main.go
+++ b/metropolis/test/ktest/init/main.go
@@ -39,6 +39,7 @@
 		{"/proc", "proc", unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV},
 		{"/dev", "devtmpfs", unix.MS_NOEXEC | unix.MS_NOSUID},
 		{"/dev/pts", "devpts", unix.MS_NOEXEC | unix.MS_NOSUID},
+		{"/tmp", "tmpfs", 0},
 	} {
 		if err := os.Mkdir(el.dir, 0755); err != nil && !os.IsExist(err) {
 			return fmt.Errorf("could not make %s: %w", el.dir, err)