init: remount to tmpfs

runsc needs to be able to pivot_root. According to @lorenz, this does not
work from an initramfs. This change introduces a temporary fix: move the
root onto a tmpfs and re-exec init from there.

A proper fix would be to use a real filesystem instead of initramfs
(like squashfs), but this will do for now.

We also use this opportunity to switch to devtmpfs instead of manually
managing /dev. This collides with the storage manager, which tries to
create the block device nodes itself - so we remove that code.

Test Plan: shouldn't change behaviour

X-Origin-Diff: phab/D433
GitOrigin-RevId: aa59fec6551bab1b1b9c2fe037dce410e550981b
diff --git a/core/cmd/init/BUILD.bazel b/core/cmd/init/BUILD.bazel
index e8e55dc..0765538 100644
--- a/core/cmd/init/BUILD.bazel
+++ b/core/cmd/init/BUILD.bazel
@@ -2,7 +2,10 @@
 
 go_library(
     name = "go_default_library",
-    srcs = ["main.go"],
+    srcs = [
+        "main.go",
+        "switchroot.go",
+    ],
     importpath = "git.monogon.dev/source/nexantic.git/core/cmd/init",
     visibility = ["//visibility:private"],
     deps = [
diff --git a/core/cmd/init/main.go b/core/cmd/init/main.go
index 82ba033..f4ff871 100644
--- a/core/cmd/init/main.go
+++ b/core/cmd/init/main.go
@@ -55,23 +55,15 @@
 	if err != nil {
 		panic(err)
 	}
+
+	// Remount onto a tmpfs and re-exec if needed. Otherwise, keep running.
+	err = switchRoot(logger)
+	if err != nil {
+		panic(fmt.Errorf("could not remount root: %w", err))
+	}
+
 	logger.Info("Starting Smalltown Init")
 
-	// Set up bare minimum mounts
-	if err := os.Mkdir("/sys", 0755); err != nil {
-		panic(err)
-	}
-	if err := unix.Mount("sysfs", "/sys", "sysfs", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV, ""); err != nil {
-		panic(err)
-	}
-
-	if err := os.Mkdir("/proc", 0755); err != nil {
-		panic(err)
-	}
-	if err := unix.Mount("procfs", "/proc", "proc", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV, ""); err != nil {
-		panic(err)
-	}
-
 	signalChannel := make(chan os.Signal, 2)
 	signal.Notify(signalChannel)
 
@@ -81,7 +73,7 @@
 
 	storageManager, err := storage.Initialize(logger.With(zap.String("component", "storage")))
 	if err != nil {
-		panic(err)
+		panic(fmt.Errorf("could not initialize storage: %w", err))
 	}
 
 	networkSvc, err := network.NewNetworkService(network.Config{}, logger.With(zap.String("component", "network")))
diff --git a/core/cmd/init/switchroot.go b/core/cmd/init/switchroot.go
new file mode 100644
index 0000000..0e68b06
--- /dev/null
+++ b/core/cmd/init/switchroot.go
@@ -0,0 +1,174 @@
+// Copyright 2020 The Monogon Project Authors.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+	"fmt"
+	"io"
+	"os"
+	"path/filepath"
+	"strings"
+	"syscall"
+
+	"go.uber.org/zap"
+	"golang.org/x/sys/unix"
+)
+
+// switchRoot moves the contents of the initramfs root into a tmpfs, makes that tmpfs the new / and re-execs /init.
+// This is necessary because you cannot pivot_root from an initramfs (and runsc wants to do that).
+// In the future, we should use something like squashfs instead of an initramfs and just nuke this.
+func switchRoot(log *zap.Logger) error {
+	// We detect the need to remount to tmpfs via an environment variable.
+	// The first run of /init (from initramfs) will not have this var, and will be re-exec'd from a new tmpfs root with
+	// that variable set.
+	witness := "SIGNOS_REMOUNTED"
+
+	// If the witness env var is found in the environment, we are already running from the tmpfs root: nothing to do.
+	environ := os.Environ()
+	for _, env := range environ {
+		if strings.HasPrefix(env, witness+"=") {
+			log.Info("Smalltown running in tmpfs root")
+			return nil
+		}
+	}
+
+	// Otherwise, we need to remount to a tmpfs. The witness is added to the environment handed to the re-exec below.
+	environ = append(environ, witness+"=yes")
+	log.Info("Smalltown running in initramfs, remounting to tmpfs...")
+
+	// Make note of all directories we have to make (dirs) and of everything else, which we have to copy (paths).
+	paths := []string{}
+	dirs := []string{}
+	err := filepath.Walk("/", func(path string, info os.FileInfo, err error) error {
+		if err != nil {
+			return err
+		}
+		if path == "/" {
+			return nil
+		}
+		// /dev is prepopulated by the initramfs, skip that. The target root uses devtmpfs.
+		if path == "/dev" || strings.HasPrefix(path, "/dev/") {
+			return nil
+		}
+
+		if info.IsDir() {
+			dirs = append(dirs, path)
+		} else {
+			paths = append(paths, path)
+		}
+
+		return nil
+	})
+	if err != nil {
+		return fmt.Errorf("could not list root files: %w", err)
+	}
+
+	log.Info("Copying to tmpfs", zap.Strings("paths", paths), zap.Strings("dirs", dirs))
+
+	// Make new root at /mnt. This happens after the walk above, so /mnt itself is not part of dirs.
+	if err := os.Mkdir("/mnt", 0755); err != nil {
+		return fmt.Errorf("could not make /mnt: %w", err)
+	}
+	// And mount a tmpfs on it
+	if err := unix.Mount("tmpfs", "/mnt", "tmpfs", 0, ""); err != nil {
+		return fmt.Errorf("could not mount tmpfs on /mnt: %w", err)
+	}
+
+	// Make all directories. Since filepath.Walk visits every directory before its contents (in lexical order), parents
+	// are always created before their children.
+	for _, src := range dirs {
+		stat, err := os.Stat(src)
+		if err != nil {
+			return fmt.Errorf("Stat(%q): %w", src, err)
+		}
+		dst := "/mnt" + src
+		err = os.Mkdir(dst, stat.Mode())
+		if err != nil {
+			return fmt.Errorf("Mkdir(%q): %w", dst, err)
+		}
+	}
+
+	// Move all files over. Parent directories will exist by now.
+	for _, src := range paths {
+		stat, err := os.Stat(src)
+		if err != nil {
+			return fmt.Errorf("Stat(%q): %w", src, err)
+		}
+		dst := "/mnt" + src
+
+		// Copy file. NOTE(review): Stat/Open follow symlinks, so a symlink becomes a copy of its target - confirm none.
+		sfd, err := os.Open(src)
+		if err != nil {
+			return fmt.Errorf("Open(%q): %w", src, err)
+		}
+		dfd, err := os.OpenFile(dst, os.O_WRONLY|os.O_CREATE, stat.Mode())
+		if err != nil {
+			sfd.Close()
+			return fmt.Errorf("OpenFile(%q): %w", dst, err)
+		}
+		_, err = io.Copy(dfd, sfd)
+
+		sfd.Close()
+		dfd.Close()
+		if err != nil {
+			return fmt.Errorf("Copying %q failed: %w", src, err)
+		}
+
+		// Remove the old file from the initramfs now that its copy exists in the new root.
+		err = unix.Unlink(src)
+		if err != nil {
+			return fmt.Errorf("Unlink(%q): %w", src, err)
+		}
+	}
+
+	// Set up target filesystems in the new root. NOTE(review): Mkdir fails if the initramfs already had /sys or /proc.
+	for _, el := range []struct {
+		dir   string
+		fs    string
+		flags uintptr
+	}{
+		{"/sys", "sysfs", unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV},
+		{"/proc", "proc", unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV},
+		{"/dev", "devtmpfs", unix.MS_NOEXEC | unix.MS_NOSUID},
+		{"/dev/pts", "devpts", unix.MS_NOEXEC | unix.MS_NOSUID},
+	} {
+		if err := os.Mkdir("/mnt"+el.dir, 0755); err != nil {
+			return fmt.Errorf("could not make /mnt%s: %w", el.dir, err)
+		}
+		if err := unix.Mount(el.fs, "/mnt"+el.dir, el.fs, el.flags, ""); err != nil {
+			return fmt.Errorf("could not mount %s on /mnt%s: %w", el.fs, el.dir, err)
+		}
+	}
+
+	// Switch to the new root: chdir into /mnt, move its mount onto / and chroot into the current directory.
+	// This is adapted from util-linux's switch_root.
+	err = os.Chdir("/mnt")
+	if err != nil {
+		return fmt.Errorf("could not chdir to /mnt: %w", err)
+	}
+	err = syscall.Mount("/mnt", "/", "", syscall.MS_MOVE, "")
+	if err != nil {
+		return fmt.Errorf("could not remount /mnt to /: %w", err)
+	}
+	err = syscall.Chroot(".")
+	if err != nil {
+		return fmt.Errorf("could not chroot to new root: %w", err)
+	}
+
+	// Re-exec into new init with new environment. Exec only returns if it failed; that error is passed on as-is.
+	return unix.Exec("/init", os.Args, environ)
+}
diff --git a/core/internal/storage/find.go b/core/internal/storage/find.go
index 1abf6c0..8d83510 100644
--- a/core/internal/storage/find.go
+++ b/core/internal/storage/find.go
@@ -56,14 +56,7 @@
 			if err != nil {
 				return fmt.Errorf("failed to convert uevent: %w", err)
 			}
-			minorDev, err := strconv.Atoi(ueventData["MINOR"])
-			if err != nil {
-				return fmt.Errorf("failed to convert uevent: %w", err)
-			}
 			devNodeName := fmt.Sprintf("/dev/%v", ueventData["DEVNAME"])
-			if err := unix.Mknod(devNodeName, 0600|unix.S_IFBLK, int(unix.Mkdev(uint32(majorDev), uint32(minorDev)))); err != nil {
-				return fmt.Errorf("failed to create block device node: %w", err)
-			}
 			blkdev, err := os.Open(devNodeName)
 			if err != nil {
 				return fmt.Errorf("failed to open block device %v: %w", devNodeName, err)
diff --git a/third_party/linux/linux-smalltown.config b/third_party/linux/linux-smalltown.config
index e2fcdf3..86d02a4 100644
--- a/third_party/linux/linux-smalltown.config
+++ b/third_party/linux/linux-smalltown.config
@@ -2542,7 +2542,7 @@
 # CONFIG_PROC_CHILDREN is not set
 CONFIG_KERNFS=y
 CONFIG_SYSFS=y
-# CONFIG_TMPFS is not set
+CONFIG_TMPFS=y
 # CONFIG_HUGETLBFS is not set
 CONFIG_CONFIGFS_FS=y
 CONFIG_EFIVAR_FS=y