blob: 7391e755305261dafc93f63c0d15e14ea3c1c52a [file] [log] [blame]
// Copyright 2020 The Monogon Project Authors.
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
16
17package main
18
19import (
20 "fmt"
21 "io"
Lorenz Brunc88c82d2020-05-08 14:35:04 +020022 "io/ioutil"
Serge Bazanski581b0bd2020-03-12 13:36:43 +010023 "os"
24 "path/filepath"
25 "strings"
26 "syscall"
27
Serge Bazanski581b0bd2020-03-12 13:36:43 +010028 "golang.org/x/sys/unix"
Serge Bazanski77cb6c52020-12-19 00:09:22 +010029
Serge Bazanski31370b02021-01-07 16:31:14 +010030 "source.monogon.dev/metropolis/pkg/logtree"
Serge Bazanski581b0bd2020-03-12 13:36:43 +010031)
32
33// switchRoot moves the root from initramfs into a tmpfs
34// This is necessary because you cannot pivot_root from a initramfs (and runsc wants to do that).
35// In the future, we should instead use something like squashfs instead of an initramfs and just nuke this.
Serge Bazanskic7359672020-10-30 16:38:57 +010036func switchRoot(log logtree.LeveledLogger) error {
Serge Bazanski581b0bd2020-03-12 13:36:43 +010037 // We detect the need to remount to tmpfs over env vars.
38 // The first run of /init (from initramfs) will not have this var, and will be re-exec'd from a new tmpfs root with
39 // that variable set.
Serge Bazanski662b5b32020-12-21 13:49:00 +010040 witness := "METROPOLIS_REMOUNTED"
Serge Bazanski581b0bd2020-03-12 13:36:43 +010041
42 // If the witness env var is found in the environment, it means we are ready to go.
43 environ := os.Environ()
44 for _, env := range environ {
45 if strings.HasPrefix(env, witness+"=") {
Serge Bazanski662b5b32020-12-21 13:49:00 +010046 log.Info("Metropolis node running in tmpfs root")
Serge Bazanski581b0bd2020-03-12 13:36:43 +010047 return nil
48 }
49 }
50
51 // Otherwise, we need to remount to a tmpfs.
52 environ = append(environ, witness+"=yes")
Serge Bazanski662b5b32020-12-21 13:49:00 +010053 log.Info("Metropolis node running in initramfs, remounting to tmpfs...")
Serge Bazanski581b0bd2020-03-12 13:36:43 +010054
55 // Make note of all directories we have to make and files that we have to copy.
56 paths := []string{}
57 dirs := []string{}
58 err := filepath.Walk("/", func(path string, info os.FileInfo, err error) error {
59 if err != nil {
60 return err
61 }
62 if path == "/" {
63 return nil
64 }
65 // /dev is prepopulated by the initramfs, skip that. The target root uses devtmpfs.
66 if path == "/dev" || strings.HasPrefix(path, "/dev/") {
67 return nil
68 }
69
70 if info.IsDir() {
71 dirs = append(dirs, path)
72 } else {
73 paths = append(paths, path)
74 }
75
76 return nil
77 })
78 if err != nil {
79 return fmt.Errorf("could not list root files: %w", err)
80 }
81
Serge Bazanskic7359672020-10-30 16:38:57 +010082 log.Info("Copying paths to tmpfs:")
83 for _, p := range paths {
84 log.Infof(" - %s", p)
85 }
Serge Bazanski581b0bd2020-03-12 13:36:43 +010086
87 // Make new root at /mnt
88 if err := os.Mkdir("/mnt", 0755); err != nil {
89 return fmt.Errorf("could not make /mnt: %w", err)
90 }
91 // And mount a tmpfs on it
92 if err := unix.Mount("tmpfs", "/mnt", "tmpfs", 0, ""); err != nil {
93 return fmt.Errorf("could not mount tmpfs on /mnt: %w", err)
94 }
95
96 // Make all directories. Since filepath.Walk is lexicographically ordered, we don't need to ensure that the parent
97 // exists.
98 for _, src := range dirs {
99 stat, err := os.Stat(src)
100 if err != nil {
101 return fmt.Errorf("Stat(%q): %w", src, err)
102 }
103 dst := "/mnt" + src
104 err = os.Mkdir(dst, stat.Mode())
105 if err != nil {
106 return fmt.Errorf("Mkdir(%q): %w", dst, err)
107 }
108 }
109
110 // Move all files over. Parent directories will exist by now.
111 for _, src := range paths {
112 stat, err := os.Stat(src)
113 if err != nil {
114 return fmt.Errorf("Stat(%q): %w", src, err)
115 }
116 dst := "/mnt" + src
117
118 // Copy file.
119 sfd, err := os.Open(src)
120 if err != nil {
121 return fmt.Errorf("Open(%q): %w", src, err)
122 }
123 dfd, err := os.OpenFile(dst, os.O_WRONLY|os.O_CREATE, stat.Mode())
124 if err != nil {
125 sfd.Close()
126 return fmt.Errorf("OpenFile(%q): %w", dst, err)
127 }
128 _, err = io.Copy(dfd, sfd)
129
130 sfd.Close()
131 dfd.Close()
132 if err != nil {
133 return fmt.Errorf("Copying %q failed: %w", src, err)
134 }
135
136 // Remove the old file.
137 err = unix.Unlink(src)
138 if err != nil {
139 return fmt.Errorf("Unlink(%q): %w", src, err)
140 }
141 }
142
143 // Set up target filesystems.
144 for _, el := range []struct {
145 dir string
146 fs string
147 flags uintptr
148 }{
149 {"/sys", "sysfs", unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV},
150 {"/proc", "proc", unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV},
151 {"/dev", "devtmpfs", unix.MS_NOEXEC | unix.MS_NOSUID},
152 {"/dev/pts", "devpts", unix.MS_NOEXEC | unix.MS_NOSUID},
153 } {
154 if err := os.Mkdir("/mnt"+el.dir, 0755); err != nil {
155 return fmt.Errorf("could not make /mnt%s: %w", el.dir, err)
156 }
157 if err := unix.Mount(el.fs, "/mnt"+el.dir, el.fs, el.flags, ""); err != nil {
158 return fmt.Errorf("could not mount %s on /mnt%s: %w", el.fs, el.dir, err)
159 }
160 }
161
Lorenz Brunc88c82d2020-05-08 14:35:04 +0200162 // Mount all available CGroups for v1 (v2 uses a single unified hierarchy and is not supported by our runtimes yet)
163 if unix.Mount("tmpfs", "/mnt/sys/fs/cgroup", "tmpfs", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV, ""); err != nil {
164 panic(err)
165 }
166 cgroupsRaw, err := ioutil.ReadFile("/mnt/proc/cgroups")
167 if err != nil {
168 panic(err)
169 }
170
171 cgroupLines := strings.Split(string(cgroupsRaw), "\n")
172 for _, cgroupLine := range cgroupLines {
173 if cgroupLine == "" || strings.HasPrefix(cgroupLine, "#") {
174 continue
175 }
176 cgroupParts := strings.Split(cgroupLine, "\t")
177 cgroupName := cgroupParts[0]
178 if err := os.Mkdir("/mnt/sys/fs/cgroup/"+cgroupName, 0755); err != nil {
179 panic(err)
180 }
181 if err := unix.Mount("cgroup", "/mnt/sys/fs/cgroup/"+cgroupName, "cgroup", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV, cgroupName); err != nil {
182 panic(err)
183 }
184 }
185
186 // Enable hierarchical memory accounting
187 useMemoryHierarchy, err := os.OpenFile("/mnt/sys/fs/cgroup/memory/memory.use_hierarchy", os.O_RDWR, 0)
188 if err != nil {
189 panic(err)
190 }
191 if _, err := useMemoryHierarchy.WriteString("1"); err != nil {
192 panic(err)
193 }
194 useMemoryHierarchy.Close()
195
Serge Bazanski581b0bd2020-03-12 13:36:43 +0100196 // Chroot to new root.
197 // This is adapted from util-linux's switch_root.
198 err = os.Chdir("/mnt")
199 if err != nil {
200 return fmt.Errorf("could not chdir to /mnt: %w", err)
201 }
202 err = syscall.Mount("/mnt", "/", "", syscall.MS_MOVE, "")
203 if err != nil {
204 return fmt.Errorf("could not remount /mnt to /: %w", err)
205 }
206 err = syscall.Chroot(".")
207 if err != nil {
208 return fmt.Errorf("could not chroot to new root: %w", err)
209 }
210
211 // Re-exec into new init with new environment
212 return unix.Exec("/init", os.Args, environ)
213}