blob: 527030ab4d0591f553ec74a36846b94e118f640e [file] [log] [blame]
// Copyright 2020 The Monogon Project Authors.
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
16
17package main
18
19import (
Serge Bazanskicdb8c782020-02-17 12:34:02 +010020 "context"
Lorenz Brundd8c80e2019-10-07 16:19:49 +020021 "fmt"
Serge Bazanski3b098232023-03-16 17:57:02 +010022 "strings"
Lorenz Brunae0d90d2019-09-05 17:53:56 +020023
Lorenz Brunae0d90d2019-09-05 17:53:56 +020024 "golang.org/x/sys/unix"
Serge Bazanski99f47742021-08-04 20:21:42 +020025
Serge Bazanski3c5d0632024-09-12 10:49:12 +000026 "source.monogon.dev/go/logging"
Serge Bazanski31370b02021-01-07 16:31:14 +010027 "source.monogon.dev/metropolis/node/core/cluster"
Lorenz Brun6c454342023-06-01 12:23:38 +020028 "source.monogon.dev/metropolis/node/core/devmgr"
Serge Bazanski31370b02021-01-07 16:31:14 +010029 "source.monogon.dev/metropolis/node/core/localstorage"
30 "source.monogon.dev/metropolis/node/core/localstorage/declarative"
Serge Bazanskiefbde192024-07-31 14:53:20 +000031 "source.monogon.dev/metropolis/node/core/metrics"
Serge Bazanski31370b02021-01-07 16:31:14 +010032 "source.monogon.dev/metropolis/node/core/network"
Serge Bazanskif9edf522021-06-17 15:57:13 +020033 "source.monogon.dev/metropolis/node/core/roleserve"
Serge Bazanski58ddc092022-06-30 18:23:33 +020034 "source.monogon.dev/metropolis/node/core/rpc/resolver"
Serge Bazanski0d9e1252024-09-03 12:16:47 +020035 "source.monogon.dev/metropolis/node/core/tconsole"
Lorenz Brune306d782021-09-01 13:01:06 +020036 timesvc "source.monogon.dev/metropolis/node/core/time"
Lorenz Brun35fcf032023-06-29 04:15:58 +020037 "source.monogon.dev/metropolis/node/core/update"
Serge Bazanski8d64a3b2023-11-20 12:58:42 +010038 mversion "source.monogon.dev/metropolis/version"
Tim Windelschmidt7dac92b2024-12-16 02:51:04 +010039 "source.monogon.dev/osbase/bringup"
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020040 "source.monogon.dev/osbase/logtree"
Jan Schär91bf1c82024-07-29 17:31:33 +020041 "source.monogon.dev/osbase/net/dns"
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020042 "source.monogon.dev/osbase/supervisor"
Serge Bazanski154e6d92024-09-11 17:26:31 +020043 "source.monogon.dev/osbase/sysctl"
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020044 "source.monogon.dev/osbase/tpm"
Serge Bazanski8d64a3b2023-11-20 12:58:42 +010045 "source.monogon.dev/version"
Lorenz Brunae0d90d2019-09-05 17:53:56 +020046)
47
48func main() {
Tim Windelschmidt7dac92b2024-12-16 02:51:04 +010049 bringup.Runnable(root).RunWith(bringup.Config{
50 Console: bringup.ConsoleConfig{
51 ShortenDictionary: logtree.MetropolisShortenDict,
52 Filter: consoleFilter,
Serge Bazanski3b098232023-03-16 17:57:02 +010053 },
Tim Windelschmidt7dac92b2024-12-16 02:51:04 +010054 Supervisor: bringup.SupervisorConfig{
55 Metrics: []supervisor.Metrics{
56 supervisor.NewMetricsPrometheus(metrics.CoreRegistry),
57 },
Lorenz Brunf0b22ff2023-05-02 16:04:20 +020058 },
Tim Windelschmidt7dac92b2024-12-16 02:51:04 +010059 })
Serge Bazanski57b43752020-07-13 19:17:48 +020060}
Serge Bazanski3b098232023-03-16 17:57:02 +010061
62// consoleFilter is used to filter out some uselessly verbose logs from the
63// console.
64//
65// This should be limited to external services, our internal services should
66// instead just have good logging by default.
67func consoleFilter(p *logtree.LogEntry) bool {
68 if p.Raw != nil {
69 return false
70 }
71 if p.Leveled == nil {
72 return false
73 }
74 s := string(p.DN)
75 if strings.HasPrefix(s, "root.role.controlplane.launcher.consensus.etcd") {
Serge Bazanski3c5d0632024-09-12 10:49:12 +000076 return p.Leveled.Severity().AtLeast(logging.WARNING)
Serge Bazanski3b098232023-03-16 17:57:02 +010077 }
78 // TODO(q3k): turn off RPC traces instead
79 if strings.HasPrefix(s, "root.role.controlplane.launcher.curator.listener.rpc") {
80 return false
81 }
82 if strings.HasPrefix(s, "root.role.kubernetes.run.kubernetes.networked.kubelet") {
Serge Bazanski3c5d0632024-09-12 10:49:12 +000083 return p.Leveled.Severity().AtLeast(logging.WARNING)
Serge Bazanski3b098232023-03-16 17:57:02 +010084 }
85 if strings.HasPrefix(s, "root.role.kubernetes.run.kubernetes.networked.apiserver") {
Serge Bazanski3c5d0632024-09-12 10:49:12 +000086 return p.Leveled.Severity().AtLeast(logging.WARNING)
Serge Bazanski3b098232023-03-16 17:57:02 +010087 }
88 if strings.HasPrefix(s, "root.role.kubernetes.run.kubernetes.controller-manager") {
Serge Bazanski3c5d0632024-09-12 10:49:12 +000089 return p.Leveled.Severity().AtLeast(logging.WARNING)
Serge Bazanski3b098232023-03-16 17:57:02 +010090 }
91 if strings.HasPrefix(s, "root.role.kubernetes.run.kubernetes.scheduler") {
Serge Bazanski3c5d0632024-09-12 10:49:12 +000092 return p.Leveled.Severity().AtLeast(logging.WARNING)
Serge Bazanski3b098232023-03-16 17:57:02 +010093 }
Lorenz Brun6eb3fb32023-08-09 17:19:24 +020094 if strings.HasPrefix(s, "root.kernel") {
95 // Linux writes high-severity logs directly to the console anyways and
96 // its low-severity logs are too verbose.
97 return false
98 }
Serge Bazanski6b7731e2023-03-22 17:58:04 +010099 if strings.HasPrefix(s, "supervisor") {
Serge Bazanski3c5d0632024-09-12 10:49:12 +0000100 return p.Leveled.Severity().AtLeast(logging.WARNING)
Serge Bazanski6b7731e2023-03-22 17:58:04 +0100101 }
Serge Bazanski3b098232023-03-16 17:57:02 +0100102 return true
103}
104
Tim Windelschmidt7dac92b2024-12-16 02:51:04 +0100105// Function which performs core, one-way initialization of the node. This means
106// waiting for the network, starting the cluster manager, and then starting all
107// services related to the node's roles.
108func root(ctx context.Context) error {
109 logger := supervisor.Logger(ctx)
110
111 logger.Info("Starting Metropolis node init")
112 logger.Infof("Version: %s", version.Semver(mversion.Version))
113
114 // Linux kernel default is 4096 which is far too low. Raise it to 1M which
115 // is what gVisor suggests.
116 if err := unix.Setrlimit(unix.RLIMIT_NOFILE, &unix.Rlimit{Cur: 1048576, Max: 1048576}); err != nil {
117 logger.Fatalf("Failed to raise rlimits: %v", err)
118 }
119
120 haveTPM := true
121 if err := tpm.Initialize(logger); err != nil {
122 logger.Warningf("Failed to initialize TPM 2.0: %v", err)
123 haveTPM = false
124 }
125
126 metrics.CoreRegistry.MustRegister(dns.MetricsRegistry)
127 networkSvc := network.New(nil, []string{"hosts", "kubernetes"})
128 networkSvc.DHCPVendorClassID = "dev.monogon.metropolis.node.v1"
129 timeSvc := timesvc.New()
130 devmgrSvc := devmgr.New()
131
132 // This function initializes a headless Delve if this is a debug build or
133 // does nothing if it's not
134 initializeDebugger(networkSvc)
135
136 // Prepare local storage.
137 root := &localstorage.Root{}
138 if err := declarative.PlaceFS(root, "/"); err != nil {
139 panic(fmt.Errorf("when placing root FS: %w", err))
140 }
141
142 updateSvc := &update.Service{
143 Logger: supervisor.MustSubLogger(ctx, "update"),
144 }
145 // Make node-wide cluster resolver.
146 res := resolver.New(ctx, resolver.WithLogger(supervisor.MustSubLogger(ctx, "resolver")))
147
148 // Start storage and network - we need this to get anything else done.
149 if err := root.Start(ctx, updateSvc); err != nil {
150 return fmt.Errorf("cannot start root FS: %w", err)
151 }
152
153 localNodeParams, err := getLocalNodeParams(ctx, root)
154 if err != nil {
155 return fmt.Errorf("cannot get local node parameters: %w", err)
156 }
157
158 if localNodeParams.NetworkConfig != nil {
159 networkSvc.StaticConfig = localNodeParams.NetworkConfig
160 if err := root.ESP.Metropolis.NetworkConfiguration.Marshal(localNodeParams.NetworkConfig); err != nil {
161 logger.Errorf("Error writing back network_config from NodeParameters: %v", err)
162 }
163 }
164 if networkSvc.StaticConfig == nil {
165 staticConfig, err := root.ESP.Metropolis.NetworkConfiguration.Unmarshal()
166 if err == nil {
167 networkSvc.StaticConfig = staticConfig
168 } else {
169 logger.Errorf("Unable to load static config, proceeding without it: %v", err)
170 }
171 }
172
173 if err := supervisor.Run(ctx, "devmgr", devmgrSvc.Run); err != nil {
174 return fmt.Errorf("when starting devmgr: %w", err)
175 }
176 if err := supervisor.Run(ctx, "network", networkSvc.Run); err != nil {
177 return fmt.Errorf("when starting network: %w", err)
178 }
179 if err := supervisor.Run(ctx, "time", timeSvc.Run); err != nil {
180 return fmt.Errorf("when starting time: %w", err)
181 }
182 if err := supervisor.Run(ctx, "sysctl", nodeSysctls); err != nil {
183 return fmt.Errorf("when applying sysctls: %w", err)
184 }
185
186 // The kernel does of course not run in this runnable, only the log pipe
187 // runs in it.
188 if err := supervisor.Run(ctx, "kernel", func(ctx context.Context) error {
189 return logtree.KmsgPipe(ctx, supervisor.Logger(ctx))
190 }); err != nil {
191 return fmt.Errorf("when starting kernel log pipe: %w", err)
192 }
193
194 // Start the role service. The role service connects to the curator and runs
195 // all node-specific role code (eg. Kubernetes services).
196 logger.Infof("Starting role service...")
197 rs := roleserve.New(roleserve.Config{
198 StorageRoot: root,
199 Network: networkSvc,
200 Resolver: res,
201 LogTree: supervisor.LogTree(ctx),
202 Update: updateSvc,
203 })
204 if err := supervisor.Run(ctx, "role", rs.Run); err != nil {
205 return fmt.Errorf("failed to start role service: %w", err)
206 }
207
208 if err := runDebugService(ctx, rs, supervisor.LogTree(ctx), root); err != nil {
209 return fmt.Errorf("when starting debug service: %w", err)
210 }
211
212 // Initialize interactive consoles.
213 interactiveConsoles := []string{"/dev/tty0"}
214 for _, c := range interactiveConsoles {
215 console, err := tconsole.New(tconsole.TerminalLinux, c, supervisor.LogTree(ctx), &networkSvc.Status, &rs.LocalRoles, &rs.CuratorConnection)
216 if err != nil {
217 logger.Infof("Failed to initialize interactive console at %s: %v", c, err)
218 } else {
219 logger.Infof("Started interactive console at %s", c)
220 supervisor.Run(ctx, "console-"+c, console.Run)
221 }
222 }
223
224 // Now that we have consoles, set console logging level to 1 (KERNEL_EMERG,
225 // minimum possible). This prevents the TUI console from being polluted by
226 // random printks.
227 opts := sysctl.Options{
228 "kernel.printk": "1",
229 }
230 if err := opts.Apply(); err != nil {
231 logger.Errorf("Failed to configure printk logging: %v", err)
232 }
233
234 nodeParams, err := getNodeParams(ctx, root)
235 if err != nil {
236 return fmt.Errorf("cannot get node parameters: %w", err)
237 }
238
239 // Start cluster manager. This kicks off cluster membership machinery,
240 // which will either start a new cluster, enroll into one or join one.
241 m := cluster.NewManager(root, networkSvc, rs, updateSvc, nodeParams, haveTPM)
242 if err := supervisor.Run(ctx, "cluster-manager", m.Run); err != nil {
243 return fmt.Errorf("when starting cluster manager: %w", err)
244 }
245
246 supervisor.Signal(ctx, supervisor.SignalHealthy)
247 supervisor.Signal(ctx, supervisor.SignalDone)
248 return nil
Serge Bazanski3b098232023-03-16 17:57:02 +0100249}