blob: 05b2dafff8830a5d3fe8c140cab889427aa26ad9 [file] [log] [blame]
Tim Windelschmidt6d33a432025-02-04 14:34:25 +01001// Copyright The Monogon Project Authors.
2// SPDX-License-Identifier: Apache-2.0
3
Lorenz Brunaadeb792023-03-27 15:53:56 +02004package main
5
6import (
7 "context"
8 "crypto/ed25519"
9 "crypto/rand"
10 "crypto/tls"
11 "crypto/x509"
12 "errors"
13 "fmt"
14 "math/big"
15 "os"
16 "time"
17
18 "github.com/cenkalti/backoff/v4"
19 "golang.org/x/sys/unix"
20 "google.golang.org/grpc"
21 "google.golang.org/grpc/credentials"
22 "google.golang.org/protobuf/proto"
23
24 apb "source.monogon.dev/cloud/agent/api"
25 bpb "source.monogon.dev/cloud/bmaas/server/api"
Tim Windelschmidt58321122024-09-10 02:26:03 +020026
Lorenz Brun6c454342023-06-01 12:23:38 +020027 "source.monogon.dev/metropolis/node/core/devmgr"
Lorenz Brunaadeb792023-03-27 15:53:56 +020028 "source.monogon.dev/metropolis/node/core/network"
Tim Windelschmidt58321122024-09-10 02:26:03 +020029 "source.monogon.dev/osbase/bringup"
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020030 "source.monogon.dev/osbase/pki"
31 "source.monogon.dev/osbase/supervisor"
Lorenz Brunaadeb792023-03-27 15:53:56 +020032)
33
Tim Windelschmidt58321122024-09-10 02:26:03 +020034func main() {
35 bringup.Runnable(agentRunnable).Run()
36}
37
Lorenz Brunaadeb792023-03-27 15:53:56 +020038// This is similar to rpc.NewEphemeralCredentials, but that only deals with
39// Metropolis-style certificate verification.
40func newEphemeralCert(private ed25519.PrivateKey) (*tls.Certificate, error) {
41 template := x509.Certificate{
42 SerialNumber: big.NewInt(1),
43 NotBefore: time.Now(),
44 NotAfter: pki.UnknownNotAfter,
45
46 KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature,
47 ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth},
48 BasicConstraintsValid: true,
49 }
50 certificateBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, private.Public(), private)
51 if err != nil {
52 return nil, fmt.Errorf("when generating self-signed certificate: %w", err)
53 }
54 return &tls.Certificate{
55 Certificate: [][]byte{certificateBytes},
56 PrivateKey: private,
57 }, nil
58}
59
60// Main runnable for the agent.
61func agentRunnable(ctx context.Context) error {
62 l := supervisor.Logger(ctx)
Lorenz Brunaadeb792023-03-27 15:53:56 +020063 agentInitRaw, err := os.ReadFile("/init.pb")
64 if err != nil {
Tim Windelschmidt73e98822024-04-18 23:13:49 +020065 return fmt.Errorf("unable to read spec file from takeover: %w", err)
Lorenz Brunaadeb792023-03-27 15:53:56 +020066 }
67
68 var agentInit apb.AgentInit
69 if err := proto.Unmarshal(agentInitRaw, &agentInit); err != nil {
70 return fmt.Errorf("unable to parse spec file from takeover: %w", err)
71 }
72 l.Info("Monogon BMaaS Agent started")
73 if agentInit.TakeoverInit == nil {
74 return errors.New("AgentInit takeover_init field is unset, this is not allowed")
75 }
76
Lorenz Brun6c454342023-06-01 12:23:38 +020077 devmgrSvc := devmgr.New()
78 supervisor.Run(ctx, "devmgr", devmgrSvc.Run)
79
Jan Schär91bf1c82024-07-29 17:31:33 +020080 networkSvc := network.New(agentInit.NetworkConfig, nil)
Lorenz Brunaadeb792023-03-27 15:53:56 +020081 networkSvc.DHCPVendorClassID = "dev.monogon.cloud.agent.v1"
82 supervisor.Run(ctx, "networking", networkSvc.Run)
83 l.Info("Started networking")
84
Tim Windelschmidt5e460a92024-04-11 01:33:09 +020085 ephemeralCert, err := newEphemeralCert(agentInit.PrivateKey)
Lorenz Brunaadeb792023-03-27 15:53:56 +020086 if err != nil {
87 return fmt.Errorf("could not generate ephemeral credentials: %w", err)
88 }
89 var rootCAs *x509.CertPool
90 if len(agentInit.TakeoverInit.CaCertificate) != 0 {
91 caCert, err := x509.ParseCertificate(agentInit.TakeoverInit.CaCertificate)
92 if err != nil {
93 return fmt.Errorf("unable to parse supplied ca_certificate, is it in DER format?")
94 }
95 rootCAs = x509.NewCertPool()
96 rootCAs.AddCert(caCert)
97 }
98
Tim Windelschmidt9bd9bd42025-02-14 17:08:52 +010099 conn, err := grpc.NewClient(agentInit.TakeoverInit.BmaasEndpoint, grpc.WithTransportCredentials(credentials.NewTLS(&tls.Config{
Lorenz Brunaadeb792023-03-27 15:53:56 +0200100 Certificates: []tls.Certificate{*ephemeralCert},
101 RootCAs: rootCAs,
102 })))
103 if err != nil {
Tim Windelschmidt9bd9bd42025-02-14 17:08:52 +0100104 return fmt.Errorf("error creating BMaaS gRPC client: %w", err)
Lorenz Brunaadeb792023-03-27 15:53:56 +0200105 }
106 c := bpb.NewAgentCallbackClient(conn)
107
108 supervisor.Signal(ctx, supervisor.SignalHealthy)
109
Tim Windelschmidte1a4ac52023-06-20 12:17:54 +0200110 assembleHWReport := func() *bpb.AgentHardwareReport {
111 report, warnings := gatherHWReport()
112 var warningStrings []string
113 for _, w := range warnings {
114 l.Warningf("Hardware Report Warning: %v", w)
115 warningStrings = append(warningStrings, w.Error())
116 }
117 return &bpb.AgentHardwareReport{
118 Report: report,
119 Warning: warningStrings,
120 }
Lorenz Brunaadeb792023-03-27 15:53:56 +0200121 }
122
Tim Windelschmidte1a4ac52023-06-20 12:17:54 +0200123 var sentFirstHeartBeat, hwReportSent bool
Lorenz Brunaadeb792023-03-27 15:53:56 +0200124 var installationReport *bpb.OSInstallationReport
125 var installationGeneration int64
126 b := backoff.NewExponentialBackOff()
Lorenz Brunb44a5072023-04-18 13:14:33 +0200127 // Never stop retrying, there is nothing else to do
128 b.MaxElapsedTime = 0
Lorenz Brunaadeb792023-03-27 15:53:56 +0200129 // Main heartbeat loop
130 for {
Tim Windelschmidt5ffa6362025-01-28 19:20:06 +0100131 req := bpb.HeartbeatRequest{
Lorenz Brunaadeb792023-03-27 15:53:56 +0200132 MachineId: agentInit.TakeoverInit.MachineId,
133 }
Tim Windelschmidte1a4ac52023-06-20 12:17:54 +0200134 if sentFirstHeartBeat && !hwReportSent {
135 req.HardwareReport = assembleHWReport()
Lorenz Brunaadeb792023-03-27 15:53:56 +0200136 }
137 if installationReport != nil {
138 req.InstallationReport = installationReport
139 }
Lorenz Brun00343802023-04-20 17:04:32 +0200140 reqCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
141 res, err := c.Heartbeat(reqCtx, &req)
142 cancel()
Lorenz Brunaadeb792023-03-27 15:53:56 +0200143 if err != nil {
144 l.Infof("Heartbeat failed: %v", err)
145 time.Sleep(b.NextBackOff())
146 continue
147 }
148 b.Reset()
Tim Windelschmidte1a4ac52023-06-20 12:17:54 +0200149 sentFirstHeartBeat = true
150 if req.HardwareReport != nil {
151 hwReportSent = true
152 }
Lorenz Brunaadeb792023-03-27 15:53:56 +0200153 if installationReport != nil {
154 l.Infof("Installation report sent successfully, rebooting")
155 // Close connection and wait 1s to make sure that the RST
156 // can be sent. Important for QEMU/slirp where not doing this
157 // triggers bugs in the connection state management, but also
158 // nice for reducing the number of stale connections in the API
159 // server.
160 conn.Close()
161 time.Sleep(1 * time.Second)
162 unix.Sync()
163 unix.Reboot(unix.LINUX_REBOOT_CMD_RESTART)
164 }
165 if res.InstallationRequest != nil {
166 if res.InstallationRequest.Generation == installationGeneration {
167 // This installation request has already been attempted
168 continue
169 }
170 installationReport = &bpb.OSInstallationReport{
171 Generation: res.InstallationRequest.Generation,
172 }
Jan Schär4cc3d4d2025-04-14 11:46:47 +0000173 installCtx, cancel := context.WithTimeout(ctx, 15*time.Minute)
174 if err := install(installCtx, res.InstallationRequest, agentInit.NetworkConfig); err != nil {
Lorenz Brunaadeb792023-03-27 15:53:56 +0200175 l.Errorf("Installation failed: %v", err)
176 installationReport.Result = &bpb.OSInstallationReport_Error_{
177 Error: &bpb.OSInstallationReport_Error{
178 Error: err.Error(),
179 },
180 }
181 } else {
182 l.Info("Installation succeeded")
183 installationReport.Result = &bpb.OSInstallationReport_Success_{
184 Success: &bpb.OSInstallationReport_Success{},
185 }
186 }
Jan Schär4cc3d4d2025-04-14 11:46:47 +0000187 cancel()
Lorenz Brunaadeb792023-03-27 15:53:56 +0200188 } else {
189 time.Sleep(30 * time.Second)
190 }
191 }
192}