blob: 6d9a43a474ba53c833281a11986ec0a8830ee897 [file] [log] [blame]
Tim Windelschmidt6d33a432025-02-04 14:34:25 +01001// Copyright The Monogon Project Authors.
2// SPDX-License-Identifier: Apache-2.0
3
Lorenz Brunaadeb792023-03-27 15:53:56 +02004package main
5
6import (
7 "context"
8 "crypto/ed25519"
9 "crypto/rand"
10 "crypto/tls"
11 "crypto/x509"
12 "errors"
13 "fmt"
14 "math/big"
15 "os"
16 "time"
17
18 "github.com/cenkalti/backoff/v4"
19 "golang.org/x/sys/unix"
20 "google.golang.org/grpc"
21 "google.golang.org/grpc/credentials"
22 "google.golang.org/protobuf/proto"
23
24 apb "source.monogon.dev/cloud/agent/api"
Tim Windelschmidt58321122024-09-10 02:26:03 +020025
Lorenz Brun6c454342023-06-01 12:23:38 +020026 "source.monogon.dev/metropolis/node/core/devmgr"
Lorenz Brunaadeb792023-03-27 15:53:56 +020027 "source.monogon.dev/metropolis/node/core/network"
Tim Windelschmidt58321122024-09-10 02:26:03 +020028 "source.monogon.dev/osbase/bringup"
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020029 "source.monogon.dev/osbase/pki"
30 "source.monogon.dev/osbase/supervisor"
Lorenz Brunaadeb792023-03-27 15:53:56 +020031)
32
Tim Windelschmidt58321122024-09-10 02:26:03 +020033func main() {
34 bringup.Runnable(agentRunnable).Run()
35}
36
Lorenz Brunaadeb792023-03-27 15:53:56 +020037// This is similar to rpc.NewEphemeralCredentials, but that only deals with
38// Metropolis-style certificate verification.
39func newEphemeralCert(private ed25519.PrivateKey) (*tls.Certificate, error) {
40 template := x509.Certificate{
41 SerialNumber: big.NewInt(1),
42 NotBefore: time.Now(),
43 NotAfter: pki.UnknownNotAfter,
44
45 KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature,
46 ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth},
47 BasicConstraintsValid: true,
48 }
49 certificateBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, private.Public(), private)
50 if err != nil {
51 return nil, fmt.Errorf("when generating self-signed certificate: %w", err)
52 }
53 return &tls.Certificate{
54 Certificate: [][]byte{certificateBytes},
55 PrivateKey: private,
56 }, nil
57}
58
59// Main runnable for the agent.
60func agentRunnable(ctx context.Context) error {
61 l := supervisor.Logger(ctx)
Lorenz Brunaadeb792023-03-27 15:53:56 +020062 agentInitRaw, err := os.ReadFile("/init.pb")
63 if err != nil {
Tim Windelschmidt73e98822024-04-18 23:13:49 +020064 return fmt.Errorf("unable to read spec file from takeover: %w", err)
Lorenz Brunaadeb792023-03-27 15:53:56 +020065 }
66
67 var agentInit apb.AgentInit
68 if err := proto.Unmarshal(agentInitRaw, &agentInit); err != nil {
69 return fmt.Errorf("unable to parse spec file from takeover: %w", err)
70 }
71 l.Info("Monogon BMaaS Agent started")
72 if agentInit.TakeoverInit == nil {
73 return errors.New("AgentInit takeover_init field is unset, this is not allowed")
74 }
75
Lorenz Brun6c454342023-06-01 12:23:38 +020076 devmgrSvc := devmgr.New()
77 supervisor.Run(ctx, "devmgr", devmgrSvc.Run)
78
Lorenz Brunc607bf62025-07-22 20:25:26 +020079 networkSvc := network.New(agentInit.NetworkConfig, nil, nil)
Lorenz Brunaadeb792023-03-27 15:53:56 +020080 networkSvc.DHCPVendorClassID = "dev.monogon.cloud.agent.v1"
81 supervisor.Run(ctx, "networking", networkSvc.Run)
82 l.Info("Started networking")
83
Tim Windelschmidt5e460a92024-04-11 01:33:09 +020084 ephemeralCert, err := newEphemeralCert(agentInit.PrivateKey)
Lorenz Brunaadeb792023-03-27 15:53:56 +020085 if err != nil {
86 return fmt.Errorf("could not generate ephemeral credentials: %w", err)
87 }
88 var rootCAs *x509.CertPool
89 if len(agentInit.TakeoverInit.CaCertificate) != 0 {
90 caCert, err := x509.ParseCertificate(agentInit.TakeoverInit.CaCertificate)
91 if err != nil {
92 return fmt.Errorf("unable to parse supplied ca_certificate, is it in DER format?")
93 }
94 rootCAs = x509.NewCertPool()
95 rootCAs.AddCert(caCert)
96 }
97
Tim Windelschmidt9bd9bd42025-02-14 17:08:52 +010098 conn, err := grpc.NewClient(agentInit.TakeoverInit.BmaasEndpoint, grpc.WithTransportCredentials(credentials.NewTLS(&tls.Config{
Lorenz Brunaadeb792023-03-27 15:53:56 +020099 Certificates: []tls.Certificate{*ephemeralCert},
100 RootCAs: rootCAs,
101 })))
102 if err != nil {
Tim Windelschmidt9bd9bd42025-02-14 17:08:52 +0100103 return fmt.Errorf("error creating BMaaS gRPC client: %w", err)
Lorenz Brunaadeb792023-03-27 15:53:56 +0200104 }
Tim Windelschmidtb21bdf92025-05-28 18:37:35 +0200105 c := apb.NewAgentCallbackClient(conn)
Lorenz Brunaadeb792023-03-27 15:53:56 +0200106
107 supervisor.Signal(ctx, supervisor.SignalHealthy)
108
Tim Windelschmidtb21bdf92025-05-28 18:37:35 +0200109 assembleHWReport := func() *apb.AgentHardwareReport {
Tim Windelschmidte1a4ac52023-06-20 12:17:54 +0200110 report, warnings := gatherHWReport()
111 var warningStrings []string
112 for _, w := range warnings {
113 l.Warningf("Hardware Report Warning: %v", w)
114 warningStrings = append(warningStrings, w.Error())
115 }
Tim Windelschmidtb21bdf92025-05-28 18:37:35 +0200116 return &apb.AgentHardwareReport{
Tim Windelschmidte1a4ac52023-06-20 12:17:54 +0200117 Report: report,
118 Warning: warningStrings,
119 }
Lorenz Brunaadeb792023-03-27 15:53:56 +0200120 }
121
Tim Windelschmidte1a4ac52023-06-20 12:17:54 +0200122 var sentFirstHeartBeat, hwReportSent bool
Tim Windelschmidtb21bdf92025-05-28 18:37:35 +0200123 var installationReport *apb.OSInstallationReport
Lorenz Brunaadeb792023-03-27 15:53:56 +0200124 var installationGeneration int64
125 b := backoff.NewExponentialBackOff()
Lorenz Brunb44a5072023-04-18 13:14:33 +0200126 // Never stop retrying, there is nothing else to do
127 b.MaxElapsedTime = 0
Lorenz Brunaadeb792023-03-27 15:53:56 +0200128 // Main heartbeat loop
129 for {
Tim Windelschmidtb21bdf92025-05-28 18:37:35 +0200130 req := apb.HeartbeatRequest{
Lorenz Brunaadeb792023-03-27 15:53:56 +0200131 MachineId: agentInit.TakeoverInit.MachineId,
132 }
Tim Windelschmidte1a4ac52023-06-20 12:17:54 +0200133 if sentFirstHeartBeat && !hwReportSent {
134 req.HardwareReport = assembleHWReport()
Lorenz Brunaadeb792023-03-27 15:53:56 +0200135 }
136 if installationReport != nil {
137 req.InstallationReport = installationReport
138 }
Lorenz Brun00343802023-04-20 17:04:32 +0200139 reqCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
140 res, err := c.Heartbeat(reqCtx, &req)
141 cancel()
Lorenz Brunaadeb792023-03-27 15:53:56 +0200142 if err != nil {
143 l.Infof("Heartbeat failed: %v", err)
144 time.Sleep(b.NextBackOff())
145 continue
146 }
147 b.Reset()
Tim Windelschmidte1a4ac52023-06-20 12:17:54 +0200148 sentFirstHeartBeat = true
149 if req.HardwareReport != nil {
150 hwReportSent = true
151 }
Lorenz Brunaadeb792023-03-27 15:53:56 +0200152 if installationReport != nil {
153 l.Infof("Installation report sent successfully, rebooting")
154 // Close connection and wait 1s to make sure that the RST
155 // can be sent. Important for QEMU/slirp where not doing this
156 // triggers bugs in the connection state management, but also
157 // nice for reducing the number of stale connections in the API
158 // server.
159 conn.Close()
160 time.Sleep(1 * time.Second)
161 unix.Sync()
162 unix.Reboot(unix.LINUX_REBOOT_CMD_RESTART)
163 }
164 if res.InstallationRequest != nil {
165 if res.InstallationRequest.Generation == installationGeneration {
166 // This installation request has already been attempted
167 continue
168 }
Tim Windelschmidtb21bdf92025-05-28 18:37:35 +0200169 installationReport = &apb.OSInstallationReport{
Lorenz Brunaadeb792023-03-27 15:53:56 +0200170 Generation: res.InstallationRequest.Generation,
171 }
Jan Schär4cc3d4d2025-04-14 11:46:47 +0000172 installCtx, cancel := context.WithTimeout(ctx, 15*time.Minute)
173 if err := install(installCtx, res.InstallationRequest, agentInit.NetworkConfig); err != nil {
Lorenz Brunaadeb792023-03-27 15:53:56 +0200174 l.Errorf("Installation failed: %v", err)
Tim Windelschmidtb21bdf92025-05-28 18:37:35 +0200175 installationReport.Result = &apb.OSInstallationReport_Error_{
176 Error: &apb.OSInstallationReport_Error{
Lorenz Brunaadeb792023-03-27 15:53:56 +0200177 Error: err.Error(),
178 },
179 }
180 } else {
181 l.Info("Installation succeeded")
Tim Windelschmidtb21bdf92025-05-28 18:37:35 +0200182 installationReport.Result = &apb.OSInstallationReport_Success_{
183 Success: &apb.OSInstallationReport_Success{},
Lorenz Brunaadeb792023-03-27 15:53:56 +0200184 }
185 }
Jan Schär4cc3d4d2025-04-14 11:46:47 +0000186 cancel()
Lorenz Brunaadeb792023-03-27 15:53:56 +0200187 } else {
188 time.Sleep(30 * time.Second)
189 }
190 }
191}