blob: 6432586149dcf68670825ed47c7df380b6e834a7 [file] [log] [blame]
Lorenz Brunaadeb792023-03-27 15:53:56 +02001package main
2
3import (
4 "context"
5 "crypto/ed25519"
6 "crypto/rand"
7 "crypto/tls"
8 "crypto/x509"
9 "errors"
10 "fmt"
11 "math/big"
12 "os"
13 "time"
14
15 "github.com/cenkalti/backoff/v4"
16 "golang.org/x/sys/unix"
17 "google.golang.org/grpc"
18 "google.golang.org/grpc/credentials"
19 "google.golang.org/protobuf/proto"
20
21 apb "source.monogon.dev/cloud/agent/api"
22 bpb "source.monogon.dev/cloud/bmaas/server/api"
Lorenz Brun6c454342023-06-01 12:23:38 +020023 "source.monogon.dev/metropolis/node/core/devmgr"
Lorenz Brunaadeb792023-03-27 15:53:56 +020024 "source.monogon.dev/metropolis/node/core/network"
25 "source.monogon.dev/metropolis/pkg/pki"
26 "source.monogon.dev/metropolis/pkg/supervisor"
27)
28
29// This is similar to rpc.NewEphemeralCredentials, but that only deals with
30// Metropolis-style certificate verification.
31func newEphemeralCert(private ed25519.PrivateKey) (*tls.Certificate, error) {
32 template := x509.Certificate{
33 SerialNumber: big.NewInt(1),
34 NotBefore: time.Now(),
35 NotAfter: pki.UnknownNotAfter,
36
37 KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature,
38 ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth},
39 BasicConstraintsValid: true,
40 }
41 certificateBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, private.Public(), private)
42 if err != nil {
43 return nil, fmt.Errorf("when generating self-signed certificate: %w", err)
44 }
45 return &tls.Certificate{
46 Certificate: [][]byte{certificateBytes},
47 PrivateKey: private,
48 }, nil
49}
50
51// Main runnable for the agent.
52func agentRunnable(ctx context.Context) error {
53 l := supervisor.Logger(ctx)
54 // Mount this late so we don't just crash when not booted with EFI.
55 isEFIBoot := false
56 if err := mkdirAndMount("/sys/firmware/efi/efivars", "efivarfs", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV); err == nil {
57 isEFIBoot = true
58 }
59 agentInitRaw, err := os.ReadFile("/init.pb")
60 if err != nil {
61 return fmt.Errorf("Unable to read spec file from takeover: %w", err)
62 }
63
64 var agentInit apb.AgentInit
65 if err := proto.Unmarshal(agentInitRaw, &agentInit); err != nil {
66 return fmt.Errorf("unable to parse spec file from takeover: %w", err)
67 }
68 l.Info("Monogon BMaaS Agent started")
69 if agentInit.TakeoverInit == nil {
70 return errors.New("AgentInit takeover_init field is unset, this is not allowed")
71 }
72
Lorenz Brun6c454342023-06-01 12:23:38 +020073 devmgrSvc := devmgr.New()
74 supervisor.Run(ctx, "devmgr", devmgrSvc.Run)
75
Lorenz Brunaadeb792023-03-27 15:53:56 +020076 networkSvc := network.New(agentInit.NetworkConfig)
77 networkSvc.DHCPVendorClassID = "dev.monogon.cloud.agent.v1"
78 supervisor.Run(ctx, "networking", networkSvc.Run)
79 l.Info("Started networking")
80
81 ephemeralCert, err := newEphemeralCert(ed25519.PrivateKey(agentInit.PrivateKey))
82 if err != nil {
83 return fmt.Errorf("could not generate ephemeral credentials: %w", err)
84 }
85 var rootCAs *x509.CertPool
86 if len(agentInit.TakeoverInit.CaCertificate) != 0 {
87 caCert, err := x509.ParseCertificate(agentInit.TakeoverInit.CaCertificate)
88 if err != nil {
89 return fmt.Errorf("unable to parse supplied ca_certificate, is it in DER format?")
90 }
91 rootCAs = x509.NewCertPool()
92 rootCAs.AddCert(caCert)
93 }
94
95 conn, err := grpc.Dial(agentInit.TakeoverInit.BmaasEndpoint, grpc.WithTransportCredentials(credentials.NewTLS(&tls.Config{
96 Certificates: []tls.Certificate{*ephemeralCert},
97 RootCAs: rootCAs,
98 })))
99 if err != nil {
100 return fmt.Errorf("error dialing BMaaS gRPC endpoint: %w", err)
101 }
102 c := bpb.NewAgentCallbackClient(conn)
103
104 supervisor.Signal(ctx, supervisor.SignalHealthy)
105
Tim Windelschmidte1a4ac52023-06-20 12:17:54 +0200106 assembleHWReport := func() *bpb.AgentHardwareReport {
107 report, warnings := gatherHWReport()
108 var warningStrings []string
109 for _, w := range warnings {
110 l.Warningf("Hardware Report Warning: %v", w)
111 warningStrings = append(warningStrings, w.Error())
112 }
113 return &bpb.AgentHardwareReport{
114 Report: report,
115 Warning: warningStrings,
116 }
Lorenz Brunaadeb792023-03-27 15:53:56 +0200117 }
118
Tim Windelschmidte1a4ac52023-06-20 12:17:54 +0200119 var sentFirstHeartBeat, hwReportSent bool
Lorenz Brunaadeb792023-03-27 15:53:56 +0200120 var installationReport *bpb.OSInstallationReport
121 var installationGeneration int64
122 b := backoff.NewExponentialBackOff()
Lorenz Brunb44a5072023-04-18 13:14:33 +0200123 // Never stop retrying, there is nothing else to do
124 b.MaxElapsedTime = 0
Lorenz Brunaadeb792023-03-27 15:53:56 +0200125 // Main heartbeat loop
126 for {
127 req := bpb.AgentHeartbeatRequest{
128 MachineId: agentInit.TakeoverInit.MachineId,
129 }
Tim Windelschmidte1a4ac52023-06-20 12:17:54 +0200130 if sentFirstHeartBeat && !hwReportSent {
131 req.HardwareReport = assembleHWReport()
Lorenz Brunaadeb792023-03-27 15:53:56 +0200132 }
133 if installationReport != nil {
134 req.InstallationReport = installationReport
135 }
Lorenz Brun00343802023-04-20 17:04:32 +0200136 reqCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
137 res, err := c.Heartbeat(reqCtx, &req)
138 cancel()
Lorenz Brunaadeb792023-03-27 15:53:56 +0200139 if err != nil {
140 l.Infof("Heartbeat failed: %v", err)
141 time.Sleep(b.NextBackOff())
142 continue
143 }
144 b.Reset()
Tim Windelschmidte1a4ac52023-06-20 12:17:54 +0200145 sentFirstHeartBeat = true
146 if req.HardwareReport != nil {
147 hwReportSent = true
148 }
Lorenz Brunaadeb792023-03-27 15:53:56 +0200149 if installationReport != nil {
150 l.Infof("Installation report sent successfully, rebooting")
151 // Close connection and wait 1s to make sure that the RST
152 // can be sent. Important for QEMU/slirp where not doing this
153 // triggers bugs in the connection state management, but also
154 // nice for reducing the number of stale connections in the API
155 // server.
156 conn.Close()
157 time.Sleep(1 * time.Second)
158 unix.Sync()
159 unix.Reboot(unix.LINUX_REBOOT_CMD_RESTART)
160 }
161 if res.InstallationRequest != nil {
162 if res.InstallationRequest.Generation == installationGeneration {
163 // This installation request has already been attempted
164 continue
165 }
166 installationReport = &bpb.OSInstallationReport{
167 Generation: res.InstallationRequest.Generation,
168 }
Tim Windelschmidtfac48742023-04-24 19:04:55 +0200169 if err := install(res.InstallationRequest, agentInit.NetworkConfig, l, isEFIBoot); err != nil {
Lorenz Brunaadeb792023-03-27 15:53:56 +0200170 l.Errorf("Installation failed: %v", err)
171 installationReport.Result = &bpb.OSInstallationReport_Error_{
172 Error: &bpb.OSInstallationReport_Error{
173 Error: err.Error(),
174 },
175 }
176 } else {
177 l.Info("Installation succeeded")
178 installationReport.Result = &bpb.OSInstallationReport_Success_{
179 Success: &bpb.OSInstallationReport_Success{},
180 }
181 }
182 } else {
183 time.Sleep(30 * time.Second)
184 }
185 }
186}