blob: e0bfd81d72956b0ccec9cc2d46cf308477956b32 [file] [log] [blame]
Lorenz Brunaadeb792023-03-27 15:53:56 +02001package main
2
3import (
4 "context"
5 "crypto/ed25519"
6 "crypto/rand"
7 "crypto/tls"
8 "crypto/x509"
9 "errors"
10 "fmt"
11 "math/big"
12 "os"
13 "time"
14
15 "github.com/cenkalti/backoff/v4"
16 "golang.org/x/sys/unix"
17 "google.golang.org/grpc"
18 "google.golang.org/grpc/credentials"
19 "google.golang.org/protobuf/proto"
20
21 apb "source.monogon.dev/cloud/agent/api"
22 bpb "source.monogon.dev/cloud/bmaas/server/api"
Tim Windelschmidt58321122024-09-10 02:26:03 +020023
Lorenz Brun6c454342023-06-01 12:23:38 +020024 "source.monogon.dev/metropolis/node/core/devmgr"
Lorenz Brunaadeb792023-03-27 15:53:56 +020025 "source.monogon.dev/metropolis/node/core/network"
Tim Windelschmidt58321122024-09-10 02:26:03 +020026 "source.monogon.dev/osbase/bringup"
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020027 "source.monogon.dev/osbase/pki"
28 "source.monogon.dev/osbase/supervisor"
Lorenz Brunaadeb792023-03-27 15:53:56 +020029)
30
Tim Windelschmidt58321122024-09-10 02:26:03 +020031func main() {
32 bringup.Runnable(agentRunnable).Run()
33}
34
Lorenz Brunaadeb792023-03-27 15:53:56 +020035// This is similar to rpc.NewEphemeralCredentials, but that only deals with
36// Metropolis-style certificate verification.
37func newEphemeralCert(private ed25519.PrivateKey) (*tls.Certificate, error) {
38 template := x509.Certificate{
39 SerialNumber: big.NewInt(1),
40 NotBefore: time.Now(),
41 NotAfter: pki.UnknownNotAfter,
42
43 KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature,
44 ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth},
45 BasicConstraintsValid: true,
46 }
47 certificateBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, private.Public(), private)
48 if err != nil {
49 return nil, fmt.Errorf("when generating self-signed certificate: %w", err)
50 }
51 return &tls.Certificate{
52 Certificate: [][]byte{certificateBytes},
53 PrivateKey: private,
54 }, nil
55}
56
57// Main runnable for the agent.
58func agentRunnable(ctx context.Context) error {
59 l := supervisor.Logger(ctx)
Lorenz Brunaadeb792023-03-27 15:53:56 +020060 agentInitRaw, err := os.ReadFile("/init.pb")
61 if err != nil {
Tim Windelschmidt73e98822024-04-18 23:13:49 +020062 return fmt.Errorf("unable to read spec file from takeover: %w", err)
Lorenz Brunaadeb792023-03-27 15:53:56 +020063 }
64
65 var agentInit apb.AgentInit
66 if err := proto.Unmarshal(agentInitRaw, &agentInit); err != nil {
67 return fmt.Errorf("unable to parse spec file from takeover: %w", err)
68 }
69 l.Info("Monogon BMaaS Agent started")
70 if agentInit.TakeoverInit == nil {
71 return errors.New("AgentInit takeover_init field is unset, this is not allowed")
72 }
73
Lorenz Brun6c454342023-06-01 12:23:38 +020074 devmgrSvc := devmgr.New()
75 supervisor.Run(ctx, "devmgr", devmgrSvc.Run)
76
Jan Schär91bf1c82024-07-29 17:31:33 +020077 networkSvc := network.New(agentInit.NetworkConfig, nil)
Lorenz Brunaadeb792023-03-27 15:53:56 +020078 networkSvc.DHCPVendorClassID = "dev.monogon.cloud.agent.v1"
79 supervisor.Run(ctx, "networking", networkSvc.Run)
80 l.Info("Started networking")
81
Tim Windelschmidt5e460a92024-04-11 01:33:09 +020082 ephemeralCert, err := newEphemeralCert(agentInit.PrivateKey)
Lorenz Brunaadeb792023-03-27 15:53:56 +020083 if err != nil {
84 return fmt.Errorf("could not generate ephemeral credentials: %w", err)
85 }
86 var rootCAs *x509.CertPool
87 if len(agentInit.TakeoverInit.CaCertificate) != 0 {
88 caCert, err := x509.ParseCertificate(agentInit.TakeoverInit.CaCertificate)
89 if err != nil {
90 return fmt.Errorf("unable to parse supplied ca_certificate, is it in DER format?")
91 }
92 rootCAs = x509.NewCertPool()
93 rootCAs.AddCert(caCert)
94 }
95
96 conn, err := grpc.Dial(agentInit.TakeoverInit.BmaasEndpoint, grpc.WithTransportCredentials(credentials.NewTLS(&tls.Config{
97 Certificates: []tls.Certificate{*ephemeralCert},
98 RootCAs: rootCAs,
99 })))
100 if err != nil {
101 return fmt.Errorf("error dialing BMaaS gRPC endpoint: %w", err)
102 }
103 c := bpb.NewAgentCallbackClient(conn)
104
105 supervisor.Signal(ctx, supervisor.SignalHealthy)
106
Tim Windelschmidte1a4ac52023-06-20 12:17:54 +0200107 assembleHWReport := func() *bpb.AgentHardwareReport {
108 report, warnings := gatherHWReport()
109 var warningStrings []string
110 for _, w := range warnings {
111 l.Warningf("Hardware Report Warning: %v", w)
112 warningStrings = append(warningStrings, w.Error())
113 }
114 return &bpb.AgentHardwareReport{
115 Report: report,
116 Warning: warningStrings,
117 }
Lorenz Brunaadeb792023-03-27 15:53:56 +0200118 }
119
Tim Windelschmidte1a4ac52023-06-20 12:17:54 +0200120 var sentFirstHeartBeat, hwReportSent bool
Lorenz Brunaadeb792023-03-27 15:53:56 +0200121 var installationReport *bpb.OSInstallationReport
122 var installationGeneration int64
123 b := backoff.NewExponentialBackOff()
Lorenz Brunb44a5072023-04-18 13:14:33 +0200124 // Never stop retrying, there is nothing else to do
125 b.MaxElapsedTime = 0
Lorenz Brunaadeb792023-03-27 15:53:56 +0200126 // Main heartbeat loop
127 for {
128 req := bpb.AgentHeartbeatRequest{
129 MachineId: agentInit.TakeoverInit.MachineId,
130 }
Tim Windelschmidte1a4ac52023-06-20 12:17:54 +0200131 if sentFirstHeartBeat && !hwReportSent {
132 req.HardwareReport = assembleHWReport()
Lorenz Brunaadeb792023-03-27 15:53:56 +0200133 }
134 if installationReport != nil {
135 req.InstallationReport = installationReport
136 }
Lorenz Brun00343802023-04-20 17:04:32 +0200137 reqCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
138 res, err := c.Heartbeat(reqCtx, &req)
139 cancel()
Lorenz Brunaadeb792023-03-27 15:53:56 +0200140 if err != nil {
141 l.Infof("Heartbeat failed: %v", err)
142 time.Sleep(b.NextBackOff())
143 continue
144 }
145 b.Reset()
Tim Windelschmidte1a4ac52023-06-20 12:17:54 +0200146 sentFirstHeartBeat = true
147 if req.HardwareReport != nil {
148 hwReportSent = true
149 }
Lorenz Brunaadeb792023-03-27 15:53:56 +0200150 if installationReport != nil {
151 l.Infof("Installation report sent successfully, rebooting")
152 // Close connection and wait 1s to make sure that the RST
153 // can be sent. Important for QEMU/slirp where not doing this
154 // triggers bugs in the connection state management, but also
155 // nice for reducing the number of stale connections in the API
156 // server.
157 conn.Close()
158 time.Sleep(1 * time.Second)
159 unix.Sync()
160 unix.Reboot(unix.LINUX_REBOOT_CMD_RESTART)
161 }
162 if res.InstallationRequest != nil {
163 if res.InstallationRequest.Generation == installationGeneration {
164 // This installation request has already been attempted
165 continue
166 }
167 installationReport = &bpb.OSInstallationReport{
168 Generation: res.InstallationRequest.Generation,
169 }
Tim Windelschmidt58321122024-09-10 02:26:03 +0200170 if err := install(res.InstallationRequest, agentInit.NetworkConfig, l); err != nil {
Lorenz Brunaadeb792023-03-27 15:53:56 +0200171 l.Errorf("Installation failed: %v", err)
172 installationReport.Result = &bpb.OSInstallationReport_Error_{
173 Error: &bpb.OSInstallationReport_Error{
174 Error: err.Error(),
175 },
176 }
177 } else {
178 l.Info("Installation succeeded")
179 installationReport.Result = &bpb.OSInstallationReport_Success_{
180 Success: &bpb.OSInstallationReport_Success{},
181 }
182 }
183 } else {
184 time.Sleep(30 * time.Second)
185 }
186 }
187}