blob: a339794985fe0300cf9af83b40c7aef60459e5d4 [file] [log] [blame]
Lorenz Brunaadeb792023-03-27 15:53:56 +02001package main
2
3import (
4 "context"
5 "crypto/ed25519"
6 "crypto/rand"
7 "crypto/tls"
8 "crypto/x509"
9 "errors"
10 "fmt"
11 "math/big"
12 "os"
13 "time"
14
15 "github.com/cenkalti/backoff/v4"
16 "golang.org/x/sys/unix"
17 "google.golang.org/grpc"
18 "google.golang.org/grpc/credentials"
19 "google.golang.org/protobuf/proto"
20
21 apb "source.monogon.dev/cloud/agent/api"
22 bpb "source.monogon.dev/cloud/bmaas/server/api"
23 "source.monogon.dev/metropolis/node/core/network"
24 "source.monogon.dev/metropolis/pkg/pki"
25 "source.monogon.dev/metropolis/pkg/supervisor"
26)
27
28// This is similar to rpc.NewEphemeralCredentials, but that only deals with
29// Metropolis-style certificate verification.
30func newEphemeralCert(private ed25519.PrivateKey) (*tls.Certificate, error) {
31 template := x509.Certificate{
32 SerialNumber: big.NewInt(1),
33 NotBefore: time.Now(),
34 NotAfter: pki.UnknownNotAfter,
35
36 KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature,
37 ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth},
38 BasicConstraintsValid: true,
39 }
40 certificateBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, private.Public(), private)
41 if err != nil {
42 return nil, fmt.Errorf("when generating self-signed certificate: %w", err)
43 }
44 return &tls.Certificate{
45 Certificate: [][]byte{certificateBytes},
46 PrivateKey: private,
47 }, nil
48}
49
50// Main runnable for the agent.
51func agentRunnable(ctx context.Context) error {
52 l := supervisor.Logger(ctx)
53 // Mount this late so we don't just crash when not booted with EFI.
54 isEFIBoot := false
55 if err := mkdirAndMount("/sys/firmware/efi/efivars", "efivarfs", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV); err == nil {
56 isEFIBoot = true
57 }
58 agentInitRaw, err := os.ReadFile("/init.pb")
59 if err != nil {
60 return fmt.Errorf("Unable to read spec file from takeover: %w", err)
61 }
62
63 var agentInit apb.AgentInit
64 if err := proto.Unmarshal(agentInitRaw, &agentInit); err != nil {
65 return fmt.Errorf("unable to parse spec file from takeover: %w", err)
66 }
67 l.Info("Monogon BMaaS Agent started")
68 if agentInit.TakeoverInit == nil {
69 return errors.New("AgentInit takeover_init field is unset, this is not allowed")
70 }
71
72 networkSvc := network.New(agentInit.NetworkConfig)
73 networkSvc.DHCPVendorClassID = "dev.monogon.cloud.agent.v1"
74 supervisor.Run(ctx, "networking", networkSvc.Run)
75 l.Info("Started networking")
76
77 ephemeralCert, err := newEphemeralCert(ed25519.PrivateKey(agentInit.PrivateKey))
78 if err != nil {
79 return fmt.Errorf("could not generate ephemeral credentials: %w", err)
80 }
81 var rootCAs *x509.CertPool
82 if len(agentInit.TakeoverInit.CaCertificate) != 0 {
83 caCert, err := x509.ParseCertificate(agentInit.TakeoverInit.CaCertificate)
84 if err != nil {
85 return fmt.Errorf("unable to parse supplied ca_certificate, is it in DER format?")
86 }
87 rootCAs = x509.NewCertPool()
88 rootCAs.AddCert(caCert)
89 }
90
91 conn, err := grpc.Dial(agentInit.TakeoverInit.BmaasEndpoint, grpc.WithTransportCredentials(credentials.NewTLS(&tls.Config{
92 Certificates: []tls.Certificate{*ephemeralCert},
93 RootCAs: rootCAs,
94 })))
95 if err != nil {
96 return fmt.Errorf("error dialing BMaaS gRPC endpoint: %w", err)
97 }
98 c := bpb.NewAgentCallbackClient(conn)
99
100 supervisor.Signal(ctx, supervisor.SignalHealthy)
101
102 report, warnings := gatherHWReport()
103 var warningStrings []string
104 for _, w := range warnings {
105 l.Warningf("Hardware Report Warning: %v", w)
106 warningStrings = append(warningStrings, w.Error())
107 }
108
109 var hwReportSent bool
110 var installationReport *bpb.OSInstallationReport
111 var installationGeneration int64
112 b := backoff.NewExponentialBackOff()
Lorenz Brunb44a5072023-04-18 13:14:33 +0200113 // Never stop retrying, there is nothing else to do
114 b.MaxElapsedTime = 0
Lorenz Brunaadeb792023-03-27 15:53:56 +0200115 // Main heartbeat loop
116 for {
117 req := bpb.AgentHeartbeatRequest{
118 MachineId: agentInit.TakeoverInit.MachineId,
119 }
120 if !hwReportSent {
121 req.HardwareReport = &bpb.AgentHardwareReport{
122 Report: report,
123 Warning: warningStrings,
124 }
125 }
126 if installationReport != nil {
127 req.InstallationReport = installationReport
128 }
Lorenz Brun00343802023-04-20 17:04:32 +0200129 reqCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
130 res, err := c.Heartbeat(reqCtx, &req)
131 cancel()
Lorenz Brunaadeb792023-03-27 15:53:56 +0200132 if err != nil {
133 l.Infof("Heartbeat failed: %v", err)
134 time.Sleep(b.NextBackOff())
135 continue
136 }
137 b.Reset()
138 hwReportSent = true
139 if installationReport != nil {
140 l.Infof("Installation report sent successfully, rebooting")
141 // Close connection and wait 1s to make sure that the RST
142 // can be sent. Important for QEMU/slirp where not doing this
143 // triggers bugs in the connection state management, but also
144 // nice for reducing the number of stale connections in the API
145 // server.
146 conn.Close()
147 time.Sleep(1 * time.Second)
148 unix.Sync()
149 unix.Reboot(unix.LINUX_REBOOT_CMD_RESTART)
150 }
151 if res.InstallationRequest != nil {
152 if res.InstallationRequest.Generation == installationGeneration {
153 // This installation request has already been attempted
154 continue
155 }
156 installationReport = &bpb.OSInstallationReport{
157 Generation: res.InstallationRequest.Generation,
158 }
159 if err := install(res.InstallationRequest, l, isEFIBoot); err != nil {
160 l.Errorf("Installation failed: %v", err)
161 installationReport.Result = &bpb.OSInstallationReport_Error_{
162 Error: &bpb.OSInstallationReport_Error{
163 Error: err.Error(),
164 },
165 }
166 } else {
167 l.Info("Installation succeeded")
168 installationReport.Result = &bpb.OSInstallationReport_Success_{
169 Success: &bpb.OSInstallationReport_Success{},
170 }
171 }
172 } else {
173 time.Sleep(30 * time.Second)
174 }
175 }
176}