blob: 75888ce9e49e89bd36f675647db82cd945a4b438 [file] [log] [blame]
Lorenz Brunaadeb792023-03-27 15:53:56 +02001package main
2
3import (
4 "context"
5 "crypto/ed25519"
6 "crypto/rand"
7 "crypto/tls"
8 "crypto/x509"
9 "errors"
10 "fmt"
11 "math/big"
12 "os"
13 "time"
14
15 "github.com/cenkalti/backoff/v4"
16 "golang.org/x/sys/unix"
17 "google.golang.org/grpc"
18 "google.golang.org/grpc/credentials"
19 "google.golang.org/protobuf/proto"
20
21 apb "source.monogon.dev/cloud/agent/api"
22 bpb "source.monogon.dev/cloud/bmaas/server/api"
Lorenz Brun6c454342023-06-01 12:23:38 +020023 "source.monogon.dev/metropolis/node/core/devmgr"
Lorenz Brunaadeb792023-03-27 15:53:56 +020024 "source.monogon.dev/metropolis/node/core/network"
25 "source.monogon.dev/metropolis/pkg/pki"
26 "source.monogon.dev/metropolis/pkg/supervisor"
27)
28
29// This is similar to rpc.NewEphemeralCredentials, but that only deals with
30// Metropolis-style certificate verification.
31func newEphemeralCert(private ed25519.PrivateKey) (*tls.Certificate, error) {
32 template := x509.Certificate{
33 SerialNumber: big.NewInt(1),
34 NotBefore: time.Now(),
35 NotAfter: pki.UnknownNotAfter,
36
37 KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature,
38 ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth},
39 BasicConstraintsValid: true,
40 }
41 certificateBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, private.Public(), private)
42 if err != nil {
43 return nil, fmt.Errorf("when generating self-signed certificate: %w", err)
44 }
45 return &tls.Certificate{
46 Certificate: [][]byte{certificateBytes},
47 PrivateKey: private,
48 }, nil
49}
50
51// Main runnable for the agent.
52func agentRunnable(ctx context.Context) error {
53 l := supervisor.Logger(ctx)
54 // Mount this late so we don't just crash when not booted with EFI.
55 isEFIBoot := false
56 if err := mkdirAndMount("/sys/firmware/efi/efivars", "efivarfs", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV); err == nil {
57 isEFIBoot = true
58 }
59 agentInitRaw, err := os.ReadFile("/init.pb")
60 if err != nil {
61 return fmt.Errorf("Unable to read spec file from takeover: %w", err)
62 }
63
64 var agentInit apb.AgentInit
65 if err := proto.Unmarshal(agentInitRaw, &agentInit); err != nil {
66 return fmt.Errorf("unable to parse spec file from takeover: %w", err)
67 }
68 l.Info("Monogon BMaaS Agent started")
69 if agentInit.TakeoverInit == nil {
70 return errors.New("AgentInit takeover_init field is unset, this is not allowed")
71 }
72
Lorenz Brun6c454342023-06-01 12:23:38 +020073 devmgrSvc := devmgr.New()
74 supervisor.Run(ctx, "devmgr", devmgrSvc.Run)
75
Lorenz Brunaadeb792023-03-27 15:53:56 +020076 networkSvc := network.New(agentInit.NetworkConfig)
77 networkSvc.DHCPVendorClassID = "dev.monogon.cloud.agent.v1"
78 supervisor.Run(ctx, "networking", networkSvc.Run)
79 l.Info("Started networking")
80
81 ephemeralCert, err := newEphemeralCert(ed25519.PrivateKey(agentInit.PrivateKey))
82 if err != nil {
83 return fmt.Errorf("could not generate ephemeral credentials: %w", err)
84 }
85 var rootCAs *x509.CertPool
86 if len(agentInit.TakeoverInit.CaCertificate) != 0 {
87 caCert, err := x509.ParseCertificate(agentInit.TakeoverInit.CaCertificate)
88 if err != nil {
89 return fmt.Errorf("unable to parse supplied ca_certificate, is it in DER format?")
90 }
91 rootCAs = x509.NewCertPool()
92 rootCAs.AddCert(caCert)
93 }
94
95 conn, err := grpc.Dial(agentInit.TakeoverInit.BmaasEndpoint, grpc.WithTransportCredentials(credentials.NewTLS(&tls.Config{
96 Certificates: []tls.Certificate{*ephemeralCert},
97 RootCAs: rootCAs,
98 })))
99 if err != nil {
100 return fmt.Errorf("error dialing BMaaS gRPC endpoint: %w", err)
101 }
102 c := bpb.NewAgentCallbackClient(conn)
103
104 supervisor.Signal(ctx, supervisor.SignalHealthy)
105
106 report, warnings := gatherHWReport()
107 var warningStrings []string
108 for _, w := range warnings {
109 l.Warningf("Hardware Report Warning: %v", w)
110 warningStrings = append(warningStrings, w.Error())
111 }
112
113 var hwReportSent bool
114 var installationReport *bpb.OSInstallationReport
115 var installationGeneration int64
116 b := backoff.NewExponentialBackOff()
Lorenz Brunb44a5072023-04-18 13:14:33 +0200117 // Never stop retrying, there is nothing else to do
118 b.MaxElapsedTime = 0
Lorenz Brunaadeb792023-03-27 15:53:56 +0200119 // Main heartbeat loop
120 for {
121 req := bpb.AgentHeartbeatRequest{
122 MachineId: agentInit.TakeoverInit.MachineId,
123 }
124 if !hwReportSent {
125 req.HardwareReport = &bpb.AgentHardwareReport{
126 Report: report,
127 Warning: warningStrings,
128 }
129 }
130 if installationReport != nil {
131 req.InstallationReport = installationReport
132 }
Lorenz Brun00343802023-04-20 17:04:32 +0200133 reqCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
134 res, err := c.Heartbeat(reqCtx, &req)
135 cancel()
Lorenz Brunaadeb792023-03-27 15:53:56 +0200136 if err != nil {
137 l.Infof("Heartbeat failed: %v", err)
138 time.Sleep(b.NextBackOff())
139 continue
140 }
141 b.Reset()
142 hwReportSent = true
143 if installationReport != nil {
144 l.Infof("Installation report sent successfully, rebooting")
145 // Close connection and wait 1s to make sure that the RST
146 // can be sent. Important for QEMU/slirp where not doing this
147 // triggers bugs in the connection state management, but also
148 // nice for reducing the number of stale connections in the API
149 // server.
150 conn.Close()
151 time.Sleep(1 * time.Second)
152 unix.Sync()
153 unix.Reboot(unix.LINUX_REBOOT_CMD_RESTART)
154 }
155 if res.InstallationRequest != nil {
156 if res.InstallationRequest.Generation == installationGeneration {
157 // This installation request has already been attempted
158 continue
159 }
160 installationReport = &bpb.OSInstallationReport{
161 Generation: res.InstallationRequest.Generation,
162 }
Tim Windelschmidtfac48742023-04-24 19:04:55 +0200163 if err := install(res.InstallationRequest, agentInit.NetworkConfig, l, isEFIBoot); err != nil {
Lorenz Brunaadeb792023-03-27 15:53:56 +0200164 l.Errorf("Installation failed: %v", err)
165 installationReport.Result = &bpb.OSInstallationReport_Error_{
166 Error: &bpb.OSInstallationReport_Error{
167 Error: err.Error(),
168 },
169 }
170 } else {
171 l.Info("Installation succeeded")
172 installationReport.Result = &bpb.OSInstallationReport_Success_{
173 Success: &bpb.OSInstallationReport_Success{},
174 }
175 }
176 } else {
177 time.Sleep(30 * time.Second)
178 }
179 }
180}