blob: 6432586149dcf68670825ed47c7df380b6e834a7 [file] [log] [blame] [edit]
package main
import (
"context"
"crypto/ed25519"
"crypto/rand"
"crypto/tls"
"crypto/x509"
"errors"
"fmt"
"math/big"
"os"
"time"
"github.com/cenkalti/backoff/v4"
"golang.org/x/sys/unix"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials"
"google.golang.org/protobuf/proto"
apb "source.monogon.dev/cloud/agent/api"
bpb "source.monogon.dev/cloud/bmaas/server/api"
"source.monogon.dev/metropolis/node/core/devmgr"
"source.monogon.dev/metropolis/node/core/network"
"source.monogon.dev/metropolis/pkg/pki"
"source.monogon.dev/metropolis/pkg/supervisor"
)
// newEphemeralCert builds a self-signed TLS client certificate for the given
// Ed25519 private key.
//
// This is similar to rpc.NewEphemeralCredentials, but that only deals with
// Metropolis-style certificate verification.
//
// The certificate is valid from the current time until pki.UnknownNotAfter and
// carries the client authentication extended key usage. An error is returned
// if the key does not have the size of an Ed25519 private key or if the
// certificate cannot be created.
func newEphemeralCert(private ed25519.PrivateKey) (*tls.Certificate, error) {
	// The key is typically supplied as raw bytes by the caller. The
	// crypto/ed25519 operations used below panic on keys of the wrong length,
	// so reject such keys with a proper error instead.
	if len(private) != ed25519.PrivateKeySize {
		return nil, fmt.Errorf("invalid Ed25519 private key size: got %d bytes, want %d", len(private), ed25519.PrivateKeySize)
	}

	template := x509.Certificate{
		SerialNumber:          big.NewInt(1),
		NotBefore:             time.Now(),
		NotAfter:              pki.UnknownNotAfter,
		KeyUsage:              x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature,
		ExtKeyUsage:           []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth},
		BasicConstraintsValid: true,
	}

	// The template doubles as its own parent, which makes the certificate
	// self-signed.
	certificateBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, private.Public(), private)
	if err != nil {
		return nil, fmt.Errorf("when generating self-signed certificate: %w", err)
	}

	return &tls.Certificate{
		Certificate: [][]byte{certificateBytes},
		PrivateKey:  private,
	}, nil
}
// agentRunnable is the main runnable for the agent.
//
// It reads and parses the AgentInit message from /init.pb, starts the devmgr
// and networking runnables, dials the BMaaS endpoint with an ephemeral client
// certificate and then sends heartbeats in a loop. A hardware report is
// attached to heartbeats after the first successful one until a heartbeat
// carrying it succeeds. When a heartbeat response contains an installation
// request, the installation is performed, its result is reported with the next
// heartbeat and the machine is rebooted once that report has been delivered.
//
// An error is returned if initialization fails, or if ctx is cancelled while
// the loop is waiting between heartbeats.
func agentRunnable(ctx context.Context) error {
	l := supervisor.Logger(ctx)

	// Mount this late so we don't just crash when not booted with EFI.
	// A mount failure is deliberately not fatal, it only marks the boot as
	// non-EFI.
	isEFIBoot := false
	if err := mkdirAndMount("/sys/firmware/efi/efivars", "efivarfs", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV); err == nil {
		isEFIBoot = true
	}

	agentInitRaw, err := os.ReadFile("/init.pb")
	if err != nil {
		return fmt.Errorf("Unable to read spec file from takeover: %w", err)
	}
	var agentInit apb.AgentInit
	if err := proto.Unmarshal(agentInitRaw, &agentInit); err != nil {
		return fmt.Errorf("unable to parse spec file from takeover: %w", err)
	}
	l.Info("Monogon BMaaS Agent started")
	if agentInit.TakeoverInit == nil {
		return errors.New("AgentInit takeover_init field is unset, this is not allowed")
	}

	devmgrSvc := devmgr.New()
	supervisor.Run(ctx, "devmgr", devmgrSvc.Run)

	networkSvc := network.New(agentInit.NetworkConfig)
	networkSvc.DHCPVendorClassID = "dev.monogon.cloud.agent.v1"
	supervisor.Run(ctx, "networking", networkSvc.Run)
	l.Info("Started networking")

	ephemeralCert, err := newEphemeralCert(ed25519.PrivateKey(agentInit.PrivateKey))
	if err != nil {
		return fmt.Errorf("could not generate ephemeral credentials: %w", err)
	}

	// rootCAs stays nil unless a CA certificate was supplied in the init
	// message.
	var rootCAs *x509.CertPool
	if len(agentInit.TakeoverInit.CaCertificate) != 0 {
		caCert, err := x509.ParseCertificate(agentInit.TakeoverInit.CaCertificate)
		if err != nil {
			// Keep the underlying parse error so the cause is not lost.
			return fmt.Errorf("unable to parse supplied ca_certificate, is it in DER format?: %w", err)
		}
		rootCAs = x509.NewCertPool()
		rootCAs.AddCert(caCert)
	}

	conn, err := grpc.Dial(agentInit.TakeoverInit.BmaasEndpoint, grpc.WithTransportCredentials(credentials.NewTLS(&tls.Config{
		Certificates: []tls.Certificate{*ephemeralCert},
		RootCAs:      rootCAs,
	})))
	if err != nil {
		return fmt.Errorf("error dialing BMaaS gRPC endpoint: %w", err)
	}
	// Release the connection when this runnable returns.
	defer conn.Close()
	c := bpb.NewAgentCallbackClient(conn)

	supervisor.Signal(ctx, supervisor.SignalHealthy)

	// assembleHWReport gathers the hardware report, logs every warning
	// produced while doing so and returns both in a single message.
	assembleHWReport := func() *bpb.AgentHardwareReport {
		report, warnings := gatherHWReport()
		var warningStrings []string
		for _, w := range warnings {
			l.Warningf("Hardware Report Warning: %v", w)
			warningStrings = append(warningStrings, w.Error())
		}
		return &bpb.AgentHardwareReport{
			Report:  report,
			Warning: warningStrings,
		}
	}

	// sleep waits for d to elapse. It returns early with the context's error
	// if ctx is cancelled first, so the loop below can exit.
	sleep := func(d time.Duration) error {
		t := time.NewTimer(d)
		defer t.Stop()
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-t.C:
			return nil
		}
	}

	var sentFirstHeartBeat, hwReportSent bool
	var installationReport *bpb.OSInstallationReport
	var installationGeneration int64
	b := backoff.NewExponentialBackOff()
	// Never stop retrying, there is nothing else to do
	b.MaxElapsedTime = 0

	// Main heartbeat loop
	for {
		req := bpb.AgentHeartbeatRequest{
			MachineId: agentInit.TakeoverInit.MachineId,
		}
		// The hardware report is only attached once a first heartbeat has
		// succeeded, and only until it has been delivered.
		if sentFirstHeartBeat && !hwReportSent {
			req.HardwareReport = assembleHWReport()
		}
		if installationReport != nil {
			req.InstallationReport = installationReport
		}

		reqCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
		res, err := c.Heartbeat(reqCtx, &req)
		cancel()
		if err != nil {
			l.Infof("Heartbeat failed: %v", err)
			if err := sleep(b.NextBackOff()); err != nil {
				return err
			}
			continue
		}
		b.Reset()
		sentFirstHeartBeat = true
		if req.HardwareReport != nil {
			hwReportSent = true
		}

		if installationReport != nil {
			l.Infof("Installation report sent successfully, rebooting")
			// Close connection and wait 1s to make sure that the RST
			// can be sent. Important for QEMU/slirp where not doing this
			// triggers bugs in the connection state management, but also
			// nice for reducing the number of stale connections in the API
			// server.
			conn.Close()
			time.Sleep(1 * time.Second)
			unix.Sync()
			if err := unix.Reboot(unix.LINUX_REBOOT_CMD_RESTART); err != nil {
				l.Errorf("Reboot failed: %v", err)
			}
		}

		if res.InstallationRequest == nil || res.InstallationRequest.Generation == installationGeneration {
			// Either there is nothing to install or this installation
			// request has already been attempted. Wait for the regular
			// heartbeat interval instead of immediately retrying.
			if err := sleep(30 * time.Second); err != nil {
				return err
			}
			continue
		}

		installationReport = &bpb.OSInstallationReport{
			Generation: res.InstallationRequest.Generation,
		}
		// Remember the generation so that the check above recognizes this
		// request if it is seen again.
		installationGeneration = res.InstallationRequest.Generation
		if err := install(res.InstallationRequest, agentInit.NetworkConfig, l, isEFIBoot); err != nil {
			l.Errorf("Installation failed: %v", err)
			installationReport.Result = &bpb.OSInstallationReport_Error_{
				Error: &bpb.OSInstallationReport_Error{
					Error: err.Error(),
				},
			}
		} else {
			l.Info("Installation succeeded")
			installationReport.Result = &bpb.OSInstallationReport_Success_{
				Success: &bpb.OSInstallationReport_Success{},
			}
		}
		// No wait here: the installation report is sent with the next
		// heartbeat right away.
	}
}