blob: f4495e8240a00d06d4781d0a799867069d55fba9 [file] [log] [blame]
Tim Windelschmidt6d33a432025-02-04 14:34:25 +01001// Copyright The Monogon Project Authors.
2// SPDX-License-Identifier: Apache-2.0
3
Serge Bazanskicaa12082023-02-16 14:54:04 +01004package manager
5
6import (
Tim Windelschmidt5f5f3302024-02-22 23:50:24 +01007 "bytes"
Serge Bazanskicaa12082023-02-16 14:54:04 +01008 "context"
9 "crypto/ed25519"
Serge Bazanski51987d62023-04-06 16:35:35 +020010 "crypto/x509"
Serge Bazanskicaa12082023-02-16 14:54:04 +010011 "encoding/hex"
Serge Bazanski51987d62023-04-06 16:35:35 +020012 "encoding/pem"
Serge Bazanskicaa12082023-02-16 14:54:04 +010013 "flag"
14 "fmt"
15 "net"
16 "os"
Serge Bazanskicaa12082023-02-16 14:54:04 +010017 "strings"
18 "time"
19
20 "github.com/google/uuid"
Serge Bazanskicaa12082023-02-16 14:54:04 +010021 "google.golang.org/protobuf/proto"
22 "k8s.io/klog/v2"
23
24 apb "source.monogon.dev/cloud/agent/api"
Tim Windelschmidt0e749612023-08-07 17:42:59 +000025
Serge Bazanski00cf57d2023-04-20 11:19:00 +020026 "source.monogon.dev/cloud/bmaas/bmdb"
Serge Bazanskic50f6942023-04-24 18:27:22 +020027 "source.monogon.dev/cloud/bmaas/bmdb/metrics"
Serge Bazanskicaa12082023-02-16 14:54:04 +010028 "source.monogon.dev/cloud/bmaas/bmdb/model"
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +020029 "source.monogon.dev/cloud/shepherd"
Tim Windelschmidt5f5f3302024-02-22 23:50:24 +010030 "source.monogon.dev/go/net/ssh"
Serge Bazanskicaa12082023-02-16 14:54:04 +010031)
32
Serge Bazanski86a714d2023-04-17 15:54:21 +020033// InitializerConfig configures how the Initializer will deploy Agents on
34// machines. In CLI scenarios, this should be populated from flags via
35// RegisterFlags.
36type InitializerConfig struct {
37 ControlLoopConfig
38
Serge Bazanskicaa12082023-02-16 14:54:04 +010039 // Executable is the contents of the agent binary created and run
40 // at the provisioned servers. Must be set.
41 Executable []byte
42
43 // TargetPath is a filesystem destination path used while uploading the BMaaS
44 // agent executable to hosts as part of the initialization process. Must be set.
45 TargetPath string
46
47 // Endpoint is the address Agent will use to contact the BMaaS
48 // infrastructure. Must be set.
49 Endpoint string
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +020050
Serge Bazanski51987d62023-04-06 16:35:35 +020051 // EndpointCACertificate is an optional DER-encoded (but not PEM-armored) X509
52 // certificate used to populate the trusted CA store of the agent. It should be
53 // set to the CA certificate of the endpoint if not using a system-trusted CA
54 // certificate.
55 EndpointCACertificate []byte
Serge Bazanskicaa12082023-02-16 14:54:04 +010056
57 // SSHTimeout is the amount of time set aside for the initializing
58 // SSH session to run its course. Upon timeout, the iteration would be
59 // declared a failure. Must be set.
60 SSHConnectTimeout time.Duration
61 // SSHExecTimeout is the amount of time set aside for executing the agent and
62 // getting its output once the SSH connection has been established. Upon timeout,
63 // the iteration would be declared as failure. Must be set.
64 SSHExecTimeout time.Duration
65}
66
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +020067func (ic *InitializerConfig) RegisterFlags() {
68 ic.ControlLoopConfig.RegisterFlags("initializer")
Serge Bazanski86a714d2023-04-17 15:54:21 +020069
Serge Bazanskicaa12082023-02-16 14:54:04 +010070 flag.Func("agent_executable_path", "Local filesystem path of agent binary to be uploaded", func(val string) error {
71 if val == "" {
72 return nil
73 }
74 data, err := os.ReadFile(val)
75 if err != nil {
Serge Bazanski77b11d32023-04-06 14:43:19 +020076 return fmt.Errorf("could not read: %w", err)
Serge Bazanskicaa12082023-02-16 14:54:04 +010077 }
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +020078 ic.Executable = data
Serge Bazanskicaa12082023-02-16 14:54:04 +010079 return nil
80 })
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +020081 flag.StringVar(&ic.TargetPath, "agent_target_path", "/root/agent", "Filesystem path where the agent will be uploaded to and ran from")
82 flag.StringVar(&ic.Endpoint, "agent_endpoint", "", "Address of BMDB Server to which the agent will attempt to connect")
Serge Bazanski51987d62023-04-06 16:35:35 +020083 flag.Func("agent_endpoint_ca_certificate_path", "Path to PEM X509 CA certificate that the agent endpoint is serving with. If not set, the agent will attempt to use system CA certificates to authenticate the endpoint.", func(val string) error {
84 if val == "" {
85 return nil
86 }
87 data, err := os.ReadFile(val)
88 if err != nil {
89 return fmt.Errorf("could not read: %w", err)
90 }
91 block, _ := pem.Decode(data)
92 if block.Type != "CERTIFICATE" {
93 return fmt.Errorf("not a certificate")
94 }
95 _, err = x509.ParseCertificate(block.Bytes)
96 if err != nil {
97 return fmt.Errorf("invalid certificate: %w", err)
98 }
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +020099 ic.EndpointCACertificate = block.Bytes
Serge Bazanski51987d62023-04-06 16:35:35 +0200100 return nil
101 })
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200102 flag.DurationVar(&ic.SSHConnectTimeout, "agent_ssh_connect_timeout", 2*time.Second, "Timeout for connecting over SSH to a machine")
103 flag.DurationVar(&ic.SSHExecTimeout, "agent_ssh_exec_timeout", 60*time.Second, "Timeout for connecting over SSH to a machine")
104}
105
106func (ic *InitializerConfig) Check() error {
107 if err := ic.ControlLoopConfig.Check(); err != nil {
108 return err
109 }
110
111 if len(ic.Executable) == 0 {
112 return fmt.Errorf("agent executable not configured")
113 }
114 if ic.TargetPath == "" {
115 return fmt.Errorf("agent target path must be set")
116 }
117 if ic.Endpoint == "" {
118 return fmt.Errorf("agent endpoint must be set")
119 }
120 if ic.SSHConnectTimeout == 0 {
121 return fmt.Errorf("agent SSH connection timeout must be set")
122 }
123 if ic.SSHExecTimeout == 0 {
124 return fmt.Errorf("agent SSH execution timeout must be set")
125 }
126
127 return nil
Serge Bazanskicaa12082023-02-16 14:54:04 +0100128}
129
Serge Bazanski86a714d2023-04-17 15:54:21 +0200130// The Initializer starts the agent on machines that aren't yet running it.
Serge Bazanskicaa12082023-02-16 14:54:04 +0100131type Initializer struct {
Serge Bazanski86a714d2023-04-17 15:54:21 +0200132 InitializerConfig
133
Tim Windelschmidt5f5f3302024-02-22 23:50:24 +0100134 sshClient ssh.Client
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200135 p shepherd.Provider
Serge Bazanskicaa12082023-02-16 14:54:04 +0100136}
137
Serge Bazanski86a714d2023-04-17 15:54:21 +0200138// NewInitializer creates an Initializer instance, checking the
139// InitializerConfig, SharedConfig and AgentConfig for errors.
Tim Windelschmidt5f5f3302024-02-22 23:50:24 +0100140func NewInitializer(p shepherd.Provider, sshClient ssh.Client, ic InitializerConfig) (*Initializer, error) {
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200141 if err := ic.Check(); err != nil {
Serge Bazanskicaa12082023-02-16 14:54:04 +0100142 return nil, err
143 }
Serge Bazanski86a714d2023-04-17 15:54:21 +0200144
Serge Bazanskicaa12082023-02-16 14:54:04 +0100145 return &Initializer{
Serge Bazanski86a714d2023-04-17 15:54:21 +0200146 InitializerConfig: ic,
147
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200148 p: p,
149 sshClient: sshClient,
Serge Bazanskicaa12082023-02-16 14:54:04 +0100150 }, nil
151}
152
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200153func (i *Initializer) getProcessInfo() processInfo {
Serge Bazanski00cf57d2023-04-20 11:19:00 +0200154 return processInfo{
155 process: model.ProcessShepherdAgentStart,
156 defaultBackoff: bmdb.Backoff{
157 Initial: 5 * time.Minute,
158 Maximum: 4 * time.Hour,
159 Exponent: 1.2,
160 },
Serge Bazanskic50f6942023-04-24 18:27:22 +0200161 processor: metrics.ProcessorShepherdInitializer,
Serge Bazanski00cf57d2023-04-20 11:19:00 +0200162 }
163}
164
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200165func (i *Initializer) getMachines(ctx context.Context, q *model.Queries, limit int32) ([]model.MachineProvided, error) {
Tim Windelschmidt0e749612023-08-07 17:42:59 +0000166 return q.GetMachinesForAgentStart(ctx, model.GetMachinesForAgentStartParams{
167 Limit: limit,
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200168 Provider: i.p.Type(),
Tim Windelschmidt0e749612023-08-07 17:42:59 +0000169 })
Serge Bazanski9eb903d2023-02-20 14:28:19 +0100170}
171
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200172func (i *Initializer) processMachine(ctx context.Context, t *task) error {
173 machine, err := i.p.GetMachine(ctx, shepherd.ProviderID(t.machine.ProviderID))
Serge Bazanskicaa12082023-02-16 14:54:04 +0100174 if err != nil {
Tim Windelschmidt327cdba2024-05-21 13:51:32 +0200175 return fmt.Errorf("while fetching machine %q: %w", t.machine.ProviderID, err)
Serge Bazanskicaa12082023-02-16 14:54:04 +0100176 }
Serge Bazanskicaa12082023-02-16 14:54:04 +0100177
Serge Bazanski86a714d2023-04-17 15:54:21 +0200178 // Start the agent.
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200179 klog.Infof("Starting agent on machine (ID: %s, PID %s)", t.machine.MachineID, t.machine.ProviderID)
180 apk, err := i.startAgent(ctx, machine, t.machine.MachineID)
Serge Bazanskicaa12082023-02-16 14:54:04 +0100181 if err != nil {
Serge Bazanski86a714d2023-04-17 15:54:21 +0200182 return fmt.Errorf("while starting the agent: %w", err)
183 }
184
185 // Agent startup succeeded. Set the appropriate BMDB tag, and release the
186 // lock.
187 klog.Infof("Setting AgentStarted (ID: %s, PID: %s, Agent public key: %s).", t.machine.MachineID, t.machine.ProviderID, hex.EncodeToString(apk))
188 err = t.work.Finish(ctx, func(q *model.Queries) error {
189 return q.MachineSetAgentStarted(ctx, model.MachineSetAgentStartedParams{
190 MachineID: t.machine.MachineID,
191 AgentStartedAt: time.Now(),
192 AgentPublicKey: apk,
193 })
194 })
195 if err != nil {
196 return fmt.Errorf("while setting AgentStarted tag: %w", err)
Serge Bazanskicaa12082023-02-16 14:54:04 +0100197 }
198 return nil
199}
200
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200201// startAgent runs the agent executable on the target machine m, returning the
Serge Bazanskicaa12082023-02-16 14:54:04 +0100202// agent's public key on success.
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200203func (i *Initializer) startAgent(ctx context.Context, m shepherd.Machine, mid uuid.UUID) ([]byte, error) {
Serge Bazanskicaa12082023-02-16 14:54:04 +0100204 // Provide a bound on execution time in case we get stuck after the SSH
205 // connection is established.
Serge Bazanski86a714d2023-04-17 15:54:21 +0200206 sctx, sctxC := context.WithTimeout(ctx, i.SSHExecTimeout)
Serge Bazanskicaa12082023-02-16 14:54:04 +0100207 defer sctxC()
208
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200209 // Use the machine's IP address
210 ni := m.Addr()
211 if !ni.IsValid() {
212 return nil, fmt.Errorf("machine (machine ID: %s) has no available addresses", mid)
Serge Bazanskicaa12082023-02-16 14:54:04 +0100213 }
Serge Bazanskicaa12082023-02-16 14:54:04 +0100214
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200215 addr := net.JoinHostPort(ni.String(), "22")
216 klog.V(1).Infof("Dialing machine (machine ID: %s, addr: %s).", mid, addr)
217
218 conn, err := i.sshClient.Dial(sctx, addr, i.SSHConnectTimeout)
Serge Bazanskicaa12082023-02-16 14:54:04 +0100219 if err != nil {
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200220 return nil, fmt.Errorf("while dialing the machine: %w", err)
Serge Bazanskicaa12082023-02-16 14:54:04 +0100221 }
222 defer conn.Close()
223
224 // Upload the agent executable.
225
Lorenz Brun5b8b8602023-03-09 17:22:21 +0100226 klog.Infof("Uploading the agent executable (machine ID: %s, addr: %s).", mid, addr)
Tim Windelschmidt5f5f3302024-02-22 23:50:24 +0100227 if err := conn.Upload(sctx, i.TargetPath, bytes.NewReader(i.Executable)); err != nil {
Serge Bazanskicaa12082023-02-16 14:54:04 +0100228 return nil, fmt.Errorf("while uploading agent executable: %w", err)
229 }
Lorenz Brun5b8b8602023-03-09 17:22:21 +0100230 klog.V(1).Infof("Upload successful (machine ID: %s, addr: %s).", mid, addr)
Serge Bazanskicaa12082023-02-16 14:54:04 +0100231
232 // The initialization protobuf message will be sent to the agent on its
233 // standard input.
234 imsg := apb.TakeoverInit{
Lorenz Brun5b8b8602023-03-09 17:22:21 +0100235 MachineId: mid.String(),
Serge Bazanski86a714d2023-04-17 15:54:21 +0200236 BmaasEndpoint: i.Endpoint,
237 CaCertificate: i.EndpointCACertificate,
Serge Bazanskicaa12082023-02-16 14:54:04 +0100238 }
239 imsgb, err := proto.Marshal(&imsg)
240 if err != nil {
241 return nil, fmt.Errorf("while marshaling agent message: %w", err)
242 }
243
244 // Start the agent and wait for the agent's output to arrive.
Serge Bazanski86a714d2023-04-17 15:54:21 +0200245 klog.V(1).Infof("Starting the agent executable at path %q (machine ID: %s).", i.TargetPath, mid)
246 stdout, stderr, err := conn.Execute(ctx, i.TargetPath, imsgb)
Serge Bazanskicaa12082023-02-16 14:54:04 +0100247 stderrStr := strings.TrimSpace(string(stderr))
248 if stderrStr != "" {
249 klog.Warningf("Agent stderr: %q", stderrStr)
250 }
251 if err != nil {
252 return nil, fmt.Errorf("while starting the agent executable: %w", err)
253 }
254
255 var arsp apb.TakeoverResponse
256 if err := proto.Unmarshal(stdout, &arsp); err != nil {
257 return nil, fmt.Errorf("agent reply couldn't be unmarshaled: %w", err)
258 }
Lorenz Brun595dfe92023-02-21 19:13:02 +0100259 var successResp *apb.TakeoverSuccess
260 switch r := arsp.Result.(type) {
261 case *apb.TakeoverResponse_Error:
262 return nil, fmt.Errorf("agent returned error: %v", r.Error.Message)
263 case *apb.TakeoverResponse_Success:
264 successResp = r.Success
265 default:
266 return nil, fmt.Errorf("agent returned unknown result of type %T", arsp.Result)
267 }
268 if !proto.Equal(&imsg, successResp.InitMessage) {
Tim Windelschmidt73e98822024-04-18 23:13:49 +0200269 return nil, fmt.Errorf("agent did not send back the init message")
Serge Bazanskicaa12082023-02-16 14:54:04 +0100270 }
Lorenz Brun595dfe92023-02-21 19:13:02 +0100271 if len(successResp.Key) != ed25519.PublicKeySize {
Tim Windelschmidt73e98822024-04-18 23:13:49 +0200272 return nil, fmt.Errorf("agent key length mismatch")
Serge Bazanskicaa12082023-02-16 14:54:04 +0100273 }
Lorenz Brun5b8b8602023-03-09 17:22:21 +0100274 klog.Infof("Started the agent (machine ID: %s, key: %s).", mid, hex.EncodeToString(successResp.Key))
Lorenz Brun595dfe92023-02-21 19:13:02 +0100275 return successResp.Key, nil
Serge Bazanskicaa12082023-02-16 14:54:04 +0100276}