blob: 4ec73af749114268829e5bb96bca5e1afd91217e [file] [log] [blame]
Serge Bazanskiae004682023-04-18 13:28:48 +02001package manager
2
3import (
4 "context"
Tim Windelschmidt913a03a2023-04-24 15:57:02 +02005 "flag"
Serge Bazanskiae004682023-04-18 13:28:48 +02006 "fmt"
Tim Windelschmidt913a03a2023-04-24 15:57:02 +02007 "time"
Serge Bazanskiae004682023-04-18 13:28:48 +02008
9 "k8s.io/klog/v2"
10
11 "source.monogon.dev/cloud/bmaas/bmdb/model"
12 ecl "source.monogon.dev/cloud/shepherd/equinix/wrapngo"
13)
14
15type RecovererConfig struct {
16 ControlLoopConfig
Tim Windelschmidt913a03a2023-04-24 15:57:02 +020017 RebootWaitSeconds int
Serge Bazanskiae004682023-04-18 13:28:48 +020018}
19
20func (r *RecovererConfig) RegisterFlags() {
21 r.ControlLoopConfig.RegisterFlags("recoverer")
Tim Windelschmidt913a03a2023-04-24 15:57:02 +020022 flag.IntVar(&r.RebootWaitSeconds, "recoverer_reboot_wait_seconds", 30, "How many seconds to sleep to ensure a reboot happend")
Serge Bazanskiae004682023-04-18 13:28:48 +020023}
24
25// The Recoverer reboots machines whose agent has stopped sending heartbeats or
26// has not sent any heartbeats at all.
27type Recoverer struct {
28 RecovererConfig
29
30 cl ecl.Client
31}
32
33func NewRecoverer(cl ecl.Client, rc RecovererConfig) (*Recoverer, error) {
34 if err := rc.ControlLoopConfig.Check(); err != nil {
35 return nil, err
36 }
37 return &Recoverer{
38 RecovererConfig: rc,
39 cl: cl,
40 }, nil
41}
42
43func (r *Recoverer) getMachines(ctx context.Context, q *model.Queries, limit int32) ([]model.MachineProvided, error) {
44 return q.GetMachineForAgentRecovery(ctx, limit)
45}
46
47func (r *Recoverer) processMachine(ctx context.Context, t *task) error {
48 klog.Infof("Starting recovery of device (ID: %s, PID %s)", t.machine.MachineID, t.machine.ProviderID)
49
50 if err := r.cl.RebootDevice(ctx, t.machine.ProviderID); err != nil {
51 return fmt.Errorf("failed to reboot device: %w", err)
52 }
53
Tim Windelschmidt913a03a2023-04-24 15:57:02 +020054 // TODO(issue/215): replace this
55 // This is required as Equinix doesn't reboot the machines synchronously
56 // during the API call.
57 select {
58 case <-time.After(time.Duration(r.RebootWaitSeconds) * time.Second):
59 case <-ctx.Done():
60 return fmt.Errorf("while waiting for reboot: %w", ctx.Err())
61 }
62
Serge Bazanskiae004682023-04-18 13:28:48 +020063 klog.Infof("Removing AgentStarted/AgentHeartbeat (ID: %s, PID: %s)...", t.machine.MachineID, t.machine.ProviderID)
64 err := t.work.Finish(ctx, func(q *model.Queries) error {
65 if err := q.MachineDeleteAgentStarted(ctx, t.machine.MachineID); err != nil {
66 return fmt.Errorf("while deleting AgentStarted: %w", err)
67 }
68 if err := q.MachineDeleteAgentHeartbeat(ctx, t.machine.MachineID); err != nil {
69 return fmt.Errorf("while deleting AgentHeartbeat: %w", err)
70 }
71 return nil
72 })
73 if err != nil {
74 return fmt.Errorf("while deleting AgentStarted/AgentHeartbeat tags: %w", err)
75 }
76 return nil
77}