package manager

import (
	"context"
	"flag"
	"fmt"
	"time"

	"k8s.io/klog/v2"

	"source.monogon.dev/cloud/bmaas/bmdb"
	"source.monogon.dev/cloud/bmaas/bmdb/metrics"
	"source.monogon.dev/cloud/bmaas/bmdb/model"
	ecl "source.monogon.dev/cloud/shepherd/equinix/wrapngo"
)

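// RecovererConfig configures the Recoverer control loop, extending the common
// ControlLoopConfig with the time to wait after requesting a reboot.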
type RecovererConfig struct {
	ControlLoopConfig
	RebootWaitSeconds int
}

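// RegisterFlags registers the Recoverer's flags on the global flag set,
// alongside the flags of the embedded ControlLoopConfig.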
func (r *RecovererConfig) RegisterFlags() {
	r.ControlLoopConfig.RegisterFlags("recoverer")
	flag.IntVar(&r.RebootWaitSeconds, "recoverer_reboot_wait_seconds", 30, "How many seconds to sleep to ensure a reboot happened")
}

27// The Recoverer reboots machines whose agent has stopped sending heartbeats or
28// has not sent any heartbeats at all.
29type Recoverer struct {
30 RecovererConfig
31
32 cl ecl.Client
33}
34
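// NewRecoverer validates the control loop configuration in rc and returns a
// Recoverer that reboots machines through the given Equinix client.
//
// A minimal wiring sketch (cl stands for an already-constructed wrapngo
// client; the surrounding manager package drives the actual control loop):
//
//	var rc RecovererConfig
//	rc.RegisterFlags()
//	flag.Parse()
//	r, err := NewRecoverer(cl, rc)
//	if err != nil {
//		// handle invalid configuration
//	}
//	_ = r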
func NewRecoverer(cl ecl.Client, rc RecovererConfig) (*Recoverer, error) {
	if err := rc.ControlLoopConfig.Check(); err != nil {
		return nil, err
	}
	return &Recoverer{
		RecovererConfig: rc,
		cl:              cl,
	}, nil
}

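// getProcessInfo identifies this control loop as the shepherd recovery
// process in the BMDB and in metrics, and sets its default work backoff.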
func (r *Recoverer) getProcessInfo() processInfo {
	return processInfo{
		process: model.ProcessShepherdRecovery,
		defaultBackoff: bmdb.Backoff{
			Initial:  1 * time.Minute,
			Maximum:  1 * time.Hour,
			Exponent: 1.2,
		},
		processor: metrics.ProcessorShepherdRecoverer,
	}
}

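// getMachines returns up to limit machines eligible for agent recovery, as
// selected by the GetMachineForAgentRecovery BMDB query.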
func (r *Recoverer) getMachines(ctx context.Context, q *model.Queries, limit int32) ([]model.MachineProvided, error) {
	return q.GetMachineForAgentRecovery(ctx, limit)
}

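// processMachine recovers a single machine: it requests a reboot through the
// Equinix API, waits RebootWaitSeconds for the reboot to take effect, and
// then removes the machine's AgentStarted/AgentHeartbeat tags in the BMDB.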
func (r *Recoverer) processMachine(ctx context.Context, t *task) error {
	klog.Infof("Starting recovery of device (ID: %s, PID: %s)", t.machine.MachineID, t.machine.ProviderID)

	if err := r.cl.RebootDevice(ctx, t.machine.ProviderID); err != nil {
		return fmt.Errorf("failed to reboot device: %w", err)
	}

	// TODO(issue/215): replace this
	// This is required as Equinix doesn't reboot the machines synchronously
	// during the API call.
	select {
	case <-time.After(time.Duration(r.RebootWaitSeconds) * time.Second):
	case <-ctx.Done():
		return fmt.Errorf("while waiting for reboot: %w", ctx.Err())
	}

	klog.Infof("Removing AgentStarted/AgentHeartbeat (ID: %s, PID: %s)...", t.machine.MachineID, t.machine.ProviderID)
	err := t.work.Finish(ctx, func(q *model.Queries) error {
		if err := q.MachineDeleteAgentStarted(ctx, t.machine.MachineID); err != nil {
			return fmt.Errorf("while deleting AgentStarted: %w", err)
		}
		if err := q.MachineDeleteAgentHeartbeat(ctx, t.machine.MachineID); err != nil {
			return fmt.Errorf("while deleting AgentHeartbeat: %w", err)
		}
		return nil
	})
	if err != nil {
		return fmt.Errorf("while deleting AgentStarted/AgentHeartbeat tags: %w", err)
	}
	return nil
}