blob: 3779b0200d2f9db2d31ae7001231125d34c1d530 [file] [log] [blame]
Serge Bazanskiae004682023-04-18 13:28:48 +02001package manager
2
3import (
4 "context"
Tim Windelschmidt913a03a2023-04-24 15:57:02 +02005 "flag"
Serge Bazanskiae004682023-04-18 13:28:48 +02006 "fmt"
Tim Windelschmidt913a03a2023-04-24 15:57:02 +02007 "time"
Serge Bazanskiae004682023-04-18 13:28:48 +02008
9 "k8s.io/klog/v2"
10
Serge Bazanski00cf57d2023-04-20 11:19:00 +020011 "source.monogon.dev/cloud/bmaas/bmdb"
Serge Bazanskic50f6942023-04-24 18:27:22 +020012 "source.monogon.dev/cloud/bmaas/bmdb/metrics"
Serge Bazanskiae004682023-04-18 13:28:48 +020013 "source.monogon.dev/cloud/bmaas/bmdb/model"
14 ecl "source.monogon.dev/cloud/shepherd/equinix/wrapngo"
15)
16
// RecovererConfig configures a Recoverer control loop.
type RecovererConfig struct {
	ControlLoopConfig
	// RebootWaitSeconds is how many seconds to wait after requesting a reboot
	// through the provider API before proceeding, since Equinix does not
	// reboot machines synchronously during the API call (see processMachine).
	RebootWaitSeconds int
}
21
22func (r *RecovererConfig) RegisterFlags() {
23 r.ControlLoopConfig.RegisterFlags("recoverer")
Tim Windelschmidt913a03a2023-04-24 15:57:02 +020024 flag.IntVar(&r.RebootWaitSeconds, "recoverer_reboot_wait_seconds", 30, "How many seconds to sleep to ensure a reboot happend")
Serge Bazanskiae004682023-04-18 13:28:48 +020025}
26
// The Recoverer reboots machines whose agent has stopped sending heartbeats or
// has not sent any heartbeats at all.
type Recoverer struct {
	RecovererConfig

	// cl is the Equinix API client used to issue device reboots.
	cl ecl.Client
}
34
35func NewRecoverer(cl ecl.Client, rc RecovererConfig) (*Recoverer, error) {
36 if err := rc.ControlLoopConfig.Check(); err != nil {
37 return nil, err
38 }
39 return &Recoverer{
40 RecovererConfig: rc,
41 cl: cl,
42 }, nil
43}
44
Serge Bazanski00cf57d2023-04-20 11:19:00 +020045func (r *Recoverer) getProcessInfo() processInfo {
46 return processInfo{
47 process: model.ProcessShepherdRecovery,
48 defaultBackoff: bmdb.Backoff{
49 Initial: 1 * time.Minute,
50 Maximum: 1 * time.Hour,
51 Exponent: 1.2,
52 },
Serge Bazanskic50f6942023-04-24 18:27:22 +020053 processor: metrics.ProcessorShepherdRecoverer,
Serge Bazanski00cf57d2023-04-20 11:19:00 +020054 }
55}
56
Serge Bazanskiae004682023-04-18 13:28:48 +020057func (r *Recoverer) getMachines(ctx context.Context, q *model.Queries, limit int32) ([]model.MachineProvided, error) {
Tim Windelschmidt0e749612023-08-07 17:42:59 +000058 return q.GetMachineForAgentRecovery(ctx, model.GetMachineForAgentRecoveryParams{
59 Limit: limit,
60 Provider: model.ProviderEquinix,
61 })
Serge Bazanskiae004682023-04-18 13:28:48 +020062}
63
64func (r *Recoverer) processMachine(ctx context.Context, t *task) error {
65 klog.Infof("Starting recovery of device (ID: %s, PID %s)", t.machine.MachineID, t.machine.ProviderID)
66
67 if err := r.cl.RebootDevice(ctx, t.machine.ProviderID); err != nil {
68 return fmt.Errorf("failed to reboot device: %w", err)
69 }
70
Tim Windelschmidt913a03a2023-04-24 15:57:02 +020071 // TODO(issue/215): replace this
72 // This is required as Equinix doesn't reboot the machines synchronously
73 // during the API call.
74 select {
75 case <-time.After(time.Duration(r.RebootWaitSeconds) * time.Second):
76 case <-ctx.Done():
77 return fmt.Errorf("while waiting for reboot: %w", ctx.Err())
78 }
79
Serge Bazanskiae004682023-04-18 13:28:48 +020080 klog.Infof("Removing AgentStarted/AgentHeartbeat (ID: %s, PID: %s)...", t.machine.MachineID, t.machine.ProviderID)
81 err := t.work.Finish(ctx, func(q *model.Queries) error {
82 if err := q.MachineDeleteAgentStarted(ctx, t.machine.MachineID); err != nil {
83 return fmt.Errorf("while deleting AgentStarted: %w", err)
84 }
85 if err := q.MachineDeleteAgentHeartbeat(ctx, t.machine.MachineID); err != nil {
86 return fmt.Errorf("while deleting AgentHeartbeat: %w", err)
87 }
88 return nil
89 })
90 if err != nil {
91 return fmt.Errorf("while deleting AgentStarted/AgentHeartbeat tags: %w", err)
92 }
93 return nil
94}