Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 1 | package manager |
| 2 | |
| 3 | import ( |
| 4 | "context" |
Tim Windelschmidt | 913a03a | 2023-04-24 15:57:02 +0200 | [diff] [blame^] | 5 | "flag" |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 6 | "fmt" |
Tim Windelschmidt | 913a03a | 2023-04-24 15:57:02 +0200 | [diff] [blame^] | 7 | "time" |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 8 | |
| 9 | "k8s.io/klog/v2" |
| 10 | |
| 11 | "source.monogon.dev/cloud/bmaas/bmdb/model" |
| 12 | ecl "source.monogon.dev/cloud/shepherd/equinix/wrapngo" |
| 13 | ) |
| 14 | |
| 15 | type RecovererConfig struct { |
| 16 | ControlLoopConfig |
Tim Windelschmidt | 913a03a | 2023-04-24 15:57:02 +0200 | [diff] [blame^] | 17 | RebootWaitSeconds int |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 18 | } |
| 19 | |
| 20 | func (r *RecovererConfig) RegisterFlags() { |
| 21 | r.ControlLoopConfig.RegisterFlags("recoverer") |
Tim Windelschmidt | 913a03a | 2023-04-24 15:57:02 +0200 | [diff] [blame^] | 22 | flag.IntVar(&r.RebootWaitSeconds, "recoverer_reboot_wait_seconds", 30, "How many seconds to sleep to ensure a reboot happend") |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 23 | } |
| 24 | |
| 25 | // The Recoverer reboots machines whose agent has stopped sending heartbeats or |
| 26 | // has not sent any heartbeats at all. |
| 27 | type Recoverer struct { |
| 28 | RecovererConfig |
| 29 | |
| 30 | cl ecl.Client |
| 31 | } |
| 32 | |
| 33 | func NewRecoverer(cl ecl.Client, rc RecovererConfig) (*Recoverer, error) { |
| 34 | if err := rc.ControlLoopConfig.Check(); err != nil { |
| 35 | return nil, err |
| 36 | } |
| 37 | return &Recoverer{ |
| 38 | RecovererConfig: rc, |
| 39 | cl: cl, |
| 40 | }, nil |
| 41 | } |
| 42 | |
| 43 | func (r *Recoverer) getMachines(ctx context.Context, q *model.Queries, limit int32) ([]model.MachineProvided, error) { |
| 44 | return q.GetMachineForAgentRecovery(ctx, limit) |
| 45 | } |
| 46 | |
| 47 | func (r *Recoverer) processMachine(ctx context.Context, t *task) error { |
| 48 | klog.Infof("Starting recovery of device (ID: %s, PID %s)", t.machine.MachineID, t.machine.ProviderID) |
| 49 | |
| 50 | if err := r.cl.RebootDevice(ctx, t.machine.ProviderID); err != nil { |
| 51 | return fmt.Errorf("failed to reboot device: %w", err) |
| 52 | } |
| 53 | |
Tim Windelschmidt | 913a03a | 2023-04-24 15:57:02 +0200 | [diff] [blame^] | 54 | // TODO(issue/215): replace this |
| 55 | // This is required as Equinix doesn't reboot the machines synchronously |
| 56 | // during the API call. |
| 57 | select { |
| 58 | case <-time.After(time.Duration(r.RebootWaitSeconds) * time.Second): |
| 59 | case <-ctx.Done(): |
| 60 | return fmt.Errorf("while waiting for reboot: %w", ctx.Err()) |
| 61 | } |
| 62 | |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 63 | klog.Infof("Removing AgentStarted/AgentHeartbeat (ID: %s, PID: %s)...", t.machine.MachineID, t.machine.ProviderID) |
| 64 | err := t.work.Finish(ctx, func(q *model.Queries) error { |
| 65 | if err := q.MachineDeleteAgentStarted(ctx, t.machine.MachineID); err != nil { |
| 66 | return fmt.Errorf("while deleting AgentStarted: %w", err) |
| 67 | } |
| 68 | if err := q.MachineDeleteAgentHeartbeat(ctx, t.machine.MachineID); err != nil { |
| 69 | return fmt.Errorf("while deleting AgentHeartbeat: %w", err) |
| 70 | } |
| 71 | return nil |
| 72 | }) |
| 73 | if err != nil { |
| 74 | return fmt.Errorf("while deleting AgentStarted/AgentHeartbeat tags: %w", err) |
| 75 | } |
| 76 | return nil |
| 77 | } |