Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 1 | package manager |
| 2 | |
| 3 | import ( |
| 4 | "context" |
Tim Windelschmidt | 913a03a | 2023-04-24 15:57:02 +0200 | [diff] [blame] | 5 | "flag" |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 6 | "fmt" |
Tim Windelschmidt | 913a03a | 2023-04-24 15:57:02 +0200 | [diff] [blame] | 7 | "time" |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 8 | |
| 9 | "k8s.io/klog/v2" |
| 10 | |
Serge Bazanski | 00cf57d | 2023-04-20 11:19:00 +0200 | [diff] [blame^] | 11 | "source.monogon.dev/cloud/bmaas/bmdb" |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 12 | "source.monogon.dev/cloud/bmaas/bmdb/model" |
| 13 | ecl "source.monogon.dev/cloud/shepherd/equinix/wrapngo" |
| 14 | ) |
| 15 | |
| 16 | type RecovererConfig struct { |
| 17 | ControlLoopConfig |
Tim Windelschmidt | 913a03a | 2023-04-24 15:57:02 +0200 | [diff] [blame] | 18 | RebootWaitSeconds int |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 19 | } |
| 20 | |
| 21 | func (r *RecovererConfig) RegisterFlags() { |
| 22 | r.ControlLoopConfig.RegisterFlags("recoverer") |
Tim Windelschmidt | 913a03a | 2023-04-24 15:57:02 +0200 | [diff] [blame] | 23 | flag.IntVar(&r.RebootWaitSeconds, "recoverer_reboot_wait_seconds", 30, "How many seconds to sleep to ensure a reboot happend") |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 24 | } |
| 25 | |
| 26 | // The Recoverer reboots machines whose agent has stopped sending heartbeats or |
| 27 | // has not sent any heartbeats at all. |
| 28 | type Recoverer struct { |
| 29 | RecovererConfig |
| 30 | |
| 31 | cl ecl.Client |
| 32 | } |
| 33 | |
| 34 | func NewRecoverer(cl ecl.Client, rc RecovererConfig) (*Recoverer, error) { |
| 35 | if err := rc.ControlLoopConfig.Check(); err != nil { |
| 36 | return nil, err |
| 37 | } |
| 38 | return &Recoverer{ |
| 39 | RecovererConfig: rc, |
| 40 | cl: cl, |
| 41 | }, nil |
| 42 | } |
| 43 | |
Serge Bazanski | 00cf57d | 2023-04-20 11:19:00 +0200 | [diff] [blame^] | 44 | func (r *Recoverer) getProcessInfo() processInfo { |
| 45 | return processInfo{ |
| 46 | process: model.ProcessShepherdRecovery, |
| 47 | defaultBackoff: bmdb.Backoff{ |
| 48 | Initial: 1 * time.Minute, |
| 49 | Maximum: 1 * time.Hour, |
| 50 | Exponent: 1.2, |
| 51 | }, |
| 52 | } |
| 53 | } |
| 54 | |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 55 | func (r *Recoverer) getMachines(ctx context.Context, q *model.Queries, limit int32) ([]model.MachineProvided, error) { |
| 56 | return q.GetMachineForAgentRecovery(ctx, limit) |
| 57 | } |
| 58 | |
| 59 | func (r *Recoverer) processMachine(ctx context.Context, t *task) error { |
| 60 | klog.Infof("Starting recovery of device (ID: %s, PID %s)", t.machine.MachineID, t.machine.ProviderID) |
| 61 | |
| 62 | if err := r.cl.RebootDevice(ctx, t.machine.ProviderID); err != nil { |
| 63 | return fmt.Errorf("failed to reboot device: %w", err) |
| 64 | } |
| 65 | |
Tim Windelschmidt | 913a03a | 2023-04-24 15:57:02 +0200 | [diff] [blame] | 66 | // TODO(issue/215): replace this |
| 67 | // This is required as Equinix doesn't reboot the machines synchronously |
| 68 | // during the API call. |
| 69 | select { |
| 70 | case <-time.After(time.Duration(r.RebootWaitSeconds) * time.Second): |
| 71 | case <-ctx.Done(): |
| 72 | return fmt.Errorf("while waiting for reboot: %w", ctx.Err()) |
| 73 | } |
| 74 | |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 75 | klog.Infof("Removing AgentStarted/AgentHeartbeat (ID: %s, PID: %s)...", t.machine.MachineID, t.machine.ProviderID) |
| 76 | err := t.work.Finish(ctx, func(q *model.Queries) error { |
| 77 | if err := q.MachineDeleteAgentStarted(ctx, t.machine.MachineID); err != nil { |
| 78 | return fmt.Errorf("while deleting AgentStarted: %w", err) |
| 79 | } |
| 80 | if err := q.MachineDeleteAgentHeartbeat(ctx, t.machine.MachineID); err != nil { |
| 81 | return fmt.Errorf("while deleting AgentHeartbeat: %w", err) |
| 82 | } |
| 83 | return nil |
| 84 | }) |
| 85 | if err != nil { |
| 86 | return fmt.Errorf("while deleting AgentStarted/AgentHeartbeat tags: %w", err) |
| 87 | } |
| 88 | return nil |
| 89 | } |