Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 1 | package manager |
| 2 | |
| 3 | import ( |
| 4 | "context" |
Tim Windelschmidt | 913a03a | 2023-04-24 15:57:02 +0200 | [diff] [blame] | 5 | "flag" |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 6 | "fmt" |
Tim Windelschmidt | 913a03a | 2023-04-24 15:57:02 +0200 | [diff] [blame] | 7 | "time" |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 8 | |
| 9 | "k8s.io/klog/v2" |
| 10 | |
Serge Bazanski | 00cf57d | 2023-04-20 11:19:00 +0200 | [diff] [blame] | 11 | "source.monogon.dev/cloud/bmaas/bmdb" |
Serge Bazanski | c50f694 | 2023-04-24 18:27:22 +0200 | [diff] [blame^] | 12 | "source.monogon.dev/cloud/bmaas/bmdb/metrics" |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 13 | "source.monogon.dev/cloud/bmaas/bmdb/model" |
| 14 | ecl "source.monogon.dev/cloud/shepherd/equinix/wrapngo" |
| 15 | ) |
| 16 | |
// RecovererConfig configures the Recoverer control loop. It embeds the
// generic ControlLoopConfig and adds recoverer-specific settings.
type RecovererConfig struct {
	ControlLoopConfig
	// RebootWaitSeconds is how long processMachine sleeps after issuing a
	// reboot API call, since the provider reboots machines asynchronously.
	RebootWaitSeconds int
}
| 21 | |
| 22 | func (r *RecovererConfig) RegisterFlags() { |
| 23 | r.ControlLoopConfig.RegisterFlags("recoverer") |
Tim Windelschmidt | 913a03a | 2023-04-24 15:57:02 +0200 | [diff] [blame] | 24 | flag.IntVar(&r.RebootWaitSeconds, "recoverer_reboot_wait_seconds", 30, "How many seconds to sleep to ensure a reboot happend") |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 25 | } |
| 26 | |
// The Recoverer reboots machines whose agent has stopped sending heartbeats or
// has not sent any heartbeats at all.
type Recoverer struct {
	RecovererConfig

	// cl is the Equinix API client used to issue reboot requests.
	cl ecl.Client
}
| 34 | |
| 35 | func NewRecoverer(cl ecl.Client, rc RecovererConfig) (*Recoverer, error) { |
| 36 | if err := rc.ControlLoopConfig.Check(); err != nil { |
| 37 | return nil, err |
| 38 | } |
| 39 | return &Recoverer{ |
| 40 | RecovererConfig: rc, |
| 41 | cl: cl, |
| 42 | }, nil |
| 43 | } |
| 44 | |
Serge Bazanski | 00cf57d | 2023-04-20 11:19:00 +0200 | [diff] [blame] | 45 | func (r *Recoverer) getProcessInfo() processInfo { |
| 46 | return processInfo{ |
| 47 | process: model.ProcessShepherdRecovery, |
| 48 | defaultBackoff: bmdb.Backoff{ |
| 49 | Initial: 1 * time.Minute, |
| 50 | Maximum: 1 * time.Hour, |
| 51 | Exponent: 1.2, |
| 52 | }, |
Serge Bazanski | c50f694 | 2023-04-24 18:27:22 +0200 | [diff] [blame^] | 53 | processor: metrics.ProcessorShepherdRecoverer, |
Serge Bazanski | 00cf57d | 2023-04-20 11:19:00 +0200 | [diff] [blame] | 54 | } |
| 55 | } |
| 56 | |
// getMachines returns up to limit machines that are candidates for agent
// recovery, by delegating to the GetMachineForAgentRecovery BMDB query.
func (r *Recoverer) getMachines(ctx context.Context, q *model.Queries, limit int32) ([]model.MachineProvided, error) {
	return q.GetMachineForAgentRecovery(ctx, limit)
}
| 60 | |
| 61 | func (r *Recoverer) processMachine(ctx context.Context, t *task) error { |
| 62 | klog.Infof("Starting recovery of device (ID: %s, PID %s)", t.machine.MachineID, t.machine.ProviderID) |
| 63 | |
| 64 | if err := r.cl.RebootDevice(ctx, t.machine.ProviderID); err != nil { |
| 65 | return fmt.Errorf("failed to reboot device: %w", err) |
| 66 | } |
| 67 | |
Tim Windelschmidt | 913a03a | 2023-04-24 15:57:02 +0200 | [diff] [blame] | 68 | // TODO(issue/215): replace this |
| 69 | // This is required as Equinix doesn't reboot the machines synchronously |
| 70 | // during the API call. |
| 71 | select { |
| 72 | case <-time.After(time.Duration(r.RebootWaitSeconds) * time.Second): |
| 73 | case <-ctx.Done(): |
| 74 | return fmt.Errorf("while waiting for reboot: %w", ctx.Err()) |
| 75 | } |
| 76 | |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 77 | klog.Infof("Removing AgentStarted/AgentHeartbeat (ID: %s, PID: %s)...", t.machine.MachineID, t.machine.ProviderID) |
| 78 | err := t.work.Finish(ctx, func(q *model.Queries) error { |
| 79 | if err := q.MachineDeleteAgentStarted(ctx, t.machine.MachineID); err != nil { |
| 80 | return fmt.Errorf("while deleting AgentStarted: %w", err) |
| 81 | } |
| 82 | if err := q.MachineDeleteAgentHeartbeat(ctx, t.machine.MachineID); err != nil { |
| 83 | return fmt.Errorf("while deleting AgentHeartbeat: %w", err) |
| 84 | } |
| 85 | return nil |
| 86 | }) |
| 87 | if err != nil { |
| 88 | return fmt.Errorf("while deleting AgentStarted/AgentHeartbeat tags: %w", err) |
| 89 | } |
| 90 | return nil |
| 91 | } |