Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 1 | package manager |
| 2 | |
| 3 | import ( |
| 4 | "context" |
| 5 | "fmt" |
Tim Windelschmidt | 913a03a | 2023-04-24 15:57:02 +0200 | [diff] [blame] | 6 | "time" |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 7 | |
| 8 | "k8s.io/klog/v2" |
| 9 | |
Serge Bazanski | 00cf57d | 2023-04-20 11:19:00 +0200 | [diff] [blame] | 10 | "source.monogon.dev/cloud/bmaas/bmdb" |
Serge Bazanski | c50f694 | 2023-04-24 18:27:22 +0200 | [diff] [blame] | 11 | "source.monogon.dev/cloud/bmaas/bmdb/metrics" |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 12 | "source.monogon.dev/cloud/bmaas/bmdb/model" |
Tim Windelschmidt | b6308cd | 2023-10-10 21:19:03 +0200 | [diff] [blame] | 13 | "source.monogon.dev/cloud/shepherd" |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 14 | ) |
| 15 | |
| 16 | type RecovererConfig struct { |
| 17 | ControlLoopConfig |
| 18 | } |
| 19 | |
| 20 | func (r *RecovererConfig) RegisterFlags() { |
| 21 | r.ControlLoopConfig.RegisterFlags("recoverer") |
| 22 | } |
| 23 | |
| 24 | // The Recoverer reboots machines whose agent has stopped sending heartbeats or |
| 25 | // has not sent any heartbeats at all. |
| 26 | type Recoverer struct { |
| 27 | RecovererConfig |
Tim Windelschmidt | b6308cd | 2023-10-10 21:19:03 +0200 | [diff] [blame] | 28 | r shepherd.Recoverer |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 29 | } |
| 30 | |
Tim Windelschmidt | b6308cd | 2023-10-10 21:19:03 +0200 | [diff] [blame] | 31 | func NewRecoverer(r shepherd.Recoverer, rc RecovererConfig) (*Recoverer, error) { |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 32 | if err := rc.ControlLoopConfig.Check(); err != nil { |
| 33 | return nil, err |
| 34 | } |
| 35 | return &Recoverer{ |
| 36 | RecovererConfig: rc, |
Tim Windelschmidt | b6308cd | 2023-10-10 21:19:03 +0200 | [diff] [blame] | 37 | r: r, |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 38 | }, nil |
| 39 | } |
| 40 | |
Serge Bazanski | 00cf57d | 2023-04-20 11:19:00 +0200 | [diff] [blame] | 41 | func (r *Recoverer) getProcessInfo() processInfo { |
| 42 | return processInfo{ |
| 43 | process: model.ProcessShepherdRecovery, |
| 44 | defaultBackoff: bmdb.Backoff{ |
| 45 | Initial: 1 * time.Minute, |
| 46 | Maximum: 1 * time.Hour, |
| 47 | Exponent: 1.2, |
| 48 | }, |
Serge Bazanski | c50f694 | 2023-04-24 18:27:22 +0200 | [diff] [blame] | 49 | processor: metrics.ProcessorShepherdRecoverer, |
Serge Bazanski | 00cf57d | 2023-04-20 11:19:00 +0200 | [diff] [blame] | 50 | } |
| 51 | } |
| 52 | |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 53 | func (r *Recoverer) getMachines(ctx context.Context, q *model.Queries, limit int32) ([]model.MachineProvided, error) { |
Tim Windelschmidt | 0e74961 | 2023-08-07 17:42:59 +0000 | [diff] [blame] | 54 | return q.GetMachineForAgentRecovery(ctx, model.GetMachineForAgentRecoveryParams{ |
| 55 | Limit: limit, |
Tim Windelschmidt | b6308cd | 2023-10-10 21:19:03 +0200 | [diff] [blame] | 56 | Provider: r.r.Type(), |
Tim Windelschmidt | 0e74961 | 2023-08-07 17:42:59 +0000 | [diff] [blame] | 57 | }) |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 58 | } |
| 59 | |
| 60 | func (r *Recoverer) processMachine(ctx context.Context, t *task) error { |
Tim Windelschmidt | b6308cd | 2023-10-10 21:19:03 +0200 | [diff] [blame] | 61 | klog.Infof("Starting recovery of machine (ID: %s, PID %s)", t.machine.MachineID, t.machine.ProviderID) |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 62 | |
Tim Windelschmidt | b6308cd | 2023-10-10 21:19:03 +0200 | [diff] [blame] | 63 | if err := r.r.RebootMachine(ctx, shepherd.ProviderID(t.machine.ProviderID)); err != nil { |
| 64 | return fmt.Errorf("failed to reboot machine: %w", err) |
Tim Windelschmidt | 913a03a | 2023-04-24 15:57:02 +0200 | [diff] [blame] | 65 | } |
| 66 | |
Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 67 | klog.Infof("Removing AgentStarted/AgentHeartbeat (ID: %s, PID: %s)...", t.machine.MachineID, t.machine.ProviderID) |
| 68 | err := t.work.Finish(ctx, func(q *model.Queries) error { |
| 69 | if err := q.MachineDeleteAgentStarted(ctx, t.machine.MachineID); err != nil { |
| 70 | return fmt.Errorf("while deleting AgentStarted: %w", err) |
| 71 | } |
| 72 | if err := q.MachineDeleteAgentHeartbeat(ctx, t.machine.MachineID); err != nil { |
| 73 | return fmt.Errorf("while deleting AgentHeartbeat: %w", err) |
| 74 | } |
| 75 | return nil |
| 76 | }) |
| 77 | if err != nil { |
| 78 | return fmt.Errorf("while deleting AgentStarted/AgentHeartbeat tags: %w", err) |
| 79 | } |
| 80 | return nil |
| 81 | } |