| package manager |
| |
| import ( |
| "context" |
| "fmt" |
| "time" |
| |
| "k8s.io/klog/v2" |
| |
| "source.monogon.dev/cloud/bmaas/bmdb" |
| "source.monogon.dev/cloud/bmaas/bmdb/metrics" |
| "source.monogon.dev/cloud/bmaas/bmdb/model" |
| "source.monogon.dev/cloud/shepherd" |
| ) |
| |
| type RecovererConfig struct { |
| ControlLoopConfig |
| } |
| |
| func (r *RecovererConfig) RegisterFlags() { |
| r.ControlLoopConfig.RegisterFlags("recoverer") |
| } |
| |
| // The Recoverer reboots machines whose agent has stopped sending heartbeats or |
| // has not sent any heartbeats at all. |
| type Recoverer struct { |
| RecovererConfig |
| r shepherd.Recoverer |
| } |
| |
| func NewRecoverer(r shepherd.Recoverer, rc RecovererConfig) (*Recoverer, error) { |
| if err := rc.ControlLoopConfig.Check(); err != nil { |
| return nil, err |
| } |
| return &Recoverer{ |
| RecovererConfig: rc, |
| r: r, |
| }, nil |
| } |
| |
| func (r *Recoverer) getProcessInfo() processInfo { |
| return processInfo{ |
| process: model.ProcessShepherdRecovery, |
| defaultBackoff: bmdb.Backoff{ |
| Initial: 1 * time.Minute, |
| Maximum: 1 * time.Hour, |
| Exponent: 1.2, |
| }, |
| processor: metrics.ProcessorShepherdRecoverer, |
| } |
| } |
| |
| func (r *Recoverer) getMachines(ctx context.Context, q *model.Queries, limit int32) ([]model.MachineProvided, error) { |
| return q.GetMachineForAgentRecovery(ctx, model.GetMachineForAgentRecoveryParams{ |
| Limit: limit, |
| Provider: r.r.Type(), |
| }) |
| } |
| |
| func (r *Recoverer) processMachine(ctx context.Context, t *task) error { |
| klog.Infof("Starting recovery of machine (ID: %s, PID %s)", t.machine.MachineID, t.machine.ProviderID) |
| |
| if err := r.r.RebootMachine(ctx, shepherd.ProviderID(t.machine.ProviderID)); err != nil { |
| return fmt.Errorf("failed to reboot machine: %w", err) |
| } |
| |
| klog.Infof("Removing AgentStarted/AgentHeartbeat (ID: %s, PID: %s)...", t.machine.MachineID, t.machine.ProviderID) |
| err := t.work.Finish(ctx, func(q *model.Queries) error { |
| if err := q.MachineDeleteAgentStarted(ctx, t.machine.MachineID); err != nil { |
| return fmt.Errorf("while deleting AgentStarted: %w", err) |
| } |
| if err := q.MachineDeleteAgentHeartbeat(ctx, t.machine.MachineID); err != nil { |
| return fmt.Errorf("while deleting AgentHeartbeat: %w", err) |
| } |
| return nil |
| }) |
| if err != nil { |
| return fmt.Errorf("while deleting AgentStarted/AgentHeartbeat tags: %w", err) |
| } |
| return nil |
| } |