blob: a94700a7fd13b342db5843b1560a0c2075b6ecd6 [file] [log] [blame] [edit]
package manager
import (
"context"
"fmt"
"time"
"k8s.io/klog/v2"
"source.monogon.dev/cloud/bmaas/bmdb"
"source.monogon.dev/cloud/bmaas/bmdb/metrics"
"source.monogon.dev/cloud/bmaas/bmdb/model"
"source.monogon.dev/cloud/shepherd"
)
// RecovererConfig is the configuration of a Recoverer. It currently consists
// solely of the embedded ControlLoopConfig.
type RecovererConfig struct {
ControlLoopConfig
}
// RegisterFlags registers the flags of the embedded ControlLoopConfig, handing
// it the string "recoverer".
func (r *RecovererConfig) RegisterFlags() {
	// NOTE(review): presumably used to namespace the flag names; confirm
	// against ControlLoopConfig.RegisterFlags.
	const name = "recoverer"
	r.ControlLoopConfig.RegisterFlags(name)
}
// Recoverer reboots machines whose agent has stopped sending heartbeats or
// has not sent any heartbeats at all.
//
// Instances are created with NewRecoverer, which checks the embedded control
// loop configuration before returning.
type Recoverer struct {
// RecovererConfig is the configuration this Recoverer was built with.
RecovererConfig
// r is the shepherd implementation used to reboot machines
// (RebootMachine) and to select machines by provider (Type).
r shepherd.Recoverer
}
// NewRecoverer builds a Recoverer which uses r to act on machines and rc as
// its configuration.
//
// The embedded ControlLoopConfig of rc is checked first; if that check fails,
// its error is returned unchanged and no Recoverer is created.
func NewRecoverer(r shepherd.Recoverer, rc RecovererConfig) (*Recoverer, error) {
	err := rc.ControlLoopConfig.Check()
	if err != nil {
		return nil, err
	}

	rec := new(Recoverer)
	rec.RecovererConfig = rc
	rec.r = r
	return rec, nil
}
// getProcessInfo returns the static processInfo of the recoverer: the
// ShepherdRecovery process, the ShepherdRecoverer metrics processor, and a
// default backoff with Initial set to one minute, Maximum set to one hour and
// Exponent set to 1.2.
func (r *Recoverer) getProcessInfo() processInfo {
	backoff := bmdb.Backoff{
		Initial:  time.Minute,
		Maximum:  time.Hour,
		Exponent: 1.2,
	}

	return processInfo{
		process:        model.ProcessShepherdRecovery,
		processor:      metrics.ProcessorShepherdRecoverer,
		defaultBackoff: backoff,
	}
}
// getMachines runs the GetMachineForAgentRecovery query through q, restricted
// to the provider type reported by the underlying shepherd.Recoverer and
// passing limit as the query's Limit. The query's result and error are
// returned as-is.
func (r *Recoverer) getMachines(ctx context.Context, q *model.Queries, limit int32) ([]model.MachineProvided, error) {
	params := model.GetMachineForAgentRecoveryParams{
		Provider: r.r.Type(),
		Limit:    limit,
	}
	return q.GetMachineForAgentRecovery(ctx, params)
}
// processMachine performs recovery of the machine referenced by t.
//
// It first asks the underlying shepherd.Recoverer to reboot the machine,
// addressed by its provider ID. If the reboot succeeds, it calls t.work.Finish
// with a callback that deletes the machine's AgentStarted and AgentHeartbeat
// entries, in that order.
//
// If the reboot fails, the wrapped error is returned and t.work.Finish is
// not called. Errors from the callback or from Finish itself are likewise
// returned wrapped with a description of the failing step.
func (r *Recoverer) processMachine(ctx context.Context, t *task) error {
	machineID, providerID := t.machine.MachineID, t.machine.ProviderID

	klog.Infof("Starting recovery of machine (ID: %s, PID %s)", machineID, providerID)
	rebootErr := r.r.RebootMachine(ctx, shepherd.ProviderID(providerID))
	if rebootErr != nil {
		return fmt.Errorf("failed to reboot machine: %w", rebootErr)
	}

	klog.Infof("Removing AgentStarted/AgentHeartbeat (ID: %s, PID: %s)...", machineID, providerID)

	// dropAgentTags is handed to Finish below; it deletes both agent
	// entries and stops at the first failure.
	dropAgentTags := func(q *model.Queries) error {
		startedErr := q.MachineDeleteAgentStarted(ctx, machineID)
		if startedErr != nil {
			return fmt.Errorf("while deleting AgentStarted: %w", startedErr)
		}
		heartbeatErr := q.MachineDeleteAgentHeartbeat(ctx, machineID)
		if heartbeatErr != nil {
			return fmt.Errorf("while deleting AgentHeartbeat: %w", heartbeatErr)
		}
		return nil
	}

	if finishErr := t.work.Finish(ctx, dropAgentTags); finishErr != nil {
		return fmt.Errorf("while deleting AgentStarted/AgentHeartbeat tags: %w", finishErr)
	}
	return nil
}