blob: a94700a7fd13b342db5843b1560a0c2075b6ecd6 [file] [log] [blame]
Serge Bazanskiae004682023-04-18 13:28:48 +02001package manager
2
3import (
4 "context"
5 "fmt"
Tim Windelschmidt913a03a2023-04-24 15:57:02 +02006 "time"
Serge Bazanskiae004682023-04-18 13:28:48 +02007
8 "k8s.io/klog/v2"
9
Serge Bazanski00cf57d2023-04-20 11:19:00 +020010 "source.monogon.dev/cloud/bmaas/bmdb"
Serge Bazanskic50f6942023-04-24 18:27:22 +020011 "source.monogon.dev/cloud/bmaas/bmdb/metrics"
Serge Bazanskiae004682023-04-18 13:28:48 +020012 "source.monogon.dev/cloud/bmaas/bmdb/model"
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +020013 "source.monogon.dev/cloud/shepherd"
Serge Bazanskiae004682023-04-18 13:28:48 +020014)
15
16type RecovererConfig struct {
17 ControlLoopConfig
18}
19
20func (r *RecovererConfig) RegisterFlags() {
21 r.ControlLoopConfig.RegisterFlags("recoverer")
22}
23
24// The Recoverer reboots machines whose agent has stopped sending heartbeats or
25// has not sent any heartbeats at all.
26type Recoverer struct {
27 RecovererConfig
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +020028 r shepherd.Recoverer
Serge Bazanskiae004682023-04-18 13:28:48 +020029}
30
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +020031func NewRecoverer(r shepherd.Recoverer, rc RecovererConfig) (*Recoverer, error) {
Serge Bazanskiae004682023-04-18 13:28:48 +020032 if err := rc.ControlLoopConfig.Check(); err != nil {
33 return nil, err
34 }
35 return &Recoverer{
36 RecovererConfig: rc,
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +020037 r: r,
Serge Bazanskiae004682023-04-18 13:28:48 +020038 }, nil
39}
40
Serge Bazanski00cf57d2023-04-20 11:19:00 +020041func (r *Recoverer) getProcessInfo() processInfo {
42 return processInfo{
43 process: model.ProcessShepherdRecovery,
44 defaultBackoff: bmdb.Backoff{
45 Initial: 1 * time.Minute,
46 Maximum: 1 * time.Hour,
47 Exponent: 1.2,
48 },
Serge Bazanskic50f6942023-04-24 18:27:22 +020049 processor: metrics.ProcessorShepherdRecoverer,
Serge Bazanski00cf57d2023-04-20 11:19:00 +020050 }
51}
52
Serge Bazanskiae004682023-04-18 13:28:48 +020053func (r *Recoverer) getMachines(ctx context.Context, q *model.Queries, limit int32) ([]model.MachineProvided, error) {
Tim Windelschmidt0e749612023-08-07 17:42:59 +000054 return q.GetMachineForAgentRecovery(ctx, model.GetMachineForAgentRecoveryParams{
55 Limit: limit,
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +020056 Provider: r.r.Type(),
Tim Windelschmidt0e749612023-08-07 17:42:59 +000057 })
Serge Bazanskiae004682023-04-18 13:28:48 +020058}
59
60func (r *Recoverer) processMachine(ctx context.Context, t *task) error {
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +020061 klog.Infof("Starting recovery of machine (ID: %s, PID %s)", t.machine.MachineID, t.machine.ProviderID)
Serge Bazanskiae004682023-04-18 13:28:48 +020062
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +020063 if err := r.r.RebootMachine(ctx, shepherd.ProviderID(t.machine.ProviderID)); err != nil {
64 return fmt.Errorf("failed to reboot machine: %w", err)
Tim Windelschmidt913a03a2023-04-24 15:57:02 +020065 }
66
Serge Bazanskiae004682023-04-18 13:28:48 +020067 klog.Infof("Removing AgentStarted/AgentHeartbeat (ID: %s, PID: %s)...", t.machine.MachineID, t.machine.ProviderID)
68 err := t.work.Finish(ctx, func(q *model.Queries) error {
69 if err := q.MachineDeleteAgentStarted(ctx, t.machine.MachineID); err != nil {
70 return fmt.Errorf("while deleting AgentStarted: %w", err)
71 }
72 if err := q.MachineDeleteAgentHeartbeat(ctx, t.machine.MachineID); err != nil {
73 return fmt.Errorf("while deleting AgentHeartbeat: %w", err)
74 }
75 return nil
76 })
77 if err != nil {
78 return fmt.Errorf("while deleting AgentStarted/AgentHeartbeat tags: %w", err)
79 }
80 return nil
81}