blob: f323d039f01c0f7b3294680a7c5e4731d9b51947 [file] [log] [blame]
Serge Bazanskiae004682023-04-18 13:28:48 +02001package manager
2
3import (
4 "context"
5 "fmt"
6
7 "k8s.io/klog/v2"
8
9 "source.monogon.dev/cloud/bmaas/bmdb/model"
10 ecl "source.monogon.dev/cloud/shepherd/equinix/wrapngo"
11)
12
13type RecovererConfig struct {
14 ControlLoopConfig
15}
16
17func (r *RecovererConfig) RegisterFlags() {
18 r.ControlLoopConfig.RegisterFlags("recoverer")
19}
20
21// The Recoverer reboots machines whose agent has stopped sending heartbeats or
22// has not sent any heartbeats at all.
23type Recoverer struct {
24 RecovererConfig
25
26 cl ecl.Client
27}
28
29func NewRecoverer(cl ecl.Client, rc RecovererConfig) (*Recoverer, error) {
30 if err := rc.ControlLoopConfig.Check(); err != nil {
31 return nil, err
32 }
33 return &Recoverer{
34 RecovererConfig: rc,
35 cl: cl,
36 }, nil
37}
38
39func (r *Recoverer) getMachines(ctx context.Context, q *model.Queries, limit int32) ([]model.MachineProvided, error) {
40 return q.GetMachineForAgentRecovery(ctx, limit)
41}
42
43func (r *Recoverer) processMachine(ctx context.Context, t *task) error {
44 klog.Infof("Starting recovery of device (ID: %s, PID %s)", t.machine.MachineID, t.machine.ProviderID)
45
46 if err := r.cl.RebootDevice(ctx, t.machine.ProviderID); err != nil {
47 return fmt.Errorf("failed to reboot device: %w", err)
48 }
49
50 klog.Infof("Removing AgentStarted/AgentHeartbeat (ID: %s, PID: %s)...", t.machine.MachineID, t.machine.ProviderID)
51 err := t.work.Finish(ctx, func(q *model.Queries) error {
52 if err := q.MachineDeleteAgentStarted(ctx, t.machine.MachineID); err != nil {
53 return fmt.Errorf("while deleting AgentStarted: %w", err)
54 }
55 if err := q.MachineDeleteAgentHeartbeat(ctx, t.machine.MachineID); err != nil {
56 return fmt.Errorf("while deleting AgentHeartbeat: %w", err)
57 }
58 return nil
59 })
60 if err != nil {
61 return fmt.Errorf("while deleting AgentStarted/AgentHeartbeat tags: %w", err)
62 }
63 return nil
64}