blob: 8925f176fecd478a7f478861dff25331c147d19d [file] [log] [blame]
// Copyright The Monogon Project Authors.
// SPDX-License-Identifier: Apache-2.0
3
Serge Bazanskiae004682023-04-18 13:28:48 +02004package manager
5
6import (
7 "context"
8 "fmt"
Tim Windelschmidt913a03a2023-04-24 15:57:02 +02009 "time"
Serge Bazanskiae004682023-04-18 13:28:48 +020010
11 "k8s.io/klog/v2"
12
Serge Bazanski00cf57d2023-04-20 11:19:00 +020013 "source.monogon.dev/cloud/bmaas/bmdb"
Serge Bazanskic50f6942023-04-24 18:27:22 +020014 "source.monogon.dev/cloud/bmaas/bmdb/metrics"
Serge Bazanskiae004682023-04-18 13:28:48 +020015 "source.monogon.dev/cloud/bmaas/bmdb/model"
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +020016 "source.monogon.dev/cloud/shepherd"
Serge Bazanskiae004682023-04-18 13:28:48 +020017)
18
19type RecovererConfig struct {
20 ControlLoopConfig
21}
22
23func (r *RecovererConfig) RegisterFlags() {
24 r.ControlLoopConfig.RegisterFlags("recoverer")
25}
26
27// The Recoverer reboots machines whose agent has stopped sending heartbeats or
28// has not sent any heartbeats at all.
29type Recoverer struct {
30 RecovererConfig
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +020031 r shepherd.Recoverer
Serge Bazanskiae004682023-04-18 13:28:48 +020032}
33
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +020034func NewRecoverer(r shepherd.Recoverer, rc RecovererConfig) (*Recoverer, error) {
Serge Bazanskiae004682023-04-18 13:28:48 +020035 if err := rc.ControlLoopConfig.Check(); err != nil {
36 return nil, err
37 }
38 return &Recoverer{
39 RecovererConfig: rc,
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +020040 r: r,
Serge Bazanskiae004682023-04-18 13:28:48 +020041 }, nil
42}
43
Serge Bazanski00cf57d2023-04-20 11:19:00 +020044func (r *Recoverer) getProcessInfo() processInfo {
45 return processInfo{
46 process: model.ProcessShepherdRecovery,
47 defaultBackoff: bmdb.Backoff{
48 Initial: 1 * time.Minute,
49 Maximum: 1 * time.Hour,
50 Exponent: 1.2,
51 },
Serge Bazanskic50f6942023-04-24 18:27:22 +020052 processor: metrics.ProcessorShepherdRecoverer,
Serge Bazanski00cf57d2023-04-20 11:19:00 +020053 }
54}
55
Serge Bazanskiae004682023-04-18 13:28:48 +020056func (r *Recoverer) getMachines(ctx context.Context, q *model.Queries, limit int32) ([]model.MachineProvided, error) {
Tim Windelschmidt0e749612023-08-07 17:42:59 +000057 return q.GetMachineForAgentRecovery(ctx, model.GetMachineForAgentRecoveryParams{
58 Limit: limit,
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +020059 Provider: r.r.Type(),
Tim Windelschmidt0e749612023-08-07 17:42:59 +000060 })
Serge Bazanskiae004682023-04-18 13:28:48 +020061}
62
63func (r *Recoverer) processMachine(ctx context.Context, t *task) error {
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +020064 klog.Infof("Starting recovery of machine (ID: %s, PID %s)", t.machine.MachineID, t.machine.ProviderID)
Serge Bazanskiae004682023-04-18 13:28:48 +020065
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +020066 if err := r.r.RebootMachine(ctx, shepherd.ProviderID(t.machine.ProviderID)); err != nil {
67 return fmt.Errorf("failed to reboot machine: %w", err)
Tim Windelschmidt913a03a2023-04-24 15:57:02 +020068 }
69
Serge Bazanskiae004682023-04-18 13:28:48 +020070 klog.Infof("Removing AgentStarted/AgentHeartbeat (ID: %s, PID: %s)...", t.machine.MachineID, t.machine.ProviderID)
71 err := t.work.Finish(ctx, func(q *model.Queries) error {
72 if err := q.MachineDeleteAgentStarted(ctx, t.machine.MachineID); err != nil {
73 return fmt.Errorf("while deleting AgentStarted: %w", err)
74 }
75 if err := q.MachineDeleteAgentHeartbeat(ctx, t.machine.MachineID); err != nil {
76 return fmt.Errorf("while deleting AgentHeartbeat: %w", err)
77 }
78 return nil
79 })
80 if err != nil {
81 return fmt.Errorf("while deleting AgentStarted/AgentHeartbeat tags: %w", err)
82 }
83 return nil
84}