| Tim Windelschmidt | 6d33a43 | 2025-02-04 14:34:25 +0100 | [diff] [blame^] | 1 | // Copyright The Monogon Project Authors. |
| 2 | // SPDX-License-Identifier: Apache-2.0 |
| 3 | |
| Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 4 | package manager |
| 5 | |
| 6 | import ( |
| 7 | "context" |
| 8 | "fmt" |
| Tim Windelschmidt | 913a03a | 2023-04-24 15:57:02 +0200 | [diff] [blame] | 9 | "time" |
| Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 10 | |
| 11 | "k8s.io/klog/v2" |
| 12 | |
| Serge Bazanski | 00cf57d | 2023-04-20 11:19:00 +0200 | [diff] [blame] | 13 | "source.monogon.dev/cloud/bmaas/bmdb" |
| Serge Bazanski | c50f694 | 2023-04-24 18:27:22 +0200 | [diff] [blame] | 14 | "source.monogon.dev/cloud/bmaas/bmdb/metrics" |
| Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 15 | "source.monogon.dev/cloud/bmaas/bmdb/model" |
| Tim Windelschmidt | b6308cd | 2023-10-10 21:19:03 +0200 | [diff] [blame] | 16 | "source.monogon.dev/cloud/shepherd" |
| Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 17 | ) |
| 18 | |
| 19 | type RecovererConfig struct { |
| 20 | ControlLoopConfig |
| 21 | } |
| 22 | |
| 23 | func (r *RecovererConfig) RegisterFlags() { |
| 24 | r.ControlLoopConfig.RegisterFlags("recoverer") |
| 25 | } |
| 26 | |
| 27 | // The Recoverer reboots machines whose agent has stopped sending heartbeats or |
| 28 | // has not sent any heartbeats at all. |
| 29 | type Recoverer struct { |
| 30 | RecovererConfig |
| Tim Windelschmidt | b6308cd | 2023-10-10 21:19:03 +0200 | [diff] [blame] | 31 | r shepherd.Recoverer |
| Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 32 | } |
| 33 | |
| Tim Windelschmidt | b6308cd | 2023-10-10 21:19:03 +0200 | [diff] [blame] | 34 | func NewRecoverer(r shepherd.Recoverer, rc RecovererConfig) (*Recoverer, error) { |
| Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 35 | if err := rc.ControlLoopConfig.Check(); err != nil { |
| 36 | return nil, err |
| 37 | } |
| 38 | return &Recoverer{ |
| 39 | RecovererConfig: rc, |
| Tim Windelschmidt | b6308cd | 2023-10-10 21:19:03 +0200 | [diff] [blame] | 40 | r: r, |
| Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 41 | }, nil |
| 42 | } |
| 43 | |
| Serge Bazanski | 00cf57d | 2023-04-20 11:19:00 +0200 | [diff] [blame] | 44 | func (r *Recoverer) getProcessInfo() processInfo { |
| 45 | return processInfo{ |
| 46 | process: model.ProcessShepherdRecovery, |
| 47 | defaultBackoff: bmdb.Backoff{ |
| 48 | Initial: 1 * time.Minute, |
| 49 | Maximum: 1 * time.Hour, |
| 50 | Exponent: 1.2, |
| 51 | }, |
| Serge Bazanski | c50f694 | 2023-04-24 18:27:22 +0200 | [diff] [blame] | 52 | processor: metrics.ProcessorShepherdRecoverer, |
| Serge Bazanski | 00cf57d | 2023-04-20 11:19:00 +0200 | [diff] [blame] | 53 | } |
| 54 | } |
| 55 | |
| Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 56 | func (r *Recoverer) getMachines(ctx context.Context, q *model.Queries, limit int32) ([]model.MachineProvided, error) { |
| Tim Windelschmidt | 0e74961 | 2023-08-07 17:42:59 +0000 | [diff] [blame] | 57 | return q.GetMachineForAgentRecovery(ctx, model.GetMachineForAgentRecoveryParams{ |
| 58 | Limit: limit, |
| Tim Windelschmidt | b6308cd | 2023-10-10 21:19:03 +0200 | [diff] [blame] | 59 | Provider: r.r.Type(), |
| Tim Windelschmidt | 0e74961 | 2023-08-07 17:42:59 +0000 | [diff] [blame] | 60 | }) |
| Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 61 | } |
| 62 | |
| 63 | func (r *Recoverer) processMachine(ctx context.Context, t *task) error { |
| Tim Windelschmidt | b6308cd | 2023-10-10 21:19:03 +0200 | [diff] [blame] | 64 | klog.Infof("Starting recovery of machine (ID: %s, PID %s)", t.machine.MachineID, t.machine.ProviderID) |
| Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 65 | |
| Tim Windelschmidt | b6308cd | 2023-10-10 21:19:03 +0200 | [diff] [blame] | 66 | if err := r.r.RebootMachine(ctx, shepherd.ProviderID(t.machine.ProviderID)); err != nil { |
| 67 | return fmt.Errorf("failed to reboot machine: %w", err) |
| Tim Windelschmidt | 913a03a | 2023-04-24 15:57:02 +0200 | [diff] [blame] | 68 | } |
| 69 | |
| Serge Bazanski | ae00468 | 2023-04-18 13:28:48 +0200 | [diff] [blame] | 70 | klog.Infof("Removing AgentStarted/AgentHeartbeat (ID: %s, PID: %s)...", t.machine.MachineID, t.machine.ProviderID) |
| 71 | err := t.work.Finish(ctx, func(q *model.Queries) error { |
| 72 | if err := q.MachineDeleteAgentStarted(ctx, t.machine.MachineID); err != nil { |
| 73 | return fmt.Errorf("while deleting AgentStarted: %w", err) |
| 74 | } |
| 75 | if err := q.MachineDeleteAgentHeartbeat(ctx, t.machine.MachineID); err != nil { |
| 76 | return fmt.Errorf("while deleting AgentHeartbeat: %w", err) |
| 77 | } |
| 78 | return nil |
| 79 | }) |
| 80 | if err != nil { |
| 81 | return fmt.Errorf("while deleting AgentStarted/AgentHeartbeat tags: %w", err) |
| 82 | } |
| 83 | return nil |
| 84 | } |