blob: 6ca576b87c6a5b225a205752ae4b22f882ce030e [file] [log] [blame]
Tim Windelschmidt6d33a432025-02-04 14:34:25 +01001// Copyright The Monogon Project Authors.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +01002// SPDX-License-Identifier: Apache-2.0
Serge Bazanski9c09c4e2020-03-24 13:58:01 +01003
4package supervisor
5
6import (
7 "context"
8 "fmt"
9 "regexp"
10 "strings"
11
12 "github.com/cenkalti/backoff/v4"
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010013)
14
Serge Bazanski216fe7b2021-05-21 18:36:16 +020015// node is a supervision tree node. It represents the state of a Runnable
16// within this tree, its relation to other tree elements, and contains
17// supporting data needed to actually supervise it.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010018type node struct {
Serge Bazanski216fe7b2021-05-21 18:36:16 +020019 // The name of this node. Opaque string. It's used to make up the 'dn'
20 // (distinguished name) of a node within the tree. When starting a runnable
21 // inside a tree, this is where that name gets used.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010022 name string
23 runnable Runnable
24
25 // The supervisor managing this tree.
26 sup *supervisor
Serge Bazanski216fe7b2021-05-21 18:36:16 +020027 // The parent, within the tree, of this node. If this is the root node of
28 // the tree, this is nil.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010029 parent *node
Serge Bazanski216fe7b2021-05-21 18:36:16 +020030 // Children of this tree. This is represented by a map keyed from child
31 // node names, for easy access.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010032 children map[string]*node
Serge Bazanski26d52252022-02-07 15:57:54 +010033 // Reserved nodes that may not be used as child names. This is currently
34 // used by sub-loggers (see SubLogger function), preventing a sub-logger
35 // name from colliding with a node name.
36 reserved map[string]bool
Serge Bazanski216fe7b2021-05-21 18:36:16 +020037 // Supervision groups. Each group is a set of names of children. Sets, and
38 // as such groups, don't overlap between each other. A supervision group
39 // indicates that if any child within that group fails, all others should
40 // be canceled and restarted together.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010041 groups []map[string]bool
42
43 // The current state of the runnable in this node.
Serge Bazanskieca8ee32024-07-30 14:32:19 +000044 state NodeState
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010045
Jan Schär08c1c722024-12-19 12:03:17 +010046 // signaledDone is set when the runnable has signaled Done. The transition to
47 // DONE state only happens after the runnable returns.
48 signaledDone bool
49
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010050 // Backoff used to keep runnables from being restarted too fast.
51 bo *backoff.ExponentialBackOff
52
53 // Context passed to the runnable, and its cancel function.
54 ctx context.Context
55 ctxC context.CancelFunc
56}
57
Serge Bazanskieca8ee32024-07-30 14:32:19 +000058// NodeState is the state of a runnable within a node, and in a way the node
Serge Bazanski216fe7b2021-05-21 18:36:16 +020059// itself. This follows the state diagram from go/supervision.
Serge Bazanskieca8ee32024-07-30 14:32:19 +000060type NodeState int
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010061
62const (
Serge Bazanski216fe7b2021-05-21 18:36:16 +020063 // A node that has just been created, and whose runnable has been started
64 // already but hasn't signaled anything yet.
Serge Bazanskieca8ee32024-07-30 14:32:19 +000065 NodeStateNew NodeState = iota
Serge Bazanski216fe7b2021-05-21 18:36:16 +020066 // A node whose runnable has signaled being healthy - this means it's ready
67 // to serve/act.
Serge Bazanskieca8ee32024-07-30 14:32:19 +000068 NodeStateHealthy
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010069 // A node that has unexpectedly returned or panicked.
Serge Bazanskieca8ee32024-07-30 14:32:19 +000070 NodeStateDead
Serge Bazanski216fe7b2021-05-21 18:36:16 +020071 // A node that has declared that its done with its work and should not be
72 // restarted, unless a supervision tree failure requires that.
Serge Bazanskieca8ee32024-07-30 14:32:19 +000073 NodeStateDone
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010074 // A node that has returned after being requested to cancel.
Serge Bazanskieca8ee32024-07-30 14:32:19 +000075 NodeStateCanceled
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010076)
77
Serge Bazanskif9a8dcd2024-07-31 14:46:06 +000078// NodeStates is a list of all possible values of a NodeState.
79var NodeStates = []NodeState{
80 NodeStateNew,
81 NodeStateHealthy,
82 NodeStateDead,
83 NodeStateDone,
84 NodeStateCanceled,
85}
86
Serge Bazanskieca8ee32024-07-30 14:32:19 +000087func (s NodeState) String() string {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010088 switch s {
Serge Bazanskieca8ee32024-07-30 14:32:19 +000089 case NodeStateNew:
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010090 return "NODE_STATE_NEW"
Serge Bazanskieca8ee32024-07-30 14:32:19 +000091 case NodeStateHealthy:
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010092 return "NODE_STATE_HEALTHY"
Serge Bazanskieca8ee32024-07-30 14:32:19 +000093 case NodeStateDead:
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010094 return "NODE_STATE_DEAD"
Serge Bazanskieca8ee32024-07-30 14:32:19 +000095 case NodeStateDone:
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010096 return "NODE_STATE_DONE"
Serge Bazanskieca8ee32024-07-30 14:32:19 +000097 case NodeStateCanceled:
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010098 return "NODE_STATE_CANCELED"
99 }
100 return "UNKNOWN"
101}
102
103func (n *node) String() string {
104 return fmt.Sprintf("%s (%s)", n.dn(), n.state.String())
105}
106
107// contextKey is a type used to keep data within context values.
108type contextKey string
109
110var (
111 supervisorKey = contextKey("supervisor")
112 dnKey = contextKey("dn")
113)
114
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200115// fromContext retrieves a tree node from a runnable context. It takes a lock
116// on the tree and returns an unlock function. This unlock function needs to be
117// called once mutations on the tree/supervisor/node are done.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100118func fromContext(ctx context.Context) (*node, func()) {
119 sup, ok := ctx.Value(supervisorKey).(*supervisor)
120 if !ok {
121 panic("supervisor function called from non-runnable context")
122 }
123
124 sup.mu.Lock()
125
126 dnParent, ok := ctx.Value(dnKey).(string)
127 if !ok {
128 sup.mu.Unlock()
129 panic("supervisor function called from non-runnable context")
130 }
131
132 return sup.nodeByDN(dnParent), sup.mu.Unlock
133}
134
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200135// All the following 'internal' supervisor functions must only be called with
136// the supervisor lock taken. Getting a lock via fromContext is enough.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100137
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200138// dn returns the distinguished name of a node. This distinguished name is a
139// period-separated, inverse-DNS-like name. For instance, the runnable 'foo'
140// within the runnable 'bar' will be called 'root.bar.foo'. The root of the
141// tree is always named, and has the dn, 'root'.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100142func (n *node) dn() string {
143 if n.parent != nil {
144 return fmt.Sprintf("%s.%s", n.parent.dn(), n.name)
145 }
146 return n.name
147}
148
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200149// groupSiblings is a helper function to get all runnable group siblings of a
150// given runnable name within this node. All children are always in a group,
151// even if that group is unary.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100152func (n *node) groupSiblings(name string) map[string]bool {
153 for _, m := range n.groups {
154 if _, ok := m[name]; ok {
155 return m
156 }
157 }
158 return nil
159}
160
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200161// newNode creates a new node with a given parent. It does not register it with
162// the parent (as that depends on group placement).
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100163func newNode(name string, runnable Runnable, sup *supervisor, parent *node) *node {
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200164 // We use exponential backoff for failed runnables, but at some point we
165 // cap at a given backoff time. To achieve this, we set MaxElapsedTime to
166 // 0, which will cap the backoff at MaxInterval.
Serge Bazanskib9431c92020-08-24 18:16:51 +0200167 bo := backoff.NewExponentialBackOff()
168 bo.MaxElapsedTime = 0
169
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100170 n := &node{
171 name: name,
172 runnable: runnable,
173
Serge Bazanskib9431c92020-08-24 18:16:51 +0200174 bo: bo,
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100175
176 sup: sup,
177 parent: parent,
178 }
179 n.reset()
180 return n
181}
182
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200183// resetNode sets up all the dynamic fields of the node, in preparation of
184// starting a runnable. It clears the node's children, groups and resets its
185// context.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100186func (n *node) reset() {
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200187 // Make new context. First, acquire parent context. For the root node
188 // that's Background, otherwise it's the parent's context.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100189 var pCtx context.Context
190 if n.parent == nil {
191 pCtx = context.Background()
192 } else {
193 pCtx = n.parent.ctx
194 }
195 // Mark DN and supervisor in context.
196 ctx := context.WithValue(pCtx, dnKey, n.dn())
197 ctx = context.WithValue(ctx, supervisorKey, n.sup)
198 ctx, ctxC := context.WithCancel(ctx)
199 // Set context
200 n.ctx = ctx
201 n.ctxC = ctxC
202
203 // Clear children and state
Serge Bazanskieca8ee32024-07-30 14:32:19 +0000204 n.state = NodeStateNew
Jan Schär08c1c722024-12-19 12:03:17 +0100205 n.signaledDone = false
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100206 n.children = make(map[string]*node)
Serge Bazanski26d52252022-02-07 15:57:54 +0100207 n.reserved = make(map[string]bool)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100208 n.groups = nil
209
210 // The node is now ready to be scheduled.
211}
212
213// nodeByDN returns a node by given DN from the supervisor.
214func (s *supervisor) nodeByDN(dn string) *node {
215 parts := strings.Split(dn, ".")
216 if parts[0] != "root" {
217 panic("DN does not start with root.")
218 }
219 parts = parts[1:]
220 cur := s.root
221 for {
222 if len(parts) == 0 {
223 return cur
224 }
225
226 next, ok := cur.children[parts[0]]
227 if !ok {
228 panic(fmt.Errorf("could not find %v (%s) in %s", parts, dn, cur))
229 }
230 cur = next
231 parts = parts[1:]
232 }
233}
234
235// reNodeName validates a node name against constraints.
236var reNodeName = regexp.MustCompile(`[a-z90-9_]{1,64}`)
237
238// runGroup schedules a new group of runnables to run on a node.
239func (n *node) runGroup(runnables map[string]Runnable) error {
240 // Check that the parent node is in the right state.
Serge Bazanskieca8ee32024-07-30 14:32:19 +0000241 if n.state != NodeStateNew {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100242 return fmt.Errorf("cannot run new runnable on non-NEW node")
243 }
244
245 // Check the requested runnable names.
Tim Windelschmidt6b6428d2024-04-11 01:35:41 +0200246 for name := range runnables {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100247 if !reNodeName.MatchString(name) {
248 return fmt.Errorf("runnable name %q is invalid", name)
249 }
250 if _, ok := n.children[name]; ok {
251 return fmt.Errorf("runnable %q already exists", name)
252 }
Serge Bazanski26d52252022-02-07 15:57:54 +0100253 if _, ok := n.reserved[name]; ok {
254 return fmt.Errorf("runnable %q would shadow reserved name (eg. sub-logger)", name)
255 }
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100256 }
257
258 // Create child nodes.
259 dns := make(map[string]string)
260 group := make(map[string]bool)
261 for name, runnable := range runnables {
262 if g := n.groupSiblings(name); g != nil {
263 return fmt.Errorf("duplicate child name %q", name)
264 }
265 node := newNode(name, runnable, n.sup, n)
266 n.children[name] = node
267
268 dns[name] = node.dn()
269 group[name] = true
270 }
271 // Add group.
272 n.groups = append(n.groups, group)
273
274 // Schedule execution of group members.
275 go func() {
Tim Windelschmidt6b6428d2024-04-11 01:35:41 +0200276 for name := range runnables {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100277 n.sup.pReq <- &processorRequest{
278 schedule: &processorRequestSchedule{
279 dn: dns[name],
280 },
281 }
282 }
283 }()
284 return nil
285}
286
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200287// signal sequences state changes by signals received from runnables and
288// updates a node's status accordingly.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100289func (n *node) signal(signal SignalType) {
290 switch signal {
291 case SignalHealthy:
Serge Bazanskieca8ee32024-07-30 14:32:19 +0000292 if n.state != NodeStateNew {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100293 panic(fmt.Errorf("node %s signaled healthy", n))
294 }
Serge Bazanskieca8ee32024-07-30 14:32:19 +0000295 n.state = NodeStateHealthy
Serge Bazanskicf864da2024-07-31 11:23:34 +0000296 n.sup.metrics.NotifyNodeState(n.dn(), n.state)
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100297 n.bo.Reset()
298 case SignalDone:
Serge Bazanskieca8ee32024-07-30 14:32:19 +0000299 if n.state != NodeStateHealthy {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100300 panic(fmt.Errorf("node %s signaled done", n))
301 }
Jan Schär08c1c722024-12-19 12:03:17 +0100302 if n.signaledDone {
303 panic(fmt.Errorf("node %s signaled done twice", n))
304 }
305 n.signaledDone = true
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100306 n.bo.Reset()
307 }
308}