blob: 5570945bc0d4bba3e9ad19a92f66fdfc8f97f38a [file] [log] [blame]
Serge Bazanski9c09c4e2020-03-24 13:58:01 +01001// Copyright 2020 The Monogon Project Authors.
2//
3// SPDX-License-Identifier: Apache-2.0
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
17package supervisor
18
Serge Bazanski216fe7b2021-05-21 18:36:16 +020019// The service supervision library allows for writing of reliable,
20// service-style software within a Metropolis node. It builds upon the
21// Erlang/OTP supervision tree system, adapted to be more Go-ish. For detailed
22// design see go/supervision.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010023
24import (
25 "context"
Serge Bazanski26d52252022-02-07 15:57:54 +010026 "fmt"
Serge Bazanskic7359672020-10-30 16:38:57 +010027 "io"
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010028 "sync"
29
Serge Bazanski3c5d0632024-09-12 10:49:12 +000030 "source.monogon.dev/go/logging"
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020031 "source.monogon.dev/osbase/logtree"
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010032)
33
// A Runnable is a function that will be run in a goroutine, and supervised
// throughout its lifetime. It can in turn start more runnables as its
// children, and those will form part of a supervision tree.
//
// The context passed to a runnable is very important and needs to be handled
// properly. It will be live (non-errored) as long as the runnable should be
// running, and canceled (ctx.Err() will be non-nil) when the supervisor wants
// it to exit. This means this context is also perfectly usable for performing
// any blocking operations.
type Runnable func(ctx context.Context) error
43
Serge Bazanski216fe7b2021-05-21 18:36:16 +020044// RunGroup starts a set of runnables as a group. These runnables will run
45// together, and if any one of them quits unexpectedly, the result will be
46// canceled and restarted.
47// The context here must be an existing Runnable context, and the spawned
48// runnables will run under the node that this context represents.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010049func RunGroup(ctx context.Context, runnables map[string]Runnable) error {
50 node, unlock := fromContext(ctx)
51 defer unlock()
52 return node.runGroup(runnables)
53}
54
55// Run starts a single runnable in its own group.
56func Run(ctx context.Context, name string, runnable Runnable) error {
57 return RunGroup(ctx, map[string]Runnable{
58 name: runnable,
59 })
60}
61
Serge Bazanski216fe7b2021-05-21 18:36:16 +020062// Signal tells the supervisor that the calling runnable has reached a certain
63// state of its lifecycle. All runnables should SignalHealthy when they are
64// ready with set up, running other child runnables and are now 'serving'.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010065func Signal(ctx context.Context, signal SignalType) {
66 node, unlock := fromContext(ctx)
67 defer unlock()
68 node.signal(signal)
69}
70
// SignalType is the kind of lifecycle notification a runnable passes to
// Signal.
type SignalType int

const (
	// SignalHealthy means the runnable is healthy, done with setup, done with
	// spawning more Runnables, and ready to serve in a loop. The runnable
	// needs to check the parent context and ensure that if that context is
	// done, the runnable exits.
	SignalHealthy SignalType = iota
	// SignalDone means the runnable is done - it does not need to run any
	// loop. This is useful for Runnables that only set up other child
	// runnables. This runnable will be restarted if a related failure happens
	// somewhere in the supervision tree.
	SignalDone
)
85
// supervisor represents an instance of the supervision system. It keeps track
// of a supervision tree and a request channel to its internal processor
// goroutine.
type supervisor struct {
	// mu guards the entire state of the supervisor.
	mu sync.RWMutex
	// root is the root node of the supervision tree, named 'root'. It
	// represents the Runnable started with the supervisor.New call.
	root *node
	// logtree is the main logtree exposed to runnables and used internally.
	logtree *logtree.LogTree
	// ilogger is the internal logger logging to "supervisor" in the logtree.
	ilogger logging.Leveled

	// pReq is an interface channel to the lifecycle processor of the
	// supervisor.
	pReq chan *processorRequest

	// propagatePanic, when set, makes the supervisor propagate panics, ie.
	// not catch them. It is enabled by the WithPropagatePanic option.
	propagatePanic bool

	// metrics fans out to every Metrics implementation registered through
	// the WithMetrics option.
	metrics *metricsFanout
}
109
// SupervisorOpt is a runtime configurable option for the supervisor. Options
// are passed to New, which calls each of them on the supervisor being built.
type SupervisorOpt func(s *supervisor)
112
Tim Windelschmidtae076612024-04-08 21:31:29 +0200113// WithPropagatePanic prevents the Supervisor from catching panics in
114// runnables and treating them as failures. This is useful to enable for
115// testing and local debugging.
116func WithPropagatePanic(s *supervisor) {
117 s.propagatePanic = true
118}
Serge Bazanski19bb4122020-05-04 17:57:50 +0200119
Serge Bazanskic7359672020-10-30 16:38:57 +0100120func WithExistingLogtree(lt *logtree.LogTree) SupervisorOpt {
121 return func(s *supervisor) {
122 s.logtree = lt
123 }
124}
125
Serge Bazanskicf864da2024-07-31 11:23:34 +0000126// WithMetrics makes the Supervisor export per-DN metrics into a given Metrics
127// implementation. This can be called repeatedly to export the same data into
128// multiple Metrics implementations.
129func WithMetrics(m Metrics) SupervisorOpt {
130 return func(s *supervisor) {
131 s.metrics.sub = append(s.metrics.sub, m)
132 }
133}
134
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100135// New creates a new supervisor with its root running the given root runnable.
136// The given context can be used to cancel the entire supervision tree.
Serge Bazanskif8a8e652021-07-06 16:23:43 +0200137//
138// For tests, we reccomend using TestHarness instead, which will also stream
139// logs to stderr and take care of propagating root runnable errors to the test
140// output.
Serge Bazanskic7359672020-10-30 16:38:57 +0100141func New(ctx context.Context, rootRunnable Runnable, opts ...SupervisorOpt) *supervisor {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100142 sup := &supervisor{
Serge Bazanskic7359672020-10-30 16:38:57 +0100143 logtree: logtree.New(),
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100144 pReq: make(chan *processorRequest),
Serge Bazanskicf864da2024-07-31 11:23:34 +0000145 metrics: &metricsFanout{},
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100146 }
Serge Bazanski19bb4122020-05-04 17:57:50 +0200147
148 for _, o := range opts {
149 o(sup)
150 }
151
Serge Bazanskic7359672020-10-30 16:38:57 +0100152 sup.ilogger = sup.logtree.MustLeveledFor("supervisor")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100153 sup.root = newNode("root", rootRunnable, sup, nil)
154
155 go sup.processor(ctx)
156
157 sup.pReq <- &processorRequest{
158 schedule: &processorRequestSchedule{dn: "root"},
159 }
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200160
161 return sup
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100162}
Serge Bazanskic7359672020-10-30 16:38:57 +0100163
Serge Bazanski3c5d0632024-09-12 10:49:12 +0000164func Logger(ctx context.Context) logging.Leveled {
Serge Bazanskic7359672020-10-30 16:38:57 +0100165 node, unlock := fromContext(ctx)
166 defer unlock()
167 return node.sup.logtree.MustLeveledFor(logtree.DN(node.dn()))
168}
169
170func RawLogger(ctx context.Context) io.Writer {
171 node, unlock := fromContext(ctx)
172 defer unlock()
173 return node.sup.logtree.MustRawFor(logtree.DN(node.dn()))
174}
Serge Bazanski26d52252022-02-07 15:57:54 +0100175
176// SubLogger returns a LeveledLogger for a given name. The name is used to
177// placed that logger within the logtree hierarchy. For example, if the
178// runnable `root.foo` requests a SubLogger for name `bar`, the returned logger
179// will log to `root.foo.bar` in the logging tree.
180//
181// An error is returned if the given name is invalid or conflicts with a child
182// runnable of the current runnable. In addition, whenever a node uses a
183// sub-logger with a given name, that name also becomes unavailable for use as
184// a child runnable (no runnable and sub-logger may ever log into the same
185// logtree DN).
Serge Bazanski3c5d0632024-09-12 10:49:12 +0000186func SubLogger(ctx context.Context, name string) (logging.Leveled, error) {
Serge Bazanski26d52252022-02-07 15:57:54 +0100187 node, unlock := fromContext(ctx)
188 defer unlock()
189
190 if _, ok := node.children[name]; ok {
191 return nil, fmt.Errorf("name %q already in use by child runnable", name)
192 }
193 if !reNodeName.MatchString(name) {
194 return nil, fmt.Errorf("sub-logger name %q is invalid", name)
195 }
196 node.reserved[name] = true
197
198 dn := fmt.Sprintf("%s.%s", node.dn(), name)
199 return node.sup.logtree.LeveledFor(logtree.DN(dn))
200}
Serge Bazanski5a637b02022-02-18 12:18:04 +0100201
202// MustSubLogger is a wrapper around SubLogger which panics on error. Errors
203// should only happen due to invalid names, so as long as the given name is
204// compile-time constant and valid, this function is safe to use.
Serge Bazanski3c5d0632024-09-12 10:49:12 +0000205func MustSubLogger(ctx context.Context, name string) logging.Leveled {
Serge Bazanski5a637b02022-02-18 12:18:04 +0100206 l, err := SubLogger(ctx, name)
207 if err != nil {
208 panic(err)
209 }
210 return l
211}