blob: 45c5996f9e5334372ae918777e504d329a224fcb [file] [log] [blame]
Tim Windelschmidt6d33a432025-02-04 14:34:25 +01001// Copyright The Monogon Project Authors.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +01002// SPDX-License-Identifier: Apache-2.0
Serge Bazanski9c09c4e2020-03-24 13:58:01 +01003
4package supervisor
5
// The service supervision library allows for writing of reliable,
// service-style software within a Metropolis node. It builds upon the
// Erlang/OTP supervision tree system, adapted to be more Go-ish. For detailed
// design see go/supervision.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010010
11import (
12 "context"
Serge Bazanski26d52252022-02-07 15:57:54 +010013 "fmt"
Serge Bazanskic7359672020-10-30 16:38:57 +010014 "io"
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010015 "sync"
16
Serge Bazanski3c5d0632024-09-12 10:49:12 +000017 "source.monogon.dev/go/logging"
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020018 "source.monogon.dev/osbase/logtree"
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010019)
20
// A Runnable is a function that will be run in a goroutine, and supervised
// throughout its lifetime. It can in turn start more runnables as its
// children, and those will form part of a supervision tree.
//
// The context passed to a runnable is very important and needs to be handled
// properly. It will be live (non-errored) as long as the runnable should be
// running, and canceled (ctx.Err() will be non-nil) when the supervisor wants
// it to exit. This means this context is also perfectly usable for performing
// any blocking operations.
type Runnable func(ctx context.Context) error
30
Serge Bazanski216fe7b2021-05-21 18:36:16 +020031// RunGroup starts a set of runnables as a group. These runnables will run
32// together, and if any one of them quits unexpectedly, the result will be
33// canceled and restarted.
34// The context here must be an existing Runnable context, and the spawned
35// runnables will run under the node that this context represents.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010036func RunGroup(ctx context.Context, runnables map[string]Runnable) error {
37 node, unlock := fromContext(ctx)
38 defer unlock()
39 return node.runGroup(runnables)
40}
41
42// Run starts a single runnable in its own group.
43func Run(ctx context.Context, name string, runnable Runnable) error {
44 return RunGroup(ctx, map[string]Runnable{
45 name: runnable,
46 })
47}
48
Serge Bazanski216fe7b2021-05-21 18:36:16 +020049// Signal tells the supervisor that the calling runnable has reached a certain
50// state of its lifecycle. All runnables should SignalHealthy when they are
51// ready with set up, running other child runnables and are now 'serving'.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010052func Signal(ctx context.Context, signal SignalType) {
53 node, unlock := fromContext(ctx)
54 defer unlock()
55 node.signal(signal)
56}
57
// SignalType is the kind of lifecycle state a runnable reports to the
// supervisor through Signal.
type SignalType int

const (
	// SignalHealthy means the runnable is healthy, done with setup, done with
	// spawning more Runnables, and ready to serve in a loop. The runnable
	// needs to check the parent context and ensure that if that context is
	// done, the runnable exits.
	SignalHealthy SignalType = iota
	// SignalDone means the runnable is done - it does not need to run any
	// loop. This is useful for Runnables that only set up other child
	// runnables. This runnable will be restarted if a related failure happens
	// somewhere in the supervision tree.
	SignalDone
)
72
Serge Bazanski216fe7b2021-05-21 18:36:16 +020073// supervisor represents and instance of the supervision system. It keeps track
74// of a supervision tree and a request channel to its internal processor
75// goroutine.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010076type supervisor struct {
77 // mu guards the entire state of the supervisor.
78 mu sync.RWMutex
Serge Bazanski216fe7b2021-05-21 18:36:16 +020079 // root is the root node of the supervision tree, named 'root'. It
80 // represents the Runnable started with the supervisor.New call.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010081 root *node
Serge Bazanskic7359672020-10-30 16:38:57 +010082 // logtree is the main logtree exposed to runnables and used internally.
83 logtree *logtree.LogTree
84 // ilogger is the internal logger logging to "supervisor" in the logtree.
Serge Bazanski3c5d0632024-09-12 10:49:12 +000085 ilogger logging.Leveled
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010086
Serge Bazanski216fe7b2021-05-21 18:36:16 +020087 // pReq is an interface channel to the lifecycle processor of the
88 // supervisor.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010089 pReq chan *processorRequest
Serge Bazanski19bb4122020-05-04 17:57:50 +020090
91 // propagate panics, ie. don't catch them.
92 propagatePanic bool
Serge Bazanskicf864da2024-07-31 11:23:34 +000093
94 metrics *metricsFanout
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010095}
96
Serge Bazanski19bb4122020-05-04 17:57:50 +020097// SupervisorOpt are runtime configurable options for the supervisor.
98type SupervisorOpt func(s *supervisor)
99
Tim Windelschmidtae076612024-04-08 21:31:29 +0200100// WithPropagatePanic prevents the Supervisor from catching panics in
101// runnables and treating them as failures. This is useful to enable for
102// testing and local debugging.
103func WithPropagatePanic(s *supervisor) {
104 s.propagatePanic = true
105}
Serge Bazanski19bb4122020-05-04 17:57:50 +0200106
Serge Bazanskic7359672020-10-30 16:38:57 +0100107func WithExistingLogtree(lt *logtree.LogTree) SupervisorOpt {
108 return func(s *supervisor) {
109 s.logtree = lt
110 }
111}
112
Serge Bazanskicf864da2024-07-31 11:23:34 +0000113// WithMetrics makes the Supervisor export per-DN metrics into a given Metrics
114// implementation. This can be called repeatedly to export the same data into
115// multiple Metrics implementations.
116func WithMetrics(m Metrics) SupervisorOpt {
117 return func(s *supervisor) {
118 s.metrics.sub = append(s.metrics.sub, m)
119 }
120}
121
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100122// New creates a new supervisor with its root running the given root runnable.
123// The given context can be used to cancel the entire supervision tree.
Serge Bazanskif8a8e652021-07-06 16:23:43 +0200124//
125// For tests, we reccomend using TestHarness instead, which will also stream
126// logs to stderr and take care of propagating root runnable errors to the test
127// output.
Serge Bazanskic7359672020-10-30 16:38:57 +0100128func New(ctx context.Context, rootRunnable Runnable, opts ...SupervisorOpt) *supervisor {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100129 sup := &supervisor{
Serge Bazanskic7359672020-10-30 16:38:57 +0100130 logtree: logtree.New(),
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100131 pReq: make(chan *processorRequest),
Serge Bazanskicf864da2024-07-31 11:23:34 +0000132 metrics: &metricsFanout{},
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100133 }
Serge Bazanski19bb4122020-05-04 17:57:50 +0200134
135 for _, o := range opts {
136 o(sup)
137 }
138
Serge Bazanskic7359672020-10-30 16:38:57 +0100139 sup.ilogger = sup.logtree.MustLeveledFor("supervisor")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100140 sup.root = newNode("root", rootRunnable, sup, nil)
141
142 go sup.processor(ctx)
143
144 sup.pReq <- &processorRequest{
145 schedule: &processorRequestSchedule{dn: "root"},
146 }
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200147
148 return sup
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100149}
Serge Bazanskic7359672020-10-30 16:38:57 +0100150
Serge Bazanski3c5d0632024-09-12 10:49:12 +0000151func Logger(ctx context.Context) logging.Leveled {
Serge Bazanskic7359672020-10-30 16:38:57 +0100152 node, unlock := fromContext(ctx)
153 defer unlock()
154 return node.sup.logtree.MustLeveledFor(logtree.DN(node.dn()))
155}
156
157func RawLogger(ctx context.Context) io.Writer {
158 node, unlock := fromContext(ctx)
159 defer unlock()
160 return node.sup.logtree.MustRawFor(logtree.DN(node.dn()))
161}
Serge Bazanski26d52252022-02-07 15:57:54 +0100162
163// SubLogger returns a LeveledLogger for a given name. The name is used to
164// placed that logger within the logtree hierarchy. For example, if the
165// runnable `root.foo` requests a SubLogger for name `bar`, the returned logger
166// will log to `root.foo.bar` in the logging tree.
167//
168// An error is returned if the given name is invalid or conflicts with a child
169// runnable of the current runnable. In addition, whenever a node uses a
170// sub-logger with a given name, that name also becomes unavailable for use as
171// a child runnable (no runnable and sub-logger may ever log into the same
172// logtree DN).
Serge Bazanski3c5d0632024-09-12 10:49:12 +0000173func SubLogger(ctx context.Context, name string) (logging.Leveled, error) {
Serge Bazanski26d52252022-02-07 15:57:54 +0100174 node, unlock := fromContext(ctx)
175 defer unlock()
176
177 if _, ok := node.children[name]; ok {
178 return nil, fmt.Errorf("name %q already in use by child runnable", name)
179 }
180 if !reNodeName.MatchString(name) {
181 return nil, fmt.Errorf("sub-logger name %q is invalid", name)
182 }
183 node.reserved[name] = true
184
185 dn := fmt.Sprintf("%s.%s", node.dn(), name)
186 return node.sup.logtree.LeveledFor(logtree.DN(dn))
187}
Serge Bazanski5a637b02022-02-18 12:18:04 +0100188
189// MustSubLogger is a wrapper around SubLogger which panics on error. Errors
190// should only happen due to invalid names, so as long as the given name is
191// compile-time constant and valid, this function is safe to use.
Serge Bazanski3c5d0632024-09-12 10:49:12 +0000192func MustSubLogger(ctx context.Context, name string) logging.Leveled {
Serge Bazanski5a637b02022-02-18 12:18:04 +0100193 l, err := SubLogger(ctx, name)
194 if err != nil {
195 panic(err)
196 }
197 return l
198}
Tim Windelschmidt35cd44b2024-12-16 02:44:11 +0100199
200// LogTree returns the LogTree used by this supervisor instance. This should
201// only be used for reading logs. For writing logs use SubLogger instead.
202func LogTree(ctx context.Context) *logtree.LogTree {
203 node, unlock := fromContext(ctx)
204 defer unlock()
205 return node.sup.logtree
206}