| Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 1 | // Copyright 2020 The Monogon Project Authors. |
| 2 | // |
| 3 | // SPDX-License-Identifier: Apache-2.0 |
| 4 | // |
| 5 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | // you may not use this file except in compliance with the License. |
| 7 | // You may obtain a copy of the License at |
| 8 | // |
| 9 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | // |
| 11 | // Unless required by applicable law or agreed to in writing, software |
| 12 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | // See the License for the specific language governing permissions and |
| 15 | // limitations under the License. |
| 16 | |
| 17 | package supervisor |
| 18 | |
| Serge Bazanski | 216fe7b | 2021-05-21 18:36:16 +0200 | [diff] [blame] | 19 | // The service supervision library allows for writing of reliable, |
| 20 | // service-style software within a Metropolis node. It builds upon the |
| 21 | // Erlang/OTP supervision tree system, adapted to be more Go-ish. For detailed |
| 22 | // design see go/supervision. |
| Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 23 | |
| 24 | import ( |
| 25 | "context" |
| Serge Bazanski | 26d5225 | 2022-02-07 15:57:54 +0100 | [diff] [blame] | 26 | "fmt" |
| Serge Bazanski | c735967 | 2020-10-30 16:38:57 +0100 | [diff] [blame] | 27 | "io" |
| Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 28 | "sync" |
| 29 | |
| Serge Bazanski | 3c5d063 | 2024-09-12 10:49:12 +0000 | [diff] [blame^] | 30 | "source.monogon.dev/go/logging" |
| Tim Windelschmidt | 9f21f53 | 2024-05-07 15:14:20 +0200 | [diff] [blame] | 31 | "source.monogon.dev/osbase/logtree" |
| Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 32 | ) |
| 33 | |
// Runnable is the unit of work managed by the supervisor: a function that is
// run in its own goroutine and supervised throughout its lifetime. A Runnable
// may in turn start more runnables as its children, and those become part of
// the supervision tree.
//
// The context passed to a Runnable must be handled properly. It stays live
// (ctx.Err() is nil) for as long as the runnable should keep running, and is
// canceled (ctx.Err() is non-nil) once the supervisor wants the runnable to
// exit. This makes the context suitable for any blocking operation performed
// by the runnable.
type Runnable func(ctx context.Context) error
| 43 | |
| Serge Bazanski | 216fe7b | 2021-05-21 18:36:16 +0200 | [diff] [blame] | 44 | // RunGroup starts a set of runnables as a group. These runnables will run |
| 45 | // together, and if any one of them quits unexpectedly, the result will be |
| 46 | // canceled and restarted. |
| 47 | // The context here must be an existing Runnable context, and the spawned |
| 48 | // runnables will run under the node that this context represents. |
| Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 49 | func RunGroup(ctx context.Context, runnables map[string]Runnable) error { |
| 50 | node, unlock := fromContext(ctx) |
| 51 | defer unlock() |
| 52 | return node.runGroup(runnables) |
| 53 | } |
| 54 | |
| 55 | // Run starts a single runnable in its own group. |
| 56 | func Run(ctx context.Context, name string, runnable Runnable) error { |
| 57 | return RunGroup(ctx, map[string]Runnable{ |
| 58 | name: runnable, |
| 59 | }) |
| 60 | } |
| 61 | |
| Serge Bazanski | 216fe7b | 2021-05-21 18:36:16 +0200 | [diff] [blame] | 62 | // Signal tells the supervisor that the calling runnable has reached a certain |
| 63 | // state of its lifecycle. All runnables should SignalHealthy when they are |
| 64 | // ready with set up, running other child runnables and are now 'serving'. |
| Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 65 | func Signal(ctx context.Context, signal SignalType) { |
| 66 | node, unlock := fromContext(ctx) |
| 67 | defer unlock() |
| 68 | node.signal(signal) |
| 69 | } |
| 70 | |
// SignalType is a lifecycle state that a runnable reports to the supervisor
// by calling Signal.
type SignalType int

const (
	// SignalHealthy means the runnable is healthy: it is done with setup,
	// done with spawning more Runnables, and ready to serve in a loop. The
	// runnable needs to check the parent context and exit once that context
	// is done.
	SignalHealthy SignalType = iota
	// SignalDone means the runnable is done and does not need to run any
	// loop. This is useful for Runnables that only set up other child
	// runnables. Such a runnable will be restarted if a related failure
	// happens somewhere in the supervision tree.
	SignalDone
)
| 85 | |
| Serge Bazanski | 216fe7b | 2021-05-21 18:36:16 +0200 | [diff] [blame] | 86 | // supervisor represents and instance of the supervision system. It keeps track |
| 87 | // of a supervision tree and a request channel to its internal processor |
| 88 | // goroutine. |
| Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 89 | type supervisor struct { |
| 90 | // mu guards the entire state of the supervisor. |
| 91 | mu sync.RWMutex |
| Serge Bazanski | 216fe7b | 2021-05-21 18:36:16 +0200 | [diff] [blame] | 92 | // root is the root node of the supervision tree, named 'root'. It |
| 93 | // represents the Runnable started with the supervisor.New call. |
| Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 94 | root *node |
| Serge Bazanski | c735967 | 2020-10-30 16:38:57 +0100 | [diff] [blame] | 95 | // logtree is the main logtree exposed to runnables and used internally. |
| 96 | logtree *logtree.LogTree |
| 97 | // ilogger is the internal logger logging to "supervisor" in the logtree. |
| Serge Bazanski | 3c5d063 | 2024-09-12 10:49:12 +0000 | [diff] [blame^] | 98 | ilogger logging.Leveled |
| Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 99 | |
| Serge Bazanski | 216fe7b | 2021-05-21 18:36:16 +0200 | [diff] [blame] | 100 | // pReq is an interface channel to the lifecycle processor of the |
| 101 | // supervisor. |
| Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 102 | pReq chan *processorRequest |
| Serge Bazanski | 19bb412 | 2020-05-04 17:57:50 +0200 | [diff] [blame] | 103 | |
| 104 | // propagate panics, ie. don't catch them. |
| 105 | propagatePanic bool |
| Serge Bazanski | cf864da | 2024-07-31 11:23:34 +0000 | [diff] [blame] | 106 | |
| 107 | metrics *metricsFanout |
| Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 108 | } |
| 109 | |
| Serge Bazanski | 19bb412 | 2020-05-04 17:57:50 +0200 | [diff] [blame] | 110 | // SupervisorOpt are runtime configurable options for the supervisor. |
| 111 | type SupervisorOpt func(s *supervisor) |
| 112 | |
| Tim Windelschmidt | ae07661 | 2024-04-08 21:31:29 +0200 | [diff] [blame] | 113 | // WithPropagatePanic prevents the Supervisor from catching panics in |
| 114 | // runnables and treating them as failures. This is useful to enable for |
| 115 | // testing and local debugging. |
| 116 | func WithPropagatePanic(s *supervisor) { |
| 117 | s.propagatePanic = true |
| 118 | } |
| Serge Bazanski | 19bb412 | 2020-05-04 17:57:50 +0200 | [diff] [blame] | 119 | |
| Serge Bazanski | c735967 | 2020-10-30 16:38:57 +0100 | [diff] [blame] | 120 | func WithExistingLogtree(lt *logtree.LogTree) SupervisorOpt { |
| 121 | return func(s *supervisor) { |
| 122 | s.logtree = lt |
| 123 | } |
| 124 | } |
| 125 | |
| Serge Bazanski | cf864da | 2024-07-31 11:23:34 +0000 | [diff] [blame] | 126 | // WithMetrics makes the Supervisor export per-DN metrics into a given Metrics |
| 127 | // implementation. This can be called repeatedly to export the same data into |
| 128 | // multiple Metrics implementations. |
| 129 | func WithMetrics(m Metrics) SupervisorOpt { |
| 130 | return func(s *supervisor) { |
| 131 | s.metrics.sub = append(s.metrics.sub, m) |
| 132 | } |
| 133 | } |
| 134 | |
| Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 135 | // New creates a new supervisor with its root running the given root runnable. |
| 136 | // The given context can be used to cancel the entire supervision tree. |
| Serge Bazanski | f8a8e65 | 2021-07-06 16:23:43 +0200 | [diff] [blame] | 137 | // |
| 138 | // For tests, we reccomend using TestHarness instead, which will also stream |
| 139 | // logs to stderr and take care of propagating root runnable errors to the test |
| 140 | // output. |
| Serge Bazanski | c735967 | 2020-10-30 16:38:57 +0100 | [diff] [blame] | 141 | func New(ctx context.Context, rootRunnable Runnable, opts ...SupervisorOpt) *supervisor { |
| Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 142 | sup := &supervisor{ |
| Serge Bazanski | c735967 | 2020-10-30 16:38:57 +0100 | [diff] [blame] | 143 | logtree: logtree.New(), |
| Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 144 | pReq: make(chan *processorRequest), |
| Serge Bazanski | cf864da | 2024-07-31 11:23:34 +0000 | [diff] [blame] | 145 | metrics: &metricsFanout{}, |
| Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 146 | } |
| Serge Bazanski | 19bb412 | 2020-05-04 17:57:50 +0200 | [diff] [blame] | 147 | |
| 148 | for _, o := range opts { |
| 149 | o(sup) |
| 150 | } |
| 151 | |
| Serge Bazanski | c735967 | 2020-10-30 16:38:57 +0100 | [diff] [blame] | 152 | sup.ilogger = sup.logtree.MustLeveledFor("supervisor") |
| Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 153 | sup.root = newNode("root", rootRunnable, sup, nil) |
| 154 | |
| 155 | go sup.processor(ctx) |
| 156 | |
| 157 | sup.pReq <- &processorRequest{ |
| 158 | schedule: &processorRequestSchedule{dn: "root"}, |
| 159 | } |
| Serge Bazanski | ac6b644 | 2020-05-06 19:13:43 +0200 | [diff] [blame] | 160 | |
| 161 | return sup |
| Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 162 | } |
| Serge Bazanski | c735967 | 2020-10-30 16:38:57 +0100 | [diff] [blame] | 163 | |
| Serge Bazanski | 3c5d063 | 2024-09-12 10:49:12 +0000 | [diff] [blame^] | 164 | func Logger(ctx context.Context) logging.Leveled { |
| Serge Bazanski | c735967 | 2020-10-30 16:38:57 +0100 | [diff] [blame] | 165 | node, unlock := fromContext(ctx) |
| 166 | defer unlock() |
| 167 | return node.sup.logtree.MustLeveledFor(logtree.DN(node.dn())) |
| 168 | } |
| 169 | |
| 170 | func RawLogger(ctx context.Context) io.Writer { |
| 171 | node, unlock := fromContext(ctx) |
| 172 | defer unlock() |
| 173 | return node.sup.logtree.MustRawFor(logtree.DN(node.dn())) |
| 174 | } |
| Serge Bazanski | 26d5225 | 2022-02-07 15:57:54 +0100 | [diff] [blame] | 175 | |
| 176 | // SubLogger returns a LeveledLogger for a given name. The name is used to |
| 177 | // placed that logger within the logtree hierarchy. For example, if the |
| 178 | // runnable `root.foo` requests a SubLogger for name `bar`, the returned logger |
| 179 | // will log to `root.foo.bar` in the logging tree. |
| 180 | // |
| 181 | // An error is returned if the given name is invalid or conflicts with a child |
| 182 | // runnable of the current runnable. In addition, whenever a node uses a |
| 183 | // sub-logger with a given name, that name also becomes unavailable for use as |
| 184 | // a child runnable (no runnable and sub-logger may ever log into the same |
| 185 | // logtree DN). |
| Serge Bazanski | 3c5d063 | 2024-09-12 10:49:12 +0000 | [diff] [blame^] | 186 | func SubLogger(ctx context.Context, name string) (logging.Leveled, error) { |
| Serge Bazanski | 26d5225 | 2022-02-07 15:57:54 +0100 | [diff] [blame] | 187 | node, unlock := fromContext(ctx) |
| 188 | defer unlock() |
| 189 | |
| 190 | if _, ok := node.children[name]; ok { |
| 191 | return nil, fmt.Errorf("name %q already in use by child runnable", name) |
| 192 | } |
| 193 | if !reNodeName.MatchString(name) { |
| 194 | return nil, fmt.Errorf("sub-logger name %q is invalid", name) |
| 195 | } |
| 196 | node.reserved[name] = true |
| 197 | |
| 198 | dn := fmt.Sprintf("%s.%s", node.dn(), name) |
| 199 | return node.sup.logtree.LeveledFor(logtree.DN(dn)) |
| 200 | } |
| Serge Bazanski | 5a637b0 | 2022-02-18 12:18:04 +0100 | [diff] [blame] | 201 | |
| 202 | // MustSubLogger is a wrapper around SubLogger which panics on error. Errors |
| 203 | // should only happen due to invalid names, so as long as the given name is |
| 204 | // compile-time constant and valid, this function is safe to use. |
| Serge Bazanski | 3c5d063 | 2024-09-12 10:49:12 +0000 | [diff] [blame^] | 205 | func MustSubLogger(ctx context.Context, name string) logging.Leveled { |
| Serge Bazanski | 5a637b0 | 2022-02-18 12:18:04 +0100 | [diff] [blame] | 206 | l, err := SubLogger(ctx, name) |
| 207 | if err != nil { |
| 208 | panic(err) |
| 209 | } |
| 210 | return l |
| 211 | } |