Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 1 | // Copyright 2020 The Monogon Project Authors. |
| 2 | // |
| 3 | // SPDX-License-Identifier: Apache-2.0 |
| 4 | // |
| 5 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | // you may not use this file except in compliance with the License. |
| 7 | // You may obtain a copy of the License at |
| 8 | // |
| 9 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | // |
| 11 | // Unless required by applicable law or agreed to in writing, software |
| 12 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | // See the License for the specific language governing permissions and |
| 15 | // limitations under the License. |
| 16 | |
| 17 | package supervisor |
| 18 | |
Serge Bazanski | 662b5b3 | 2020-12-21 13:49:00 +0100 | [diff] [blame] | 19 | // The service supervision library allows for writing of reliable, service-style software within a Metropolis node. |
Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 20 | // It builds upon the Erlang/OTP supervision tree system, adapted to be more Go-ish. |
| 21 | // For detailed design see go/supervision. |
| 22 | |
| 23 | import ( |
| 24 | "context" |
Serge Bazanski | c735967 | 2020-10-30 16:38:57 +0100 | [diff] [blame] | 25 | "io" |
Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 26 | "sync" |
| 27 | |
Serge Bazanski | 31370b0 | 2021-01-07 16:31:14 +0100 | [diff] [blame] | 28 | "source.monogon.dev/metropolis/pkg/logtree" |
Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 29 | ) |
| 30 | |
// A Runnable is a function that will be run in a goroutine, and supervised throughout its lifetime. It can in turn
// start more runnables as its children, and those will form part of a supervision tree.
//
// The context passed to a runnable is very important and needs to be handled properly. It will be live (non-errored)
// for as long as the runnable should be running, and canceled (ctx.Err() will be non-nil) when the supervisor wants
// the runnable to exit. This means the context is also perfectly usable for performing any blocking operations.
type Runnable func(ctx context.Context) error
| 37 | |
| 38 | // RunGroup starts a set of runnables as a group. These runnables will run together, and if any one of them quits |
| 39 | // unexpectedly, the result will be canceled and restarted. |
| 40 | // The context here must be an existing Runnable context, and the spawned runnables will run under the node that this |
| 41 | // context represents. |
| 42 | func RunGroup(ctx context.Context, runnables map[string]Runnable) error { |
| 43 | node, unlock := fromContext(ctx) |
| 44 | defer unlock() |
| 45 | return node.runGroup(runnables) |
| 46 | } |
| 47 | |
| 48 | // Run starts a single runnable in its own group. |
| 49 | func Run(ctx context.Context, name string, runnable Runnable) error { |
| 50 | return RunGroup(ctx, map[string]Runnable{ |
| 51 | name: runnable, |
| 52 | }) |
| 53 | } |
| 54 | |
| 55 | // Signal tells the supervisor that the calling runnable has reached a certain state of its lifecycle. All runnables |
| 56 | // should SignalHealthy when they are ready with set up, running other child runnables and are now 'serving'. |
| 57 | func Signal(ctx context.Context, signal SignalType) { |
| 58 | node, unlock := fromContext(ctx) |
| 59 | defer unlock() |
| 60 | node.signal(signal) |
| 61 | } |
| 62 | |
// SignalType is the kind of lifecycle state that a runnable reports to the supervisor through Signal.
type SignalType int

const (
	// SignalHealthy means the runnable is healthy, done with setup, done with spawning more Runnables, and ready to
	// serve in a loop. The runnable needs to check the parent context and ensure that if that context is done, the
	// runnable exits.
	SignalHealthy SignalType = iota
	// SignalDone means the runnable is done - it does not need to run any loop. This is useful for Runnables that
	// only set up other child runnables. This runnable will be restarted if a related failure happens somewhere in
	// the supervision tree.
	SignalDone
)
| 73 | |
// supervisor represents an instance of the supervision system. It keeps track of a supervision tree and a request
// channel to its internal processor goroutine.
type supervisor struct {
	// mu guards the entire state of the supervisor.
	mu sync.RWMutex
	// root is the root node of the supervision tree, named 'root'. It represents the Runnable started with the
	// supervisor.New call.
	root *node
	// logtree is the main logtree exposed to runnables and used internally.
	logtree *logtree.LogTree
	// ilogger is the internal logger logging to "supervisor" in the logtree.
	ilogger logtree.LeveledLogger

	// pReq is the channel over which requests are sent to the lifecycle processor of the supervisor.
	pReq chan *processorRequest

	// propagatePanic, when set, makes the supervisor propagate panics, ie. not catch them.
	propagatePanic bool
}
| 93 | |
// A SupervisorOpt is a runtime configurable option for the supervisor. Options are passed to New, which applies each
// of them to the supervisor being constructed.
type SupervisorOpt func(s *supervisor)
| 96 | |
| 97 | var ( |
| 98 | // WithPropagatePanic prevents the Supervisor from catching panics in runnables and treating them as failures. |
| 99 | // This is useful to enable for testing and local debugging. |
| 100 | WithPropagatePanic = func(s *supervisor) { |
| 101 | s.propagatePanic = true |
| 102 | } |
| 103 | ) |
| 104 | |
Serge Bazanski | c735967 | 2020-10-30 16:38:57 +0100 | [diff] [blame] | 105 | func WithExistingLogtree(lt *logtree.LogTree) SupervisorOpt { |
| 106 | return func(s *supervisor) { |
| 107 | s.logtree = lt |
| 108 | } |
| 109 | } |
| 110 | |
Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 111 | // New creates a new supervisor with its root running the given root runnable. |
| 112 | // The given context can be used to cancel the entire supervision tree. |
Serge Bazanski | c735967 | 2020-10-30 16:38:57 +0100 | [diff] [blame] | 113 | func New(ctx context.Context, rootRunnable Runnable, opts ...SupervisorOpt) *supervisor { |
Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 114 | sup := &supervisor{ |
Serge Bazanski | c735967 | 2020-10-30 16:38:57 +0100 | [diff] [blame] | 115 | logtree: logtree.New(), |
Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 116 | pReq: make(chan *processorRequest), |
| 117 | } |
Serge Bazanski | 19bb412 | 2020-05-04 17:57:50 +0200 | [diff] [blame] | 118 | |
| 119 | for _, o := range opts { |
| 120 | o(sup) |
| 121 | } |
| 122 | |
Serge Bazanski | c735967 | 2020-10-30 16:38:57 +0100 | [diff] [blame] | 123 | sup.ilogger = sup.logtree.MustLeveledFor("supervisor") |
Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 124 | sup.root = newNode("root", rootRunnable, sup, nil) |
| 125 | |
| 126 | go sup.processor(ctx) |
| 127 | |
| 128 | sup.pReq <- &processorRequest{ |
| 129 | schedule: &processorRequestSchedule{dn: "root"}, |
| 130 | } |
Serge Bazanski | ac6b644 | 2020-05-06 19:13:43 +0200 | [diff] [blame] | 131 | |
| 132 | return sup |
Serge Bazanski | 9c09c4e | 2020-03-24 13:58:01 +0100 | [diff] [blame] | 133 | } |
Serge Bazanski | c735967 | 2020-10-30 16:38:57 +0100 | [diff] [blame] | 134 | |
| 135 | func Logger(ctx context.Context) logtree.LeveledLogger { |
| 136 | node, unlock := fromContext(ctx) |
| 137 | defer unlock() |
| 138 | return node.sup.logtree.MustLeveledFor(logtree.DN(node.dn())) |
| 139 | } |
| 140 | |
| 141 | func RawLogger(ctx context.Context) io.Writer { |
| 142 | node, unlock := fromContext(ctx) |
| 143 | defer unlock() |
| 144 | return node.sup.logtree.MustRawFor(logtree.DN(node.dn())) |
| 145 | } |