blob: f26732d2d273ac8fd217c83de4379cd9a15f2522 [file] [log] [blame]
Serge Bazanski9c09c4e2020-03-24 13:58:01 +01001// Copyright 2020 The Monogon Project Authors.
2//
3// SPDX-License-Identifier: Apache-2.0
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
17package supervisor
18
// The service supervision library allows for writing of reliable,
// service-style software within a Metropolis node. It builds upon the
// Erlang/OTP supervision tree system, adapted to be more Go-ish. For detailed
// design see go/supervision.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010023
24import (
25 "context"
Serge Bazanskic7359672020-10-30 16:38:57 +010026 "io"
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010027 "sync"
28
Serge Bazanski31370b02021-01-07 16:31:14 +010029 "source.monogon.dev/metropolis/pkg/logtree"
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010030)
31
// Runnable is the unit of work managed by the supervisor: a function that is
// run in its own goroutine and supervised for as long as it lives. A Runnable
// may itself start further runnables as children, which then become part of
// the supervision tree.
//
// Handling the passed context correctly is essential. The context stays live
// (ctx.Err() == nil) for as long as the runnable is expected to run, and is
// canceled (ctx.Err() != nil) once the supervisor wants the runnable to exit.
// It is therefore suitable for use in any blocking operation the runnable
// performs.
type Runnable func(ctx context.Context) error
41
Serge Bazanski216fe7b2021-05-21 18:36:16 +020042// RunGroup starts a set of runnables as a group. These runnables will run
43// together, and if any one of them quits unexpectedly, the result will be
44// canceled and restarted.
45// The context here must be an existing Runnable context, and the spawned
46// runnables will run under the node that this context represents.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010047func RunGroup(ctx context.Context, runnables map[string]Runnable) error {
48 node, unlock := fromContext(ctx)
49 defer unlock()
50 return node.runGroup(runnables)
51}
52
53// Run starts a single runnable in its own group.
54func Run(ctx context.Context, name string, runnable Runnable) error {
55 return RunGroup(ctx, map[string]Runnable{
56 name: runnable,
57 })
58}
59
Serge Bazanski216fe7b2021-05-21 18:36:16 +020060// Signal tells the supervisor that the calling runnable has reached a certain
61// state of its lifecycle. All runnables should SignalHealthy when they are
62// ready with set up, running other child runnables and are now 'serving'.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010063func Signal(ctx context.Context, signal SignalType) {
64 node, unlock := fromContext(ctx)
65 defer unlock()
66 node.signal(signal)
67}
68
// SignalType enumerates the lifecycle states that a runnable can report to
// the supervisor via Signal.
type SignalType int

const (
	// SignalHealthy reports that the runnable is healthy: it is done with
	// setup, done with spawning more runnables, and ready to serve in a loop.
	// After sending it, the runnable must keep watching its context and exit
	// once that context is done.
	SignalHealthy SignalType = iota
	// SignalDone reports that the runnable is finished and has no loop to
	// run. This is useful for runnables whose only job is to set up other
	// child runnables. Such a runnable is still restarted if a related
	// failure happens somewhere in the supervision tree.
	SignalDone
)
83
Serge Bazanski216fe7b2021-05-21 18:36:16 +020084// supervisor represents and instance of the supervision system. It keeps track
85// of a supervision tree and a request channel to its internal processor
86// goroutine.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010087type supervisor struct {
88 // mu guards the entire state of the supervisor.
89 mu sync.RWMutex
Serge Bazanski216fe7b2021-05-21 18:36:16 +020090 // root is the root node of the supervision tree, named 'root'. It
91 // represents the Runnable started with the supervisor.New call.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010092 root *node
Serge Bazanskic7359672020-10-30 16:38:57 +010093 // logtree is the main logtree exposed to runnables and used internally.
94 logtree *logtree.LogTree
95 // ilogger is the internal logger logging to "supervisor" in the logtree.
96 ilogger logtree.LeveledLogger
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010097
Serge Bazanski216fe7b2021-05-21 18:36:16 +020098 // pReq is an interface channel to the lifecycle processor of the
99 // supervisor.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100100 pReq chan *processorRequest
Serge Bazanski19bb4122020-05-04 17:57:50 +0200101
102 // propagate panics, ie. don't catch them.
103 propagatePanic bool
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100104}
105
Serge Bazanski19bb4122020-05-04 17:57:50 +0200106// SupervisorOpt are runtime configurable options for the supervisor.
107type SupervisorOpt func(s *supervisor)
108
109var (
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200110 // WithPropagatePanic prevents the Supervisor from catching panics in
111 // runnables and treating them as failures. This is useful to enable for
112 // testing and local debugging.
Serge Bazanski19bb4122020-05-04 17:57:50 +0200113 WithPropagatePanic = func(s *supervisor) {
114 s.propagatePanic = true
115 }
116)
117
Serge Bazanskic7359672020-10-30 16:38:57 +0100118func WithExistingLogtree(lt *logtree.LogTree) SupervisorOpt {
119 return func(s *supervisor) {
120 s.logtree = lt
121 }
122}
123
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100124// New creates a new supervisor with its root running the given root runnable.
125// The given context can be used to cancel the entire supervision tree.
Serge Bazanskic7359672020-10-30 16:38:57 +0100126func New(ctx context.Context, rootRunnable Runnable, opts ...SupervisorOpt) *supervisor {
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100127 sup := &supervisor{
Serge Bazanskic7359672020-10-30 16:38:57 +0100128 logtree: logtree.New(),
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100129 pReq: make(chan *processorRequest),
130 }
Serge Bazanski19bb4122020-05-04 17:57:50 +0200131
132 for _, o := range opts {
133 o(sup)
134 }
135
Serge Bazanskic7359672020-10-30 16:38:57 +0100136 sup.ilogger = sup.logtree.MustLeveledFor("supervisor")
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100137 sup.root = newNode("root", rootRunnable, sup, nil)
138
139 go sup.processor(ctx)
140
141 sup.pReq <- &processorRequest{
142 schedule: &processorRequestSchedule{dn: "root"},
143 }
Serge Bazanskiac6b6442020-05-06 19:13:43 +0200144
145 return sup
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100146}
Serge Bazanskic7359672020-10-30 16:38:57 +0100147
148func Logger(ctx context.Context) logtree.LeveledLogger {
149 node, unlock := fromContext(ctx)
150 defer unlock()
151 return node.sup.logtree.MustLeveledFor(logtree.DN(node.dn()))
152}
153
154func RawLogger(ctx context.Context) io.Writer {
155 node, unlock := fromContext(ctx)
156 defer unlock()
157 return node.sup.logtree.MustRawFor(logtree.DN(node.dn()))
158}