blob: 5163443b1e8ab07663b53f0410cab10cdc64cd25 [file] [log] [blame]
Tim Windelschmidt6d33a432025-02-04 14:34:25 +01001// Copyright The Monogon Project Authors.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +01002// SPDX-License-Identifier: Apache-2.0
Serge Bazanski9c09c4e2020-03-24 13:58:01 +01003
4package supervisor
5
Serge Bazanski216fe7b2021-05-21 18:36:16 +02006// Supporting infrastructure to allow running some non-Go payloads under
7// supervision.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +01008
9import (
10 "context"
Lorenz Brune3032bd2023-04-13 14:43:41 +020011 "errors"
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010012 "net"
Lorenz Brune3032bd2023-04-13 14:43:41 +020013 "os"
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010014 "os/exec"
Serge Bazanski216fe7b2021-05-21 18:36:16 +020015
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010016 "google.golang.org/grpc"
Serge Bazanski96043bc2021-10-05 12:10:13 +020017
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020018 "source.monogon.dev/osbase/logtree"
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010019)
20
Serge Bazanski216fe7b2021-05-21 18:36:16 +020021// GRPCServer creates a Runnable that serves gRPC requests as longs as it's not
22// canceled.
23// If graceful is set to true, the server will be gracefully stopped instead of
24// plain stopped. This means all pending RPCs will finish, but also requires
25// streaming gRPC handlers to check their context liveliness and exit
26// accordingly. If the server code does not support this, `graceful` should be
27// false and the server will be killed violently instead.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010028func GRPCServer(srv *grpc.Server, lis net.Listener, graceful bool) Runnable {
29 return func(ctx context.Context) error {
30 Signal(ctx, SignalHealthy)
Jan Schär23e52302024-03-21 16:50:15 +010031 defer func() {
32 if graceful {
33 srv.GracefulStop()
34 } else {
35 srv.Stop()
36 }
37 }()
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010038 errC := make(chan error)
39 go func() {
40 errC <- srv.Serve(lis)
41 }()
42 select {
43 case <-ctx.Done():
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010044 return ctx.Err()
45 case err := <-errC:
46 return err
47 }
48 }
49}
50
Serge Bazanski216fe7b2021-05-21 18:36:16 +020051// RunCommand will create a Runnable that starts a long-running command, whose
52// exit is determined to be a failure.
Jan Schärbdbb9c22024-12-18 15:14:02 +010053// cmd should be created with [exec.CommandContext] so that it will be killed
54// when the context is canceled.
Serge Bazanski05604292021-03-12 17:47:21 +010055func RunCommand(ctx context.Context, cmd *exec.Cmd, opts ...RunCommandOption) error {
Serge Bazanski967be212020-11-02 11:26:59 +010056 Signal(ctx, SignalHealthy)
Serge Bazanski05604292021-03-12 17:47:21 +010057
58 var parseKLog bool
Lorenz Brune3032bd2023-04-13 14:43:41 +020059 var signal <-chan os.Signal
Serge Bazanski05604292021-03-12 17:47:21 +010060 for _, opt := range opts {
61 if opt.parseKlog {
62 parseKLog = true
63 }
Lorenz Brune3032bd2023-04-13 14:43:41 +020064 if opt.signal != nil {
65 signal = opt.signal
66 }
Serge Bazanski05604292021-03-12 17:47:21 +010067 }
68
69 if parseKLog {
70 // We make two klogs, one for each of stdout/stderr. This is to prevent
71 // accidental interleaving of both.
72 klogStdout := logtree.KLogParser(Logger(ctx))
73 defer klogStdout.Close()
74 klogStderr := logtree.KLogParser(Logger(ctx))
75 defer klogStderr.Close()
76
77 cmd.Stdout = klogStdout
78 cmd.Stderr = klogStderr
79 } else {
80 cmd.Stdout = RawLogger(ctx)
81 cmd.Stderr = RawLogger(ctx)
82 }
Lorenz Brune3032bd2023-04-13 14:43:41 +020083 err := cmd.Start()
84 if err != nil {
85 return err
86 }
87
88 exited := make(chan struct{})
89 if signal != nil {
90 go func() {
91 for {
92 var err error
93 select {
94 case s := <-signal:
95 err = cmd.Process.Signal(s)
96 case <-exited:
97 return
98 }
99 if err != nil && !errors.Is(err, os.ErrProcessDone) {
100 Logger(ctx).Warningf("Failed sending signal to process: %v", err)
101 }
102 }
103 }()
104 }
105
106 err = cmd.Wait()
107 if signal != nil {
108 exited <- struct{}{}
109 }
Serge Bazanski967be212020-11-02 11:26:59 +0100110 Logger(ctx).Infof("Command returned: %v", err)
111 return err
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100112}
Serge Bazanski05604292021-03-12 17:47:21 +0100113
// RunCommandOption is an optional setting accepted by RunCommand. Values are
// obtained from the ParseKLog and SignalChan constructors.
type RunCommandOption struct {
	// parseKlog, when true, makes RunCommand feed the command's stdout and
	// stderr through klog parsers instead of the raw logger.
	parseKlog bool
	// signal, when non-nil, is the channel from which RunCommand reads
	// signals to forward to the running process.
	signal <-chan os.Signal
}
118
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200119// ParseKLog signals that the command being run will return klog-compatible
120// logs to stdout and/or stderr, and these will be re-interpreted as structured
Serge Bazanski05604292021-03-12 17:47:21 +0100121// logging and emitted to the supervisor's logger.
122func ParseKLog() RunCommandOption {
123 return RunCommandOption{
124 parseKlog: true,
125 }
126}
Lorenz Brune3032bd2023-04-13 14:43:41 +0200127
128// SignalChan takes a channel which can be used to send signals to the
129// supervised process.
130//
131// The given channel will be read from as long as the underlying process is
132// running. If the process doesn't start successfully the channel will not be
133// read. When the process exits, the channel will stop being read.
134//
135// With the above in mind, and also taking into account the inherent lack of
136// reliability in delivering any process-handled signals in POSIX/Linux, it is
137// recommended to use unbuffered channels, always write to them in a non-blocking
138// fashion (eg. in a select { ... default: } block), and to not rely only on the
139// signal delivery mechanism for the intended behaviour.
140//
141// For example, if the signals are used to trigger some configuration reload,
142// these configuration reloads should either be verified and signal delivery should
143// be retried until confirmed successful, or there should be a backup periodic
144// reload performed by the target process independently of signal-based reload
145// triggers.
146//
147// Another example: if the signal delivered is a SIGTERM used to gracefully
148// terminate some process, it should be attempted to be delivered a number of
149// times before finally SIGKILLing the process.
150func SignalChan(s <-chan os.Signal) RunCommandOption {
151 return RunCommandOption{
152 signal: s,
153 }
154}