blob: e7b5d34004d20e272f5cf119240b37e9340c7d75 [file] [log] [blame]
Serge Bazanski9c09c4e2020-03-24 13:58:01 +01001// Copyright 2020 The Monogon Project Authors.
2//
3// SPDX-License-Identifier: Apache-2.0
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
17package supervisor
18
Serge Bazanski216fe7b2021-05-21 18:36:16 +020019// Supporting infrastructure to allow running some non-Go payloads under
20// supervision.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010021
22import (
23 "context"
Lorenz Brune3032bd2023-04-13 14:43:41 +020024 "errors"
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010025 "net"
Lorenz Brune3032bd2023-04-13 14:43:41 +020026 "os"
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010027 "os/exec"
Serge Bazanski216fe7b2021-05-21 18:36:16 +020028
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010029 "google.golang.org/grpc"
Serge Bazanski96043bc2021-10-05 12:10:13 +020030
31 "source.monogon.dev/metropolis/pkg/logtree"
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010032)
33
Serge Bazanski216fe7b2021-05-21 18:36:16 +020034// GRPCServer creates a Runnable that serves gRPC requests as longs as it's not
35// canceled.
36// If graceful is set to true, the server will be gracefully stopped instead of
37// plain stopped. This means all pending RPCs will finish, but also requires
38// streaming gRPC handlers to check their context liveliness and exit
39// accordingly. If the server code does not support this, `graceful` should be
40// false and the server will be killed violently instead.
Serge Bazanski9c09c4e2020-03-24 13:58:01 +010041func GRPCServer(srv *grpc.Server, lis net.Listener, graceful bool) Runnable {
42 return func(ctx context.Context) error {
43 Signal(ctx, SignalHealthy)
44 errC := make(chan error)
45 go func() {
46 errC <- srv.Serve(lis)
47 }()
48 select {
49 case <-ctx.Done():
50 if graceful {
51 srv.GracefulStop()
52 } else {
53 srv.Stop()
54 }
55 return ctx.Err()
56 case err := <-errC:
57 return err
58 }
59 }
60}
61
Serge Bazanski216fe7b2021-05-21 18:36:16 +020062// RunCommand will create a Runnable that starts a long-running command, whose
63// exit is determined to be a failure.
Serge Bazanski05604292021-03-12 17:47:21 +010064func RunCommand(ctx context.Context, cmd *exec.Cmd, opts ...RunCommandOption) error {
Serge Bazanski967be212020-11-02 11:26:59 +010065 Signal(ctx, SignalHealthy)
Serge Bazanski05604292021-03-12 17:47:21 +010066
67 var parseKLog bool
Lorenz Brune3032bd2023-04-13 14:43:41 +020068 var signal <-chan os.Signal
Serge Bazanski05604292021-03-12 17:47:21 +010069 for _, opt := range opts {
70 if opt.parseKlog {
71 parseKLog = true
72 }
Lorenz Brune3032bd2023-04-13 14:43:41 +020073 if opt.signal != nil {
74 signal = opt.signal
75 }
Serge Bazanski05604292021-03-12 17:47:21 +010076 }
77
78 if parseKLog {
79 // We make two klogs, one for each of stdout/stderr. This is to prevent
80 // accidental interleaving of both.
81 klogStdout := logtree.KLogParser(Logger(ctx))
82 defer klogStdout.Close()
83 klogStderr := logtree.KLogParser(Logger(ctx))
84 defer klogStderr.Close()
85
86 cmd.Stdout = klogStdout
87 cmd.Stderr = klogStderr
88 } else {
89 cmd.Stdout = RawLogger(ctx)
90 cmd.Stderr = RawLogger(ctx)
91 }
Lorenz Brune3032bd2023-04-13 14:43:41 +020092 err := cmd.Start()
93 if err != nil {
94 return err
95 }
96
97 exited := make(chan struct{})
98 if signal != nil {
99 go func() {
100 for {
101 var err error
102 select {
103 case s := <-signal:
104 err = cmd.Process.Signal(s)
105 case <-exited:
106 return
107 }
108 if err != nil && !errors.Is(err, os.ErrProcessDone) {
109 Logger(ctx).Warningf("Failed sending signal to process: %v", err)
110 }
111 }
112 }()
113 }
114
115 err = cmd.Wait()
116 if signal != nil {
117 exited <- struct{}{}
118 }
Serge Bazanski967be212020-11-02 11:26:59 +0100119 Logger(ctx).Infof("Command returned: %v", err)
120 return err
Serge Bazanski9c09c4e2020-03-24 13:58:01 +0100121}
Serge Bazanski05604292021-03-12 17:47:21 +0100122
// RunCommandOption is an option that can be passed to RunCommand to alter how
// the command is run. Options are built using ParseKLog and SignalChan.
type RunCommandOption struct {
	// parseKlog, if true, makes RunCommand route the command's stdout and
	// stderr through klog parsers instead of the raw logger.
	parseKlog bool
	// signal, if non-nil, is a channel of signals that RunCommand forwards to
	// the process while it is running.
	signal <-chan os.Signal
}
127
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200128// ParseKLog signals that the command being run will return klog-compatible
129// logs to stdout and/or stderr, and these will be re-interpreted as structured
Serge Bazanski05604292021-03-12 17:47:21 +0100130// logging and emitted to the supervisor's logger.
131func ParseKLog() RunCommandOption {
132 return RunCommandOption{
133 parseKlog: true,
134 }
135}
Lorenz Brune3032bd2023-04-13 14:43:41 +0200136
137// SignalChan takes a channel which can be used to send signals to the
138// supervised process.
139//
140// The given channel will be read from as long as the underlying process is
141// running. If the process doesn't start successfully the channel will not be
142// read. When the process exits, the channel will stop being read.
143//
144// With the above in mind, and also taking into account the inherent lack of
145// reliability in delivering any process-handled signals in POSIX/Linux, it is
146// recommended to use unbuffered channels, always write to them in a non-blocking
147// fashion (eg. in a select { ... default: } block), and to not rely only on the
148// signal delivery mechanism for the intended behaviour.
149//
150// For example, if the signals are used to trigger some configuration reload,
151// these configuration reloads should either be verified and signal delivery should
152// be retried until confirmed successful, or there should be a backup periodic
153// reload performed by the target process independently of signal-based reload
154// triggers.
155//
156// Another example: if the signal delivered is a SIGTERM used to gracefully
157// terminate some process, it should be attempted to be delivered a number of
158// times before finally SIGKILLing the process.
159func SignalChan(s <-chan os.Signal) RunCommandOption {
160 return RunCommandOption{
161 signal: s,
162 }
163}