blob: 964642b59ee98ed6ca4c8d869507e4e8b95ed232 [file] [log] [blame]
Serge Bazanski1ebd1e12020-07-13 19:17:16 +02001// Copyright 2020 The Monogon Project Authors.
2//
3// SPDX-License-Identifier: Apache-2.0
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
17package main
18
19import (
Lorenz Brun09c275b2021-03-30 12:47:09 +020020 "bufio"
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020021 "context"
Lorenz Brun09c275b2021-03-30 12:47:09 +020022 "fmt"
23 "io/ioutil"
24 "os"
25 "regexp"
26 "strings"
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020027
28 "google.golang.org/grpc/codes"
29 "google.golang.org/grpc/status"
Serge Bazanskib0272182020-11-02 18:39:44 +010030
Lorenz Brun09c275b2021-03-30 12:47:09 +020031 apb "source.monogon.dev/metropolis/proto/api"
32
Serge Bazanski31370b02021-01-07 16:31:14 +010033 "source.monogon.dev/metropolis/node/core/cluster"
Serge Bazanski31370b02021-01-07 16:31:14 +010034 "source.monogon.dev/metropolis/node/kubernetes"
35 "source.monogon.dev/metropolis/pkg/logtree"
Serge Bazanskib0272182020-11-02 18:39:44 +010036)
37
const (
	// logFilterMax is the maximum number of filters accepted in a single
	// GetLogs request; requests with more filters are rejected with
	// InvalidArgument before any work is done.
	logFilterMax = 1000
)
41
// debugService implements the Metropolis node debug API.
type debugService struct {
	// cluster is the node's cluster manager. NOTE(review): not referenced by
	// the methods visible in this file — presumably used elsewhere; confirm.
	cluster    *cluster.Manager
	// kubernetes is the node's Kubernetes service, which backs
	// GetDebugKubeconfig.
	kubernetes *kubernetes.Service
	// logtree holds this node's logs and is read by GetLogs.
	logtree    *logtree.LogTree
	// traceLock provides exclusive access to the Linux tracing infrastructure (ftrace).
	// This is a channel because Go's mutexes can't be cancelled or be acquired in a non-blocking way.
	traceLock chan struct{}
}
51
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020052func (s *debugService) GetDebugKubeconfig(ctx context.Context, req *apb.GetDebugKubeconfigRequest) (*apb.GetDebugKubeconfigResponse, error) {
53 return s.kubernetes.GetDebugKubeconfig(ctx, req)
54}
55
Serge Bazanskib0272182020-11-02 18:39:44 +010056func (s *debugService) GetLogs(req *apb.GetLogsRequest, srv apb.NodeDebugService_GetLogsServer) error {
57 if len(req.Filters) > logFilterMax {
58 return status.Errorf(codes.InvalidArgument, "requested %d filters, maximum permitted is %d", len(req.Filters), logFilterMax)
59 }
60 dn := logtree.DN(req.Dn)
61 _, err := dn.Path()
62 switch err {
63 case nil:
64 case logtree.ErrInvalidDN:
65 return status.Errorf(codes.InvalidArgument, "invalid DN")
66 default:
67 return status.Errorf(codes.Unavailable, "could not parse DN: %v", err)
68 }
69
70 var options []logtree.LogReadOption
71
72 // Turn backlog mode into logtree option(s).
73 switch req.BacklogMode {
74 case apb.GetLogsRequest_BACKLOG_DISABLE:
75 case apb.GetLogsRequest_BACKLOG_ALL:
76 options = append(options, logtree.WithBacklog(logtree.BacklogAllAvailable))
77 case apb.GetLogsRequest_BACKLOG_COUNT:
78 count := int(req.BacklogCount)
79 if count <= 0 {
80 return status.Errorf(codes.InvalidArgument, "backlog_count must be > 0 if backlog_mode is BACKLOG_COUNT")
81 }
82 options = append(options, logtree.WithBacklog(count))
83 default:
84 return status.Errorf(codes.InvalidArgument, "unknown backlog_mode %d", req.BacklogMode)
85 }
86
87 // Turn stream mode into logtree option(s).
88 streamEnable := false
89 switch req.StreamMode {
90 case apb.GetLogsRequest_STREAM_DISABLE:
91 case apb.GetLogsRequest_STREAM_UNBUFFERED:
92 streamEnable = true
93 options = append(options, logtree.WithStream())
94 }
95
96 // Parse proto filters into logtree options.
97 for i, filter := range req.Filters {
98 switch inner := filter.Filter.(type) {
99 case *apb.LogFilter_WithChildren_:
100 options = append(options, logtree.WithChildren())
101 case *apb.LogFilter_OnlyRaw_:
102 options = append(options, logtree.OnlyRaw())
103 case *apb.LogFilter_OnlyLeveled_:
104 options = append(options, logtree.OnlyLeveled())
105 case *apb.LogFilter_LeveledWithMinimumSeverity_:
106 severity, err := logtree.SeverityFromProto(inner.LeveledWithMinimumSeverity.Minimum)
107 if err != nil {
108 return status.Errorf(codes.InvalidArgument, "filter %d has invalid severity: %v", i, err)
109 }
110 options = append(options, logtree.LeveledWithMinimumSeverity(severity))
111 }
112 }
113
114 reader, err := s.logtree.Read(logtree.DN(req.Dn), options...)
115 switch err {
116 case nil:
117 case logtree.ErrRawAndLeveled:
118 return status.Errorf(codes.InvalidArgument, "requested only raw and only leveled logs simultaneously")
119 default:
120 return status.Errorf(codes.Unavailable, "could not retrieve logs: %v", err)
121 }
122 defer reader.Close()
123
124 // Default protobuf message size limit is 64MB. We want to limit ourselves
125 // to 10MB.
126 // Currently each raw log line can be at most 1024 unicode codepoints (or
127 // 4096 bytes). To cover extra metadata and proto overhead, let's round
128 // this up to 4500 bytes. This in turn means we can store a maximum of
129 // (10e6/4500) == 2222 entries.
130 // Currently each leveled log line can also be at most 1024 unicode
131 // codepoints (or 4096 bytes). To cover extra metadata and proto overhead
132 // let's round this up to 2000 bytes. This in turn means we can store a
133 // maximum of (10e6/5000) == 2000 entries.
134 // The lowever of these numbers, ie the worst case scenario, is 2000
135 // maximum entries.
136 maxChunkSize := 2000
137
138 // Serve all backlog entries in chunks.
139 chunk := make([]*apb.LogEntry, 0, maxChunkSize)
140 for _, entry := range reader.Backlog {
141 p := entry.Proto()
142 if p == nil {
143 // TODO(q3k): log this once we have logtree/gRPC compatibility.
144 continue
145 }
146 chunk = append(chunk, p)
147
148 if len(chunk) >= maxChunkSize {
149 err := srv.Send(&apb.GetLogsResponse{
150 BacklogEntries: chunk,
151 })
152 if err != nil {
153 return err
154 }
155 chunk = make([]*apb.LogEntry, 0, maxChunkSize)
156 }
157 }
158
159 // Send last chunk of backlog, if present..
160 if len(chunk) > 0 {
161 err := srv.Send(&apb.GetLogsResponse{
162 BacklogEntries: chunk,
163 })
164 if err != nil {
165 return err
166 }
167 chunk = make([]*apb.LogEntry, 0, maxChunkSize)
168 }
169
170 // Start serving streaming data, if streaming has been requested.
171 if !streamEnable {
172 return nil
173 }
174
175 for {
176 entry, ok := <-reader.Stream
177 if !ok {
178 // Streaming has been ended by logtree - tell the client and return.
179 return status.Error(codes.Unavailable, "log streaming aborted by system")
180 }
181 p := entry.Proto()
182 if p == nil {
183 // TODO(q3k): log this once we have logtree/gRPC compatibility.
184 continue
185 }
186 err := srv.Send(&apb.GetLogsResponse{
187 StreamEntries: []*apb.LogEntry{p},
188 })
189 if err != nil {
190 return err
191 }
192 }
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200193}
Lorenz Brun09c275b2021-03-30 12:47:09 +0200194
// safeTracingPropertyNamesRe validates tracing property names. Names are used
// verbatim in path construction under /sys/kernel/tracing, so anything that
// could traverse out of that directory must be rejected.
var safeTracingPropertyNamesRe = regexp.MustCompile("^[a-z0-9_]+$")

// writeTracingProperty writes value (newline-terminated, as ftrace control
// files expect) to the named property file under /sys/kernel/tracing. It
// returns an error for names failing validation or if the write fails.
func writeTracingProperty(name string, value string) error {
	if safeTracingPropertyNamesRe.MatchString(name) {
		return ioutil.WriteFile("/sys/kernel/tracing/"+name, []byte(value+"\n"), 0)
	}
	return fmt.Errorf("disallowed tracing property name received: \"%v\"", name)
}
204
205func (s *debugService) Trace(req *apb.TraceRequest, srv apb.NodeDebugService_TraceServer) error {
206 // Don't allow more than one trace as the kernel doesn't support this.
207 select {
208 case s.traceLock <- struct{}{}:
209 defer func() {
210 <-s.traceLock
211 }()
212 default:
213 return status.Error(codes.FailedPrecondition, "a trace is already in progress")
214 }
215
216 if len(req.FunctionFilter) == 0 {
217 req.FunctionFilter = []string{"*"} // For reset purposes
218 }
219 if len(req.GraphFunctionFilter) == 0 {
220 req.GraphFunctionFilter = []string{"*"} // For reset purposes
221 }
222
223 defer writeTracingProperty("current_tracer", "nop")
224 if err := writeTracingProperty("current_tracer", req.Tracer); err != nil {
225 return status.Errorf(codes.InvalidArgument, "requested tracer not available: %v", err)
226 }
227
228 if err := writeTracingProperty("set_ftrace_filter", strings.Join(req.FunctionFilter, " ")); err != nil {
229 return status.Errorf(codes.InvalidArgument, "setting ftrace filter failed: %v", err)
230 }
231 if err := writeTracingProperty("set_graph_function", strings.Join(req.GraphFunctionFilter, " ")); err != nil {
232 return status.Errorf(codes.InvalidArgument, "setting graph filter failed: %v", err)
233 }
234 tracePipe, err := os.Open("/sys/kernel/tracing/trace_pipe")
235 if err != nil {
236 return status.Errorf(codes.Unavailable, "cannot open trace output pipe: %v", err)
237 }
238 defer tracePipe.Close()
239
240 defer writeTracingProperty("tracing_on", "0")
241 if err := writeTracingProperty("tracing_on", "1"); err != nil {
242 return status.Errorf(codes.InvalidArgument, "requested tracer not available: %v", err)
243 }
244
245 go func() {
246 <-srv.Context().Done()
247 tracePipe.Close()
248 }()
249
250 eventScanner := bufio.NewScanner(tracePipe)
251 for eventScanner.Scan() {
252 if err := eventScanner.Err(); err != nil {
253 return status.Errorf(codes.Unavailable, "event pipe read error: %v", err)
254 }
255 err := srv.Send(&apb.TraceEvent{
256 RawLine: eventScanner.Text(),
257 })
258 if err != nil {
259 return err
260 }
261 }
262 return nil
263}