blob: e0f7753b81357a03fe42290b61deea3bfe49dedc [file] [log] [blame]
Serge Bazanski1ebd1e12020-07-13 19:17:16 +02001// Copyright 2020 The Monogon Project Authors.
2//
3// SPDX-License-Identifier: Apache-2.0
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
17package main
18
19import (
Lorenz Brun09c275b2021-03-30 12:47:09 +020020 "bufio"
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020021 "context"
Lorenz Brun09c275b2021-03-30 12:47:09 +020022 "fmt"
23 "io/ioutil"
24 "os"
25 "regexp"
26 "strings"
Serge Bazanski1ebd1e12020-07-13 19:17:16 +020027
28 "google.golang.org/grpc/codes"
29 "google.golang.org/grpc/status"
Serge Bazanskib0272182020-11-02 18:39:44 +010030
Lorenz Brun09c275b2021-03-30 12:47:09 +020031 apb "source.monogon.dev/metropolis/proto/api"
32
Serge Bazanski31370b02021-01-07 16:31:14 +010033 "source.monogon.dev/metropolis/node/core/cluster"
Serge Bazanski31370b02021-01-07 16:31:14 +010034 "source.monogon.dev/metropolis/node/kubernetes"
35 "source.monogon.dev/metropolis/pkg/logtree"
Serge Bazanskib0272182020-11-02 18:39:44 +010036)
37
const (
	// logFilterMax caps the number of filters accepted in a single GetLogs
	// request, bounding the per-request work a client can demand.
	logFilterMax = 1000
)
41
// debugService implements the Metropolis node debug API.
type debugService struct {
	// cluster is the node's cluster manager, used by debug endpoints that
	// need cluster-level access.
	cluster *cluster.Manager
	// kubernetes is the node's Kubernetes service; GetDebugKubeconfig
	// delegates to it.
	kubernetes *kubernetes.Service
	// logtree is the node-local log store served by GetLogs.
	logtree *logtree.LogTree
	// traceLock provides exclusive access to the Linux tracing infrastructure
	// (ftrace).
	// This is a channel because Go's mutexes can't be cancelled or be acquired
	// in a non-blocking way. Trace acquires it with a non-blocking send
	// (try-lock) and releases it by receiving the token back.
	traceLock chan struct{}
}
53
// GetDebugKubeconfig returns a kubeconfig for debugging purposes, delegating
// entirely to the node's Kubernetes service.
func (s *debugService) GetDebugKubeconfig(ctx context.Context, req *apb.GetDebugKubeconfigRequest) (*apb.GetDebugKubeconfigResponse, error) {
	return s.kubernetes.GetDebugKubeconfig(ctx, req)
}
57
Serge Bazanskib0272182020-11-02 18:39:44 +010058func (s *debugService) GetLogs(req *apb.GetLogsRequest, srv apb.NodeDebugService_GetLogsServer) error {
59 if len(req.Filters) > logFilterMax {
60 return status.Errorf(codes.InvalidArgument, "requested %d filters, maximum permitted is %d", len(req.Filters), logFilterMax)
61 }
62 dn := logtree.DN(req.Dn)
63 _, err := dn.Path()
64 switch err {
65 case nil:
66 case logtree.ErrInvalidDN:
67 return status.Errorf(codes.InvalidArgument, "invalid DN")
68 default:
69 return status.Errorf(codes.Unavailable, "could not parse DN: %v", err)
70 }
71
72 var options []logtree.LogReadOption
73
74 // Turn backlog mode into logtree option(s).
75 switch req.BacklogMode {
76 case apb.GetLogsRequest_BACKLOG_DISABLE:
77 case apb.GetLogsRequest_BACKLOG_ALL:
78 options = append(options, logtree.WithBacklog(logtree.BacklogAllAvailable))
79 case apb.GetLogsRequest_BACKLOG_COUNT:
80 count := int(req.BacklogCount)
81 if count <= 0 {
82 return status.Errorf(codes.InvalidArgument, "backlog_count must be > 0 if backlog_mode is BACKLOG_COUNT")
83 }
84 options = append(options, logtree.WithBacklog(count))
85 default:
86 return status.Errorf(codes.InvalidArgument, "unknown backlog_mode %d", req.BacklogMode)
87 }
88
89 // Turn stream mode into logtree option(s).
90 streamEnable := false
91 switch req.StreamMode {
92 case apb.GetLogsRequest_STREAM_DISABLE:
93 case apb.GetLogsRequest_STREAM_UNBUFFERED:
94 streamEnable = true
95 options = append(options, logtree.WithStream())
96 }
97
98 // Parse proto filters into logtree options.
99 for i, filter := range req.Filters {
100 switch inner := filter.Filter.(type) {
101 case *apb.LogFilter_WithChildren_:
102 options = append(options, logtree.WithChildren())
103 case *apb.LogFilter_OnlyRaw_:
104 options = append(options, logtree.OnlyRaw())
105 case *apb.LogFilter_OnlyLeveled_:
106 options = append(options, logtree.OnlyLeveled())
107 case *apb.LogFilter_LeveledWithMinimumSeverity_:
108 severity, err := logtree.SeverityFromProto(inner.LeveledWithMinimumSeverity.Minimum)
109 if err != nil {
110 return status.Errorf(codes.InvalidArgument, "filter %d has invalid severity: %v", i, err)
111 }
112 options = append(options, logtree.LeveledWithMinimumSeverity(severity))
113 }
114 }
115
116 reader, err := s.logtree.Read(logtree.DN(req.Dn), options...)
117 switch err {
118 case nil:
119 case logtree.ErrRawAndLeveled:
120 return status.Errorf(codes.InvalidArgument, "requested only raw and only leveled logs simultaneously")
121 default:
122 return status.Errorf(codes.Unavailable, "could not retrieve logs: %v", err)
123 }
124 defer reader.Close()
125
126 // Default protobuf message size limit is 64MB. We want to limit ourselves
127 // to 10MB.
128 // Currently each raw log line can be at most 1024 unicode codepoints (or
129 // 4096 bytes). To cover extra metadata and proto overhead, let's round
130 // this up to 4500 bytes. This in turn means we can store a maximum of
131 // (10e6/4500) == 2222 entries.
132 // Currently each leveled log line can also be at most 1024 unicode
133 // codepoints (or 4096 bytes). To cover extra metadata and proto overhead
134 // let's round this up to 2000 bytes. This in turn means we can store a
135 // maximum of (10e6/5000) == 2000 entries.
136 // The lowever of these numbers, ie the worst case scenario, is 2000
137 // maximum entries.
138 maxChunkSize := 2000
139
140 // Serve all backlog entries in chunks.
141 chunk := make([]*apb.LogEntry, 0, maxChunkSize)
142 for _, entry := range reader.Backlog {
143 p := entry.Proto()
144 if p == nil {
145 // TODO(q3k): log this once we have logtree/gRPC compatibility.
146 continue
147 }
148 chunk = append(chunk, p)
149
150 if len(chunk) >= maxChunkSize {
151 err := srv.Send(&apb.GetLogsResponse{
152 BacklogEntries: chunk,
153 })
154 if err != nil {
155 return err
156 }
157 chunk = make([]*apb.LogEntry, 0, maxChunkSize)
158 }
159 }
160
161 // Send last chunk of backlog, if present..
162 if len(chunk) > 0 {
163 err := srv.Send(&apb.GetLogsResponse{
164 BacklogEntries: chunk,
165 })
166 if err != nil {
167 return err
168 }
169 chunk = make([]*apb.LogEntry, 0, maxChunkSize)
170 }
171
172 // Start serving streaming data, if streaming has been requested.
173 if !streamEnable {
174 return nil
175 }
176
177 for {
178 entry, ok := <-reader.Stream
179 if !ok {
180 // Streaming has been ended by logtree - tell the client and return.
181 return status.Error(codes.Unavailable, "log streaming aborted by system")
182 }
183 p := entry.Proto()
184 if p == nil {
185 // TODO(q3k): log this once we have logtree/gRPC compatibility.
186 continue
187 }
188 err := srv.Send(&apb.GetLogsResponse{
189 StreamEntries: []*apb.LogEntry{p},
190 })
191 if err != nil {
192 return err
193 }
194 }
Serge Bazanski1ebd1e12020-07-13 19:17:16 +0200195}
Lorenz Brun09c275b2021-03-30 12:47:09 +0200196
// Validate property names as they are used in path construction and we really
// don't want a path traversal vulnerability.
var safeTracingPropertyNamesRe = regexp.MustCompile("^[a-z0-9_]+$")

// writeTracingProperty writes value to the ftrace control file named name
// under /sys/kernel/tracing. The name is validated against a conservative
// allowlist pattern to prevent path traversal. A trailing newline is appended,
// as the tracing control files expect line-oriented input. The file mode
// argument is irrelevant (0) since the sysfs file already exists.
func writeTracingProperty(name string, value string) error {
	if !safeTracingPropertyNamesRe.MatchString(name) {
		// %q quotes and escapes the (potentially hostile) rejected name.
		return fmt.Errorf("disallowed tracing property name received: %q", name)
	}
	// NOTE(review): io/ioutil is deprecated in favor of os.WriteFile; kept
	// here because it is the file's only ioutil use and swapping it would
	// orphan the import.
	return ioutil.WriteFile("/sys/kernel/tracing/"+name, []byte(value+"\n"), 0)
}
207
208func (s *debugService) Trace(req *apb.TraceRequest, srv apb.NodeDebugService_TraceServer) error {
209 // Don't allow more than one trace as the kernel doesn't support this.
210 select {
211 case s.traceLock <- struct{}{}:
212 defer func() {
213 <-s.traceLock
214 }()
215 default:
216 return status.Error(codes.FailedPrecondition, "a trace is already in progress")
217 }
218
219 if len(req.FunctionFilter) == 0 {
220 req.FunctionFilter = []string{"*"} // For reset purposes
221 }
222 if len(req.GraphFunctionFilter) == 0 {
223 req.GraphFunctionFilter = []string{"*"} // For reset purposes
224 }
225
226 defer writeTracingProperty("current_tracer", "nop")
227 if err := writeTracingProperty("current_tracer", req.Tracer); err != nil {
228 return status.Errorf(codes.InvalidArgument, "requested tracer not available: %v", err)
229 }
230
231 if err := writeTracingProperty("set_ftrace_filter", strings.Join(req.FunctionFilter, " ")); err != nil {
232 return status.Errorf(codes.InvalidArgument, "setting ftrace filter failed: %v", err)
233 }
234 if err := writeTracingProperty("set_graph_function", strings.Join(req.GraphFunctionFilter, " ")); err != nil {
235 return status.Errorf(codes.InvalidArgument, "setting graph filter failed: %v", err)
236 }
237 tracePipe, err := os.Open("/sys/kernel/tracing/trace_pipe")
238 if err != nil {
239 return status.Errorf(codes.Unavailable, "cannot open trace output pipe: %v", err)
240 }
241 defer tracePipe.Close()
242
243 defer writeTracingProperty("tracing_on", "0")
244 if err := writeTracingProperty("tracing_on", "1"); err != nil {
245 return status.Errorf(codes.InvalidArgument, "requested tracer not available: %v", err)
246 }
247
248 go func() {
249 <-srv.Context().Done()
250 tracePipe.Close()
251 }()
252
253 eventScanner := bufio.NewScanner(tracePipe)
254 for eventScanner.Scan() {
255 if err := eventScanner.Err(); err != nil {
256 return status.Errorf(codes.Unavailable, "event pipe read error: %v", err)
257 }
258 err := srv.Send(&apb.TraceEvent{
259 RawLine: eventScanner.Text(),
260 })
261 if err != nil {
262 return err
263 }
264 }
265 return nil
266}