Add ftrace support to DebugService

This allows us to do ad-hoc kernel-level tracing on a running Metropolis node.
Useful for tracking down complex bugs.

Example: `bazel run //metropolis/cli/dbg -- trace -function_graph_filter blkdev_* function_graph`

Test Plan: Debug utility, manually tested

X-Origin-Diff: phab/D748
GitOrigin-RevId: 924eb795250412a73eb30c0eef4a8c1cc726e5fd
diff --git a/metropolis/cli/dbg/main.go b/metropolis/cli/dbg/main.go
index eb9070f..75685e9 100644
--- a/metropolis/cli/dbg/main.go
+++ b/metropolis/cli/dbg/main.go
@@ -24,6 +24,7 @@
 	"io/ioutil"
 	"math/rand"
 	"os"
+	"strings"
 	"time"
 
 	"github.com/spf13/pflag"
@@ -69,6 +70,14 @@
 		fmt.Fprintf(os.Stderr, "Example:\n  %s %s IPAssigned\n", os.Args[0], os.Args[1])
 	}
 
+	traceCmd := flag.NewFlagSet("trace", flag.ExitOnError)
+	traceCmd.Usage = func() {
+		fmt.Fprintf(os.Stderr, "Usage: %v %v [options] tracer\n", os.Args[0], os.Args[1])
+		flag.PrintDefaults()
+	}
+	functionFilter := traceCmd.String("function_filter", "", "Only trace functions matched by this filter (comma-separated, supports wildcards via *)")
+	functionGraphFilter := traceCmd.String("function_graph_filter", "", "Only trace functions matched by this filter and their children (syntax same as function_filter)")
+
 	switch os.Args[1] {
 	case "logs":
 		logsCmd.Parse(os.Args[2:])
@@ -157,5 +166,37 @@
 		if err := command.Execute(); err != nil {
 			os.Exit(1)
 		}
+	case "trace":
+		traceCmd.Parse(os.Args[2:])
+		tracer := traceCmd.Arg(0)
+		var fgf []string
+		var ff []string
+		if len(*functionGraphFilter) > 0 {
+			fgf = strings.Split(*functionGraphFilter, ",")
+		}
+		if len(*functionFilter) > 0 {
+			ff = strings.Split(*functionFilter, ",")
+		}
+		req := apb.TraceRequest{
+			GraphFunctionFilter: fgf,
+			FunctionFilter:      ff,
+			Tracer:              tracer,
+		}
+		traceEvents, err := debugClient.Trace(ctx, &req)
+		if err != nil {
+			fmt.Fprintf(os.Stderr, "failed to trace: %v", err)
+			os.Exit(1)
+		}
+		for {
+			traceEvent, err := traceEvents.Recv()
+			if err != nil {
+				if err == io.EOF {
+					break
+				}
+				fmt.Fprintf(os.Stderr, "stream aborted unexpectedly: %v", err)
+				os.Exit(1)
+			}
+			fmt.Println(traceEvent.RawLine)
+		}
 	}
 }
diff --git a/metropolis/node/core/debug_service.go b/metropolis/node/core/debug_service.go
index 6dee3d9..964642b 100644
--- a/metropolis/node/core/debug_service.go
+++ b/metropolis/node/core/debug_service.go
@@ -17,15 +17,22 @@
 package main
 
 import (
+	"bufio"
 	"context"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"regexp"
+	"strings"
 
 	"google.golang.org/grpc/codes"
 	"google.golang.org/grpc/status"
 
+	apb "source.monogon.dev/metropolis/proto/api"
+
 	"source.monogon.dev/metropolis/node/core/cluster"
 	"source.monogon.dev/metropolis/node/kubernetes"
 	"source.monogon.dev/metropolis/pkg/logtree"
-	apb "source.monogon.dev/metropolis/proto/api"
 )
 
 const (
@@ -37,6 +44,9 @@
 	cluster    *cluster.Manager
 	kubernetes *kubernetes.Service
 	logtree    *logtree.LogTree
+	// traceLock provides exclusive access to the Linux tracing infrastructure (ftrace)
+	// This is a channel because Go's mutexes can't be cancelled or be acquired in a non-blocking way.
+	traceLock chan struct{}
 }
 
 func (s *debugService) GetDebugKubeconfig(ctx context.Context, req *apb.GetDebugKubeconfigRequest) (*apb.GetDebugKubeconfigResponse, error) {
@@ -181,3 +191,73 @@
 		}
 	}
 }
+
+// Validate property names as they are used in path construction and we really don't want a path traversal vulnerability
+var safeTracingPropertyNamesRe = regexp.MustCompile("^[a-z0-9_]+$")
+
+func writeTracingProperty(name string, value string) error {
+	if !safeTracingPropertyNamesRe.MatchString(name) {
+		return fmt.Errorf("disallowed tracing property name received: \"%v\"", name)
+	}
+	return ioutil.WriteFile("/sys/kernel/tracing/"+name, []byte(value+"\n"), 0)
+}
+
+func (s *debugService) Trace(req *apb.TraceRequest, srv apb.NodeDebugService_TraceServer) error {
+	// Don't allow more than one trace as the kernel doesn't support this.
+	select {
+	case s.traceLock <- struct{}{}:
+		defer func() {
+			<-s.traceLock
+		}()
+	default:
+		return status.Error(codes.FailedPrecondition, "a trace is already in progress")
+	}
+
+	if len(req.FunctionFilter) == 0 {
+		req.FunctionFilter = []string{"*"} // For reset purposes
+	}
+	if len(req.GraphFunctionFilter) == 0 {
+		req.GraphFunctionFilter = []string{"*"} // For reset purposes
+	}
+
+	defer writeTracingProperty("current_tracer", "nop")
+	if err := writeTracingProperty("current_tracer", req.Tracer); err != nil {
+		return status.Errorf(codes.InvalidArgument, "requested tracer not available: %v", err)
+	}
+
+	if err := writeTracingProperty("set_ftrace_filter", strings.Join(req.FunctionFilter, " ")); err != nil {
+		return status.Errorf(codes.InvalidArgument, "setting ftrace filter failed: %v", err)
+	}
+	if err := writeTracingProperty("set_graph_function", strings.Join(req.GraphFunctionFilter, " ")); err != nil {
+		return status.Errorf(codes.InvalidArgument, "setting graph filter failed: %v", err)
+	}
+	tracePipe, err := os.Open("/sys/kernel/tracing/trace_pipe")
+	if err != nil {
+		return status.Errorf(codes.Unavailable, "cannot open trace output pipe: %v", err)
+	}
+	defer tracePipe.Close()
+
+	defer writeTracingProperty("tracing_on", "0")
+	if err := writeTracingProperty("tracing_on", "1"); err != nil {
+		return status.Errorf(codes.InvalidArgument, "requested tracer not available: %v", err)
+	}
+
+	go func() {
+		<-srv.Context().Done()
+		tracePipe.Close()
+	}()
+
+	eventScanner := bufio.NewScanner(tracePipe)
+	for eventScanner.Scan() {
+		if err := eventScanner.Err(); err != nil {
+			return status.Errorf(codes.Unavailable, "event pipe read error: %v", err)
+		}
+		err := srv.Send(&apb.TraceEvent{
+			RawLine: eventScanner.Text(),
+		})
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/metropolis/node/core/main.go b/metropolis/node/core/main.go
index 4b19b7f..442102f 100644
--- a/metropolis/node/core/main.go
+++ b/metropolis/node/core/main.go
@@ -208,6 +208,7 @@
 			cluster:    m,
 			logtree:    lt,
 			kubernetes: kubeSvc,
+			traceLock:  make(chan struct{}, 1),
 		}
 		dbgSrv := grpc.NewServer()
 		apb.RegisterNodeDebugServiceServer(dbgSrv, dbg)
diff --git a/metropolis/node/core/mounts.go b/metropolis/node/core/mounts.go
index c6a626f..7a15c30 100644
--- a/metropolis/node/core/mounts.go
+++ b/metropolis/node/core/mounts.go
@@ -37,6 +37,7 @@
 		flags uintptr
 	}{
 		{"/sys", "sysfs", unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV},
+		{"/sys/kernel/tracing", "tracefs", unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV},
 		{"/proc", "proc", unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV},
 		{"/dev", "devtmpfs", unix.MS_NOEXEC | unix.MS_NOSUID},
 		{"/dev/pts", "devpts", unix.MS_NOEXEC | unix.MS_NOSUID},
diff --git a/metropolis/proto/api/debug.proto b/metropolis/proto/api/debug.proto
index 25e369e..6cbe32b 100644
--- a/metropolis/proto/api/debug.proto
+++ b/metropolis/proto/api/debug.proto
@@ -39,6 +39,9 @@
     //
     // TODO(q3k): move method and its related messages to the non-debug node endpoint once we have one.
     rpc GetLogs(GetLogsRequest) returns (stream GetLogsResponse);
+
+    // Trace enables tracing of Metropolis using the Linux ftrace infrastructure.
+    rpc Trace(TraceRequest) returns (stream TraceEvent);
 }
 
 
@@ -146,3 +149,21 @@
         Raw raw = 3;
     }
 }
+
+message TraceRequest {
+    // Name of the tracer to use. Defined in https://www.kernel.org/doc/html/latest/trace/ftrace.html#the-tracers.
+    // Useful ones enabled in Metropolis: function_graph, function.
+    // Gets reset to nop automatically after the stream is terminated.
+    string tracer = 1;
+
+    // List of functions to trace. Accepts wildcards using the '*' character. If left empty traces all functions.
+    repeated string function_filter = 2;
+
+    // List of functions and their descendants to trace with the function_graph tracer.
+    repeated string graph_function_filter = 3;
+}
+
+message TraceEvent {
+    // Currently we do not parse the event data and just return what the kernel outputs, line-by-line.
+    string raw_line = 1;
+}
\ No newline at end of file
diff --git a/third_party/linux/linux-metropolis.config b/third_party/linux/linux-metropolis.config
index acb41b5..965ef8b 100644
--- a/third_party/linux/linux-metropolis.config
+++ b/third_party/linux/linux-metropolis.config
@@ -121,6 +121,8 @@
 # CONFIG_RCU_EXPERT is not set
 CONFIG_SRCU=y
 CONFIG_TREE_SRCU=y
+CONFIG_TASKS_RCU_GENERIC=y
+CONFIG_TASKS_RUDE_RCU=y
 CONFIG_RCU_STALL_COMMON=y
 CONFIG_RCU_NEED_SEGCBLIST=y
 # end of RCU Subsystem
@@ -245,6 +247,7 @@
 CONFIG_SLUB_CPU_PARTIAL=y
 CONFIG_SYSTEM_DATA_VERIFICATION=y
 # CONFIG_PROFILING is not set
+CONFIG_TRACEPOINTS=y
 # end of General setup
 
 CONFIG_64BIT=y
@@ -652,6 +655,7 @@
 CONFIG_KVM_INTEL=y
 CONFIG_KVM_AMD=y
 CONFIG_KVM_AMD_SEV=y
+# CONFIG_KVM_MMU_AUDIT is not set
 CONFIG_AS_AVX512=y
 CONFIG_AS_SHA1_NI=y
 CONFIG_AS_SHA256_NI=y
@@ -667,6 +671,7 @@
 CONFIG_JUMP_LABEL=y
 # CONFIG_STATIC_KEYS_SELFTEST is not set
 # CONFIG_STATIC_CALL_SELFTEST is not set
+CONFIG_UPROBES=y
 CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y
 CONFIG_ARCH_USE_BUILTIN_BSWAP=y
 CONFIG_USER_RETURN_NOTIFIER=y
@@ -1129,6 +1134,7 @@
 # Network testing
 #
 # CONFIG_NET_PKTGEN is not set
+# CONFIG_NET_DROP_MONITOR is not set
 # end of Network testing
 # end of Networking options
 
@@ -3067,6 +3073,7 @@
 CONFIG_PSTORE_COMPRESS_DEFAULT="zstd"
 # CONFIG_PSTORE_CONSOLE is not set
 # CONFIG_PSTORE_PMSG is not set
+# CONFIG_PSTORE_FTRACE is not set
 # CONFIG_PSTORE_RAM is not set
 CONFIG_PSTORE_ZONE=y
 CONFIG_PSTORE_BLK=y
@@ -3448,6 +3455,8 @@
 # CONFIG_SYSTEM_BLACKLIST_KEYRING is not set
 # end of Certificates for signature checking
 
+CONFIG_BINARY_PRINTF=y
+
 #
 # Library routines
 #
@@ -3604,6 +3613,7 @@
 # CONFIG_DEBUG_PAGEALLOC is not set
 # CONFIG_PAGE_OWNER is not set
 # CONFIG_PAGE_POISONING is not set
+# CONFIG_DEBUG_PAGE_REF is not set
 # CONFIG_DEBUG_RODATA_TEST is not set
 CONFIG_ARCH_HAS_DEBUG_WX=y
 CONFIG_DEBUG_WX=y
@@ -3685,6 +3695,8 @@
 # CONFIG_CSD_LOCK_WAIT_DEBUG is not set
 # end of Lock Debugging (spinlocks, mutexes, etc...)
 
+CONFIG_TRACE_IRQFLAGS=y
+CONFIG_TRACE_IRQFLAGS_NMI=y
 CONFIG_STACKTRACE=y
 # CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set
 # CONFIG_DEBUG_KOBJECT is not set
@@ -3717,6 +3729,7 @@
 # CONFIG_CPU_HOTPLUG_STATE_CONTROL is not set
 # CONFIG_LATENCYTOP is not set
 CONFIG_USER_STACKTRACE_SUPPORT=y
+CONFIG_NOP_TRACER=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
@@ -3726,9 +3739,47 @@
 CONFIG_HAVE_SYSCALL_TRACEPOINTS=y
 CONFIG_HAVE_FENTRY=y
 CONFIG_HAVE_C_RECORDMCOUNT=y
+CONFIG_TRACER_MAX_TRACE=y
 CONFIG_TRACE_CLOCK=y
+CONFIG_RING_BUFFER=y
+CONFIG_EVENT_TRACING=y
+CONFIG_CONTEXT_SWITCH_TRACER=y
+CONFIG_RING_BUFFER_ALLOW_SWAP=y
+CONFIG_PREEMPTIRQ_TRACEPOINTS=y
+CONFIG_TRACING=y
+CONFIG_GENERIC_TRACER=y
 CONFIG_TRACING_SUPPORT=y
-# CONFIG_FTRACE is not set
+CONFIG_FTRACE=y
+# CONFIG_BOOTTIME_TRACING is not set
+CONFIG_FUNCTION_TRACER=y
+CONFIG_FUNCTION_GRAPH_TRACER=y
+CONFIG_DYNAMIC_FTRACE=y
+CONFIG_DYNAMIC_FTRACE_WITH_REGS=y
+CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y
+# CONFIG_FUNCTION_PROFILER is not set
+# CONFIG_STACK_TRACER is not set
+CONFIG_IRQSOFF_TRACER=y
+CONFIG_SCHED_TRACER=y
+CONFIG_HWLAT_TRACER=y
+# CONFIG_MMIOTRACE is not set
+CONFIG_FTRACE_SYSCALLS=y
+CONFIG_TRACER_SNAPSHOT=y
+CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP=y
+CONFIG_BRANCH_PROFILE_NONE=y
+# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+CONFIG_UPROBE_EVENTS=y
+CONFIG_DYNAMIC_EVENTS=y
+CONFIG_PROBE_EVENTS=y
+CONFIG_FTRACE_MCOUNT_RECORD=y
+# CONFIG_SYNTH_EVENTS is not set
+# CONFIG_HIST_TRIGGERS is not set
+# CONFIG_TRACE_EVENT_INJECT is not set
+# CONFIG_TRACEPOINT_BENCHMARK is not set
+# CONFIG_RING_BUFFER_BENCHMARK is not set
+# CONFIG_TRACE_EVAL_MAP_FILE is not set
+# CONFIG_FTRACE_STARTUP_TEST is not set
+# CONFIG_RING_BUFFER_STARTUP_TEST is not set
 # CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set
 # CONFIG_SAMPLES is not set
 CONFIG_ARCH_HAS_DEVMEM_IS_ALLOWED=y