cloud/shepherd: add equinix API metrics
This adds the following signals to our interaction with the Equinix API:
1. Latency
2. Traffic
3. Errors
4. Saturation
Change-Id: Ic2d5e36a7a26ab906ac1c2fa6741ebf86b9e551a
Reviewed-on: https://review.monogon.dev/c/monogon/+/1606
Tested-by: Jenkins CI
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
diff --git a/cloud/shepherd/equinix/wrapngo/metrics.go b/cloud/shepherd/equinix/wrapngo/metrics.go
new file mode 100644
index 0000000..0f4cc94
--- /dev/null
+++ b/cloud/shepherd/equinix/wrapngo/metrics.go
@@ -0,0 +1,129 @@
+package wrapngo
+
+import (
+ "context"
+ "errors"
+ "fmt"
+ "net/http"
+ "regexp"
+ "strings"
+ "time"
+
+ "github.com/prometheus/client_golang/prometheus"
+ "k8s.io/klog/v2"
+)
+
+// metricsSet contains all the Prometheus metrics collected by wrapngo.
+type metricsSet struct {
+ requestLatencies *prometheus.HistogramVec // request latency in seconds, labelled by endpoint and status_code
+ waiting prometheus.GaugeFunc // requests not yet sent to Equinix, waiting on the serializer's semaphore
+ inFlight prometheus.GaugeFunc // requests currently being processed by Equinix
+}
+
+// newMetricsSet builds the metrics exposed by wrapngo. Both gauges are backed
+// by ser.stats(), whose two results are read as (in flight, waiting); the
+// gauge callbacks call it again each time their value is requested.
+func newMetricsSet(ser *serializer) *metricsSet {
+	return &metricsSet{
+		requestLatencies: prometheus.NewHistogramVec(prometheus.HistogramOpts{
+			Name: "equinix_api_latency",
+			Help: "Equinix API request latency in seconds, partitioned by endpoint and status code",
+		}, []string{"endpoint", "status_code"}),
+		waiting: prometheus.NewGaugeFunc(
+			prometheus.GaugeOpts{
+				Name: "equinix_api_waiting",
+				Help: "Number of API requests pending to be sent to Equinix but waiting on semaphore",
+			},
+			func() float64 {
+				_, waiting := ser.stats()
+				return float64(waiting)
+			},
+		),
+		inFlight: prometheus.NewGaugeFunc(
+			prometheus.GaugeOpts{
+				Name: "equinix_api_in_flight",
+				Help: "Number of API requests currently being processed by Equinix",
+			},
+			func() float64 {
+				inFlight, _ := ser.stats()
+				return float64(inFlight)
+			},
+		),
+	}
+}
+
+// getEndpointForPath maps an Equinix API method and path (eg.
+// /metal/v1/devices/deadbeef) onto an 'endpoint' name: an imaginary,
+// Monogon-specific label for the API endpoint a request was made against.
+//
+// The /metal/v1 prefix is stripped before the path is tested against
+// endpointNames. When nothing matches, a warning is logged and 'Unknown' is
+// returned.
+//
+// Request statistics are partitioned per endpoint rather than per high-level
+// packngo function because a single packngo call may emit several HTTP API
+// requests, so statistics are gathered from the low-level requests instead.
+func getEndpointForPath(method, path string) string {
+	relative := strings.TrimPrefix(path, "/metal/v1")
+	for endpoint, rm := range endpointNames {
+		if !rm.matches(method, relative) {
+			continue
+		}
+		return endpoint
+	}
+	klog.Warningf("Unknown Equinix API %s %s - cannot determine metric endpoint name", method, relative)
+	return "Unknown"
+}
+
+// requestMatch pairs an HTTP method with a pattern for the request path; a
+// request is identified only when it satisfies both.
+type requestMatch struct {
+	method string         // HTTP method, compared for exact equality
+	regexp *regexp.Regexp // pattern applied to the path given to matches
+}
+
+// matches reports whether method equals r.method and path is matched by
+// r.regexp. The pattern is not evaluated when the method differs.
+func (r *requestMatch) matches(method, path string) bool {
+	return r.method == method && r.regexp.MatchString(path)
+}
+
+// endpointNames maps metric endpoint names to the method and /metal/v1-relative
+// path of the request they label; project-scoped paths are plural (/projects/).
+var endpointNames = map[string]requestMatch{
+	"GetDevice":           {"GET", regexp.MustCompile(`^/devices/[^/]+$`)},
+	"ListDevices":         {"GET", regexp.MustCompile(`^/(organizations|projects)/[^/]+/devices$`)},
+	"CreateDevice":        {"POST", regexp.MustCompile(`^/projects/[^/]+/devices$`)},
+	"ListReservations":    {"GET", regexp.MustCompile(`^/projects/[^/]+/hardware-reservations$`)},
+	"ListSSHKeys":         {"GET", regexp.MustCompile(`^/ssh-keys$`)},
+	"CreateSSHKey":        {"POST", regexp.MustCompile(`^/projects/[^/]+/ssh-keys$`)},
+	"GetSSHKey":           {"GET", regexp.MustCompile(`^/ssh-keys/[^/]+$`)},
+	"UpdateSSHKey":        {"PATCH", regexp.MustCompile(`^/ssh-keys/[^/]+$`)},
+	"PerformDeviceAction": {"POST", regexp.MustCompile(`^/devices/[^/]+/actions$`)},
+}
+
+// onAPIRequestDone is called by the wrapngo code on every API response from
+// Equinix, and records the given parameters into metrics. The status_code
+// label is the HTTP status on success, a context error name, or "unknown".
+func (m *metricsSet) onAPIRequestDone(req *http.Request, res *http.Response, err error, latency time.Duration) {
+	if m == nil {
+		return
+	}
+
+	code := "unknown"
+	switch {
+	case errors.Is(err, context.Canceled):
+		code = "ctx canceled"
+	case errors.Is(err, context.DeadlineExceeded):
+		code = "deadline exceeded"
+	case err == nil && res != nil:
+		code = fmt.Sprintf("%d", res.StatusCode)
+	}
+	if code == "unknown" {
+		// Log the error itself: it is what explains an unclassified result.
+		klog.Warningf("Unexpected HTTP result: req %s %s, error: %v", req.Method, req.URL.Path, err)
+	}
+
+	endpoint := getEndpointForPath(req.Method, req.URL.Path)
+	m.requestLatencies.With(prometheus.Labels{"endpoint": endpoint, "status_code": code}).Observe(latency.Seconds())
+}