cloud/bmaas/bmdb/scruffy: initialize, implement BMDB metrics
This creates a new BMaaS component, Scruffy the Janitor.
Scruffy will run a bunch of housekeeping jobs that aren't tied to a
particular provider or even region. Currently Scruffy just collects BMDB
metrics by periodically polling the BMDB SQL database.
Change-Id: Icafa714811757eaaf31fed43184ded8512bde067
Reviewed-on: https://review.monogon.dev/c/monogon/+/1819
Tested-by: Jenkins CI
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
diff --git a/cloud/bmaas/scruffy/bmdb_stats_test.go b/cloud/bmaas/scruffy/bmdb_stats_test.go
new file mode 100644
index 0000000..89d5b5e
--- /dev/null
+++ b/cloud/bmaas/scruffy/bmdb_stats_test.go
@@ -0,0 +1,169 @@
+package scruffy
+
+import (
+ "context"
+ "fmt"
+ "sort"
+ "strings"
+ "testing"
+
+ "github.com/prometheus/client_golang/prometheus"
+
+ "source.monogon.dev/cloud/bmaas/bmdb"
+ "source.monogon.dev/cloud/bmaas/bmdb/model"
+ "source.monogon.dev/cloud/lib/component"
+)
+
+// TestBMDBStats runs the BMDB stats runner against an in-memory BMDB and checks
+// the gauges gathered from its Prometheus registry at three points: before any
+// run, after a run on an empty database, and after a run on a populated one.
+func TestBMDBStats(t *testing.T) {
+	s := Server{
+		Config: Config{
+			BMDB: bmdb.BMDB{
+				Config: bmdb.Config{
+					Database: component.CockroachConfig{
+						InMemory: true,
+					},
+				},
+			},
+		},
+	}
+
+	registry := prometheus.NewRegistry()
+	runner := newBMDBStatsRunner(&s, registry)
+
+	ctx, ctxC := context.WithCancel(context.Background())
+	defer ctxC()
+
+	// expect gathers the registry and compares every gauge against wantValues,
+	// reporting mismatched, unexpected and missing metrics. A family with exactly
+	// one metric is keyed by its bare name; metrics of any other family are keyed
+	// as "name[label=value,...]", with the label pairs sorted.
+	expect := func(wantValues map[string]int64) {
+		t.Helper()
+		res, err := registry.Gather()
+		if err != nil {
+			t.Fatalf("Gather: %v", err)
+		}
+		gotValues := make(map[string]bool)
+		for _, mf := range res {
+			for _, m := range mf.Metric {
+				name := *mf.Name
+				if len(mf.Metric) != 1 {
+					var lvs []string
+					for _, lp := range m.Label {
+						lvs = append(lvs, fmt.Sprintf("%s=%s", *lp.Name, *lp.Value))
+					}
+					sort.Strings(lvs)
+					name = fmt.Sprintf("%s[%s]", *mf.Name, strings.Join(lvs, ","))
+				}
+				gotValues[name] = true
+				want, ok := wantValues[name]
+				if !ok {
+					// Comparing against the zero value would only add noise.
+					t.Errorf("MetricFamily %s: unexpected", name)
+					continue
+				}
+				// Report the full key so that labelled metrics can be told apart.
+				if got := int64(*m.Gauge.Value); want != got {
+					t.Errorf("MetricFamily %s: wanted %d, got %d", name, want, got)
+				}
+			}
+		}
+		for mf := range wantValues {
+			if !gotValues[mf] {
+				t.Errorf("MetricFamily %s: missing", mf)
+			}
+		}
+	}
+
+	// The test expects every metric to be exported with a zero value both before
+	// the first run and after a run against an empty database.
+	zeroes := map[string]int64{
+		"bmdb_stats_machines":                                    0,
+		"bmdb_stats_machines_provided":                           0,
+		"bmdb_stats_machines_heartbeating":                       0,
+		"bmdb_stats_machines_pending_installation":               0,
+		"bmdb_stats_machines_installed":                          0,
+		"bmdb_stats_machines_pending_agent_start":                0,
+		"bmdb_stats_machines_pending_agent_recovery":             0,
+		"bmdb_stats_active_backoffs[process=ShepherdAccess]":     0,
+		"bmdb_stats_active_backoffs[process=ShepherdAgentStart]": 0,
+		"bmdb_stats_active_backoffs[process=ShepherdRecovery]":   0,
+		"bmdb_stats_active_work[process=ShepherdAccess]":         0,
+		"bmdb_stats_active_work[process=ShepherdAgentStart]":     0,
+		"bmdb_stats_active_work[process=ShepherdRecovery]":       0,
+	}
+	expect(zeroes)
+
+	// Open the BMDB and start a session, used further down to insert machines.
+	conn, err := s.Config.BMDB.Open(true)
+	if err != nil {
+		t.Fatalf("Open: %v", err)
+	}
+	sess, err := conn.StartSession(ctx)
+	if err != nil {
+		t.Fatalf("StartSession: %v", err)
+	}
+
+	// Hand the connection to the server and start its session worker before the
+	// first collection. NOTE(review): presumably the runner obtains its BMDB
+	// session through sessionC — confirm against sessionWorker.
+	s.bmdb = conn
+	s.sessionC = make(chan *bmdb.Session)
+	go s.sessionWorker(ctx)
+	if err := runner.runOnce(ctx); err != nil {
+		t.Fatal(err)
+	}
+
+	// Nothing has been inserted yet, so a run must leave every gauge at zero.
+	expect(zeroes)
+
+	// Describe nine machines in assorted states; they are inserted below.
+	f := fill().
+		// Provided, needs installation.
+		machine().providedE("1").build().
+		// Three machines needing recovery.
+		machine().providedE("2").agentNeverHeartbeat().build().
+		machine().providedE("3").agentNeverHeartbeat().build().
+		machine().providedE("4").agentNeverHeartbeat().build().
+		// One machine correctly heartbeating.
+		machine().providedE("5").agentHealthy().build().
+		// Two machines heartbeating and pending installation.
+		machine().providedE("6").agentHealthy().installRequested(10).build().
+		machine().providedE("7").agentHealthy().installRequested(10).installReported(9).build().
+		// Machine which is pending installation _and_ recovery.
+		machine().providedE("8").agentNeverHeartbeat().installRequested(10).build().
+		// Machine which has been successfully installed.
+		machine().providedE("9").agentStoppedHeartbeating().installRequested(10).installReported(10).build()
+
+	err = sess.Transact(ctx, func(q *model.Queries) error {
+		return f(ctx, q)
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Collect again, now that the machines have been committed.
+	if err := runner.runOnce(ctx); err != nil {
+		t.Fatal(err)
+	}
+
+	// The gauges must now reflect the machines inserted above.
+	expect(map[string]int64{
+		"bmdb_stats_machines":                                    9,
+		"bmdb_stats_machines_provided":                           9,
+		"bmdb_stats_machines_heartbeating":                       3,
+		"bmdb_stats_machines_pending_installation":               3,
+		"bmdb_stats_machines_installed":                          1,
+		"bmdb_stats_machines_pending_agent_start":                1,
+		"bmdb_stats_machines_pending_agent_recovery":             4,
+		"bmdb_stats_active_backoffs[process=ShepherdAccess]":     0,
+		"bmdb_stats_active_backoffs[process=ShepherdAgentStart]": 0,
+		"bmdb_stats_active_backoffs[process=ShepherdRecovery]":   0,
+		"bmdb_stats_active_work[process=ShepherdAccess]":         0,
+		"bmdb_stats_active_work[process=ShepherdAgentStart]":     0,
+		"bmdb_stats_active_work[process=ShepherdRecovery]":       0,
+	})
+}