cloud/bmaas/bmdb/scruffy: initialize, implement BMDB metrics
This creates a new BMaaS component, Scruffy the Janitor.
Scruffy will run a bunch of housekeeping jobs that aren't tied to a
particular provider or even region. Currently Scruffy just collects BMDB
metrics by periodically polling the BMDB SQL database.
Change-Id: Icafa714811757eaaf31fed43184ded8512bde067
Reviewed-on: https://review.monogon.dev/c/monogon/+/1819
Tested-by: Jenkins CI
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
diff --git a/cloud/bmaas/scruffy/server.go b/cloud/bmaas/scruffy/server.go
new file mode 100644
index 0000000..58e0f51
--- /dev/null
+++ b/cloud/bmaas/scruffy/server.go
@@ -0,0 +1,127 @@
+// Package scruffy implements Scruffy, The Janitor.
+//
+// Scruffy is a BMaaS component which runs a bunch of important, housekeeping-ish
+// processes that aren't tied to any particular provider and are mostly
+// batch-oriented.
+//
+// Currently Scruffy just collects metrics from the BMDB.
+package scruffy
+
+import (
+	"context"
+	"errors"
+	"flag"
+	"fmt"
+	"os"
+	"time"
+
+	"github.com/cenkalti/backoff/v4"
+	"k8s.io/klog/v2"
+
+	"source.monogon.dev/cloud/bmaas/bmdb"
+	"source.monogon.dev/cloud/bmaas/bmdb/metrics"
+	"source.monogon.dev/cloud/bmaas/bmdb/webug"
+	"source.monogon.dev/cloud/lib/component"
+)
+
+// Config is the configuration of the Scruffy server. RegisterFlags fills in
+// the BMDB identification fields and binds the remaining settings to flags.
+type Config struct {
+ // Component is the shared component configuration; its flags are
+ // registered under the "scruffy" name.
+ Component component.ComponentConfig
+ // BMDB is the BMDB client configuration from which Start opens the
+ // database connection.
+ BMDB bmdb.BMDB
+ // Webug is the configuration of webug, which Start launches with the
+ // opened BMDB connection.
+ Webug webug.Config
+
+ // StatsRunnerRate is how often the stats collection loop runs against
+ // BMDB. It is bound to the scruffy_stats_collection_rate flag and defaults
+ // to one minute.
+ StatsRunnerRate time.Duration
+}
+
+// runtimeInfo returns a short description of where this process is running,
+// in the form "host <hostname>". If os.Hostname yields an empty name, the
+// placeholder "UNKNOWN" is reported instead.
+//
+// TODO(q3k): factor this out to BMDB library?
+func runtimeInfo() string {
+	// The error is dropped on purpose: only an empty name triggers the
+	// placeholder.
+	host, _ := os.Hostname()
+	if host != "" {
+		return fmt.Sprintf("host %s", host)
+	}
+	return fmt.Sprintf("host %s", "UNKNOWN")
+}
+
+// RegisterFlags prepares the configuration for flag parsing. It asks the
+// component, BMDB database and webug sub-configurations to register their
+// flags, identifies this process to BMDB as the "scruffy" component together
+// with its runtime info, and binds StatsRunnerRate to the
+// scruffy_stats_collection_rate flag on the default flag set.
+func (c *Config) RegisterFlags() {
+	c.Component.RegisterFlags("scruffy")
+
+	db := &c.BMDB
+	db.ComponentName = "scruffy"
+	db.RuntimeInfo = runtimeInfo()
+	db.Database.RegisterFlags("bmdb")
+
+	c.Webug.RegisterFlags()
+
+	flag.DurationVar(
+		&c.StatsRunnerRate,
+		"scruffy_stats_collection_rate",
+		time.Minute,
+		"How often the stats collection loop will run against BMDB",
+	)
+}
+
+// Server is the Scruffy server. Populate Config, then call Start to run it.
+type Server struct {
+ // Config is the configuration the server runs with.
+ Config Config
+
+ // bmdb is the BMDB connection opened by Start.
+ bmdb *bmdb.Connection
+ // sessionC carries the current BMDB session from sessionWorker to callers
+ // of session. It is created by Start.
+ sessionC chan *bmdb.Session
+}
+
+// Start brings up the Scruffy server.
+//
+// It enables BMDB metrics on the component's Prometheus registry, starts
+// Prometheus, opens the BMDB connection and then launches the session worker,
+// the BMDB and HW stats runners and webug. Each of those runs in its own
+// goroutine, is handed ctx, and is not waited for.
+//
+// A failure to connect to BMDB, or a webug failure that is not caused by ctx
+// ending, terminates the process via klog.Exitf.
+func (s *Server) Start(ctx context.Context) {
+	reg := s.Config.Component.PrometheusRegistry()
+	s.Config.BMDB.EnableMetrics(reg)
+	s.Config.Component.StartPrometheus(ctx)
+
+	conn, err := s.Config.BMDB.Open(true)
+	if err != nil {
+		klog.Exitf("Failed to connect to BMDB: %v", err)
+	}
+	s.bmdb = conn
+	s.sessionC = make(chan *bmdb.Session)
+	go s.sessionWorker(ctx)
+
+	bsr := newBMDBStatsRunner(s, reg)
+	go bsr.run(ctx)
+
+	hwr := newHWStatsRunner(s, reg)
+	go hwr.run(ctx)
+
+	go func() {
+		err := s.Config.Webug.Start(ctx, conn)
+		// An error that stems from ctx ending is a regular shutdown, not a
+		// failure. errors.Is also recognizes the context error when it has
+		// been wrapped, which a plain != comparison would miss.
+		if err != nil && !errors.Is(err, ctx.Err()) {
+			klog.Exitf("Failed to start webug: %v", err)
+		}
+	}()
+}
+
+// sessionWorker emits a valid BMDB session to sessionC as long as ctx is active.
+//
+// Whenever there is no session yet, or the current one has expired, a new one
+// is started, retrying with exponential backoff. If the retries are given up
+// on while ctx is still active, the process exits via klog.Exitf. If ctx has
+// ended, the worker simply returns.
+//
+// TODO(q3k): factor out into bmdb client lib
+func (s *Server) sessionWorker(ctx context.Context) {
+	var session *bmdb.Session
+	for {
+		if session == nil || session.Expired() {
+			klog.Infof("Starting new session...")
+			bo := backoff.NewExponentialBackOff()
+			err := backoff.Retry(func() error {
+				var err error
+				session, err = s.bmdb.StartSession(ctx, bmdb.SessionOption{Processor: metrics.ProcessorScruffyStats})
+				if err != nil {
+					klog.Errorf("Failed to start session: %v", err)
+				}
+				return err
+			}, backoff.WithContext(bo, ctx))
+			if err != nil {
+				// The retry loop is bound to ctx, so it also fails when ctx
+				// ends. That is a regular shutdown and must not crash the
+				// process; this matches the ctx.Done() case below.
+				if ctx.Err() != nil {
+					return
+				}
+				// If something's really wrong just crash.
+				klog.Exitf("Gave up on starting session: %v", err)
+			}
+			klog.Infof("New session: %s", session.UUID)
+		}
+
+		// Offer the session to whoever asks for it next, until ctx ends.
+		select {
+		case <-ctx.Done():
+			return
+		case s.sessionC <- session:
+		}
+	}
+}
+
+// session returns the next BMDB session received from sessionC, blocking
+// until one arrives. If ctx ends first, it returns a nil session and ctx's
+// error.
+func (s *Server) session(ctx context.Context) (*bmdb.Session, error) {
+	select {
+	case <-ctx.Done():
+		return nil, ctx.Err()
+	case current := <-s.sessionC:
+		return current, nil
+	}
+}