blob: 90b2df23ae997e1ffc70a8a16666e9763856fad4 [file] [log] [blame]
Tim Windelschmidt6d33a432025-02-04 14:34:25 +01001// Copyright The Monogon Project Authors.
2// SPDX-License-Identifier: Apache-2.0
3
Serge Bazanski6f599512023-04-26 19:08:19 +02004package scruffy
5
6import (
7 "context"
8 "errors"
9 "fmt"
10 "time"
11
12 "github.com/prometheus/client_golang/prometheus"
13 "k8s.io/klog/v2"
14
15 "source.monogon.dev/cloud/bmaas/bmdb/model"
16)
17
// bmdbStatsRunner collects metrics from the BMDB and exposes them as Prometheus
// metrics via a registry passed to newBMDBStatsRunner.
type bmdbStatsRunner struct {
	// s is the Server whose configuration (stats run rate) and BMDB session
	// are used by the runner.
	s *Server
	// collectors holds one instantiated collector per entry of collectorDefs.
	collectors []*statsCollector
}
24
// A statsCollectorDefinition describes how to gather a given metric via a BMDB
// SQL query.
type statsCollectorDefinition struct {
	// name of the metric. Used in the actual metric name, prefixed with
	// 'bmdb_stats_'.
	name string
	// help string emitted in the Prometheus endpoint.
	help string
	// labels is the label 'type definition', containing information about the
	// dimensions of this metric. Left unset for metrics without labels.
	labels labelDefinitions
	// query used to retrieve the metric data. It is invoked within a BMDB
	// transaction and returns the values (labels and count) to be set on the
	// gauge backing this metric.
	query func(*model.Queries, context.Context) ([]model.MetricValue, error)
}
38
// labelProcess is the type definition of the 'process' label 'type', which is a
// fixed-cardinality representation of the database Process enum.
//
// NOTE(review): initialValues presumably feeds labelDefinitions.initialLabels
// (defined elsewhere), which setDefaults uses to emit zero-valued series for
// each listed process - confirm against that implementation.
var labelProcess = labelDefinition{
	name: "process",
	initialValues: []string{
		string(model.ProcessShepherdAccess),
		string(model.ProcessShepherdAgentStart),
		string(model.ProcessShepherdRecovery),
	},
}
49
50var collectorDefs = []statsCollectorDefinition{
51 {
52 name: "active_backoffs",
53 help: "Number of active backoffs, partitioned by process. There may be more than one active backoff per machine.",
54 query: model.WrapLabeledMetric((*model.Queries).CountActiveBackoffs),
55 labels: []labelDefinition{labelProcess},
56 },
57 {
58 name: "active_work",
59 help: "Number of active work, partitioned by process. There may be more than one active work item per machine.",
60 query: model.WrapLabeledMetric((*model.Queries).CountActiveWork),
61 labels: []labelDefinition{labelProcess},
62 },
63 {
64 name: "machines",
65 help: "Number of machines in the BMDB.",
66 query: model.WrapSimpleMetric((*model.Queries).CountMachines),
67 },
68 {
69 name: "machines_provided",
70 help: "Number of provided machines in the BMDB.",
71 query: model.WrapSimpleMetric((*model.Queries).CountMachinesProvided),
72 },
73 {
74 name: "machines_heartbeating",
75 help: "Number of machines with a currently heartbeating agent.",
76 query: model.WrapSimpleMetric((*model.Queries).CountMachinesAgentHeartbeating),
77 },
78 {
79 name: "machines_pending_installation",
80 help: "Number of machines pending installation.",
81 query: model.WrapSimpleMetric((*model.Queries).CountMachinesInstallationPending),
82 },
83 {
84 name: "machines_installed",
85 help: "Number of machines succesfully installed.",
86 query: model.WrapSimpleMetric((*model.Queries).CountMachinesInstallationComplete),
87 },
88 {
89 name: "machines_pending_agent_start",
90 help: "Number of machines pending the agent start workflow.",
91 query: model.WrapSimpleMetric((*model.Queries).CountMachinesForAgentStart),
92 },
93 {
94 name: "machines_pending_agent_recovery",
95 help: "Number of machines pending the agent recovery workflow.",
96 query: model.WrapSimpleMetric((*model.Queries).CountMachinesForAgentRecovery),
97 },
98}
99
// A statsCollector is an instantiated statsCollectorDefinition which carries the
// actual prometheus gauge backing the metric.
type statsCollector struct {
	// gauge is the Prometheus gauge vector registered for this metric.
	gauge *prometheus.GaugeVec
	// def is the definition this collector was instantiated from.
	def *statsCollectorDefinition
}
106
107// setDefaults emits gauges with zero values for all metrics of the runner, using
108// the initialLabel data gathered from each metric definition.
109func (b *bmdbStatsRunner) setDefaults() {
110 for _, collector := range b.collectors {
111 info := collector.def
112 initial := info.labels.initialLabels()
113 if len(initial) == 0 {
114 collector.gauge.With(nil).Set(0.0)
115 } else {
116 for _, labels := range initial {
117 collector.gauge.With(labels).Set(0.0)
118 }
119 }
120 }
121}
122
123// newBMDBStatsRunner builds a bmdbStatsRunner from the collectorDefs above. The
124// bmdbStatsRunner then has the given's Server BMDB connection bound to it and
125// can perform actual database statistic gathering.
126func newBMDBStatsRunner(s *Server, reg *prometheus.Registry) *bmdbStatsRunner {
127 var collectors []*statsCollector
128
129 for _, info := range collectorDefs {
130 info := info
131 gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
132 Name: "bmdb_stats_" + info.name,
133 Help: info.help,
134 }, info.labels.names())
135 reg.MustRegister(gauge)
136
137 collectors = append(collectors, &statsCollector{
138 gauge: gauge,
139 def: &info,
140 })
141 }
142
143 res := &bmdbStatsRunner{
144 s: s,
145 collectors: collectors,
146 }
147 res.setDefaults()
148 return res
149}
150
151func (b *bmdbStatsRunner) run(ctx context.Context) {
152 klog.Infof("Starting stats runner...")
153
154 ti := time.NewTicker(b.s.Config.StatsRunnerRate)
155
156 for {
157 err := b.runOnce(ctx)
158 if err != nil {
159 if errors.Is(err, ctx.Err()) {
160 return
161 }
162 klog.Errorf("Stats run failed: %v", err)
163 }
164 select {
165 case <-ti.C:
166 case <-ctx.Done():
167 klog.Infof("Exiting stats runner (%v)...", ctx.Err())
168 return
169 }
170 }
171}
172
173func (b *bmdbStatsRunner) runOnce(ctx context.Context) error {
174 sess, err := b.s.session(ctx)
175 if err != nil {
176 return err
177 }
178
179 results := make(map[string][]model.MetricValue)
180 // TODO(q3k): don't fail entire run if we can't collect just one metric.
181 err = sess.Transact(ctx, func(q *model.Queries) error {
182 for _, c := range b.collectors {
183 res, err := c.def.query(q, ctx)
184 if err != nil {
Tim Windelschmidt327cdba2024-05-21 13:51:32 +0200185 return fmt.Errorf("collecting %s failed: %w", c.def.name, err)
Serge Bazanski6f599512023-04-26 19:08:19 +0200186 } else {
187 results[c.def.name] = res
188 }
189 }
190 return nil
191 })
192 if err != nil {
193 return err
194 }
195
196 b.setDefaults()
197 for _, c := range b.collectors {
198 for _, m := range results[c.def.name] {
199 klog.Infof("Setting %s (%v) to %d", c.def.name, m.Labels, m.Count)
200 c.gauge.With(m.Labels).Set(float64(m.Count))
201 }
202 }
203
204 return nil
205}