blob: 0aff67bfe123d64089a9379d922da50af0d30ef7 [file] [log] [blame]
Serge Bazanski6f599512023-04-26 19:08:19 +02001package scruffy
2
3import (
4 "context"
5 "errors"
6 "fmt"
7 "time"
8
9 "github.com/google/uuid"
10 "github.com/prometheus/client_golang/prometheus"
11 "google.golang.org/protobuf/proto"
12 "k8s.io/klog/v2"
13
14 "source.monogon.dev/cloud/bmaas/bmdb/model"
15 "source.monogon.dev/cloud/bmaas/server/api"
16)
17
18// hwStatsRunner collects metrics from the machine hardware inventory in BMDB and
19// exposes them as Prometheus metrics via a registry passed to newHWStatsRunner.
20type hwStatsRunner struct {
21 s *Server
22
Tim Windelschmidt60fab1c2023-06-13 20:41:07 +020023 nodesPerRegion *prometheus.GaugeVec
Serge Bazanski6f599512023-04-26 19:08:19 +020024 memoryPerRegion *prometheus.GaugeVec
25 cpuThreadsPerRegion *prometheus.GaugeVec
26}
27
28// newHWStatsRunner builds a hwStatsRunner. The hwStatsRunner then has the
29// given's Server BMDB connection bound to it and can perform actual database
30// statistic gathering.
31func newHWStatsRunner(s *Server, reg *prometheus.Registry) *hwStatsRunner {
32 hwsr := &hwStatsRunner{
33 s: s,
34
Tim Windelschmidt60fab1c2023-06-13 20:41:07 +020035 nodesPerRegion: prometheus.NewGaugeVec(prometheus.GaugeOpts{
36 Name: "bmdb_hwstats_region_nodes",
37 }, []string{"provider", "location"}),
38
Serge Bazanski6f599512023-04-26 19:08:19 +020039 memoryPerRegion: prometheus.NewGaugeVec(prometheus.GaugeOpts{
40 Name: "bmdb_hwstats_region_ram_bytes",
41 }, []string{"provider", "location"}),
42
43 cpuThreadsPerRegion: prometheus.NewGaugeVec(prometheus.GaugeOpts{
44 Name: "bmdb_hwstats_region_cpu_threads",
45 }, []string{"provider", "location"}),
46 }
Tim Windelschmidt60fab1c2023-06-13 20:41:07 +020047 reg.MustRegister(hwsr.nodesPerRegion, hwsr.memoryPerRegion, hwsr.cpuThreadsPerRegion)
Serge Bazanski6f599512023-04-26 19:08:19 +020048 return hwsr
49}
50
51func (h *hwStatsRunner) run(ctx context.Context) {
52 klog.Infof("Starting stats runner...")
53
54 ti := time.NewTicker(time.Minute)
55
56 for {
57 err := h.runOnce(ctx)
58 if err != nil {
59 if errors.Is(err, ctx.Err()) {
60 return
61 }
62 klog.Errorf("Stats run failed: %v", err)
63 }
64 select {
65 case <-ti.C:
66 case <-ctx.Done():
67 klog.Infof("Exiting stats runner (%v)...", ctx.Err())
68 return
69 }
70 }
71}
72
// statsPerRegion holds the totals accumulated (summed) for a single region.
type statsPerRegion struct {
	// nodes is the number of hardware reports added.
	nodes uint64
	// ramBytes is the sum of installed memory, in bytes.
	ramBytes uint64
	// numThreads is the sum of CPU hardware threads.
	numThreads uint64
}
79
80// add a given AgentHardwareReport to this region's data.
81func (s *statsPerRegion) add(hwrep *api.AgentHardwareReport) {
Tim Windelschmidt60fab1c2023-06-13 20:41:07 +020082 s.nodes++
Serge Bazanski6f599512023-04-26 19:08:19 +020083 s.ramBytes += uint64(hwrep.Report.MemoryInstalledBytes)
84 for _, cpu := range hwrep.Report.Cpu {
85 s.numThreads += uint64(cpu.HardwareThreads)
86 }
87}
88
89// regionKey is used to uniquely identify each region per each provider.
90type regionKey struct {
91 provider model.Provider
92 location string
93}
94
95func (r *regionKey) String() string {
96 return fmt.Sprintf("%s/%s", r.provider, r.location)
97}
98
99func (h *hwStatsRunner) runOnce(ctx context.Context) error {
100 sess, err := h.s.session(ctx)
101 if err != nil {
102 return err
103 }
104
105 var start uuid.UUID
106
107 perRegion := make(map[regionKey]*statsPerRegion)
108 var total statsPerRegion
109
110 for {
111 var res []model.ListMachineHardwareRow
112 err = sess.Transact(ctx, func(q *model.Queries) error {
113 res, err = q.ListMachineHardware(ctx, model.ListMachineHardwareParams{
114 Limit: 100,
115 MachineID: start,
116 })
117 return err
118 })
119 if err != nil {
120 return err
121 }
122 klog.Infof("Machines: %d chunk", len(res))
123 if len(res) == 0 {
124 break
125 }
126 for _, row := range res {
127 var hwrep api.AgentHardwareReport
128 err = proto.Unmarshal(row.HardwareReportRaw.([]byte), &hwrep)
129 if err != nil {
130 klog.Warningf("Could not decode hardware report from %s: %v", row.MachineID, err)
131 continue
132 }
133
134 if !row.ProviderLocation.Valid {
135 klog.Warningf("%s has no provider location, skipping", row.MachineID)
136 continue
137 }
138
139 key := regionKey{
140 provider: row.Provider,
141 location: row.ProviderLocation.String,
142 }
143 if _, ok := perRegion[key]; !ok {
144 perRegion[key] = &statsPerRegion{}
145 }
146 perRegion[key].add(&hwrep)
147 total.add(&hwrep)
148
149 start = row.MachineID
150 }
151 }
152
153 for k, st := range perRegion {
154 labels := prometheus.Labels{
155 "provider": string(k.provider),
156 "location": k.location,
157 }
158
Tim Windelschmidt60fab1c2023-06-13 20:41:07 +0200159 h.nodesPerRegion.With(labels).Set(float64(st.nodes))
Serge Bazanski6f599512023-04-26 19:08:19 +0200160 h.memoryPerRegion.With(labels).Set(float64(st.ramBytes))
161 h.cpuThreadsPerRegion.With(labels).Set(float64(st.numThreads))
162 }
163 return nil
164}