blob: 0006ef228437397a63e053649dedb16051f77d0f [file] [log] [blame]
Tim Windelschmidt6d33a432025-02-04 14:34:25 +01001// Copyright The Monogon Project Authors.
2// SPDX-License-Identifier: Apache-2.0
3
Serge Bazanski6f599512023-04-26 19:08:19 +02004package scruffy
5
6import (
7 "context"
8 "errors"
9 "fmt"
10 "time"
11
12 "github.com/google/uuid"
13 "github.com/prometheus/client_golang/prometheus"
14 "google.golang.org/protobuf/proto"
15 "k8s.io/klog/v2"
16
17 "source.monogon.dev/cloud/bmaas/bmdb/model"
18 "source.monogon.dev/cloud/bmaas/server/api"
19)
20
21// hwStatsRunner collects metrics from the machine hardware inventory in BMDB and
22// exposes them as Prometheus metrics via a registry passed to newHWStatsRunner.
23type hwStatsRunner struct {
24 s *Server
25
Tim Windelschmidt60fab1c2023-06-13 20:41:07 +020026 nodesPerRegion *prometheus.GaugeVec
Serge Bazanski6f599512023-04-26 19:08:19 +020027 memoryPerRegion *prometheus.GaugeVec
28 cpuThreadsPerRegion *prometheus.GaugeVec
29}
30
31// newHWStatsRunner builds a hwStatsRunner. The hwStatsRunner then has the
32// given's Server BMDB connection bound to it and can perform actual database
33// statistic gathering.
34func newHWStatsRunner(s *Server, reg *prometheus.Registry) *hwStatsRunner {
35 hwsr := &hwStatsRunner{
36 s: s,
37
Tim Windelschmidt60fab1c2023-06-13 20:41:07 +020038 nodesPerRegion: prometheus.NewGaugeVec(prometheus.GaugeOpts{
39 Name: "bmdb_hwstats_region_nodes",
40 }, []string{"provider", "location"}),
41
Serge Bazanski6f599512023-04-26 19:08:19 +020042 memoryPerRegion: prometheus.NewGaugeVec(prometheus.GaugeOpts{
43 Name: "bmdb_hwstats_region_ram_bytes",
44 }, []string{"provider", "location"}),
45
46 cpuThreadsPerRegion: prometheus.NewGaugeVec(prometheus.GaugeOpts{
47 Name: "bmdb_hwstats_region_cpu_threads",
48 }, []string{"provider", "location"}),
49 }
Tim Windelschmidt60fab1c2023-06-13 20:41:07 +020050 reg.MustRegister(hwsr.nodesPerRegion, hwsr.memoryPerRegion, hwsr.cpuThreadsPerRegion)
Serge Bazanski6f599512023-04-26 19:08:19 +020051 return hwsr
52}
53
54func (h *hwStatsRunner) run(ctx context.Context) {
55 klog.Infof("Starting stats runner...")
56
57 ti := time.NewTicker(time.Minute)
58
59 for {
60 err := h.runOnce(ctx)
61 if err != nil {
62 if errors.Is(err, ctx.Err()) {
63 return
64 }
65 klog.Errorf("Stats run failed: %v", err)
66 }
67 select {
68 case <-ti.C:
69 case <-ctx.Done():
70 klog.Infof("Exiting stats runner (%v)...", ctx.Err())
71 return
72 }
73 }
74}
75
// statsPerRegion holds the running totals for a single region. Values are
// summed over every hardware report passed to add.
type statsPerRegion struct {
	// nodes is the number of hardware reports added.
	nodes uint64
	// ramBytes is the sum of installed memory, in bytes.
	ramBytes uint64
	// numThreads is the sum of hardware threads over all reported CPUs.
	numThreads uint64
}
82
83// add a given AgentHardwareReport to this region's data.
84func (s *statsPerRegion) add(hwrep *api.AgentHardwareReport) {
Tim Windelschmidt60fab1c2023-06-13 20:41:07 +020085 s.nodes++
Serge Bazanski6f599512023-04-26 19:08:19 +020086 s.ramBytes += uint64(hwrep.Report.MemoryInstalledBytes)
87 for _, cpu := range hwrep.Report.Cpu {
88 s.numThreads += uint64(cpu.HardwareThreads)
89 }
90}
91
92// regionKey is used to uniquely identify each region per each provider.
93type regionKey struct {
94 provider model.Provider
95 location string
96}
97
98func (r *regionKey) String() string {
99 return fmt.Sprintf("%s/%s", r.provider, r.location)
100}
101
102func (h *hwStatsRunner) runOnce(ctx context.Context) error {
103 sess, err := h.s.session(ctx)
104 if err != nil {
105 return err
106 }
107
108 var start uuid.UUID
109
110 perRegion := make(map[regionKey]*statsPerRegion)
111 var total statsPerRegion
112
113 for {
114 var res []model.ListMachineHardwareRow
115 err = sess.Transact(ctx, func(q *model.Queries) error {
116 res, err = q.ListMachineHardware(ctx, model.ListMachineHardwareParams{
117 Limit: 100,
118 MachineID: start,
119 })
120 return err
121 })
122 if err != nil {
123 return err
124 }
125 klog.Infof("Machines: %d chunk", len(res))
126 if len(res) == 0 {
127 break
128 }
129 for _, row := range res {
130 var hwrep api.AgentHardwareReport
131 err = proto.Unmarshal(row.HardwareReportRaw.([]byte), &hwrep)
132 if err != nil {
133 klog.Warningf("Could not decode hardware report from %s: %v", row.MachineID, err)
134 continue
135 }
136
137 if !row.ProviderLocation.Valid {
138 klog.Warningf("%s has no provider location, skipping", row.MachineID)
139 continue
140 }
141
142 key := regionKey{
143 provider: row.Provider,
144 location: row.ProviderLocation.String,
145 }
146 if _, ok := perRegion[key]; !ok {
147 perRegion[key] = &statsPerRegion{}
148 }
149 perRegion[key].add(&hwrep)
150 total.add(&hwrep)
151
152 start = row.MachineID
153 }
154 }
155
156 for k, st := range perRegion {
157 labels := prometheus.Labels{
158 "provider": string(k.provider),
159 "location": k.location,
160 }
161
Tim Windelschmidt60fab1c2023-06-13 20:41:07 +0200162 h.nodesPerRegion.With(labels).Set(float64(st.nodes))
Serge Bazanski6f599512023-04-26 19:08:19 +0200163 h.memoryPerRegion.With(labels).Set(float64(st.ramBytes))
164 h.cpuThreadsPerRegion.With(labels).Set(float64(st.numThreads))
165 }
166 return nil
167}