| Tim Windelschmidt | 6d33a43 | 2025-02-04 14:34:25 +0100 | [diff] [blame] | 1 | // Copyright The Monogon Project Authors. |
| 2 | // SPDX-License-Identifier: Apache-2.0 |
| 3 | |
| Serge Bazanski | 6f59951 | 2023-04-26 19:08:19 +0200 | [diff] [blame] | 4 | package scruffy |
| 5 | |
| 6 | import ( |
| 7 | "context" |
| 8 | "errors" |
| 9 | "fmt" |
| 10 | "time" |
| 11 | |
| 12 | "github.com/google/uuid" |
| 13 | "github.com/prometheus/client_golang/prometheus" |
| 14 | "google.golang.org/protobuf/proto" |
| 15 | "k8s.io/klog/v2" |
| 16 | |
| 17 | "source.monogon.dev/cloud/bmaas/bmdb/model" |
| 18 | "source.monogon.dev/cloud/bmaas/server/api" |
| 19 | ) |
| 20 | |
| 21 | // hwStatsRunner collects metrics from the machine hardware inventory in BMDB and |
| 22 | // exposes them as Prometheus metrics via a registry passed to newHWStatsRunner. |
| 23 | type hwStatsRunner struct { |
| 24 | s *Server |
| 25 | |
| Tim Windelschmidt | 60fab1c | 2023-06-13 20:41:07 +0200 | [diff] [blame] | 26 | nodesPerRegion *prometheus.GaugeVec |
| Serge Bazanski | 6f59951 | 2023-04-26 19:08:19 +0200 | [diff] [blame] | 27 | memoryPerRegion *prometheus.GaugeVec |
| 28 | cpuThreadsPerRegion *prometheus.GaugeVec |
| 29 | } |
| 30 | |
| 31 | // newHWStatsRunner builds a hwStatsRunner. The hwStatsRunner then has the |
| 32 | // given's Server BMDB connection bound to it and can perform actual database |
| 33 | // statistic gathering. |
| 34 | func newHWStatsRunner(s *Server, reg *prometheus.Registry) *hwStatsRunner { |
| 35 | hwsr := &hwStatsRunner{ |
| 36 | s: s, |
| 37 | |
| Tim Windelschmidt | 60fab1c | 2023-06-13 20:41:07 +0200 | [diff] [blame] | 38 | nodesPerRegion: prometheus.NewGaugeVec(prometheus.GaugeOpts{ |
| 39 | Name: "bmdb_hwstats_region_nodes", |
| 40 | }, []string{"provider", "location"}), |
| 41 | |
| Serge Bazanski | 6f59951 | 2023-04-26 19:08:19 +0200 | [diff] [blame] | 42 | memoryPerRegion: prometheus.NewGaugeVec(prometheus.GaugeOpts{ |
| 43 | Name: "bmdb_hwstats_region_ram_bytes", |
| 44 | }, []string{"provider", "location"}), |
| 45 | |
| 46 | cpuThreadsPerRegion: prometheus.NewGaugeVec(prometheus.GaugeOpts{ |
| 47 | Name: "bmdb_hwstats_region_cpu_threads", |
| 48 | }, []string{"provider", "location"}), |
| 49 | } |
| Tim Windelschmidt | 60fab1c | 2023-06-13 20:41:07 +0200 | [diff] [blame] | 50 | reg.MustRegister(hwsr.nodesPerRegion, hwsr.memoryPerRegion, hwsr.cpuThreadsPerRegion) |
| Serge Bazanski | 6f59951 | 2023-04-26 19:08:19 +0200 | [diff] [blame] | 51 | return hwsr |
| 52 | } |
| 53 | |
| 54 | func (h *hwStatsRunner) run(ctx context.Context) { |
| 55 | klog.Infof("Starting stats runner...") |
| 56 | |
| 57 | ti := time.NewTicker(time.Minute) |
| 58 | |
| 59 | for { |
| 60 | err := h.runOnce(ctx) |
| 61 | if err != nil { |
| 62 | if errors.Is(err, ctx.Err()) { |
| 63 | return |
| 64 | } |
| 65 | klog.Errorf("Stats run failed: %v", err) |
| 66 | } |
| 67 | select { |
| 68 | case <-ti.C: |
| 69 | case <-ctx.Done(): |
| 70 | klog.Infof("Exiting stats runner (%v)...", ctx.Err()) |
| 71 | return |
| 72 | } |
| 73 | } |
| 74 | } |
| 75 | |
// statsPerRegion holds hardware totals for one region, summed over every
// hardware report added to it.
type statsPerRegion struct {
	// nodes is the number of hardware reports added.
	nodes uint64

	// ramBytes is the sum of the installed memory, in bytes, of all added
	// reports.
	ramBytes uint64

	// numThreads is the sum of the hardware threads of every CPU of all added
	// reports.
	numThreads uint64
}
| 82 | |
| 83 | // add a given AgentHardwareReport to this region's data. |
| 84 | func (s *statsPerRegion) add(hwrep *api.AgentHardwareReport) { |
| Tim Windelschmidt | 60fab1c | 2023-06-13 20:41:07 +0200 | [diff] [blame] | 85 | s.nodes++ |
| Serge Bazanski | 6f59951 | 2023-04-26 19:08:19 +0200 | [diff] [blame] | 86 | s.ramBytes += uint64(hwrep.Report.MemoryInstalledBytes) |
| 87 | for _, cpu := range hwrep.Report.Cpu { |
| 88 | s.numThreads += uint64(cpu.HardwareThreads) |
| 89 | } |
| 90 | } |
| 91 | |
| 92 | // regionKey is used to uniquely identify each region per each provider. |
| 93 | type regionKey struct { |
| 94 | provider model.Provider |
| 95 | location string |
| 96 | } |
| 97 | |
| 98 | func (r *regionKey) String() string { |
| 99 | return fmt.Sprintf("%s/%s", r.provider, r.location) |
| 100 | } |
| 101 | |
| 102 | func (h *hwStatsRunner) runOnce(ctx context.Context) error { |
| 103 | sess, err := h.s.session(ctx) |
| 104 | if err != nil { |
| 105 | return err |
| 106 | } |
| 107 | |
| 108 | var start uuid.UUID |
| 109 | |
| 110 | perRegion := make(map[regionKey]*statsPerRegion) |
| 111 | var total statsPerRegion |
| 112 | |
| 113 | for { |
| 114 | var res []model.ListMachineHardwareRow |
| 115 | err = sess.Transact(ctx, func(q *model.Queries) error { |
| 116 | res, err = q.ListMachineHardware(ctx, model.ListMachineHardwareParams{ |
| 117 | Limit: 100, |
| 118 | MachineID: start, |
| 119 | }) |
| 120 | return err |
| 121 | }) |
| 122 | if err != nil { |
| 123 | return err |
| 124 | } |
| 125 | klog.Infof("Machines: %d chunk", len(res)) |
| 126 | if len(res) == 0 { |
| 127 | break |
| 128 | } |
| 129 | for _, row := range res { |
| 130 | var hwrep api.AgentHardwareReport |
| 131 | err = proto.Unmarshal(row.HardwareReportRaw.([]byte), &hwrep) |
| 132 | if err != nil { |
| 133 | klog.Warningf("Could not decode hardware report from %s: %v", row.MachineID, err) |
| 134 | continue |
| 135 | } |
| 136 | |
| 137 | if !row.ProviderLocation.Valid { |
| 138 | klog.Warningf("%s has no provider location, skipping", row.MachineID) |
| 139 | continue |
| 140 | } |
| 141 | |
| 142 | key := regionKey{ |
| 143 | provider: row.Provider, |
| 144 | location: row.ProviderLocation.String, |
| 145 | } |
| 146 | if _, ok := perRegion[key]; !ok { |
| 147 | perRegion[key] = &statsPerRegion{} |
| 148 | } |
| 149 | perRegion[key].add(&hwrep) |
| 150 | total.add(&hwrep) |
| 151 | |
| 152 | start = row.MachineID |
| 153 | } |
| 154 | } |
| 155 | |
| 156 | for k, st := range perRegion { |
| 157 | labels := prometheus.Labels{ |
| 158 | "provider": string(k.provider), |
| 159 | "location": k.location, |
| 160 | } |
| 161 | |
| Tim Windelschmidt | 60fab1c | 2023-06-13 20:41:07 +0200 | [diff] [blame] | 162 | h.nodesPerRegion.With(labels).Set(float64(st.nodes)) |
| Serge Bazanski | 6f59951 | 2023-04-26 19:08:19 +0200 | [diff] [blame] | 163 | h.memoryPerRegion.With(labels).Set(float64(st.ramBytes)) |
| 164 | h.cpuThreadsPerRegion.With(labels).Set(float64(st.numThreads)) |
| 165 | } |
| 166 | return nil |
| 167 | } |