Serge Bazanski | 6f59951 | 2023-04-26 19:08:19 +0200 | [diff] [blame] | 1 | package scruffy |
| 2 | |
| 3 | import ( |
| 4 | "context" |
| 5 | "errors" |
| 6 | "fmt" |
| 7 | "time" |
| 8 | |
| 9 | "github.com/google/uuid" |
| 10 | "github.com/prometheus/client_golang/prometheus" |
| 11 | "google.golang.org/protobuf/proto" |
| 12 | "k8s.io/klog/v2" |
| 13 | |
| 14 | "source.monogon.dev/cloud/bmaas/bmdb/model" |
| 15 | "source.monogon.dev/cloud/bmaas/server/api" |
| 16 | ) |
| 17 | |
| 18 | // hwStatsRunner collects metrics from the machine hardware inventory in BMDB and |
| 19 | // exposes them as Prometheus metrics via a registry passed to newHWStatsRunner. |
| 20 | type hwStatsRunner struct { |
| 21 | s *Server |
| 22 | |
Tim Windelschmidt | 60fab1c | 2023-06-13 20:41:07 +0200 | [diff] [blame] | 23 | nodesPerRegion *prometheus.GaugeVec |
Serge Bazanski | 6f59951 | 2023-04-26 19:08:19 +0200 | [diff] [blame] | 24 | memoryPerRegion *prometheus.GaugeVec |
| 25 | cpuThreadsPerRegion *prometheus.GaugeVec |
| 26 | } |
| 27 | |
| 28 | // newHWStatsRunner builds a hwStatsRunner. The hwStatsRunner then has the |
| 29 | // given's Server BMDB connection bound to it and can perform actual database |
| 30 | // statistic gathering. |
| 31 | func newHWStatsRunner(s *Server, reg *prometheus.Registry) *hwStatsRunner { |
| 32 | hwsr := &hwStatsRunner{ |
| 33 | s: s, |
| 34 | |
Tim Windelschmidt | 60fab1c | 2023-06-13 20:41:07 +0200 | [diff] [blame] | 35 | nodesPerRegion: prometheus.NewGaugeVec(prometheus.GaugeOpts{ |
| 36 | Name: "bmdb_hwstats_region_nodes", |
| 37 | }, []string{"provider", "location"}), |
| 38 | |
Serge Bazanski | 6f59951 | 2023-04-26 19:08:19 +0200 | [diff] [blame] | 39 | memoryPerRegion: prometheus.NewGaugeVec(prometheus.GaugeOpts{ |
| 40 | Name: "bmdb_hwstats_region_ram_bytes", |
| 41 | }, []string{"provider", "location"}), |
| 42 | |
| 43 | cpuThreadsPerRegion: prometheus.NewGaugeVec(prometheus.GaugeOpts{ |
| 44 | Name: "bmdb_hwstats_region_cpu_threads", |
| 45 | }, []string{"provider", "location"}), |
| 46 | } |
Tim Windelschmidt | 60fab1c | 2023-06-13 20:41:07 +0200 | [diff] [blame] | 47 | reg.MustRegister(hwsr.nodesPerRegion, hwsr.memoryPerRegion, hwsr.cpuThreadsPerRegion) |
Serge Bazanski | 6f59951 | 2023-04-26 19:08:19 +0200 | [diff] [blame] | 48 | return hwsr |
| 49 | } |
| 50 | |
| 51 | func (h *hwStatsRunner) run(ctx context.Context) { |
| 52 | klog.Infof("Starting stats runner...") |
| 53 | |
| 54 | ti := time.NewTicker(time.Minute) |
| 55 | |
| 56 | for { |
| 57 | err := h.runOnce(ctx) |
| 58 | if err != nil { |
| 59 | if errors.Is(err, ctx.Err()) { |
| 60 | return |
| 61 | } |
| 62 | klog.Errorf("Stats run failed: %v", err) |
| 63 | } |
| 64 | select { |
| 65 | case <-ti.C: |
| 66 | case <-ctx.Done(): |
| 67 | klog.Infof("Exiting stats runner (%v)...", ctx.Err()) |
| 68 | return |
| 69 | } |
| 70 | } |
| 71 | } |
| 72 | |
// statsPerRegion holds hardware totals summed over all machines of a single
// region. The zero value is an empty region and is ready to use.
type statsPerRegion struct {
	// nodes counts machines, ramBytes sums their installed memory in bytes and
	// numThreads sums their CPU hardware threads.
	nodes, ramBytes, numThreads uint64
}
| 79 | |
| 80 | // add a given AgentHardwareReport to this region's data. |
| 81 | func (s *statsPerRegion) add(hwrep *api.AgentHardwareReport) { |
Tim Windelschmidt | 60fab1c | 2023-06-13 20:41:07 +0200 | [diff] [blame] | 82 | s.nodes++ |
Serge Bazanski | 6f59951 | 2023-04-26 19:08:19 +0200 | [diff] [blame] | 83 | s.ramBytes += uint64(hwrep.Report.MemoryInstalledBytes) |
| 84 | for _, cpu := range hwrep.Report.Cpu { |
| 85 | s.numThreads += uint64(cpu.HardwareThreads) |
| 86 | } |
| 87 | } |
| 88 | |
| 89 | // regionKey is used to uniquely identify each region per each provider. |
| 90 | type regionKey struct { |
| 91 | provider model.Provider |
| 92 | location string |
| 93 | } |
| 94 | |
| 95 | func (r *regionKey) String() string { |
| 96 | return fmt.Sprintf("%s/%s", r.provider, r.location) |
| 97 | } |
| 98 | |
| 99 | func (h *hwStatsRunner) runOnce(ctx context.Context) error { |
| 100 | sess, err := h.s.session(ctx) |
| 101 | if err != nil { |
| 102 | return err |
| 103 | } |
| 104 | |
| 105 | var start uuid.UUID |
| 106 | |
| 107 | perRegion := make(map[regionKey]*statsPerRegion) |
| 108 | var total statsPerRegion |
| 109 | |
| 110 | for { |
| 111 | var res []model.ListMachineHardwareRow |
| 112 | err = sess.Transact(ctx, func(q *model.Queries) error { |
| 113 | res, err = q.ListMachineHardware(ctx, model.ListMachineHardwareParams{ |
| 114 | Limit: 100, |
| 115 | MachineID: start, |
| 116 | }) |
| 117 | return err |
| 118 | }) |
| 119 | if err != nil { |
| 120 | return err |
| 121 | } |
| 122 | klog.Infof("Machines: %d chunk", len(res)) |
| 123 | if len(res) == 0 { |
| 124 | break |
| 125 | } |
| 126 | for _, row := range res { |
| 127 | var hwrep api.AgentHardwareReport |
| 128 | err = proto.Unmarshal(row.HardwareReportRaw.([]byte), &hwrep) |
| 129 | if err != nil { |
| 130 | klog.Warningf("Could not decode hardware report from %s: %v", row.MachineID, err) |
| 131 | continue |
| 132 | } |
| 133 | |
| 134 | if !row.ProviderLocation.Valid { |
| 135 | klog.Warningf("%s has no provider location, skipping", row.MachineID) |
| 136 | continue |
| 137 | } |
| 138 | |
| 139 | key := regionKey{ |
| 140 | provider: row.Provider, |
| 141 | location: row.ProviderLocation.String, |
| 142 | } |
| 143 | if _, ok := perRegion[key]; !ok { |
| 144 | perRegion[key] = &statsPerRegion{} |
| 145 | } |
| 146 | perRegion[key].add(&hwrep) |
| 147 | total.add(&hwrep) |
| 148 | |
| 149 | start = row.MachineID |
| 150 | } |
| 151 | } |
| 152 | |
| 153 | for k, st := range perRegion { |
| 154 | labels := prometheus.Labels{ |
| 155 | "provider": string(k.provider), |
| 156 | "location": k.location, |
| 157 | } |
| 158 | |
Tim Windelschmidt | 60fab1c | 2023-06-13 20:41:07 +0200 | [diff] [blame] | 159 | h.nodesPerRegion.With(labels).Set(float64(st.nodes)) |
Serge Bazanski | 6f59951 | 2023-04-26 19:08:19 +0200 | [diff] [blame] | 160 | h.memoryPerRegion.With(labels).Set(float64(st.ramBytes)) |
| 161 | h.cpuThreadsPerRegion.With(labels).Set(float64(st.numThreads)) |
| 162 | } |
| 163 | return nil |
| 164 | } |