blob: 22dd2474f0d273ed7ea89f6be6af83b334b044da [file] [log] [blame]
package scruffy
import (
"context"
"errors"
"fmt"
"time"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"google.golang.org/protobuf/proto"
"k8s.io/klog/v2"
"source.monogon.dev/cloud/bmaas/bmdb/model"
"source.monogon.dev/cloud/bmaas/server/api"
)
// hwStatsRunner collects metrics from the machine hardware inventory in BMDB and
// exposes them as Prometheus metrics via a registry passed to newHWStatsRunner.
type hwStatsRunner struct {
s *Server
memoryPerRegion *prometheus.GaugeVec
cpuThreadsPerRegion *prometheus.GaugeVec
}
// newHWStatsRunner builds a hwStatsRunner. The hwStatsRunner then has the
// given's Server BMDB connection bound to it and can perform actual database
// statistic gathering.
func newHWStatsRunner(s *Server, reg *prometheus.Registry) *hwStatsRunner {
hwsr := &hwStatsRunner{
s: s,
memoryPerRegion: prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "bmdb_hwstats_region_ram_bytes",
}, []string{"provider", "location"}),
cpuThreadsPerRegion: prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "bmdb_hwstats_region_cpu_threads",
}, []string{"provider", "location"}),
}
reg.MustRegister(hwsr.memoryPerRegion, hwsr.cpuThreadsPerRegion)
return hwsr
}
func (h *hwStatsRunner) run(ctx context.Context) {
klog.Infof("Starting stats runner...")
ti := time.NewTicker(time.Minute)
for {
err := h.runOnce(ctx)
if err != nil {
if errors.Is(err, ctx.Err()) {
return
}
klog.Errorf("Stats run failed: %v", err)
}
select {
case <-ti.C:
case <-ctx.Done():
klog.Infof("Exiting stats runner (%v)...", ctx.Err())
return
}
}
}
// statsPerRegion are gathered and aggregated (summed) per region.
type statsPerRegion struct {
ramBytes uint64
numThreads uint64
}
// add a given AgentHardwareReport to this region's data.
func (s *statsPerRegion) add(hwrep *api.AgentHardwareReport) {
s.ramBytes += uint64(hwrep.Report.MemoryInstalledBytes)
for _, cpu := range hwrep.Report.Cpu {
s.numThreads += uint64(cpu.HardwareThreads)
}
}
// regionKey is used to uniquely identify each region per each provider.
type regionKey struct {
provider model.Provider
location string
}
func (r *regionKey) String() string {
return fmt.Sprintf("%s/%s", r.provider, r.location)
}
func (h *hwStatsRunner) runOnce(ctx context.Context) error {
sess, err := h.s.session(ctx)
if err != nil {
return err
}
var start uuid.UUID
perRegion := make(map[regionKey]*statsPerRegion)
var total statsPerRegion
for {
var res []model.ListMachineHardwareRow
err = sess.Transact(ctx, func(q *model.Queries) error {
res, err = q.ListMachineHardware(ctx, model.ListMachineHardwareParams{
Limit: 100,
MachineID: start,
})
return err
})
if err != nil {
return err
}
klog.Infof("Machines: %d chunk", len(res))
if len(res) == 0 {
break
}
for _, row := range res {
var hwrep api.AgentHardwareReport
err = proto.Unmarshal(row.HardwareReportRaw.([]byte), &hwrep)
if err != nil {
klog.Warningf("Could not decode hardware report from %s: %v", row.MachineID, err)
continue
}
if !row.ProviderLocation.Valid {
klog.Warningf("%s has no provider location, skipping", row.MachineID)
continue
}
key := regionKey{
provider: row.Provider,
location: row.ProviderLocation.String,
}
if _, ok := perRegion[key]; !ok {
perRegion[key] = &statsPerRegion{}
}
perRegion[key].add(&hwrep)
total.add(&hwrep)
start = row.MachineID
}
}
for k, st := range perRegion {
labels := prometheus.Labels{
"provider": string(k.provider),
"location": k.location,
}
h.memoryPerRegion.With(labels).Set(float64(st.ramBytes))
h.cpuThreadsPerRegion.With(labels).Set(float64(st.numThreads))
}
return nil
}