blob: c92244d235a6117a92d1675c8f247ef891e97283 [file] [log] [blame]
Lorenz Brun62948542023-01-10 13:28:44 +00001package main
2
3import (
4 "bufio"
5 "bytes"
6 "fmt"
7 "math"
8 "os"
9 "path/filepath"
10 "regexp"
11 "runtime"
12 "sort"
13 "strconv"
14 "strings"
15
16 "github.com/mdlayher/ethtool"
17 "github.com/vishvananda/netlink"
18 "golang.org/x/sys/unix"
19
20 "source.monogon.dev/cloud/agent/api"
21 "source.monogon.dev/metropolis/pkg/nvme"
22 "source.monogon.dev/metropolis/pkg/scsi"
23 "source.monogon.dev/metropolis/pkg/smbios"
24)
25
26type hwReportContext struct {
27 node *api.Node
28 errors []error
29}
30
31func (c *hwReportContext) gatherSMBIOS() {
32 smbiosFile, err := os.Open("/sys/firmware/dmi/tables/DMI")
33 if err != nil {
34 c.errors = append(c.errors, fmt.Errorf("unable to open SMBIOS table: %w", err))
35 return
36 }
37 defer smbiosFile.Close()
38 smbTbl, err := smbios.Unmarshal(bufio.NewReader(smbiosFile))
39 if err != nil {
40 c.errors = append(c.errors, fmt.Errorf("unable to parse SMBIOS table: %w", err))
41 return
42 }
43 if smbTbl.SystemInformationRaw != nil {
44 c.node.Manufacturer = smbTbl.SystemInformationRaw.Manufacturer
45 c.node.Product = smbTbl.SystemInformationRaw.ProductName
46 c.node.SerialNumber = smbTbl.SystemInformationRaw.SerialNumber
47 }
Lorenz Brun1cd26962023-04-19 16:10:17 +020048 if smbTbl.BIOSInformationRaw != nil && smbTbl.BIOSInformationRaw.StructureVersion.AtLeast(2, 2) {
49 uefiSupport := smbTbl.BIOSInformationRaw.BIOSCharacteristicsExtensionByte2&smbios.UEFISpecificationSupported != 0
50 if uefiSupport {
51 c.node.EfiSupport = api.EFISupport_EFI_SUPPORTED
52 } else {
53 c.node.EfiSupport = api.EFISupport_EFI_UNSUPPORTED
54 }
55 }
Lorenz Brun62948542023-01-10 13:28:44 +000056 for _, d := range smbTbl.MemoryDevicesRaw {
57 if d.StructureVersion.AtLeast(3, 2) && d.MemoryTechnology != 0x03 {
58 // If MemoryTechnology is available, only count DRAM
59 continue
60 }
61 size, ok := d.SizeBytes()
62 if !ok {
63 continue
64 }
65 c.node.MemoryInstalledBytes += int64(size)
66 }
67 return
68}
69
// memoryBlockRegexp matches the names of memory block directories in sysfs,
// i.e. "memory" followed by one or more decimal digits and nothing else.
var memoryBlockRegexp = regexp.MustCompile(`^memory[0-9]+$`)
71
72func (c *hwReportContext) gatherMemorySysfs() {
73 blockSizeRaw, err := os.ReadFile("/sys/devices/system/memory/block_size_bytes")
74 if err != nil {
75 c.errors = append(c.errors, fmt.Errorf("unable to read memory block size, CONFIG_MEMORY_HOTPLUG disabled or sandbox?: %w", err))
76 return
77 }
78 blockSize, err := strconv.ParseInt(strings.TrimSpace(string(blockSizeRaw)), 16, 64)
79 if err != nil {
80 c.errors = append(c.errors, fmt.Errorf("failed to parse memory block size (%q): %w", string(blockSizeRaw), err))
81 return
82 }
83 dirEntries, err := os.ReadDir("/sys/devices/system/memory")
84 if err != nil {
85 c.errors = append(c.errors, fmt.Errorf("unable to read sysfs memory devices list: %w", err))
86 return
87 }
88 c.node.MemoryInstalledBytes = 0
89 for _, e := range dirEntries {
90 if memoryBlockRegexp.MatchString(e.Name()) {
91 // This is safe as the regexp does not allow for any dots
92 state, err := os.ReadFile("/sys/devices/system/memory/%s/state")
93 if os.IsNotExist(err) {
94 // Memory hotplug operation raced us
95 continue
96 } else if err != nil {
97 c.errors = append(c.errors, fmt.Errorf("failed to read memory block state for %s: %w", e.Name(), err))
98 continue
99 }
100 if strings.TrimSpace(string(state)) != "online" {
101 // Only count online memory
102 continue
103 }
104 // Each block is one blockSize of memory
105 c.node.MemoryInstalledBytes += blockSize
106 }
107 }
108 return
109}
110
111func parseCpuinfoAMD64(cpuinfoRaw []byte) (*api.CPU, []error) {
112 // Parse line-by-line, each segment is separated by a line with no colon
113 // character, a segment describes a logical processor if it contains
114 // the key "processor". Keep track of all seen core IDs (physical
115 // processors) and processor IDs (logical processors) in a map to fill
116 // into the structure.
117 s := bufio.NewScanner(bytes.NewReader(cpuinfoRaw))
118 var cpu api.CPU
119 scannedVals := make(map[string]string)
120 seenCoreIDs := make(map[string]bool)
121 seenProcessorIDs := make(map[string]bool)
122 processItem := func() error {
123 if _, ok := scannedVals["processor"]; !ok {
124 // Not a cpu, clear data and return
125 scannedVals = make(map[string]string)
126 return nil
127 }
128 seenProcessorIDs[scannedVals["processor"]] = true
129 seenCoreIDs[scannedVals["core id"]] = true
130 cpu.Model = scannedVals["model name"]
131 cpu.Vendor = scannedVals["vendor_id"]
132 family, err := strconv.Atoi(scannedVals["cpu family"])
133 if err != nil {
134 return fmt.Errorf("unable to parse CPU family to int: %v", err)
135 }
136 model, err := strconv.Atoi(scannedVals["model"])
137 if err != nil {
138 return fmt.Errorf("unable to parse CPU model to int: %v", err)
139 }
140 stepping, err := strconv.Atoi(scannedVals["stepping"])
141 if err != nil {
142 return fmt.Errorf("unable to parse CPU stepping to int: %v", err)
143 }
144 cpu.Architecture = &api.CPU_X86_64_{
145 X86_64: &api.CPU_X86_64{
146 Family: int32(family),
147 Model: int32(model),
148 Stepping: int32(stepping),
149 },
150 }
151 scannedVals = make(map[string]string)
152 return nil
153 }
154 var errs []error
155 for s.Scan() {
156 k, v, ok := strings.Cut(s.Text(), ":")
157 // If there is a colon, add property to scannedVals.
158 if ok {
159 scannedVals[strings.TrimSpace(k)] = strings.TrimSpace(v)
160 continue
161 }
162 // Otherwise this is a segment boundary, process the segment.
163 if err := processItem(); err != nil {
164 errs = append(errs, fmt.Errorf("error parsing cpuinfo block: %w", err))
165 }
166 }
167 // Parse the last segment.
168 if err := processItem(); err != nil {
169 errs = append(errs, fmt.Errorf("error parsing cpuinfo block: %w", err))
170 }
171 cpu.Cores = int32(len(seenCoreIDs))
172 cpu.HardwareThreads = int32(len(seenProcessorIDs))
173 return &cpu, errs
174}
175
176func (c *hwReportContext) gatherCPU() {
177 switch runtime.GOARCH {
178 case "amd64":
179 // Currently a rather simple gatherer with no special NUMA handling
180 cpuinfoRaw, err := os.ReadFile("/proc/cpuinfo")
181 if err != nil {
182 c.errors = append(c.errors, fmt.Errorf("unable to read cpuinfo: %w", err))
183 return
184 }
185 cpu, errs := parseCpuinfoAMD64(cpuinfoRaw)
186 c.errors = append(c.errors, errs...)
187 c.node.Cpu = append(c.node.Cpu, cpu)
188 default:
189 // Currently unimplemented, do nothing
190 c.errors = append(c.errors, fmt.Errorf("architecture %v unsupported by CPU gatherer", runtime.GOARCH))
191 }
192 return
193}
194
// FRUUnavailable is the all-zero 16-byte value. gatherNVMe compares the
// FRU Globally Unique Identifier of a device against it and leaves the WWN
// unset when they are equal.
var FRUUnavailable = [16]byte{}
Lorenz Brun62948542023-01-10 13:28:44 +0000196
197func (c *hwReportContext) gatherNVMe(bd *api.BlockDevice, bde os.DirEntry) error {
198 bd.Protocol = api.BlockDevice_NVME
199 nvmeDev, err := nvme.Open("/dev/" + bde.Name())
200 if err != nil {
201 return fmt.Errorf("unable to open NVMe device: %w", err)
202 }
203 defer nvmeDev.Close()
204 identifyData, err := nvmeDev.Identify()
205 if err != nil {
206 return fmt.Errorf("calling Identify failed: %w", err)
207 }
208 bd.DeviceModel = identifyData.ModelNumber
209 bd.SerialNumber = identifyData.SerialNumber
210 if identifyData.FRUGloballyUniqueIdentifier != FRUUnavailable {
211 bd.Wwn = identifyData.FRUGloballyUniqueIdentifier[:]
212 }
213 if healthInfo, err := nvmeDev.GetHealthInfo(); err == nil {
214 bd.AvailableSpareRatio = &healthInfo.AvailableSpare
215 bd.CriticalWarning = healthInfo.HasCriticalWarning()
Lorenz Brunaadeb792023-03-27 15:53:56 +0200216 mediaErrors := int64(healthInfo.MediaAndDataIntegrityErrors)
Lorenz Brun62948542023-01-10 13:28:44 +0000217 bd.MediaErrors = &mediaErrors
218 bd.UsageRatio = &healthInfo.LifeUsed
219 }
220 return nil
221}
222
223func (c *hwReportContext) gatherSCSI(bd *api.BlockDevice, bde os.DirEntry) error {
224 bd.Protocol = api.BlockDevice_SCSI
225 scsiDev, err := scsi.Open("/dev/" + bde.Name())
226 if err != nil {
227 return fmt.Errorf("unable to open SCSI device: %w", err)
228 }
229 defer scsiDev.Close()
230 inquiryData, err := scsiDev.Inquiry()
231 if err != nil {
232 return fmt.Errorf("failed calling INQUIRY: %w", err)
233 }
234 if serial, err := scsiDev.UnitSerialNumber(); err == nil {
235 bd.SerialNumber = serial
236 }
237
238 // SAT-5 R8 Table 14
239 if inquiryData.Vendor == "ATA" { // ATA device behind SAT
240 bd.Protocol = api.BlockDevice_ATA
241 // TODO: ATA Vendor from WWN if available
242 } else { // Normal SCSI device
243 bd.Vendor = inquiryData.Vendor
244 // Attempt to read defect list to populate media error count
245 var mediaErrors int64
246 if defectsLBA, err := scsiDev.ReadDefectDataLBA(false, true); err == nil {
247 mediaErrors = int64(len(defectsLBA))
248 bd.MediaErrors = &mediaErrors
249 } else if defectsPhysical, err := scsiDev.ReadDefectDataPhysical(false, true); err == nil {
250 mediaErrors = int64(len(defectsPhysical))
251 bd.MediaErrors = &mediaErrors
252 }
253 if mediaHealth, err := scsiDev.SolidStateMediaHealth(); err == nil {
254 used := float32(mediaHealth.PercentageUsedEnduranceIndicator) / 100.
255 bd.UsageRatio = &used
256 }
257 if informationalExceptions, err := scsiDev.GetInformationalExceptions(); err == nil {
258 // Only consider FailurePredictionThresholdExceeded-class sense codes critical.
259 // The second commonly reported error here according to random forums are
260 // Warning-class errors, but looking through these they don't indicate imminent
261 // or even permanent errors.
262 bd.CriticalWarning = informationalExceptions.InformationalSenseCode.IsKey(scsi.FailurePredictionThresholdExceeded)
263 }
264 // SCSI has no reporting of available spares, so this will never be populated
265 }
266 bd.DeviceModel = inquiryData.Product
267 return nil
268}
269
270func (c *hwReportContext) gatherBlockDevices() {
271 blockDeviceEntries, err := os.ReadDir("/sys/class/block")
272 if err != nil {
273 c.errors = append(c.errors, fmt.Errorf("unable to read sysfs block device list: %w", err))
274 return
275 }
276 for _, bde := range blockDeviceEntries {
277 sysfsDir := fmt.Sprintf("/sys/class/block/%s", bde.Name())
278 if _, err := os.Stat(sysfsDir + "/partition"); err == nil {
279 // Ignore partitions, we only care about their parents
280 continue
281 }
282 var bd api.BlockDevice
283 if rotational, err := os.ReadFile(sysfsDir + "/queue/rotational"); err == nil {
284 if strings.TrimSpace(string(rotational)) == "1" {
285 bd.Rotational = true
286 }
287 }
288 if sizeRaw, err := os.ReadFile(sysfsDir + "/size"); err == nil {
289 size, err := strconv.ParseInt(strings.TrimSpace(string(sizeRaw)), 10, 64)
290 if err != nil {
291 c.errors = append(c.errors, fmt.Errorf("unable to parse block device %v size: %w", bde.Name(), err))
292 } else {
293 // Linux always defines size in terms of 512 byte blocks regardless
294 // of what the configured logical and physical block sizes are.
295 bd.CapacityBytes = size * 512
296 }
297 }
298 if lbsRaw, err := os.ReadFile(sysfsDir + "/queue/logical_block_size"); err == nil {
299 lbs, err := strconv.ParseInt(strings.TrimSpace(string(lbsRaw)), 10, 32)
300 if err != nil {
301 c.errors = append(c.errors, fmt.Errorf("unable to parse block device %v logical block size: %w", bde.Name(), err))
302 } else {
303 bd.LogicalBlockSizeBytes = int32(lbs)
304 }
305 }
306 if pbsRaw, err := os.ReadFile(sysfsDir + "/queue/physical_block_size"); err == nil {
307 pbs, err := strconv.ParseInt(strings.TrimSpace(string(pbsRaw)), 10, 32)
308 if err != nil {
309 c.errors = append(c.errors, fmt.Errorf("unable to parse physical block size: %w", err))
310 } else {
311 bd.PhysicalBlockSizeBytes = int32(pbs)
312 }
313 }
314 if strings.HasPrefix(bde.Name(), "nvme") {
315 err := c.gatherNVMe(&bd, bde)
316 if err != nil {
317 c.errors = append(c.errors, fmt.Errorf("block device %v: %w", bde.Name(), err))
318 } else {
319 c.node.BlockDevice = append(c.node.BlockDevice, &bd)
320 }
321 }
322 if strings.HasPrefix(bde.Name(), "sd") {
323 err := c.gatherSCSI(&bd, bde)
324 if err != nil {
325 c.errors = append(c.errors, fmt.Errorf("block device %v: %w", bde.Name(), err))
326 } else {
327 c.node.BlockDevice = append(c.node.BlockDevice, &bd)
328 }
329 }
330 if strings.HasPrefix(bde.Name(), "mmcblk") {
331 // TODO: MMC information
332 bd.Protocol = api.BlockDevice_MMC
333 c.node.BlockDevice = append(c.node.BlockDevice, &bd)
334 }
335 }
336 return
337}
338
// speedModeRegexp extracts the leading decimal number of a link mode name
// of the form "<number>base...". gatherNICs parses that number as a speed
// in megabits.
var speedModeRegexp = regexp.MustCompile(`^([0-9]+)base`)
340
// mbps is the number of bytes per second corresponding to one megabit per
// second. Multiplying a speed in megabits by it yields bytes per second.
const mbps = 1000 * 1000 / 8
342
343func (c *hwReportContext) gatherNICs() {
344 links, err := netlink.LinkList()
345 if err != nil {
346 c.errors = append(c.errors, fmt.Errorf("failed to list network links: %w", err))
347 return
348 }
349 ethClient, err := ethtool.New()
350 if err != nil {
351 c.errors = append(c.errors, fmt.Errorf("failed to get ethtool netlink client: %w", err))
352 return
353 }
354 defer ethClient.Close()
355 for _, l := range links {
356 if l.Type() != "device" || len(l.Attrs().HardwareAddr) == 0 {
357 // Not a physical device, ignore
358 continue
359 }
360 var nif api.NetworkInterface
361 nif.Mac = l.Attrs().HardwareAddr
362 mode, err := ethClient.LinkMode(ethtool.Interface{Index: l.Attrs().Index})
363 if err == nil {
364 if mode.SpeedMegabits < math.MaxInt32 {
365 nif.CurrentSpeedBytes = int64(mode.SpeedMegabits) * mbps
366 }
367 speeds := make(map[int64]bool)
368 for _, m := range mode.Ours {
369 // Doing this with a regexp is arguably more future-proof as
370 // we don't need to add each link mode for the detection to
371 // work.
372 modeParts := speedModeRegexp.FindStringSubmatch(m.Name)
373 if len(modeParts) > 0 {
374 speedMegabits, err := strconv.ParseInt(modeParts[1], 10, 64)
375 if err != nil {
376 c.errors = append(c.errors, fmt.Errorf("nic %v: failed to parse %q as integer: %w", l.Attrs().Name, modeParts[1], err))
377 continue
378 }
379 speeds[int64(speedMegabits)*mbps] = true
380 }
381 }
382 for s := range speeds {
383 nif.SupportedSpeedBytes = append(nif.SupportedSpeedBytes, s)
384 }
385 // Go randomizes the map keys, sort to make the report stable.
386 sort.Slice(nif.SupportedSpeedBytes, func(i, j int) bool { return nif.SupportedSpeedBytes[i] > nif.SupportedSpeedBytes[j] })
387 }
388 state, err := ethClient.LinkState(ethtool.Interface{Index: l.Attrs().Index})
389 if err == nil {
390 nif.LinkUp = state.Link
391 } else {
392 // We have no ethtool support, fall back to checking if Linux
393 // thinks the link is up.
394 nif.LinkUp = l.Attrs().OperState == netlink.OperUp
395 }
396 // Linux blocks creation of interfaces which conflict with special path
397 // characters, so this path assembly is fine.
398 driverPath, err := os.Readlink("/sys/class/net/" + l.Attrs().Name + "/device/driver")
399 if err == nil {
400 nif.Driver = filepath.Base(driverPath)
401 }
402 c.node.NetworkInterface = append(c.node.NetworkInterface, &nif)
403 }
404 return
405}
406
407func gatherHWReport() (*api.Node, []error) {
Lorenz Brunaadeb792023-03-27 15:53:56 +0200408 hwReportCtx := hwReportContext{
409 node: &api.Node{},
410 }
Lorenz Brun1cd26962023-04-19 16:10:17 +0200411 hwReportCtx.node.EfiSupport = api.EFISupport_EFI_UNKNOWN
Lorenz Brun62948542023-01-10 13:28:44 +0000412
413 hwReportCtx.gatherCPU()
414 hwReportCtx.gatherSMBIOS()
415 if hwReportCtx.node.MemoryInstalledBytes == 0 {
416 hwReportCtx.gatherMemorySysfs()
417 }
418 var sysinfo unix.Sysinfo_t
419 if err := unix.Sysinfo(&sysinfo); err != nil {
420 hwReportCtx.errors = append(hwReportCtx.errors, fmt.Errorf("unable to execute sysinfo syscall: %w", err))
421 } else {
422 hwReportCtx.node.MemoryUsableRatio = float32(sysinfo.Totalram) / float32(hwReportCtx.node.MemoryInstalledBytes)
423 }
424 hwReportCtx.gatherNICs()
425 hwReportCtx.gatherBlockDevices()
426
Lorenz Brun1cd26962023-04-19 16:10:17 +0200427 if _, err := os.Stat("/sys/firmware/efi/runtime"); err == nil {
428 hwReportCtx.node.EfiSupport = api.EFISupport_EFI_ENABLED
429 }
430
Lorenz Brun62948542023-01-10 13:28:44 +0000431 return hwReportCtx.node, hwReportCtx.errors
432}