Lorenz Brun | 6294854 | 2023-01-10 13:28:44 +0000 | [diff] [blame] | 1 | package main |
| 2 | |
| 3 | import ( |
| 4 | "bufio" |
| 5 | "bytes" |
| 6 | "fmt" |
| 7 | "math" |
| 8 | "os" |
| 9 | "path/filepath" |
| 10 | "regexp" |
| 11 | "runtime" |
| 12 | "sort" |
| 13 | "strconv" |
| 14 | "strings" |
| 15 | |
| 16 | "github.com/mdlayher/ethtool" |
| 17 | "github.com/vishvananda/netlink" |
| 18 | "golang.org/x/sys/unix" |
| 19 | |
| 20 | "source.monogon.dev/cloud/agent/api" |
| 21 | "source.monogon.dev/metropolis/pkg/nvme" |
| 22 | "source.monogon.dev/metropolis/pkg/scsi" |
| 23 | "source.monogon.dev/metropolis/pkg/smbios" |
| 24 | ) |
| 25 | |
| 26 | type hwReportContext struct { |
| 27 | node *api.Node |
| 28 | errors []error |
| 29 | } |
| 30 | |
| 31 | func (c *hwReportContext) gatherSMBIOS() { |
| 32 | smbiosFile, err := os.Open("/sys/firmware/dmi/tables/DMI") |
| 33 | if err != nil { |
| 34 | c.errors = append(c.errors, fmt.Errorf("unable to open SMBIOS table: %w", err)) |
| 35 | return |
| 36 | } |
| 37 | defer smbiosFile.Close() |
| 38 | smbTbl, err := smbios.Unmarshal(bufio.NewReader(smbiosFile)) |
| 39 | if err != nil { |
| 40 | c.errors = append(c.errors, fmt.Errorf("unable to parse SMBIOS table: %w", err)) |
| 41 | return |
| 42 | } |
| 43 | if smbTbl.SystemInformationRaw != nil { |
| 44 | c.node.Manufacturer = smbTbl.SystemInformationRaw.Manufacturer |
| 45 | c.node.Product = smbTbl.SystemInformationRaw.ProductName |
| 46 | c.node.SerialNumber = smbTbl.SystemInformationRaw.SerialNumber |
| 47 | } |
Lorenz Brun | 1cd2696 | 2023-04-19 16:10:17 +0200 | [diff] [blame] | 48 | if smbTbl.BIOSInformationRaw != nil && smbTbl.BIOSInformationRaw.StructureVersion.AtLeast(2, 2) { |
| 49 | uefiSupport := smbTbl.BIOSInformationRaw.BIOSCharacteristicsExtensionByte2&smbios.UEFISpecificationSupported != 0 |
| 50 | if uefiSupport { |
| 51 | c.node.EfiSupport = api.EFISupport_EFI_SUPPORTED |
| 52 | } else { |
| 53 | c.node.EfiSupport = api.EFISupport_EFI_UNSUPPORTED |
| 54 | } |
| 55 | } |
Lorenz Brun | 6294854 | 2023-01-10 13:28:44 +0000 | [diff] [blame] | 56 | for _, d := range smbTbl.MemoryDevicesRaw { |
| 57 | if d.StructureVersion.AtLeast(3, 2) && d.MemoryTechnology != 0x03 { |
| 58 | // If MemoryTechnology is available, only count DRAM |
| 59 | continue |
| 60 | } |
| 61 | size, ok := d.SizeBytes() |
| 62 | if !ok { |
| 63 | continue |
| 64 | } |
| 65 | c.node.MemoryInstalledBytes += int64(size) |
| 66 | } |
| 67 | return |
| 68 | } |
| 69 | |
// memoryBlockRegexp matches the names of the per-block memory directories in
// sysfs: the word "memory" followed by one or more decimal digits and
// nothing else.
var memoryBlockRegexp = regexp.MustCompile(`^memory[0-9]+$`)
| 71 | |
| 72 | func (c *hwReportContext) gatherMemorySysfs() { |
| 73 | blockSizeRaw, err := os.ReadFile("/sys/devices/system/memory/block_size_bytes") |
| 74 | if err != nil { |
| 75 | c.errors = append(c.errors, fmt.Errorf("unable to read memory block size, CONFIG_MEMORY_HOTPLUG disabled or sandbox?: %w", err)) |
| 76 | return |
| 77 | } |
| 78 | blockSize, err := strconv.ParseInt(strings.TrimSpace(string(blockSizeRaw)), 16, 64) |
| 79 | if err != nil { |
| 80 | c.errors = append(c.errors, fmt.Errorf("failed to parse memory block size (%q): %w", string(blockSizeRaw), err)) |
| 81 | return |
| 82 | } |
| 83 | dirEntries, err := os.ReadDir("/sys/devices/system/memory") |
| 84 | if err != nil { |
| 85 | c.errors = append(c.errors, fmt.Errorf("unable to read sysfs memory devices list: %w", err)) |
| 86 | return |
| 87 | } |
| 88 | c.node.MemoryInstalledBytes = 0 |
| 89 | for _, e := range dirEntries { |
| 90 | if memoryBlockRegexp.MatchString(e.Name()) { |
| 91 | // This is safe as the regexp does not allow for any dots |
| 92 | state, err := os.ReadFile("/sys/devices/system/memory/%s/state") |
| 93 | if os.IsNotExist(err) { |
| 94 | // Memory hotplug operation raced us |
| 95 | continue |
| 96 | } else if err != nil { |
| 97 | c.errors = append(c.errors, fmt.Errorf("failed to read memory block state for %s: %w", e.Name(), err)) |
| 98 | continue |
| 99 | } |
| 100 | if strings.TrimSpace(string(state)) != "online" { |
| 101 | // Only count online memory |
| 102 | continue |
| 103 | } |
| 104 | // Each block is one blockSize of memory |
| 105 | c.node.MemoryInstalledBytes += blockSize |
| 106 | } |
| 107 | } |
| 108 | return |
| 109 | } |
| 110 | |
| 111 | func parseCpuinfoAMD64(cpuinfoRaw []byte) (*api.CPU, []error) { |
| 112 | // Parse line-by-line, each segment is separated by a line with no colon |
| 113 | // character, a segment describes a logical processor if it contains |
| 114 | // the key "processor". Keep track of all seen core IDs (physical |
| 115 | // processors) and processor IDs (logical processors) in a map to fill |
| 116 | // into the structure. |
| 117 | s := bufio.NewScanner(bytes.NewReader(cpuinfoRaw)) |
| 118 | var cpu api.CPU |
| 119 | scannedVals := make(map[string]string) |
| 120 | seenCoreIDs := make(map[string]bool) |
| 121 | seenProcessorIDs := make(map[string]bool) |
| 122 | processItem := func() error { |
| 123 | if _, ok := scannedVals["processor"]; !ok { |
| 124 | // Not a cpu, clear data and return |
| 125 | scannedVals = make(map[string]string) |
| 126 | return nil |
| 127 | } |
| 128 | seenProcessorIDs[scannedVals["processor"]] = true |
| 129 | seenCoreIDs[scannedVals["core id"]] = true |
| 130 | cpu.Model = scannedVals["model name"] |
| 131 | cpu.Vendor = scannedVals["vendor_id"] |
| 132 | family, err := strconv.Atoi(scannedVals["cpu family"]) |
| 133 | if err != nil { |
| 134 | return fmt.Errorf("unable to parse CPU family to int: %v", err) |
| 135 | } |
| 136 | model, err := strconv.Atoi(scannedVals["model"]) |
| 137 | if err != nil { |
| 138 | return fmt.Errorf("unable to parse CPU model to int: %v", err) |
| 139 | } |
| 140 | stepping, err := strconv.Atoi(scannedVals["stepping"]) |
| 141 | if err != nil { |
| 142 | return fmt.Errorf("unable to parse CPU stepping to int: %v", err) |
| 143 | } |
| 144 | cpu.Architecture = &api.CPU_X86_64_{ |
| 145 | X86_64: &api.CPU_X86_64{ |
| 146 | Family: int32(family), |
| 147 | Model: int32(model), |
| 148 | Stepping: int32(stepping), |
| 149 | }, |
| 150 | } |
| 151 | scannedVals = make(map[string]string) |
| 152 | return nil |
| 153 | } |
| 154 | var errs []error |
| 155 | for s.Scan() { |
| 156 | k, v, ok := strings.Cut(s.Text(), ":") |
| 157 | // If there is a colon, add property to scannedVals. |
| 158 | if ok { |
| 159 | scannedVals[strings.TrimSpace(k)] = strings.TrimSpace(v) |
| 160 | continue |
| 161 | } |
| 162 | // Otherwise this is a segment boundary, process the segment. |
| 163 | if err := processItem(); err != nil { |
| 164 | errs = append(errs, fmt.Errorf("error parsing cpuinfo block: %w", err)) |
| 165 | } |
| 166 | } |
| 167 | // Parse the last segment. |
| 168 | if err := processItem(); err != nil { |
| 169 | errs = append(errs, fmt.Errorf("error parsing cpuinfo block: %w", err)) |
| 170 | } |
| 171 | cpu.Cores = int32(len(seenCoreIDs)) |
| 172 | cpu.HardwareThreads = int32(len(seenProcessorIDs)) |
| 173 | return &cpu, errs |
| 174 | } |
| 175 | |
| 176 | func (c *hwReportContext) gatherCPU() { |
| 177 | switch runtime.GOARCH { |
| 178 | case "amd64": |
| 179 | // Currently a rather simple gatherer with no special NUMA handling |
| 180 | cpuinfoRaw, err := os.ReadFile("/proc/cpuinfo") |
| 181 | if err != nil { |
| 182 | c.errors = append(c.errors, fmt.Errorf("unable to read cpuinfo: %w", err)) |
| 183 | return |
| 184 | } |
| 185 | cpu, errs := parseCpuinfoAMD64(cpuinfoRaw) |
| 186 | c.errors = append(c.errors, errs...) |
| 187 | c.node.Cpu = append(c.node.Cpu, cpu) |
| 188 | default: |
| 189 | // Currently unimplemented, do nothing |
| 190 | c.errors = append(c.errors, fmt.Errorf("architecture %v unsupported by CPU gatherer", runtime.GOARCH)) |
| 191 | } |
| 192 | return |
| 193 | } |
| 194 | |
// FRUUnavailable is the all-zero 16-byte value. An NVMe FRU globally unique
// identifier equal to it is treated as not available and is not reported.
var FRUUnavailable [16]byte
Lorenz Brun | 6294854 | 2023-01-10 13:28:44 +0000 | [diff] [blame] | 196 | |
| 197 | func (c *hwReportContext) gatherNVMe(bd *api.BlockDevice, bde os.DirEntry) error { |
| 198 | bd.Protocol = api.BlockDevice_NVME |
| 199 | nvmeDev, err := nvme.Open("/dev/" + bde.Name()) |
| 200 | if err != nil { |
| 201 | return fmt.Errorf("unable to open NVMe device: %w", err) |
| 202 | } |
| 203 | defer nvmeDev.Close() |
| 204 | identifyData, err := nvmeDev.Identify() |
| 205 | if err != nil { |
| 206 | return fmt.Errorf("calling Identify failed: %w", err) |
| 207 | } |
| 208 | bd.DeviceModel = identifyData.ModelNumber |
| 209 | bd.SerialNumber = identifyData.SerialNumber |
| 210 | if identifyData.FRUGloballyUniqueIdentifier != FRUUnavailable { |
| 211 | bd.Wwn = identifyData.FRUGloballyUniqueIdentifier[:] |
| 212 | } |
| 213 | if healthInfo, err := nvmeDev.GetHealthInfo(); err == nil { |
| 214 | bd.AvailableSpareRatio = &healthInfo.AvailableSpare |
| 215 | bd.CriticalWarning = healthInfo.HasCriticalWarning() |
Lorenz Brun | aadeb79 | 2023-03-27 15:53:56 +0200 | [diff] [blame] | 216 | mediaErrors := int64(healthInfo.MediaAndDataIntegrityErrors) |
Lorenz Brun | 6294854 | 2023-01-10 13:28:44 +0000 | [diff] [blame] | 217 | bd.MediaErrors = &mediaErrors |
| 218 | bd.UsageRatio = &healthInfo.LifeUsed |
| 219 | } |
| 220 | return nil |
| 221 | } |
| 222 | |
| 223 | func (c *hwReportContext) gatherSCSI(bd *api.BlockDevice, bde os.DirEntry) error { |
| 224 | bd.Protocol = api.BlockDevice_SCSI |
| 225 | scsiDev, err := scsi.Open("/dev/" + bde.Name()) |
| 226 | if err != nil { |
| 227 | return fmt.Errorf("unable to open SCSI device: %w", err) |
| 228 | } |
| 229 | defer scsiDev.Close() |
| 230 | inquiryData, err := scsiDev.Inquiry() |
| 231 | if err != nil { |
| 232 | return fmt.Errorf("failed calling INQUIRY: %w", err) |
| 233 | } |
| 234 | if serial, err := scsiDev.UnitSerialNumber(); err == nil { |
| 235 | bd.SerialNumber = serial |
| 236 | } |
| 237 | |
| 238 | // SAT-5 R8 Table 14 |
| 239 | if inquiryData.Vendor == "ATA" { // ATA device behind SAT |
| 240 | bd.Protocol = api.BlockDevice_ATA |
| 241 | // TODO: ATA Vendor from WWN if available |
| 242 | } else { // Normal SCSI device |
| 243 | bd.Vendor = inquiryData.Vendor |
| 244 | // Attempt to read defect list to populate media error count |
| 245 | var mediaErrors int64 |
| 246 | if defectsLBA, err := scsiDev.ReadDefectDataLBA(false, true); err == nil { |
| 247 | mediaErrors = int64(len(defectsLBA)) |
| 248 | bd.MediaErrors = &mediaErrors |
| 249 | } else if defectsPhysical, err := scsiDev.ReadDefectDataPhysical(false, true); err == nil { |
| 250 | mediaErrors = int64(len(defectsPhysical)) |
| 251 | bd.MediaErrors = &mediaErrors |
| 252 | } |
| 253 | if mediaHealth, err := scsiDev.SolidStateMediaHealth(); err == nil { |
| 254 | used := float32(mediaHealth.PercentageUsedEnduranceIndicator) / 100. |
| 255 | bd.UsageRatio = &used |
| 256 | } |
| 257 | if informationalExceptions, err := scsiDev.GetInformationalExceptions(); err == nil { |
| 258 | // Only consider FailurePredictionThresholdExceeded-class sense codes critical. |
| 259 | // The second commonly reported error here according to random forums are |
| 260 | // Warning-class errors, but looking through these they don't indicate imminent |
| 261 | // or even permanent errors. |
| 262 | bd.CriticalWarning = informationalExceptions.InformationalSenseCode.IsKey(scsi.FailurePredictionThresholdExceeded) |
| 263 | } |
| 264 | // SCSI has no reporting of available spares, so this will never be populated |
| 265 | } |
| 266 | bd.DeviceModel = inquiryData.Product |
| 267 | return nil |
| 268 | } |
| 269 | |
| 270 | func (c *hwReportContext) gatherBlockDevices() { |
| 271 | blockDeviceEntries, err := os.ReadDir("/sys/class/block") |
| 272 | if err != nil { |
| 273 | c.errors = append(c.errors, fmt.Errorf("unable to read sysfs block device list: %w", err)) |
| 274 | return |
| 275 | } |
| 276 | for _, bde := range blockDeviceEntries { |
| 277 | sysfsDir := fmt.Sprintf("/sys/class/block/%s", bde.Name()) |
| 278 | if _, err := os.Stat(sysfsDir + "/partition"); err == nil { |
| 279 | // Ignore partitions, we only care about their parents |
| 280 | continue |
| 281 | } |
| 282 | var bd api.BlockDevice |
| 283 | if rotational, err := os.ReadFile(sysfsDir + "/queue/rotational"); err == nil { |
| 284 | if strings.TrimSpace(string(rotational)) == "1" { |
| 285 | bd.Rotational = true |
| 286 | } |
| 287 | } |
| 288 | if sizeRaw, err := os.ReadFile(sysfsDir + "/size"); err == nil { |
| 289 | size, err := strconv.ParseInt(strings.TrimSpace(string(sizeRaw)), 10, 64) |
| 290 | if err != nil { |
| 291 | c.errors = append(c.errors, fmt.Errorf("unable to parse block device %v size: %w", bde.Name(), err)) |
| 292 | } else { |
| 293 | // Linux always defines size in terms of 512 byte blocks regardless |
| 294 | // of what the configured logical and physical block sizes are. |
| 295 | bd.CapacityBytes = size * 512 |
| 296 | } |
| 297 | } |
| 298 | if lbsRaw, err := os.ReadFile(sysfsDir + "/queue/logical_block_size"); err == nil { |
| 299 | lbs, err := strconv.ParseInt(strings.TrimSpace(string(lbsRaw)), 10, 32) |
| 300 | if err != nil { |
| 301 | c.errors = append(c.errors, fmt.Errorf("unable to parse block device %v logical block size: %w", bde.Name(), err)) |
| 302 | } else { |
| 303 | bd.LogicalBlockSizeBytes = int32(lbs) |
| 304 | } |
| 305 | } |
| 306 | if pbsRaw, err := os.ReadFile(sysfsDir + "/queue/physical_block_size"); err == nil { |
| 307 | pbs, err := strconv.ParseInt(strings.TrimSpace(string(pbsRaw)), 10, 32) |
| 308 | if err != nil { |
| 309 | c.errors = append(c.errors, fmt.Errorf("unable to parse physical block size: %w", err)) |
| 310 | } else { |
| 311 | bd.PhysicalBlockSizeBytes = int32(pbs) |
| 312 | } |
| 313 | } |
| 314 | if strings.HasPrefix(bde.Name(), "nvme") { |
| 315 | err := c.gatherNVMe(&bd, bde) |
| 316 | if err != nil { |
| 317 | c.errors = append(c.errors, fmt.Errorf("block device %v: %w", bde.Name(), err)) |
| 318 | } else { |
| 319 | c.node.BlockDevice = append(c.node.BlockDevice, &bd) |
| 320 | } |
| 321 | } |
| 322 | if strings.HasPrefix(bde.Name(), "sd") { |
| 323 | err := c.gatherSCSI(&bd, bde) |
| 324 | if err != nil { |
| 325 | c.errors = append(c.errors, fmt.Errorf("block device %v: %w", bde.Name(), err)) |
| 326 | } else { |
| 327 | c.node.BlockDevice = append(c.node.BlockDevice, &bd) |
| 328 | } |
| 329 | } |
| 330 | if strings.HasPrefix(bde.Name(), "mmcblk") { |
| 331 | // TODO: MMC information |
| 332 | bd.Protocol = api.BlockDevice_MMC |
| 333 | c.node.BlockDevice = append(c.node.BlockDevice, &bd) |
| 334 | } |
| 335 | } |
| 336 | return |
| 337 | } |
| 338 | |
// speedModeRegexp captures the leading decimal number of a link mode name
// that starts with "<number>base". gatherNICs interprets that number as the
// link speed in megabits per second.
var speedModeRegexp = regexp.MustCompile(`^([0-9]+)base`)
| 340 | |
// mbps is one megabit per second expressed in bytes per second; multiplying
// a speed in megabits per second by it yields bytes per second.
const mbps = 1000 * 1000 / 8
| 342 | |
| 343 | func (c *hwReportContext) gatherNICs() { |
| 344 | links, err := netlink.LinkList() |
| 345 | if err != nil { |
| 346 | c.errors = append(c.errors, fmt.Errorf("failed to list network links: %w", err)) |
| 347 | return |
| 348 | } |
| 349 | ethClient, err := ethtool.New() |
| 350 | if err != nil { |
| 351 | c.errors = append(c.errors, fmt.Errorf("failed to get ethtool netlink client: %w", err)) |
| 352 | return |
| 353 | } |
| 354 | defer ethClient.Close() |
| 355 | for _, l := range links { |
| 356 | if l.Type() != "device" || len(l.Attrs().HardwareAddr) == 0 { |
| 357 | // Not a physical device, ignore |
| 358 | continue |
| 359 | } |
| 360 | var nif api.NetworkInterface |
| 361 | nif.Mac = l.Attrs().HardwareAddr |
| 362 | mode, err := ethClient.LinkMode(ethtool.Interface{Index: l.Attrs().Index}) |
| 363 | if err == nil { |
| 364 | if mode.SpeedMegabits < math.MaxInt32 { |
| 365 | nif.CurrentSpeedBytes = int64(mode.SpeedMegabits) * mbps |
| 366 | } |
| 367 | speeds := make(map[int64]bool) |
| 368 | for _, m := range mode.Ours { |
| 369 | // Doing this with a regexp is arguably more future-proof as |
| 370 | // we don't need to add each link mode for the detection to |
| 371 | // work. |
| 372 | modeParts := speedModeRegexp.FindStringSubmatch(m.Name) |
| 373 | if len(modeParts) > 0 { |
| 374 | speedMegabits, err := strconv.ParseInt(modeParts[1], 10, 64) |
| 375 | if err != nil { |
| 376 | c.errors = append(c.errors, fmt.Errorf("nic %v: failed to parse %q as integer: %w", l.Attrs().Name, modeParts[1], err)) |
| 377 | continue |
| 378 | } |
| 379 | speeds[int64(speedMegabits)*mbps] = true |
| 380 | } |
| 381 | } |
| 382 | for s := range speeds { |
| 383 | nif.SupportedSpeedBytes = append(nif.SupportedSpeedBytes, s) |
| 384 | } |
| 385 | // Go randomizes the map keys, sort to make the report stable. |
| 386 | sort.Slice(nif.SupportedSpeedBytes, func(i, j int) bool { return nif.SupportedSpeedBytes[i] > nif.SupportedSpeedBytes[j] }) |
| 387 | } |
| 388 | state, err := ethClient.LinkState(ethtool.Interface{Index: l.Attrs().Index}) |
| 389 | if err == nil { |
| 390 | nif.LinkUp = state.Link |
| 391 | } else { |
| 392 | // We have no ethtool support, fall back to checking if Linux |
| 393 | // thinks the link is up. |
| 394 | nif.LinkUp = l.Attrs().OperState == netlink.OperUp |
| 395 | } |
| 396 | // Linux blocks creation of interfaces which conflict with special path |
| 397 | // characters, so this path assembly is fine. |
| 398 | driverPath, err := os.Readlink("/sys/class/net/" + l.Attrs().Name + "/device/driver") |
| 399 | if err == nil { |
| 400 | nif.Driver = filepath.Base(driverPath) |
| 401 | } |
| 402 | c.node.NetworkInterface = append(c.node.NetworkInterface, &nif) |
| 403 | } |
| 404 | return |
| 405 | } |
| 406 | |
| 407 | func gatherHWReport() (*api.Node, []error) { |
Lorenz Brun | aadeb79 | 2023-03-27 15:53:56 +0200 | [diff] [blame] | 408 | hwReportCtx := hwReportContext{ |
| 409 | node: &api.Node{}, |
| 410 | } |
Lorenz Brun | 1cd2696 | 2023-04-19 16:10:17 +0200 | [diff] [blame] | 411 | hwReportCtx.node.EfiSupport = api.EFISupport_EFI_UNKNOWN |
Lorenz Brun | 6294854 | 2023-01-10 13:28:44 +0000 | [diff] [blame] | 412 | |
| 413 | hwReportCtx.gatherCPU() |
| 414 | hwReportCtx.gatherSMBIOS() |
| 415 | if hwReportCtx.node.MemoryInstalledBytes == 0 { |
| 416 | hwReportCtx.gatherMemorySysfs() |
| 417 | } |
| 418 | var sysinfo unix.Sysinfo_t |
| 419 | if err := unix.Sysinfo(&sysinfo); err != nil { |
| 420 | hwReportCtx.errors = append(hwReportCtx.errors, fmt.Errorf("unable to execute sysinfo syscall: %w", err)) |
| 421 | } else { |
| 422 | hwReportCtx.node.MemoryUsableRatio = float32(sysinfo.Totalram) / float32(hwReportCtx.node.MemoryInstalledBytes) |
| 423 | } |
| 424 | hwReportCtx.gatherNICs() |
| 425 | hwReportCtx.gatherBlockDevices() |
| 426 | |
Lorenz Brun | 1cd2696 | 2023-04-19 16:10:17 +0200 | [diff] [blame] | 427 | if _, err := os.Stat("/sys/firmware/efi/runtime"); err == nil { |
| 428 | hwReportCtx.node.EfiSupport = api.EFISupport_EFI_ENABLED |
| 429 | } |
| 430 | |
Lorenz Brun | 6294854 | 2023-01-10 13:28:44 +0000 | [diff] [blame] | 431 | return hwReportCtx.node, hwReportCtx.errors |
| 432 | } |