blob: 3b82d275709edaa4e78ae5fa1321a88adcc58161 [file] [log] [blame]
Lorenz Brun62948542023-01-10 13:28:44 +00001package main
2
3import (
4 "bufio"
5 "bytes"
6 "fmt"
7 "math"
8 "os"
9 "path/filepath"
10 "regexp"
11 "runtime"
12 "sort"
13 "strconv"
14 "strings"
15
16 "github.com/mdlayher/ethtool"
17 "github.com/vishvananda/netlink"
18 "golang.org/x/sys/unix"
19
20 "source.monogon.dev/cloud/agent/api"
21 "source.monogon.dev/metropolis/pkg/nvme"
22 "source.monogon.dev/metropolis/pkg/scsi"
23 "source.monogon.dev/metropolis/pkg/smbios"
24)
25
26type hwReportContext struct {
27 node *api.Node
28 errors []error
29}
30
31func (c *hwReportContext) gatherSMBIOS() {
32 smbiosFile, err := os.Open("/sys/firmware/dmi/tables/DMI")
33 if err != nil {
34 c.errors = append(c.errors, fmt.Errorf("unable to open SMBIOS table: %w", err))
35 return
36 }
37 defer smbiosFile.Close()
38 smbTbl, err := smbios.Unmarshal(bufio.NewReader(smbiosFile))
39 if err != nil {
40 c.errors = append(c.errors, fmt.Errorf("unable to parse SMBIOS table: %w", err))
41 return
42 }
43 if smbTbl.SystemInformationRaw != nil {
44 c.node.Manufacturer = smbTbl.SystemInformationRaw.Manufacturer
45 c.node.Product = smbTbl.SystemInformationRaw.ProductName
46 c.node.SerialNumber = smbTbl.SystemInformationRaw.SerialNumber
47 }
48 for _, d := range smbTbl.MemoryDevicesRaw {
49 if d.StructureVersion.AtLeast(3, 2) && d.MemoryTechnology != 0x03 {
50 // If MemoryTechnology is available, only count DRAM
51 continue
52 }
53 size, ok := d.SizeBytes()
54 if !ok {
55 continue
56 }
57 c.node.MemoryInstalledBytes += int64(size)
58 }
59 return
60}
61
62var memoryBlockRegexp = regexp.MustCompile("^memory[0-9]+$")
63
64func (c *hwReportContext) gatherMemorySysfs() {
65 blockSizeRaw, err := os.ReadFile("/sys/devices/system/memory/block_size_bytes")
66 if err != nil {
67 c.errors = append(c.errors, fmt.Errorf("unable to read memory block size, CONFIG_MEMORY_HOTPLUG disabled or sandbox?: %w", err))
68 return
69 }
70 blockSize, err := strconv.ParseInt(strings.TrimSpace(string(blockSizeRaw)), 16, 64)
71 if err != nil {
72 c.errors = append(c.errors, fmt.Errorf("failed to parse memory block size (%q): %w", string(blockSizeRaw), err))
73 return
74 }
75 dirEntries, err := os.ReadDir("/sys/devices/system/memory")
76 if err != nil {
77 c.errors = append(c.errors, fmt.Errorf("unable to read sysfs memory devices list: %w", err))
78 return
79 }
80 c.node.MemoryInstalledBytes = 0
81 for _, e := range dirEntries {
82 if memoryBlockRegexp.MatchString(e.Name()) {
83 // This is safe as the regexp does not allow for any dots
84 state, err := os.ReadFile("/sys/devices/system/memory/%s/state")
85 if os.IsNotExist(err) {
86 // Memory hotplug operation raced us
87 continue
88 } else if err != nil {
89 c.errors = append(c.errors, fmt.Errorf("failed to read memory block state for %s: %w", e.Name(), err))
90 continue
91 }
92 if strings.TrimSpace(string(state)) != "online" {
93 // Only count online memory
94 continue
95 }
96 // Each block is one blockSize of memory
97 c.node.MemoryInstalledBytes += blockSize
98 }
99 }
100 return
101}
102
103func parseCpuinfoAMD64(cpuinfoRaw []byte) (*api.CPU, []error) {
104 // Parse line-by-line, each segment is separated by a line with no colon
105 // character, a segment describes a logical processor if it contains
106 // the key "processor". Keep track of all seen core IDs (physical
107 // processors) and processor IDs (logical processors) in a map to fill
108 // into the structure.
109 s := bufio.NewScanner(bytes.NewReader(cpuinfoRaw))
110 var cpu api.CPU
111 scannedVals := make(map[string]string)
112 seenCoreIDs := make(map[string]bool)
113 seenProcessorIDs := make(map[string]bool)
114 processItem := func() error {
115 if _, ok := scannedVals["processor"]; !ok {
116 // Not a cpu, clear data and return
117 scannedVals = make(map[string]string)
118 return nil
119 }
120 seenProcessorIDs[scannedVals["processor"]] = true
121 seenCoreIDs[scannedVals["core id"]] = true
122 cpu.Model = scannedVals["model name"]
123 cpu.Vendor = scannedVals["vendor_id"]
124 family, err := strconv.Atoi(scannedVals["cpu family"])
125 if err != nil {
126 return fmt.Errorf("unable to parse CPU family to int: %v", err)
127 }
128 model, err := strconv.Atoi(scannedVals["model"])
129 if err != nil {
130 return fmt.Errorf("unable to parse CPU model to int: %v", err)
131 }
132 stepping, err := strconv.Atoi(scannedVals["stepping"])
133 if err != nil {
134 return fmt.Errorf("unable to parse CPU stepping to int: %v", err)
135 }
136 cpu.Architecture = &api.CPU_X86_64_{
137 X86_64: &api.CPU_X86_64{
138 Family: int32(family),
139 Model: int32(model),
140 Stepping: int32(stepping),
141 },
142 }
143 scannedVals = make(map[string]string)
144 return nil
145 }
146 var errs []error
147 for s.Scan() {
148 k, v, ok := strings.Cut(s.Text(), ":")
149 // If there is a colon, add property to scannedVals.
150 if ok {
151 scannedVals[strings.TrimSpace(k)] = strings.TrimSpace(v)
152 continue
153 }
154 // Otherwise this is a segment boundary, process the segment.
155 if err := processItem(); err != nil {
156 errs = append(errs, fmt.Errorf("error parsing cpuinfo block: %w", err))
157 }
158 }
159 // Parse the last segment.
160 if err := processItem(); err != nil {
161 errs = append(errs, fmt.Errorf("error parsing cpuinfo block: %w", err))
162 }
163 cpu.Cores = int32(len(seenCoreIDs))
164 cpu.HardwareThreads = int32(len(seenProcessorIDs))
165 return &cpu, errs
166}
167
168func (c *hwReportContext) gatherCPU() {
169 switch runtime.GOARCH {
170 case "amd64":
171 // Currently a rather simple gatherer with no special NUMA handling
172 cpuinfoRaw, err := os.ReadFile("/proc/cpuinfo")
173 if err != nil {
174 c.errors = append(c.errors, fmt.Errorf("unable to read cpuinfo: %w", err))
175 return
176 }
177 cpu, errs := parseCpuinfoAMD64(cpuinfoRaw)
178 c.errors = append(c.errors, errs...)
179 c.node.Cpu = append(c.node.Cpu, cpu)
180 default:
181 // Currently unimplemented, do nothing
182 c.errors = append(c.errors, fmt.Errorf("architecture %v unsupported by CPU gatherer", runtime.GOARCH))
183 }
184 return
185}
186
Lorenz Brunaadeb792023-03-27 15:53:56 +0200187var FRUUnavailable = [16]byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
Lorenz Brun62948542023-01-10 13:28:44 +0000188
189func (c *hwReportContext) gatherNVMe(bd *api.BlockDevice, bde os.DirEntry) error {
190 bd.Protocol = api.BlockDevice_NVME
191 nvmeDev, err := nvme.Open("/dev/" + bde.Name())
192 if err != nil {
193 return fmt.Errorf("unable to open NVMe device: %w", err)
194 }
195 defer nvmeDev.Close()
196 identifyData, err := nvmeDev.Identify()
197 if err != nil {
198 return fmt.Errorf("calling Identify failed: %w", err)
199 }
200 bd.DeviceModel = identifyData.ModelNumber
201 bd.SerialNumber = identifyData.SerialNumber
202 if identifyData.FRUGloballyUniqueIdentifier != FRUUnavailable {
203 bd.Wwn = identifyData.FRUGloballyUniqueIdentifier[:]
204 }
205 if healthInfo, err := nvmeDev.GetHealthInfo(); err == nil {
206 bd.AvailableSpareRatio = &healthInfo.AvailableSpare
207 bd.CriticalWarning = healthInfo.HasCriticalWarning()
Lorenz Brunaadeb792023-03-27 15:53:56 +0200208 mediaErrors := int64(healthInfo.MediaAndDataIntegrityErrors)
Lorenz Brun62948542023-01-10 13:28:44 +0000209 bd.MediaErrors = &mediaErrors
210 bd.UsageRatio = &healthInfo.LifeUsed
211 }
212 return nil
213}
214
215func (c *hwReportContext) gatherSCSI(bd *api.BlockDevice, bde os.DirEntry) error {
216 bd.Protocol = api.BlockDevice_SCSI
217 scsiDev, err := scsi.Open("/dev/" + bde.Name())
218 if err != nil {
219 return fmt.Errorf("unable to open SCSI device: %w", err)
220 }
221 defer scsiDev.Close()
222 inquiryData, err := scsiDev.Inquiry()
223 if err != nil {
224 return fmt.Errorf("failed calling INQUIRY: %w", err)
225 }
226 if serial, err := scsiDev.UnitSerialNumber(); err == nil {
227 bd.SerialNumber = serial
228 }
229
230 // SAT-5 R8 Table 14
231 if inquiryData.Vendor == "ATA" { // ATA device behind SAT
232 bd.Protocol = api.BlockDevice_ATA
233 // TODO: ATA Vendor from WWN if available
234 } else { // Normal SCSI device
235 bd.Vendor = inquiryData.Vendor
236 // Attempt to read defect list to populate media error count
237 var mediaErrors int64
238 if defectsLBA, err := scsiDev.ReadDefectDataLBA(false, true); err == nil {
239 mediaErrors = int64(len(defectsLBA))
240 bd.MediaErrors = &mediaErrors
241 } else if defectsPhysical, err := scsiDev.ReadDefectDataPhysical(false, true); err == nil {
242 mediaErrors = int64(len(defectsPhysical))
243 bd.MediaErrors = &mediaErrors
244 }
245 if mediaHealth, err := scsiDev.SolidStateMediaHealth(); err == nil {
246 used := float32(mediaHealth.PercentageUsedEnduranceIndicator) / 100.
247 bd.UsageRatio = &used
248 }
249 if informationalExceptions, err := scsiDev.GetInformationalExceptions(); err == nil {
250 // Only consider FailurePredictionThresholdExceeded-class sense codes critical.
251 // The second commonly reported error here according to random forums are
252 // Warning-class errors, but looking through these they don't indicate imminent
253 // or even permanent errors.
254 bd.CriticalWarning = informationalExceptions.InformationalSenseCode.IsKey(scsi.FailurePredictionThresholdExceeded)
255 }
256 // SCSI has no reporting of available spares, so this will never be populated
257 }
258 bd.DeviceModel = inquiryData.Product
259 return nil
260}
261
262func (c *hwReportContext) gatherBlockDevices() {
263 blockDeviceEntries, err := os.ReadDir("/sys/class/block")
264 if err != nil {
265 c.errors = append(c.errors, fmt.Errorf("unable to read sysfs block device list: %w", err))
266 return
267 }
268 for _, bde := range blockDeviceEntries {
269 sysfsDir := fmt.Sprintf("/sys/class/block/%s", bde.Name())
270 if _, err := os.Stat(sysfsDir + "/partition"); err == nil {
271 // Ignore partitions, we only care about their parents
272 continue
273 }
274 var bd api.BlockDevice
275 if rotational, err := os.ReadFile(sysfsDir + "/queue/rotational"); err == nil {
276 if strings.TrimSpace(string(rotational)) == "1" {
277 bd.Rotational = true
278 }
279 }
280 if sizeRaw, err := os.ReadFile(sysfsDir + "/size"); err == nil {
281 size, err := strconv.ParseInt(strings.TrimSpace(string(sizeRaw)), 10, 64)
282 if err != nil {
283 c.errors = append(c.errors, fmt.Errorf("unable to parse block device %v size: %w", bde.Name(), err))
284 } else {
285 // Linux always defines size in terms of 512 byte blocks regardless
286 // of what the configured logical and physical block sizes are.
287 bd.CapacityBytes = size * 512
288 }
289 }
290 if lbsRaw, err := os.ReadFile(sysfsDir + "/queue/logical_block_size"); err == nil {
291 lbs, err := strconv.ParseInt(strings.TrimSpace(string(lbsRaw)), 10, 32)
292 if err != nil {
293 c.errors = append(c.errors, fmt.Errorf("unable to parse block device %v logical block size: %w", bde.Name(), err))
294 } else {
295 bd.LogicalBlockSizeBytes = int32(lbs)
296 }
297 }
298 if pbsRaw, err := os.ReadFile(sysfsDir + "/queue/physical_block_size"); err == nil {
299 pbs, err := strconv.ParseInt(strings.TrimSpace(string(pbsRaw)), 10, 32)
300 if err != nil {
301 c.errors = append(c.errors, fmt.Errorf("unable to parse physical block size: %w", err))
302 } else {
303 bd.PhysicalBlockSizeBytes = int32(pbs)
304 }
305 }
306 if strings.HasPrefix(bde.Name(), "nvme") {
307 err := c.gatherNVMe(&bd, bde)
308 if err != nil {
309 c.errors = append(c.errors, fmt.Errorf("block device %v: %w", bde.Name(), err))
310 } else {
311 c.node.BlockDevice = append(c.node.BlockDevice, &bd)
312 }
313 }
314 if strings.HasPrefix(bde.Name(), "sd") {
315 err := c.gatherSCSI(&bd, bde)
316 if err != nil {
317 c.errors = append(c.errors, fmt.Errorf("block device %v: %w", bde.Name(), err))
318 } else {
319 c.node.BlockDevice = append(c.node.BlockDevice, &bd)
320 }
321 }
322 if strings.HasPrefix(bde.Name(), "mmcblk") {
323 // TODO: MMC information
324 bd.Protocol = api.BlockDevice_MMC
325 c.node.BlockDevice = append(c.node.BlockDevice, &bd)
326 }
327 }
328 return
329}
330
331var speedModeRegexp = regexp.MustCompile("^([0-9]+)base")
332
333const mbps = (1000 * 1000) / 8
334
335func (c *hwReportContext) gatherNICs() {
336 links, err := netlink.LinkList()
337 if err != nil {
338 c.errors = append(c.errors, fmt.Errorf("failed to list network links: %w", err))
339 return
340 }
341 ethClient, err := ethtool.New()
342 if err != nil {
343 c.errors = append(c.errors, fmt.Errorf("failed to get ethtool netlink client: %w", err))
344 return
345 }
346 defer ethClient.Close()
347 for _, l := range links {
348 if l.Type() != "device" || len(l.Attrs().HardwareAddr) == 0 {
349 // Not a physical device, ignore
350 continue
351 }
352 var nif api.NetworkInterface
353 nif.Mac = l.Attrs().HardwareAddr
354 mode, err := ethClient.LinkMode(ethtool.Interface{Index: l.Attrs().Index})
355 if err == nil {
356 if mode.SpeedMegabits < math.MaxInt32 {
357 nif.CurrentSpeedBytes = int64(mode.SpeedMegabits) * mbps
358 }
359 speeds := make(map[int64]bool)
360 for _, m := range mode.Ours {
361 // Doing this with a regexp is arguably more future-proof as
362 // we don't need to add each link mode for the detection to
363 // work.
364 modeParts := speedModeRegexp.FindStringSubmatch(m.Name)
365 if len(modeParts) > 0 {
366 speedMegabits, err := strconv.ParseInt(modeParts[1], 10, 64)
367 if err != nil {
368 c.errors = append(c.errors, fmt.Errorf("nic %v: failed to parse %q as integer: %w", l.Attrs().Name, modeParts[1], err))
369 continue
370 }
371 speeds[int64(speedMegabits)*mbps] = true
372 }
373 }
374 for s := range speeds {
375 nif.SupportedSpeedBytes = append(nif.SupportedSpeedBytes, s)
376 }
377 // Go randomizes the map keys, sort to make the report stable.
378 sort.Slice(nif.SupportedSpeedBytes, func(i, j int) bool { return nif.SupportedSpeedBytes[i] > nif.SupportedSpeedBytes[j] })
379 }
380 state, err := ethClient.LinkState(ethtool.Interface{Index: l.Attrs().Index})
381 if err == nil {
382 nif.LinkUp = state.Link
383 } else {
384 // We have no ethtool support, fall back to checking if Linux
385 // thinks the link is up.
386 nif.LinkUp = l.Attrs().OperState == netlink.OperUp
387 }
388 // Linux blocks creation of interfaces which conflict with special path
389 // characters, so this path assembly is fine.
390 driverPath, err := os.Readlink("/sys/class/net/" + l.Attrs().Name + "/device/driver")
391 if err == nil {
392 nif.Driver = filepath.Base(driverPath)
393 }
394 c.node.NetworkInterface = append(c.node.NetworkInterface, &nif)
395 }
396 return
397}
398
399func gatherHWReport() (*api.Node, []error) {
Lorenz Brunaadeb792023-03-27 15:53:56 +0200400 hwReportCtx := hwReportContext{
401 node: &api.Node{},
402 }
Lorenz Brun62948542023-01-10 13:28:44 +0000403
404 hwReportCtx.gatherCPU()
405 hwReportCtx.gatherSMBIOS()
406 if hwReportCtx.node.MemoryInstalledBytes == 0 {
407 hwReportCtx.gatherMemorySysfs()
408 }
409 var sysinfo unix.Sysinfo_t
410 if err := unix.Sysinfo(&sysinfo); err != nil {
411 hwReportCtx.errors = append(hwReportCtx.errors, fmt.Errorf("unable to execute sysinfo syscall: %w", err))
412 } else {
413 hwReportCtx.node.MemoryUsableRatio = float32(sysinfo.Totalram) / float32(hwReportCtx.node.MemoryInstalledBytes)
414 }
415 hwReportCtx.gatherNICs()
416 hwReportCtx.gatherBlockDevices()
417
418 return hwReportCtx.node, hwReportCtx.errors
419}