blob: 87970006330eafd37d477e3434c1e3062b803207 [file] [log] [blame]
Lorenz Brun62948542023-01-10 13:28:44 +00001package main
2
3import (
4 "bufio"
5 "bytes"
6 "fmt"
7 "math"
8 "os"
9 "path/filepath"
10 "regexp"
11 "runtime"
12 "sort"
13 "strconv"
14 "strings"
15
16 "github.com/mdlayher/ethtool"
17 "github.com/vishvananda/netlink"
18 "golang.org/x/sys/unix"
19
20 "source.monogon.dev/cloud/agent/api"
21 "source.monogon.dev/metropolis/pkg/nvme"
22 "source.monogon.dev/metropolis/pkg/scsi"
23 "source.monogon.dev/metropolis/pkg/smbios"
24)
25
26type hwReportContext struct {
27 node *api.Node
28 errors []error
29}
30
31func (c *hwReportContext) gatherSMBIOS() {
32 smbiosFile, err := os.Open("/sys/firmware/dmi/tables/DMI")
33 if err != nil {
34 c.errors = append(c.errors, fmt.Errorf("unable to open SMBIOS table: %w", err))
35 return
36 }
37 defer smbiosFile.Close()
38 smbTbl, err := smbios.Unmarshal(bufio.NewReader(smbiosFile))
39 if err != nil {
40 c.errors = append(c.errors, fmt.Errorf("unable to parse SMBIOS table: %w", err))
41 return
42 }
43 if smbTbl.SystemInformationRaw != nil {
44 c.node.Manufacturer = smbTbl.SystemInformationRaw.Manufacturer
45 c.node.Product = smbTbl.SystemInformationRaw.ProductName
46 c.node.SerialNumber = smbTbl.SystemInformationRaw.SerialNumber
47 }
48 for _, d := range smbTbl.MemoryDevicesRaw {
49 if d.StructureVersion.AtLeast(3, 2) && d.MemoryTechnology != 0x03 {
50 // If MemoryTechnology is available, only count DRAM
51 continue
52 }
53 size, ok := d.SizeBytes()
54 if !ok {
55 continue
56 }
57 c.node.MemoryInstalledBytes += int64(size)
58 }
59 return
60}
61
62var memoryBlockRegexp = regexp.MustCompile("^memory[0-9]+$")
63
64func (c *hwReportContext) gatherMemorySysfs() {
65 blockSizeRaw, err := os.ReadFile("/sys/devices/system/memory/block_size_bytes")
66 if err != nil {
67 c.errors = append(c.errors, fmt.Errorf("unable to read memory block size, CONFIG_MEMORY_HOTPLUG disabled or sandbox?: %w", err))
68 return
69 }
70 blockSize, err := strconv.ParseInt(strings.TrimSpace(string(blockSizeRaw)), 16, 64)
71 if err != nil {
72 c.errors = append(c.errors, fmt.Errorf("failed to parse memory block size (%q): %w", string(blockSizeRaw), err))
73 return
74 }
75 dirEntries, err := os.ReadDir("/sys/devices/system/memory")
76 if err != nil {
77 c.errors = append(c.errors, fmt.Errorf("unable to read sysfs memory devices list: %w", err))
78 return
79 }
80 c.node.MemoryInstalledBytes = 0
81 for _, e := range dirEntries {
82 if memoryBlockRegexp.MatchString(e.Name()) {
83 // This is safe as the regexp does not allow for any dots
84 state, err := os.ReadFile("/sys/devices/system/memory/%s/state")
85 if os.IsNotExist(err) {
86 // Memory hotplug operation raced us
87 continue
88 } else if err != nil {
89 c.errors = append(c.errors, fmt.Errorf("failed to read memory block state for %s: %w", e.Name(), err))
90 continue
91 }
92 if strings.TrimSpace(string(state)) != "online" {
93 // Only count online memory
94 continue
95 }
96 // Each block is one blockSize of memory
97 c.node.MemoryInstalledBytes += blockSize
98 }
99 }
100 return
101}
102
103func parseCpuinfoAMD64(cpuinfoRaw []byte) (*api.CPU, []error) {
104 // Parse line-by-line, each segment is separated by a line with no colon
105 // character, a segment describes a logical processor if it contains
106 // the key "processor". Keep track of all seen core IDs (physical
107 // processors) and processor IDs (logical processors) in a map to fill
108 // into the structure.
109 s := bufio.NewScanner(bytes.NewReader(cpuinfoRaw))
110 var cpu api.CPU
111 scannedVals := make(map[string]string)
112 seenCoreIDs := make(map[string]bool)
113 seenProcessorIDs := make(map[string]bool)
114 processItem := func() error {
115 if _, ok := scannedVals["processor"]; !ok {
116 // Not a cpu, clear data and return
117 scannedVals = make(map[string]string)
118 return nil
119 }
120 seenProcessorIDs[scannedVals["processor"]] = true
121 seenCoreIDs[scannedVals["core id"]] = true
122 cpu.Model = scannedVals["model name"]
123 cpu.Vendor = scannedVals["vendor_id"]
124 family, err := strconv.Atoi(scannedVals["cpu family"])
125 if err != nil {
126 return fmt.Errorf("unable to parse CPU family to int: %v", err)
127 }
128 model, err := strconv.Atoi(scannedVals["model"])
129 if err != nil {
130 return fmt.Errorf("unable to parse CPU model to int: %v", err)
131 }
132 stepping, err := strconv.Atoi(scannedVals["stepping"])
133 if err != nil {
134 return fmt.Errorf("unable to parse CPU stepping to int: %v", err)
135 }
136 cpu.Architecture = &api.CPU_X86_64_{
137 X86_64: &api.CPU_X86_64{
138 Family: int32(family),
139 Model: int32(model),
140 Stepping: int32(stepping),
141 },
142 }
143 scannedVals = make(map[string]string)
144 return nil
145 }
146 var errs []error
147 for s.Scan() {
148 k, v, ok := strings.Cut(s.Text(), ":")
149 // If there is a colon, add property to scannedVals.
150 if ok {
151 scannedVals[strings.TrimSpace(k)] = strings.TrimSpace(v)
152 continue
153 }
154 // Otherwise this is a segment boundary, process the segment.
155 if err := processItem(); err != nil {
156 errs = append(errs, fmt.Errorf("error parsing cpuinfo block: %w", err))
157 }
158 }
159 // Parse the last segment.
160 if err := processItem(); err != nil {
161 errs = append(errs, fmt.Errorf("error parsing cpuinfo block: %w", err))
162 }
163 cpu.Cores = int32(len(seenCoreIDs))
164 cpu.HardwareThreads = int32(len(seenProcessorIDs))
165 return &cpu, errs
166}
167
168func (c *hwReportContext) gatherCPU() {
169 switch runtime.GOARCH {
170 case "amd64":
171 // Currently a rather simple gatherer with no special NUMA handling
172 cpuinfoRaw, err := os.ReadFile("/proc/cpuinfo")
173 if err != nil {
174 c.errors = append(c.errors, fmt.Errorf("unable to read cpuinfo: %w", err))
175 return
176 }
177 cpu, errs := parseCpuinfoAMD64(cpuinfoRaw)
178 c.errors = append(c.errors, errs...)
179 c.node.Cpu = append(c.node.Cpu, cpu)
180 default:
181 // Currently unimplemented, do nothing
182 c.errors = append(c.errors, fmt.Errorf("architecture %v unsupported by CPU gatherer", runtime.GOARCH))
183 }
184 return
185}
186
// FRUUnavailable is the all-zero FRU Globally Unique Identifier. gatherNVMe
// compares the identifier reported by a device against it and only records
// identifiers which differ from it.
var FRUUnavailable [16]byte
190
191func (c *hwReportContext) gatherNVMe(bd *api.BlockDevice, bde os.DirEntry) error {
192 bd.Protocol = api.BlockDevice_NVME
193 nvmeDev, err := nvme.Open("/dev/" + bde.Name())
194 if err != nil {
195 return fmt.Errorf("unable to open NVMe device: %w", err)
196 }
197 defer nvmeDev.Close()
198 identifyData, err := nvmeDev.Identify()
199 if err != nil {
200 return fmt.Errorf("calling Identify failed: %w", err)
201 }
202 bd.DeviceModel = identifyData.ModelNumber
203 bd.SerialNumber = identifyData.SerialNumber
204 if identifyData.FRUGloballyUniqueIdentifier != FRUUnavailable {
205 bd.Wwn = identifyData.FRUGloballyUniqueIdentifier[:]
206 }
207 if healthInfo, err := nvmeDev.GetHealthInfo(); err == nil {
208 bd.AvailableSpareRatio = &healthInfo.AvailableSpare
209 bd.CriticalWarning = healthInfo.HasCriticalWarning()
210 var mediaErrors = int64(healthInfo.MediaAndDataIntegrityErrors)
211 bd.MediaErrors = &mediaErrors
212 bd.UsageRatio = &healthInfo.LifeUsed
213 }
214 return nil
215}
216
217func (c *hwReportContext) gatherSCSI(bd *api.BlockDevice, bde os.DirEntry) error {
218 bd.Protocol = api.BlockDevice_SCSI
219 scsiDev, err := scsi.Open("/dev/" + bde.Name())
220 if err != nil {
221 return fmt.Errorf("unable to open SCSI device: %w", err)
222 }
223 defer scsiDev.Close()
224 inquiryData, err := scsiDev.Inquiry()
225 if err != nil {
226 return fmt.Errorf("failed calling INQUIRY: %w", err)
227 }
228 if serial, err := scsiDev.UnitSerialNumber(); err == nil {
229 bd.SerialNumber = serial
230 }
231
232 // SAT-5 R8 Table 14
233 if inquiryData.Vendor == "ATA" { // ATA device behind SAT
234 bd.Protocol = api.BlockDevice_ATA
235 // TODO: ATA Vendor from WWN if available
236 } else { // Normal SCSI device
237 bd.Vendor = inquiryData.Vendor
238 // Attempt to read defect list to populate media error count
239 var mediaErrors int64
240 if defectsLBA, err := scsiDev.ReadDefectDataLBA(false, true); err == nil {
241 mediaErrors = int64(len(defectsLBA))
242 bd.MediaErrors = &mediaErrors
243 } else if defectsPhysical, err := scsiDev.ReadDefectDataPhysical(false, true); err == nil {
244 mediaErrors = int64(len(defectsPhysical))
245 bd.MediaErrors = &mediaErrors
246 }
247 if mediaHealth, err := scsiDev.SolidStateMediaHealth(); err == nil {
248 used := float32(mediaHealth.PercentageUsedEnduranceIndicator) / 100.
249 bd.UsageRatio = &used
250 }
251 if informationalExceptions, err := scsiDev.GetInformationalExceptions(); err == nil {
252 // Only consider FailurePredictionThresholdExceeded-class sense codes critical.
253 // The second commonly reported error here according to random forums are
254 // Warning-class errors, but looking through these they don't indicate imminent
255 // or even permanent errors.
256 bd.CriticalWarning = informationalExceptions.InformationalSenseCode.IsKey(scsi.FailurePredictionThresholdExceeded)
257 }
258 // SCSI has no reporting of available spares, so this will never be populated
259 }
260 bd.DeviceModel = inquiryData.Product
261 return nil
262}
263
264func (c *hwReportContext) gatherBlockDevices() {
265 blockDeviceEntries, err := os.ReadDir("/sys/class/block")
266 if err != nil {
267 c.errors = append(c.errors, fmt.Errorf("unable to read sysfs block device list: %w", err))
268 return
269 }
270 for _, bde := range blockDeviceEntries {
271 sysfsDir := fmt.Sprintf("/sys/class/block/%s", bde.Name())
272 if _, err := os.Stat(sysfsDir + "/partition"); err == nil {
273 // Ignore partitions, we only care about their parents
274 continue
275 }
276 var bd api.BlockDevice
277 if rotational, err := os.ReadFile(sysfsDir + "/queue/rotational"); err == nil {
278 if strings.TrimSpace(string(rotational)) == "1" {
279 bd.Rotational = true
280 }
281 }
282 if sizeRaw, err := os.ReadFile(sysfsDir + "/size"); err == nil {
283 size, err := strconv.ParseInt(strings.TrimSpace(string(sizeRaw)), 10, 64)
284 if err != nil {
285 c.errors = append(c.errors, fmt.Errorf("unable to parse block device %v size: %w", bde.Name(), err))
286 } else {
287 // Linux always defines size in terms of 512 byte blocks regardless
288 // of what the configured logical and physical block sizes are.
289 bd.CapacityBytes = size * 512
290 }
291 }
292 if lbsRaw, err := os.ReadFile(sysfsDir + "/queue/logical_block_size"); err == nil {
293 lbs, err := strconv.ParseInt(strings.TrimSpace(string(lbsRaw)), 10, 32)
294 if err != nil {
295 c.errors = append(c.errors, fmt.Errorf("unable to parse block device %v logical block size: %w", bde.Name(), err))
296 } else {
297 bd.LogicalBlockSizeBytes = int32(lbs)
298 }
299 }
300 if pbsRaw, err := os.ReadFile(sysfsDir + "/queue/physical_block_size"); err == nil {
301 pbs, err := strconv.ParseInt(strings.TrimSpace(string(pbsRaw)), 10, 32)
302 if err != nil {
303 c.errors = append(c.errors, fmt.Errorf("unable to parse physical block size: %w", err))
304 } else {
305 bd.PhysicalBlockSizeBytes = int32(pbs)
306 }
307 }
308 if strings.HasPrefix(bde.Name(), "nvme") {
309 err := c.gatherNVMe(&bd, bde)
310 if err != nil {
311 c.errors = append(c.errors, fmt.Errorf("block device %v: %w", bde.Name(), err))
312 } else {
313 c.node.BlockDevice = append(c.node.BlockDevice, &bd)
314 }
315 }
316 if strings.HasPrefix(bde.Name(), "sd") {
317 err := c.gatherSCSI(&bd, bde)
318 if err != nil {
319 c.errors = append(c.errors, fmt.Errorf("block device %v: %w", bde.Name(), err))
320 } else {
321 c.node.BlockDevice = append(c.node.BlockDevice, &bd)
322 }
323 }
324 if strings.HasPrefix(bde.Name(), "mmcblk") {
325 // TODO: MMC information
326 bd.Protocol = api.BlockDevice_MMC
327 c.node.BlockDevice = append(c.node.BlockDevice, &bd)
328 }
329 }
330 return
331}
332
// speedModeRegexp extracts the leading decimal number of a link mode name
// which continues with "base" (for example "1000" out of "1000baseT/Full").
// gatherNICs interprets that number as a speed in megabits per second.
var speedModeRegexp = regexp.MustCompile("^([0-9]+)base")

// mbps is the number of bytes per second which corresponds to one megabit per
// second.
const mbps = 1000 * 1000 / 8
336
337func (c *hwReportContext) gatherNICs() {
338 links, err := netlink.LinkList()
339 if err != nil {
340 c.errors = append(c.errors, fmt.Errorf("failed to list network links: %w", err))
341 return
342 }
343 ethClient, err := ethtool.New()
344 if err != nil {
345 c.errors = append(c.errors, fmt.Errorf("failed to get ethtool netlink client: %w", err))
346 return
347 }
348 defer ethClient.Close()
349 for _, l := range links {
350 if l.Type() != "device" || len(l.Attrs().HardwareAddr) == 0 {
351 // Not a physical device, ignore
352 continue
353 }
354 var nif api.NetworkInterface
355 nif.Mac = l.Attrs().HardwareAddr
356 mode, err := ethClient.LinkMode(ethtool.Interface{Index: l.Attrs().Index})
357 if err == nil {
358 if mode.SpeedMegabits < math.MaxInt32 {
359 nif.CurrentSpeedBytes = int64(mode.SpeedMegabits) * mbps
360 }
361 speeds := make(map[int64]bool)
362 for _, m := range mode.Ours {
363 // Doing this with a regexp is arguably more future-proof as
364 // we don't need to add each link mode for the detection to
365 // work.
366 modeParts := speedModeRegexp.FindStringSubmatch(m.Name)
367 if len(modeParts) > 0 {
368 speedMegabits, err := strconv.ParseInt(modeParts[1], 10, 64)
369 if err != nil {
370 c.errors = append(c.errors, fmt.Errorf("nic %v: failed to parse %q as integer: %w", l.Attrs().Name, modeParts[1], err))
371 continue
372 }
373 speeds[int64(speedMegabits)*mbps] = true
374 }
375 }
376 for s := range speeds {
377 nif.SupportedSpeedBytes = append(nif.SupportedSpeedBytes, s)
378 }
379 // Go randomizes the map keys, sort to make the report stable.
380 sort.Slice(nif.SupportedSpeedBytes, func(i, j int) bool { return nif.SupportedSpeedBytes[i] > nif.SupportedSpeedBytes[j] })
381 }
382 state, err := ethClient.LinkState(ethtool.Interface{Index: l.Attrs().Index})
383 if err == nil {
384 nif.LinkUp = state.Link
385 } else {
386 // We have no ethtool support, fall back to checking if Linux
387 // thinks the link is up.
388 nif.LinkUp = l.Attrs().OperState == netlink.OperUp
389 }
390 // Linux blocks creation of interfaces which conflict with special path
391 // characters, so this path assembly is fine.
392 driverPath, err := os.Readlink("/sys/class/net/" + l.Attrs().Name + "/device/driver")
393 if err == nil {
394 nif.Driver = filepath.Base(driverPath)
395 }
396 c.node.NetworkInterface = append(c.node.NetworkInterface, &nif)
397 }
398 return
399}
400
401func gatherHWReport() (*api.Node, []error) {
402 var hwReportCtx hwReportContext
403
404 hwReportCtx.gatherCPU()
405 hwReportCtx.gatherSMBIOS()
406 if hwReportCtx.node.MemoryInstalledBytes == 0 {
407 hwReportCtx.gatherMemorySysfs()
408 }
409 var sysinfo unix.Sysinfo_t
410 if err := unix.Sysinfo(&sysinfo); err != nil {
411 hwReportCtx.errors = append(hwReportCtx.errors, fmt.Errorf("unable to execute sysinfo syscall: %w", err))
412 } else {
413 hwReportCtx.node.MemoryUsableRatio = float32(sysinfo.Totalram) / float32(hwReportCtx.node.MemoryInstalledBytes)
414 }
415 hwReportCtx.gatherNICs()
416 hwReportCtx.gatherBlockDevices()
417
418 return hwReportCtx.node, hwReportCtx.errors
419}