blob: 775742ff09c2ac59495cd3ca5675a844a26d79d3 [file] [log] [blame]
package nvme
import (
"bytes"
"encoding/binary"
"fmt"
"math/big"
"time"
)
// healthPage represents the raw data from a NVMe Health/SMART page.
// See Figure 93 in the spec.
//
// The struct is decoded directly from the log page buffer with
// encoding/binary, so field order and field sizes ARE the wire format: do
// not reorder, resize or insert fields without checking the spec offsets.
// The byte offsets noted below assume uint128le occupies 16 bytes.
type healthPage struct {
	CriticalWarning         uint8  // byte 0, bit field
	CompositeTemperature    uint16 // bytes 1-2
	AvailableSpare          uint8  // byte 3
	AvailableSpareThreshold uint8  // byte 4
	PercentageUsed          uint8  // byte 5

	_ [26]byte // bytes 6-31, not decoded

	DataUnitsRead               uint128le // bytes 32-47
	DataUnitsWritten            uint128le // bytes 48-63
	HostReadCommands            uint128le // bytes 64-79
	HostWriteCommands           uint128le // bytes 80-95
	ControllerBusyTime          uint128le // bytes 96-111
	PowerCycles                 uint128le // bytes 112-127
	PowerOnHours                uint128le // bytes 128-143
	UnsafeSHutdowns             uint128le // bytes 144-159
	MediaAndDataIntegrityErrors uint128le // bytes 160-175
	ErrorInformationLogEntries  uint128le // bytes 176-191

	WarningCompositeTemperatureTime  uint32 // bytes 192-195
	CriticalCompositeTemperatureTime uint32 // bytes 196-199

	TemperatureSensors [8]uint16 // bytes 200-215

	ThermalMgmtTemperature1TransitionCount uint32 // bytes 216-219
	ThermalMgmtTemperature2TransitionCount uint32 // bytes 220-223

	// The total-time fields follow the transition counts directly; there is
	// no reserved gap between them. Everything from byte 232 up to the end
	// of the 512-byte page is reserved and intentionally not decoded.
	TotalTimeForThermalMgmtTemperature1 uint32 // bytes 224-227
	TotalTimeForThermalMgmtTemperature2 uint32 // bytes 228-231
}
// HealthInfo contains information related to the health of the NVMe device.
// It is the decoded, unit-converted form of the device's health log page as
// returned by GetHealthInfo.
//
// Note that some values might be clamped under highly abnormal circumstances
// as they are reported as 128-bit integers which Go doesn't support.
// For easier handling, values which are very unlikely to exceed 64 bits are
// exposed as 64-bit integers.
type HealthInfo struct {
// AvailableSpareSpaceCritical is set if the available spare capacity has
// fallen below its threshold (bit 0 of the critical warning field).
AvailableSpareSpaceCritical bool
// TemperatureCritical is set if a temperature is outside the acceptable
// operating thresholds (bit 1 of the critical warning field).
TemperatureCritical bool
// MediaCritical is set if significant media or internal issues affect the
// operation of the device (bit 2 of the critical warning field).
MediaCritical bool
// ForcedReadOnly is set if the device is forced into read-only mode due
// to an error (bit 3 of the critical warning field).
ForcedReadOnly bool
// VolatileMemoryBackupFailed is set if the volatile memory backup device
// has failed (bit 4 of the critical warning field).
VolatileMemoryBackupFailed bool
// CompositeTemperatureKelvin contains a derived value representing the
// composite state of controller and namespace/flash temperature.
// The exact mechanism used to derive it is vendor-specific.
CompositeTemperatureKelvin uint16
// AvailableSpare represents the relative amount (0-1) of spare capacity
// still unused.
AvailableSpare float32
// AvailableSpareThreshold represents the vendor-defined threshold which
// AvailableSpare should not fall under. Same 0-1 scale as AvailableSpare.
AvailableSpareThreshold float32
// LifeUsed represents a vendor-defined relative estimate of the life of
// the device which has been used up. It is allowed to exceed 1 and will
// be clamped by the device somewhere between 1.0 and 2.55.
LifeUsed float32
// BytesRead contains the number of bytes read from the device.
// It is derived from a counter of data units, so this value only changes
// in steps of 512,000 bytes (1000 units of 512 bytes).
BytesRead *big.Int
// BytesWritten contains the number of bytes written to the device.
// It is derived from a counter of data units, so this value only changes
// in steps of 512,000 bytes (1000 units of 512 bytes).
BytesWritten *big.Int
// HostReadCommands contains the number of read commands completed by the
// controller.
HostReadCommands *big.Int
// HostWriteCommands contains the number of write commands completed by the
// controller.
HostWriteCommands *big.Int
// ControllerBusyTime contains the cumulative amount of time the controller
// has spent being busy (i.e. having at least one command outstanding on an
// I/O queue). This value is only updated in one-minute increments.
ControllerBusyTime time.Duration
// PowerCycles contains the number of power cycles.
PowerCycles uint64
// PowerOnHours contains the number of hours the controller has been
// powered on. Depending on the vendor implementation it may or may
// not contain time spent in a non-operational power state.
PowerOnHours uint64
// UnsafeShutdowns contains the number of power loss events without
// a prior shutdown notification from the host.
UnsafeShutdowns uint64
// MediaAndDataIntegrityErrors contains the number of occurrences where the
// controller detected an unrecovered data integrity error.
MediaAndDataIntegrityErrors uint64
// ErrorInformationLogEntriesCount contains the number of Error
// Information log entries over the life of the controller.
ErrorInformationLogEntriesCount uint64
// WarningCompositeTemperatureTime contains the amount of time the
// controller is operational while the composite temperature is greater
// than the warning composite threshold. Reported with one-minute
// resolution.
WarningCompositeTemperatureTime time.Duration
// CriticalCompositeTemperatureTime contains the amount of time the
// controller is operational while the composite temperature is greater
// than the critical composite threshold. Reported with one-minute
// resolution.
CriticalCompositeTemperatureTime time.Duration
// TemperatureSensorValues contains the current temperature in Kelvin as
// reported by up to 8 sensors on the device. A value of zero means that
// the given sensor is not available.
TemperatureSensorValues [8]uint16
// ThermalMgmtTemperature1TransitionCount contains the number of times the
// controller transitioned to lower power active power states or performed
// vendor-specific thermal management actions to reduce temperature.
ThermalMgmtTemperature1TransitionCount uint32
// ThermalMgmtTemperature2TransitionCount is the same as
// ThermalMgmtTemperature1TransitionCount, but for "heavier" thermal
// management actions including heavy throttling.
// The actual difference is vendor-specific.
ThermalMgmtTemperature2TransitionCount uint32
// TotalTimeForThermalMgmtTemperature1 contains the total time the
// controller spent under "light" thermal management. Reported with
// one-second resolution.
TotalTimeForThermalMgmtTemperature1 time.Duration
// TotalTimeForThermalMgmtTemperature2 contains the total time the
// controller spent under "heavy" thermal management. Reported with
// one-second resolution.
TotalTimeForThermalMgmtTemperature2 time.Duration
}
// HasCriticalWarning reports whether at least one critical warning flag
// (AvailableSpareSpaceCritical, TemperatureCritical, MediaCritical,
// ForcedReadOnly or VolatileMemoryBackupFailed) is raised.
// A true result means the NVMe medium has reason to believe that data
// availability or integrity is endangered.
func (h *HealthInfo) HasCriticalWarning() bool {
	switch {
	case h.AvailableSpareSpaceCritical,
		h.TemperatureCritical,
		h.MediaCritical,
		h.ForcedReadOnly,
		h.VolatileMemoryBackupFailed:
		return true
	default:
		return false
	}
}
// dataUnit is the number of bytes represented by a single "data unit" of the
// Data Units Read / Data Units Written counters: 1000 units of 512 bytes.
// See Figure 93, Data Units Read.
var dataUnit = new(big.Int).SetInt64(1000 * 512)

// healthLogPage is the log page identifier passed to GetLogPage to retrieve
// the health log page.
const healthLogPage = 0x02
// GetHealthInfo gets health information from the NVMe device's health log page.
//
// It requests log page healthLogPage for GlobalNamespace into a 512-byte
// buffer, decodes the buffer as a little-endian healthPage and converts the
// raw counters into the friendlier units of HealthInfo (fractions instead of
// percentages, bytes instead of data units, time.Duration instead of raw
// minute/second counts).
//
// An error is returned if the log page cannot be fetched or cannot be
// decoded; in both cases the returned *HealthInfo is nil.
func (d *Device) GetHealthInfo() (*HealthInfo, error) {
	var buf [512]byte
	if err := d.GetLogPage(GlobalNamespace, healthLogPage, 0, 0, buf[:]); err != nil {
		return nil, fmt.Errorf("unable to get health log page: %w", err)
	}

	var page healthPage
	if err := binary.Read(bytes.NewReader(buf[:]), binary.LittleEndian, &page); err != nil {
		return nil, fmt.Errorf("unable to parse health log page: %w", err)
	}

	var res HealthInfo

	// Critical warning bit field.
	res.AvailableSpareSpaceCritical = page.CriticalWarning&(1<<0) != 0
	res.TemperatureCritical = page.CriticalWarning&(1<<1) != 0
	res.MediaCritical = page.CriticalWarning&(1<<2) != 0
	res.ForcedReadOnly = page.CriticalWarning&(1<<3) != 0
	res.VolatileMemoryBackupFailed = page.CriticalWarning&(1<<4) != 0

	res.CompositeTemperatureKelvin = page.CompositeTemperature

	// The raw values are percentages; expose them as fractions.
	res.AvailableSpare = float32(page.AvailableSpare) / 100.
	res.AvailableSpareThreshold = float32(page.AvailableSpareThreshold) / 100.
	res.LifeUsed = float32(page.PercentageUsed) / 100.

	// Data unit counters are scaled to bytes; these stay arbitrary-precision
	// so that nothing is lost from the 128-bit raw values.
	res.BytesRead = new(big.Int).Mul(page.DataUnitsRead.BigInt(), dataUnit)
	res.BytesWritten = new(big.Int).Mul(page.DataUnitsWritten.BigInt(), dataUnit)
	res.HostReadCommands = page.HostReadCommands.BigInt()
	res.HostWriteCommands = page.HostWriteCommands.BigInt()

	res.ControllerBusyTime = saturatingDuration(page.ControllerBusyTime.Uint64(), time.Minute)
	res.PowerCycles = page.PowerCycles.Uint64()
	res.PowerOnHours = page.PowerOnHours.Uint64()
	res.UnsafeShutdowns = page.UnsafeSHutdowns.Uint64()
	res.MediaAndDataIntegrityErrors = page.MediaAndDataIntegrityErrors.Uint64()
	res.ErrorInformationLogEntriesCount = page.ErrorInformationLogEntries.Uint64()

	res.WarningCompositeTemperatureTime = saturatingDuration(uint64(page.WarningCompositeTemperatureTime), time.Minute)
	res.CriticalCompositeTemperatureTime = saturatingDuration(uint64(page.CriticalCompositeTemperatureTime), time.Minute)

	res.TemperatureSensorValues = page.TemperatureSensors

	res.ThermalMgmtTemperature1TransitionCount = page.ThermalMgmtTemperature1TransitionCount
	res.ThermalMgmtTemperature2TransitionCount = page.ThermalMgmtTemperature2TransitionCount
	res.TotalTimeForThermalMgmtTemperature1 = saturatingDuration(uint64(page.TotalTimeForThermalMgmtTemperature1), time.Second)
	res.TotalTimeForThermalMgmtTemperature2 = saturatingDuration(uint64(page.TotalTimeForThermalMgmtTemperature2), time.Second)

	return &res, nil
}

// saturatingDuration returns count*unit, clamped to the largest representable
// time.Duration instead of silently wrapping around when the product does not
// fit into an int64. A non-positive unit yields zero.
func saturatingDuration(count uint64, unit time.Duration) time.Duration {
	const maxDuration = time.Duration(1<<63 - 1)
	if unit <= 0 {
		return 0
	}
	if count > uint64(maxDuration/unit) {
		return maxDuration
	}
	return time.Duration(count) * unit
}