c/a/api: reorganize and add AgentInit
Move hardware reporting-related data into a separate file for better
organization.
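Also add an AgentInit message which will be used to pass data to the
Agent, and turn TakeoverResponse into a oneof of TakeoverSuccess and
TakeoverError so that the takeover process can report failures.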
Change-Id: I1eecbd5a78da03170651f76f9f24e134dddaca4f
Reviewed-on: https://review.monogon.dev/c/monogon/+/1140
Reviewed-by: Leopold Schabel <leo@monogon.tech>
Tested-by: Leopold Schabel <leo@monogon.tech>
diff --git a/cloud/agent/api/BUILD.bazel b/cloud/agent/api/BUILD.bazel
index 9312b9d..77dfa4b 100644
--- a/cloud/agent/api/BUILD.bazel
+++ b/cloud/agent/api/BUILD.bazel
@@ -6,9 +6,11 @@
name = "api_proto",
srcs = [
"agent.proto",
+ "hwreport.proto",
"takeover.proto",
],
visibility = ["//visibility:public"],
+ deps = ["//net/proto:net_proto_proto"],
)
go_proto_library(
@@ -16,6 +18,7 @@
importpath = "source.monogon.dev/cloud/agent/api",
proto = ":api_proto",
visibility = ["//visibility:public"],
+ deps = ["//net/proto"],
)
go_library(
diff --git a/cloud/agent/api/agent.proto b/cloud/agent/api/agent.proto
index 129e984..fda4e64 100644
--- a/cloud/agent/api/agent.proto
+++ b/cloud/agent/api/agent.proto
@@ -1,124 +1,18 @@
syntax = "proto3";
package cloud.agent.api;
+import "net/proto/net.proto";
+import "cloud/agent/api/takeover.proto";
option go_package = "source.monogon.dev/cloud/agent/api";
-message BlockDevice {
- // Name of the vendor of the block device
- string vendor = 1;
- // Device model of the block device
- string device_model = 2;
- // Serial number of the block device
- string serial_number = 3;
- // World Wide Name of the block device (not always available)
- bytes wwn = 4;
- // Set if this is a rotational disk
- bool rotational = 5;
-
- // Usable capacity in bytes
- int64 capacity_bytes = 6;
-
- // Logical and physical block size in bytes. Note that on many modern
- // enterprise drives these can be changed.
- int32 logical_block_size_bytes = 7;
- int32 physical_block_size_bytes = 8;
-
- enum Protocol {
- UNKNOWN = 0;
- SCSI = 1;
- ATA = 2;
- NVME = 3;
- MMC = 4;
- }
- Protocol protocol = 9;
-
- // Set if the block device has reasons to believe that it will fail soon.
- // This is entirely controlled by firmware, its accuracy is as good as
- // the vendor has made it.
- bool critical_warning = 10;
-
- // Number of unrecoverable media read errors.
- // On SATA disks this is technically equivalent to Raw_Read_Error_Rate, but
- // only a tiny minority of devices populate that sanely. So instead this is
- // defined as the sum of S.M.A.R.T. attributes 5, 197 and 198.
- optional int64 media_errors = 11;
-
- // Fraction of spare space still available to replace bad blocks.
- // If this reaches zero, the disk generally dies.
- optional float available_spare_ratio = 12;
-
- // Fraction of the estimated life of the device used up.
- // Only considers flash wear, not runtime or similar.
- // Reported by firmware, as accurate as the vendor has made it.
- optional float usage_ratio = 13;
-}
-
-message NetworkInterface {
- // Contains the EUI-48 MAC address of the interface.
- bytes mac = 1;
- // Linux kernel driver which is bound to the interface.
- string driver = 2;
-
- // List of supported speeds in bytes per second.
- repeated int64 supported_speed_bytes = 3;
-
- // Does the interface have an active link.
- bool link_up = 4;
- // Currently-negotiated speed in bytes per second. Unstable on marginal
- // links.
- int64 current_speed_bytes = 5;
-}
-
-message CPU {
- message X86_64 {
- // Family of the CPU, including extended family.
- // For example 6 for Intel's "big" cores.
- int32 family = 1;
- // Model of the CPU, including extended model.
- // For example 154 for ADL-S.
- int32 model = 2;
- // Stepping of the CPU, model-dependent value.
- int32 stepping = 3;
- }
- oneof architecture {
- X86_64 x86_64 = 1;
- // Information specific to other architectures can be added here.
- }
- // Number of hardware threads (including SMT threads, harts, ...) exposed to
- // to the operating system.
- int32 hardware_threads = 9;
- // Number of cores of the CPU. This does not include SMT threads or other
- // equivalent mechanisms to increase logical core count.
- int32 cores = 8;
- // Name of the vendor of the CPU
- string vendor = 10;
- // Name of the model of the CPU
- string model = 11;
-}
-
-message Node {
- // Manufacturer of the system, taken from DMI.
- string manufacturer = 1;
- // Product name, taken from DMI.
- string product = 2;
- // Serial number of the system, taken from DMI.
- string serial_number = 3;
-
- // Amount of physical memory installed, in bytes. Determined using DMI (if
- // available and not marked unusable) or memory blocks in sysfs
- // (/sys/devices/system/memory/...). This is not taken from meminfo as that
- // value is relatively unstable and hard to match to.
- // Assuming a non-terrible firmware implementation this value is expected to
- // be stable.
- int64 memory_installed_bytes = 8;
-
- // Ratio of claimed installed memory which is available to the Linux
- // kernel (taken from sysinfo's totalmem). Note that this value is unstable
- // across kernel versions and even firmware configuration settings and should
- // only be used to detect gross mismatches. 1 means all of the claimed
- // installed memory is available, 0 means none.
- float memory_usable_ratio = 9;
-
- repeated CPU cpu = 10;
- repeated BlockDevice block_device = 11;
- repeated NetworkInterface network_interface = 12;
+// AgentInit contains initialization information passed to the agent from the
+// initial takeover process.
+message AgentInit {
+ // Original takeover init message which contains data to contact the BMaaS
+ // service with.
+ TakeoverInit takeover_init = 1;
+ // The Ed25519 private key to connect to the BMaaS service.
+ bytes private_key = 2;
+ // A network configuration in case automatic configuration does not work or is
+ // not desired. If left unset, automatic configuration is used.
+ net.proto.Net network_config = 3;
}
\ No newline at end of file
diff --git a/cloud/agent/api/hwreport.proto b/cloud/agent/api/hwreport.proto
new file mode 100644
index 0000000..129e984
--- /dev/null
+++ b/cloud/agent/api/hwreport.proto
@@ -0,0 +1,124 @@
+syntax = "proto3";
+package cloud.agent.api;
+option go_package = "source.monogon.dev/cloud/agent/api";
+
+message BlockDevice {
+ // Name of the vendor of the block device
+ string vendor = 1;
+ // Device model of the block device
+ string device_model = 2;
+ // Serial number of the block device
+ string serial_number = 3;
+ // World Wide Name of the block device (not always available)
+ bytes wwn = 4;
+ // Set if this is a rotational disk
+ bool rotational = 5;
+
+ // Usable capacity in bytes
+ int64 capacity_bytes = 6;
+
+ // Logical and physical block size in bytes. Note that on many modern
+ // enterprise drives these can be changed.
+ int32 logical_block_size_bytes = 7;
+ int32 physical_block_size_bytes = 8;
+
+ enum Protocol {
+ UNKNOWN = 0;
+ SCSI = 1;
+ ATA = 2;
+ NVME = 3;
+ MMC = 4;
+ }
+ Protocol protocol = 9;
+
+ // Set if the block device has reasons to believe that it will fail soon.
+ // This is entirely controlled by firmware, its accuracy is as good as
+ // the vendor has made it.
+ bool critical_warning = 10;
+
+ // Number of unrecoverable media read errors.
+ // On SATA disks this is technically equivalent to Raw_Read_Error_Rate, but
+ // only a tiny minority of devices populate that sanely. So instead this is
+ // defined as the sum of S.M.A.R.T. attributes 5, 197 and 198.
+ optional int64 media_errors = 11;
+
+ // Fraction of spare space still available to replace bad blocks.
+ // If this reaches zero, the disk generally dies.
+ optional float available_spare_ratio = 12;
+
+ // Fraction of the estimated life of the device used up.
+ // Only considers flash wear, not runtime or similar.
+ // Reported by firmware, as accurate as the vendor has made it.
+ optional float usage_ratio = 13;
+}
+
+message NetworkInterface {
+ // Contains the EUI-48 MAC address of the interface.
+ bytes mac = 1;
+ // Linux kernel driver which is bound to the interface.
+ string driver = 2;
+
+ // List of supported speeds in bytes per second.
+ repeated int64 supported_speed_bytes = 3;
+
+ // Does the interface have an active link.
+ bool link_up = 4;
+ // Currently-negotiated speed in bytes per second. Unstable on marginal
+ // links.
+ int64 current_speed_bytes = 5;
+}
+
+message CPU {
+ message X86_64 {
+ // Family of the CPU, including extended family.
+ // For example 6 for Intel's "big" cores.
+ int32 family = 1;
+ // Model of the CPU, including extended model.
+ // For example 154 for ADL-S.
+ int32 model = 2;
+ // Stepping of the CPU, model-dependent value.
+ int32 stepping = 3;
+ }
+ oneof architecture {
+ X86_64 x86_64 = 1;
+ // Information specific to other architectures can be added here.
+ }
+ // Number of hardware threads (including SMT threads, harts, ...) exposed to
+ // the operating system.
+ int32 hardware_threads = 9;
+ // Number of cores of the CPU. This does not include SMT threads or other
+ // equivalent mechanisms to increase logical core count.
+ int32 cores = 8;
+ // Name of the vendor of the CPU
+ string vendor = 10;
+ // Name of the model of the CPU
+ string model = 11;
+}
+
+message Node {
+ // Manufacturer of the system, taken from DMI.
+ string manufacturer = 1;
+ // Product name, taken from DMI.
+ string product = 2;
+ // Serial number of the system, taken from DMI.
+ string serial_number = 3;
+
+ // Amount of physical memory installed, in bytes. Determined using DMI (if
+ // available and not marked unusable) or memory blocks in sysfs
+ // (/sys/devices/system/memory/...). This is not taken from meminfo as that
+ // value is relatively unstable and hard to match to.
+ // Assuming a non-terrible firmware implementation this value is expected to
+ // be stable.
+ int64 memory_installed_bytes = 8;
+
+ // Ratio of claimed installed memory which is available to the Linux
+ // kernel (taken from sysinfo's totalmem). Note that this value is unstable
+ // across kernel versions and even firmware configuration settings and should
+ // only be used to detect gross mismatches. 1 means all of the claimed
+ // installed memory is available, 0 means none.
+ float memory_usable_ratio = 9;
+
+ repeated CPU cpu = 10;
+ repeated BlockDevice block_device = 11;
+ repeated NetworkInterface network_interface = 12;
+}
\ No newline at end of file
diff --git a/cloud/agent/api/takeover.proto b/cloud/agent/api/takeover.proto
index 9453df3..47b531f 100644
--- a/cloud/agent/api/takeover.proto
+++ b/cloud/agent/api/takeover.proto
@@ -14,11 +14,26 @@
string bmaas_endpoint = 3;
}
-// TakeoverResponse is the message the takeover process sends back after
-// receiving an TakeoverInit message.
-message TakeoverResponse {
+message TakeoverSuccess {
// init_message is the exact init message the agent received.
TakeoverInit init_message = 1;
// key is the agent's public key.
bytes key = 2;
+ // warning contains a list of non-critical errors which occurred during the
+ // takeover preparation.
+ repeated string warning = 3;
+}
+
+message TakeoverError {
+ // Error message
+ string message = 1;
+}
+
+// TakeoverResponse is the message the takeover process sends back after
+// receiving a TakeoverInit message.
+message TakeoverResponse {
+ oneof result {
+ TakeoverSuccess success = 1;
+ TakeoverError error = 2;
+ }
}
diff --git a/cloud/shepherd/equinix/manager/initializer.go b/cloud/shepherd/equinix/manager/initializer.go
index 6194f0d..a60f352 100644
--- a/cloud/shepherd/equinix/manager/initializer.go
+++ b/cloud/shepherd/equinix/manager/initializer.go
@@ -323,14 +323,23 @@
if err := proto.Unmarshal(stdout, &arsp); err != nil {
return nil, fmt.Errorf("agent reply couldn't be unmarshaled: %w", err)
}
- if !proto.Equal(&imsg, arsp.InitMessage) {
+ var successResp *apb.TakeoverSuccess
+ switch r := arsp.Result.(type) {
+ case *apb.TakeoverResponse_Error:
+ return nil, fmt.Errorf("agent returned error: %v", r.Error.Message)
+ case *apb.TakeoverResponse_Success:
+ successResp = r.Success
+ default:
+ return nil, fmt.Errorf("agent returned unknown result of type %T", arsp.Result)
+ }
+ if !proto.Equal(&imsg, successResp.InitMessage) {
return nil, fmt.Errorf("agent did not send back the init message.")
}
- if len(arsp.Key) != ed25519.PublicKeySize {
+ if len(successResp.Key) != ed25519.PublicKeySize {
return nil, fmt.Errorf("agent key length mismatch.")
}
- klog.Infof("Started the agent (provider ID: %s, key: %s).", d.ID, hex.EncodeToString(arsp.Key))
- return arsp.Key, nil
+ klog.Infof("Started the agent (provider ID: %s, key: %s).", d.ID, hex.EncodeToString(successResp.Key))
+ return successResp.Key, nil
}
// init initializes the server described by t, using BMDB session 'sess' to set
diff --git a/cloud/shepherd/equinix/manager/initializer_test.go b/cloud/shepherd/equinix/manager/initializer_test.go
index f07341c..6e82b98 100644
--- a/cloud/shepherd/equinix/manager/initializer_test.go
+++ b/cloud/shepherd/equinix/manager/initializer_test.go
@@ -21,11 +21,9 @@
// fakeSSHClient is an SSHClient that pretends to start an agent, but in reality
// just responds with what an agent would respond on every execution attempt.
-type fakeSSHClient struct {
-}
+type fakeSSHClient struct{}
-type fakeSSHConnection struct {
-}
+type fakeSSHConnection struct{}
func (f *fakeSSHClient) Dial(ctx context.Context, address, username string, sshkey ssh.Signer, timeout time.Duration) (SSHConnection, error) {
return &fakeSSHConnection{}, nil
@@ -43,8 +41,10 @@
return nil, nil, fmt.Errorf("while generating agent public key: %v", err)
}
arsp := apb.TakeoverResponse{
- InitMessage: &aim,
- Key: pub,
+ Result: &apb.TakeoverResponse_Success{Success: &apb.TakeoverSuccess{
+ InitMessage: &aim,
+ Key: pub,
+ }},
}
arspb, err := proto.Marshal(&arsp)
if err != nil {
diff --git a/cloud/shepherd/equinix/manager/test_agent/main.go b/cloud/shepherd/equinix/manager/test_agent/main.go
index 5dd5ccd..8f29c30 100644
--- a/cloud/shepherd/equinix/manager/test_agent/main.go
+++ b/cloud/shepherd/equinix/manager/test_agent/main.go
@@ -36,8 +36,10 @@
return
}
arsp := apb.TakeoverResponse{
- InitMessage: &aim,
- Key: pub,
+ Result: &apb.TakeoverResponse_Success{Success: &apb.TakeoverSuccess{
+ InitMessage: &aim,
+ Key: pub,
+ }},
}
arspb, err := proto.Marshal(&arsp)
if err != nil {