treewide: delete cloud/ except cloud/agent
bmdb is no longer used in production and would be hard to maintain. The only non-stub provider is Equinix Metal, which is shutting down in 2026, and we are moving away from CockroachDB. Keep the agent, which we'll need for Monogon Cloud.
Change-Id: If8b35c3ac8cdeed96a2b1814c0de7607e8acec63
Reviewed-on: https://review.monogon.dev/c/monogon/+/4235
Tested-by: Jenkins CI
Reviewed-by: Leopold Schabel <leo@monogon.tech>
diff --git a/cloud/BUILD.bazel b/cloud/BUILD.bazel
deleted file mode 100644
index cfafaa7..0000000
--- a/cloud/BUILD.bazel
+++ /dev/null
@@ -1,68 +0,0 @@
-load("@aspect_bazel_lib//lib:expand_template.bzl", "expand_template")
-load("@bazel_skylib//rules:write_file.bzl", "write_file")
-load("@rules_multirun//:defs.bzl", "multirun")
-load("@rules_oci//oci:defs.bzl", "oci_push")
-
-write_file(
- name = "tags_tmpl",
- out = "tags.txt.tmpl",
- content = [
- "BUILD_VERSION",
- ],
-)
-
-# Use the value of --embed_label under --stamp, otherwise use a deterministic constant
-# value to ensure cache hits for actions that depend on this.
-expand_template(
- name = "stamped",
- out = "_stamped.tags.txt",
- stamp_substitutions = {"BUILD_VERSION": "{{STABLE_MONOGON_cloud_version}}"},
- substitutions = {"BUILD_VERSION": "0.0.0"},
- template = "tags_tmpl",
-)
-
-oci_push(
- name = "apigw_image",
- image = "//cloud/apigw:apigw_image",
- remote_tags = ":stamped",
- repository = "gcr.io/monogon-infra/cloud/apigw",
-)
-
-oci_push(
- name = "shepherd_equinix",
- image = "//cloud/shepherd/provider/equinix:equinix_image",
- remote_tags = ":stamped",
- repository = "gcr.io/monogon-infra/cloud/shepherd/equinix",
-)
-
-oci_push(
- name = "bmsrv",
- image = "//cloud/bmaas/server/cmd:cmd_image",
- remote_tags = ":stamped",
- repository = "gcr.io/monogon-infra/cloud/bmsrv",
-)
-
-oci_push(
- name = "scruffy",
- image = "//cloud/bmaas/scruffy/cmd:cmd_image",
- remote_tags = ":stamped",
- repository = "gcr.io/monogon-infra/cloud/scruffy",
-)
-
-oci_push(
- name = "shepherd_mini",
- image = "//cloud/shepherd/mini:mini_image",
- remote_tags = ":stamped",
- repository = "gcr.io/monogon-infra/cloud/shepherd/mini",
-)
-
-multirun(
- name = "push",
- commands = [
- ":apigw_image",
- ":shepherd_equinix",
- ":bmsrv",
- ":scruffy",
- ":shepherd_mini",
- ],
-)
diff --git a/cloud/README.md b/cloud/README.md
deleted file mode 100644
index eafb512..0000000
--- a/cloud/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# `//cloud`
-
-`//cloud` contains tooling and libraries for orchestrating Monogon OS clusters.
diff --git a/cloud/agent/BUILD.bazel b/cloud/agent/BUILD.bazel
index bfb2147..48e7d20 100644
--- a/cloud/agent/BUILD.bazel
+++ b/cloud/agent/BUILD.bazel
@@ -14,7 +14,6 @@
visibility = ["//visibility:private"],
deps = [
"//cloud/agent/api",
- "//cloud/bmaas/server/api",
"//metropolis/node/core/devmgr",
"//metropolis/node/core/network",
"//osbase/blockdev",
diff --git a/cloud/agent/agent.go b/cloud/agent/agent.go
index 05b2daf..665351a 100644
--- a/cloud/agent/agent.go
+++ b/cloud/agent/agent.go
@@ -22,7 +22,6 @@
"google.golang.org/protobuf/proto"
apb "source.monogon.dev/cloud/agent/api"
- bpb "source.monogon.dev/cloud/bmaas/server/api"
"source.monogon.dev/metropolis/node/core/devmgr"
"source.monogon.dev/metropolis/node/core/network"
@@ -103,32 +102,32 @@
if err != nil {
return fmt.Errorf("error creating BMaaS gRPC client: %w", err)
}
- c := bpb.NewAgentCallbackClient(conn)
+ c := apb.NewAgentCallbackClient(conn)
supervisor.Signal(ctx, supervisor.SignalHealthy)
- assembleHWReport := func() *bpb.AgentHardwareReport {
+ assembleHWReport := func() *apb.AgentHardwareReport {
report, warnings := gatherHWReport()
var warningStrings []string
for _, w := range warnings {
l.Warningf("Hardware Report Warning: %v", w)
warningStrings = append(warningStrings, w.Error())
}
- return &bpb.AgentHardwareReport{
+ return &apb.AgentHardwareReport{
Report: report,
Warning: warningStrings,
}
}
var sentFirstHeartBeat, hwReportSent bool
- var installationReport *bpb.OSInstallationReport
+ var installationReport *apb.OSInstallationReport
var installationGeneration int64
b := backoff.NewExponentialBackOff()
// Never stop retrying, there is nothing else to do
b.MaxElapsedTime = 0
// Main heartbeat loop
for {
- req := bpb.HeartbeatRequest{
+ req := apb.HeartbeatRequest{
MachineId: agentInit.TakeoverInit.MachineId,
}
if sentFirstHeartBeat && !hwReportSent {
@@ -167,21 +166,21 @@
// This installation request has already been attempted
continue
}
- installationReport = &bpb.OSInstallationReport{
+ installationReport = &apb.OSInstallationReport{
Generation: res.InstallationRequest.Generation,
}
installCtx, cancel := context.WithTimeout(ctx, 15*time.Minute)
if err := install(installCtx, res.InstallationRequest, agentInit.NetworkConfig); err != nil {
l.Errorf("Installation failed: %v", err)
- installationReport.Result = &bpb.OSInstallationReport_Error_{
- Error: &bpb.OSInstallationReport_Error{
+ installationReport.Result = &apb.OSInstallationReport_Error_{
+ Error: &apb.OSInstallationReport_Error{
Error: err.Error(),
},
}
} else {
l.Info("Installation succeeded")
- installationReport.Result = &bpb.OSInstallationReport_Success_{
- Success: &bpb.OSInstallationReport_Success{},
+ installationReport.Result = &apb.OSInstallationReport_Success_{
+ Success: &apb.OSInstallationReport_Success{},
}
}
cancel()
diff --git a/cloud/agent/api/BUILD.bazel b/cloud/agent/api/BUILD.bazel
index 48139f2..45d30e4 100644
--- a/cloud/agent/api/BUILD.bazel
+++ b/cloud/agent/api/BUILD.bazel
@@ -9,6 +9,7 @@
"PACKAGE_VERSION_SUFFIX",
"MESSAGE_PASCAL_CASE",
"ENUM_ZERO_VALUE_SUFFIX",
+ "SERVICE_SUFFIX",
],
protos = [":api_proto"],
use_rules = [
@@ -25,15 +26,22 @@
"takeover.proto",
],
visibility = ["//visibility:public"],
- deps = ["//osbase/net/proto:proto_proto"],
+ deps = [
+ "//metropolis/proto/api:api_proto",
+ "//osbase/net/proto:proto_proto",
+ ],
)
go_proto_library(
name = "api_go_proto",
+ compilers = ["@io_bazel_rules_go//proto:go_grpc"],
importpath = "source.monogon.dev/cloud/agent/api",
proto = ":api_proto",
visibility = ["//visibility:public"],
- deps = ["//osbase/net/proto"],
+ deps = [
+ "//metropolis/proto/api",
+ "//osbase/net/proto",
+ ],
)
go_library(
diff --git a/cloud/agent/api/agent.proto b/cloud/agent/api/agent.proto
index c2ac0f7..f62b2b0 100644
--- a/cloud/agent/api/agent.proto
+++ b/cloud/agent/api/agent.proto
@@ -2,17 +2,111 @@
package cloud.agent.api;
import "osbase/net/proto/net.proto";
import "cloud/agent/api/takeover.proto";
+import "cloud/agent/api/hwreport.proto";
+import "metropolis/proto/api/configuration.proto";
+import "metropolis/proto/api/management.proto";
option go_package = "source.monogon.dev/cloud/agent/api";
// AgentInit contains initialization information passed to the agent from the
// initial takeover process.
message AgentInit {
- // Original takeover init message which contains data to contact the BMaaS
- // service with.
+ // Original takeover init message which contains data to contact the API
+ // server with.
TakeoverInit takeover_init = 1;
- // The Ed25519 private key to connect to the BMaaS service.
+ // The Ed25519 private key to connect to the API server.
bytes private_key = 2;
// A network configuration in case automatic configuration does not work or is
// not desired. If left unset, automatic configuration is used.
osbase.net.proto.Net network_config = 3;
-}
\ No newline at end of file
+}
+
+// AgentCallback runs on the API Server and exposes a gRPC interface to agents
+// running on machines. These APIs are served over TLS using component-style
+// server certificates, but clients are authenticated using ephemeral
+// certificates proving ownership of an agent keypair.
+service AgentCallback {
+ // Heartbeat is called by agents repeatedly to upload a hardware report, signal
+ // liveness and retrieve actions to be performed on a host.
+ //
+ // This isn't a streaming RPC as the current server implementation actually
+ // isn't reactive, so it would have to do its own inner polling to create
+ // a stream of updates. To keep things simple, we instead let the agent decide
+ // on the cadence of updates it wants to keep up with.
+ rpc Heartbeat(HeartbeatRequest) returns (HeartbeatResponse);
+}
+
+message AgentHardwareReport {
+ cloud.agent.api.Node report = 1;
+ // List of human-readable warnings which occurred during hardware report
+ // generation.
+ repeated string warning = 2;
+}
+
+// OSInstallationReport is submitted by the agent to the API server after an
+// OS installation attempt, reporting either success or an error.
+message OSInstallationReport {
+ // generation must be set to the same value as 'generation' in the
+ // OSInstallationRequest which triggered the OS installation.
+ int64 generation = 1;
+
+ // Success is set by the agent when the installation request has been
+ // successfully fulfilled. It is currently empty but is specified as a
+ // message to allow it to be expanded in the future.
+ message Success {}
+ // Error is set by the agent when the installation request could not be
+ // fulfilled due to an error.
+ message Error {
+ // A human-readable message of what went wrong.
+ string error = 1;
+ }
+ oneof result {
+ Success success = 2;
+ Error error = 3;
+ }
+}
+
+message HeartbeatRequest {
+ // MachineID that this agent represents. Technically not necessary since
+ // keypairs between agents should be unique, but this provides an extra layer
+ // of protection against programming bugs.
+ string machine_id = 1;
+ // Optional hardware report to be upserted for this machine. An agent should
+ // submit one at least once after it's started, as early as it can.
+ AgentHardwareReport hardware_report = 2;
+ // Optional installation report to be upserted for this machine. An agent
+ // should submit one after it has attempted to install an operating system
+ // for a given OSInstallationRequest.
+ OSInstallationReport installation_report = 3;
+}
+
+message MetropolisInstallationRequest {
+ reserved 1;
+ // Parameters for fetching the OS image to install.
+ metropolis.proto.api.OSImageRef os_image = 4;
+ // Node parameters to be supplied to the new node. Note that network_config
+ // is automatically filled out if coming from the takeover.
+ metropolis.proto.api.NodeParameters node_parameters = 2;
+ // Name of the block device to be used as the root device for the install.
+ // A list of block devices can be taken from the node hardware report.
+ string root_device = 3;
+}
+
+// OSInstallationRequest is provided to the agent by the API server.
+message OSInstallationRequest {
+ // generation is the 'version' of the OS installation request, and will always
+ // be incremented within the API when a new OS installation request is
+ // submitted. The agent must pipe this through to the OSInstallationReport to
+ // let the rest of the system know which OS installation request it actually
+ // fulfilled.
+ int64 generation = 1;
+ // Selects which operating system installation flow is used.
+ oneof type {
+ MetropolisInstallationRequest metropolis = 2;
+ }
+}
+
+message HeartbeatResponse {
+ // If set, the control plane is requesting the installation of an operating
+ // system.
+ OSInstallationRequest installation_request = 1;
+}
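
For context on the relocated service above, a minimal, hypothetical Go sketch (not part of this change) of an agent-side caller using AgentCallback from its new home in //cloud/agent/api. The endpoint address, transport credentials and machine ID are placeholders; the real agent (see cloud/agent/agent.go above) takes its endpoint from TakeoverInit, authenticates with an ephemeral certificate for its keypair, and retries with exponential backoff.

```go
package main

import (
	"context"
	"log"
	"time"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"

	apb "source.monogon.dev/cloud/agent/api"
)

func main() {
	// Placeholder endpoint and plaintext transport; the real agent uses TLS with
	// its ephemeral agent certificate.
	conn, err := grpc.Dial("127.0.0.1:4242", grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		log.Fatalf("dialing AgentCallback endpoint: %v", err)
	}
	defer conn.Close()
	c := apb.NewAgentCallbackClient(conn)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// First heartbeat: announce the machine ID. Later heartbeats would attach an
	// AgentHardwareReport and, after an installation attempt, an
	// OSInstallationReport carrying the generation of the fulfilled request.
	res, err := c.Heartbeat(ctx, &apb.HeartbeatRequest{MachineId: "machine-1234"})
	if err != nil {
		log.Fatalf("Heartbeat failed: %v", err)
	}
	if ir := res.InstallationRequest; ir != nil {
		log.Printf("OS installation requested, generation %d", ir.Generation)
	}
}
```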
diff --git a/cloud/agent/e2e/BUILD.bazel b/cloud/agent/e2e/BUILD.bazel
index 7df7ad6..91f09ce 100644
--- a/cloud/agent/e2e/BUILD.bazel
+++ b/cloud/agent/e2e/BUILD.bazel
@@ -19,7 +19,6 @@
},
deps = [
"//cloud/agent/api",
- "//cloud/bmaas/server/api",
"//metropolis/proto/api",
"//osbase/oci",
"//osbase/oci/registry",
diff --git a/cloud/agent/e2e/main_test.go b/cloud/agent/e2e/main_test.go
index 8dbce49..dc3108e 100644
--- a/cloud/agent/e2e/main_test.go
+++ b/cloud/agent/e2e/main_test.go
@@ -31,8 +31,8 @@
"google.golang.org/protobuf/proto"
apb "source.monogon.dev/cloud/agent/api"
- bpb "source.monogon.dev/cloud/bmaas/server/api"
mpb "source.monogon.dev/metropolis/proto/api"
+
"source.monogon.dev/osbase/oci"
"source.monogon.dev/osbase/oci/registry"
"source.monogon.dev/osbase/pki"
@@ -63,13 +63,13 @@
}
type fakeServer struct {
- hardwareReport *bpb.AgentHardwareReport
- installationRequest *bpb.OSInstallationRequest
- installationReport *bpb.OSInstallationReport
+ hardwareReport *apb.AgentHardwareReport
+ installationRequest *apb.OSInstallationRequest
+ installationReport *apb.OSInstallationReport
}
-func (f *fakeServer) Heartbeat(ctx context.Context, req *bpb.HeartbeatRequest) (*bpb.HeartbeatResponse, error) {
- var res bpb.HeartbeatResponse
+func (f *fakeServer) Heartbeat(ctx context.Context, req *apb.HeartbeatRequest) (*apb.HeartbeatResponse, error) {
+ var res apb.HeartbeatResponse
if req.HardwareReport != nil {
f.hardwareReport = req.HardwareReport
}
@@ -106,9 +106,9 @@
t.Fatal(err)
}
- f.installationRequest = &bpb.OSInstallationRequest{
+ f.installationRequest = &apb.OSInstallationRequest{
Generation: 5,
- Type: &bpb.OSInstallationRequest_Metropolis{Metropolis: &bpb.MetropolisInstallationRequest{
+ Type: &apb.OSInstallationRequest_Metropolis{Metropolis: &apb.MetropolisInstallationRequest{
OsImage: &mpb.OSImageRef{
Scheme: "http",
Host: registryAddr.String(),
@@ -169,7 +169,7 @@
Certificate: [][]byte{serverCert},
PrivateKey: serverPrivKey,
})))
- bpb.RegisterAgentCallbackServer(s, &f)
+ apb.RegisterAgentCallbackServer(s, &f)
grpcLis, err := net.Listen("tcp", "127.0.0.1:0")
if err != nil {
panic(err)
diff --git a/cloud/agent/install.go b/cloud/agent/install.go
index 6deffab..2ba9859 100644
--- a/cloud/agent/install.go
+++ b/cloud/agent/install.go
@@ -15,7 +15,7 @@
"github.com/cenkalti/backoff/v4"
"google.golang.org/protobuf/proto"
- bpb "source.monogon.dev/cloud/bmaas/server/api"
+ apb "source.monogon.dev/cloud/agent/api"
npb "source.monogon.dev/osbase/net/proto"
"source.monogon.dev/osbase/blockdev"
@@ -32,16 +32,16 @@
// install dispatches OSInstallationRequests to the appropriate installer
// method
-func install(ctx context.Context, req *bpb.OSInstallationRequest, netConfig *npb.Net) error {
+func install(ctx context.Context, req *apb.OSInstallationRequest, netConfig *npb.Net) error {
switch reqT := req.Type.(type) {
- case *bpb.OSInstallationRequest_Metropolis:
+ case *apb.OSInstallationRequest_Metropolis:
return installMetropolis(ctx, reqT.Metropolis, netConfig)
default:
return errors.New("unknown installation request type")
}
}
-func installMetropolis(ctx context.Context, req *bpb.MetropolisInstallationRequest, netConfig *npb.Net) error {
+func installMetropolis(ctx context.Context, req *apb.MetropolisInstallationRequest, netConfig *npb.Net) error {
l := supervisor.Logger(ctx)
// Validate we are running via EFI.
if _, err := os.Stat("/sys/firmware/efi"); os.IsNotExist(err) {
diff --git a/cloud/api/BUILD.bazel b/cloud/api/BUILD.bazel
deleted file mode 100644
index 7d471af..0000000
--- a/cloud/api/BUILD.bazel
+++ /dev/null
@@ -1,38 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
-load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
-load("@rules_proto//proto:defs.bzl", "proto_library")
-load("@rules_proto_grpc_buf//:defs.bzl", "buf_proto_lint_test")
-
-buf_proto_lint_test(
- name = "api_proto_lint_test",
- except_rules = [
- "PACKAGE_VERSION_SUFFIX",
- "SERVICE_SUFFIX",
- ],
- protos = [":api_proto"],
- use_rules = [
- "DEFAULT",
- "COMMENTS",
- ],
-)
-
-proto_library(
- name = "api_proto",
- srcs = ["iam.proto"],
- visibility = ["//visibility:public"],
-)
-
-go_proto_library(
- name = "api_go_proto",
- compilers = ["@io_bazel_rules_go//proto:go_grpc"],
- importpath = "source.monogon.dev/cloud/api",
- proto = ":api_proto",
- visibility = ["//visibility:public"],
-)
-
-go_library(
- name = "api",
- embed = [":api_go_proto"],
- importpath = "source.monogon.dev/cloud/api",
- visibility = ["//visibility:public"],
-)
diff --git a/cloud/api/gomod-generated-placeholder.go b/cloud/api/gomod-generated-placeholder.go
deleted file mode 100644
index da08b30..0000000
--- a/cloud/api/gomod-generated-placeholder.go
+++ /dev/null
@@ -1,4 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package api
diff --git a/cloud/api/iam.proto b/cloud/api/iam.proto
deleted file mode 100644
index 2bfc4ed..0000000
--- a/cloud/api/iam.proto
+++ /dev/null
@@ -1,18 +0,0 @@
-syntax = "proto3";
-package cloud.api;
-option go_package = "source.monogon.dev/cloud/api";
-
-service IAM {
- rpc WhoAmI(WhoAmIRequest) returns (WhoAmIResponse);
-}
-
-message WhoAmIRequest {
-}
-
-message WhoAmIResponse {
- // Opaque identifier (eg. UUID) of the acting account. Immutable.
- string account_id = 1;
- // Primary email address of the acting account. Can change, must not be used
- // as a foreign key in other systems.
- string email = 2;
-}
\ No newline at end of file
diff --git a/cloud/apigw/BUILD.bazel b/cloud/apigw/BUILD.bazel
deleted file mode 100644
index 7d2114d..0000000
--- a/cloud/apigw/BUILD.bazel
+++ /dev/null
@@ -1,34 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library")
-load("@rules_oci//oci:defs.bzl", "oci_image")
-load("@rules_pkg//pkg:tar.bzl", "pkg_tar")
-
-go_library(
- name = "apigw_lib",
- srcs = ["main.go"],
- importpath = "source.monogon.dev/cloud/apigw",
- visibility = ["//visibility:private"],
- deps = [
- "//cloud/apigw/server",
- "@io_k8s_klog_v2//:klog",
- ],
-)
-
-go_binary(
- name = "apigw",
- embed = [":apigw_lib"],
- visibility = ["//visibility:public"],
-)
-
-pkg_tar(
- name = "apigw_layer",
- srcs = [":apigw"],
-)
-
-oci_image(
- name = "apigw_image",
- base = "@distroless_base",
- entrypoint = ["/apigw"],
- tars = [":apigw_layer"],
- visibility = ["//visibility:public"],
- workdir = "/app",
-)
diff --git a/cloud/apigw/main.go b/cloud/apigw/main.go
deleted file mode 100644
index 4d2ca31..0000000
--- a/cloud/apigw/main.go
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package main
-
-import (
- "context"
- "flag"
-
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/apigw/server"
-)
-
-func main() {
- s := &server.Server{}
- s.Config.RegisterFlags()
- flag.Parse()
- if flag.NArg() > 0 {
- klog.Exitf("unexpected positional arguments: %v", flag.Args())
- }
-
- ctx, ctxC := context.WithCancel(context.Background())
- // TODO: context cancel on interrupt.
- _ = ctxC
-
- s.Start(ctx)
- select {}
-}
diff --git a/cloud/apigw/model/BUILD.bazel b/cloud/apigw/model/BUILD.bazel
deleted file mode 100644
index 4720654..0000000
--- a/cloud/apigw/model/BUILD.bazel
+++ /dev/null
@@ -1,28 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
-load("//build/sqlc:sqlc.bzl", "sqlc_go_library")
-
-sqlc_go_library(
- name = "sqlc_model",
- dialect = "cockroachdb",
- importpath = "source.monogon.dev/cloud/apigw/model",
- migrations = glob(["migrations/*.sql"]),
- queries = [
- "queries.sql",
- ],
-)
-
-go_library(
- name = "model",
- srcs = ["migrations.go"],
- embed = [
- ":sqlc_model", # keep
- ],
- embedsrcs = glob(["migrations/*.sql"]), # keep
- importpath = "source.monogon.dev/cloud/apigw/model",
- visibility = ["//visibility:public"],
- deps = [
- "@com_github_golang_migrate_migrate_v4//source",
- "@com_github_golang_migrate_migrate_v4//source/iofs",
- "@com_github_google_uuid//:uuid", # keep
- ],
-)
diff --git a/cloud/apigw/model/migrations.go b/cloud/apigw/model/migrations.go
deleted file mode 100644
index 95b5dcf..0000000
--- a/cloud/apigw/model/migrations.go
+++ /dev/null
@@ -1,18 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package model
-
-import (
- "embed"
-
- "github.com/golang-migrate/migrate/v4/source"
- "github.com/golang-migrate/migrate/v4/source/iofs"
-)
-
-//go:embed migrations/*.sql
-var migrationData embed.FS
-
-func MigrationsSource() (source.Driver, error) {
- return iofs.New(migrationData, "migrations")
-}
diff --git a/cloud/apigw/model/migrations/1663155947_initial.down.sql b/cloud/apigw/model/migrations/1663155947_initial.down.sql
deleted file mode 100644
index 032d6cf..0000000
--- a/cloud/apigw/model/migrations/1663155947_initial.down.sql
+++ /dev/null
@@ -1 +0,0 @@
-DROP TABLE accounts;
\ No newline at end of file
diff --git a/cloud/apigw/model/migrations/1663155947_initial.up.sql b/cloud/apigw/model/migrations/1663155947_initial.up.sql
deleted file mode 100644
index 4812e00..0000000
--- a/cloud/apigw/model/migrations/1663155947_initial.up.sql
+++ /dev/null
@@ -1,17 +0,0 @@
-CREATE TABLE accounts (
- -- Internal account ID. Never changes.
- account_id UUID NOT NULL DEFAULT gen_random_uuid() PRIMARY KEY,
-
- -- Identity used to tied this account to OIDC.
- -- OpenID Connect Core, 2. ID Token: “It MUST NOT exceed 255 ASCII
- -- characters in length”.
- account_oidc_sub STRING(255) NOT NULL UNIQUE,
-
- --- Copy/cache of user data retrieved from OIDC IdP on login. Currently this
- --- is only updated on first login, but we should find a way to trigger
- --- a re-retrieval.
- -- Display name preferred by user.
- -- Self-limiting ourselves to 255 unicode codepoints here. This is also
- -- supposedly what keycloak also defaults to for user attributes.
- account_display_name STRING(255) NOT NULL
-);
\ No newline at end of file
diff --git a/cloud/apigw/model/queries.sql b/cloud/apigw/model/queries.sql
deleted file mode 100644
index 564f91d..0000000
--- a/cloud/apigw/model/queries.sql
+++ /dev/null
@@ -1,13 +0,0 @@
--- name: GetAccountByOIDC :many
-SELECT
- accounts.*
-FROM accounts
-WHERE account_oidc_sub = $1;
-
--- name: InitializeAccountFromOIDC :one
-INSERT INTO accounts (
- account_oidc_sub, account_display_name
-) VALUES (
- $1, $2
-)
-RETURNING *;
\ No newline at end of file
diff --git a/cloud/apigw/server/BUILD.bazel b/cloud/apigw/server/BUILD.bazel
deleted file mode 100644
index a15ff6d..0000000
--- a/cloud/apigw/server/BUILD.bazel
+++ /dev/null
@@ -1,36 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
-
-go_library(
- name = "server",
- srcs = ["server.go"],
- importpath = "source.monogon.dev/cloud/apigw/server",
- visibility = ["//visibility:public"],
- deps = [
- "//cloud/api",
- "//cloud/apigw/model",
- "//cloud/lib/component",
- "@com_github_improbable_eng_grpc_web//go/grpcweb",
- "@io_k8s_klog_v2//:klog",
- "@org_golang_google_grpc//:grpc",
- "@org_golang_google_grpc//codes",
- "@org_golang_google_grpc//credentials/insecure",
- "@org_golang_google_grpc//reflection",
- "@org_golang_google_grpc//status",
- ],
-)
-
-go_test(
- name = "server_test",
- srcs = ["server_test.go"],
- data = [
- "@cockroach",
- ],
- embed = [":server"],
- deps = [
- "//cloud/api",
- "//cloud/apigw/model",
- "//cloud/lib/component",
- "@org_golang_google_grpc//codes",
- "@org_golang_google_protobuf//proto",
- ],
-)
diff --git a/cloud/apigw/server/server.go b/cloud/apigw/server/server.go
deleted file mode 100644
index d075e98..0000000
--- a/cloud/apigw/server/server.go
+++ /dev/null
@@ -1,128 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package server
-
-import (
- "context"
- "errors"
- "flag"
- "net"
- "net/http"
-
- "github.com/improbable-eng/grpc-web/go/grpcweb"
- "google.golang.org/grpc"
- "google.golang.org/grpc/codes"
- "google.golang.org/grpc/credentials/insecure"
- "google.golang.org/grpc/reflection"
- "google.golang.org/grpc/status"
- "k8s.io/klog/v2"
-
- apb "source.monogon.dev/cloud/api"
- "source.monogon.dev/cloud/apigw/model"
- "source.monogon.dev/cloud/lib/component"
-)
-
-// Config is the main configuration of the apigw server. It's usually populated
-// from flags via RegisterFlags, but can also be set manually (eg. in tests).
-type Config struct {
- Component component.ComponentConfig
- Database component.CockroachConfig
-
- PublicListenAddress string
-}
-
-// RegisterFlags registers the component configuration to be provided by flags.
-// This must be called exactly once before then calling flags.Parse().
-func (c *Config) RegisterFlags() {
- c.Component.RegisterFlags("apigw")
- c.Database.RegisterFlags("apigw_db")
- flag.StringVar(&c.PublicListenAddress, "apigw_public_grpc_listen_address", ":8080", "Address to listen at for public/user gRPC connections for apigw")
-}
-
-// Server runs the apigw server. It listens on two interfaces:
-// - Internal gRPC, which is authenticated using TLS and authorized by CA. This
-// is to be used for internal RPCs, eg. management/debug.
-// - Public gRPC-Web, which is currently unauthenticated.
-type Server struct {
- Config Config
-
- // ListenGRPC will contain the address at which the internal gRPC server is
- // listening after .Start() has been called. This can differ from the configured
- // value if the configuration requests any port (via :0).
- ListenGRPC string
- // ListenPublic will contain the address at which the public API server is
- // listening after .Start() has been called. This can differ from the configured
- // value if the configuration requests any port (via :0).
- ListenPublic string
-}
-
-func (s *Server) startInternalGRPC(ctx context.Context) {
- g := grpc.NewServer(s.Config.Component.GRPCServerOptions()...)
- lis, err := net.Listen("tcp", s.Config.Component.GRPCListenAddress)
- if err != nil {
- klog.Exitf("Could not listen: %v", err)
- }
- s.ListenGRPC = lis.Addr().String()
-
- reflection.Register(g)
-
- klog.Infof("Internal gRPC listening on %s", s.ListenGRPC)
- go func() {
- err := g.Serve(lis)
- if !errors.Is(err, ctx.Err()) {
- klog.Exitf("Internal gRPC serve failed: %v", err)
- }
- }()
-}
-
-func (s *Server) startPublic(ctx context.Context) {
- g := grpc.NewServer(grpc.Creds(insecure.NewCredentials()))
- lis, err := net.Listen("tcp", s.Config.PublicListenAddress)
- if err != nil {
- klog.Exitf("Could not listen: %v", err)
- }
- s.ListenPublic = lis.Addr().String()
-
- reflection.Register(g)
- apb.RegisterIAMServer(g, s)
-
- wrapped := grpcweb.WrapServer(g)
- server := http.Server{
- Addr: s.Config.PublicListenAddress,
- Handler: http.HandlerFunc(wrapped.ServeHTTP),
- }
- klog.Infof("Public API listening on %s", s.ListenPublic)
- go func() {
- err := server.Serve(lis)
- if !errors.Is(err, ctx.Err()) {
- klog.Exitf("Public API serve failed: %v", err)
- }
- }()
-}
-
-// Start runs the two listeners of the server. The process will fail (via
-// klog.Exit) if any of the listeners/servers fail to start.
-func (s *Server) Start(ctx context.Context) {
- if s.Config.Database.Migrations == nil {
- klog.Infof("Using default migrations source.")
- m, err := model.MigrationsSource()
- if err != nil {
- klog.Exitf("failed to prepare migrations source: %v", err)
- }
- s.Config.Database.Migrations = m
- }
-
- klog.Infof("Running migrations...")
- if err := s.Config.Database.MigrateUp(); err != nil {
- klog.Exitf("Migrations failed: %v", err)
- }
- klog.Infof("Migrations done.")
- s.startInternalGRPC(ctx)
- s.startPublic(ctx)
-}
-
-func (s *Server) WhoAmI(ctx context.Context, req *apb.WhoAmIRequest) (*apb.WhoAmIResponse, error) {
- klog.Infof("req: %+v", req)
- return nil, status.Error(codes.Unimplemented, "unimplemented")
-}
diff --git a/cloud/apigw/server/server_test.go b/cloud/apigw/server/server_test.go
deleted file mode 100644
index 1455f63..0000000
--- a/cloud/apigw/server/server_test.go
+++ /dev/null
@@ -1,129 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package server
-
-import (
- "bytes"
- "context"
- "encoding/binary"
- "fmt"
- "net/http"
- "strconv"
- "testing"
-
- "google.golang.org/grpc/codes"
- "google.golang.org/protobuf/proto"
-
- apb "source.monogon.dev/cloud/api"
- "source.monogon.dev/cloud/apigw/model"
- "source.monogon.dev/cloud/lib/component"
-)
-
-func dut() *Server {
- return &Server{
- Config: Config{
- Component: component.ComponentConfig{
- GRPCListenAddress: ":0",
- DevCerts: true,
- DevCertsPath: "/tmp/foo",
- },
- Database: component.CockroachConfig{
- InMemory: true,
- },
- PublicListenAddress: ":0",
- },
- }
-}
-
-// TestPublicSimple ensures the public grpc-web listener is working.
-func TestPublicSimple(t *testing.T) {
- s := dut()
- ctx := context.Background()
- s.Start(ctx)
-
- // Craft a gRPC-Web request from scratch. There doesn't seem to be a
- // well-supported library to do this.
-
- // The request is \0 ++ uint32be(len(req)) ++ req.
- msgBytes, err := proto.Marshal(&apb.WhoAmIRequest{})
- if err != nil {
- t.Fatalf("Could not marshal request body: %v", err)
- }
- buf := bytes.NewBuffer(nil)
- binary.Write(buf, binary.BigEndian, byte(0))
- binary.Write(buf, binary.BigEndian, uint32(len(msgBytes)))
- buf.Write(msgBytes)
-
- // Perform the request. Set minimum headers required for gRPC-Web to recognize
- // this as a gRPC-Web request.
- req, err := http.NewRequest("POST", fmt.Sprintf("http://%s/cloud.api.IAM/WhoAmI", s.ListenPublic), buf)
- if err != nil {
- t.Fatalf("Could not create request: %v", err)
- }
- req.Header.Set("Content-Type", "application/grpc-web+proto")
- req.Header.Set("X-Grpc-Web", "1")
-
- res, err := http.DefaultClient.Do(req)
- if err != nil {
- t.Fatalf("Could not perform request: %v", err)
- }
- // Regardless for RPC status, 200 should always be returned.
- if want, got := 200, res.StatusCode; want != got {
- t.Errorf("Wanted code %d, got %d", want, got)
- }
-
- // Expect endpoint to return 'unimplemented'.
- code, _ := strconv.Atoi(res.Header.Get("Grpc-Status"))
- if want, got := uint32(codes.Unimplemented), uint32(code); want != got {
- t.Errorf("Wanted code %d, got %d", want, got)
- }
- if want, got := "unimplemented", res.Header.Get("Grpc-Message"); want != got {
- t.Errorf("Wanted message %q, got %q", want, got)
- }
-}
-
-// TestUserSimple makes sure we can add and retrieve users. This is a low-level
-// test which mostly exercises the machinery to bring up a working database in
-// tests.
-func TestUserSimple(t *testing.T) {
- s := dut()
- ctx := context.Background()
- s.Start(ctx)
-
- db, err := s.Config.Database.Connect()
- if err != nil {
- t.Fatalf("Connecting to the database failed: %v", err)
- }
- q := model.New(db)
-
- // Start out with no account by sub 'test'.
- accounts, err := q.GetAccountByOIDC(ctx, "test")
- if err != nil {
- t.Fatalf("Retrieving accounts failed: %v", err)
- }
- if want, got := 0, len(accounts); want != got {
- t.Fatalf("Expected no accounts at first, got %d", got)
- }
-
- // Create a new test account for sub 'test'.
- _, err = q.InitializeAccountFromOIDC(ctx, model.InitializeAccountFromOIDCParams{
- AccountOidcSub: "test",
- AccountDisplayName: "Test User",
- })
- if err != nil {
- t.Fatalf("Creating new account failed: %v", err)
- }
-
- // Expect this account to be available now.
- accounts, err = q.GetAccountByOIDC(ctx, "test")
- if err != nil {
- t.Fatalf("Retrieving accounts failed: %v", err)
- }
- if want, got := 1, len(accounts); want != got {
- t.Fatalf("Expected exactly one account after creation, got %d", got)
- }
- if want, got := "Test User", accounts[0].AccountDisplayName; want != got {
- t.Fatalf("Expected to read back display name %q, got %q", want, got)
- }
-}
diff --git a/cloud/bmaas/DEPLOYING.md b/cloud/bmaas/DEPLOYING.md
deleted file mode 100644
index a1326f6..0000000
--- a/cloud/bmaas/DEPLOYING.md
+++ /dev/null
@@ -1,40 +0,0 @@
-Schema/Version compatibility
-===
-
-Live migration
----
-
-BMaaS supports live migrating schemas. On startup, every component using the BMaaS
-will attempt to migrate the database up to the newest version of the schema it
-was built with.
-
-Components are implemented to support a range of schemas, and operators should
-sequence upgrades in the following way:
-
-1. Make sure that all components are at the newest possible CL, but not so new
- that they ship a newer version of the schema than is currently running.
-2. Upgrade components in a rolling fashion to a CL version that ships the newest
- possible schema version which is still compatible with the previous CL
- versions of the components.
-3. Repeat from point 1 until the newest wanted CL version is running.
-
-| ID | Schema range | CL range | Notes |
-|----|---------------|----------|------------------------------|
-| 0 | < 1672749980 | >= 0 | Initial production schema. |
-| 1 | >= 1672768890 | >= 1565 | Exponential backoff support. |
-
-For example, if the cluster is at version 1200, it should first be upgraded to
-< 1565 (to reach row 0), then to anything higher than 1565 (to reach row 1).
-
-Offline migration
----
-
-For simple deployments, an offline migration is easiest. To perform an offline migration:
-
-1. Turn down all BMaaS components that communicate with the BMDB.
-2. Upgrade all components to the newer version (either newest or otherwise
- wanted, but all components have to be at the same CL version).
-3. Turn up a single component of BMaaS torn down in 1., making sure the database is migrated.
-4. Turn up the rest of the components.
-
-This allows migrating across many incompatible schema migrations, but requires downtime.
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/BUILD.bazel b/cloud/bmaas/bmdb/BUILD.bazel
deleted file mode 100644
index 64f0df4..0000000
--- a/cloud/bmaas/bmdb/BUILD.bazel
+++ /dev/null
@@ -1,50 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
-
-go_library(
- name = "bmdb",
- srcs = [
- "bmdb.go",
- "connection.go",
- "sessions.go",
- ],
- importpath = "source.monogon.dev/cloud/bmaas/bmdb",
- visibility = ["//visibility:public"],
- deps = [
- "//cloud/bmaas/bmdb/metrics",
- "//cloud/bmaas/bmdb/model",
- "//cloud/bmaas/bmdb/reflection",
- "//cloud/lib/component",
- "@com_github_cockroachdb_cockroach_go_v2//crdb",
- "@com_github_google_uuid//:uuid",
- "@com_github_lib_pq//:pq",
- "@com_github_prometheus_client_golang//prometheus",
- "@io_k8s_klog_v2//:klog",
- ],
-)
-
-go_test(
- name = "bmdb_test",
- srcs = [
- "backoff_test.go",
- "migrations_test.go",
- "queries_test.go",
- "reflection_test.go",
- "sessions_test.go",
- ],
- data = [
- "@cockroach",
- ],
- embed = [":bmdb"],
- # TODO: https://github.com/monogon-dev/monogon/issues/213
- flaky = True,
- deps = [
- "//cloud/agent/api",
- "//cloud/bmaas/bmdb/model",
- "//cloud/bmaas/bmdb/reflection",
- "//cloud/bmaas/server/api",
- "//cloud/lib/component",
- "@com_github_google_go_cmp//cmp",
- "@com_github_google_uuid//:uuid",
- "@org_golang_google_protobuf//proto",
- ],
-)
diff --git a/cloud/bmaas/bmdb/backoff_test.go b/cloud/bmaas/bmdb/backoff_test.go
deleted file mode 100644
index 19d3f51..0000000
--- a/cloud/bmaas/bmdb/backoff_test.go
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package bmdb
-
-import (
- "testing"
- "time"
-
- "github.com/google/go-cmp/cmp"
-)
-
-// TestBackoffMath exercises the rules of Backoff.
-func TestBackoffMath(t *testing.T) {
- for _, te := range []struct {
- name string
- b *Backoff
- existing *existingBackoff
- wantSecs []int64
- }{
- {"NoBackoffSet", nil, nil, []int64{1, 1, 1}},
- {"EmptyBackoff", &Backoff{}, nil, []int64{1, 1, 1}},
- {"SimpleBackoff", &Backoff{Initial: time.Minute}, nil, []int64{60, 60, 60}},
- {"ExponentialWithMax",
- &Backoff{Initial: time.Minute, Exponent: 1.1, Maximum: time.Minute * 2},
- nil,
- []int64{60, 66, 73, 81, 90, 99, 109, 120, 120},
- },
-
- {"SimpleOverridePrevious",
- &Backoff{Initial: time.Minute},
- &existingBackoff{lastInterval: time.Second * 2},
- []int64{60, 60, 60},
- },
- {"ExponentialOverridePrevious",
- &Backoff{Initial: time.Minute, Exponent: 2.0, Maximum: time.Minute * 2},
- &existingBackoff{lastInterval: time.Second * 2},
- []int64{4, 8, 16, 32, 64, 120, 120},
- },
-
- {"ContinueExisting", nil, &existingBackoff{lastInterval: time.Minute}, []int64{60, 60, 60}},
- {"ContinueExistingInvalid1", nil, &existingBackoff{lastInterval: 0}, []int64{1, 1, 1}},
- {"ContinueExistingInvalid2", nil, &existingBackoff{lastInterval: time.Millisecond}, []int64{1, 1, 1}},
-
- {"InvalidBackoff1", &Backoff{Exponent: 0.2}, nil, []int64{1, 1, 1}},
- {"InvalidBackoff2", &Backoff{Maximum: time.Millisecond, Initial: time.Millisecond}, nil, []int64{1, 1, 1}},
- } {
- t.Run(te.name, func(t *testing.T) {
- existing := te.existing
-
- gotSecs := make([]int64, len(te.wantSecs))
- for j := 0; j < len(te.wantSecs); j++ {
- gotSecs[j] = te.b.next(existing)
- existing = &existingBackoff{
- lastInterval: time.Duration(gotSecs[j]) * time.Second,
- }
- }
-
- if diff := cmp.Diff(te.wantSecs, gotSecs); diff != "" {
- t.Errorf("Difference: %s", diff)
- }
- })
- }
-}
diff --git a/cloud/bmaas/bmdb/bmdb.go b/cloud/bmaas/bmdb/bmdb.go
deleted file mode 100644
index 400aa4a..0000000
--- a/cloud/bmaas/bmdb/bmdb.go
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-// Package bmdb implements a connector to the Bare Metal Database, which is the
-// main data store backing information about bare metal machines.
-//
-// All components of the BMaaS project connect directly to the underlying
-// CockroachDB database storing this data via this library. In the future, this
-// library might turn into a shim which instead connects to a coordinator
-// service over gRPC.
-package bmdb
-
-import (
- "github.com/prometheus/client_golang/prometheus"
-
- "source.monogon.dev/cloud/bmaas/bmdb/metrics"
- "source.monogon.dev/cloud/lib/component"
-)
-
-// BMDB is the Bare Metal Database, a common schema to store information about
-// bare metal machines in CockroachDB. This struct is supposed to be
-// embedded/contained by different components that interact with the BMDB, and
-// provides a common interface to BMDB operations to these components.
-//
-// The BMDB provides two mechanisms facilitating a 'reactive work system' being
-// implemented on the bare metal machine data:
-//
-// - Sessions, which are maintained by heartbeats by components and signal the
-// liveness of said components to other components operating on the BMDB. These
-// effectively extend CockroachDB's transactions to be visible as row data. Any
-// session that is not actively being updated by a component can be expired by a
-// component responsible for lease garbage collection.
-// - Work locking, which bases on Sessions and allows long-standing
-// multi-transaction work to be performed on given machines, preventing
-// conflicting work from being performed by other components. As both Work
-// locking and Sessions are plain row data, other components can use SQL queries
-// to exclude machines to act on by constraining SELECT queries to not return
-// machines with some active work being performed on them.
-type BMDB struct {
- Config
-
- metrics *metrics.MetricsSet
-}
-
-// Config is the configuration of the BMDB connector.
-type Config struct {
- Database component.CockroachConfig
-
- // ComponentName is a human-readable name of the component connecting to the
- // BMDB, and is stored in any Sessions managed by this component's connector.
- ComponentName string
- // RuntimeInfo is a human-readable 'runtime information' (eg. software version,
- // host machine/job information, IP address, etc.) stored alongside the
- // ComponentName in active Sessions.
- RuntimeInfo string
-}
-
-// EnableMetrics configures BMDB metrics collection and registers it on the given
-// registry. This method should only be called once, and is not goroutine safe.
-func (b *BMDB) EnableMetrics(registry *prometheus.Registry) {
- if b.metrics == nil {
- b.metrics = metrics.New(registry)
- }
-}
diff --git a/cloud/bmaas/bmdb/connection.go b/cloud/bmaas/bmdb/connection.go
deleted file mode 100644
index 8024eb8..0000000
--- a/cloud/bmaas/bmdb/connection.go
+++ /dev/null
@@ -1,97 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package bmdb
-
-import (
- "context"
- "database/sql"
- "fmt"
-
- "github.com/google/uuid"
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/cloud/bmaas/bmdb/reflection"
-)
-
-// Open creates a new Connection to the BMDB for the calling component. Multiple
-// connections can be opened (although there is no advantage to doing so, as
-// Connections manage an underlying CockroachDB connection pool, which performs
-// required reconnects and connection pooling automatically).
-func (b *BMDB) Open(migrate bool) (*Connection, error) {
- if b.Config.Database.Migrations == nil {
- klog.Infof("Using default migrations source.")
- m, err := model.MigrationsSource()
- if err != nil {
- klog.Exitf("failed to prepare migrations source: %v", err)
- }
- b.Config.Database.Migrations = m
- }
- if migrate {
- if err := b.Database.MigrateUp(); err != nil {
- return nil, fmt.Errorf("migration failed: %w", err)
- }
- }
- db, err := b.Database.Connect()
- if err != nil {
- return nil, err
- }
- return &Connection{
- bmdb: b,
- db: db,
-
- DatabaseName: b.Config.Database.DatabaseName,
- Address: b.Config.Database.EndpointHost,
- InMemory: b.Config.Database.InMemory,
- }, nil
-}
-
-// Connection to the BMDB. Internally, this contains a sql.DB connection pool,
-// so components can (and likely should) reuse Connections as much as possible
-// internally.
-type Connection struct {
- bmdb *BMDB
- db *sql.DB
-
- // The database name that we're connected to.
- DatabaseName string
- // The address of the CockroachDB endpoint we've connected to.
- Address string
- // Whether this connection is to an in-memory database. Note: this only works if
- // this Connection came directly from calling Open on a BMDB that was defined to
- // be in-memory. If you just connect to an in-memory CRDB manually, this will
- // still be false.
- InMemory bool
-}
-
-// Reflect returns a reflection.Schema as detected by inspecting the table
-// information of this connection to the BMDB. The Schema can then be used to
-// retrieve arbitrary tag/machine information without going through the
-// concurrency/ordering mechanism of the BMDB.
-//
-// This should only be used to implement debugging tooling and should absolutely
-// not be in the path of any user requests.
-//
-// This Connection will be used not only to query the Schema information, but
-// also for all subsequent data retrieval operations on it. Please ensure that
-// the Schema is rebuilt in the event of a database connection failure. Ideally,
-// you should be rebuilding the schema often, to follow what is currently
-// available on the production database - but not for every request. Use a cache
-// or something.
-func (c *Connection) Reflect(ctx context.Context) (*reflection.Schema, error) {
- return reflection.Reflect(ctx, c.db)
-}
-
-// ListHistoryOf retrieves a full audit history of a machine, sorted
-// chronologically. It can be read without a session / transaction for debugging
-// purposes.
-func (c *Connection) ListHistoryOf(ctx context.Context, machine uuid.UUID) ([]model.WorkHistory, error) {
- return model.New(c.db).ListHistoryOf(ctx, machine)
-}
-
-// GetSession retrieves all information about a session. It can be read without a
-// session/transaction for debugging purposes.
-func (c *Connection) GetSession(ctx context.Context, session uuid.UUID) ([]model.Session, error) {
- return model.New(c.db).GetSession(ctx, session)
-}
diff --git a/cloud/bmaas/bmdb/metrics/BUILD.bazel b/cloud/bmaas/bmdb/metrics/BUILD.bazel
deleted file mode 100644
index d5eb8a5..0000000
--- a/cloud/bmaas/bmdb/metrics/BUILD.bazel
+++ /dev/null
@@ -1,12 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
-
-go_library(
- name = "metrics",
- srcs = ["metrics.go"],
- importpath = "source.monogon.dev/cloud/bmaas/bmdb/metrics",
- visibility = ["//visibility:public"],
- deps = [
- "//cloud/bmaas/bmdb/model",
- "@com_github_prometheus_client_golang//prometheus",
- ],
-)
diff --git a/cloud/bmaas/bmdb/metrics/metrics.go b/cloud/bmaas/bmdb/metrics/metrics.go
deleted file mode 100644
index 242ac2d..0000000
--- a/cloud/bmaas/bmdb/metrics/metrics.go
+++ /dev/null
@@ -1,216 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-// Package metrics implements a Prometheus metrics submission interface for BMDB
-// client components. A Metrics object can be attached to a BMDB object, which
-// will make all BMDB sessions/transactions/work statistics be submitted to that
-// Metrics object.
-package metrics
-
-import (
- "github.com/prometheus/client_golang/prometheus"
-
- "source.monogon.dev/cloud/bmaas/bmdb/model"
-)
-
-// Processor describes some cloud component and possibly sub-component which acts
-// upon the BMDB. When starting a BMDB session, this Processor can be provided to
-// contextualize the metrics emitted by this session. Because the selected
-// Processor ends up directly as a Prometheus metric label, it must be
-// low-cardinality - thus all possible values are defined as an enum here. If a
-// Session is not configured with a Processor, the default (ProcessorUnknown)
-// will be used.
-type Processor string
-
-const (
- ProcessorUnknown Processor = ""
- ProcessorShepherdInitializer Processor = "shepherd-initializer"
- ProcessorShepherdProvisioner Processor = "shepherd-provisioner"
- ProcessorShepherdRecoverer Processor = "shepherd-recoverer"
- ProcessorShepherdUpdater Processor = "shepherd-updater"
- ProcessorBMSRV Processor = "bmsrv"
- ProcessorScruffyStats Processor = "scruffy-stats"
-)
-
-// String returns the Prometheus label value for use with the 'processor' label
-// key.
-func (p Processor) String() string {
- switch p {
- case ProcessorUnknown:
- return "unknown"
- default:
- return string(p)
- }
-}
-
-// MetricsSet contains all the Prometheus metrics objects related to a BMDB
-// client.
-//
-// The MetricsSet object is goroutine-safe.
-//
-// An empty MetricsSet object is not valid, and should be instead constructed
-// using New.
-//
-// A nil MetricsSet object is valid and represents a no-op metrics recorder
-// that's never collected.
-type MetricsSet struct {
- sessionStarted *prometheus.CounterVec
- transactionExecuted *prometheus.CounterVec
- transactionRetried *prometheus.CounterVec
- transactionFailed *prometheus.CounterVec
- workStarted *prometheus.CounterVec
- workFinished *prometheus.CounterVec
-}
-
-func processorCounter(name, help string, labels ...string) *prometheus.CounterVec {
- labels = append([]string{"processor"}, labels...)
- return prometheus.NewCounterVec(
- prometheus.CounterOpts{
- Name: name,
- Help: help,
- },
- labels,
- )
-}
-
-// New creates a new BMDB MetricsSet object which can be then attached to a BMDB
-// object by calling BMDB.EnableMetrics on the MetricsSet object.
-//
-// The given registry must be a valid Prometheus registry, and all metrics
-// contained in this MetricsSet object will be registered into it.
-//
-// The MetricsSet object can be shared between multiple BMDB object.
-//
-// The MetricsSet object is goroutine-safe.
-func New(registry *prometheus.Registry) *MetricsSet {
- m := &MetricsSet{
- sessionStarted: processorCounter("bmdb_session_started", "How many sessions this worker started"),
- transactionExecuted: processorCounter("bmdb_transaction_executed", "How many transactions were performed by this worker"),
- transactionRetried: processorCounter("bmdb_transaction_retried", "How many transaction retries were performed by this worker"),
- transactionFailed: processorCounter("bmdb_transaction_failed", "How many transactions failed permanently on this worker"),
- workStarted: processorCounter("bmdb_work_started", "How many work items were performed by this worker, partitioned by process", "process"),
- workFinished: processorCounter("bmdb_work_finished", "How many work items were finished by this worker, partitioned by process and result", "process", "result"),
- }
- registry.MustRegister(
- m.sessionStarted,
- m.transactionExecuted,
- m.transactionRetried,
- m.transactionFailed,
- m.workStarted,
- m.workFinished,
- )
- return m
-}
-
-// ProcessorRecorder wraps a MetricsSet object with the context of some
-// Processor. It exposes methods that record specific events into the managed
-// Metrics.
-//
-// The ProcessorRecorder object is goroutine safe.
-//
-// An empty ProcessorRecorder object is not valid, and should be instead
-// constructed using Metrics.Recorder.
-//
-// A nil ProcessorRecorder object is valid and represents a no-op metrics
-// recorder.
-type ProcessorRecorder struct {
- m *MetricsSet
- labels prometheus.Labels
-}
-
-// Recorder builds a ProcessorRecorder for the given Metrics and a given
-// Processor.
-func (m *MetricsSet) Recorder(p Processor) *ProcessorRecorder {
- if m == nil {
- return nil
- }
- return &ProcessorRecorder{
- m: m,
- labels: prometheus.Labels{
- "processor": p.String(),
- },
- }
-}
-
-// OnTransactionStarted should be called any time a BMDB client starts or
-// re-starts a BMDB Transaction. The attempt should either be '1' (for the first
-// attempt) or a number larger than 1 for any subsequent attempt (i.e. retry) of
-// a transaction.
-func (r *ProcessorRecorder) OnTransactionStarted(attempt int64) {
- if r == nil {
- return
- }
- if attempt == 1 {
- r.m.transactionExecuted.With(r.labels).Inc()
- } else {
- r.m.transactionRetried.With(r.labels).Inc()
- }
-}
-
-// OnTransactionFailed should be called any time a BMDB client fails a
-// BMDB Transaction permanently.
-func (r *ProcessorRecorder) OnTransactionFailed() {
- if r == nil {
- return
- }
- r.m.transactionFailed.With(r.labels).Inc()
-}
-
-// OnSessionStarted should be called any time a BMDB client opens a new BMDB
-// Session.
-func (r *ProcessorRecorder) OnSessionStarted() {
- if r == nil {
- return
- }
- r.m.sessionStarted.With(r.labels).Inc()
-}
-
-// ProcessRecorder wraps a ProcessorRecorder with an additional model.Process.
-// The resulting object can then record work-specific events.
-//
-// The PusherWithProcess object is goroutine-safe.
-type ProcessRecorder struct {
- *ProcessorRecorder
- labels prometheus.Labels
-}
-
-// WithProcess wraps a given Pusher with a Process.
-//
-// The resulting PusherWithProcess object is goroutine-safe.
-func (r *ProcessorRecorder) WithProcess(process model.Process) *ProcessRecorder {
- if r == nil {
- return nil
- }
- return &ProcessRecorder{
- ProcessorRecorder: r,
- labels: prometheus.Labels{
- "processor": r.labels["processor"],
- "process": string(process),
- },
- }
-}
-
-// OnWorkStarted should be called any time a BMDB client starts a new Work item.
-func (r *ProcessRecorder) OnWorkStarted() {
- if r == nil {
- return
- }
- r.m.workStarted.With(r.labels).Inc()
-}
-
-type WorkResult string
-
-const (
- WorkResultFinished WorkResult = "finished"
- WorkResultCanceled WorkResult = "canceled"
- WorkResultFailed WorkResult = "failed"
-)
-
-// OnWorkFinished should be called any time a BMDB client finishes, cancels or
-// fails a Work item.
-func (r *ProcessRecorder) OnWorkFinished(result WorkResult) {
- if r == nil {
- return
- }
- r.m.workFinished.MustCurryWith(r.labels).With(prometheus.Labels{"result": string(result)}).Inc()
-}
diff --git a/cloud/bmaas/bmdb/migrations_test.go b/cloud/bmaas/bmdb/migrations_test.go
deleted file mode 100644
index fe5b75c..0000000
--- a/cloud/bmaas/bmdb/migrations_test.go
+++ /dev/null
@@ -1,150 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package bmdb
-
-import (
- "context"
- "testing"
-
- "source.monogon.dev/cloud/bmaas/bmdb/model"
-)
-
-// TestMigrateUpDown performs a full-up and full-down migration test on an
-// in-memory database twice.
-//
-// Doing this the first time allows us to check the up migrations are valid and
-// that the down migrations clean up enough after themselves for earlier down
-// migrations to success.
-//
-// Doing this the second time allows us to make sure the down migrations cleaned
-// up enough after themselves that they have left no table/type behind.
-func TestMigrateUpDown(t *testing.T) {
- // Start with an empty database.
- b := dut()
- _, err := b.Open(false)
- if err != nil {
- t.Fatalf("Starting empty database failed: %v", err)
- }
-
- // Migrations go up.
- if err := b.Database.MigrateUp(); err != nil {
- t.Fatalf("Initial up migration failed: %v", err)
- }
- // Migrations go down.
- if err := b.Database.MigrateDownDangerDanger(); err != nil {
- t.Fatalf("Initial down migration failed: %v", err)
- }
- // Migrations go up.
- if err := b.Database.MigrateUp(); err != nil {
- t.Fatalf("Second up migration failed: %v", err)
- }
- // Migrations go down.
- if err := b.Database.MigrateDownDangerDanger(); err != nil {
- t.Fatalf("Second down migration failed: %v", err)
- }
-}
-
-// TestMigrateTwice makes sure we don't hit https://review.monogon.dev/1502 again.
-func TestMigrateTwice(t *testing.T) {
- // Start with an empty database.
- b := dut()
- _, err := b.Open(false)
- if err != nil {
- t.Fatalf("Starting empty database failed: %v", err)
- }
-
- // Migrations go up.
- if err := b.Database.MigrateUp(); err != nil {
- t.Fatalf("Initial up migration failed: %v", err)
- }
- // Migrations go up again.
- if err := b.Database.MigrateUp(); err != nil {
- t.Fatalf("Initial up migration failed: %v", err)
- }
-}
-
-func TestMigration1681826233(t *testing.T) {
- // This migration adds a new nullable field to backoffs.
-
- // This guarantees that versions [prev, ver] can run concurrently in a cluster.
- minVer := uint(1672749980)
- maxVer := uint(1681826233)
-
- ctx, ctxC := context.WithCancel(context.Background())
- defer t.Cleanup(ctxC)
-
- b := dut()
- conn, err := b.Open(false)
- if err != nil {
- t.Fatalf("Starting empty database failed: %v", err)
- }
-
- // First, make sure the change can actually progress if we have some backoffs
- // already.
- if err := b.Database.MigrateUpToIncluding(minVer); err != nil {
- t.Fatalf("Migration to minimum version failed: %v", err)
- }
-
- // Create machine and old-style backoff.
- q := model.New(conn.db)
- machine, err := q.NewMachine(ctx)
- if err != nil {
- t.Fatalf("Could not create machine: %v", err)
- }
- _, err = conn.db.Exec(`
- INSERT INTO work_backoff
- (machine_id, process, until, cause)
- VALUES
- ($1, 'UnitTest1', now(), 'test');
- `, machine.MachineID)
- if err != nil {
- t.Fatalf("Could not create old-style backoff on old version: %v", err)
- }
-
- // Migrate to newer version.
- if err := b.Database.MigrateUpToIncluding(maxVer); err != nil {
- t.Fatalf("Migration to maximum version failed: %v", err)
- }
-
- // The migration should be read succesfully.
- boffs, err := q.WorkBackoffOf(ctx, model.WorkBackoffOfParams{
- MachineID: machine.MachineID,
- Process: "UnitTest1",
- })
- if err != nil {
- t.Fatalf("Reading backoff failed: %v", err)
- }
- if len(boffs) != 1 {
- t.Errorf("No backoff found")
- } else {
- boff := boffs[0]
- if boff.LastIntervalSeconds.Valid {
- t.Errorf("Expected interval to be NULL")
- }
- }
-
- // Simultaneously, any concurrently running bmdb user on an older version should
- // still be able to insert and read backoffs old style.
- _, err = conn.db.Exec(`
- INSERT INTO work_backoff
- (machine_id, process, until, cause)
- VALUES
- ($1, 'UnitTest2', now(), 'test');
- `, machine.MachineID)
- if err != nil {
- t.Fatalf("Could not create old-style backoff on new version: %v", err)
- }
- rows, err := conn.db.Query(`
- SELECT machine_id, process, until, cause FROM work_backoff
- `)
- if err != nil {
- t.Fatalf("Could not fetch old-style backoff data: %v", err)
- }
- for rows.Next() {
- var mid, process, until, cause string
- if err := rows.Scan(&mid, &process, &until, &cause); err != nil {
- t.Errorf("Scan failed: %v", err)
- }
- }
-}
diff --git a/cloud/bmaas/bmdb/model/BUILD.bazel b/cloud/bmaas/bmdb/model/BUILD.bazel
deleted file mode 100644
index 04cac8c..0000000
--- a/cloud/bmaas/bmdb/model/BUILD.bazel
+++ /dev/null
@@ -1,34 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
-load("//build/sqlc:sqlc.bzl", "sqlc_go_library")
-
-sqlc_go_library(
- name = "sqlc_model",
- dialect = "cockroachdb",
- importpath = "source.monogon.dev/cloud/bmaas/bmdb/model",
- migrations = glob(["migrations/*.sql"]),
- queries = [
- "queries_base.sql",
- "queries_stats.sql",
- "queries_tags.sql",
- "queries_workflows.sql",
- ],
-)
-
-go_library(
- name = "model",
- srcs = [
- "interfaces.go",
- "migrations.go",
- ],
- embed = [
- ":sqlc_model", # keep
- ],
- embedsrcs = glob(["migrations/*.sql"]), # keep
- importpath = "source.monogon.dev/cloud/bmaas/bmdb/model",
- visibility = ["//visibility:public"],
- deps = [
- "@com_github_golang_migrate_migrate_v4//source",
- "@com_github_golang_migrate_migrate_v4//source/iofs",
- "@com_github_google_uuid//:uuid", # keep
- ],
-)
diff --git a/cloud/bmaas/bmdb/model/interfaces.go b/cloud/bmaas/bmdb/model/interfaces.go
deleted file mode 100644
index 1d559d6..0000000
--- a/cloud/bmaas/bmdb/model/interfaces.go
+++ /dev/null
@@ -1,76 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package model
-
-import "context"
-
-// MetricValue is a prometheus-style labeled numerical metric value. In other
-// words, it's a number accompanied by string key/value pairs.
-type MetricValue struct {
- Count int64
- Labels map[string]string
-}
-
-// WrapSimpleMetric turns a SQL model function which returns a single number into
-// a function which returns a one-element MetricValue list with no labels.
-func WrapSimpleMetric(fn func(*Queries, context.Context) (int64, error)) func(*Queries, context.Context) ([]MetricValue, error) {
- return func(q *Queries, ctx context.Context) ([]MetricValue, error) {
- v, err := fn(q, ctx)
- if err != nil {
- return nil, err
- }
- return []MetricValue{
- {
- Count: v,
- Labels: nil,
- },
- }, nil
- }
-}
-
-// A SQLMetricRow is a row that is the result of some kind of SQL 'metric query'.
-// For each such query we define in our *.sql files, a corresponding
-// implementation exists here.
-type SQLMetricRow interface {
- Value() MetricValue
-}
-
-// Value implements SQLMetricRow for a row of the result of the
-// CountActiveBackoffs SQL metric query.
-func (c CountActiveBackoffsRow) Value() MetricValue {
- return MetricValue{
- Count: c.Count,
- Labels: map[string]string{
- "process": string(c.Process),
- },
- }
-}
-
-// Value implements SQLMetricRow for a row of the result of the
-// CountActiveWork SQL metric query.
-func (c CountActiveWorkRow) Value() MetricValue {
- return MetricValue{
- Count: c.Count,
- Labels: map[string]string{
- "process": string(c.Process),
- },
- }
-}
-
-// WrapLabeledMetric turns a SQL model function which returns a list of rows
-// implementing SQLMetricRow into a function which returns a list of MetricValues
-// with labels corresponding to the data returned in the rows.
-func WrapLabeledMetric[M SQLMetricRow](fn func(*Queries, context.Context) ([]M, error)) func(*Queries, context.Context) ([]MetricValue, error) {
- return func(q *Queries, ctx context.Context) ([]MetricValue, error) {
- v, err := fn(q, ctx)
- if err != nil {
- return nil, err
- }
- res := make([]MetricValue, len(v))
- for i, vv := range v {
- res[i] = vv.Value()
- }
- return res, nil
- }
-}
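
The wrappers above normalize plain counters and labeled counters into one function shape. For reference, a minimal Go sketch of how they could have been consumed; the collectMetrics helper and metric names are hypothetical, and the sqlc-generated CountMachines/CountActiveBackoffs method signatures are assumed:

    package main

    import (
        "context"
        "fmt"

        "source.monogon.dev/cloud/bmaas/bmdb/model"
    )

    // collectMetrics shows how simple and labeled SQL metric queries end up
    // behind the same function type once wrapped.
    func collectMetrics(ctx context.Context, q *model.Queries) error {
        collectors := map[string]func(*model.Queries, context.Context) ([]model.MetricValue, error){
            // Method expressions already have the func(*Queries, context.Context) (...) shape.
            "bmdb_machines_total":        model.WrapSimpleMetric((*model.Queries).CountMachines),
            "bmdb_active_backoffs_total": model.WrapLabeledMetric((*model.Queries).CountActiveBackoffs),
        }
        for name, collect := range collectors {
            values, err := collect(q, ctx)
            if err != nil {
                return fmt.Errorf("%s: %w", name, err)
            }
            for _, v := range values {
                fmt.Printf("%s %v %d\n", name, v.Labels, v.Count)
            }
        }
        return nil
    }
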
diff --git a/cloud/bmaas/bmdb/model/migrations.go b/cloud/bmaas/bmdb/model/migrations.go
deleted file mode 100644
index 95b5dcf..0000000
--- a/cloud/bmaas/bmdb/model/migrations.go
+++ /dev/null
@@ -1,18 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package model
-
-import (
- "embed"
-
- "github.com/golang-migrate/migrate/v4/source"
- "github.com/golang-migrate/migrate/v4/source/iofs"
-)
-
-//go:embed migrations/*.sql
-var migrationData embed.FS
-
-func MigrationsSource() (source.Driver, error) {
- return iofs.New(migrationData, "migrations")
-}
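
A short sketch of how the embedded migration source above is typically plugged into golang-migrate; the migrateUp helper and the handling of the CockroachDB connection URL are illustrative:

    package main

    import (
        "github.com/golang-migrate/migrate/v4"
        _ "github.com/golang-migrate/migrate/v4/database/cockroachdb"

        "source.monogon.dev/cloud/bmaas/bmdb/model"
    )

    // migrateUp applies all pending migrations from the embedded FS against the
    // database pointed to by databaseURL (e.g. a cockroachdb:// URL).
    func migrateUp(databaseURL string) error {
        src, err := model.MigrationsSource()
        if err != nil {
            return err
        }
        m, err := migrate.NewWithSourceInstance("iofs", src, databaseURL)
        if err != nil {
            return err
        }
        if err := m.Up(); err != nil && err != migrate.ErrNoChange {
            return err
        }
        return nil
    }
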
diff --git a/cloud/bmaas/bmdb/model/migrations/1662136250_initial.down.sql b/cloud/bmaas/bmdb/model/migrations/1662136250_initial.down.sql
deleted file mode 100644
index 5c0ebe9..0000000
--- a/cloud/bmaas/bmdb/model/migrations/1662136250_initial.down.sql
+++ /dev/null
@@ -1,4 +0,0 @@
-DROP TABLE work;
-DROP TABLE sessions;
-DROP TABLE machines;
-DROP TYPE process;
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/model/migrations/1662136250_initial.up.sql b/cloud/bmaas/bmdb/model/migrations/1662136250_initial.up.sql
deleted file mode 100644
index 7b5b812..0000000
--- a/cloud/bmaas/bmdb/model/migrations/1662136250_initial.up.sql
+++ /dev/null
@@ -1,52 +0,0 @@
-CREATE TABLE machines (
- machine_id UUID NOT NULL DEFAULT gen_random_uuid() PRIMARY KEY,
- machine_created_at TIMESTAMPTZ NOT NULL
-);
-
-
--- Sessions are maintained by components as they work on the rest of the machine
--- database. Once a session is created, it must be maintained by its owning
--- component by repeatedly 'poking' it, ie. updating the session_deadline
--- value to be some point in the future.
---
--- TODO: garbage collect old sessions.
-CREATE TABLE sessions (
- session_id UUID NOT NULL DEFAULT gen_random_uuid() PRIMARY KEY,
- -- Name of component which created this session. Human-readable.
- session_component_name STRING NOT NULL,
- -- Node name, hostname:port, pod name, whatever. Something to tell where
- -- a particular component is running. Human-readable.
- session_runtime_info STRING NOT NULL,
- -- Time at which this session was created.
- session_created_at TIMESTAMPTZ NOT NULL,
- -- Number of seconds by which session_deadline (counting from now())
- -- is bumped up every time the session is poked.
- session_interval_seconds INT NOT NULL,
- -- Deadline after which this session should not be considered valid anymore.
- session_deadline TIMESTAMPTZ NOT NULL
-);
-
-CREATE TYPE process AS ENUM (
- -- Reserved for unit tests.
- 'UnitTest1',
- 'UnitTest2'
-);
-
--- Work items map a session to work performed on a machine. Multiple work items
--- can exist per session, and thus, a session can back multiple items of work
--- acting on multiple machines. These are optionally created by components to
--- indicate some long-running process being performed on a machine, and will
--- lock out the same process from being run simultaneously, eg. in a
--- concurrently running instance of the same component.
-CREATE TABLE work (
- -- Machine that this work is being performed on. Prevent deleting machines
- -- that have active work on them.
- machine_id UUID NOT NULL REFERENCES machines(machine_id) ON DELETE RESTRICT,
- -- Session that this work item is tied to. If the session expires, so does
- -- the work item.
- session_id UUID NOT NULL REFERENCES sessions(session_id) ON DELETE CASCADE,
- -- Human-readable process name.
- process process NOT NULL,
- UNIQUE (machine_id, process),
- CONSTRAINT "primary" PRIMARY KEY (machine_id, session_id, process)
-);
\ No newline at end of file
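
The sessions table above is effectively a lease: the owning component must keep pushing session_deadline into the future, or the session and any work tied to it expire. A rough Go sketch of that keep-alive loop, mirroring the SessionPoke query defined later in queries_base.sql; the pokeSession helper and its parameters are illustrative, not the deleted bmdb implementation:

    package main

    import (
        "context"
        "database/sql"
        "time"
    )

    // pokeSession bumps session_deadline roughly twice per interval so the
    // session never lapses while its owner is healthy. Exiting this loop (or
    // failing to poke) lets the deadline expire, which invalidates the session.
    func pokeSession(ctx context.Context, db *sql.DB, sessionID string, interval time.Duration) error {
        t := time.NewTicker(interval / 2)
        defer t.Stop()
        for {
            select {
            case <-ctx.Done():
                return ctx.Err()
            case <-t.C:
                _, err := db.ExecContext(ctx, `
                    UPDATE sessions
                    SET session_deadline = now() + session_interval_seconds * interval '1 second'
                    WHERE session_id = $1 AND session_deadline > now()`, sessionID)
                if err != nil {
                    return err
                }
            }
        }
    }
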
diff --git a/cloud/bmaas/bmdb/model/migrations/1667232160_agent_tags.down.sql b/cloud/bmaas/bmdb/model/migrations/1667232160_agent_tags.down.sql
deleted file mode 100644
index 8630143..0000000
--- a/cloud/bmaas/bmdb/model/migrations/1667232160_agent_tags.down.sql
+++ /dev/null
@@ -1,5 +0,0 @@
-DROP TABLE machine_agent_heartbeat;
-DROP TABLE machine_agent_started;
-DROP TABLE machine_provided;
-DROP TABLE machine_hardware_report;
-DROP type provider;
diff --git a/cloud/bmaas/bmdb/model/migrations/1667232160_agent_tags.up.sql b/cloud/bmaas/bmdb/model/migrations/1667232160_agent_tags.up.sql
deleted file mode 100644
index 341e606..0000000
--- a/cloud/bmaas/bmdb/model/migrations/1667232160_agent_tags.up.sql
+++ /dev/null
@@ -1,59 +0,0 @@
-CREATE TYPE provider AS ENUM (
- 'Equinix'
- -- More providers will follow in subsequent migrations.
-);
-
--- tag MachineProvided {
--- Provider Provider
--- ProviderID String
--- }
--- Represents the fact that a machine is backed by a machine from a given
--- provider identified there with a given provider id.
-CREATE TABLE machine_provided (
- machine_id UUID NOT NULL REFERENCES machines(machine_id) ON DELETE RESTRICT,
- provider provider NOT NULL,
- provider_id STRING(128) NOT NULL,
- CONSTRAINT "primary" PRIMARY KEY (machine_id),
- UNIQUE (provider, provider_id)
-);
-
--- tag AgentStarted {
--- StartedAt time.Time
--- PublicKey []byte
--- }
--- Represents the fact that a machine has had the Agent started on it at some
--- given time, and that the agent returned a given public key which it will use
--- to authenticate itself to the bmdb API server.
-CREATE TABLE machine_agent_started (
- machine_id UUID NOT NULL REFERENCES machines(machine_id) ON DELETE RESTRICT,
- agent_started_at TIMESTAMPTZ NOT NULL,
- agent_public_key BYTES NOT NULL,
- CONSTRAINT "primary" PRIMARY KEY(machine_id)
-);
-
--- tag AgentHeartbeat {
--- At time.Time
--- }
--- Represents a successful heartbeat sent by the Agent running on a machine at
--- some given time.
-CREATE TABLE machine_agent_heartbeat (
- machine_id UUID NOT NULL REFERENCES machines(machine_id) ON DELETE RESTRICT,
- agent_heartbeat_at TIMESTAMPTZ NOT NULL,
- CONSTRAINT "primary" PRIMARY KEY(machine_id)
-);
-
--- tag HardwareReport {
--- Raw []byte
--- }
--- Represents a hardware report submitted by an Agent running on a machine.
--- Usually a report is submitted only once after an agent has been started.
-CREATE TABLE machine_hardware_report (
- machine_id UUID NOT NULL REFERENCES machines(machine_id) ON DELETE RESTRICT,
- -- Serialized proto of type cloud.bmaas.server.api.AgentHardwareReport.
- hardware_report_raw BYTES NOT NULL,
- CONSTRAINT "primary" PRIMARY KEY(machine_id)
-);
-
--- Used by the Shepherd when performing mutations against the underlying machine
--- (eg. SSH access, restarts, ...).
-ALTER TYPE process ADD VALUE IF NOT EXISTS 'ShepherdAccess';
diff --git a/cloud/bmaas/bmdb/model/migrations/1672743627_installation_tags.down.sql b/cloud/bmaas/bmdb/model/migrations/1672743627_installation_tags.down.sql
deleted file mode 100644
index 0345be6..0000000
--- a/cloud/bmaas/bmdb/model/migrations/1672743627_installation_tags.down.sql
+++ /dev/null
@@ -1,2 +0,0 @@
-DROP TABLE machine_os_installation_report;
-DROP TABLE machine_os_installation_request;
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/model/migrations/1672743627_installation_tags.up.sql b/cloud/bmaas/bmdb/model/migrations/1672743627_installation_tags.up.sql
deleted file mode 100644
index 37f7b3a..0000000
--- a/cloud/bmaas/bmdb/model/migrations/1672743627_installation_tags.up.sql
+++ /dev/null
@@ -1,19 +0,0 @@
-CREATE TABLE machine_os_installation_request (
- machine_id UUID NOT NULL REFERENCES machines(machine_id) ON DELETE RESTRICT,
- -- Version of this request, for example monotonic epoch counter. Used to
- -- match successful installation (represented by OS installation report) to
- -- pending request, making sure that we don't perform spurious re-installs.
- generation INT NOT NULL,
- -- Serialized cloud.bmaas.server.api.OSInstallationRequest.
- os_installation_request_raw BYTEA NOT NULL,
- CONSTRAINT "primary" PRIMARY KEY (machine_id)
-);
-
-CREATE TABLE machine_os_installation_report (
- machine_id UUID NOT NULL REFERENCES machines(machine_id) ON DELETE RESTRICT,
- -- Matches generation in machine_os_installation_request. Not constrained on
- -- purpose, as a mismatch between generations implies an actionable
- -- installation request and is a valid state of the system.
- generation INT NOT NULL,
- CONSTRAINT "primary" PRIMARY KEY (machine_id)
-);
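
The two generation columns implement a small request/acknowledge protocol: an installation is still pending exactly when there is a request whose generation the report does not match. A sketch of that predicate; the types and names are hypothetical stand-ins for rows of the two tables above:

    package main

    // osInstallationRequest and osInstallationReport are illustrative stand-ins
    // for rows of machine_os_installation_request / _report.
    type osInstallationRequest struct{ Generation int64 }
    type osInstallationReport struct{ Generation int64 }

    // installationPending mirrors the "generation IS DISTINCT FROM" checks used
    // by the workflow and stats queries: no report, or a report for an older
    // generation, means the request still needs to be acted on.
    func installationPending(req *osInstallationRequest, rep *osInstallationReport) bool {
        if req == nil {
            // Nothing was ever requested for this machine.
            return false
        }
        return rep == nil || rep.Generation != req.Generation
    }
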
diff --git a/cloud/bmaas/bmdb/model/migrations/1672749980_backoff.down.sql b/cloud/bmaas/bmdb/model/migrations/1672749980_backoff.down.sql
deleted file mode 100644
index 2aa3ced..0000000
--- a/cloud/bmaas/bmdb/model/migrations/1672749980_backoff.down.sql
+++ /dev/null
@@ -1,3 +0,0 @@
-DROP TABLE work_backoff;
-DROP TABLE work_history;
-DROP TYPE work_history_event;
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/model/migrations/1672749980_backoff.up.sql b/cloud/bmaas/bmdb/model/migrations/1672749980_backoff.up.sql
deleted file mode 100644
index 24f90e5..0000000
--- a/cloud/bmaas/bmdb/model/migrations/1672749980_backoff.up.sql
+++ /dev/null
@@ -1,53 +0,0 @@
-CREATE TYPE work_history_event AS ENUM (
- 'Started',
- 'Finished',
- 'Failed',
- 'Canceled'
-);
-
--- Audit trail of work history for a given machine.
-CREATE TABLE work_history(
- -- The machine subject to this audit entry. As we want to allow keeping
- -- information about deleted machines, this is not a foreign key.
- machine_id UUID NOT NULL,
-
- -- TODO(q3k): session history?
-
- -- Process acting on this machine which caused an audit entry to be created.
- process process NOT NULL,
- -- Process lifecycle event (started, finished, etc) that caused this audit
- -- entry to be created.
- event work_history_event NOT NULL,
- -- Time at which this entry was created.
- timestamp TIMESTAMPTZ NOT NULL,
-
- -- Failure cause, only set when event == Failed.
- failed_cause STRING
-);
-
-CREATE INDEX ON work_history (machine_id);
-
--- Backoff entries are created by failed work items, and effectively act as
--- a Lockout-tagout entry for a given machine and a given process.
---
--- Currently, there is no way to fully back off an entire machine, just
--- individual processes on a given machine.
---
--- Backoff entries are only enforced as long as 'until' lies in the future; once
--- it has passed, they are ignored by workflow queries. Insertion queries act as
--- upserts, and thus backoff entries do not need to be garbage collected, as they
--- do not grow unbounded (maximum one entry per process/machine).
-CREATE TABLE work_backoff(
- -- The machine affected by this backoff.
- machine_id UUID NOT NULL REFERENCES machines(machine_id) ON DELETE CASCADE,
- -- The process that this machine should not be subjected to.
- process process NOT NULL,
- -- Until when the backoff is enforced.
- until TIMESTAMPTZ NOT NULL,
-
- -- Error reported by process/work when this backoff was inserted.
- -- Human-readable.
- cause STRING NOT NULL,
-
- UNIQUE(machine_id, process)
-);
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/model/migrations/1681826233_exponential_backoff.down.sql b/cloud/bmaas/bmdb/model/migrations/1681826233_exponential_backoff.down.sql
deleted file mode 100644
index e0cea5f..0000000
--- a/cloud/bmaas/bmdb/model/migrations/1681826233_exponential_backoff.down.sql
+++ /dev/null
@@ -1,2 +0,0 @@
-ALTER TABLE work_backoff
-DROP COLUMN last_interval_seconds;
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/model/migrations/1681826233_exponential_backoff.up.sql b/cloud/bmaas/bmdb/model/migrations/1681826233_exponential_backoff.up.sql
deleted file mode 100644
index 94fd8a5..0000000
--- a/cloud/bmaas/bmdb/model/migrations/1681826233_exponential_backoff.up.sql
+++ /dev/null
@@ -1,5 +0,0 @@
--- Add interval, in seconds. This is used to calculate subsequent backoff values
--- for exponential backoffs. A future migration should make this field
--- non-nullable.
-ALTER TABLE work_backoff
-ADD COLUMN last_interval_seconds BIGINT NULL;
\ No newline at end of file
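
last_interval_seconds is what turns the fixed backoff into an exponential one: each new failure roughly doubles the previous interval up to a cap, and a NULL value (as for backoffs created before this migration) falls back to a starting interval. A minimal sketch of that computation; the constants and the nextBackoff helper are illustrative, the actual policy lived in the deleted bmdb package:

    package main

    import "time"

    // nextBackoff doubles the previously stored interval, clamped to a maximum,
    // and uses a starting interval when no previous value exists (NULL column).
    func nextBackoff(lastSeconds int64, valid bool) time.Duration {
        const (
            initial = 1 * time.Minute
            ceiling = 4 * time.Hour
        )
        if !valid || lastSeconds <= 0 {
            return initial
        }
        next := 2 * time.Duration(lastSeconds) * time.Second
        if next > ceiling {
            next = ceiling
        }
        return next
    }
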
diff --git a/cloud/bmaas/bmdb/model/migrations/1681909788_extra_provider_data.down.sql b/cloud/bmaas/bmdb/model/migrations/1681909788_extra_provider_data.down.sql
deleted file mode 100644
index 986637c..0000000
--- a/cloud/bmaas/bmdb/model/migrations/1681909788_extra_provider_data.down.sql
+++ /dev/null
@@ -1,6 +0,0 @@
-ALTER TABLE machine_provided
-DROP COLUMN provider_reservation_id,
-DROP COLUMN provider_ip_address,
-DROP COLUMN provider_location,
-DROP COLUMN provider_status;
-DROP type provider_status;
diff --git a/cloud/bmaas/bmdb/model/migrations/1681909788_extra_provider_data.up.sql b/cloud/bmaas/bmdb/model/migrations/1681909788_extra_provider_data.up.sql
deleted file mode 100644
index c297e69..0000000
--- a/cloud/bmaas/bmdb/model/migrations/1681909788_extra_provider_data.up.sql
+++ /dev/null
@@ -1,44 +0,0 @@
-CREATE TYPE provider_status AS ENUM (
- -- The provider has no idea about this machine. This likely means something
- -- went really wrong and should be investigated.
- 'Missing',
- -- The provider is attempting to install the initial operating system and
- -- give us control over the machine.
- 'Provisioning',
- -- The provider has failed to provide this machine and is not expected to
- -- be able to provision it without deprovisioning it first.
- 'ProvisioningFailedPermanent',
-
- -- The provider sees this machine as running/healthy and ready for use by us.
- 'Running',
-
- -- The provider sees that machine as administratively stopped/down, but not
- -- failed. It can be enabled / turned back on and should become Running.
- 'Stopped',
-
- -- Any other state that we're not able to parse. Likely should be
- -- investigated.
- 'Unknown'
-);
-
-ALTER TABLE machine_provided
--- Optional hardware reservation ID for this provider. Currently only implemented
--- for Equinix.
-ADD COLUMN provider_reservation_id STRING(128) NULL,
--- Optional 'main' IP address as seen by provider. 'Main' is provider specific,
--- but generally should be the IP address that system operators would consider
--- the primary IP address of the machine, generally the one that operators would
--- SSH into. It might be a publicly routable address or might not be. It might
--- be a single IP address or a CIDR. Regardless, it's some human-readable
--- representation of the address, and generally should not be machine-parsed.
---
--- On Equinix, we pick the first IP address marked as 'public'.
-ADD COLUMN provider_ip_address STRING(128) NULL,
--- Optional location/region as seen by provider. This is provider-specific: it
--- might be a city name, some internal metro ID, a PoP name, a slug, or even an
--- opaque string.
-ADD COLUMN provider_location STRING(128) NULL,
--- Optional status as seen by provider. This is converted from provider-specific
--- data into an internal type. This field is machine-usable and with time should
--- be moved to be non-null.
-ADD COLUMN provider_status provider_status NULL;
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/model/migrations/1681980576_extra_shepherd_processes.down.sql b/cloud/bmaas/bmdb/model/migrations/1681980576_extra_shepherd_processes.down.sql
deleted file mode 100644
index e01997f..0000000
--- a/cloud/bmaas/bmdb/model/migrations/1681980576_extra_shepherd_processes.down.sql
+++ /dev/null
@@ -1,2 +0,0 @@
--- Not removing added enum values, as the 'up' migration has 'if not exists',
--- and there is no harm in keeping unused enum values around.
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/model/migrations/1681980576_extra_shepherd_processes.up.sql b/cloud/bmaas/bmdb/model/migrations/1681980576_extra_shepherd_processes.up.sql
deleted file mode 100644
index 6e7c630..0000000
--- a/cloud/bmaas/bmdb/model/migrations/1681980576_extra_shepherd_processes.up.sql
+++ /dev/null
@@ -1,6 +0,0 @@
--- Add two more process kinds, ShepherdAgentStart and ShepherdRecovery, for agent
--- start and recovery by the shepherd respectively. These deprecate the previous
--- ShepherdAccess process. The two processes mutually exclude each other.
-
-ALTER TYPE process ADD VALUE IF NOT EXISTS 'ShepherdAgentStart';
-ALTER TYPE process ADD VALUE IF NOT EXISTS 'ShepherdRecovery';
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/model/migrations/1686656942_add_indexes.down.sql b/cloud/bmaas/bmdb/model/migrations/1686656942_add_indexes.down.sql
deleted file mode 100644
index 5828d1d..0000000
--- a/cloud/bmaas/bmdb/model/migrations/1686656942_add_indexes.down.sql
+++ /dev/null
@@ -1,3 +0,0 @@
-DROP INDEX work_backoff@process_machine_id_idx;
-DROP INDEX sessions@session_id_deadline_idx;
-DROP INDEX machine_agent_started@agent_public_key_idx;
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/model/migrations/1686656942_add_indexes.up.sql b/cloud/bmaas/bmdb/model/migrations/1686656942_add_indexes.up.sql
deleted file mode 100644
index a789e10..0000000
--- a/cloud/bmaas/bmdb/model/migrations/1686656942_add_indexes.up.sql
+++ /dev/null
@@ -1,14 +0,0 @@
--- Used by the agent gRPC server to retrieve agent information by public key.
-CREATE INDEX agent_public_key_idx
-ON machine_agent_started (agent_public_key)
-INCLUDE (agent_started_at);
-
--- Used by queries which require a live session.
-CREATE INDEX session_id_deadline_idx
-ON sessions (session_id, session_deadline)
-INCLUDE (session_component_name, session_runtime_info, session_Created_at, session_interval_seconds);
-
--- Used by work retrieval/scheduling queries to exclude machines that have a given process backed off.
-CREATE INDEX process_machine_id_idx
-ON work_backoff (process, machine_id)
-INCLUDE (until);
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/model/migrations/1687875953_add_installation_report.down.sql b/cloud/bmaas/bmdb/model/migrations/1687875953_add_installation_report.down.sql
deleted file mode 100644
index 6c2cb40..0000000
--- a/cloud/bmaas/bmdb/model/migrations/1687875953_add_installation_report.down.sql
+++ /dev/null
@@ -1,5 +0,0 @@
-ALTER TABLE machine_os_installation_report
-DROP COLUMN os_installation_result,
-DROP COLUMN os_installation_report_raw;
-
-DROP TYPE machine_os_installation_result;
diff --git a/cloud/bmaas/bmdb/model/migrations/1687875953_add_installation_report.up.sql b/cloud/bmaas/bmdb/model/migrations/1687875953_add_installation_report.up.sql
deleted file mode 100644
index 4f25589..0000000
--- a/cloud/bmaas/bmdb/model/migrations/1687875953_add_installation_report.up.sql
+++ /dev/null
@@ -1,10 +0,0 @@
-CREATE TYPE machine_os_installation_result AS ENUM (
- 'Success',
- 'Error'
- );
-
--- Add a column for storing the serialized cloud.bmaas.server.api.OSInstallationReport,
--- and a column indicating whether the installation was successful.
-ALTER TABLE machine_os_installation_report
- ADD COLUMN os_installation_result machine_os_installation_result NOT NULL,
- ADD COLUMN os_installation_report_raw BYTEA NOT NULL;
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/model/migrations/1698288577_add_providers.down.sql b/cloud/bmaas/bmdb/model/migrations/1698288577_add_providers.down.sql
deleted file mode 100644
index b0b22d0..0000000
--- a/cloud/bmaas/bmdb/model/migrations/1698288577_add_providers.down.sql
+++ /dev/null
@@ -1,2 +0,0 @@
-ALTER TYPE provider DROP VALUE 'Lumen';
-ALTER TYPE provider DROP VALUE 'None';
diff --git a/cloud/bmaas/bmdb/model/migrations/1698288577_add_providers.up.sql b/cloud/bmaas/bmdb/model/migrations/1698288577_add_providers.up.sql
deleted file mode 100644
index d309d7a..0000000
--- a/cloud/bmaas/bmdb/model/migrations/1698288577_add_providers.up.sql
+++ /dev/null
@@ -1,2 +0,0 @@
-ALTER TYPE provider ADD VALUE 'Lumen';
-ALTER TYPE provider ADD VALUE 'None';
diff --git a/cloud/bmaas/bmdb/model/queries_base.sql b/cloud/bmaas/bmdb/model/queries_base.sql
deleted file mode 100644
index e150fbb..0000000
--- a/cloud/bmaas/bmdb/model/queries_base.sql
+++ /dev/null
@@ -1,90 +0,0 @@
--- name: NewMachine :one
-INSERT INTO machines (
- machine_created_at
-) VALUES (
- now()
-)
-RETURNING *;
-
--- name: NewSession :one
-INSERT INTO sessions (
- session_component_name, session_runtime_info, session_created_at, session_interval_seconds, session_deadline
-) VALUES (
- $1, $2, now(), $3, (now() + $3 * interval '1 second')
-)
-RETURNING *;
-
--- name: SessionPoke :exec
--- Update a given session with a new deadline. Must be called in the same
--- transaction as SessionCheck to ensure the session is still alive.
-UPDATE sessions
-SET session_deadline = now() + session_interval_seconds * interval '1 second'
-WHERE session_id = $1;
-
--- name: SessionCheck :many
--- SessionCheck returns a session by ID if that session is still valid (ie. its
--- deadline hasn't expired).
-SELECT *
-FROM sessions
-WHERE session_id = $1
-AND session_deadline > now();
-
--- name: StartWork :exec
-INSERT INTO work (
- machine_id, session_id, process
-) VALUES (
- $1, $2, $3
-);
-
--- name: FinishWork :exec
-DELETE FROM work
-WHERE machine_id = $1
- AND session_id = $2
- AND process = $3;
-
-
--- name: WorkHistoryInsert :exec
--- Insert an entry into the work_history audit table.
-INSERT INTO work_history (
- machine_id, process, event, timestamp, failed_cause
-) VALUES (
- $1, $2, $3, now(), $4
-);
-
--- name: WorkBackoffInsert :exec
--- Upsert a backoff for a given machine/process.
-INSERT INTO work_backoff (
- machine_id, process, cause, until, last_interval_seconds
-) VALUES (
- $1, $2, $3,
- now() + (sqlc.arg(seconds)::int * interval '1 second'),
- sqlc.arg(seconds)::bigint
-) ON CONFLICT (machine_id, process) DO UPDATE SET
- cause = $3,
- until = now() + (sqlc.arg(seconds)::int * interval '1 second'),
- last_interval_seconds = sqlc.arg(seconds)::bigint
-;
-
--- name: WorkBackoffDelete :exec
-DELETE FROM work_backoff
-WHERE machine_id = $1
- AND process = $2;
-
--- name: WorkBackoffOf :many
-SELECT *
-FROM work_backoff
-WHERE machine_id = $1
- AND process = $2;
-
--- name: ListHistoryOf :many
--- Retrieve full audit history of a machine.
-SELECT *
-FROM work_history
-WHERE machine_id = $1
-ORDER BY timestamp ASC;
-
--- name: GetSession :many
--- Retrieve session information by session ID.
-SELECT *
-FROM sessions
-WHERE session_id = $1;
\ No newline at end of file
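
WorkHistoryInsert and WorkBackoffInsert above are designed to be used together when a work item fails: append an audit entry, then upsert a backoff whose length is passed in seconds. A sketch of that failure path using the same SQL directly over database/sql; the recordFailure helper is illustrative, the deleted code went through the sqlc-generated wrappers instead:

    package main

    import (
        "context"
        "database/sql"
    )

    // recordFailure writes a 'Failed' audit entry and (re)arms the backoff for
    // the given machine/process pair, mirroring WorkHistoryInsert and
    // WorkBackoffInsert above.
    func recordFailure(ctx context.Context, tx *sql.Tx, machineID, process, cause string, seconds int64) error {
        if _, err := tx.ExecContext(ctx, `
            INSERT INTO work_history (machine_id, process, event, timestamp, failed_cause)
            VALUES ($1, $2, 'Failed', now(), $3)`,
            machineID, process, cause); err != nil {
            return err
        }
        _, err := tx.ExecContext(ctx, `
            INSERT INTO work_backoff (machine_id, process, cause, until, last_interval_seconds)
            VALUES ($1, $2, $3, now() + ($4::INT * interval '1 second'), $4::BIGINT)
            ON CONFLICT (machine_id, process) DO UPDATE SET
                cause = $3,
                until = now() + ($4::INT * interval '1 second'),
                last_interval_seconds = $4::BIGINT`,
            machineID, process, cause, seconds)
        return err
    }
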
diff --git a/cloud/bmaas/bmdb/model/queries_stats.sql b/cloud/bmaas/bmdb/model/queries_stats.sql
deleted file mode 100644
index d868a6c..0000000
--- a/cloud/bmaas/bmdb/model/queries_stats.sql
+++ /dev/null
@@ -1,110 +0,0 @@
--- name: CountActiveWork :many
--- Return the number of active work items, grouped by process.
-SELECT COUNT(*), work.process
-FROM work
-GROUP BY (work.process);
-
--- name: CountActiveBackoffs :many
--- Return the number of active backoffs, grouped by process.
-SELECT COUNT(*), work_backoff.process
-FROM work_backoff
-GROUP BY (work_backoff.process);
-
--- name: CountMachines :one
-SELECT COUNT(*)
-FROM machines;
-
--- name: CountMachinesProvided :one
-SELECT COUNT(*)
-FROM machine_provided;
-
--- name: CountMachinesAgentHeartbeating :one
-SELECT COUNT(*)
-FROM machines
- INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
- INNER JOIN machine_agent_heartbeat ON machines.machine_id = machine_agent_heartbeat.machine_id
-WHERE
- now() < machine_agent_heartbeat.agent_heartbeat_at + interval '10 minute';
-
--- name: CountMachinesInstallationPending :one
-SELECT COUNT(*)
-FROM machines
- INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
- INNER JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
- LEFT JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id
-WHERE
- machine_os_installation_request.generation IS DISTINCT FROM machine_os_installation_report.generation;
-
--- name: CountMachinesInstallationComplete :one
-SELECT COUNT(*)
-FROM machines
- INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
- INNER JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
- INNER JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id
-WHERE
- machine_os_installation_request.generation IS NOT DISTINCT FROM machine_os_installation_report.generation;
-
--- name: CountMachinesForAgentStart :one
--- Return number of machines eligible for agent start.
--- ONCHANGE(queries_workflows.sql): constraints must be kept in sync with GetMachinesForAgentStart.
-SELECT COUNT(machine_provided)
-FROM machines
- INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
- LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process IN ('ShepherdAccess', 'ShepherdAgentStart', 'ShepherdRecovery')
- LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdAgentStart'
- LEFT JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
- LEFT JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
- LEFT JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id
-WHERE
- work.machine_id IS NULL
- AND work_backoff.machine_id IS NULL
- AND machine_agent_started.machine_id IS NULL
- -- If there is a pending installation request, it must not have been fulfilled already.
- AND (
- machine_os_installation_request.machine_id IS NULL
- OR machine_os_installation_report.generation IS DISTINCT FROM machine_os_installation_request.generation
- );
-
--- name: CountMachinesForAgentRecovery :one
--- Return number of machines eligible for agent recovery.
--- ONCHANGE(queries_workflows.sql): constraints must be kept in sync with GetMachineForAgentRecovery.
-SELECT COUNT(machine_provided)
-FROM machines
- INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
- LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process IN ('ShepherdAccess', 'ShepherdAgentStart', 'ShepherdRecovery')
- LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdRecovery'
- INNER JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
- LEFT JOIN machine_agent_heartbeat ON machines.machine_id = machine_agent_heartbeat.machine_id
- LEFT JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
- LEFT JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id
-WHERE
- work.machine_id IS NULL
- AND work_backoff.machine_id IS NULL
- -- Only act on machines where the agent is expected to have been started:
- -- 1. If there is a pending installation request, it must not have been fulfilled already.
- AND (
- machine_os_installation_request.machine_id IS NULL
- OR machine_os_installation_report.generation IS DISTINCT FROM machine_os_installation_request.generation
- )
- -- 2. The agent must have never heartbeat or must have stopped heartbeating.
- AND (
- -- No heartbeat 30 minutes after starting the agent.
- ( machine_agent_heartbeat.machine_id IS NULL
- AND now() > (machine_agent_started.agent_started_at + interval '30 minutes')
- )
- -- Heartbeats ceased for 10 minutes.
- OR ( machine_agent_heartbeat.machine_id IS NOT NULL
- AND now() > (machine_agent_heartbeat.agent_heartbeat_at + interval '10 minutes')
- )
- );
-
--- name: ListMachineHardware :many
-SELECT
- machine_provided.*,
- machine_hardware_report.*
-FROM machines
- INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
- INNER JOIN machine_hardware_report ON machines.machine_id = machine_hardware_report.machine_id
-WHERE machines.machine_id > $1
-ORDER BY machines.machine_id ASC
-LIMIT $2;
diff --git a/cloud/bmaas/bmdb/model/queries_tags.sql b/cloud/bmaas/bmdb/model/queries_tags.sql
deleted file mode 100644
index 58e2d96..0000000
--- a/cloud/bmaas/bmdb/model/queries_tags.sql
+++ /dev/null
@@ -1,74 +0,0 @@
--- name: MachineAddProvided :exec
-INSERT INTO machine_provided (
- machine_id, provider, provider_id
-) VALUES (
- $1, $2, $3
-);
-
--- name: MachineSetAgentStarted :exec
-INSERT INTO machine_agent_started (
- machine_id, agent_started_at, agent_public_key
-) VALUES (
- $1, $2, $3
-) ON CONFLICT (machine_id) DO UPDATE SET
- agent_started_at = $2,
- agent_public_key = $3
-;
-
--- name: MachineSetAgentHeartbeat :exec
-INSERT INTO machine_agent_heartbeat (
- machine_id, agent_heartbeat_at
-) VALUES (
- $1, $2
-) ON CONFLICT (machine_id) DO UPDATE SET
- agent_heartbeat_at = $2
-;
-
--- name: MachineSetHardwareReport :exec
-INSERT INTO machine_hardware_report (
- machine_id, hardware_report_raw
-) VALUES (
- $1, $2
-) ON CONFLICT (machine_id) DO UPDATE SET
- hardware_report_raw = $2
-;
-
--- name: MachineSetOSInstallationRequest :exec
-INSERT INTO machine_os_installation_request (
- machine_id, generation, os_installation_request_raw
-) VALUES (
- $1, $2, $3
-) ON CONFLICT (machine_id) DO UPDATE SET
- generation = $2,
- os_installation_request_raw = $3
-;
-
--- name: MachineSetOSInstallationReport :exec
-INSERT INTO machine_os_installation_report (
- machine_id, generation, os_installation_result, os_installation_report_raw
-) VALUES (
- $1, $2, $3, $4
-) ON CONFLICT (machine_id) DO UPDATE SET
- generation = $2,
- os_installation_result = $3,
- os_installation_report_raw = $4
-;
-
-
--- name: MachineDeleteAgentStarted :exec
-DELETE FROM machine_agent_started
-WHERE machine_id = $1;
-
--- name: MachineDeleteAgentHeartbeat :exec
-DELETE FROM machine_agent_heartbeat
-WHERE machine_id = $1;
-
--- name: MachineUpdateProviderStatus :exec
-UPDATE machine_provided
-SET
- provider_reservation_id = COALESCE($3, provider_reservation_id),
- provider_ip_address = COALESCE($4, provider_ip_address),
- provider_location = COALESCE($5, provider_location),
- provider_status = COALESCE($6, provider_status)
-WHERE provider_id = $1
-AND provider = $2;
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/model/queries_workflows.sql b/cloud/bmaas/bmdb/model/queries_workflows.sql
deleted file mode 100644
index 49622d8..0000000
--- a/cloud/bmaas/bmdb/model/queries_workflows.sql
+++ /dev/null
@@ -1,103 +0,0 @@
--- name: GetProvidedMachines :many
-SELECT
- machine_provided.*
-FROM machines
-INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
-WHERE machine_provided.provider = $1;
-
--- name: GetMachinesForAgentStart :many
--- Get machines that need agent started for the first time. Machine can be
--- assumed to be 'new', with no previous attempts or failures.
--- ONCHANGE(queries_stats.sql): constraints must be kept in sync with CountMachinesForAgentStart.
-SELECT
- machine_provided.*
-FROM machines
-INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
-LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process IN ('ShepherdAccess', 'ShepherdAgentStart', 'ShepherdRecovery')
-LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdAgentStart'
-LEFT JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
-LEFT JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
-LEFT JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id
-WHERE
- machine_agent_started.machine_id IS NULL
- -- Do not start on machines that have a fulfilled OS installation request.
- AND (
- machine_os_installation_request.machine_id IS NULL
- OR machine_os_installation_request.generation IS DISTINCT FROM machine_os_installation_report.generation
- )
- AND work.machine_id IS NULL
- AND work_backoff.machine_id IS NULL
- AND machine_provided.provider = $2
-LIMIT $1;
-
--- name: GetMachineForAgentRecovery :many
--- Get machines that need agent restarted after something went wrong. Either
--- the agent started but never responded, or the agent stopped responding at
--- some point, or the machine got rebooted or somehow else lost the agent. Assume
--- some work needs to be performed on the shepherd side to diagnose and recover
--- whatever state the machine truly is in.
--- ONCHANGE(queries_stats.sql): constraints must be kept in sync with CountMachinesForAgentRecovery.
-SELECT
- machine_provided.*
-FROM machines
-INNER JOIN machine_provided ON machines.machine_id = machine_provided.machine_id
-LEFT JOIN work ON machines.machine_id = work.machine_id AND work.process IN ('ShepherdAccess', 'ShepherdAgentStart', 'ShepherdRecovery')
-LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id AND work_backoff.until > now() AND work_backoff.process = 'ShepherdRecovery'
-INNER JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
-LEFT JOIN machine_agent_heartbeat ON machines.machine_id = machine_agent_heartbeat.machine_id
-LEFT JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
-LEFT JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id AND machine_os_installation_report.os_installation_result = 'Success'
-WHERE
- -- Do not recover machines that have a fulfilled OS installation request.
- (
- machine_os_installation_request.machine_id IS NULL
- OR machine_os_installation_request.generation IS DISTINCT FROM machine_os_installation_report.generation
- )
- AND (
- -- No heartbeat 30 minutes after starting the agent.
- (
- machine_agent_heartbeat.machine_id IS NULL
- AND now() > (machine_agent_started.agent_started_at + interval '30 minutes')
- )
- -- Heartbeats ceased for 10 minutes.
- OR (
- machine_agent_heartbeat.machine_id IS NOT NULL
- AND now() > (machine_agent_heartbeat.agent_heartbeat_at + interval '10 minutes')
- )
- )
- AND work.machine_id IS NULL
- AND work_backoff.machine_id IS NULL
- AND machine_provided.provider = $2
-LIMIT $1;
-
--- name: AuthenticateAgentConnection :many
--- Used by bmdb server to verify incoming connections.
-SELECT
- machine_agent_started.*
-FROM machines
-INNER JOIN machine_agent_started ON machines.machine_id = machine_agent_started.machine_id
-WHERE
- machines.machine_id = $1
- AND machine_agent_started.agent_public_key = $2;
-
--- name: GetExactMachineForOSInstallation :many
--- Get OS installation request for a given machine ID. Used by the bmdb server
--- to tell agent whether there's a pending installation request for the machine
--- it's running on.
-SELECT
- machine_os_installation_request.*
-FROM machines
-LEFT JOIN machine_os_installation_request ON machines.machine_id = machine_os_installation_request.machine_id
-LEFT JOIN machine_os_installation_report ON machines.machine_id = machine_os_installation_report.machine_id
-WHERE
- -- We are only interested in one concrete machine.
- machines.machine_id = $1
- -- We must have an installation request.
- AND machine_os_installation_request.machine_id IS NOT NULL
- -- And we either must have no installation report, or the installation
- -- report's generation must not match the installation request's generation.
- AND (
- machine_os_installation_report.machine_id IS NULL
- OR machine_os_installation_report.generation != machine_os_installation_request.generation
- )
-LIMIT $2;
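
GetMachinesForAgentStart above is the driving query of the shepherd's start loop: fetch a small batch of eligible machines for one provider and try to start the agent on each. A rough sketch of such a consumer; the agentStartLoop shape and startAgent callback are assumptions, and the sqlc-generated MachineProvided row type and parameter struct are inferred from the test code below:

    package main

    import (
        "context"
        "time"

        "source.monogon.dev/cloud/bmaas/bmdb/model"
    )

    // agentStartLoop periodically asks for machines eligible for a first agent
    // start and hands each candidate to startAgent. Real code would also open a
    // work item per machine and convert failures into work_backoff entries.
    func agentStartLoop(ctx context.Context, q *model.Queries, startAgent func(context.Context, model.MachineProvided) error) error {
        for {
            candidates, err := q.GetMachinesForAgentStart(ctx, model.GetMachinesForAgentStartParams{
                Limit:    10,
                Provider: model.ProviderEquinix,
            })
            if err != nil {
                return err
            }
            for _, c := range candidates {
                if err := startAgent(ctx, c); err != nil {
                    // Skipped here: record the failure and back the machine off.
                    continue
                }
            }
            select {
            case <-ctx.Done():
                return ctx.Err()
            case <-time.After(time.Minute):
            }
        }
    }
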
diff --git a/cloud/bmaas/bmdb/queries_test.go b/cloud/bmaas/bmdb/queries_test.go
deleted file mode 100644
index 917033f..0000000
--- a/cloud/bmaas/bmdb/queries_test.go
+++ /dev/null
@@ -1,279 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package bmdb
-
-import (
- "context"
- "fmt"
- "testing"
- "time"
-
- "github.com/google/uuid"
-
- "source.monogon.dev/cloud/bmaas/bmdb/model"
-)
-
-// TestAgentStart exercises GetMachinesForAgentStart.
-func TestAgentStart(t *testing.T) {
- b := dut()
- conn, err := b.Open(true)
- if err != nil {
- t.Fatalf("Open failed: %v", err)
- }
-
- ctx, ctxC := context.WithCancel(context.Background())
- defer ctxC()
-
- session, err := conn.StartSession(ctx)
- if err != nil {
- t.Fatalf("Starting session failed: %v", err)
- }
-
- // Create a test machine.
- var machine model.Machine
- err = session.Transact(ctx, func(q *model.Queries) error {
- machine, err = q.NewMachine(ctx)
- return err
- })
- if err != nil {
- t.Fatalf("Creating machine failed: %v", err)
- }
-
- // It should be, by default, not a candidate for agent start as it's not yet
- // provided by any provider.
- expectCandidates := func(want int) {
- t.Helper()
- if err := session.Transact(ctx, func(q *model.Queries) error {
- candidates, err := q.GetMachinesForAgentStart(ctx, model.GetMachinesForAgentStartParams{
- Limit: 1,
- Provider: model.ProviderEquinix,
- })
- if err != nil {
- t.Fatalf("Could not retrieve machines for agent start: %v", err)
- }
- if got := len(candidates); want != got {
- t.Fatalf("Wanted %d machines for agent start, got %+v", want, candidates)
- }
- return nil
- }); err != nil {
- t.Fatal(err)
- }
- }
-
- // Provide machine, and check it is now a candidate.
- if err := session.Transact(ctx, func(q *model.Queries) error {
- return q.MachineAddProvided(ctx, model.MachineAddProvidedParams{
- MachineID: machine.MachineID,
- Provider: model.ProviderEquinix,
- ProviderID: "123",
- })
- }); err != nil {
- t.Fatalf("could not add provided tag to machine: %v", err)
- }
- expectCandidates(1)
-
- // Add a start tag. Machine shouldn't be a candidate anymore.
- if err := session.Transact(ctx, func(q *model.Queries) error {
- return q.MachineSetAgentStarted(ctx, model.MachineSetAgentStartedParams{
- MachineID: machine.MachineID,
- AgentStartedAt: time.Now(),
- AgentPublicKey: []byte("fakefakefakefake"),
- })
- }); err != nil {
-		t.Fatalf("could not set agent started tag on machine: %v", err)
- }
- expectCandidates(0)
-
- // Add a new machine which has an unfulfilled installation request. It should be
- // a candidate.
- if err = session.Transact(ctx, func(q *model.Queries) error {
- machine, err = q.NewMachine(ctx)
- if err != nil {
- return err
- }
- if err := q.MachineAddProvided(ctx, model.MachineAddProvidedParams{
- MachineID: machine.MachineID,
- Provider: model.ProviderEquinix,
- ProviderID: "234",
- }); err != nil {
- return err
- }
- if err := q.MachineSetOSInstallationRequest(ctx, model.MachineSetOSInstallationRequestParams{
- MachineID: machine.MachineID,
- Generation: 10,
- }); err != nil {
- return err
- }
- return nil
- }); err != nil {
- t.Fatalf("could not add new machine with installation request: %v", err)
- }
- expectCandidates(1)
-
-	// Fulfill installation request on machine with an older generation. It should
- // remain a candidate.
- if err = session.Transact(ctx, func(q *model.Queries) error {
- if err := q.MachineSetOSInstallationReport(ctx, model.MachineSetOSInstallationReportParams{
- MachineID: machine.MachineID,
- Generation: 9,
- OsInstallationResult: model.MachineOsInstallationResultSuccess,
- }); err != nil {
- return err
- }
- return nil
- }); err != nil {
- t.Fatalf("could not fulfill installation request with older generation: %v", err)
- }
- expectCandidates(1)
-
- // Fulfill installation request with correct generation. The machine should not
- // be a candidate anymore.
- if err = session.Transact(ctx, func(q *model.Queries) error {
- if err := q.MachineSetOSInstallationReport(ctx, model.MachineSetOSInstallationReportParams{
- MachineID: machine.MachineID,
- Generation: 10,
- OsInstallationResult: model.MachineOsInstallationResultSuccess,
- }); err != nil {
- return err
- }
- return nil
- }); err != nil {
- t.Fatalf("could not fulfill installation request with current generation: %v", err)
- }
- expectCandidates(0)
-}
-
-// TestAgentRecovery exercises GetMachineForAgentRecovery through a few
-// different scenarios in which a test machine is present with different tags
-// set.
-func TestAgentRecovery(t *testing.T) {
- b := dut()
- conn, err := b.Open(true)
- if err != nil {
- t.Fatalf("Open failed: %v", err)
- }
-
- ctx, ctxC := context.WithCancel(context.Background())
- defer ctxC()
-
- session, err := conn.StartSession(ctx)
- if err != nil {
- t.Fatalf("Starting session failed: %v", err)
- }
-
- for i, scenario := range []struct {
- // Whether recovery is expected to run.
- wantRun bool
- // started will add a AgentStarted tag for a given time, if set.
- started time.Time
- // heartbeat will add a AgentHeartbeat tag for a given time, if set.
- heartbeat time.Time
-		// requestGeneration will populate an OSInstallationRequest if not zero.
-		requestGeneration int64
-		// reportGeneration will populate an OSInstallationReport if not zero.
- reportGeneration int64
- }{
- // No start, no heartbeat -> no recovery expected.
- {false, time.Time{}, time.Time{}, 0, 0},
- // Started recently, no heartbeat -> no recovery expected.
- {false, time.Now(), time.Time{}, 0, 0},
- // Started a while ago, heartbeat active -> no recovery expected.
- {false, time.Now().Add(-40 * time.Minute), time.Now(), 0, 0},
-
- // Started a while ago, no heartbeat -> recovery expected.
- {true, time.Now().Add(-40 * time.Minute), time.Time{}, 0, 0},
- // Started a while ago, no recent heartbeat -> recovery expected.
- {true, time.Now().Add(-40 * time.Minute), time.Now().Add(-20 * time.Minute), 0, 0},
-
- // Installation request without report -> recovery expected.
- {true, time.Now().Add(-40 * time.Minute), time.Time{}, 10, 0},
- {true, time.Now().Add(-40 * time.Minute), time.Now().Add(-20 * time.Minute), 10, 0},
- // Installation request mismatching report -> recovery expected.
- {true, time.Now().Add(-40 * time.Minute), time.Time{}, 10, 9},
- {true, time.Now().Add(-40 * time.Minute), time.Now().Add(-20 * time.Minute), 10, 9},
- // Installation request matching report -> no recovery expected.
- {false, time.Now().Add(-40 * time.Minute), time.Time{}, 10, 10},
- {false, time.Now().Add(-40 * time.Minute), time.Now().Add(-20 * time.Minute), 10, 10},
- } {
- var machineID uuid.UUID
- if err := session.Transact(ctx, func(q *model.Queries) error {
- machine, err := q.NewMachine(ctx)
- if err != nil {
- return fmt.Errorf("NewMachine: %w", err)
- }
- machineID = machine.MachineID
- if err := q.MachineAddProvided(ctx, model.MachineAddProvidedParams{
- MachineID: machine.MachineID,
- Provider: model.ProviderEquinix,
- ProviderID: fmt.Sprintf("test-%d", i),
- }); err != nil {
- return fmt.Errorf("MachineAddProvided: %w", err)
- }
- if !scenario.started.IsZero() {
- if err := q.MachineSetAgentStarted(ctx, model.MachineSetAgentStartedParams{
- MachineID: machine.MachineID,
- AgentStartedAt: scenario.started,
- AgentPublicKey: []byte("fake"),
- }); err != nil {
- return fmt.Errorf("MachineSetAgentStarted: %w", err)
- }
- }
- if !scenario.heartbeat.IsZero() {
- if err := q.MachineSetAgentHeartbeat(ctx, model.MachineSetAgentHeartbeatParams{
- MachineID: machine.MachineID,
- AgentHeartbeatAt: scenario.heartbeat,
- }); err != nil {
- return fmt.Errorf("MachineSetAgentHeartbeat: %w", err)
- }
- }
- if scenario.requestGeneration != 0 {
- if err := q.MachineSetOSInstallationRequest(ctx, model.MachineSetOSInstallationRequestParams{
- MachineID: machine.MachineID,
- Generation: scenario.requestGeneration,
- }); err != nil {
- return fmt.Errorf("MachineSetOSInstallationRequest: %w", err)
- }
- }
- if scenario.reportGeneration != 0 {
- if err := q.MachineSetOSInstallationReport(ctx, model.MachineSetOSInstallationReportParams{
- MachineID: machine.MachineID,
- Generation: scenario.reportGeneration,
- OsInstallationResult: model.MachineOsInstallationResultSuccess,
- }); err != nil {
- return fmt.Errorf("MachineSetOSInstallationReport: %w", err)
- }
- }
- return nil
- }); err != nil {
- t.Errorf("%d: setup failed: %v", i, err)
- continue
- }
-
- found := false
- if err := session.Transact(ctx, func(q *model.Queries) error {
- candidates, err := q.GetMachineForAgentRecovery(ctx, model.GetMachineForAgentRecoveryParams{
- Limit: 100,
- Provider: model.ProviderEquinix,
- })
- if err != nil {
- return fmt.Errorf("GetMachinesForAgentRecovery: %w", err)
- }
- for _, c := range candidates {
- if c.MachineID == machineID {
- found = true
- break
- }
- }
- return nil
- }); err != nil {
- t.Errorf("%d: failed to retrieve candidates: %v", i, err)
- }
- if scenario.wantRun && !found {
- t.Errorf("%d: expected recovery but not scheduled", i)
- }
- if !scenario.wantRun && found {
- t.Errorf("%d: expected no recovery but is scheduled", i)
- }
- }
-}
diff --git a/cloud/bmaas/bmdb/reflection/BUILD.bazel b/cloud/bmaas/bmdb/reflection/BUILD.bazel
deleted file mode 100644
index 0922542..0000000
--- a/cloud/bmaas/bmdb/reflection/BUILD.bazel
+++ /dev/null
@@ -1,22 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
-
-go_library(
- name = "reflection",
- srcs = [
- "reflection.go",
- "schema.go",
- ],
- importpath = "source.monogon.dev/cloud/bmaas/bmdb/reflection",
- visibility = ["//visibility:public"],
- deps = [
- "//cloud/bmaas/server/api",
- "@com_github_google_uuid//:uuid",
- "@com_github_iancoleman_strcase//:strcase",
- "@io_k8s_klog_v2//:klog",
- "@org_golang_google_protobuf//encoding/prototext",
- "@org_golang_google_protobuf//proto",
- "@org_golang_google_protobuf//reflect/protopath",
- "@org_golang_google_protobuf//reflect/protorange",
- "@org_golang_google_protobuf//reflect/protoreflect",
- ],
-)
diff --git a/cloud/bmaas/bmdb/reflection/reflection.go b/cloud/bmaas/bmdb/reflection/reflection.go
deleted file mode 100644
index 0976905..0000000
--- a/cloud/bmaas/bmdb/reflection/reflection.go
+++ /dev/null
@@ -1,499 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-// Package reflection implements facilities to retrieve information about the
-// implemented Tags and their types from a plain CockroachDB SQL connection,
-// bypassing the queries/types defined in models. Then, the retrieved Schema can
-// be used to retrieve information about machines.
-//
-// This is designed to be used in debugging facilities to allow arbitrary machine
-// introspection. It must _not_ be used in the user path, as the schema
-// extraction functionality is implemented best-effort.
-package reflection
-
-import (
- "context"
- "database/sql"
- "encoding/hex"
- "fmt"
- "sort"
- "strings"
- "time"
-
- "k8s.io/klog/v2"
-
- "github.com/google/uuid"
- "google.golang.org/protobuf/encoding/prototext"
- "google.golang.org/protobuf/proto"
- "google.golang.org/protobuf/reflect/protopath"
- "google.golang.org/protobuf/reflect/protorange"
-)
-
-// GetMachinesOpts influences the behaviour of GetMachines.
-type GetMachinesOpts struct {
- // FilterMachine, if set, will only retrieve information about the machine with
- // the given UUID. In case the given machine UUID does not exist in the database,
- // an empty result will be returned and _no_ error will be set.
- FilterMachine *uuid.UUID
- // Strict enables strict consistency. This is not recommended for use when
- // retrieving all machines, as such queries will compete against all currently
- // running operations. When not enabled, the retrieval will be executed AS OF
-// SYSTEM TIME follower_read_timestamp(), meaning the data might be a few seconds out
- // of date. Regardless of the option, the returned machine data will be
- // internally consistent, even across machines - but when not enabled the data
- // might be stale.
- Strict bool
- // ExpiredBackoffs enables the retrieval of information about all machine
-// backoffs, including expired ones. Note that expired backoffs might be
- // garbage collected in the future, and their long-term storage is not
- // guaranteed.
- ExpiredBackoffs bool
-}
-
-// GetMachines retrieves all available BMDB data about one or more machines. The
-// Schema's embedded SQL connection is used to perform the retrieval.
-//
-// Options can be specified to influence the exact operation performed. By
-// default (with a zeroed structure or nil pointer), all machines are retrieved
-// with weak consistency, including only their active backoffs. See
-// GetMachinesOpts to influence this behaviour.
-func (r *Schema) GetMachines(ctx context.Context, opts *GetMachinesOpts) (*Reflected[[]*Machine], error) {
- if opts == nil {
- opts = &GetMachinesOpts{}
- }
-
- // We're about to build a pretty big SELECT query with a ton of joins.
- //
- // First, we join against work_backoff and work to get information about active
- // work and backoffs on the machines we're retrieving.
- //
- // Second, we join against all the tags that are declared in the schema.
-
-	// These are the columns we'll SELECT <...> FROM
- columns := []string{
- "machines.machine_id",
- "machines.machine_created_at",
- "work_backoff.process",
- "work_backoff.cause",
- "work_backoff.until",
- "work.process",
- "work.session_id",
- // ... tag columns will come after this.
- }
-	// These are the args we'll pass to the query.
- var args []any
-
- // Start building joins. First, against work_backoff and work.
- backoffFilter := " AND work_backoff.until > now()"
- if opts.ExpiredBackoffs {
- backoffFilter = ""
- }
- joins := []string{
- "LEFT JOIN work_backoff ON machines.machine_id = work_backoff.machine_id" + backoffFilter,
- "LEFT JOIN work ON machines.machine_id = work.machine_id",
- }
-
- // Then, against tags. Also populate columns as we go along.
- for _, tagType := range r.TagTypes {
- joins = append(joins, fmt.Sprintf("LEFT JOIN %s ON machines.machine_id = %s.machine_id", tagType.NativeName, tagType.NativeName))
- columns = append(columns, fmt.Sprintf("%s.machine_id", tagType.NativeName))
- for _, fieldType := range tagType.Fields {
- columns = append(columns, fmt.Sprintf("%s.%s", tagType.NativeName, fieldType.NativeName))
- }
- }
-
- // Finalize query.
- q := []string{
- "SELECT",
- strings.Join(columns, ", "),
- "FROM machines",
- }
- q = append(q, joins...)
- if !opts.Strict {
- q = append(q, "AS OF SYSTEM TIME follower_read_timestamp()")
- }
- if opts.FilterMachine != nil {
- q = append(q, "WHERE machines.machine_id = $1")
- args = append(args, *opts.FilterMachine)
- }
- q = append(q, "ORDER BY machines.machine_id")
-
- rows, err := r.db.QueryContext(ctx, strings.Join(q, "\n"), args...)
- if err != nil {
- return nil, fmt.Errorf("query failed: %w", err)
- }
- defer rows.Close()
-
- // Okay, we can start scanning the result rows.
- //
- // As this is a complex join, we need to merge some rows together and discard
- // some NULLs. We do merging/deduplication using machine_id values for the
- // machine data, and abuse UNIQUE constraints in the work_backoff/work tables to
- // deduplicate these.
- //
- // The alternative would be to rewrite this query to use array_agg, and we might
- // do that at some point. This is only really a problem if we
- // have _a lot_ of active work/backoffs (as that effectively duplicates all
- // machine/tag data), which isn't the case yet. But we should keep an eye out for
- // this.
-
- var machines []*Machine
- for rows.Next() {
-
- // We need to scan this row back into columns. For constant columns we'll just
- // create the data here and refer to it later.
- var dests []any
-
- // Add non-tag always-retrieved constants.
- var mid uuid.UUID
- var machineCreated time.Time
- var workSession uuid.NullUUID
- var backoffProcess, backoffCause, workProcess sql.NullString
- var backoffUntil sql.NullTime
-
- dests = append(dests, &mid, &machineCreated, &backoffProcess, &backoffCause, &backoffUntil, &workProcess, &workSession)
-
- // For dynamic data, we need to keep a reference to a list of columns that are
- // part of tags, and then refer to them later. We can't just refer back to dests
- // as the types are erased into `any`. scannedTags is that data storage.
- type scannedTag struct {
- ty *TagType
- id uuid.NullUUID
- fields []*TagField
- }
- var scannedTags []*scannedTag
- for _, tagType := range r.TagTypes {
- tagType := tagType
- st := scannedTag{
- ty: &tagType,
- }
- scannedTags = append(scannedTags, &st)
- dests = append(dests, &st.id)
- for _, fieldType := range tagType.Fields {
- fieldType := fieldType
- field := TagField{
- Type: &fieldType,
- }
- dests = append(dests, &field)
- st.fields = append(st.fields, &field)
-
- }
- }
-
- if err := rows.Scan(dests...); err != nil {
- return nil, fmt.Errorf("scan failed: %w", err)
- }
-
- // Now comes the merging/deduplication.
-
- // First, check if we are processing a new machine. If so, create a new
- // Machine. Otherwise, pick up the previous one.
- var machine *Machine
- if len(machines) == 0 || machines[len(machines)-1].ID.String() != mid.String() {
- // New machine or no machine yet.
- machine = &Machine{
- ID: mid,
- Created: machineCreated,
- Tags: make(map[string]*Tag),
- Backoffs: make(map[string]Backoff),
- Work: make(map[string]Work),
- }
-
- // Collect tags into machine.
- for _, st := range scannedTags {
- if !st.id.Valid {
- continue
- }
- var fields []TagField
- for _, f := range st.fields {
- fields = append(fields, *f)
- }
- machine.Tags[st.ty.Name()] = &Tag{
- Type: st.ty,
- Fields: fields,
- }
- }
- machines = append(machines, machine)
- } else {
- // Continue previous machine.
- machine = machines[len(machines)-1]
- }
-
- // Do we have a backoff? Upsert it to the machine. This works because there's a
- // UNIQUE(machine_id, process) constraint on the work_backoff table, and we're
- // effectively rebuilding that keyspace here by indexing first by machine then by
- // process.
- if backoffCause.Valid && backoffProcess.Valid && backoffUntil.Valid {
- process := backoffProcess.String
- machine.Backoffs[process] = Backoff{
- Cause: backoffCause.String,
- Process: process,
- Until: backoffUntil.Time,
- }
- }
-
- // Do we have an active work item? Upsert it to the machine. Same UNIQUE
- // constraint abuse happening here.
- if workProcess.Valid && workSession.Valid {
- process := workProcess.String
- machine.Work[process] = Work{
- SessionID: workSession.UUID,
- Process: process,
- }
- }
- }
-
- return &Reflected[[]*Machine]{
- Data: machines,
- Query: strings.Join(q, " "),
- }, nil
-}
-
-// Reflected wraps data retrieved by reflection (T) with metadata about the
-// retrieval.
-type Reflected[T any] struct {
- Data T
- // Effective SQL query performed on the database.
- Query string
-}
-
-// Machine retrieved from BMDB.
-type Machine struct {
- ID uuid.UUID
- Created time.Time
-
- // Tags on this machine, keyed by Tag type name (canonical, not native).
- Tags map[string]*Tag
-
- // Backoffs on this machine, keyed by process name. By default these are only
-	// active backoffs, unless ExpiredBackoffs was set on GetMachinesOpts.
- Backoffs map[string]Backoff
-
- // Work active on this machine, keyed by process name.
- Work map[string]Work
-}
-
-// ActiveBackoffs retrieves a copy of a Machine's active backoffs. Note: the
-// expiration check is performed according to the current system time, so it might
-// not be consistent with the data snapshot retrieved from the database.
-func (r *Machine) ActiveBackoffs() []*Backoff {
- var res []*Backoff
- for _, bo := range r.Backoffs {
- bo := bo
- if !bo.Active() {
- continue
- }
- res = append(res, &bo)
- }
- sort.Slice(res, func(i, j int) bool { return res[i].Process < res[j].Process })
- return res
-}
-
-// ExpiredBackoffs retrieves a copy of a Machine's expired backoffs. Note: the
-// expiration check is performed according to the current system time, so it
-// might not be consistent with the data snapshot retrieved from the database.
-func (r *Machine) ExpiredBackoffs() []*Backoff {
- var res []*Backoff
- for _, bo := range r.Backoffs {
- bo := bo
- if bo.Active() {
- continue
- }
- res = append(res, &bo)
- }
- sort.Slice(res, func(i, j int) bool { return res[i].Process < res[j].Process })
- return res
-}
-
-// Tag value set on a Machine.
-type Tag struct {
- // Type describing this tag.
- Type *TagType
- // Field data contained in this tag, sorted alphabetically by name.
- Fields []TagField
-}
-
-// Field is a shorthand for returning a TagField by its name.
-func (r *Tag) Field(name string) *TagField {
- for _, f := range r.Fields {
- if f.Type.NativeName == name {
- return &f
- }
- }
- return nil
-}
-
-// DisplayOption is an opaque argument used to influence the display style of a
-// tag value when returned from HumanValue.
-type DisplayOption string
-
-const (
- // DisplaySingleLine limits display to a single line (i.e. don't try to
- // pretty-print long values by inserting newlines and indents).
- DisplaySingleLine DisplayOption = "single-line"
-)
-
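-// HumanValue returns a human-readable, comma-separated rendering of this tag's
-// fields and their values.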
-func (r *Tag) HumanValue(opts ...DisplayOption) string {
- var kvs []string
- for _, field := range r.Fields {
- kvs = append(kvs, fmt.Sprintf("%s: %s", field.Type.NativeName, field.HumanValue(opts...)))
- }
- return strings.Join(kvs, ", ")
-}
-
-// TagField value which is part of a Tag set on a Machine.
-type TagField struct {
- // Type describing this field.
- Type *TagFieldType
-
- text *string
- bytes *[]byte
- time *time.Time
- proto proto.Message
-}
-
-// HumanValue returns a human-readable (best effort) representation of the field
-// value.
-func (r *TagField) HumanValue(opts ...DisplayOption) string {
- switch {
- case r.proto != nil:
- mopts := prototext.MarshalOptions{
- Multiline: true,
- Indent: "\t",
- }
- for _, opt := range opts {
- if opt == DisplaySingleLine {
- mopts.Multiline = false
- }
- }
- return mopts.Format(r.proto)
- case r.text != nil:
- return *r.text
- case r.bytes != nil:
- return hex.EncodeToString(*r.bytes)
- case r.time != nil:
- return r.time.String()
- default:
- return "<unknown>"
- }
-}
-
-// Index attempts to index into a structured tag field (currently only protobuf
-// fields) by a 'field.subfield.subsubfield' selector.
-//
-// The selector for Protobuf fields follows the convention from 'protorange',
-// which is a semi-standardized format used in the Protobuf ecosystem. See
-// https://pkg.go.dev/google.golang.org/protobuf/reflect/protorange for more
-// details.
-//
-// An error will be returned if the TagField is not a protobuf field or if the
-// given selector does not point to a known message field.
-func (r *TagField) Index(k string) (string, error) {
- if r.Type.ProtoType == nil {
- return "", fmt.Errorf("can only index proto fields")
- }
- k = fmt.Sprintf("(%s).%s", r.Type.ProtoType.Descriptor().FullName(), k)
-
- var res string
- var found bool
- ref := r.proto.ProtoReflect()
- protorange.Range(ref, func(values protopath.Values) error {
- if values.Path.String() == k {
- res = values.Index(-1).Value.String()
- found = true
- }
- return nil
- })
-
- if !found {
- return "", fmt.Errorf("protobuf field not found")
- }
- return res, nil
-}
-
-// Backoff on a Machine.
-type Backoff struct {
- // Process which established Backoff.
- Process string
- // Time when Backoff expires.
- Until time.Time
- // Cause for the Backoff as emitted by worker.
- Cause string
-}
-
-// Active returns whether this Backoff is _currently_ active per the _local_ time.
-func (r Backoff) Active() bool {
- return time.Now().Before(r.Until)
-}
-
-// Work being actively performed on a Machine.
-type Work struct {
- // SessionID of the worker performing this Work.
- SessionID uuid.UUID
- // Process name of this Work.
- Process string
-}
-
-// Scan implements sql.Scanner for direct scanning of query results into a
-// reflected tag value. This method is not meant to be used outside the
-// reflection package.
-func (r *TagField) Scan(src any) error {
- if src == nil {
- return nil
- }
-
- switch r.Type.NativeType {
- case "text":
- src2, ok := src.(string)
- if !ok {
- return fmt.Errorf("SQL type %q, but got %+v", r.Type.NativeType, src)
- }
- r.text = &src2
- case "bytea":
- src2, ok := src.([]byte)
- if !ok {
- return fmt.Errorf("SQL type %q, but got %+v", r.Type.NativeType, src)
- }
- // Copy the bytes, as they are otherwise going to be reused by the pq library.
- copied := make([]byte, len(src2))
- copy(copied[:], src2)
- r.bytes = &copied
-
- if r.Type.ProtoType != nil {
- msg := r.Type.ProtoType.New().Interface()
- err := proto.Unmarshal(*r.bytes, msg)
- if err != nil {
- klog.Warningf("Could not unmarshal %s: %v", r.Type.NativeName, err)
- } else {
- r.proto = msg
- }
- }
- case "USER-DEFINED":
- switch r.Type.NativeUDTName {
- case "provider", "provider_status":
- src2, ok := src.([]byte)
- if !ok {
- return fmt.Errorf("SQL type %q, but got %+v", r.Type.NativeType, src)
- }
- src3 := string(src2)
- r.text = &src3
- }
- case "timestamp with time zone":
- src2, ok := src.(time.Time)
- if !ok {
- return fmt.Errorf("SQL type %q, but got %+v", r.Type.NativeType, src)
- }
- r.time = &src2
- case "bigint":
- src2, ok := src.(int64)
- if !ok {
- return fmt.Errorf("SQL type %q, but got %+v", r.Type.NativeType, src)
- }
- src3 := fmt.Sprintf("%d", src2)
- r.text = &src3
- default:
- return fmt.Errorf("unimplemented SQL type %q", r.Type.NativeType)
- }
-
- return nil
-}
diff --git a/cloud/bmaas/bmdb/reflection/schema.go b/cloud/bmaas/bmdb/reflection/schema.go
deleted file mode 100644
index bceb931..0000000
--- a/cloud/bmaas/bmdb/reflection/schema.go
+++ /dev/null
@@ -1,200 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package reflection
-
-import (
- "context"
- "database/sql"
- "fmt"
- "strings"
-
- "github.com/iancoleman/strcase"
- "google.golang.org/protobuf/proto"
- "google.golang.org/protobuf/reflect/protoreflect"
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/bmaas/server/api"
-)
-
-// Schema contains information about the tag types in a BMDB, extracted by
-// reflecting on the database schema.
-//
-// It also embeds a connection to the CockroachDB database backing this BMDB,
-// which is then used to retrieve the data described by this schema (see
-// GetMachines).
-type Schema struct {
- // TagTypes is the list of tag types extracted from the BMDB.
- TagTypes []TagType
- // Version is the go-migrate schema version of the BMDB this schema was extracted
- // from. By convention, it is a stringified base-10 number representing the number
- // of seconds since UNIX epoch of when the migration version was created, but
- // this is not guaranteed.
- Version string
-
- db *sql.DB
-}
-
-// TagType describes the type of a BMDB Tag. Each tag type in turn corresponds
-// to a CockroachDB table.
-type TagType struct {
- // NativeName is the name of the table that holds tags of this type.
- NativeName string
- // Fields are the types of fields contained in this tag type.
- Fields []TagFieldType
-}
-
-// Name returns the canonical name of this tag type. For example, a table named
-// machine_agent_started will have a canonical name AgentStarted.
-func (r *TagType) Name() string {
- tableSuffix := strings.TrimPrefix(r.NativeName, "machine_")
- parts := strings.Split(tableSuffix, "_")
- // Capitalize some known acronyms.
- for i, p := range parts {
- parts[i] = strings.ReplaceAll(p, "os", "OS")
- }
- return strcase.ToCamel(strings.Join(parts, "_"))
-}
-
-// TagFieldType is the type of a field within a BMDB Tag. Each tag field in turn
-// corresponds to a column inside its Tag table.
-type TagFieldType struct {
- // NativeName is the name of the column that holds this field type. It is also
- // the canonical name of the field type.
- NativeName string
- // NativeType is the CockroachDB type name of this field.
- NativeType string
- // NativeUDTName is the CockroachDB user-defined-type name of this field. This is
- // only valid if NativeType is 'USER-DEFINED'.
- NativeUDTName string
- // ProtoType is set non-nil if the field is a serialized protobuf of the same
- // type as the given protoreflect.Message.
- ProtoType protoreflect.Message
-}
-
-// knownProtoFields is a mapping from column name of a field containing a
-// serialized protobuf to an instance of a proto.Message that will be used to
-// parse that column's data.
-//
-// Mapping by column name alone is good enough for now, as we have mostly unique
-// column names, and these column names uniquely map to a single type.
-var knownProtoFields = map[string]proto.Message{
- "hardware_report_raw": &api.AgentHardwareReport{},
- "os_installation_request_raw": &api.OSInstallationRequest{},
- "os_installation_report_raw": &api.OSInstallationReport{},
-}
-
-// HumanType returns a human-readable representation of the field's type. This is
-// not well-defined, and should be used only informatively.
-func (r *TagFieldType) HumanType() string {
- if r.ProtoType != nil {
- return string(r.ProtoType.Descriptor().FullName())
- }
- switch r.NativeType {
- case "USER-DEFINED":
- return r.NativeUDTName
- case "timestamp with time zone":
- return "timestamp"
- case "bytea":
- return "bytes"
- case "bigint":
- return "int"
- default:
- return r.NativeType
- }
-}
-
-// Reflect builds a runtime BMDB schema from a raw SQL connection to the BMDB
-// database. You're probably looking for bmdb.Connection.Reflect.
-func Reflect(ctx context.Context, db *sql.DB) (*Schema, error) {
-	// Get all tables in the database we are currently connected to.
- rows, err := db.QueryContext(ctx, `
- SELECT table_name
- FROM information_schema.tables
- WHERE table_catalog = current_database()
- AND table_schema = 'public'
- AND table_name LIKE 'machine\_%'
- `)
- if err != nil {
- return nil, fmt.Errorf("could not query table names: %w", err)
- }
- defer rows.Close()
-
- // Collect all table names for further processing.
- var tableNames []string
- for rows.Next() {
- var name string
- if err := rows.Scan(&name); err != nil {
- return nil, fmt.Errorf("table name scan failed: %w", err)
- }
- tableNames = append(tableNames, name)
- }
-
- // Start processing each table into a TagType.
- tags := make([]TagType, 0, len(tableNames))
- for _, tagName := range tableNames {
- // Get all columns of the table.
- rows, err := db.QueryContext(ctx, `
- SELECT column_name, data_type, udt_name
- FROM information_schema.columns
- WHERE table_catalog = current_database()
- AND table_schema = 'public'
- AND table_name = $1
- `, tagName)
- if err != nil {
- return nil, fmt.Errorf("could not query columns: %w", err)
- }
-
- tag := TagType{
- NativeName: tagName,
- }
-
- // Build field types from columns.
- foundMachineID := false
- for rows.Next() {
- var column_name, data_type, udt_name string
- if err := rows.Scan(&column_name, &data_type, &udt_name); err != nil {
- rows.Close()
- return nil, fmt.Errorf("column scan failed: %w", err)
- }
- if column_name == "machine_id" {
- foundMachineID = true
- continue
- }
- field := TagFieldType{
- NativeName: column_name,
- NativeType: data_type,
- NativeUDTName: udt_name,
- }
- if t, ok := knownProtoFields[column_name]; ok {
- field.ProtoType = t.ProtoReflect()
- }
- tag.Fields = append(tag.Fields, field)
- }
-
-		// Make sure the table has a machine_id column (skipped above when collecting fields); otherwise ignore the table.
- if !foundMachineID {
- klog.Warningf("Table %q has no machine_id column, skipping", tag.NativeName)
- continue
- }
-
- tags = append(tags, tag)
- }
-
- // Retrieve version information from go-migrate's schema_migrations table.
- var version string
- var dirty bool
- if err := db.QueryRowContext(ctx, "SELECT version, dirty FROM schema_migrations").Scan(&version, &dirty); err != nil {
- return nil, fmt.Errorf("could not select schema version: %w", err)
- }
- if dirty {
- version += " DIRTY!!!"
- }
-
- return &Schema{
- TagTypes: tags,
- Version: version,
-
- db: db,
- }, nil
-}
diff --git a/cloud/bmaas/bmdb/reflection_test.go b/cloud/bmaas/bmdb/reflection_test.go
deleted file mode 100644
index 17abc9d..0000000
--- a/cloud/bmaas/bmdb/reflection_test.go
+++ /dev/null
@@ -1,300 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package bmdb
-
-import (
- "context"
- "fmt"
- "strings"
- "testing"
- "time"
-
- "github.com/google/uuid"
- "google.golang.org/protobuf/proto"
-
- apb "source.monogon.dev/cloud/agent/api"
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/cloud/bmaas/bmdb/reflection"
- "source.monogon.dev/cloud/bmaas/server/api"
-)
-
-// TestReflection exercises the BMDB schema reflection and data retrieval code.
-// Ideally this code would live in //cloud/bmaas/bmdb/reflection, but due to
-// namespacing issues it lives here.
-func TestReflection(t *testing.T) {
- b := dut()
- conn, err := b.Open(true)
- if err != nil {
- t.Fatalf("Open failed: %v", err)
- }
-
- ctx, ctxC := context.WithCancel(context.Background())
- defer ctxC()
-
- sess, err := conn.StartSession(ctx)
- if err != nil {
- t.Fatalf("StartSession: %v", err)
- }
-
- // Create 10 test machines.
- var mids []uuid.UUID
- err = sess.Transact(ctx, func(q *model.Queries) error {
- for i := 0; i < 10; i += 1 {
- mach, err := q.NewMachine(ctx)
- if err != nil {
- return err
- }
- err = q.MachineAddProvided(ctx, model.MachineAddProvidedParams{
- MachineID: mach.MachineID,
- Provider: model.ProviderEquinix,
- ProviderID: fmt.Sprintf("test-%d", i),
- })
- if err != nil {
- return err
- }
- mids = append(mids, mach.MachineID)
- }
- return nil
- })
- if err != nil {
- t.Fatal(err)
- }
- // Start and fail work on one of the machines with an hour long backoff.
- w, err := sess.Work(ctx, model.ProcessUnitTest1, func(q *model.Queries) ([]uuid.UUID, error) {
- return mids[0:1], nil
- })
- if err != nil {
- t.Fatal(err)
- }
- backoff := Backoff{
- Initial: time.Hour,
- }
- w.Fail(ctx, &backoff, "failure test")
-
- // On another machine, create a failure with a 1 second backoff.
- w, err = sess.Work(ctx, model.ProcessUnitTest1, func(q *model.Queries) ([]uuid.UUID, error) {
- return mids[1:2], nil
- })
- if err != nil {
- t.Fatal(err)
- }
- backoff = Backoff{
- Initial: time.Second,
- }
- w.Fail(ctx, &backoff, "failure test")
- // Later on in the test we must wait for this backoff to actually elapse. Start
- // counting now.
- elapsed := time.NewTicker(time.Second * 1)
- defer elapsed.Stop()
-
- // On another machine, create work and don't finish it yet.
- _, err = sess.Work(ctx, model.ProcessUnitTest1, func(q *model.Queries) ([]uuid.UUID, error) {
- return mids[2:3], nil
- })
- if err != nil {
- t.Fatal(err)
- }
-
- schema, err := conn.Reflect(ctx)
- if err != nil {
- t.Fatalf("ReflectTagTypes: %v", err)
- }
-
- // Dump all in strict mode.
- opts := &reflection.GetMachinesOpts{
- Strict: true,
- }
- res, err := schema.GetMachines(ctx, opts)
- if err != nil {
- t.Fatalf("Dump failed: %v", err)
- }
- if res.Query == "" {
- t.Errorf("Query not set on result")
- }
- machines := res.Data
- if want, got := 10, len(machines); want != got {
- t.Fatalf("Expected %d machines in dump, got %d", want, got)
- }
-
- // Expect Provided tag on all machines. Do a detailed check on fields, too.
- for _, machine := range machines {
- tag, ok := machine.Tags["Provided"]
- if !ok {
- t.Errorf("No Provided tag on machine.")
- continue
- }
- if want, got := "Provided", tag.Type.Name(); want != got {
- t.Errorf("Provided tag should have type %q, got %q", want, got)
- }
- if provider := tag.Field("provider"); provider != nil {
-			if want, got := "Equinix", provider.HumanValue(); want != got {
- t.Errorf("Wanted Provided.provider value %q, got %q", want, got)
- }
- } else {
- t.Errorf("Provider tag has no provider field")
- }
- if providerId := tag.Field("provider_id"); providerId != nil {
- if !strings.HasPrefix(providerId.HumanValue(), "test-") {
- t.Errorf("Unexpected provider_id value %q", providerId.HumanValue())
- }
- } else {
- t.Errorf("Provider tag has no provider_id field")
- }
- }
-
- // Now just dump one machine.
- opts.FilterMachine = &mids[0]
- res, err = schema.GetMachines(ctx, opts)
- if err != nil {
- t.Fatalf("Dump failed: %v", err)
- }
- machines = res.Data
- if want, got := 1, len(machines); want != got {
- t.Fatalf("Expected %d machines in dump, got %d", want, got)
- }
- if want, got := mids[0].String(), machines[0].ID.String(); want != got {
- t.Fatalf("Expected machine %s, got %s", want, got)
- }
-
- // Now dump a machine that doesn't exist. That should just return an empty list.
- fakeMid := uuid.New()
- opts.FilterMachine = &fakeMid
- res, err = schema.GetMachines(ctx, opts)
- if err != nil {
- t.Fatalf("Dump failed: %v", err)
- }
- machines = res.Data
- if want, got := 0, len(machines); want != got {
- t.Fatalf("Expected %d machines in dump, got %d", want, got)
- }
-
- // Finally, check the special case machines. The first one should have an active
- // backoff.
- opts.FilterMachine = &mids[0]
- res, err = schema.GetMachines(ctx, opts)
- if err != nil {
- t.Errorf("Dump failed: %v", err)
- } else {
- machine := res.Data[0]
- if _, ok := machine.Backoffs["UnitTest1"]; !ok {
- t.Errorf("Expected UnitTest1 backoff on machine")
- }
- }
- // The second one should have an expired backoff that shouldn't be reported in a
-	// normal call.
- <-elapsed.C
- opts.FilterMachine = &mids[1]
- res, err = schema.GetMachines(ctx, opts)
- if err != nil {
- t.Errorf("Dump failed: %v", err)
- } else {
- machine := res.Data[0]
- if _, ok := machine.Backoffs["UnitTest1"]; ok {
- t.Errorf("Expected no UnitTest1 backoff on machine")
- }
- }
- // But if we ask for expired backoffs, we should get it.
- opts.ExpiredBackoffs = true
- res, err = schema.GetMachines(ctx, opts)
- if err != nil {
- t.Errorf("Dump failed: %v", err)
- } else {
- machine := res.Data[0]
- if _, ok := machine.Backoffs["UnitTest1"]; !ok {
- t.Errorf("Expected UnitTest1 backoff on machine")
- }
- }
- // Finally, the third machine should have an active Work item.
- opts.FilterMachine = &mids[2]
- res, err = schema.GetMachines(ctx, opts)
- if err != nil {
- t.Errorf("Dump failed: %v", err)
- } else {
- machine := res.Data[0]
- if _, ok := machine.Work["UnitTest1"]; !ok {
- t.Errorf("Expected UnitTest1 work item on machine")
- }
- }
-}
-
-// TestReflectionProtoFields ensures that the basic proto field introspection
-// functionality works.
-func TestReflectionProtoFields(t *testing.T) {
- s := dut()
- ctx, ctxC := context.WithCancel(context.Background())
- defer ctxC()
-
- bmdb, err := s.Open(true)
- if err != nil {
- t.Fatalf("Open: %v", err)
- }
- sess, err := bmdb.StartSession(ctx)
- if err != nil {
- t.Fatalf("StartSession: %v", err)
- }
- var machine model.Machine
- err = sess.Transact(ctx, func(q *model.Queries) error {
- machine, err = q.NewMachine(ctx)
- if err != nil {
- return err
- }
-
- report := &api.AgentHardwareReport{
- Report: &apb.Node{
- Manufacturer: "Charles Babbage",
- Product: "Analytical Engine",
- SerialNumber: "183701",
- MemoryInstalledBytes: 14375,
- MemoryUsableRatio: 1.0,
- Cpu: []*apb.CPU{
- {
- Architecture: nil,
- HardwareThreads: 1,
- Cores: 1,
- },
- },
- },
- Warning: []string{"something went wrong"},
- }
- b, _ := proto.Marshal(report)
- return q.MachineSetHardwareReport(ctx, model.MachineSetHardwareReportParams{
- MachineID: machine.MachineID,
- HardwareReportRaw: b,
- })
- })
- if err != nil {
- t.Fatalf("Failed to submit hardware report: %v", err)
- }
-
- schem, err := bmdb.Reflect(ctx)
- if err != nil {
- t.Fatalf("Failed to reflect on database: %v", err)
- }
-
- machines, err := schem.GetMachines(ctx, &reflection.GetMachinesOpts{FilterMachine: &machine.MachineID, Strict: true})
- if err != nil {
- t.Fatalf("Failed to get machine: %v", err)
- }
- if len(machines.Data) != 1 {
- t.Errorf("Expected one machine, got %d", len(machines.Data))
- } else {
- machine := machines.Data[0]
- ty := machine.Tags["HardwareReport"].Field("hardware_report_raw").Type.HumanType()
- if want, got := "cloud.bmaas.server.api.AgentHardwareReport", ty; want != got {
- t.Errorf("Mismatch in type: wanted %q, got %q", want, got)
- }
- v := machine.Tags["HardwareReport"].Field("hardware_report_raw").HumanValue()
- if !strings.Contains(v, "manufacturer:") {
- t.Errorf("Invalid serialized prototext: %s", v)
- }
- fv, err := machine.Tags["HardwareReport"].Field("hardware_report_raw").Index("report.cpu[0].cores")
- if err != nil {
- t.Errorf("Could not get report.cpu[0].cores from hardware_report_raw: %v", err)
- } else {
- if want, got := "1", fv; want != got {
- t.Errorf("report.cpu[0].cores should be %q, got %q", want, got)
- }
- }
- }
-}
diff --git a/cloud/bmaas/bmdb/sessions.go b/cloud/bmaas/bmdb/sessions.go
deleted file mode 100644
index 22ea3fd..0000000
--- a/cloud/bmaas/bmdb/sessions.go
+++ /dev/null
@@ -1,637 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package bmdb
-
-import (
- "context"
- "database/sql"
- "errors"
- "fmt"
- "time"
-
- "github.com/cockroachdb/cockroach-go/v2/crdb"
- "github.com/google/uuid"
- "github.com/lib/pq"
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/bmaas/bmdb/metrics"
- "source.monogon.dev/cloud/bmaas/bmdb/model"
-)
-
-// StartSession creates a new BMDB session which will be maintained in a
-// background goroutine as long as the given context is valid. Each Session is
-// represented by an entry in a sessions table within the BMDB, and subsequent
-// Transact calls emit SQL transactions which depend on that entry still being
-// present and up to date. A garbage collection system (to be implemented) will
-// remove expired sessions from the BMDB, but garbage collection is not required
-// for session expiry to work.
-//
-// When the session becomes invalid (for example due to a network partition),
-// subsequent attempts to call Transact will fail with ErrSessionExpired. This
-// means that the caller within the component is responsible for starting a new
-// Session if a previously used one expires.
-func (c *Connection) StartSession(ctx context.Context, opts ...SessionOption) (*Session, error) {
- intervalSeconds := 5
-
- res, err := model.New(c.db).NewSession(ctx, model.NewSessionParams{
- SessionComponentName: c.bmdb.ComponentName,
- SessionRuntimeInfo: c.bmdb.RuntimeInfo,
- SessionIntervalSeconds: int64(intervalSeconds),
- })
- if err != nil {
- return nil, fmt.Errorf("creating session failed: %w", err)
- }
-
- klog.Infof("Started session %s", res.SessionID)
-
- ctx2, ctxC := context.WithCancel(ctx)
-
- var processor metrics.Processor
- for _, opt := range opts {
- if opt.Processor != "" {
- processor = opt.Processor
- }
- }
-
- s := &Session{
- connection: c,
- interval: time.Duration(intervalSeconds) * time.Second,
-
- UUID: res.SessionID,
-
- ctx: ctx2,
- ctxC: ctxC,
- m: c.bmdb.metrics.Recorder(processor),
- }
- s.m.OnSessionStarted()
- go s.maintainHeartbeat(ctx2)
- return s, nil
-}
-
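-// SessionOption carries optional configuration for StartSession, currently only
-// the metrics.Processor used when recording the session's metrics.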
-type SessionOption struct {
- Processor metrics.Processor
-}
-
-// Session is a session (identified by UUID) that has been started in the BMDB.
-// Its liveness is maintained by a background goroutine, and as long as that
-// session is alive, it can perform transactions and work on the BMDB.
-type Session struct {
- connection *Connection
- interval time.Duration
-
- UUID uuid.UUID
-
- ctx context.Context
- ctxC context.CancelFunc
-
- m *metrics.ProcessorRecorder
-}
-
-// Expired returns true if this session is expired and will fail all subsequent
-// transactions/work.
-func (s *Session) Expired() bool {
- return s.ctx.Err() != nil
-}
-
-// expire is a helper which marks this session as expired and returns
-// ErrSessionExpired.
-func (s *Session) expire() error {
- s.ctxC()
- return ErrSessionExpired
-}
-
-var (
- // ErrSessionExpired is returned when attempting to Transact or Work on a
- // Session that has expired or been canceled. Once a Session starts returning
- // these errors, it must be re-created by another StartSession call, as no other
- // calls will succeed.
- ErrSessionExpired = errors.New("session expired")
- // ErrWorkConflict is returned when attempting to Work on a Session with a
- // process name that's already performing some work, concurrently, on the
- // requested machine.
- ErrWorkConflict = errors.New("conflicting work on machine")
-)
-
-// maintainHeartbeat repeatedly pokes the session at twice the minimum frequency
-// mandated by the configured 5-second interval. It exits if it detects that the
-// session cannot be maintained anymore, canceling the session's internal
-// context and causing future Transact/Work calls to fail.
-func (s *Session) maintainHeartbeat(ctx context.Context) {
- // Internal deadline, used to check whether we haven't dropped the ball on
- // performing the updates due to a lot of transient errors.
- deadline := time.Now().Add(s.interval)
- for {
- if ctx.Err() != nil {
- klog.Infof("Session %s: context over, exiting: %v", s.UUID, ctx.Err())
- return
- }
-
- err := s.Transact(ctx, func(q *model.Queries) error {
- sessions, err := q.SessionCheck(ctx, s.UUID)
- if err != nil {
- return fmt.Errorf("when retrieving session: %w", err)
- }
- if len(sessions) < 1 {
- return s.expire()
- }
- err = q.SessionPoke(ctx, s.UUID)
- if err != nil {
- return fmt.Errorf("when poking session: %w", err)
- }
- return nil
- })
- if err != nil {
- klog.Errorf("Session %s: update failed: %v", s.UUID, err)
-			if errors.Is(err, ErrSessionExpired) || time.Now().After(deadline) {
- // No way to recover.
- klog.Errorf("Session %s: exiting", s.UUID)
- s.ctxC()
- return
- }
- // Just retry in a bit. One second seems about right for a 5 second interval.
- //
- // TODO(q3k): calculate this based on the configured interval.
- time.Sleep(time.Second)
- }
- // Success. Keep going.
- deadline = time.Now().Add(s.interval)
- select {
- case <-ctx.Done():
- // Do nothing, next loop iteration will exit.
- case <-time.After(s.interval / 2):
- // Do nothing, next loop iteration will heartbeat.
- }
- }
-}
-
-// Transact runs a given function in the context of both a CockroachDB and BMDB
-// transaction, retrying as necessary.
-//
-// Most pure (meaning without side effects outside the database itself) BMDB
-// transactions should be run this way.
-func (s *Session) Transact(ctx context.Context, fn func(q *model.Queries) error) error {
- var attempts int64
-
- err := crdb.ExecuteTx(ctx, s.connection.db, nil, func(tx *sql.Tx) error {
- attempts += 1
- s.m.OnTransactionStarted(attempts)
-
- qtx := model.New(tx)
- sessions, err := qtx.SessionCheck(ctx, s.UUID)
- if err != nil {
- return fmt.Errorf("when retrieving session: %w", err)
- }
- if len(sessions) < 1 {
- return s.expire()
- }
-
- if err := fn(qtx); err != nil {
- return err
- }
-
- return nil
- })
- if err != nil {
- s.m.OnTransactionFailed()
- }
- return err
-}
-
-var (
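-	// ErrNothingToDo is returned by Work (and should be returned by its retrieval
-	// function) when no machine is currently eligible for work.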
- ErrNothingToDo = errors.New("nothing to do")
-	// postgresUniqueViolation is the error code reported by the lib/pq driver
-	// when a mutation cannot be performed due to a UNIQUE constraint being
-	// violated as a result of the query.
- postgresUniqueViolation = pq.ErrorCode("23505")
-)
-
-// Work starts work on a machine. Full work execution is performed in three
-// phases:
-//
-// 1. Retrieval phase. This is performed by 'fn' given to this function.
-// The retrieval function must return zero or more machines that some work
-// should be performed on per the BMDB. The first returned machine will be
-// locked for work under the given process and made available in the Work
-// structure returned by this call. The function may be called multiple times,
-// as it's run within a CockroachDB transaction which may be retried an
-// arbitrary number of times. Thus, it should be side-effect free, ideally only
-// performing read queries to the database.
-// 2. Work phase. This is performed by user code while holding on to the Work
-// structure instance.
-// 3. Commit phase. This is performed by the function passed to Work.Finish. See
-// that method's documentation for more details.
-//
-// Important: after retrieving Work successfully, either Finish or Cancel must be
-// called, otherwise the machine will be locked until the parent session expires
-// or is closed! It's safe and recommended to `defer work.Cancel(ctx)` once
-// Work() returns successfully.
-//
-// If no machine is eligible for work, the retrieval function should return
-// ErrNothingToDo, and Work will return that error (wrapped). If the retrieval
-// function returns no machines and no error, Work returns ErrNothingToDo as
-// well.
-//
-// The returned Work object is _not_ goroutine safe.
-func (s *Session) Work(ctx context.Context, process model.Process, fn func(q *model.Queries) ([]uuid.UUID, error)) (*Work, error) {
- var mid *uuid.UUID
-	var existing *existingBackoff
- err := s.Transact(ctx, func(q *model.Queries) error {
- mids, err := fn(q)
- if err != nil {
- return fmt.Errorf("could not retrieve machines for work: %w", err)
- }
- if len(mids) < 1 {
- return ErrNothingToDo
- }
- mid = &mids[0]
- err = q.StartWork(ctx, model.StartWorkParams{
- MachineID: mids[0],
- SessionID: s.UUID,
- Process: process,
- })
- if err != nil {
- var perr *pq.Error
- if errors.As(err, &perr) && perr.Code == postgresUniqueViolation {
- return ErrWorkConflict
- }
- return fmt.Errorf("could not start work on %q: %w", mids[0], err)
- }
- err = q.WorkHistoryInsert(ctx, model.WorkHistoryInsertParams{
- MachineID: mids[0],
- Event: model.WorkHistoryEventStarted,
- Process: process,
- })
- if err != nil {
- return fmt.Errorf("could not insert history event: %w", err)
- }
- backoffs, err := q.WorkBackoffOf(ctx, model.WorkBackoffOfParams{
- MachineID: mids[0],
- Process: process,
- })
- if err != nil {
- return fmt.Errorf("could not get backoffs: %w", err)
- }
- if len(backoffs) > 0 {
- // If the backoff exists but the last interval is null (e.g. is from a previous
- // version of the schema when backoffs had no interval data) pretend it doesn't
- // exist. Then the backoff mechanism can restart from a clean slate and populate
- // a new, full backoff row.
- if backoff := backoffs[0]; backoff.LastIntervalSeconds.Valid {
- klog.Infof("Existing backoff: %d seconds", backoff.LastIntervalSeconds.Int64)
-				existing = &existingBackoff{
- lastInterval: time.Second * time.Duration(backoff.LastIntervalSeconds.Int64),
- }
- }
- }
- return nil
- })
- if err != nil {
- return nil, err
- }
- w := &Work{
- Machine: *mid,
- s: s,
- process: process,
-		backoff: existing,
- m: s.m.WithProcess(process),
- }
- w.m.OnWorkStarted()
- klog.Infof("Started work %q on machine %q (sess %q)", process, *mid, s.UUID)
- return w, nil
-}
-
-// existingBackoff contains backoff information retrieved from a work item that
-// has previously failed with a backoff.
-type existingBackoff struct {
- // lastInterval is the last interval as stored in the backoff table.
- lastInterval time.Duration
-}
-
-// Backoff describes the configuration of backoff for a failed work item. It can
-// be passed to Work.Fail to cause an item to not be processed again (to be 'in
-// backoff') for a given period of time. Exponential backoff can be configured so
-// that subsequent failures of a process will have exponentially increasing
-// backoff periods, up to some maximum length.
-//
-// The underlying unit of backoff period length in the database is one second.
-// What that means is that all effective calculated backoff periods must be an
-// integer number of seconds. This is performed by always rounding up this period
-// to the nearest second. A side effect of this is that with exponential backoff,
-// non-integer exponents will be less precisely applied for small backoff values,
-// e.g. an exponent of 1.1 with initial backoff of 1s will generate the following
-// sequence of backoff periods:
-//
-// 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17
-//
-// Which corresponds to the following approximate multipliers in between periods:
-//
-// 2.00, 1.50, 1.33, 1.25, 1.20, 1.17, 1.14, 1.12, 1.11, 1.10, 1.18, 1.15, 1.13
-//
-// Thus, the exponent value should be treated more as a limit that the sequence
-// of periods will approach than a hard rule for calculating the periods.
-// However, if the exponent is larger than 1 (i.e. any time exponential backoff
-// is requested), this guarantees that the backoff won't get 'stuck' on a
-// repeated period value due to a rounding error.
-//
-// A zero backoff structure is valid and represents a non-exponential backoff of
-// one second.
-//
-// A partially filled structure is also valid. See the field comments for more
-// information about how fields are capped if not set. The described behaviour
-// allows for two useful shorthands:
-//
-// 1. If only Initial is set, then the backoff is non-exponential and will always
-// be of value Initial, regardless of what was previously persisted in the
-// database.
-// 2. If only Maximum and Exponent are set, the backoff will be exponential,
-// starting at one second, and exponentially increasing to Maximum.
-//
-// It is recommended to construct Backoff structures as const values and treat
-// them as read-only 'descriptors', one per work kind / process.
-//
-// One feature currently missing from the Backoff implementation is jitter. This
-// might be introduced in the future if deemed necessary.
-type Backoff struct {
- // Initial backoff period, used for the backoff if this item failed for the first
- // time (i.e. has not had a Finish call in between two Fail calls).
- //
- // Subsequent calls will ignore this field if the backoff is exponential. If
- // non-exponential, the initial time will always override whatever was previously
- // persisted in the database, i.e. the backoff will always be of value 'Initial'.
- //
- // Cannot be lower than one second. If it is, it will be capped to it.
- Initial time.Duration `u:"initial"`
-
- // Maximum time for backoff. If the calculation of the next back off period
- // (based on the Exponent and last backoff value) exceeds this maximum, it will
- // be capped to it.
- //
- // Maximum is not persisted in the database. Instead, it is always read from this
- // structure.
- //
- // Cannot be lower than Initial. If it is, it will be capped to it.
- Maximum time.Duration `u:"maximum"`
-
- // Exponent used for next backoff calculation. Any time a work item fails
- // directly after another failure, the previous backoff period will be multiplied
- // by the exponent to yield the new backoff period. The new period will then be
- // capped to Maximum.
- //
- // Exponent is not persisted in the database. Instead, it is always read from
- // this structure.
- //
- // Cannot be lower than 1.0. If it is, it will be capped to it.
- Exponent float64 `u:"exponent"`
-}
-
-// normalized copies the given backoff and returns a 'normalized' version of it,
-// with the 'when zero/unset' rules described in the Backoff documentation
-// strings.
-func (b *Backoff) normalized() *Backoff {
- c := *b
-
- if c.Exponent < 1.0 {
- c.Exponent = 1.0
- }
- if c.Initial < time.Second {
- c.Initial = time.Second
- }
- if c.Maximum < c.Initial {
- c.Maximum = c.Initial
- }
- return &c
-}
-
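-// simple returns whether this backoff is non-exponential, i.e. its period does
-// not grow between subsequent failures.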
-func (b *Backoff) simple() bool {
- // Non-normalized simple backoffs will have a zero exponent.
- if b.Exponent == 0.0 {
- return true
- }
- // Normalized simple backoffs will have a 1.0 exponent.
- if b.Exponent == 1.0 {
- return true
- }
- return false
-}
-
-// next calculates the next backoff period, in seconds, based on a backoff
-// descriptor and previously persisted backoff information. Either (or both) can
-// be nil.
- second := time.Second.Nanoseconds()
-
- // Minimum interval is one second. Start with that.
- last := second
- // Then, if we have a previous interval, and it's greater than a second, use that
- // as the last interval.
- if e != nil {
- if previous := e.lastInterval.Nanoseconds(); previous > second {
- last = previous
- }
- }
-
- // If no backoff is configured, go with either the minimum of one second, or
- // whatever the last previous interval was.
- if b == nil {
- return last / second
- }
-
- // Make a copy of the backoff descriptor, normalizing as necessary.
- c := b.normalized()
-
- // Simple backoffs always return Initial.
- if b.simple() {
- return c.Initial.Nanoseconds() / second
- }
-
- // If there is no existing backoff, return the initial backoff value directly.
- if e == nil {
- return c.Initial.Nanoseconds() / second
- }
-
- // Start out with the persisted interval.
- next := last
- // If by any chance we persisted an interval less than one second, clamp it.
- if next < second {
- next = second
- }
-
- // Multiply by exponent from descriptor.
- next = int64(float64(next) * c.Exponent)
-
- // Handle overflows. If multiplying by a positive number resulted in a lower
- // value than what we started with, it means we overflowed and wrapped around. If
- // so, clamp to maximum.
- if next < last {
- next = c.Maximum.Nanoseconds()
- }
-
- // Clamp to maximum.
- if next > c.Maximum.Nanoseconds() {
- next = c.Maximum.Nanoseconds()
- }
- // Round up to the nearest second.
- if next%second == 0 {
- return next / second
- } else {
- return next/second + 1
- }
-}
-
-// Work being performed on a machine.
-type Work struct {
- // Machine that this work is being performed on, as retrieved by the retrieval
- // function passed to the Work method.
- Machine uuid.UUID
- // s is the parent session.
- s *Session
- // done marks that this work has already been canceled or finished.
- done bool
- // process that this work performs.
- process model.Process
-
- backoff *existingBackoff
-
- m *metrics.ProcessRecorder
-}
-
-// Cancel the Work started on a machine. If the work has already been finished
-// or canceled, this is a no-op. In case of error, a log line will be emitted.
-func (w *Work) Cancel(ctx context.Context) {
- if w.done {
- return
- }
- w.done = true
- w.m.OnWorkFinished(metrics.WorkResultCanceled)
-
- klog.Infof("Canceling work %q on machine %q (sess %q)", w.process, w.Machine, w.s.UUID)
- // Eat error and log. There's nothing we can do if this fails, and if it does, it's
- // probably because our connectivity to the BMDB has failed. If so, our session
- // will be invalidated soon and so will the work being performed on this
- // machine.
- err := w.s.Transact(ctx, func(q *model.Queries) error {
- err := q.FinishWork(ctx, model.FinishWorkParams{
- MachineID: w.Machine,
- SessionID: w.s.UUID,
- Process: w.process,
- })
- if err != nil {
- return err
- }
- return q.WorkHistoryInsert(ctx, model.WorkHistoryInsertParams{
- MachineID: w.Machine,
- Process: w.process,
- Event: model.WorkHistoryEventCanceled,
- })
- })
- if err != nil {
- klog.Errorf("Failed to cancel work %q on %q (sess %q): %v", w.process, w.Machine, w.s.UUID, err)
- }
-}
-
-// Finish work by executing a commit function 'fn' and releasing the machine
-// from the work performed. The function given should apply tags to the
-// processed machine in a way that causes it to not be eligible for retrieval
-// again. As with the retriever function, the commit function might be called an
-// arbitrary number of times as part of CockroachDB transaction retries.
-//
-// This may be called only once.
-func (w *Work) Finish(ctx context.Context, fn func(q *model.Queries) error) error {
- if w.done {
- return fmt.Errorf("already finished")
- }
- w.done = true
- w.m.OnWorkFinished(metrics.WorkResultFinished)
-
- klog.Infof("Finishing work %q on machine %q (sess %q)", w.process, w.Machine, w.s.UUID)
- return w.s.Transact(ctx, func(q *model.Queries) error {
- err := q.FinishWork(ctx, model.FinishWorkParams{
- MachineID: w.Machine,
- SessionID: w.s.UUID,
- Process: w.process,
- })
- if err != nil {
- return err
- }
- err = q.WorkBackoffDelete(ctx, model.WorkBackoffDeleteParams{
- MachineID: w.Machine,
- Process: w.process,
- })
- if err != nil {
- return err
- }
- err = q.WorkHistoryInsert(ctx, model.WorkHistoryInsertParams{
- MachineID: w.Machine,
- Process: w.process,
- Event: model.WorkHistoryEventFinished,
- })
- if err != nil {
- return err
- }
- return fn(q)
- })
-}
-
-// Fail work and introduce backoff. The given cause is an operator-readable
-// string that will be persisted alongside the backoff and in the work
-// history/audit table.
-//
-// The backoff describes a period during which the same process will not be
-// retried on this machine until its expiration.
-//
-// The given backoff is a structure which describes both the initial backoff
-// period if the work failed for the first time, and a mechanism to exponentially
-// increase the backoff period if that work failed repeatedly. The work is
-// defined to have failed repeatedly if it only resulted in Cancel/Fail calls
-// without any Finish calls in the meantime.
-//
-// Only the last backoff period is persisted in the database. The exponential
-// backoff behaviour (including its maximum time) is always calculated based on
-// the given backoff structure.
-//
-// If nil, the backoff defaults to a non-exponential, one-second backoff. This is
-// the minimum needed to keep the system chugging along without repeatedly
-// retrying a failed job in a tight loop. However, the backoff should generally
-// be set to some well-engineered value to prevent spurious retries.
-func (w *Work) Fail(ctx context.Context, backoff *Backoff, cause string) error {
- if w.done {
- return fmt.Errorf("already finished")
- }
- w.done = true
- w.m.OnWorkFinished(metrics.WorkResultFailed)
-
- return w.s.Transact(ctx, func(q *model.Queries) error {
- err := q.FinishWork(ctx, model.FinishWorkParams{
- MachineID: w.Machine,
- SessionID: w.s.UUID,
- Process: w.process,
- })
- if err != nil {
- return err
- }
- err = q.WorkHistoryInsert(ctx, model.WorkHistoryInsertParams{
- MachineID: w.Machine,
- Process: w.process,
- Event: model.WorkHistoryEventFailed,
- FailedCause: sql.NullString{
- String: cause,
- Valid: true,
- },
- })
- if err != nil {
- return err
- }
- if backoff == nil {
- klog.Warningf("Nil backoff for %q on machine %q: defaulting to one second non-exponential.", w.process, w.Machine)
- }
- seconds := backoff.next(w.backoff)
- klog.Infof("Adding backoff for %q on machine %q: %d seconds", w.process, w.Machine, seconds)
- return q.WorkBackoffInsert(ctx, model.WorkBackoffInsertParams{
- MachineID: w.Machine,
- Process: w.process,
- Cause: cause,
- Seconds: seconds,
- })
- })
-}
diff --git a/cloud/bmaas/bmdb/sessions_test.go b/cloud/bmaas/bmdb/sessions_test.go
deleted file mode 100644
index 8839c17..0000000
--- a/cloud/bmaas/bmdb/sessions_test.go
+++ /dev/null
@@ -1,643 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package bmdb
-
-import (
- "context"
- "errors"
- "fmt"
- "testing"
- "time"
-
- "github.com/google/uuid"
-
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/cloud/lib/component"
-)
-
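-// dut returns the BMDB instance under test, backed by an in-memory CockroachDB.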
-func dut() *BMDB {
- return &BMDB{
- Config: Config{
- Database: component.CockroachConfig{
- InMemory: true,
- },
- },
- }
-}
-
-// TestSessionExpiry exercises the session heartbeat logic, making sure that if
-// a session stops being maintained, subsequent Transact calls will fail.
-func TestSessionExpiry(t *testing.T) {
- b := dut()
- conn, err := b.Open(true)
- if err != nil {
- t.Fatalf("Open failed: %v", err)
- }
-
- ctx, ctxC := context.WithCancel(context.Background())
- defer ctxC()
-
- session, err := conn.StartSession(ctx)
- if err != nil {
- t.Fatalf("Starting session failed: %v", err)
- }
-
- // A transaction in a brand-new session should work.
- var machine model.Machine
- err = session.Transact(ctx, func(q *model.Queries) error {
- machine, err = q.NewMachine(ctx)
- return err
- })
- if err != nil {
- t.Fatalf("First transaction failed: %v", err)
- }
-
- time.Sleep(6 * time.Second)
-
- // A transaction after the 5-second session interval should continue to work.
- err = session.Transact(ctx, func(q *model.Queries) error {
- _, err = q.NewMachine(ctx)
- return err
- })
- if err != nil {
- t.Fatalf("Second transaction failed: %v", err)
- }
-
- // A transaction after the 5-second session interval should fail if we don't
- // maintain its heartbeat.
- session.ctxC()
- time.Sleep(6 * time.Second)
-
- err = session.Transact(ctx, func(q *model.Queries) error {
- return q.MachineAddProvided(ctx, model.MachineAddProvidedParams{
- MachineID: machine.MachineID,
- Provider: "foo",
- ProviderID: "bar",
- })
- })
- if !errors.Is(err, ErrSessionExpired) {
- t.Fatalf("Second transaction should've failed due to expired session, got %v", err)
- }
-
-}
-
-// TestWork exercises the per-{process,machine} mutual exclusion mechanism of
-// Work items.
-func TestWork(t *testing.T) {
- b := dut()
- conn, err := b.Open(true)
- if err != nil {
- t.Fatalf("Open failed: %v", err)
- }
-
- ctx, ctxC := context.WithCancel(context.Background())
- defer ctxC()
-
-	// Start two sessions for testing.
- session1, err := conn.StartSession(ctx)
- if err != nil {
- t.Fatalf("Starting session failed: %v", err)
- }
- session2, err := conn.StartSession(ctx)
- if err != nil {
- t.Fatalf("Starting session failed: %v", err)
- }
-
- var machine model.Machine
- err = session1.Transact(ctx, func(q *model.Queries) error {
- machine, err = q.NewMachine(ctx)
- return err
- })
- if err != nil {
- t.Fatalf("Creating machine failed: %v", err)
- }
-
- // Create a subcontext for a long-term work request. We'll cancel it later as
- // part of the test.
- ctxB, ctxBC := context.WithCancel(ctx)
- defer ctxBC()
-
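-	// constantRetriever always selects the single test machine created above.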
- constantRetriever := func(_ *model.Queries) ([]uuid.UUID, error) {
- return []uuid.UUID{machine.MachineID}, nil
- }
-
-	// Start work on the machine, which we won't finish for a while.
- work1, err := session1.Work(ctxB, model.ProcessUnitTest1, constantRetriever)
- if err != nil {
- t.Fatalf("Starting first work failed: %v", err)
- }
-
- // Starting more work on the same machine but a different process should still
- // be allowed.
- for _, session := range []*Session{session1, session2} {
- work2, err := session.Work(ctxB, model.ProcessUnitTest2, constantRetriever)
- if err != nil {
- t.Errorf("Could not run concurrent process on machine: %v", err)
- } else {
- work2.Cancel(ctxB)
- }
- }
-
- // However, starting work with the same process on the same machine should
- // fail.
- for _, session := range []*Session{session1, session2} {
- work2, err := session.Work(ctxB, model.ProcessUnitTest1, constantRetriever)
- if !errors.Is(err, ErrWorkConflict) {
- t.Errorf("Concurrent work with same process should've been forbidden, got %v", err)
- work2.Cancel(ctxB)
- }
- }
-
-	// Now, cancel the long-running work.
- work1.Cancel(ctx)
-
- // We should now be able to perform 'test1' work again against this machine.
- for _, session := range []*Session{session1, session2} {
- work1, err := session.Work(ctxB, model.ProcessUnitTest1, constantRetriever)
- if err != nil {
- t.Errorf("Could not run work against machine: %v", err)
- } else {
- work1.Cancel(ctxB)
- }
- }
-}
-
-// TestWorkBackoff exercises the backoff functionality within the BMDB.
-func TestWorkBackoff(t *testing.T) {
- b := dut()
- conn, err := b.Open(true)
- if err != nil {
- t.Fatalf("Open failed: %v", err)
- }
-
- ctx, ctxC := context.WithCancel(context.Background())
- defer ctxC()
-
- session, err := conn.StartSession(ctx)
- if err != nil {
- t.Fatalf("Starting session failed: %v", err)
- }
-
- var machine model.Machine
- // Create machine.
- err = session.Transact(ctx, func(q *model.Queries) error {
- machine, err = q.NewMachine(ctx)
- if err != nil {
- return err
- }
- return q.MachineAddProvided(ctx, model.MachineAddProvidedParams{
- MachineID: machine.MachineID,
- Provider: model.ProviderEquinix,
- ProviderID: "123",
- })
- })
- if err != nil {
- t.Fatalf("Creating machine failed: %v", err)
- }
-
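-	// waitMachine polls until the test machine becomes eligible for agent start
-	// work again, failing the test if that takes longer than roughly 2*nsec seconds.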
- waitMachine := func(nsec int64) *Work {
- t.Helper()
-
- deadline := time.Now().Add(time.Duration(nsec) * 2 * time.Second)
- for {
- if time.Now().After(deadline) {
- t.Fatalf("Deadline expired")
- }
- work, err := session.Work(ctx, model.ProcessShepherdAgentStart, func(q *model.Queries) ([]uuid.UUID, error) {
- machines, err := q.GetMachinesForAgentStart(ctx, model.GetMachinesForAgentStartParams{
- Limit: 1,
- Provider: model.ProviderEquinix,
- })
- if err != nil {
- return nil, err
- }
- if len(machines) < 1 {
- return nil, ErrNothingToDo
- }
- return []uuid.UUID{machines[0].MachineID}, nil
- })
- if err == nil {
- return work
- }
- if !errors.Is(err, ErrNothingToDo) {
- t.Fatalf("Unexpected work error: %v", err)
- }
- time.Sleep(100 * time.Millisecond)
- }
- }
-
- // Work on machine, but fail it with a backoff.
- work := waitMachine(1)
- backoff := Backoff{
- Initial: time.Second,
- Maximum: 5 * time.Second,
- Exponent: 2,
- }
- if err := work.Fail(ctx, &backoff, "test"); err != nil {
- t.Fatalf("Failing work failed: %v", err)
- }
-
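-	// expect asserts that exactly count machines are currently eligible for agent
-	// start.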
- expect := func(count int) {
- t.Helper()
-
- var machines []model.MachineProvided
- var err error
- err = session.Transact(ctx, func(q *model.Queries) error {
- machines, err = q.GetMachinesForAgentStart(ctx, model.GetMachinesForAgentStartParams{
- Limit: 1,
- Provider: model.ProviderEquinix,
- })
- if err != nil {
- return err
- }
- return nil
- })
- if err != nil {
- t.Errorf("Failed to retrieve machines for agent start: %v", err)
- }
- if want, got := count, len(machines); want != got {
- t.Errorf("Expected %d machines, got %d", want, got)
- }
- }
-
- // The machine shouldn't be returned now.
- expect(0)
-
- // Wait for the backoff to expire.
- time.Sleep(1100 * time.Millisecond)
-
- // The machine should now be returned again.
- expect(1)
-
- // Prepare helper for checking exponential backoffs.
- failAndCheck := func(nsec int64) {
- t.Helper()
- work := waitMachine(nsec)
- if err := work.Fail(ctx, &backoff, "test"); err != nil {
- t.Fatalf("Failing work failed: %v", err)
- }
-
- var backoffs []model.WorkBackoff
- err = session.Transact(ctx, func(q *model.Queries) error {
- var err error
- backoffs, err = q.WorkBackoffOf(ctx, model.WorkBackoffOfParams{
- MachineID: machine.MachineID,
- Process: model.ProcessShepherdAgentStart,
- })
- return err
- })
- if err != nil {
- t.Errorf("Failed to retrieve machines for agent start: %v", err)
- }
- if len(backoffs) < 1 {
- t.Errorf("No backoff")
- } else {
- backoff := backoffs[0]
- if want, got := nsec, backoff.LastIntervalSeconds.Int64; want != got {
- t.Fatalf("Wanted backoff of %d seconds, got %d", want, got)
- }
- }
- }
-
- // Exercise exponential backoff functionality.
- failAndCheck(2)
- failAndCheck(4)
- failAndCheck(5)
- failAndCheck(5)
-
- // If the job now succeeds, subsequent failures should start from 1 again.
- work = waitMachine(5)
- err = work.Finish(ctx, func(q *model.Queries) error {
- // Not setting any tags that would cause subsequent queries to not return the
- // machine anymore.
- return nil
- })
- if err != nil {
- t.Fatalf("Could not finish work: %v", err)
- }
-
- failAndCheck(1)
- failAndCheck(2)
-}
-
-// TestAgentStartWorkflow exercises the agent start workflow within the BMDB.
-func TestAgentStartWorkflow(t *testing.T) {
- b := dut()
- conn, err := b.Open(true)
- if err != nil {
- t.Fatalf("Open failed: %v", err)
- }
-
- ctx, ctxC := context.WithCancel(context.Background())
- defer ctxC()
-
- session, err := conn.StartSession(ctx)
- if err != nil {
- t.Fatalf("Starting session failed: %v", err)
- }
-
-	// Create a machine; its ID is not needed here.
- err = session.Transact(ctx, func(q *model.Queries) error {
- machine, err := q.NewMachine(ctx)
- if err != nil {
- return err
- }
- return q.MachineAddProvided(ctx, model.MachineAddProvidedParams{
- MachineID: machine.MachineID,
- Provider: model.ProviderEquinix,
- ProviderID: "123",
- })
- })
- if err != nil {
- t.Fatalf("Creating machine failed: %v", err)
- }
-
- // Start working on a machine.
- var machine uuid.UUID
- startedC := make(chan struct{})
- doneC := make(chan struct{})
- errC := make(chan error)
- go func() {
- work, err := session.Work(ctx, model.ProcessShepherdAgentStart, func(q *model.Queries) ([]uuid.UUID, error) {
- machines, err := q.GetMachinesForAgentStart(ctx, model.GetMachinesForAgentStartParams{
- Limit: 1,
- Provider: model.ProviderEquinix,
- })
- if err != nil {
- return nil, err
- }
- if len(machines) < 1 {
- return nil, ErrNothingToDo
- }
- machine = machines[0].MachineID
- return []uuid.UUID{machines[0].MachineID}, nil
- })
-		if err != nil {
-			close(startedC)
-			errC <- err
-			return
-		}
-		// Only defer the cancel once we know the work item exists.
-		defer work.Cancel(ctx)
-
- // Simulate work by blocking on a channel.
- close(startedC)
-
- <-doneC
-
- err = work.Finish(ctx, func(q *model.Queries) error {
- return q.MachineSetAgentStarted(ctx, model.MachineSetAgentStartedParams{
- MachineID: work.Machine,
- AgentStartedAt: time.Now(),
- AgentPublicKey: []byte("fakefakefake"),
- })
- })
- errC <- err
- }()
- <-startedC
-
- // Work on the machine has started. Attempting to get more machines now should
- // return no machines.
-
- // Mutual exclusion with AgentStart:
- err = session.Transact(ctx, func(q *model.Queries) error {
- machines, err := q.GetMachinesForAgentStart(ctx, model.GetMachinesForAgentStartParams{
- Limit: 1,
- Provider: model.ProviderEquinix,
- })
- if err != nil {
- return err
- }
- if len(machines) > 0 {
- t.Errorf("Expected no machines ready for agent start.")
- }
- return nil
- })
- if err != nil {
- t.Fatalf("Failed to retrieve machines for start in parallel: %v", err)
- }
-
- // Mutual exclusion with Recovery:
- err = session.Transact(ctx, func(q *model.Queries) error {
- machines, err := q.GetMachineForAgentRecovery(ctx, model.GetMachineForAgentRecoveryParams{
- Limit: 1,
- Provider: model.ProviderEquinix,
- })
- if err != nil {
- return err
- }
- if len(machines) > 0 {
- t.Errorf("Expected no machines ready for agent recovery.")
- }
- return nil
- })
- if err != nil {
- t.Fatalf("Failed to retrieve machines for recovery in parallel: %v", err)
- }
-
- // Finish working on machine.
- close(doneC)
- err = <-errC
- if err != nil {
- t.Fatalf("Failed to finish work on machine: %v", err)
- }
-	// That machine now has its agent started, so we still expect no machines to be
-	// eligible for agent start.
- err = session.Transact(ctx, func(q *model.Queries) error {
- machines, err := q.GetMachinesForAgentStart(ctx, model.GetMachinesForAgentStartParams{
- Limit: 1,
- Provider: model.ProviderEquinix,
- })
- if err != nil {
- return err
- }
- if len(machines) > 0 {
- t.Errorf("Expected still no machines ready for agent start.")
- }
- return nil
- })
- if err != nil {
- t.Fatalf("Failed to retrieve machines for agent start after work finished: %v", err)
- }
-
- // Check history has been recorded.
- var history []model.WorkHistory
- err = session.Transact(ctx, func(q *model.Queries) error {
- history, err = q.ListHistoryOf(ctx, machine)
- return err
- })
- if err != nil {
- t.Fatalf("Failed to retrieve machine history: %v", err)
- }
- // Expect two history items: started and finished.
- if want, got := 2, len(history); want != got {
- t.Errorf("Wanted %d history items, got %d", want, got)
- } else {
- if want, got := model.WorkHistoryEventStarted, history[0].Event; want != got {
- t.Errorf("Wanted first history event to be %s, got %s", want, got)
- }
- if want, got := model.WorkHistoryEventFinished, history[1].Event; want != got {
- t.Errorf("Wanted second history event to be %s, got %s", want, got)
- }
- }
- // Check all other history event data.
- for i, el := range history {
- if want, got := machine, el.MachineID; want.String() != got.String() {
- t.Errorf("Wanted %d history event machine ID to be %s, got %s", i, want, got)
- }
- if want, got := model.ProcessShepherdAgentStart, el.Process; want != got {
- t.Errorf("Wanted %d history event process to be %s, got %s", i, want, got)
- }
- }
-}
-
-// TestAgentStartWorkflowParallel starts work on six machines by three workers
-// and makes sure that there are no scheduling conflicts between them.
-func TestAgentStartWorkflowParallel(t *testing.T) {
- b := dut()
- conn, err := b.Open(true)
- if err != nil {
- t.Fatalf("Open failed: %v", err)
- }
-
- ctx, ctxC := context.WithCancel(context.Background())
- defer ctxC()
-
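-	// makeMachine creates a single Equinix-provided test machine with the given
-	// provider ID, using a short-lived session.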
- makeMachine := func(providerID string) {
- ctxS, ctxC := context.WithCancel(ctx)
- defer ctxC()
- session, err := conn.StartSession(ctxS)
- if err != nil {
- t.Fatalf("Starting session failed: %v", err)
- }
- err = session.Transact(ctx, func(q *model.Queries) error {
- machine, err := q.NewMachine(ctx)
- if err != nil {
- return err
- }
- return q.MachineAddProvided(ctx, model.MachineAddProvidedParams{
- MachineID: machine.MachineID,
- Provider: model.ProviderEquinix,
- ProviderID: providerID,
- })
- })
- if err != nil {
- t.Fatalf("Creating machine failed: %v", err)
- }
- }
- // Make six machines for testing.
- for i := 0; i < 6; i++ {
- makeMachine(fmt.Sprintf("test%d", i))
- }
-
- workStarted := make(chan struct{})
- workDone := make(chan struct {
- machine uuid.UUID
- workerID int
- })
-
- workOnce := func(ctx context.Context, workerID int, session *Session) error {
- work, err := session.Work(ctx, model.ProcessShepherdAgentStart, func(q *model.Queries) ([]uuid.UUID, error) {
- machines, err := q.GetMachinesForAgentStart(ctx, model.GetMachinesForAgentStartParams{
- Limit: 1,
- Provider: model.ProviderEquinix,
- })
- if err != nil {
- return nil, err
- }
- if len(machines) < 1 {
- return nil, ErrNothingToDo
- }
- return []uuid.UUID{machines[0].MachineID}, nil
- })
-
- if err != nil {
- return err
- }
- defer work.Cancel(ctx)
-
- select {
- case <-workStarted:
- case <-ctx.Done():
- return ctx.Err()
- }
-
- select {
- case workDone <- struct {
- machine uuid.UUID
- workerID int
- }{
- machine: work.Machine,
- workerID: workerID,
- }:
- case <-ctx.Done():
- return ctx.Err()
- }
-
- return work.Finish(ctx, func(q *model.Queries) error {
- return q.MachineSetAgentStarted(ctx, model.MachineSetAgentStartedParams{
- MachineID: work.Machine,
- AgentStartedAt: time.Now(),
- AgentPublicKey: []byte("fakefakefake"),
- })
- })
- }
-
- worker := func(workerID int) {
- ctxS, ctxSC := context.WithCancel(ctx)
- defer ctxSC()
- session, err := conn.StartSession(ctxS)
- if err != nil {
- t.Errorf("Starting session failed: %v", err)
- ctxC()
- return
- }
- for {
- err := workOnce(ctxS, workerID, session)
- if err != nil {
- if errors.Is(err, ErrNothingToDo) {
- continue
- }
- if errors.Is(err, ctxS.Err()) {
- return
- }
- t.Errorf("worker failed: %v", err)
- ctxC()
- return
- }
- }
- }
- // Start three workers.
- for i := 0; i < 3; i++ {
- go worker(i)
- }
-
- // Wait for at least three workers to be alive.
- for i := 0; i < 3; i++ {
- select {
- case workStarted <- struct{}{}:
- case <-ctx.Done():
- t.FailNow()
- }
- }
-
- // Allow all workers to continue running from now on.
- close(workStarted)
-
- // Expect six machines to have been handled in parallel by three workers.
- seenWorkers := make(map[int]bool)
- seenMachines := make(map[string]bool)
- for i := 0; i < 6; i++ {
- res := <-workDone
- seenWorkers[res.workerID] = true
- seenMachines[res.machine.String()] = true
- }
-
- if want, got := 3, len(seenWorkers); want != got {
- t.Errorf("Expected %d workers, got %d", want, got)
- }
- if want, got := 6, len(seenMachines); want != got {
- t.Errorf("Expected %d machines, got %d", want, got)
- }
-}
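For orientation, the claim-then-finish lifecycle exercised by the removed tests above condenses to the following sketch. It is illustrative only: it assumes an already started *Session in the bmdb package (with the same imports as the tests) and a caller-supplied placeholder public key.

// Condensed sketch of the work-claiming lifecycle used above (illustrative).
func startOneAgent(ctx context.Context, session *Session, pubkey []byte) error {
	// Claim at most one machine eligible for agent start. Returning
	// ErrNothingToDo signals that no machine is currently eligible.
	work, err := session.Work(ctx, model.ProcessShepherdAgentStart, func(q *model.Queries) ([]uuid.UUID, error) {
		machines, err := q.GetMachinesForAgentStart(ctx, model.GetMachinesForAgentStartParams{
			Limit:    1,
			Provider: model.ProviderEquinix,
		})
		if err != nil {
			return nil, err
		}
		if len(machines) < 1 {
			return nil, ErrNothingToDo
		}
		return []uuid.UUID{machines[0].MachineID}, nil
	})
	if err != nil {
		return err
	}
	// Cancel releases the claim if Finish is never reached.
	defer work.Cancel(ctx)

	// ... actually start the agent on work.Machine here ...

	// Finish records success; the machine then no longer shows up in
	// GetMachinesForAgentStart, as the test above verifies.
	return work.Finish(ctx, func(q *model.Queries) error {
		return q.MachineSetAgentStarted(ctx, model.MachineSetAgentStartedParams{
			MachineID:      work.Machine,
			AgentStartedAt: time.Now(),
			AgentPublicKey: pubkey,
		})
	})
}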
diff --git a/cloud/bmaas/bmdb/webug/BUILD.bazel b/cloud/bmaas/bmdb/webug/BUILD.bazel
deleted file mode 100644
index bbcaed4..0000000
--- a/cloud/bmaas/bmdb/webug/BUILD.bazel
+++ /dev/null
@@ -1,28 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
-
-go_library(
- name = "webug",
- srcs = [
- "functions.go",
- "views.go",
- "webug.go",
- ],
- embedsrcs = [
- "templates/base.gohtml",
- "templates/fragment_tag.gohtml",
- "templates/fragment_tag_default.gohtml",
- "templates/fragment_tag_provided.gohtml",
- "templates/machines.gohtml",
- "templates/machine.gohtml",
- ],
- importpath = "source.monogon.dev/cloud/bmaas/bmdb/webug",
- visibility = ["//visibility:public"],
- deps = [
- "//cloud/bmaas/bmdb",
- "//cloud/bmaas/bmdb/model",
- "//cloud/bmaas/bmdb/reflection",
- "@com_github_cenkalti_backoff_v4//:backoff",
- "@com_github_google_uuid//:uuid",
- "@io_k8s_klog_v2//:klog",
- ],
-)
diff --git a/cloud/bmaas/bmdb/webug/functions.go b/cloud/bmaas/bmdb/webug/functions.go
deleted file mode 100644
index 3c8f43b..0000000
--- a/cloud/bmaas/bmdb/webug/functions.go
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package webug
-
-import (
- "strings"
-)
-
-var (
- // templateFuncs are helper functions accessible to the rendered templates.
- templateFuncs = map[string]any{
- // summarizeError attempts to make a Go-style "foo: bar: baz" error short by
- // using some ugly heuristics. This is currently used to show a shorter error
- // message in the backoff column of the machine list.
- //
- // TODO(q3k): fix backoff causes to be less verbose and nuke this.
- "summarizeError": func(in string) string {
- parts := strings.Split(in, ": ")
- for i, p := range parts {
- // Attempt to strip some common error prefixes.
- if strings.HasPrefix(p, "failed to ") {
- continue
- }
- if strings.HasPrefix(p, "when ") {
- continue
- }
- if strings.HasPrefix(p, "while ") {
- continue
- }
-				// Once we reach a part without a known prefix, return it and
-				// everything after it, prefixed with an ellipsis marker.
- return "[...] " + strings.Join(parts[i:], ": ")
- }
- // If we stripped every single segment then just return the whole thing.
- return in
- },
- }
-)
diff --git a/cloud/bmaas/bmdb/webug/templates/base.gohtml b/cloud/bmaas/bmdb/webug/templates/base.gohtml
deleted file mode 100644
index fef098d..0000000
--- a/cloud/bmaas/bmdb/webug/templates/base.gohtml
+++ /dev/null
@@ -1,104 +0,0 @@
-<!DOCTYPE html>
-<meta charset="utf-8">
-<title>BMDB webug</title>
-<style>
- body {
- font-family: sans-serif;
- background: #fff;
- }
-
- /* Logotype. */
- h1 {
- clear: both;
- padding-left: 1em;
- padding-top: 0.5em;
- }
- h1 a {
- text-decoration: none;
- }
- h1 a, h1 a:visited, h1 a:hover, h1 a:active {
- color: inherit;
- }
- h1 span.red {
- background-color: red;
- color: white;
- padding: 0.1em;
- border-radius: 0.4em;
- }
- h1 span.info {
- font-size: 0.5em;
- font-weight: normal;
- font-style: italic;
- }
-
- /* Section headers. */
- h2 {
- clear: both;
- width: 100%;
- text-align: center;
- font-size: 120%;
- background: #eeeeff;
- }
-
- /* Stylish tables. */
- table, th, td {
- background-color: #eee;
- padding: 0.2em 0.4em 0.2em 0.4em;
- }
- table th {
- background-color: #c0c0c0;
- }
- table {
- background-color: #fff;
- border-spacing: 0.2em;
- }
-
- /* Colouring of the Work History log in machine.gohtml. */
- tr.EventFailed td,
- tr.EventCanceled td {
- background-color: #f8e8e8;
- }
- tr.EventFinished td {
- background-color: #e8f8e8;
- }
-
- /* Colouring of the Machine State in fragment_tag_provided.gohtml. */
- b.StatusMissing,
- b.StatusProvisioning,
- b.StatusProvisioningFailedPermanent,
- b.StatusStopped
- {
- color: red;
- }
- b.StatusRunning {
- color: green;
- }
-
- /* Generic font style tags for any element. */
- .small {
- font-size: 0.8em;
- }
- .faint {
- color: #333;
- }
- .mono {
- font-family: monospace;
- tab-size: 2;
- }
- .error {
- color: #f00;
- }
-
- /* For simple column style layouts. */
- .vsplit {
- display: flex;
- flex-direction: row;
- flex-wrap: nowrap;
- align-items: stretch;
- }
- .column {
- flex-grow: 1;
- padding: 0.5em;
- }
-</style>
-<h1><a href="/">we<span class="red">bug</span></a> <span class="info">BMDB at {{ .BMDBAddress }} schema {{ .BMDBSchema }}</span></h1>
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/webug/templates/fragment_tag.gohtml b/cloud/bmaas/bmdb/webug/templates/fragment_tag.gohtml
deleted file mode 100644
index 341cd78..0000000
--- a/cloud/bmaas/bmdb/webug/templates/fragment_tag.gohtml
+++ /dev/null
@@ -1,5 +0,0 @@
-{{- if eq .Type.Name "Provided" -}}
- {{- template "fragment_tag_provided.gohtml" . -}}
-{{- else -}}
- {{- template "fragment_tag_default.gohtml" . -}}
-{{- end -}}
diff --git a/cloud/bmaas/bmdb/webug/templates/fragment_tag_default.gohtml b/cloud/bmaas/bmdb/webug/templates/fragment_tag_default.gohtml
deleted file mode 100644
index 464d517..0000000
--- a/cloud/bmaas/bmdb/webug/templates/fragment_tag_default.gohtml
+++ /dev/null
@@ -1 +0,0 @@
-<b>{{ .Type.Name }}</b>(...)
diff --git a/cloud/bmaas/bmdb/webug/templates/fragment_tag_provided.gohtml b/cloud/bmaas/bmdb/webug/templates/fragment_tag_provided.gohtml
deleted file mode 100644
index 0a7da82..0000000
--- a/cloud/bmaas/bmdb/webug/templates/fragment_tag_provided.gohtml
+++ /dev/null
@@ -1,6 +0,0 @@
-{{- $provider := (.Field "provider").HumanValue }}
-{{- $pid := (.Field "provider_id").HumanValue }}
-{{- $location := (.Field "provider_location").HumanValue }}
-{{- $status := (.Field "provider_status").HumanValue }}
-{{- $address := (.Field "provider_ip_address").HumanValue }}
-<b class="Status{{ $status }}">{{ .Type.Name }}</b>({{- $provider }}, {{ $location }}, {{ $address }}, <a href="/provider/{{ $provider }}/{{ $pid }}" style="font-family: mono">{{ $pid }}</a>)
diff --git a/cloud/bmaas/bmdb/webug/templates/machine.gohtml b/cloud/bmaas/bmdb/webug/templates/machine.gohtml
deleted file mode 100644
index 73c8118..0000000
--- a/cloud/bmaas/bmdb/webug/templates/machine.gohtml
+++ /dev/null
@@ -1,176 +0,0 @@
-{{ template "base.gohtml" .Base }}
-<h2>Machine {{ .Machine.ID }}</h2>
-
-{{ $sessions := .Sessions }}
-
-<table>
- <tr>
- <td><b>Machine ID</b></td>
- <td class="mono">{{ .Machine.ID }}</td>
- </tr>
- <tr>
- <td><b>Created</b></td>
- <td>{{ .Machine.Created }}</td>
- </tr>
- <tr>
- <td><b>Active Backoffs</b></td>
- <td>{{ len .Machine.ActiveBackoffs }}</td>
- </tr>
- <tr>
- <td><b>Active Work</b></td>
- <td>{{ len .Machine.Work }}</td>
- </tr>
-</table>
-
-<div class="vsplit">
- <div class="column">
- <h2>Tags</h2>
- {{ range $name, $tag := .Machine.Tags }}
- <table>
- <tr>
- <th colspan="2">
- {{ template "fragment_tag.gohtml" $tag }}
- </th>
- </tr>
- {{ range $tag.Fields }}
- <tr>
- <td>
- <b>{{ .Type.NativeName }}:</b>
- </td>
- <td class="mono">
- <pre>{{ .HumanValue }}</pre>
- </td>
- </tr>
- {{ end }}
- </table>
- {{ else }}
- <i>No tags.</i>
- {{ end }}
- <h2>Work</h2>
- {{ range $name, $work := .Machine.Work }}
- <table>
- <tr>
- <th colspan="3">
- <b>{{ $work.Process }}</b>
- </th>
- </tr>
- <tr>
- <td><b>Process:</b></td>
- <td class="mono" colspan="2">
- {{ $work.Process }}
- </td>
- </tr>
- {{ $sessionOrErr := index $sessions $name }}
- {{ if ne $sessionOrErr.Error "" }}
- <tr>
- <td colspan="3" class="error">
- Could not retrieve session information: {{ $sessionOrErr.Error }}
- </td>
- </tr>
- {{ else }}
- {{ $session := $sessionOrErr.Session }}
- <tr>
- <td rowspan="5" style="vertical-align: top;"><b>Session</b></td>
- <td><b>ID:</b></td>
- <td class="mono" colspan="2">
- <a href="/session/{{ $session.SessionID }}">{{ $session.SessionID }}</a>
- </td>
- </tr>
- <tr>
- <td><b>Component:</b></td>
- <td class="mono">{{ $session.SessionComponentName }}</td>
- </tr>
- <tr>
- <td><b>Runtime:</b></td>
- <td class="mono">{{ $session.SessionRuntimeInfo }}</td>
- </tr>
- <tr>
- <td><b>Created At:</b></td>
- <td>{{ $session.SessionCreatedAt }}</td>
- </tr>
- <tr>
- <td><b>Liveness:</b></td>
- <td>Interval {{ $session.SessionIntervalSeconds }}s, deadline {{ $session.SessionDeadline }}</td>
- </tr>
- {{ end }}
- </table>
- {{ else }}
- <i>No active work.</i>
- {{ end }}
- <h2>Backoffs</h2>
- <h3>Active</h3>
- {{ range $name, $backoff := .Machine.ActiveBackoffs }}
- <table>
- <tr>
- <th colspan="2">
- <b>{{ $backoff.Process }}</b>
- </th>
- </tr>
- <tr>
- <td><b>Process:</b></td>
- <td class="mono">{{ $backoff.Process }}</td>
- </tr>
- <tr>
- <td><b>Until:</b></td>
- <td class="mono">{{ $backoff.Until }}</td>
- </tr>
- <tr>
- <td><b>Cause:</b></td>
- <td class="mono">{{ $backoff.Cause }}</td>
- </tr>
- </table>
- {{ else }}
- <i>No active backoffs.</i>
- {{ end }}
- <h3>Expired</h3>
- {{ range $name, $backoff := .Machine.ExpiredBackoffs }}
- <table style="opacity: 0.4">
- <tr>
- <th colspan="2">
- <b>{{ $backoff.Process }}</b>
- </th>
- </tr>
- <tr>
- <td><b>Process:</b></td>
- <td class="mono">{{ $backoff.Process }}</td>
- </tr>
- <tr>
- <td><b>Until:</b></td>
- <td class="mono">{{ $backoff.Until }}</td>
- </tr>
- <tr>
- <td><b>Cause:</b></td>
- <td class="mono">{{ $backoff.Cause }}</td>
- </tr>
- </table>
- {{ else }}
- <i>No expired backoffs.</i>
- {{ end }}
- </div>
- <div class="column">
- <h2>Work History</h2>
- {{ if ne .HistoryError "" }}
- <b class="error">Unavailable: {{ .HistoryError }}</b>
- {{ else }}
- <i>Note: reverse chronological order.</i>
- <table>
- <tr>
- <th>Time</th>
- <th>Process</th>
- <th>Event</th>
- </tr>
- {{ range .History }}
- <tr class="Event{{.Event}}">
- <td>{{ .Timestamp }}</td>
- <td><b>{{ .Process }}</b></td>
- {{ if eq .Event "Failed" }}
- <td>{{ .Event }}: <span class="mono">{{ .FailedCause.String }}</span></td>
- {{ else }}
- <td>{{ .Event }}</td>
- {{ end }}
- </tr>
- {{ end }}
- </table>
- {{ end }}
- </div>
-</div>
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/webug/templates/machines.gohtml b/cloud/bmaas/bmdb/webug/templates/machines.gohtml
deleted file mode 100644
index b31f9b6..0000000
--- a/cloud/bmaas/bmdb/webug/templates/machines.gohtml
+++ /dev/null
@@ -1,48 +0,0 @@
-{{ template "base.gohtml" .Base }}
-<h2>Machine List</h2>
-<p>
- Click on a Machine ID to explore it further. Commonly viewed tags are expanded.
-</p>
-<table>
- <tr>
- <th>Machine ID</th>
- <th>Work</th>
- <th>Backoffs</th>
- <th>Tags</th>
- </tr>
- {{ range .Machines -}}
- <tr>
- <td class="mono"><a href="/machine/{{ .ID }}">{{ .ID }}</a></td>
- <td>
- {{- range $process, $work := .Work -}}
- <b><a href="/session/{{ $work.SessionID }}">{{ $process }}</a></b>
- {{- end -}}
- </td>
- <td>
- {{- range $process, $backoff := .Backoffs -}}
- <b>{{ $backoff.Process }}</b>(<span class="small">{{ summarizeError .Cause }}</span>)
- {{- end -}}
- </td>
- <td>
- {{- range $name, $tag := .Tags -}}
- {{- template "fragment_tag.gohtml" $tag -}}
- {{- end -}}
- </td>
- </tr>
- {{ end -}}
-</table>
-<table>
- <tr>
- {{ range $name, $count := .TagCount -}}
- <th>{{ $name }}</th>
- {{ end -}}
- </tr>
- <tr>
- {{ range $name, $count := .TagCount -}}
- <td>{{ $count }}</td>
- {{ end -}}
- </tr>
-</table>
-<p class="small faint mono">
- {{ .NMachines }} rows, rendered in {{ .RenderTime }}. Query: {{ .Query }}
-</p>
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/webug/views.go b/cloud/bmaas/bmdb/webug/views.go
deleted file mode 100644
index a0bab7b..0000000
--- a/cloud/bmaas/bmdb/webug/views.go
+++ /dev/null
@@ -1,183 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package webug
-
-import (
- "context"
- "fmt"
- "net/http"
- "time"
-
- "github.com/google/uuid"
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/cloud/bmaas/bmdb/reflection"
-)
-
-// baseParams are passed to all rendered templates, and are consumed by tags in
-// templates/base.gohtml.
-type baseParams struct {
- // Address to display in page header.
- BMDBAddress string
- // Schema version to display in page header.
- BMDBSchema string
-}
-
-// makeBase builds baseParams from information about the current connection.
-func (s *server) makeBase() baseParams {
- address := fmt.Sprintf("%s@%s", s.conn.DatabaseName, s.conn.Address)
- if s.conn.InMemory {
- address += " (in memory)"
- }
- return baseParams{
- BMDBAddress: address,
- BMDBSchema: s.curSchema().Version,
- }
-}
-
-// viewMachines renders a list of all machines in the BMDB.
-func (s *server) viewMachines(w http.ResponseWriter, r *http.Request, args ...string) {
- start := time.Now()
- res, err := s.curSchema().GetMachines(r.Context(), &reflection.GetMachinesOpts{Strict: s.strictConsistency})
- if err != nil {
- w.WriteHeader(http.StatusInternalServerError)
- fmt.Fprintf(w, "could not dump BMDB: %v", err)
- return
- }
- duration := time.Since(start)
-
- tagCount := make(map[string]int)
- for _, d := range res.Data {
- for _, t := range d.Tags {
- tagCount[t.Type.Name()]++
- }
- }
-
- type params struct {
- Base baseParams
- Query string
- Machines []*reflection.Machine
- NMachines int
- RenderTime time.Duration
- TagCount map[string]int
- }
- err = templates.ExecuteTemplate(w, "machines.gohtml", ¶ms{
- Base: s.makeBase(),
- Query: res.Query,
- Machines: res.Data,
- NMachines: len(res.Data),
- RenderTime: duration,
- TagCount: tagCount,
- })
- if err != nil {
- klog.Errorf("Template rendering failed: %v", err)
- }
-}
-
-// viewMachineDetail renders a detailed page for a single machine.
-func (s *server) viewMachineDetail(w http.ResponseWriter, r *http.Request, args ...string) {
- mid, err := uuid.Parse(args[0])
- if err != nil {
- w.WriteHeader(http.StatusUnprocessableEntity)
- fmt.Fprintf(w, "invalid machine UUID")
- return
- }
-
- opts := reflection.GetMachinesOpts{
- FilterMachine: &mid,
- Strict: s.strictConsistency,
- ExpiredBackoffs: true,
- }
- res, err := s.curSchema().GetMachines(r.Context(), &opts)
- if err != nil {
- w.WriteHeader(http.StatusInternalServerError)
- fmt.Fprintf(w, "could not dump BMDB: %v", err)
- return
- }
- if len(res.Data) == 0 {
- w.WriteHeader(http.StatusNotFound)
- fmt.Fprintf(w, "machine not found")
- return
- }
- machine := res.Data[0]
-
- // Params to pass to template.
- type sessionOrError struct {
- Session *model.Session
- Error string
- }
- type params struct {
- Base baseParams
- Machine *reflection.Machine
-
- HistoryError string
- History []model.WorkHistory
-
- Sessions map[string]sessionOrError
- }
- p := params{
- Base: s.makeBase(),
- Machine: machine,
- Sessions: make(map[string]sessionOrError),
- }
-
- // History retrieval is performed with strict consistency guarantees, and thus
- // might block. Make sure we don't block the entire page.
- subQueriesCtx, subQueriesCtxC := context.WithTimeout(r.Context(), time.Second)
- defer subQueriesCtxC()
- history, err := s.conn.ListHistoryOf(subQueriesCtx, mid)
- if err != nil {
- p.HistoryError = err.Error()
- }
-
- // Same for sessions.
- for name, work := range machine.Work {
- sessions, err := s.conn.GetSession(subQueriesCtx, work.SessionID)
- if err != nil {
- p.Sessions[name] = sessionOrError{Error: err.Error()}
- } else {
- if len(sessions) == 0 {
- // This can happen if the session literally just disappeared.
- //
- // TODO(q3k): put all of these operations in a DB TX so that we don't end up with
- // possible inconsistencies?
- p.Sessions[name] = sessionOrError{Error: "not found"}
- continue
- }
- p.Sessions[name] = sessionOrError{Session: &sessions[0]}
- }
- }
-
- p.History = make([]model.WorkHistory, len(history))
- for i := 0; i < len(history); i += 1 {
- p.History[i] = history[len(history)-(i+1)]
- }
- if err := templates.ExecuteTemplate(w, "machine.gohtml", &p); err != nil {
- klog.Errorf("Template rendering failed: %v", err)
- }
-}
-
-// viewProviderRedirect redirects a given provider and provider_id into a
-// provider's web portal for more detailed information about an underlying
-// machine.
-func (s *server) viewProviderRedirect(w http.ResponseWriter, r *http.Request, args ...string) {
- providerUrls := map[string]string{
- "Equinix": "https://console.equinix.com/devices/%s/overview",
- }
- if providerUrls[args[0]] == "" {
- w.WriteHeader(http.StatusNotFound)
- fmt.Fprintf(w, "Usage: /provider/Equinix/<id>")
- return
- }
- url := fmt.Sprintf(providerUrls[args[0]], args[1])
- http.Redirect(w, r, url, http.StatusFound)
-}
-
-// viewSession shows detailed information about a BMDB session.
-func (s *server) viewSession(w http.ResponseWriter, r *http.Request, args ...string) {
- // TODO(q3k): implement this once we add session info to work history so that
- // this can actually display something useful.
- fmt.Fprintf(w, "underconstruction.gif")
-}
diff --git a/cloud/bmaas/bmdb/webug/webug.go b/cloud/bmaas/bmdb/webug/webug.go
deleted file mode 100644
index 137a0fd..0000000
--- a/cloud/bmaas/bmdb/webug/webug.go
+++ /dev/null
@@ -1,187 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-// Package webug implements a web-based debug/troubleshooting/introspection
-// system for the BMDB. It's optimized for use by developers and trained
-// operators, prioritizing information density, fast navigation and heavy
-// interlinking.
-package webug
-
-import (
- "context"
- "embed"
- "flag"
- "fmt"
- "html/template"
- "net/http"
- "regexp"
- "sync"
- "time"
-
- "github.com/cenkalti/backoff/v4"
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/reflection"
-)
-
-var (
- //go:embed templates/*.gohtml
- templateFS embed.FS
- templates = template.Must(template.New("base.gohtml").Funcs(templateFuncs).ParseFS(templateFS, "templates/*"))
-)
-
-// server holds the state of an active webug interface.
-type server struct {
- // connection pool to BMDB.
- conn *bmdb.Connection
- // schema retrieved from BMDB.
- schema *reflection.Schema
- // muSchema locks schema for updates.
- muSchema sync.RWMutex
- // strictConsistency, when enabled, makes webug render its views with the
- // freshest available data, potentially conflicting with online
- // transactions. This should only be enabled during testing, as it tends to
- // clog up the database query planner and make everything slow.
- strictConsistency bool
-}
-
-// curSchema returns the current cached BMDB schema.
-func (s *server) curSchema() *reflection.Schema {
- s.muSchema.Lock()
- defer s.muSchema.Unlock()
- return s.schema
-}
-
-// schemaWorker is run as a background goroutine and attempts to update the
-// server's cached BMDB schema every hour.
-func (s *server) schemaWorker(ctx context.Context) {
- t := time.NewTicker(time.Hour)
- defer t.Stop()
-
- for {
- // Wait for the timer to tick, or for the context to expire.
- select {
- case <-ctx.Done():
- klog.Infof("Schema fetch worker exiting: %v", ctx.Err())
- return
- case <-t.C:
- }
-
- // Time to check the schema. Do that in an exponential backoff loop until
- // successful.
- bo := backoff.NewExponentialBackOff()
- bo.MaxElapsedTime = 0
- var schema *reflection.Schema
- err := backoff.Retry(func() error {
- var err error
- schema, err = s.conn.Reflect(ctx)
- if err != nil {
- klog.Warningf("Failed to fetch new schema: %v", err)
- return err
- }
- return nil
- }, backoff.WithContext(bo, ctx))
- // This will only happen due to context expiration.
- if err != nil {
- klog.Errorf("Giving up on schema fetch: %v", err)
- continue
- }
-
- // Swap the current schema if necessary.
- cur := s.curSchema().Version
- newVer := schema.Version
- if cur != newVer {
- klog.Infof("Got new schema: %s -> %s", cur, newVer)
- s.muSchema.Lock()
- s.schema = schema
- s.muSchema.Unlock()
- }
- }
-}
-
-// Register webug on an HTTP mux, using a BMDB connection pool.
-//
-// The given context will be used not only to time out the registration call, but
-// also to run a BMDB schema fetching goroutine that will attempt to fetch
-// newer versions of the schema every hour.
-//
-// This is a low-level function useful when tying webug into an existing web
-// application. If you just want to run webug on a separate port that's
-// configured by flags, use Config and Config.RegisterFlags.
-func Register(ctx context.Context, conn *bmdb.Connection, mux *http.ServeMux, enableStrictConsistency bool) error {
- schema, err := conn.Reflect(ctx)
- if err != nil {
- return fmt.Errorf("could not get BMDB schema for webug: %w", err)
- }
- s := server{
- conn: conn,
- schema: schema,
- strictConsistency: enableStrictConsistency,
- }
- go s.schemaWorker(ctx)
-
- type route struct {
- pattern *regexp.Regexp
- handler func(w http.ResponseWriter, r *http.Request, args ...string)
- }
-
- routes := []route{
- {regexp.MustCompile(`^/$`), s.viewMachines},
- {regexp.MustCompile(`^/machine/([a-fA-F0-9\-]+)$`), s.viewMachineDetail},
- {regexp.MustCompile(`^/provider/([^/]+)/([^/]+)$`), s.viewProviderRedirect},
- {regexp.MustCompile(`^/session/([^/]+)`), s.viewSession},
- }
-
- mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
- for _, route := range routes {
- match := route.pattern.FindStringSubmatch(r.URL.Path)
- if match == nil {
- continue
- }
- route.handler(w, r, match[1:]...)
- return
- }
- http.NotFound(w, r)
- })
- return nil
-}
-
-// Config describes the webug interface configuration. This should be embedded
-// inside your component's Config object.
-//
-// To configure, either set values or call RegisterFlags before flag.Parse.
-//
-// To run after configuration, call Start.
-type Config struct {
- // If set, start a webug interface on an HTTP listener bound to the given address.
- WebugListenAddress string
-
-	// WebugDBFetchStrict enables strict consistency for webug's BMDB reads.
-	// Intended for testing only; see the strictConsistency field on server.
- WebugDBFetchStrict bool
-}
-
-// RegisterFlags for webug interface.
-func (c *Config) RegisterFlags() {
- flag.StringVar(&c.WebugListenAddress, "webug_listen_address", "", "Address to start BMDB webug on. If not set, webug will not be started.")
-	flag.BoolVar(&c.WebugDBFetchStrict, "webug_dbfetch_strict", false, "Enable strict consistency for webug BMDB reads (testing only)")
-}
-
-// Start the webug interface in the foreground if enabled. The returned error
-// will be either a configuration/connection error returned as soon as possible,
-// or a context expiration error.
-//
-// The given context will be used for all connections from the webug interface to
-// the given BMDB connection.
-func (c *Config) Start(ctx context.Context, conn *bmdb.Connection) error {
- if c.WebugListenAddress == "" {
- return nil
- }
- mux := http.NewServeMux()
- if err := Register(ctx, conn, mux, c.WebugDBFetchStrict); err != nil {
- return err
- }
-
- klog.Infof("Webug listening at %s", c.WebugListenAddress)
- return http.ListenAndServe(c.WebugListenAddress, mux)
-}
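For reference, this is roughly how the removed webug interface was wired into a component binary (mirroring scruffy's server.go). A minimal sketch, not a definitive implementation; it assumes the bmdb.BMDB flag and connection conventions used elsewhere in this change.

// Minimal wiring sketch (illustrative only).
package main

import (
	"context"
	"flag"
	"os"
	"os/signal"

	"k8s.io/klog/v2"

	"source.monogon.dev/cloud/bmaas/bmdb"
	"source.monogon.dev/cloud/bmaas/bmdb/webug"
)

func main() {
	var db bmdb.BMDB
	var wb webug.Config
	db.Database.RegisterFlags("bmdb")
	wb.RegisterFlags() // adds -webug_listen_address and -webug_dbfetch_strict
	flag.Parse()

	ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)

	conn, err := db.Open(true)
	if err != nil {
		klog.Exitf("Failed to connect to BMDB: %v", err)
	}
	// Start serves HTTP in the foreground; it returns nil immediately when
	// -webug_listen_address is left unset.
	if err := wb.Start(ctx, conn); err != nil {
		klog.Exitf("webug: %v", err)
	}
}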
diff --git a/cloud/bmaas/scruffy/BUILD.bazel b/cloud/bmaas/scruffy/BUILD.bazel
deleted file mode 100644
index 4934ad2..0000000
--- a/cloud/bmaas/scruffy/BUILD.bazel
+++ /dev/null
@@ -1,48 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
-
-go_library(
- name = "scruffy",
- srcs = [
- "bmdb_stats.go",
- "hw_stats.go",
- "labels.go",
- "server.go",
- ],
- importpath = "source.monogon.dev/cloud/bmaas/scruffy",
- visibility = ["//visibility:public"],
- deps = [
- "//cloud/bmaas/bmdb",
- "//cloud/bmaas/bmdb/metrics",
- "//cloud/bmaas/bmdb/model",
- "//cloud/bmaas/bmdb/webug",
- "//cloud/bmaas/server/api",
- "//cloud/lib/component",
- "//go/algorithm/cartesian",
- "@com_github_cenkalti_backoff_v4//:backoff",
- "@com_github_google_uuid//:uuid",
- "@com_github_prometheus_client_golang//prometheus",
- "@io_k8s_klog_v2//:klog",
- "@org_golang_google_protobuf//proto",
- ],
-)
-
-go_test(
- name = "scruffy_test",
- srcs = [
- "bmdb_stats_test.go",
- "hw_stats_test.go",
- ],
- data = [
- "@cockroach",
- ],
- embed = [":scruffy"],
- deps = [
- "//cloud/agent/api",
- "//cloud/bmaas/bmdb",
- "//cloud/bmaas/bmdb/model",
- "//cloud/bmaas/server/api",
- "//cloud/lib/component",
- "@com_github_prometheus_client_golang//prometheus",
- "@org_golang_google_protobuf//proto",
- ],
-)
diff --git a/cloud/bmaas/scruffy/bmdb_stats.go b/cloud/bmaas/scruffy/bmdb_stats.go
deleted file mode 100644
index 90b2df2..0000000
--- a/cloud/bmaas/scruffy/bmdb_stats.go
+++ /dev/null
@@ -1,205 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package scruffy
-
-import (
- "context"
- "errors"
- "fmt"
- "time"
-
- "github.com/prometheus/client_golang/prometheus"
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/bmaas/bmdb/model"
-)
-
-// bmdbStatsRunner collects metrics from the BMDB and exposes them as Prometheus
-// metrics via a registry passed to newBMDBStatsRunner.
-type bmdbStatsRunner struct {
- s *Server
- collectors []*statsCollector
-}
-
-// A statsCollectorDefinition describes how to gather a given metric via a BMDB
-// SQL query.
-type statsCollectorDefinition struct {
- // name of the metric. Used in actual metric name, prefixed with 'bmdb_stats_'.
- name string
- // help string emitted in prometheus endpoint.
- help string
- // labels is the label 'type definition', containing information about the
- // dimensions of this metric.
- labels labelDefinitions
- // query used to retrieve the metric data.
- query func(*model.Queries, context.Context) ([]model.MetricValue, error)
-}
-
-// labelProcess is the type definition of the 'process' label 'type', which is a
-// fixed-cardinality representation of the database Process enum.
-var labelProcess = labelDefinition{
- name: "process",
- initialValues: []string{
- string(model.ProcessShepherdAccess),
- string(model.ProcessShepherdAgentStart),
- string(model.ProcessShepherdRecovery),
- },
-}
-
-var collectorDefs = []statsCollectorDefinition{
- {
- name: "active_backoffs",
- help: "Number of active backoffs, partitioned by process. There may be more than one active backoff per machine.",
- query: model.WrapLabeledMetric((*model.Queries).CountActiveBackoffs),
- labels: []labelDefinition{labelProcess},
- },
- {
- name: "active_work",
-		help:   "Number of active work items, partitioned by process. There may be more than one active work item per machine.",
- query: model.WrapLabeledMetric((*model.Queries).CountActiveWork),
- labels: []labelDefinition{labelProcess},
- },
- {
- name: "machines",
- help: "Number of machines in the BMDB.",
- query: model.WrapSimpleMetric((*model.Queries).CountMachines),
- },
- {
- name: "machines_provided",
- help: "Number of provided machines in the BMDB.",
- query: model.WrapSimpleMetric((*model.Queries).CountMachinesProvided),
- },
- {
- name: "machines_heartbeating",
- help: "Number of machines with a currently heartbeating agent.",
- query: model.WrapSimpleMetric((*model.Queries).CountMachinesAgentHeartbeating),
- },
- {
- name: "machines_pending_installation",
- help: "Number of machines pending installation.",
- query: model.WrapSimpleMetric((*model.Queries).CountMachinesInstallationPending),
- },
- {
- name: "machines_installed",
-		help:  "Number of machines successfully installed.",
- query: model.WrapSimpleMetric((*model.Queries).CountMachinesInstallationComplete),
- },
- {
- name: "machines_pending_agent_start",
- help: "Number of machines pending the agent start workflow.",
- query: model.WrapSimpleMetric((*model.Queries).CountMachinesForAgentStart),
- },
- {
- name: "machines_pending_agent_recovery",
- help: "Number of machines pending the agent recovery workflow.",
- query: model.WrapSimpleMetric((*model.Queries).CountMachinesForAgentRecovery),
- },
-}
-
-// A statsCollector is an instantiated statsCollectorDefinition which carries the
-// actual prometheus gauge backing the metric.
-type statsCollector struct {
- gauge *prometheus.GaugeVec
- def *statsCollectorDefinition
-}
-
-// setDefaults emits gauges with zero values for all metrics of the runner, using
-// the initialLabel data gathered from each metric definition.
-func (b *bmdbStatsRunner) setDefaults() {
- for _, collector := range b.collectors {
- info := collector.def
- initial := info.labels.initialLabels()
- if len(initial) == 0 {
- collector.gauge.With(nil).Set(0.0)
- } else {
- for _, labels := range initial {
- collector.gauge.With(labels).Set(0.0)
- }
- }
- }
-}
-
-// newBMDBStatsRunner builds a bmdbStatsRunner from the collectorDefs above. The
-// bmdbStatsRunner is then bound to the given Server's BMDB connection and can
-// perform the actual database statistics gathering.
-func newBMDBStatsRunner(s *Server, reg *prometheus.Registry) *bmdbStatsRunner {
- var collectors []*statsCollector
-
- for _, info := range collectorDefs {
- info := info
- gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
- Name: "bmdb_stats_" + info.name,
- Help: info.help,
- }, info.labels.names())
- reg.MustRegister(gauge)
-
- collectors = append(collectors, &statsCollector{
- gauge: gauge,
- def: &info,
- })
- }
-
- res := &bmdbStatsRunner{
- s: s,
- collectors: collectors,
- }
- res.setDefaults()
- return res
-}
-
-func (b *bmdbStatsRunner) run(ctx context.Context) {
- klog.Infof("Starting stats runner...")
-
- ti := time.NewTicker(b.s.Config.StatsRunnerRate)
-
- for {
- err := b.runOnce(ctx)
- if err != nil {
- if errors.Is(err, ctx.Err()) {
- return
- }
- klog.Errorf("Stats run failed: %v", err)
- }
- select {
- case <-ti.C:
- case <-ctx.Done():
- klog.Infof("Exiting stats runner (%v)...", ctx.Err())
- return
- }
- }
-}
-
-func (b *bmdbStatsRunner) runOnce(ctx context.Context) error {
- sess, err := b.s.session(ctx)
- if err != nil {
- return err
- }
-
- results := make(map[string][]model.MetricValue)
- // TODO(q3k): don't fail entire run if we can't collect just one metric.
- err = sess.Transact(ctx, func(q *model.Queries) error {
- for _, c := range b.collectors {
- res, err := c.def.query(q, ctx)
- if err != nil {
- return fmt.Errorf("collecting %s failed: %w", c.def.name, err)
- } else {
- results[c.def.name] = res
- }
- }
- return nil
- })
- if err != nil {
- return err
- }
-
- b.setDefaults()
- for _, c := range b.collectors {
- for _, m := range results[c.def.name] {
- klog.Infof("Setting %s (%v) to %d", c.def.name, m.Labels, m.Count)
- c.gauge.With(m.Labels).Set(float64(m.Count))
- }
- }
-
- return nil
-}
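New gauges were added by appending to collectorDefs; the entry below shows the shape of such an addition. It is purely hypothetical: CountActiveSessions does not exist in the model package and only stands in for whatever generated count query a new metric would wrap.

// Hypothetical collectorDefs entry (CountActiveSessions is a placeholder).
{
	name:   "active_sessions",
	help:   "Number of live BMDB sessions, partitioned by process.",
	query:  model.WrapLabeledMetric((*model.Queries).CountActiveSessions),
	labels: []labelDefinition{labelProcess},
},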
diff --git a/cloud/bmaas/scruffy/bmdb_stats_test.go b/cloud/bmaas/scruffy/bmdb_stats_test.go
deleted file mode 100644
index 0e28755..0000000
--- a/cloud/bmaas/scruffy/bmdb_stats_test.go
+++ /dev/null
@@ -1,172 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package scruffy
-
-import (
- "context"
- "fmt"
- "sort"
- "strings"
- "testing"
-
- "github.com/prometheus/client_golang/prometheus"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/cloud/lib/component"
-)
-
-func TestBMDBStats(t *testing.T) {
- s := Server{
- Config: Config{
- BMDB: bmdb.BMDB{
- Config: bmdb.Config{
- Database: component.CockroachConfig{
- InMemory: true,
- },
- },
- },
- },
- }
-
- registry := prometheus.NewRegistry()
- runner := newBMDBStatsRunner(&s, registry)
-
- ctx, ctxC := context.WithCancel(context.Background())
- defer ctxC()
-
- expect := func(wantValues map[string]int64) {
- t.Helper()
- res, err := registry.Gather()
- if err != nil {
- t.Fatalf("Gather: %v", err)
- }
- gotValues := make(map[string]bool)
- for _, mf := range res {
- if len(mf.Metric) != 1 {
- for _, m := range mf.Metric {
- var lvs []string
- for _, lp := range m.Label {
- lvs = append(lvs, fmt.Sprintf("%s=%s", *lp.Name, *lp.Value))
- }
- sort.Strings(lvs)
- name := fmt.Sprintf("%s[%s]", *mf.Name, strings.Join(lvs, ","))
- gotValues[name] = true
- if _, ok := wantValues[name]; !ok {
- t.Errorf("MetricFamily %s: unexpected", name)
- }
- if want, got := wantValues[name], int64(*m.Gauge.Value); want != got {
-					t.Errorf("MetricFamily %s: wanted %d, got %d", name, want, got)
- }
- }
- } else {
- m := mf.Metric[0]
- gotValues[*mf.Name] = true
- if want, got := wantValues[*mf.Name], int64(*m.Gauge.Value); want != got {
- t.Errorf("MetricFamily %s: wanted %d, got %d", *mf.Name, want, got)
- }
- if _, ok := wantValues[*mf.Name]; !ok {
- t.Errorf("MetricFamily %s: unexpected", *mf.Name)
- }
- }
- }
- for mf := range wantValues {
- if !gotValues[mf] {
- t.Errorf("MetricFamily %s: missing", mf)
- }
- }
- }
-
- expect(map[string]int64{
- "bmdb_stats_machines": 0,
- "bmdb_stats_machines_provided": 0,
- "bmdb_stats_machines_heartbeating": 0,
- "bmdb_stats_machines_pending_installation": 0,
- "bmdb_stats_machines_installed": 0,
- "bmdb_stats_machines_pending_agent_start": 0,
- "bmdb_stats_machines_pending_agent_recovery": 0,
- "bmdb_stats_active_backoffs[process=ShepherdAccess]": 0,
- "bmdb_stats_active_backoffs[process=ShepherdAgentStart]": 0,
- "bmdb_stats_active_backoffs[process=ShepherdRecovery]": 0,
- "bmdb_stats_active_work[process=ShepherdAccess]": 0,
- "bmdb_stats_active_work[process=ShepherdAgentStart]": 0,
- "bmdb_stats_active_work[process=ShepherdRecovery]": 0,
- })
-
- conn, err := s.Config.BMDB.Open(true)
- if err != nil {
- t.Fatalf("Open: %v", err)
- }
- sess, err := conn.StartSession(ctx)
- if err != nil {
- t.Fatalf("StartSession: %v", err)
- }
-
- s.bmdb = conn
- s.sessionC = make(chan *bmdb.Session)
- go s.sessionWorker(ctx)
- if err := runner.runOnce(ctx); err != nil {
- t.Fatal(err)
- }
-
- expect(map[string]int64{
- "bmdb_stats_machines": 0,
- "bmdb_stats_machines_provided": 0,
- "bmdb_stats_machines_heartbeating": 0,
- "bmdb_stats_machines_pending_installation": 0,
- "bmdb_stats_machines_installed": 0,
- "bmdb_stats_machines_pending_agent_start": 0,
- "bmdb_stats_machines_pending_agent_recovery": 0,
- "bmdb_stats_active_backoffs[process=ShepherdAccess]": 0,
- "bmdb_stats_active_backoffs[process=ShepherdAgentStart]": 0,
- "bmdb_stats_active_backoffs[process=ShepherdRecovery]": 0,
- "bmdb_stats_active_work[process=ShepherdAccess]": 0,
- "bmdb_stats_active_work[process=ShepherdAgentStart]": 0,
- "bmdb_stats_active_work[process=ShepherdRecovery]": 0,
- })
-
- f := fill().
- // Provided, needs installation.
-		// Provided, but agent not yet started (counts as pending agent start).
- // Three machines needing recovery.
- machine().providedE("2").agentNeverHeartbeat().build().
- machine().providedE("3").agentNeverHeartbeat().build().
- machine().providedE("4").agentNeverHeartbeat().build().
- // One machine correctly heartbeating.
- machine().providedE("5").agentHealthy().build().
- // Two machines heartbeating and pending installation.
- machine().providedE("6").agentHealthy().installRequested(10).build().
- machine().providedE("7").agentHealthy().installRequested(10).installReported(9).build().
- // Machine which is pending installation _and_ recovery.
- machine().providedE("8").agentNeverHeartbeat().installRequested(10).build().
- // Machine which has been successfully installed.
- machine().providedE("9").agentStoppedHeartbeating().installRequested(10).installReported(10).build()
-
- err = sess.Transact(ctx, func(q *model.Queries) error {
- return f(ctx, q)
- })
- if err != nil {
- t.Fatal(err)
- }
-
- if err := runner.runOnce(ctx); err != nil {
- t.Fatal(err)
- }
-
- expect(map[string]int64{
- "bmdb_stats_machines": 9,
- "bmdb_stats_machines_provided": 9,
- "bmdb_stats_machines_heartbeating": 3,
- "bmdb_stats_machines_pending_installation": 3,
- "bmdb_stats_machines_installed": 1,
- "bmdb_stats_machines_pending_agent_start": 1,
- "bmdb_stats_machines_pending_agent_recovery": 4,
- "bmdb_stats_active_backoffs[process=ShepherdAccess]": 0,
- "bmdb_stats_active_backoffs[process=ShepherdAgentStart]": 0,
- "bmdb_stats_active_backoffs[process=ShepherdRecovery]": 0,
- "bmdb_stats_active_work[process=ShepherdAccess]": 0,
- "bmdb_stats_active_work[process=ShepherdAgentStart]": 0,
- "bmdb_stats_active_work[process=ShepherdRecovery]": 0,
- })
-}
diff --git a/cloud/bmaas/scruffy/cmd/BUILD.bazel b/cloud/bmaas/scruffy/cmd/BUILD.bazel
deleted file mode 100644
index 9284aae..0000000
--- a/cloud/bmaas/scruffy/cmd/BUILD.bazel
+++ /dev/null
@@ -1,31 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library")
-load("@rules_oci//oci:defs.bzl", "oci_image")
-load("@rules_pkg//pkg:tar.bzl", "pkg_tar")
-
-go_library(
- name = "cmd_lib",
- srcs = ["main.go"],
- importpath = "source.monogon.dev/cloud/bmaas/scruffy/cmd",
- visibility = ["//visibility:private"],
- deps = ["//cloud/bmaas/scruffy"],
-)
-
-go_binary(
- name = "cmd",
- embed = [":cmd_lib"],
- visibility = ["//visibility:public"],
-)
-
-pkg_tar(
- name = "cmd_layer",
- srcs = [":cmd"],
-)
-
-oci_image(
- name = "cmd_image",
- base = "@distroless_base",
- entrypoint = ["/cmd"],
- tars = [":cmd_layer"],
- visibility = ["//visibility:public"],
- workdir = "/app",
-)
diff --git a/cloud/bmaas/scruffy/cmd/main.go b/cloud/bmaas/scruffy/cmd/main.go
deleted file mode 100644
index 457c05a..0000000
--- a/cloud/bmaas/scruffy/cmd/main.go
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package main
-
-import (
- "context"
- "flag"
- "os"
- "os/signal"
-
- "source.monogon.dev/cloud/bmaas/scruffy"
-)
-
-func main() {
- s := &scruffy.Server{}
- s.Config.RegisterFlags()
- flag.Parse()
-
- ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
- s.Start(ctx)
- <-ctx.Done()
-}
diff --git a/cloud/bmaas/scruffy/hw_stats.go b/cloud/bmaas/scruffy/hw_stats.go
deleted file mode 100644
index 0006ef2..0000000
--- a/cloud/bmaas/scruffy/hw_stats.go
+++ /dev/null
@@ -1,167 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package scruffy
-
-import (
- "context"
- "errors"
- "fmt"
- "time"
-
- "github.com/google/uuid"
- "github.com/prometheus/client_golang/prometheus"
- "google.golang.org/protobuf/proto"
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/cloud/bmaas/server/api"
-)
-
-// hwStatsRunner collects metrics from the machine hardware inventory in BMDB and
-// exposes them as Prometheus metrics via a registry passed to newHWStatsRunner.
-type hwStatsRunner struct {
- s *Server
-
- nodesPerRegion *prometheus.GaugeVec
- memoryPerRegion *prometheus.GaugeVec
- cpuThreadsPerRegion *prometheus.GaugeVec
-}
-
-// newHWStatsRunner builds a hwStatsRunner. The hwStatsRunner is then bound to
-// the given Server's BMDB connection and can perform the actual database
-// statistics gathering.
-func newHWStatsRunner(s *Server, reg *prometheus.Registry) *hwStatsRunner {
- hwsr := &hwStatsRunner{
- s: s,
-
- nodesPerRegion: prometheus.NewGaugeVec(prometheus.GaugeOpts{
- Name: "bmdb_hwstats_region_nodes",
- }, []string{"provider", "location"}),
-
- memoryPerRegion: prometheus.NewGaugeVec(prometheus.GaugeOpts{
- Name: "bmdb_hwstats_region_ram_bytes",
- }, []string{"provider", "location"}),
-
- cpuThreadsPerRegion: prometheus.NewGaugeVec(prometheus.GaugeOpts{
- Name: "bmdb_hwstats_region_cpu_threads",
- }, []string{"provider", "location"}),
- }
- reg.MustRegister(hwsr.nodesPerRegion, hwsr.memoryPerRegion, hwsr.cpuThreadsPerRegion)
- return hwsr
-}
-
-func (h *hwStatsRunner) run(ctx context.Context) {
- klog.Infof("Starting stats runner...")
-
- ti := time.NewTicker(time.Minute)
-
- for {
- err := h.runOnce(ctx)
- if err != nil {
- if errors.Is(err, ctx.Err()) {
- return
- }
- klog.Errorf("Stats run failed: %v", err)
- }
- select {
- case <-ti.C:
- case <-ctx.Done():
- klog.Infof("Exiting stats runner (%v)...", ctx.Err())
- return
- }
- }
-}
-
-// statsPerRegion holds statistics gathered and aggregated (summed) per region.
-type statsPerRegion struct {
- nodes uint64
- ramBytes uint64
- numThreads uint64
-}
-
-// add a given AgentHardwareReport to this region's data.
-func (s *statsPerRegion) add(hwrep *api.AgentHardwareReport) {
- s.nodes++
- s.ramBytes += uint64(hwrep.Report.MemoryInstalledBytes)
- for _, cpu := range hwrep.Report.Cpu {
- s.numThreads += uint64(cpu.HardwareThreads)
- }
-}
-
-// regionKey is used to uniquely identify each region per each provider.
-type regionKey struct {
- provider model.Provider
- location string
-}
-
-func (r *regionKey) String() string {
- return fmt.Sprintf("%s/%s", r.provider, r.location)
-}
-
-func (h *hwStatsRunner) runOnce(ctx context.Context) error {
- sess, err := h.s.session(ctx)
- if err != nil {
- return err
- }
-
- var start uuid.UUID
-
- perRegion := make(map[regionKey]*statsPerRegion)
- var total statsPerRegion
-
- for {
- var res []model.ListMachineHardwareRow
- err = sess.Transact(ctx, func(q *model.Queries) error {
- res, err = q.ListMachineHardware(ctx, model.ListMachineHardwareParams{
- Limit: 100,
- MachineID: start,
- })
- return err
- })
- if err != nil {
- return err
- }
- klog.Infof("Machines: %d chunk", len(res))
- if len(res) == 0 {
- break
- }
- for _, row := range res {
- var hwrep api.AgentHardwareReport
- err = proto.Unmarshal(row.HardwareReportRaw.([]byte), &hwrep)
- if err != nil {
- klog.Warningf("Could not decode hardware report from %s: %v", row.MachineID, err)
- continue
- }
-
- if !row.ProviderLocation.Valid {
- klog.Warningf("%s has no provider location, skipping", row.MachineID)
- continue
- }
-
- key := regionKey{
- provider: row.Provider,
- location: row.ProviderLocation.String,
- }
- if _, ok := perRegion[key]; !ok {
- perRegion[key] = &statsPerRegion{}
- }
- perRegion[key].add(&hwrep)
- total.add(&hwrep)
-
- start = row.MachineID
- }
- }
-
- for k, st := range perRegion {
- labels := prometheus.Labels{
- "provider": string(k.provider),
- "location": k.location,
- }
-
- h.nodesPerRegion.With(labels).Set(float64(st.nodes))
- h.memoryPerRegion.With(labels).Set(float64(st.ramBytes))
- h.cpuThreadsPerRegion.With(labels).Set(float64(st.numThreads))
- }
- return nil
-}
diff --git a/cloud/bmaas/scruffy/hw_stats_test.go b/cloud/bmaas/scruffy/hw_stats_test.go
deleted file mode 100644
index 7dc8139..0000000
--- a/cloud/bmaas/scruffy/hw_stats_test.go
+++ /dev/null
@@ -1,324 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package scruffy
-
-import (
- "context"
- "database/sql"
- "testing"
- "time"
-
- "github.com/prometheus/client_golang/prometheus"
- "google.golang.org/protobuf/proto"
-
- aapi "source.monogon.dev/cloud/agent/api"
- "source.monogon.dev/cloud/bmaas/server/api"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/cloud/lib/component"
-)
-
-type filler func(ctx context.Context, q *model.Queries) error
-
-func fill() filler {
- return func(ctx context.Context, q *model.Queries) error {
- return nil
- }
-}
-
-func (f filler) chain(n func(ctx context.Context, q *model.Queries) error) filler {
- return func(ctx context.Context, q *model.Queries) error {
- if err := f(ctx, q); err != nil {
- return err
- }
- return n(ctx, q)
- }
-}
-
-type fillerMachine struct {
- f filler
-
- provider *model.Provider
- providerID *string
-
- location *string
-
- threads *int32
- ramgb *int64
-
- agentStartedAt *time.Time
-
- agentHeartbeatAt *time.Time
-
- installationRequestGeneration *int64
-
- installationReportGeneration *int64
-}
-
-func (f filler) machine() *fillerMachine {
- return &fillerMachine{
- f: f,
- }
-}
-
-func (m *fillerMachine) provided(p model.Provider, pid string) *fillerMachine {
- m.provider = &p
- m.providerID = &pid
- return m
-}
-
-func (m *fillerMachine) providedE(pid string) *fillerMachine {
- return m.provided(model.ProviderEquinix, pid)
-}
-
-func (m *fillerMachine) located(location string) *fillerMachine {
- m.location = &location
- return m
-}
-
-func (m *fillerMachine) hardware(threads int32, ramgb int64) *fillerMachine {
- m.threads = &threads
- m.ramgb = &ramgb
- return m
-}
-
-func (m *fillerMachine) agentStarted(t time.Time) *fillerMachine {
- m.agentStartedAt = &t
- return m
-}
-
-func (m *fillerMachine) agentHeartbeat(t time.Time) *fillerMachine {
- m.agentHeartbeatAt = &t
- return m
-}
-
-func (m *fillerMachine) agentHealthy() *fillerMachine {
- now := time.Now()
- return m.agentStarted(now.Add(-30 * time.Minute)).agentHeartbeat(now.Add(-1 * time.Minute))
-}
-
-func (m *fillerMachine) agentStoppedHeartbeating() *fillerMachine {
- now := time.Now()
- return m.agentStarted(now.Add(-30 * time.Minute)).agentHeartbeat(now.Add(-20 * time.Minute))
-}
-
-func (m *fillerMachine) agentNeverHeartbeat() *fillerMachine {
- now := time.Now()
- return m.agentStarted(now.Add(-30 * time.Minute))
-}
-
-func (m *fillerMachine) installRequested(gen int64) *fillerMachine {
- m.installationRequestGeneration = &gen
- return m
-}
-
-func (m *fillerMachine) installReported(gen int64) *fillerMachine {
- m.installationReportGeneration = &gen
- return m
-}
-
-func (m *fillerMachine) build() filler {
- return m.f.chain(func(ctx context.Context, q *model.Queries) error {
- mach, err := q.NewMachine(ctx)
- if err != nil {
- return err
- }
- if m.providerID != nil {
- err = q.MachineAddProvided(ctx, model.MachineAddProvidedParams{
- MachineID: mach.MachineID,
- Provider: *m.provider,
- ProviderID: *m.providerID,
- })
- if err != nil {
- return err
- }
- if m.location != nil {
- err = q.MachineUpdateProviderStatus(ctx, model.MachineUpdateProviderStatusParams{
- ProviderID: *m.providerID,
- Provider: *m.provider,
- ProviderLocation: sql.NullString{Valid: true, String: *m.location},
- })
- if err != nil {
- return err
- }
- }
- }
- if m.threads != nil {
- report := api.AgentHardwareReport{
- Report: &aapi.Node{
- MemoryInstalledBytes: *m.ramgb << 30,
- MemoryUsableRatio: 1.0,
- Cpu: []*aapi.CPU{
- {
- HardwareThreads: *m.threads,
- Cores: *m.threads,
- },
- },
- },
- Warning: nil,
- }
- raw, err := proto.Marshal(&report)
- if err != nil {
- return err
- }
- err = q.MachineSetHardwareReport(ctx, model.MachineSetHardwareReportParams{
- MachineID: mach.MachineID,
- HardwareReportRaw: raw,
- })
- if err != nil {
- return err
- }
- }
- if m.agentStartedAt != nil {
- err = q.MachineSetAgentStarted(ctx, model.MachineSetAgentStartedParams{
- MachineID: mach.MachineID,
- AgentStartedAt: *m.agentStartedAt,
- AgentPublicKey: []byte("fakefakefake"),
- })
- if err != nil {
- return err
- }
- }
- if m.agentHeartbeatAt != nil {
- err = q.MachineSetAgentHeartbeat(ctx, model.MachineSetAgentHeartbeatParams{
- MachineID: mach.MachineID,
- AgentHeartbeatAt: *m.agentHeartbeatAt,
- })
- if err != nil {
- return err
- }
- }
- if m.installationRequestGeneration != nil {
- err = q.MachineSetOSInstallationRequest(ctx, model.MachineSetOSInstallationRequestParams{
- MachineID: mach.MachineID,
- Generation: *m.installationRequestGeneration,
- })
- if err != nil {
- return err
- }
- }
- if m.installationReportGeneration != nil {
- err = q.MachineSetOSInstallationReport(ctx, model.MachineSetOSInstallationReportParams{
- MachineID: mach.MachineID,
- Generation: *m.installationReportGeneration,
- OsInstallationResult: model.MachineOsInstallationResultSuccess,
- })
- if err != nil {
- return err
- }
- }
- return nil
- })
-}
-
-func TestHWStats(t *testing.T) {
- s := Server{
- Config: Config{
- BMDB: bmdb.BMDB{
- Config: bmdb.Config{
- Database: component.CockroachConfig{
- InMemory: true,
- },
- },
- },
- },
- }
-
- registry := prometheus.NewRegistry()
- runner := newHWStatsRunner(&s, registry)
-
- ctx, ctxC := context.WithCancel(context.Background())
- defer ctxC()
-
- res, err := registry.Gather()
- if err != nil {
- t.Fatalf("Gather: %v", err)
- }
- if want, got := 0, len(res); want != got {
- t.Fatalf("Expected no metrics with empty database, got %d", got)
- }
-
- conn, err := s.Config.BMDB.Open(true)
- if err != nil {
- t.Fatalf("Open: %v", err)
- }
- sess, err := conn.StartSession(ctx)
- if err != nil {
- t.Fatalf("StartSession: %v", err)
- }
- // Populate database with some test data.
- err = sess.Transact(ctx, func(q *model.Queries) error {
- f := fill().
- machine().provided(model.ProviderEquinix, "1").hardware(32, 256).located("dark-bramble").build().
- machine().provided(model.ProviderEquinix, "2").hardware(32, 256).located("dark-bramble").build().
- machine().provided(model.ProviderEquinix, "3").hardware(32, 256).located("dark-bramble").build().
- machine().provided(model.ProviderEquinix, "4").hardware(32, 256).located("brittle-hollow").build().
- machine().provided(model.ProviderEquinix, "5").hardware(32, 256).located("timber-hearth").build().
- machine().provided(model.ProviderEquinix, "6").hardware(32, 256).located("timber-hearth").build()
- return f(ctx, q)
- })
- if err != nil {
- t.Fatalf("Transact: %v", err)
- }
-
- s.bmdb = conn
- s.sessionC = make(chan *bmdb.Session)
- go s.sessionWorker(ctx)
-
- // Do a statistics run and check results.
- if err := runner.runOnce(ctx); err != nil {
- t.Fatalf("runOnce: %v", err)
- }
-
- mfs, err := registry.Gather()
- if err != nil {
-		t.Fatalf("Gather: %v", err)
- }
-
- // metric name -> provider -> location -> value
- values := make(map[string]map[string]map[string]float64)
- for _, mf := range mfs {
- values[*mf.Name] = make(map[string]map[string]float64)
- for _, m := range mf.Metric {
- var provider, location string
- for _, pair := range m.Label {
- switch *pair.Name {
- case "location":
- location = *pair.Value
- case "provider":
- provider = *pair.Value
- }
- }
- if _, ok := values[*mf.Name][provider]; !ok {
- values[*mf.Name][provider] = make(map[string]float64)
- }
- switch {
- case m.Gauge != nil && m.Gauge.Value != nil:
- values[*mf.Name][provider][location] = *m.Gauge.Value
- }
- }
- }
-
- for _, te := range []struct {
- provider model.Provider
- location string
- threads int32
- ramgb int64
- }{
- {model.ProviderEquinix, "dark-bramble", 96, 768},
- {model.ProviderEquinix, "brittle-hollow", 32, 256},
- {model.ProviderEquinix, "timber-hearth", 64, 512},
- } {
- threads := values["bmdb_hwstats_region_cpu_threads"][string(te.provider)][te.location]
- bytes := values["bmdb_hwstats_region_ram_bytes"][string(te.provider)][te.location]
-
- if want, got := te.threads, int32(threads); want != got {
- t.Errorf("Wanted %d threads in %s/%s, got %d", want, te.provider, te.location, got)
- }
- if want, got := te.ramgb, int64(bytes)>>30; want != got {
- t.Errorf("Wanted %d GB RAM in %s/%s, got %d", want, te.provider, te.location, got)
- }
- }
-}
diff --git a/cloud/bmaas/scruffy/labels.go b/cloud/bmaas/scruffy/labels.go
deleted file mode 100644
index c90e43a..0000000
--- a/cloud/bmaas/scruffy/labels.go
+++ /dev/null
@@ -1,97 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package scruffy
-
-import (
- "github.com/prometheus/client_golang/prometheus"
-
- "source.monogon.dev/go/algorithm/cartesian"
-)
-
-// A labelDefinition describes a key/value pair that's a metric dimension. It
-// consists of the label key/name (a string), and a list of possible values of
-// this key. The list of values will be used to initialize the metrics at startup
-// with zero values.
-//
-// The initialValues system is intended to be used with labels that are
-// low-cardinality enums, e.g. the name of a subsystem.
-//
-// All labelDefinitions for a single metric will then create a cartesian product
-// of all initialValues.
-type labelDefinition struct {
- // name/key of the label.
- name string
- // initialValues defines the default values for this label key/name that will be
- // used to generate a list of initial zero-filled metrics.
- initialValues []string
-}
-
-// labelDefinitions is a list of labelDefinition which define the label
-// dimensions of a metric. All the initialValues of the respective
-// labelDefinitions will create a cartesian set of default zero-filled metric
-// values when the metric subsystem gets initialized. These zero values will
-// then get overridden by real data as it is collected.
-type labelDefinitions []labelDefinition
-
-// initialLabels generates the list of initial labels key/values that should be
-// used to generate zero-filled metrics on startup. This is a cartesian product
-// of all initialValues of all labelDefinitions.
-func (l labelDefinitions) initialLabels() []prometheus.Labels {
- // Nothing to do if this is an empty labelDefinitions.
- if len(l) == 0 {
- return nil
- }
-
- // Given:
- //
- // labelDefinitions := []labelDefinition{
- // { name: "a", initialValues: []string{"foo", "bar"}},
- // { name: "b", initialValues: []string{"baz", "barf"}},
- // }
- //
- // This creates:
- //
- // values := []string{
- // { "foo", "bar" }, // label 'a'
- // { "baz", "barf" }, // label 'b'
- // }
- var values [][]string
- for _, ld := range l {
- values = append(values, ld.initialValues)
- }
-
- // Given the above:
- //
- // valuesProduct := []string{
- // // a b
- // { "foo", "baz" },
- // { "foo", "barf" },
- // { "bar", "baz" },
- // { "bar", "barf" },
- // }
- valuesProduct := cartesian.Product[string](values...)
-
- // This converts valuesProduct into an actual prometheus-compatible type,
- // re-attaching the label names back into the columns as seen above.
- var res []prometheus.Labels
- for _, vp := range valuesProduct {
- labels := make(prometheus.Labels)
- for i, lv := range vp {
- labelDef := l[i]
- labels[labelDef.name] = lv
- }
- res = append(res, labels)
- }
- return res
-}
-
-// names returns the keys/names of all the metric labels from these
-// labelDefinitions.
-func (l labelDefinitions) names() []string {
- var res []string
- for _, ld := range l {
- res = append(res, ld.name)
- }
- return res
-}
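A short sketch of the expansion initialLabels performs, using made-up values; it assumes placement inside package scruffy with the prometheus import already used above.

// Illustrative expansion performed by initialLabels (values are made up).
func exampleInitialLabels() []prometheus.Labels {
	defs := labelDefinitions{
		{name: "process", initialValues: []string{"ShepherdAccess", "ShepherdRecovery"}},
		{name: "provider", initialValues: []string{"Equinix"}},
	}
	// Produces the cartesian product of all initial values:
	//   {process: "ShepherdAccess",   provider: "Equinix"}
	//   {process: "ShepherdRecovery", provider: "Equinix"}
	return defs.initialLabels()
}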
diff --git a/cloud/bmaas/scruffy/server.go b/cloud/bmaas/scruffy/server.go
deleted file mode 100644
index d020a77..0000000
--- a/cloud/bmaas/scruffy/server.go
+++ /dev/null
@@ -1,131 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-// Package scruffy implements Scruffy, The Janitor.
-//
-// Scruffy is a BMaaS component which runs a bunch of important, housekeeping-ish
-// processes that aren't tied to any particular provider and are mostly
-// batch-oriented.
-//
-// Currently Scruffy just collects metrics from the BMDB.
-package scruffy
-
-import (
- "context"
- "errors"
- "flag"
- "fmt"
- "os"
- "time"
-
- "github.com/cenkalti/backoff/v4"
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/metrics"
- "source.monogon.dev/cloud/bmaas/bmdb/webug"
- "source.monogon.dev/cloud/lib/component"
-)
-
-type Config struct {
- Component component.ComponentConfig
- BMDB bmdb.BMDB
- Webug webug.Config
-
- StatsRunnerRate time.Duration
-}
-
-// TODO(q3k): factor this out to BMDB library?
-func runtimeInfo() string {
- hostname, _ := os.Hostname()
- if hostname == "" {
- hostname = "UNKNOWN"
- }
- return fmt.Sprintf("host %s", hostname)
-}
-
-func (c *Config) RegisterFlags() {
- c.Component.RegisterFlags("scruffy")
- c.BMDB.ComponentName = "scruffy"
- c.BMDB.RuntimeInfo = runtimeInfo()
- c.BMDB.Database.RegisterFlags("bmdb")
- c.Webug.RegisterFlags()
-
- flag.DurationVar(&c.StatsRunnerRate, "scruffy_stats_collection_rate", time.Minute, "How often the stats collection loop will run against BMDB")
-}
-
-type Server struct {
- Config Config
-
- bmdb *bmdb.Connection
- sessionC chan *bmdb.Session
-}
-
-func (s *Server) Start(ctx context.Context) {
- reg := s.Config.Component.PrometheusRegistry()
- s.Config.BMDB.EnableMetrics(reg)
- s.Config.Component.StartPrometheus(ctx)
-
- conn, err := s.Config.BMDB.Open(true)
- if err != nil {
- klog.Exitf("Failed to connect to BMDB: %v", err)
- }
- s.bmdb = conn
- s.sessionC = make(chan *bmdb.Session)
- go s.sessionWorker(ctx)
-
- bsr := newBMDBStatsRunner(s, reg)
- go bsr.run(ctx)
-
- hwr := newHWStatsRunner(s, reg)
- go hwr.run(ctx)
-
- go func() {
- if err := s.Config.Webug.Start(ctx, conn); err != nil && !errors.Is(err, ctx.Err()) {
- klog.Exitf("Failed to start webug: %v", err)
- }
- }()
-}
-
-// sessionWorker emits a valid BMDB session to sessionC as long as ctx is active.
-//
-// TODO(q3k): factor out into bmdb client lib
-func (s *Server) sessionWorker(ctx context.Context) {
- var session *bmdb.Session
- for {
- if session == nil || session.Expired() {
- klog.Infof("Starting new session...")
- bo := backoff.NewExponentialBackOff()
- err := backoff.Retry(func() error {
- var err error
- session, err = s.bmdb.StartSession(ctx, bmdb.SessionOption{Processor: metrics.ProcessorScruffyStats})
- if err != nil {
- klog.Errorf("Failed to start session: %v", err)
- return err
- } else {
- return nil
- }
- }, backoff.WithContext(bo, ctx))
- if err != nil {
- // If something's really wrong just crash.
- klog.Exitf("Gave up on starting session: %v", err)
- }
- klog.Infof("New session: %s", session.UUID)
- }
-
- select {
- case <-ctx.Done():
- return
- case s.sessionC <- session:
- }
- }
-}
-
-func (s *Server) session(ctx context.Context) (*bmdb.Session, error) {
- select {
- case sess := <-s.sessionC:
- return sess, nil
- case <-ctx.Done():
- return nil, ctx.Err()
- }
-}
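
For orientation, the runners driven by the deleted Scruffy server all built on the same pattern: take the current session from sessionWorker via session(), then run queries inside a single BMDB transaction. The sketch below only illustrates that pattern; collectOnce and the empty query body are hypothetical, and it assumes the "context" and "source.monogon.dev/cloud/bmaas/bmdb/model" imports.

    // collectOnce is a hypothetical runner step showing the session/Transact
    // pattern used by the components removed in this change.
    func (s *Server) collectOnce(ctx context.Context) error {
        // Block until the session worker hands out a live session (or ctx ends).
        sess, err := s.session(ctx)
        if err != nil {
            return err
        }
        // Run the runner's batch queries inside a single BMDB transaction.
        return sess.Transact(ctx, func(q *model.Queries) error {
            // ... issue whatever model queries the runner needs ...
            return nil
        })
    }
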
diff --git a/cloud/bmaas/server/BUILD.bazel b/cloud/bmaas/server/BUILD.bazel
deleted file mode 100644
index d23f56b..0000000
--- a/cloud/bmaas/server/BUILD.bazel
+++ /dev/null
@@ -1,47 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
-
-go_library(
- name = "server",
- srcs = [
- "agent_callback_service.go",
- "server.go",
- ],
- importpath = "source.monogon.dev/cloud/bmaas/server",
- visibility = ["//visibility:public"],
- deps = [
- "//cloud/bmaas/bmdb",
- "//cloud/bmaas/bmdb/metrics",
- "//cloud/bmaas/bmdb/model",
- "//cloud/bmaas/bmdb/webug",
- "//cloud/bmaas/server/api",
- "//cloud/lib/component",
- "//metropolis/node/core/rpc",
- "@com_github_cenkalti_backoff_v4//:backoff",
- "@com_github_google_uuid//:uuid",
- "@io_k8s_klog_v2//:klog",
- "@org_golang_google_grpc//:grpc",
- "@org_golang_google_grpc//codes",
- "@org_golang_google_grpc//reflection",
- "@org_golang_google_grpc//status",
- "@org_golang_google_protobuf//proto",
- ],
-)
-
-go_test(
- name = "server_test",
- srcs = ["agent_callback_service_test.go"],
- data = [
- "@cockroach",
- ],
- embed = [":server"],
- deps = [
- "//cloud/bmaas/bmdb",
- "//cloud/bmaas/bmdb/model",
- "//cloud/bmaas/server/api",
- "//cloud/lib/component",
- "//metropolis/node/core/rpc",
- "@com_github_google_uuid//:uuid",
- "@org_golang_google_grpc//:grpc",
- "@org_golang_google_protobuf//proto",
- ],
-)
diff --git a/cloud/bmaas/server/README.md b/cloud/bmaas/server/README.md
deleted file mode 100644
index b84ae25..0000000
--- a/cloud/bmaas/server/README.md
+++ /dev/null
@@ -1,27 +0,0 @@
-BMaaS Server
-===
-
-Background
----
-
-This server provides an interface to the BMaaS database/state over a gRPC API. Most components of the BMaaS system talk to the database directly whenever possible. Everything else communicates through this server.
-
-Currently this is:
-
-1. Agents running on machines, as they should only be allowed to access/update information about the machine they're running on, and they're generally considered untrusted since they run on end-user available machines.
-
-In the future this server will likely also take care of:
-
-1. A debug web API for developers/administrators to inspect database/BMDB state.
-2. Periodic batch jobs across the entire BMDB, like consistency checks.
-3. Exporting BMDB state into monitoring systems.
-4. Coordinating access to the BMDB systems if the current direct-access-to-database architecture stops scaling.
-
-Running
----
-
- bazel run //cloud/bmaas/server/cmd -- -srv_dev_certs -bmdb_eat_my_data
-
-This isn't very useful on its own yet; most functionality is currently exercised through automated tests.
-
-TODO(q3k): document complete BMaaS dev deployment (multi-component, single BMDB).
\ No newline at end of file
diff --git a/cloud/bmaas/server/agent_callback_service.go b/cloud/bmaas/server/agent_callback_service.go
deleted file mode 100644
index b664231..0000000
--- a/cloud/bmaas/server/agent_callback_service.go
+++ /dev/null
@@ -1,166 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package server
-
-import (
- "context"
- "crypto/ed25519"
- "encoding/hex"
- "errors"
- "fmt"
- "time"
-
- "github.com/google/uuid"
- "google.golang.org/grpc/codes"
- "google.golang.org/grpc/status"
- "google.golang.org/protobuf/proto"
- "k8s.io/klog/v2"
-
- apb "source.monogon.dev/cloud/bmaas/server/api"
-
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/metropolis/node/core/rpc"
-)
-
-type agentCallbackService struct {
- s *Server
-}
-
-var (
- errAgentUnauthenticated = errors.New("machine id or public key unknown")
-)
-
-func (a *agentCallbackService) Heartbeat(ctx context.Context, req *apb.HeartbeatRequest) (*apb.HeartbeatResponse, error) {
- // Extract ED25519 self-signed certificate from client connection.
- cert, err := rpc.GetPeerCertificate(ctx)
- if err != nil {
- return nil, err
- }
- pk := cert.PublicKey.(ed25519.PublicKey)
- machineId, err := uuid.Parse(req.MachineId)
- if err != nil {
- return nil, status.Error(codes.InvalidArgument, "machine_id invalid")
- }
-
- session, err := a.s.session(ctx)
- if err != nil {
- klog.Errorf("Could not start session: %v", err)
- return nil, status.Error(codes.Unavailable, "could not start session")
- }
-
- // Verify that machine ID and connection public key match up to a machine in the
- // BMDB. Prevent leaking information about a machine's existence to unauthorized
- // agents.
- err = session.Transact(ctx, func(q *model.Queries) error {
- agents, err := q.AuthenticateAgentConnection(ctx, model.AuthenticateAgentConnectionParams{
- MachineID: machineId,
- AgentPublicKey: pk,
- })
- if err != nil {
- return fmt.Errorf("AuthenticateAgentConnection: %w", err)
- }
- if len(agents) < 1 {
- klog.Errorf("No agent for %s/%s", machineId.String(), hex.EncodeToString(pk))
- return errAgentUnauthenticated
- }
- return nil
- })
- if err != nil {
- if errors.Is(err, errAgentUnauthenticated) {
- return nil, status.Error(codes.Unauthenticated, err.Error())
- }
- klog.Errorf("Could not authenticate agent: %v", err)
- return nil, status.Error(codes.Unavailable, "could not authenticate agent")
- }
-
- // Request is now authenticated.
-
- // Serialize hardware report if submitted alongside heartbeat.
- var hwraw []byte
- if req.HardwareReport != nil {
- hwraw, err = proto.Marshal(req.HardwareReport)
- if err != nil {
- return nil, status.Errorf(codes.InvalidArgument, "could not serialize hardware report: %v", err)
- }
- }
-
- var installRaw []byte
- if req.InstallationReport != nil {
- installRaw, err = proto.Marshal(req.InstallationReport)
- if err != nil {
- return nil, status.Errorf(codes.InvalidArgument, "could not serialize installation report: %v", err)
- }
- }
-
- // Upsert heartbeat time and hardware report.
- err = session.Transact(ctx, func(q *model.Queries) error {
- // Upsert hardware report if submitted.
- if len(hwraw) != 0 {
- err = q.MachineSetHardwareReport(ctx, model.MachineSetHardwareReportParams{
- MachineID: machineId,
- HardwareReportRaw: hwraw,
- })
- if err != nil {
- return fmt.Errorf("hardware report upsert: %w", err)
- }
- }
- // Upsert os installation report if submitted.
- if len(installRaw) != 0 {
- var result model.MachineOsInstallationResult
- switch req.InstallationReport.Result.(type) {
- case *apb.OSInstallationReport_Success_:
- result = model.MachineOsInstallationResultSuccess
- case *apb.OSInstallationReport_Error_:
- result = model.MachineOsInstallationResultError
- default:
- return fmt.Errorf("unknown installation report result: %T", req.InstallationReport.Result)
- }
- err = q.MachineSetOSInstallationReport(ctx, model.MachineSetOSInstallationReportParams{
- MachineID: machineId,
- Generation: req.InstallationReport.Generation,
- OsInstallationResult: result,
- OsInstallationReportRaw: installRaw,
- })
- if err != nil {
- return fmt.Errorf("OS installation report upsert: %w", err)
- }
- }
- return q.MachineSetAgentHeartbeat(ctx, model.MachineSetAgentHeartbeatParams{
- MachineID: machineId,
- AgentHeartbeatAt: time.Now(),
- })
- })
- if err != nil {
- klog.Errorf("Could not submit heartbeat: %v", err)
- return nil, status.Error(codes.Unavailable, "could not submit heartbeat")
- }
- klog.Infof("Heartbeat from %s/%s", machineId.String(), hex.EncodeToString(pk))
-
- // Get installation request for machine if present.
- var installRequest *apb.OSInstallationRequest
- err = session.Transact(ctx, func(q *model.Queries) error {
- reqs, err := q.GetExactMachineForOSInstallation(ctx, model.GetExactMachineForOSInstallationParams{
- MachineID: machineId,
- Limit: 1,
- })
- if err != nil {
- return fmt.Errorf("GetExactMachineForOSInstallation: %w", err)
- }
- if len(reqs) > 0 {
- raw := reqs[0].OsInstallationRequestRaw
- var preq apb.OSInstallationRequest
- if err := proto.Unmarshal(raw, &preq); err != nil {
- return fmt.Errorf("could not decode stored OS installation request: %w", err)
- }
- installRequest = &preq
- }
- return nil
- })
- if err != nil {
- // Do not fail entire request. Instead, just log an error.
- // TODO(q3k): alert on this
- klog.Errorf("Failure during OS installation request retrieval: %v", err)
- }
-
- return &apb.HeartbeatResponse{
- InstallationRequest: installRequest,
- }, nil
-}
diff --git a/cloud/bmaas/server/agent_callback_service_test.go b/cloud/bmaas/server/agent_callback_service_test.go
deleted file mode 100644
index a055d1d..0000000
--- a/cloud/bmaas/server/agent_callback_service_test.go
+++ /dev/null
@@ -1,256 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package server
-
-import (
- "context"
- "crypto/ed25519"
- "crypto/rand"
- "testing"
- "time"
-
- "github.com/google/uuid"
- "google.golang.org/grpc"
- "google.golang.org/protobuf/proto"
-
- apb "source.monogon.dev/cloud/bmaas/server/api"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/cloud/lib/component"
- "source.monogon.dev/metropolis/node/core/rpc"
-)
-
-func dut() *Server {
- return &Server{
- Config: Config{
- Component: component.ComponentConfig{
- GRPCListenAddress: ":0",
- DevCerts: true,
- DevCertsPath: "/tmp/foo",
- },
- BMDB: bmdb.BMDB{
- Config: bmdb.Config{
- Database: component.CockroachConfig{
- InMemory: true,
- },
- },
- },
- PublicListenAddress: ":0",
- },
- }
-}
-
-// TestAgentCallbackService exercises the basic flow for submitting an agent
-// heartbeat and hardware report.
-func TestAgentCallbackService(t *testing.T) {
- s := dut()
- ctx, ctxC := context.WithCancel(context.Background())
- defer ctxC()
- s.Start(ctx)
-
- pub, priv, err := ed25519.GenerateKey(rand.Reader)
- if err != nil {
- t.Fatalf("could not generate keypair: %v", err)
- }
-
- sess, err := s.bmdb.StartSession(ctx)
- if err != nil {
- t.Fatalf("could not start session: %v", err)
- }
-
- heartbeat := func(mid uuid.UUID) error {
- creds, err := rpc.NewEphemeralCredentials(priv, rpc.WantInsecure())
- if err != nil {
- t.Fatalf("could not generate ephemeral credentials: %v", err)
- }
- conn, err := grpc.NewClient(s.ListenPublic, grpc.WithTransportCredentials(creds))
- if err != nil {
- t.Fatalf("NewClient failed: %v", err)
- }
- defer conn.Close()
-
- stub := apb.NewAgentCallbackClient(conn)
- _, err = stub.Heartbeat(ctx, &apb.HeartbeatRequest{
- MachineId: mid.String(),
- HardwareReport: &apb.AgentHardwareReport{},
- })
- return err
- }
-
- // First, attempt to heartbeat for some totally made up machine ID. That should
- // fail.
- if err := heartbeat(uuid.New()); err == nil {
- t.Errorf("heartbeat for made up UUID should've failed")
- }
-
- // Create an actual machine in the BMDB alongside the expected pubkey within an
- // AgentStarted tag.
- var machine model.Machine
- err = sess.Transact(ctx, func(q *model.Queries) error {
- machine, err = q.NewMachine(ctx)
- if err != nil {
- return err
- }
- err = q.MachineAddProvided(ctx, model.MachineAddProvidedParams{
- MachineID: machine.MachineID,
- Provider: model.ProviderEquinix,
- ProviderID: "123",
- })
- if err != nil {
- return err
- }
- return q.MachineSetAgentStarted(ctx, model.MachineSetAgentStartedParams{
- MachineID: machine.MachineID,
- AgentStartedAt: time.Now(),
- AgentPublicKey: pub,
- })
- })
- if err != nil {
- t.Fatalf("could not create machine: %v", err)
- }
-
- // Now heartbeat with correct machine ID and key. This should succeed.
- if err := heartbeat(machine.MachineID); err != nil {
- t.Errorf("heartbeat should've succeeded, got: %v", err)
- }
-
- // TODO(q3k): test hardware report being attached once we have some debug API
- // for tags.
-}
-
-// TestOSInstallationFlow exercises the agent's OS installation request/report
-// functionality.
-func TestOSInstallationFlow(t *testing.T) {
- s := dut()
- ctx, ctxC := context.WithCancel(context.Background())
- defer ctxC()
- s.Start(ctx)
-
- pub, priv, err := ed25519.GenerateKey(rand.Reader)
- if err != nil {
- t.Fatalf("could not generate keypair: %v", err)
- }
-
- sess, err := s.bmdb.StartSession(ctx)
- if err != nil {
- t.Fatalf("could not start session: %v", err)
- }
-
- heartbeat := func(mid uuid.UUID, report *apb.OSInstallationReport) (*apb.HeartbeatResponse, error) {
- creds, err := rpc.NewEphemeralCredentials(priv, rpc.WantInsecure())
- if err != nil {
- t.Fatalf("could not generate ephemeral credentials: %v", err)
- }
- conn, err := grpc.NewClient(s.ListenPublic, grpc.WithTransportCredentials(creds))
- if err != nil {
- t.Fatalf("NewClient failed: %v", err)
- }
- defer conn.Close()
-
- stub := apb.NewAgentCallbackClient(conn)
- return stub.Heartbeat(ctx, &apb.HeartbeatRequest{
- MachineId: mid.String(),
- HardwareReport: &apb.AgentHardwareReport{},
- InstallationReport: report,
- })
- }
-
- // Create machine with no OS installation request.
- var machine model.Machine
- err = sess.Transact(ctx, func(q *model.Queries) error {
- machine, err = q.NewMachine(ctx)
- if err != nil {
- return err
- }
- err = q.MachineAddProvided(ctx, model.MachineAddProvidedParams{
- MachineID: machine.MachineID,
- Provider: model.ProviderEquinix,
- ProviderID: "123",
- })
- if err != nil {
- return err
- }
- return q.MachineSetAgentStarted(ctx, model.MachineSetAgentStartedParams{
- MachineID: machine.MachineID,
- AgentStartedAt: time.Now(),
- AgentPublicKey: pub,
- })
- })
- if err != nil {
- t.Fatalf("could not create machine: %v", err)
- }
-
- // Expect successful heartbeat, but no OS installation request.
- hbr, err := heartbeat(machine.MachineID, nil)
- if err != nil {
- t.Fatalf("heartbeat: %v", err)
- }
- if hbr.InstallationRequest != nil {
- t.Fatalf("expected no installation request")
- }
-
- // Now add an OS installation request tag, and expect it to be returned.
- err = sess.Transact(ctx, func(q *model.Queries) error {
- req := apb.OSInstallationRequest{
- Generation: 123,
- }
- raw, _ := proto.Marshal(&req)
- return q.MachineSetOSInstallationRequest(ctx, model.MachineSetOSInstallationRequestParams{
- MachineID: machine.MachineID,
- Generation: req.Generation,
- OsInstallationRequestRaw: raw,
- })
- })
- if err != nil {
- t.Fatalf("could not add os installation request to machine: %v", err)
- }
-
- // Heartbeat a few times just to make sure every response is as expected.
- for i := 0; i < 3; i++ {
- hbr, err = heartbeat(machine.MachineID, nil)
- if err != nil {
- t.Fatalf("heartbeat: %v", err)
- }
- if hbr.InstallationRequest == nil || hbr.InstallationRequest.Generation != 123 {
- t.Fatalf("expected installation request for generation 123, got %+v", hbr.InstallationRequest)
- }
- }
-
- // Submit a report, expect no more request.
- hbr, err = heartbeat(machine.MachineID, &apb.OSInstallationReport{Generation: 123, Result: &apb.OSInstallationReport_Success_{}})
- if err != nil {
- t.Fatalf("heartbeat: %v", err)
- }
- if hbr.InstallationRequest != nil {
- t.Fatalf("expected no installation request")
- }
-
- // Submit a newer request, expect it to be returned.
- err = sess.Transact(ctx, func(q *model.Queries) error {
- req := apb.OSInstallationRequest{
- Generation: 234,
- }
- raw, _ := proto.Marshal(&req)
- return q.MachineSetOSInstallationRequest(ctx, model.MachineSetOSInstallationRequestParams{
- MachineID: machine.MachineID,
- Generation: req.Generation,
- OsInstallationRequestRaw: raw,
- })
- })
- if err != nil {
- t.Fatalf("could not update installation request: %v", err)
- }
-
- // Heartbeat a few times just to make sure every response is as expected.
- for i := 0; i < 3; i++ {
- hbr, err = heartbeat(machine.MachineID, nil)
- if err != nil {
- t.Fatalf("heartbeat: %v", err)
- }
- if hbr.InstallationRequest == nil || hbr.InstallationRequest.Generation != 234 {
- t.Fatalf("expected installation request for generation 234, got %+v", hbr.InstallationRequest)
- }
- }
-}
diff --git a/cloud/bmaas/server/api/BUILD.bazel b/cloud/bmaas/server/api/BUILD.bazel
deleted file mode 100644
index 56a9426..0000000
--- a/cloud/bmaas/server/api/BUILD.bazel
+++ /dev/null
@@ -1,46 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
-load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
-load("@rules_proto//proto:defs.bzl", "proto_library")
-load("@rules_proto_grpc_buf//:defs.bzl", "buf_proto_lint_test")
-
-buf_proto_lint_test(
- name = "api_proto_lint_test",
- except_rules = [
- "PACKAGE_VERSION_SUFFIX",
- "SERVICE_SUFFIX",
- ],
- protos = [":api_proto"],
- use_rules = [
- "DEFAULT",
- "COMMENTS",
- ],
-)
-
-proto_library(
- name = "api_proto",
- srcs = ["agent.proto"],
- visibility = ["//visibility:public"],
- deps = [
- "//cloud/agent/api:api_proto",
- "//metropolis/proto/api:api_proto",
- ],
-)
-
-go_proto_library(
- name = "api_go_proto",
- compilers = ["@io_bazel_rules_go//proto:go_grpc"],
- importpath = "source.monogon.dev/cloud/bmaas/server/api",
- proto = ":api_proto",
- visibility = ["//visibility:public"],
- deps = [
- "//cloud/agent/api",
- "//metropolis/proto/api",
- ],
-)
-
-go_library(
- name = "api",
- embed = [":api_go_proto"],
- importpath = "source.monogon.dev/cloud/bmaas/server/api",
- visibility = ["//visibility:public"],
-)
diff --git a/cloud/bmaas/server/api/agent.proto b/cloud/bmaas/server/api/agent.proto
deleted file mode 100644
index 3b3c5b2..0000000
--- a/cloud/bmaas/server/api/agent.proto
+++ /dev/null
@@ -1,99 +0,0 @@
-syntax = "proto3";
-package cloud.bmaas.server.api;
-option go_package = "source.monogon.dev/cloud/bmaas/server/api";
-
-import "metropolis/proto/api/configuration.proto";
-import "metropolis/proto/api/management.proto";
-import "cloud/agent/api/hwreport.proto";
-
-// AgentCallback runs on the BMDB Server and exposes a gRPC interface to agents
-// running on machines. These APIs are served over TLS using component-style
-// server certificates, but clients are authenticated using ephemeral
-// certificates proving ownership of an agent keypair.
-service AgentCallback {
- // Heartbeat is called by agents repeatedly to upload a hardware report, signal
- // liveness and retrieve actions to be performed on a host.
- //
- // This isn't a streaming RPC as the current server implementation actually
- // isn't reactive, so it would have to do its own inner polling to create
- // a stream of updates. To keep things simple, we instead let the agent decide
- // on the cadence of updates it wants to keep up with.
- rpc Heartbeat(HeartbeatRequest) returns (HeartbeatResponse);
-}
-
-message AgentHardwareReport {
- cloud.agent.api.Node report = 1;
- // List of human-readable warnings which occurred during hardware report
- // generation.
- repeated string warning = 2;
-}
-
-// OSInstallationReport is submitted from the agent to the BMDB server after
-// successful OS installation.
-message OSInstallationReport {
- // generation must be set to the same value as 'generation' in the
- // OSInstallationRequest which triggered the OS installation.
- int64 generation = 1;
-
- // Success is set by the agent when the installation request has been
- // successfully fulfilled. It is currently empty but is specified as a
- // message to allow it to be expanded in the future.
- message Success {}
- // Error is set by the agent when the installation request could not be
- // fulfilled due to an error.
- message Error {
- // A human-readable message of what went wrong.
- string error = 1;
- }
- oneof result {
- Success success = 2;
- Error error = 3;
- }
-}
-
-message HeartbeatRequest {
- // MachineID that this agent represents. Technically not necessary since
- // keypairs between agents should be unique, but this provides an extra layer
- // of protection against programming bugs.
- string machine_id = 1;
- // Optional hardware report to be upserted for this machine. An agent should
- // submit one at least once after it's started, as early as it can.
- AgentHardwareReport hardware_report = 2;
- // Optional installation report to be upserted for this machine. An agent
- // should submit one after it successfully installed an operating system for
- // a given OSInstallationRequest.
- OSInstallationReport installation_report = 3;
-}
-
-message MetropolisInstallationRequest {
- reserved 1;
- // Parameters for fetching the OS image to install.
- metropolis.proto.api.OSImageRef os_image = 4;
- // Node parameters to be supplied to the new node. Note that network_config
- // is automatically filled out if coming from the takeover.
- metropolis.proto.api.NodeParameters node_parameters = 2;
- // Name of the block device to be used as the root device for the install.
- // A list of block devices can be taken from the node hardware report.
- string root_device = 3;
-}
-
-// OSInstallationRequest is provided to the agent by the BMDB server, from
-// a corresponding BMDB tag, when an OS installation request is pending.
-message OSInstallationRequest {
- // generation is the 'version' of the OS installation request, and will always
- // be incremented within the BMDB when a new OS installation request is
- // submitted. The agent must pipe this through to the OSInstallationReport to
- // let the rest of the system know which OS installation request it actually
- // fulfilled.
- int64 generation = 1;
- // Selects which operating system installation flow is used.
- oneof type {
- MetropolisInstallationRequest metropolis = 2;
- }
-}
-
-message HeartbeatResponse {
- // If set, the control plane is requesting the installation of an operating
- // system.
- OSInstallationRequest installation_request = 1;
-}
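
For reference, driving the AgentCallback service deleted here looked roughly like the sketch below on the agent side; it follows the test code earlier in this diff. The package name, heartbeatOnce, and handleInstall are illustrative only, not names from the original tree.

    package example

    import (
        "context"
        "fmt"

        "google.golang.org/grpc"

        apb "source.monogon.dev/cloud/bmaas/server/api"
    )

    // heartbeatOnce performs a single heartbeat round-trip. conn must already be
    // set up with the agent's ephemeral credentials (as in the tests above).
    func heartbeatOnce(ctx context.Context, conn *grpc.ClientConn, machineID string, report *apb.AgentHardwareReport, handleInstall func(*apb.OSInstallationRequest)) error {
        stub := apb.NewAgentCallbackClient(conn)
        res, err := stub.Heartbeat(ctx, &apb.HeartbeatRequest{
            MachineId:      machineID,
            HardwareReport: report,
        })
        if err != nil {
            return fmt.Errorf("heartbeat: %w", err)
        }
        // A pending OS installation request is fulfilled by the agent, which then
        // echoes its generation back in a later heartbeat's installation_report.
        if ir := res.InstallationRequest; ir != nil {
            handleInstall(ir)
        }
        return nil
    }
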
diff --git a/cloud/bmaas/server/api/gomod-generated-placeholder.go b/cloud/bmaas/server/api/gomod-generated-placeholder.go
deleted file mode 100644
index da08b30..0000000
--- a/cloud/bmaas/server/api/gomod-generated-placeholder.go
+++ /dev/null
@@ -1,4 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package api
diff --git a/cloud/bmaas/server/cmd/BUILD.bazel b/cloud/bmaas/server/cmd/BUILD.bazel
deleted file mode 100644
index cbfdfed..0000000
--- a/cloud/bmaas/server/cmd/BUILD.bazel
+++ /dev/null
@@ -1,34 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library")
-load("@rules_oci//oci:defs.bzl", "oci_image")
-load("@rules_pkg//pkg:tar.bzl", "pkg_tar")
-
-go_library(
- name = "cmd_lib",
- srcs = ["main.go"],
- importpath = "source.monogon.dev/cloud/bmaas/server/cmd",
- visibility = ["//visibility:private"],
- deps = [
- "//cloud/bmaas/server",
- "@io_k8s_klog_v2//:klog",
- ],
-)
-
-go_binary(
- name = "cmd",
- embed = [":cmd_lib"],
- visibility = ["//visibility:public"],
-)
-
-pkg_tar(
- name = "cmd_layer",
- srcs = [":cmd"],
-)
-
-oci_image(
- name = "cmd_image",
- base = "@distroless_base",
- entrypoint = ["/cmd"],
- tars = [":cmd_layer"],
- visibility = ["//visibility:public"],
- workdir = "/app",
-)
diff --git a/cloud/bmaas/server/cmd/main.go b/cloud/bmaas/server/cmd/main.go
deleted file mode 100644
index 6c14ab3..0000000
--- a/cloud/bmaas/server/cmd/main.go
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package main
-
-import (
- "context"
- "flag"
- "os"
- "os/signal"
-
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/bmaas/server"
-)
-
-func main() {
- s := &server.Server{}
- s.Config.RegisterFlags()
- flag.Parse()
- if flag.NArg() > 0 {
- klog.Exitf("unexpected positional arguments: %v", flag.Args())
- }
-
- ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
- s.Start(ctx)
- select {}
-}
diff --git a/cloud/bmaas/server/server.go b/cloud/bmaas/server/server.go
deleted file mode 100644
index 8cedf2b..0000000
--- a/cloud/bmaas/server/server.go
+++ /dev/null
@@ -1,174 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package server
-
-import (
- "context"
- "errors"
- "flag"
- "fmt"
- "net"
- "os"
-
- "github.com/cenkalti/backoff/v4"
- "google.golang.org/grpc"
- "google.golang.org/grpc/reflection"
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/metrics"
- "source.monogon.dev/cloud/bmaas/bmdb/webug"
- apb "source.monogon.dev/cloud/bmaas/server/api"
- "source.monogon.dev/cloud/lib/component"
-)
-
-type Config struct {
- Component component.ComponentConfig
- BMDB bmdb.BMDB
- Webug webug.Config
-
- // PublicListenAddress is the address at which the 'public' (agent-facing) gRPC
- // server listener will run.
- PublicListenAddress string
-}
-
-// TODO(q3k): factor this out to BMDB library?
-func runtimeInfo() string {
- hostname, _ := os.Hostname()
- if hostname == "" {
- hostname = "UNKNOWN"
- }
- return fmt.Sprintf("host %s", hostname)
-}
-
-func (c *Config) RegisterFlags() {
- c.Component.RegisterFlags("srv")
- c.BMDB.ComponentName = "srv"
- c.BMDB.RuntimeInfo = runtimeInfo()
- c.BMDB.Database.RegisterFlags("bmdb")
- c.Webug.RegisterFlags()
-
- flag.StringVar(&c.PublicListenAddress, "srv_public_grpc_listen_address", ":8080", "Address to listen at for public/user gRPC connections for bmdbsrv")
-}
-
-type Server struct {
- Config Config
-
- // ListenGRPC will contain the address at which the internal gRPC server is
- // listening after .Start() has been called. This can differ from the configured
- // value if the configuration requests any port (via :0).
- ListenGRPC string
- // ListenPublic will contain the address at which the 'public' (agent-facing)
- // gRPC server is listening after .Start() has been called.
- ListenPublic string
-
- bmdb *bmdb.Connection
- acsvc *agentCallbackService
-
- sessionC chan *bmdb.Session
-}
-
-// sessionWorker emits a valid BMDB session to sessionC as long as ctx is active.
-func (s *Server) sessionWorker(ctx context.Context) {
- var session *bmdb.Session
- for {
- if session == nil || session.Expired() {
- klog.Infof("Starting new session...")
- bo := backoff.NewExponentialBackOff()
- err := backoff.Retry(func() error {
- var err error
- session, err = s.bmdb.StartSession(ctx, bmdb.SessionOption{Processor: metrics.ProcessorBMSRV})
- if err != nil {
- klog.Errorf("Failed to start session: %v", err)
- return err
- } else {
- return nil
- }
- }, backoff.WithContext(bo, ctx))
- if err != nil {
- // If something's really wrong just crash.
- klog.Exitf("Gave up on starting session: %v", err)
- }
- klog.Infof("New session: %s", session.UUID)
- }
-
- select {
- case <-ctx.Done():
- return
- case s.sessionC <- session:
- }
- }
-}
-
-func (s *Server) session(ctx context.Context) (*bmdb.Session, error) {
- select {
- case sess := <-s.sessionC:
- return sess, nil
- case <-ctx.Done():
- return nil, ctx.Err()
- }
-}
-
-func (s *Server) startPublic(ctx context.Context) {
- g := grpc.NewServer(s.Config.Component.GRPCServerOptionsPublic()...)
- lis, err := net.Listen("tcp", s.Config.PublicListenAddress)
- if err != nil {
- klog.Exitf("Could not listen: %v", err)
- }
- s.ListenPublic = lis.Addr().String()
- apb.RegisterAgentCallbackServer(g, s.acsvc)
- reflection.Register(g)
-
- klog.Infof("Public API listening on %s", s.ListenPublic)
- go func() {
- err := g.Serve(lis)
- if !errors.Is(err, ctx.Err()) {
- klog.Exitf("Public gRPC serve failed: %v", err)
- }
- }()
-}
-
-func (s *Server) startInternalGRPC(ctx context.Context) {
- g := grpc.NewServer(s.Config.Component.GRPCServerOptions()...)
- lis, err := net.Listen("tcp", s.Config.Component.GRPCListenAddress)
- if err != nil {
- klog.Exitf("Could not listen: %v", err)
- }
- s.ListenGRPC = lis.Addr().String()
-
- reflection.Register(g)
- klog.Infof("Internal gRPC listening on %s", s.ListenGRPC)
- go func() {
- err := g.Serve(lis)
- if !errors.Is(err, ctx.Err()) {
- klog.Exitf("Internal gRPC serve failed: %v", err)
- }
- }()
-}
-
-// Start the BMaaS Server in background goroutines. This should only be called
-// once. The process will exit with debug logs if starting the server failed.
-func (s *Server) Start(ctx context.Context) {
- reg := s.Config.Component.PrometheusRegistry()
- s.Config.BMDB.EnableMetrics(reg)
- s.Config.Component.StartPrometheus(ctx)
-
- conn, err := s.Config.BMDB.Open(true)
- if err != nil {
- klog.Exitf("Failed to connect to BMDB: %v", err)
- }
- s.acsvc = &agentCallbackService{
- s: s,
- }
- s.bmdb = conn
- s.sessionC = make(chan *bmdb.Session)
- go s.sessionWorker(ctx)
- s.startInternalGRPC(ctx)
- s.startPublic(ctx)
- go func() {
- if err := s.Config.Webug.Start(ctx, conn); err != nil && !errors.Is(err, ctx.Err()) {
- klog.Exitf("Failed to start webug: %v", err)
- }
- }()
-}
diff --git a/cloud/equinix/cli/BUILD.bazel b/cloud/equinix/cli/BUILD.bazel
deleted file mode 100644
index 020e93c..0000000
--- a/cloud/equinix/cli/BUILD.bazel
+++ /dev/null
@@ -1,27 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library")
-
-go_library(
- name = "cli_lib",
- srcs = [
- "cmd_delete.go",
- "cmd_list.go",
- "cmd_move.go",
- "cmd_reboot.go",
- "cmd_yoink.go",
- "main.go",
- ],
- importpath = "source.monogon.dev/cloud/equinix/cli",
- visibility = ["//visibility:private"],
- deps = [
- "//cloud/equinix/wrapngo",
- "@com_github_packethost_packngo//:packngo",
- "@com_github_spf13_cobra//:cobra",
- "@io_k8s_klog_v2//:klog",
- ],
-)
-
-go_binary(
- name = "cli",
- embed = [":cli_lib"],
- visibility = ["//visibility:public"],
-)
diff --git a/cloud/equinix/cli/cmd_delete.go b/cloud/equinix/cli/cmd_delete.go
deleted file mode 100644
index 5e1d132..0000000
--- a/cloud/equinix/cli/cmd_delete.go
+++ /dev/null
@@ -1,65 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package main
-
-import (
- "context"
- "os"
- "os/signal"
- "time"
-
- "github.com/packethost/packngo"
- "github.com/spf13/cobra"
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/equinix/wrapngo"
-)
-
-var deleteCmd = &cobra.Command{
- Use: "delete [target]",
- Short: "Delete all devices from one project",
- Args: cobra.ExactArgs(1),
- Run: doDelete,
-}
-
-func init() {
- rootCmd.AddCommand(deleteCmd)
-}
-
-func doDelete(cmd *cobra.Command, args []string) {
- ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
- api := wrapngo.New(&c)
-
- klog.Infof("Listing devices for %q", args[0])
-
- devices, err := api.ListDevices(ctx, args[0])
- if err != nil {
- klog.Exitf("failed listing devices: %v", err)
- }
-
- if len(devices) == 0 {
- klog.Infof("No devices found in %s", args[0])
- return
- }
-
- klog.Infof("Deleting %d Devices in %s. THIS WILL DELETE SERVERS! You have five seconds to cancel!", len(devices), args[0])
- time.Sleep(5 * time.Second)
-
- for _, d := range devices {
- h := "deleted-" + d.Hostname
- _, err := api.UpdateDevice(ctx, d.ID, &packngo.DeviceUpdateRequest{
- Hostname: &h,
- })
- if err != nil {
- klog.Infof("failed updating device %s (%s): %v", d.ID, d.Hostname, err)
- continue
- }
-
- klog.Infof("deleting %s (%s)...", d.ID, d.Hostname)
- if err := api.DeleteDevice(ctx, d.ID); err != nil {
- klog.Infof("failed deleting device %s (%s): %v", d.ID, d.Hostname, err)
- continue
- }
- }
-}
diff --git a/cloud/equinix/cli/cmd_list.go b/cloud/equinix/cli/cmd_list.go
deleted file mode 100644
index 917494a..0000000
--- a/cloud/equinix/cli/cmd_list.go
+++ /dev/null
@@ -1,108 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package main
-
-import (
- "context"
- "fmt"
- "os"
- "os/signal"
- "slices"
- "strings"
-
- "github.com/packethost/packngo"
- "github.com/spf13/cobra"
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/equinix/wrapngo"
-)
-
-var listCmd = &cobra.Command{
- Use: "list",
- Long: `This lists all hardware reservations inside a specified organization or project.`,
- Args: cobra.NoArgs,
- Run: doList,
-}
-
-func init() {
- listCmd.Flags().String("equinix_organization", "", "which organization to list reservations from")
- listCmd.Flags().String("equinix_project", "", "which project to list reservations from")
- rootCmd.AddCommand(listCmd)
-}
-
-func doList(cmd *cobra.Command, args []string) {
- organization, err := cmd.Flags().GetString("equinix_organization")
- if err != nil {
- klog.Exitf("flag: %v", err)
- }
-
- project, err := cmd.Flags().GetString("equinix_project")
- if err != nil {
- klog.Exitf("flag: %v", err)
- }
-
- if organization == "" && project == "" {
- klog.Exitf("missing organization or project flag")
- }
-
- ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
- api := wrapngo.New(&c)
-
- var (
- reservations []packngo.HardwareReservation
- )
- switch {
- case project != "" && organization == "":
- klog.Infof("Listing reservations for project: %s", project)
- reservations, err = api.ListReservations(ctx, project)
- case organization != "" && project == "":
- klog.Infof("Listing reservations for organization: %s", organization)
- reservations, err = api.ListOrganizationReservations(ctx, organization)
- default:
- klog.Exitf("exactly one of organization or project flags has to be set")
- }
-
- if err != nil {
- klog.Fatalf("Failed to list reservations: %v", err)
- }
-
- type configDC struct {
- config string
- dc string
- }
- type configDCP struct {
- configDC
- project string
- }
- mtypes := make(map[configDC]int)
- mptypes := make(map[configDCP]int)
-
- klog.Infof("Got %d reservations", len(reservations))
- for _, r := range reservations {
- curType := configDC{config: strings.ToLower(r.Plan.Name), dc: strings.ToLower(r.Facility.Metro.Code)}
- curPType := configDCP{curType, r.Project.Name}
- mtypes[curType]++
- mptypes[curPType]++
- }
-
- klog.Infof("Found the following configurations:")
- var mStrings []string
- for dc, c := range mtypes {
- mStrings = append(mStrings, fmt.Sprintf("%s | %s | %d", dc.dc, dc.config, c))
- }
- slices.Sort(mStrings)
- for _, s := range mStrings {
- klog.Info(s)
- }
-
- klog.Infof("Found the following configurations (per project):")
- var mpStrings []string
- for dc, c := range mptypes {
- mpStrings = append(mpStrings, fmt.Sprintf("%s | %s | %s | %d", dc.project, dc.dc, dc.config, c))
- }
- slices.Sort(mpStrings)
- for _, s := range mpStrings {
- klog.Info(s)
- }
-}
diff --git a/cloud/equinix/cli/cmd_move.go b/cloud/equinix/cli/cmd_move.go
deleted file mode 100644
index e835a7e..0000000
--- a/cloud/equinix/cli/cmd_move.go
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package main
-
-import (
- "context"
- "os"
- "os/signal"
-
- "github.com/spf13/cobra"
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/equinix/wrapngo"
-)
-
-var moveCmd = &cobra.Command{
- Use: "move [source] [target]",
- Short: "Move all reserved hardware from one to another project",
- Args: cobra.ExactArgs(2),
- Run: doMove,
-}
-
-func init() {
- rootCmd.AddCommand(moveCmd)
-}
-
-func doMove(cmd *cobra.Command, args []string) {
- ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
- api := wrapngo.New(&c)
-
- klog.Infof("Listing reservations for %q", args[0])
- reservations, err := api.ListReservations(ctx, args[0])
- if err != nil {
- klog.Exitf("failed listing reservations: %v", err)
- }
-
- klog.Infof("Got %d reservations. Moving machines", len(reservations))
- for _, r := range reservations {
- _, err := api.MoveReservation(ctx, r.ID, args[1])
- if err != nil {
- klog.Errorf("failed moving reservation: %v", err)
- continue
- }
- klog.Infof("Moved Device %s", r.ID)
- }
-}
diff --git a/cloud/equinix/cli/cmd_reboot.go b/cloud/equinix/cli/cmd_reboot.go
deleted file mode 100644
index 0f7f317..0000000
--- a/cloud/equinix/cli/cmd_reboot.go
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package main
-
-import (
- "context"
- "os"
- "os/signal"
-
- "github.com/spf13/cobra"
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/equinix/wrapngo"
-)
-
-var rebootCmd = &cobra.Command{
- Use: "reboot [project] [id]",
- Short: "Reboots all or one specific node",
- Args: cobra.MaximumNArgs(1),
- Run: doReboot,
-}
-
-func init() {
- rootCmd.AddCommand(rebootCmd)
-}
-
-func doReboot(cmd *cobra.Command, args []string) {
- ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
- api := wrapngo.New(&c)
-
- klog.Infof("Requesting device list...")
- devices, err := api.ListDevices(ctx, args[0])
- if err != nil {
- klog.Fatal(err)
- }
-
- for _, d := range devices {
- if len(args) == 2 && args[1] != d.ID {
- continue
- }
-
- err := api.RebootDevice(ctx, d.ID)
- if err != nil {
- klog.Error(err)
- continue
- }
- klog.Infof("rebooted %s", d.ID)
- }
-}
diff --git a/cloud/equinix/cli/cmd_yoink.go b/cloud/equinix/cli/cmd_yoink.go
deleted file mode 100644
index 1de5803..0000000
--- a/cloud/equinix/cli/cmd_yoink.go
+++ /dev/null
@@ -1,173 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package main
-
-import (
- "bufio"
- "context"
- "os"
- "os/signal"
- "sort"
- "strconv"
- "strings"
-
- "github.com/packethost/packngo"
- "github.com/spf13/cobra"
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/equinix/wrapngo"
-)
-
-var yoinkCmd = &cobra.Command{
- Use: "yoink",
- Long: `This moves a specified number of servers that match the given spec to a different metro.
-While spec is an easy-to-find argument that matches the Equinix system spec, e.g. w3amd.75xx24c.512.8160.x86,
-metro does not represent the public-facing name. Instead it is the actual datacenter name, e.g. fr2`,
- Short: "Move a server based on the spec from one project to another",
- Args: cobra.NoArgs,
- Run: doYoink,
-}
-
-func init() {
- yoinkCmd.Flags().Int("count", 1, "how many machines should be moved")
- yoinkCmd.Flags().String("equinix_source_project", "", "from which project should the machine be yoinked")
- yoinkCmd.Flags().String("equinix_target_project", "", "to which project should the machine be moved")
- yoinkCmd.Flags().String("spec", "", "which device spec should be moved")
- yoinkCmd.Flags().String("metro", "", "to which metro the machines should be moved")
- rootCmd.AddCommand(yoinkCmd)
-}
-
-func doYoink(cmd *cobra.Command, args []string) {
- srcProject, err := cmd.Flags().GetString("equinix_source_project")
- if err != nil {
- klog.Exitf("flag: %v", err)
- }
-
- dstProject, err := cmd.Flags().GetString("equinix_target_project")
- if err != nil {
- klog.Exitf("flag: %v", err)
- }
-
- if srcProject == "" || dstProject == "" {
- klog.Exitf("missing project flags")
- }
-
- count, err := cmd.Flags().GetInt("count")
- if err != nil {
- klog.Exitf("flag: %v", err)
- }
-
- spec, err := cmd.Flags().GetString("spec")
- if err != nil {
- klog.Exitf("flag: %v", err)
- }
-
- if spec == "" {
- klog.Exitf("missing spec flag")
- }
-
- metro, err := cmd.Flags().GetString("metro")
- if err != nil {
- klog.Exitf("flag: %v", err)
- }
-
- if metro == "" {
- klog.Exitf("missing metro flag")
- }
-
- ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
- api := wrapngo.New(&c)
-
- klog.Infof("Listing reservations for %q", srcProject)
- reservations, err := api.ListReservations(ctx, srcProject)
- if err != nil {
- klog.Exitf("Failed to list reservations: %v", err)
- }
-
- type configDC struct {
- config string
- dc string
- }
- mtypes := make(map[configDC]int)
-
- var matchingReservations []packngo.HardwareReservation
- reqType := configDC{config: strings.ToLower(spec), dc: strings.ToLower(metro)}
-
- klog.Infof("Got %d reservations", len(reservations))
- for _, r := range reservations {
- curType := configDC{config: strings.ToLower(r.Plan.Name), dc: strings.ToLower(r.Facility.Metro.Code)}
-
- mtypes[curType]++
- if curType == reqType {
- matchingReservations = append(matchingReservations, r)
- }
- }
-
- klog.Infof("Found the following configurations:")
- for dc, c := range mtypes {
- klog.Infof("%s | %s | %d", dc.dc, dc.config, c)
- }
-
- if len(matchingReservations) == 0 {
- klog.Exitf("Configuration not found: %s - %s", reqType.dc, reqType.config)
- }
-
- if len(matchingReservations)-count < 0 {
- klog.Exitf("Not enough machines with matching configuration found")
- }
-
- // prefer hosts that are not deployed
- sort.Slice(matchingReservations, func(i, j int) bool {
- return matchingReservations[i].Device == nil && matchingReservations[j].Device != nil
- })
-
- toMove := matchingReservations[:count]
- var toDelete []string
- for _, r := range toMove {
- if r.Device != nil {
- toDelete = append(toDelete, r.Device.Hostname)
- }
- }
-
- stdInReader := bufio.NewReader(os.Stdin)
- klog.Infof("Will move %d machines with spec %s in %s from %s to %s.", count, spec, metro, srcProject, dstProject)
- if len(toDelete) > 0 {
- klog.Warningf("Not enough free machines found. This will delete %d provisioned hosts! Hosts scheduled for deletion: ", len(toDelete))
- klog.Warningf("%s", strings.Join(toDelete, ", "))
- klog.Warningf("Please confirm by inputting in the number of machines that will be moved.")
-
- read, err := stdInReader.ReadString('\n')
- if err != nil {
- klog.Exitf("failed reading input: %v", err)
- }
-
- atoi, err := strconv.Atoi(strings.TrimSpace(read))
- if err != nil {
- klog.Exitf("failed parsing number: %v", err)
- }
-
- if atoi != len(toDelete) {
- klog.Exitf("Confirmation failed! Wanted %d, got %d", len(toDelete), atoi)
- } else {
- klog.Infof("Thanks for the confirmation! continuing...")
- }
- }
-
- klog.Infof("Note: It can be normal for a device move to fail due to project validation issues. This is a known issue and can be ignored")
- for _, r := range matchingReservations[:count] {
- if r.Device != nil {
- klog.Warningf("Deleting server %s (%s) on %s", r.Device.ID, r.Device.Hostname, r.ID)
-
- if err := api.DeleteDevice(ctx, r.Device.ID); err != nil {
- klog.Errorf("failed deleting device %s (%s): %v", r.Device.ID, r.Device.Hostname, err)
- continue
- }
- }
-
- _, err := api.MoveReservation(ctx, r.ID, dstProject)
- if err != nil {
- klog.Errorf("failed moving device %s: %v", r.ID, err)
- }
- }
-}
diff --git a/cloud/equinix/cli/main.go b/cloud/equinix/cli/main.go
deleted file mode 100644
index 85de18b..0000000
--- a/cloud/equinix/cli/main.go
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package main
-
-import (
- "flag"
-
- "github.com/spf13/cobra"
-
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/equinix/wrapngo"
-)
-
-// rootCmd represents the base command when called without any subcommands
-var rootCmd = &cobra.Command{
- PersistentPreRunE: func(cmd *cobra.Command, args []string) error {
- if c.APIKey == "" || c.User == "" {
- klog.Exitf("-equinix_api_username and -equinix_api_key must be set")
- }
- return nil
- },
-}
-
-var c wrapngo.Opts
-
-func init() {
- c.RegisterFlags()
- rootCmd.PersistentFlags().AddGoFlagSet(flag.CommandLine)
-}
-
-func main() {
- cobra.CheckErr(rootCmd.Execute())
-}
diff --git a/cloud/equinix/wrapngo/BUILD.bazel b/cloud/equinix/wrapngo/BUILD.bazel
deleted file mode 100644
index 1574a6a..0000000
--- a/cloud/equinix/wrapngo/BUILD.bazel
+++ /dev/null
@@ -1,31 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
-
-go_library(
- name = "wrapngo",
- srcs = [
- "duct_tape.go",
- "metrics.go",
- "wrapn.go",
- ],
- importpath = "source.monogon.dev/cloud/equinix/wrapngo",
- visibility = ["//visibility:public"],
- deps = [
- "@com_github_cenkalti_backoff_v4//:backoff",
- "@com_github_google_uuid//:uuid",
- "@com_github_packethost_packngo//:packngo",
- "@com_github_prometheus_client_golang//prometheus",
- "@io_k8s_klog_v2//:klog",
- ],
-)
-
-go_test(
- name = "wrapngo_test",
- timeout = "eternal",
- srcs = ["wrapngo_live_test.go"],
- args = ["-test.v"],
- embed = [":wrapngo"],
- deps = [
- "@com_github_packethost_packngo//:packngo",
- "@org_golang_x_crypto//ssh",
- ],
-)
diff --git a/cloud/equinix/wrapngo/duct_tape.go b/cloud/equinix/wrapngo/duct_tape.go
deleted file mode 100644
index 21581ce..0000000
--- a/cloud/equinix/wrapngo/duct_tape.go
+++ /dev/null
@@ -1,129 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package wrapngo
-
-import (
- "context"
- "errors"
- "fmt"
- "net/http"
- "time"
-
- "github.com/cenkalti/backoff/v4"
- "github.com/packethost/packngo"
- "k8s.io/klog/v2"
-)
-
-// wrap a given fn in some reliability-increasing duct tape: context support and
-// exponential backoff retries for intermittent connectivity issues. This allows
-// us to use packngo code instead of writing our own API stub for Equinix Metal.
-//
-// The given fn will be retried until it returns a 'permanent' Equinix error (see
-// isPermanentEquinixError) or the given context expires. Additionally, fn will
-// be called with a brand new packngo client tied to the context of the wrap
-// call. Finally, the given client will also have some logging middleware
-// attached to it which can be activated by setting verbosity 5 (or greater) on
-// this file.
-//
-// The wrapped fn can be either just a plain packngo method or some complicated
-// idempotent logic, as long as it cooperates with the above contract.
-func wrap[U any](ctx context.Context, cl *client, fn func(*packngo.Client) (U, error)) (U, error) {
- var zero U
- if err := cl.serializer.up(ctx); err != nil {
- return zero, err
- }
- defer cl.serializer.down()
-
- bc := backoff.WithContext(cl.o.BackOff(), ctx)
- pngo, err := cl.clientForContext(ctx)
- if err != nil {
- // Generally this shouldn't happen other than with programming errors, so we
- // don't back this off.
- return zero, fmt.Errorf("could not create equinix client: %w", err)
- }
-
- var res U
- err = backoff.Retry(func() error {
- res, err = fn(pngo)
- if isPermanentEquinixError(err) {
- return backoff.Permanent(err)
- }
- return err
- }, bc)
- if err != nil {
- return zero, err
- }
- return res, nil
-}
-
-type injectContextRoundTripper struct {
- ctx context.Context
- original http.RoundTripper
- metrics *metricsSet
-}
-
-func (r *injectContextRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
- klog.V(5).Infof("Request -> %v", req.URL.String())
- start := time.Now()
- res, err := r.original.RoundTrip(req.WithContext(r.ctx))
- latency := time.Since(start)
- r.metrics.onAPIRequestDone(req, res, err, latency)
-
- if err != nil {
- klog.V(5).Infof("HTTP error <- %v", err)
- } else {
- klog.V(5).Infof("Response <- %v", res.Status)
- }
- return res, err
-}
-
-func (c *client) clientForContext(ctx context.Context) (*packngo.Client, error) {
- httpcl := &http.Client{
- Transport: &injectContextRoundTripper{
- ctx: ctx,
- original: http.DefaultTransport,
- metrics: c.metrics,
- },
- }
- return packngo.NewClient(packngo.WithAuth(c.username, c.token), packngo.WithHTTPClient(httpcl))
-}
-
-// httpStatusCode extracts the status code from error values returned by
-// packngo methods.
-func httpStatusCode(err error) int {
- var er *packngo.ErrorResponse
- if err != nil && errors.As(err, &er) {
- return er.Response.StatusCode
- }
- return -1
-}
-
-// IsNotFound returns true if the given error is an Equinix packngo/wrapngo 'not
-// found' error.
-func IsNotFound(err error) bool {
- return httpStatusCode(err) == http.StatusNotFound
-}
-
-func isPermanentEquinixError(err error) bool {
- // Invalid argument/state errors from wrapping.
- if errors.Is(err, ErrRaceLost) {
- return true
- }
- if errors.Is(err, ErrNoReservationProvided) {
- return true
- }
- // Real errors returned from equinix.
- st := httpStatusCode(err)
- switch st {
- case http.StatusUnauthorized:
- return true
- case http.StatusForbidden:
- return true
- case http.StatusNotFound:
- return true
- case http.StatusUnprocessableEntity:
- return true
- }
- return false
-}
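
To illustrate how the wrap helper above was intended to be used, a Client method such as ListDevices (declared in the interface further down) could be written roughly as follows. This is a sketch, not the deleted implementation, and the packngo Devices.List call signature is an assumption.

    // ListDevices lists all devices in project pid; retries, rate limiting and
    // context handling come from wrap.
    func (c *client) ListDevices(ctx context.Context, pid string) ([]packngo.Device, error) {
        return wrap(ctx, c, func(p *packngo.Client) ([]packngo.Device, error) {
            devices, _, err := p.Devices.List(pid, nil)
            return devices, err
        })
    }
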
diff --git a/cloud/equinix/wrapngo/metrics.go b/cloud/equinix/wrapngo/metrics.go
deleted file mode 100644
index f6cabcb..0000000
--- a/cloud/equinix/wrapngo/metrics.go
+++ /dev/null
@@ -1,132 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package wrapngo
-
-import (
- "context"
- "errors"
- "fmt"
- "net/http"
- "regexp"
- "strings"
- "time"
-
- "github.com/prometheus/client_golang/prometheus"
- "k8s.io/klog/v2"
-)
-
-// metricsSet contains all the Prometheus metrics collected by wrapngo.
-type metricsSet struct {
- requestLatencies *prometheus.HistogramVec
- waiting prometheus.GaugeFunc
- inFlight prometheus.GaugeFunc
-}
-
-func newMetricsSet(ser *serializer) *metricsSet {
- return &metricsSet{
- requestLatencies: prometheus.NewHistogramVec(
- prometheus.HistogramOpts{
- Name: "equinix_api_latency",
- Help: "Equinix API request latency in seconds, partitioned by endpoint and status code",
- },
- []string{"endpoint", "status_code"},
- ),
- waiting: prometheus.NewGaugeFunc(
- prometheus.GaugeOpts{
- Name: "equinix_api_waiting",
- Help: "Number of API requests pending to be sent to Equinix but waiting on semaphore",
- },
- func() float64 {
- _, waiting := ser.stats()
- return float64(waiting)
- },
- ),
- inFlight: prometheus.NewGaugeFunc(
- prometheus.GaugeOpts{
- Name: "equinix_api_in_flight",
- Help: "Number of API requests currently being processed by Equinix",
- },
- func() float64 {
- inFlight, _ := ser.stats()
- return float64(inFlight)
- },
- ),
- }
-}
-
-// getEndpointForPath converts from an Equinix API method and path (eg.
-// /metal/v1/devices/deadbeef) into an 'endpoint' name, which is an imaginary,
-// Monogon-specific name for the API endpoint accessed by this call.
-//
-// If the given path is unknown and thus cannot be converted to an endpoint name,
-// 'Unknown' is returned and a warning is logged.
-//
-// We use this function to partition request statistics per API 'endpoint'. An
-// alternative to this would be to record high-level packngo function names, but
-// one packngo function call might actually emit multiple HTTP API requests - so
-// we're stuck recording the low-level requests and gathering statistics from
-// there instead.
-func getEndpointForPath(method, path string) string {
- path = strings.TrimPrefix(path, "/metal/v1")
- for name, match := range endpointNames {
- if match.matches(method, path) {
- return name
- }
- }
- klog.Warningf("Unknown Equinix API %s %s - cannot determine metric endpoint name", method, path)
- return "Unknown"
-}
-
-// requestMatch is used to match a HTTP request method/path.
-type requestMatch struct {
- method string
- regexp *regexp.Regexp
-}
-
-func (r *requestMatch) matches(method, path string) bool {
- if r.method != method {
- return false
- }
- return r.regexp.MatchString(path)
-}
-
-var (
- endpointNames = map[string]requestMatch{
- "GetDevice": {"GET", regexp.MustCompile(`^/devices/[^/]+$`)},
- "ListDevices": {"GET", regexp.MustCompile(`^/(organizations|projects)/[^/]+/devices$`)},
- "CreateDevice": {"POST", regexp.MustCompile(`^/projects/[^/]+/devices$`)},
- "ListReservations": {"GET", regexp.MustCompile(`^/projects/[^/]+/hardware-reservations$`)},
- "ListSSHKeys": {"GET", regexp.MustCompile(`^/ssh-keys$`)},
- "CreateSSHKey": {"POST", regexp.MustCompile(`^/project/[^/]+/ssh-keys$`)},
- "GetSSHKey": {"GET", regexp.MustCompile(`^/ssh-keys/[^/]+$`)},
- "UpdateSSHKey": {"PATCH", regexp.MustCompile(`^/ssh-keys/[^/]+$`)},
- "PerformDeviceAction": {"POST", regexp.MustCompile(`^/devices/[^/]+/actions$`)},
- }
-)
-
-// onAPIRequestDone is called by the wrapngo code on every API response from
-// Equinix, and records the given parameters into metrics.
-func (m *metricsSet) onAPIRequestDone(req *http.Request, res *http.Response, err error, latency time.Duration) {
- if m == nil {
- return
- }
-
- code := "unknown"
- if err == nil {
- code = fmt.Sprintf("%d", res.StatusCode)
- } else {
- switch {
- case errors.Is(err, context.Canceled):
- code = "ctx canceled"
- case errors.Is(err, context.DeadlineExceeded):
- code = "deadline exceeded"
- }
- }
- if code == "unknown" {
- klog.Warningf("Unexpected HTTP result: req %s %s, error: %v", req.Method, req.URL.Path, err)
- }
-
- endpoint := getEndpointForPath(req.Method, req.URL.Path)
- m.requestLatencies.With(prometheus.Labels{"endpoint": endpoint, "status_code": code}).Observe(latency.Seconds())
-}
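
The endpoint mapping above can be pinned down with a small test like the hypothetical one below; it was not part of the original package and assumes the standard "testing" import.

    func TestGetEndpointForPath(t *testing.T) {
        // Known paths are matched after the /metal/v1 prefix is stripped.
        if got := getEndpointForPath("GET", "/metal/v1/devices/deadbeef"); got != "GetDevice" {
            t.Errorf("got %q, want GetDevice", got)
        }
        // Anything unmatched falls back to "Unknown" (and logs a warning).
        if got := getEndpointForPath("GET", "/metal/v1/some/new/path"); got != "Unknown" {
            t.Errorf("got %q, want Unknown", got)
        }
    }
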
diff --git a/cloud/equinix/wrapngo/wrapn.go b/cloud/equinix/wrapngo/wrapn.go
deleted file mode 100644
index eee0b57..0000000
--- a/cloud/equinix/wrapngo/wrapn.go
+++ /dev/null
@@ -1,446 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-// Package wrapngo wraps packngo methods providing the following usability
-// enhancements:
-// - API call rate limiting
-// - resource-aware call retries
-// - use of a configurable back-off algorithm implementation
-// - context awareness
-//
-// The implementation is provided with the following caveats:
-//
-// There can be only one call in flight. Concurrent calls to API-related
-// methods of the same client will block. Calls returning packngo structs will
-// return nil data when a non-nil error value is returned. An
-// os.ErrDeadlineExceeded will be returned after the underlying API calls time
-// out beyond the chosen back-off algorithm implementation's maximum allowed
-// retry interval. Other errors, excluding context.Canceled and
-// context.DeadlineExceeded, indicate either an error originating at Equinix'
-// API endpoint (which may still stem from invalid call inputs), or a network
-// error.
-//
-// Packngo wrappers included below may return timeout errors even after the
-// wrapped calls succeed, in the event the server reply could not be
-// received.
-//
-// This implies that effects of mutating calls can't always be verified
-// atomically, requiring explicit synchronization between API users, regardless
-// of the retry/recovery logic used.
-//
-// Having that in mind, some call wrappers exposed by this package will attempt
-// to recover from this kind of situation by requesting information on any
-// resources created, and retrying the call if needed. This approach assumes
-// any concurrent mutating API users will be synchronized, as it should be in
-// any case.
-//
-// Another way of handling this problem would be to leave it up to the user to
-// retry calls if needed, though this would leak the Equinix Metal API, and
-// complicate implementations depending on this package. Due to that, the prior
-// approach was chosen.
-package wrapngo
-
-import (
- "context"
- "errors"
- "flag"
- "fmt"
- "net/http"
- "sync/atomic"
- "time"
-
- "github.com/cenkalti/backoff/v4"
- "github.com/google/uuid"
- "github.com/packethost/packngo"
- "github.com/prometheus/client_golang/prometheus"
-)
-
-// Opts conveys configurable Client parameters.
-type Opts struct {
- // User and APIKey are the credentials used to authenticate with
- // Metal API.
-
- User string
- APIKey string
-
- // Optional parameters:
-
- // BackOff controls the client's behavior in the event of API calls failing
- // due to IO timeouts by adjusting the lower bound on time taken between
- // subsequent calls.
- BackOff func() backoff.BackOff
-
- // APIRate is the minimum time taken between subsequent API calls.
- APIRate time.Duration
-
- // Parallelism defines how many calls to the Equinix API will be issued in
- // parallel. When this limit is reached, subsequent attempts to call the API will
- // block. The order of serving of pending calls is currently undefined.
- //
- // If not defined (ie. 0), defaults to 1.
- Parallelism int
-
- MetricsRegistry *prometheus.Registry
-}
-
-func (o *Opts) RegisterFlags() {
- flag.StringVar(&o.User, "equinix_api_username", "", "Username for Equinix API")
- flag.StringVar(&o.APIKey, "equinix_api_key", "", "Key/token/password for Equinix API")
- flag.IntVar(&o.Parallelism, "equinix_parallelism", 3, "How many parallel connections to the Equinix API will be allowed")
-}
-
-// Client is a limited interface of methods that the Shepherd uses on Equinix. It
-// is provided to allow for dependency injection of a fake equinix API for tests.
-type Client interface {
- // GetDevice wraps packngo's cl.Devices.Get.
- //
- // TODO(q3k): remove unused pid parameter.
- GetDevice(ctx context.Context, pid, did string, opts *packngo.ListOptions) (*packngo.Device, error)
- // ListDevices wraps packngo's cl.Device.List.
- ListDevices(ctx context.Context, pid string) ([]packngo.Device, error)
- // CreateDevice attempts to create a new device according to the provided
- // request. The request _must_ configure a HardwareReservationID. This call
- // attempts to be as idempotent as possible, and will return ErrRaceLost if a
- // retry was needed but in the meantime the requested hardware reservation from
- // which this machine was requested got lost.
- CreateDevice(ctx context.Context, request *packngo.DeviceCreateRequest) (*packngo.Device, error)
-
- UpdateDevice(ctx context.Context, id string, request *packngo.DeviceUpdateRequest) (*packngo.Device, error)
- RebootDevice(ctx context.Context, did string) error
- DeleteDevice(ctx context.Context, id string) error
-
- // ListReservations returns a complete list of hardware reservations associated
- // with project pid. This is an expensive method that takes a while to execute,
- // handle with care.
- ListReservations(ctx context.Context, pid string) ([]packngo.HardwareReservation, error)
-
- ListOrganizationReservations(ctx context.Context, oid string) ([]packngo.HardwareReservation, error)
-
- // MoveReservation moves a reserved device to the given project.
- MoveReservation(ctx context.Context, hardwareReservationDID, projectID string) (*packngo.HardwareReservation, error)
-
- // ListSSHKeys wraps packngo's cl.Keys.List.
- ListSSHKeys(ctx context.Context) ([]packngo.SSHKey, error)
- // CreateSSHKey is idempotent - the key label can be used only once. Further
- // calls referring to the same label and key will not yield errors. See the
- // package comment for more info on this method's behavior and returned error
- // values.
- CreateSSHKey(ctx context.Context, req *packngo.SSHKeyCreateRequest) (*packngo.SSHKey, error)
- // UpdateSSHKey is idempotent - values included in r can be applied only once,
- // while subsequent updates using the same data don't produce errors. See the
- // package comment for information on this method's behavior and returned error
- // values.
- UpdateSSHKey(ctx context.Context, kid string, req *packngo.SSHKeyUpdateRequest) (*packngo.SSHKey, error)
-
- Close()
-}
-
-// client implements the Client interface.
-type client struct {
- username string
- token string
- o *Opts
- rlt *time.Ticker
-
- serializer *serializer
- metrics *metricsSet
-}
-
-// serializer is an N-semaphore channel (configured by opts.Parallelism) which is
-// used to limit the number of concurrent calls to the Equinix API.
-//
-// In addition, it implements some simple waiting/usage statistics for
-// metrics/introspection.
-type serializer struct {
- sem chan struct{}
- usage int64
- waiting int64
-}
-
-// up blocks until the serializer has at least one available concurrent call
-// slot. If the given context expires before such a slot is available, the
-// context error is returned.
-func (s *serializer) up(ctx context.Context) error {
- atomic.AddInt64(&s.waiting, 1)
- select {
- case s.sem <- struct{}{}:
- atomic.AddInt64(&s.waiting, -1)
- atomic.AddInt64(&s.usage, 1)
- return nil
- case <-ctx.Done():
- atomic.AddInt64(&s.waiting, -1)
- return ctx.Err()
- }
-}
-
-// down releases a previously acquired concurrent call slot.
-func (s *serializer) down() {
- atomic.AddInt64(&s.usage, -1)
- <-s.sem
-}
-
-// stats returns the number of in-flight and waiting-for-semaphore requests.
-func (s *serializer) stats() (usage, waiting int64) {
- usage = atomic.LoadInt64(&s.usage)
- waiting = atomic.LoadInt64(&s.waiting)
- return
-}
-
-// New creates a Client instance based on Opts. The PACKNGO_DEBUG environment
-// variable can be set prior to calling New to enable verbose packngo debug
-// logs.
-func New(opts *Opts) Client {
- return newClient(opts)
-}
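-
-// A minimal usage sketch, from a caller's perspective (the credentials, ctx
-// and projectID values below are assumptions for illustration, not part of
-// this package's API):
-//
-//	cl := wrapngo.New(&wrapngo.Opts{
-//		User:        "api-user",
-//		APIKey:      "api-key",
-//		Parallelism: 3,
-//	})
-//	defer cl.Close()
-//	devices, err := cl.ListDevices(ctx, projectID)
-//	if err != nil {
-//		// handle the error
-//	}
-//	_ = devices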
-
-func newClient(opts *Opts) *client {
- // Apply the defaults.
- if opts.APIRate == 0 {
- opts.APIRate = 2 * time.Second
- }
- if opts.BackOff == nil {
- opts.BackOff = func() backoff.BackOff {
- return backoff.NewExponentialBackOff()
- }
- }
- if opts.Parallelism == 0 {
- opts.Parallelism = 1
- }
-
- cl := &client{
- username: opts.User,
- token: opts.APIKey,
- o: opts,
- rlt: time.NewTicker(opts.APIRate),
-
- serializer: &serializer{
- sem: make(chan struct{}, opts.Parallelism),
- },
- }
- if opts.MetricsRegistry != nil {
- ms := newMetricsSet(cl.serializer)
- opts.MetricsRegistry.MustRegister(ms.inFlight, ms.waiting, ms.requestLatencies)
- cl.metrics = ms
- }
- return cl
-}
-
-func (c *client) Close() {
- c.rlt.Stop()
-}
-
-var (
- ErrRaceLost = errors.New("race lost with another API user")
- ErrNoReservationProvided = errors.New("hardware reservation must be set")
-)
-
-func (c *client) PowerOffDevice(ctx context.Context, did string) error {
- _, err := wrap(ctx, c, func(p *packngo.Client) (*packngo.Response, error) {
- r, err := p.Devices.PowerOff(did)
- if err != nil {
- return nil, fmt.Errorf("Devices.PowerOff: %w", err)
- }
- return r, nil
- })
- return err
-}
-
-func (c *client) PowerOnDevice(ctx context.Context, did string) error {
- _, err := wrap(ctx, c, func(p *packngo.Client) (*packngo.Response, error) {
- r, err := p.Devices.PowerOn(did)
- if err != nil {
- return nil, fmt.Errorf("Devices.PowerOn: %w", err)
- }
- return r, nil
- })
- return err
-}
-
-func (c *client) DeleteDevice(ctx context.Context, id string) error {
- _, err := wrap(ctx, c, func(p *packngo.Client) (*packngo.Response, error) {
- r, err := p.Devices.Delete(id, false)
- if err != nil {
- return nil, fmt.Errorf("Devices.Delete: %w", err)
- }
- return r, nil
- })
- return err
-}
-
-func (c *client) CreateDevice(ctx context.Context, r *packngo.DeviceCreateRequest) (*packngo.Device, error) {
- if r.HardwareReservationID == "" {
- return nil, ErrNoReservationProvided
- }
- // Add a tag to the request to detect if someone snatches a hardware reservation
- // from under us.
- witnessTag := fmt.Sprintf("wrapngo-idempotency-%s", uuid.New().String())
- r.Tags = append(r.Tags, witnessTag)
-
- return wrap(ctx, c, func(cl *packngo.Client) (*packngo.Device, error) {
- // Does the device already exist?
- res, _, err := cl.HardwareReservations.Get(r.HardwareReservationID, nil)
- if err != nil {
- return nil, fmt.Errorf("couldn't check if device already exists: %w", err)
- }
- if res == nil {
- return nil, fmt.Errorf("unexpected nil response")
- }
- if res.Device != nil {
- // Check if we lost the race for this hardware reservation.
- tags := make(map[string]bool)
- for _, tag := range res.Device.Tags {
- tags[tag] = true
- }
- if !tags[witnessTag] {
- return nil, ErrRaceLost
- }
- return res.Device, nil
- }
-
- // No device yet. Try to create it.
- dev, _, err := cl.Devices.Create(r)
- if err == nil {
- return dev, nil
- }
- // In case of a transient failure (eg. network issue), we retry the whole
- // operation, which means we first check again if the device already exists. If
- // it's a permanent error from the API, the backoff logic will fail immediately.
- return nil, fmt.Errorf("couldn't create device: %w", err)
- })
-}
-
-func (c *client) UpdateDevice(ctx context.Context, id string, r *packngo.DeviceUpdateRequest) (*packngo.Device, error) {
- return wrap(ctx, c, func(cl *packngo.Client) (*packngo.Device, error) {
- dev, _, err := cl.Devices.Update(id, r)
- return dev, err
- })
-}
-
-func (c *client) ListDevices(ctx context.Context, pid string) ([]packngo.Device, error) {
- return wrap(ctx, c, func(cl *packngo.Client) ([]packngo.Device, error) {
- // To increase the chances of stable pagination, we sort the devices by hostname.
- res, _, err := cl.Devices.List(pid, &packngo.GetOptions{SortBy: "hostname"})
- return res, err
- })
-}
-
-func (c *client) GetDevice(ctx context.Context, pid, did string, opts *packngo.ListOptions) (*packngo.Device, error) {
- return wrap(ctx, c, func(cl *packngo.Client) (*packngo.Device, error) {
- d, _, err := cl.Devices.Get(did, opts)
- return d, err
- })
-}
-
-// Currently unexported, only used in tests.
-func (c *client) deleteDevice(ctx context.Context, did string) error {
- _, err := wrap(ctx, c, func(cl *packngo.Client) (*struct{}, error) {
- _, err := cl.Devices.Delete(did, false)
- if httpStatusCode(err) == http.StatusNotFound {
- // 404s may pop up as an after effect of running the back-off
- // algorithm, and as such should not be propagated.
- return nil, nil
- }
- return nil, err
- })
- return err
-}
-
-func (c *client) ListReservations(ctx context.Context, pid string) ([]packngo.HardwareReservation, error) {
- return wrap(ctx, c, func(cl *packngo.Client) ([]packngo.HardwareReservation, error) {
- res, _, err := cl.HardwareReservations.List(pid, &packngo.ListOptions{Includes: []string{"facility", "device"}})
- return res, err
- })
-}
-
-func (c *client) ListOrganizationReservations(ctx context.Context, pid string) ([]packngo.HardwareReservation, error) {
- return wrap(ctx, c, func(cl *packngo.Client) ([]packngo.HardwareReservation, error) {
- res, _, err := cl.Organizations.ListHardwareReservations(pid, &packngo.ListOptions{Includes: []string{"facility", "device", "project"}})
- return res, err
- })
-}
-
-func (c *client) MoveReservation(ctx context.Context, hardwareReservationDID, projectID string) (*packngo.HardwareReservation, error) {
- return wrap(ctx, c, func(cl *packngo.Client) (*packngo.HardwareReservation, error) {
- hr, _, err := cl.HardwareReservations.Move(hardwareReservationDID, projectID)
- if err != nil {
- return nil, fmt.Errorf("HardwareReservations.Move: %w", err)
- }
- return hr, err
- })
-}
-
-func (c *client) CreateSSHKey(ctx context.Context, r *packngo.SSHKeyCreateRequest) (*packngo.SSHKey, error) {
- return wrap(ctx, c, func(cl *packngo.Client) (*packngo.SSHKey, error) {
- // Does the key already exist?
- ks, _, err := cl.SSHKeys.List()
- if err != nil {
- return nil, fmt.Errorf("SSHKeys.List: %w", err)
- }
- for _, k := range ks {
- if k.Label == r.Label {
- if k.Key != r.Key {
- return nil, fmt.Errorf("key label already in use for a different key")
- }
- return &k, nil
- }
- }
-
- // No key yet. Try to create it.
- k, _, err := cl.SSHKeys.Create(r)
- if err != nil {
- return nil, fmt.Errorf("SSHKeys.Create: %w", err)
- }
- return k, nil
- })
-}
-
-func (c *client) UpdateSSHKey(ctx context.Context, id string, r *packngo.SSHKeyUpdateRequest) (*packngo.SSHKey, error) {
- return wrap(ctx, c, func(cl *packngo.Client) (*packngo.SSHKey, error) {
- k, _, err := cl.SSHKeys.Update(id, r)
- if err != nil {
- return nil, fmt.Errorf("SSHKeys.Update: %w", err)
- }
- return k, err
- })
-}
-
-// Currently unexported, only used in tests.
-func (c *client) deleteSSHKey(ctx context.Context, id string) error {
- _, err := wrap(ctx, c, func(cl *packngo.Client) (struct{}, error) {
- _, err := cl.SSHKeys.Delete(id)
- if err != nil {
- return struct{}{}, fmt.Errorf("SSHKeys.Delete: %w", err)
- }
- return struct{}{}, err
- })
- return err
-}
-
-func (c *client) ListSSHKeys(ctx context.Context) ([]packngo.SSHKey, error) {
- return wrap(ctx, c, func(cl *packngo.Client) ([]packngo.SSHKey, error) {
- ks, _, err := cl.SSHKeys.List()
- if err != nil {
- return nil, fmt.Errorf("SSHKeys.List: %w", err)
- }
- return ks, nil
- })
-}
-
-// Currently unexported, only used in tests.
-func (c *client) getSSHKey(ctx context.Context, id string) (*packngo.SSHKey, error) {
- return wrap(ctx, c, func(cl *packngo.Client) (*packngo.SSHKey, error) {
- k, _, err := cl.SSHKeys.Get(id, nil)
- if err != nil {
- return nil, fmt.Errorf("SSHKeys.Get: %w", err)
- }
- return k, nil
- })
-}
-
-func (c *client) RebootDevice(ctx context.Context, did string) error {
- _, err := wrap(ctx, c, func(cl *packngo.Client) (struct{}, error) {
- _, err := cl.Devices.Reboot(did)
- return struct{}{}, err
- })
- return err
-}
diff --git a/cloud/equinix/wrapngo/wrapngo_live_test.go b/cloud/equinix/wrapngo/wrapngo_live_test.go
deleted file mode 100644
index 0ccce37..0000000
--- a/cloud/equinix/wrapngo/wrapngo_live_test.go
+++ /dev/null
@@ -1,350 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package wrapngo
-
-import (
- "context"
- "crypto/ed25519"
- "crypto/rand"
- "errors"
- "fmt"
- "log"
- "os"
- "testing"
- "time"
-
- "github.com/packethost/packngo"
- "golang.org/x/crypto/ssh"
-)
-
-type liveTestClient struct {
- cl *client
- ctx context.Context
-
- apipid string
- apios string
-
- sshKeyLabel string
- testDeviceHostname string
-}
-
-func newLiveTestClient(t *testing.T) *liveTestClient {
- t.Helper()
-
- apiuser := os.Getenv("EQUINIX_USER")
- apikey := os.Getenv("EQUINIX_APIKEY")
- apipid := os.Getenv("EQUINIX_PROJECT_ID")
- apios := os.Getenv("EQUINIX_DEVICE_OS")
-
- if apiuser == "" {
- t.Skip("EQUINIX_USER must be set.")
- }
- if apikey == "" {
- t.Skip("EQUINIX_APIKEY must be set.")
- }
- if apipid == "" {
- t.Skip("EQUINIX_PROJECT_ID must be set.")
- }
- if apios == "" {
- t.Skip("EQUINIX_DEVICE_OS must be set.")
- }
- ctx, ctxC := context.WithCancel(context.Background())
- t.Cleanup(ctxC)
- return &liveTestClient{
- cl: newClient(&Opts{
- User: apiuser,
- APIKey: apikey,
- }),
- ctx: ctx,
-
- apipid: apipid,
- apios: apios,
-
- sshKeyLabel: "shepherd-livetest-client",
- testDeviceHostname: "shepherd-livetest-device",
- }
-}
-
-// awaitDeviceState returns nil after the device matching id reaches one of the
-// provided states. It returns a non-nil error in case of an API error, in
-// particular if no device matching id exists.
-func (l *liveTestClient) awaitDeviceState(t *testing.T, id string, states ...string) error {
- t.Helper()
-
- for {
- d, err := l.cl.GetDevice(l.ctx, l.apipid, id, nil)
- if err != nil {
- if errors.Is(err, os.ErrDeadlineExceeded) {
- continue
- }
- return fmt.Errorf("while fetching device info: %w", err)
- }
- if d == nil {
- return fmt.Errorf("expected the test device (ID: %s) to exist.", id)
- }
- for _, s := range states {
- if d.State == s {
- return nil
- }
- }
- t.Logf("Waiting for device to be provisioned (ID: %s, current state: %q)", id, d.State)
- time.Sleep(time.Second)
- }
-}
-
-// cleanup ensures both the test device and the test key are deleted at
-// Equinix.
-func (l *liveTestClient) cleanup(t *testing.T) {
- t.Helper()
-
- t.Logf("Cleaning up.")
-
- // Ensure the device matching testDeviceHostname is deleted.
- ds, err := l.cl.ListDevices(l.ctx, l.apipid)
- if err != nil {
- t.Fatalf("while listing devices: %v", err)
- }
- var td *packngo.Device
- for _, d := range ds {
- if d.Hostname == l.testDeviceHostname {
- td = &d
- break
- }
- }
- if td != nil {
- t.Logf("Found a test device (ID: %s) that needs to be deleted before progressing further.", td.ID)
-
- // Devices currently being provisioned can't be deleted. After provisioning,
- // the device's state will match either "active" or "failed".
- if err := l.awaitDeviceState(t, td.ID, "active", "failed"); err != nil {
- t.Fatalf("while waiting for device to be provisioned: %v", err)
- }
- if err := l.cl.deleteDevice(l.ctx, td.ID); err != nil {
- t.Fatalf("while deleting test device: %v", err)
- }
- }
-
- // Ensure the key matching sshKeyLabel is deleted.
- ks, err := l.cl.ListSSHKeys(l.ctx)
- if err != nil {
- t.Fatalf("while listing SSH keys: %v", err)
- }
- for _, k := range ks {
- if k.Label == l.sshKeyLabel {
- t.Logf("Found a SSH test key (ID: %s) - deleting...", k.ID)
- if err := l.cl.deleteSSHKey(l.ctx, k.ID); err != nil {
- t.Fatalf("while deleting an SSH key: %v", err)
- }
- t.Logf("Deleted a SSH test key (ID: %s).", k.ID)
- }
- }
-}
-
-// createSSHAuthKey returns an SSH public key in OpenSSH authorized_keys
-// format.
-func createSSHAuthKey(t *testing.T) string {
- t.Helper()
- pub, _, err := ed25519.GenerateKey(rand.Reader)
- if err != nil {
- t.Errorf("while generating SSH key: %v", err)
- }
-
- sshpub, err := ssh.NewPublicKey(pub)
- if err != nil {
- t.Errorf("while generating SSH public key: %v", err)
- }
- return string(ssh.MarshalAuthorizedKey(sshpub))
-}
-
-// TestLiveAPI performs smoke tests of wrapngo against the real Equinix API. See
-// newLiveTestClient to see which environment variables need to be provided in
-// order for this test to run.
-func TestLiveAPI(t *testing.T) {
- ltc := newLiveTestClient(t)
- ltc.cleanup(t)
-
- cl := ltc.cl
- ctx := ltc.ctx
-
- t.Run("ListReservations", func(t *testing.T) {
- _, err := cl.ListReservations(ctx, ltc.apipid)
- if err != nil {
- t.Errorf("while listing hardware reservations: %v", err)
- }
- })
-
- var sshKeyID string
- t.Run("CreateSSHKey", func(t *testing.T) {
- nk, err := cl.CreateSSHKey(ctx, &packngo.SSHKeyCreateRequest{
- Label: ltc.sshKeyLabel,
- Key: createSSHAuthKey(t),
- ProjectID: ltc.apipid,
- })
- if err != nil {
- t.Fatalf("while creating an SSH key: %v", err)
- }
- if nk.Label != ltc.sshKeyLabel {
- t.Errorf("key labels don't match.")
- }
- t.Logf("Created an SSH key (ID: %s)", nk.ID)
- sshKeyID = nk.ID
- })
-
- var dummySSHPK2 string
- t.Run("UpdateSSHKey", func(t *testing.T) {
- if sshKeyID == "" {
- t.Skip("SSH key couldn't have been created - skipping...")
- }
-
- dummySSHPK2 = createSSHAuthKey(t)
- k, err := cl.UpdateSSHKey(ctx, sshKeyID, &packngo.SSHKeyUpdateRequest{
- Key: &dummySSHPK2,
- })
- if err != nil {
- t.Fatalf("while updating an SSH key: %v", err)
- }
- if k.Key != dummySSHPK2 {
- t.Errorf("updated SSH key doesn't match the original.")
- }
- })
- t.Run("GetSSHKey", func(t *testing.T) {
- if sshKeyID == "" {
- t.Skip("SSH key couldn't have been created - skipping...")
- }
-
- k, err := cl.getSSHKey(ctx, sshKeyID)
- if err != nil {
- t.Fatalf("while getting an SSH key: %v", err)
- }
- if k.Key != dummySSHPK2 {
- t.Errorf("got key contents that don't match the original.")
- }
- })
- t.Run("ListSSHKeys", func(t *testing.T) {
- if sshKeyID == "" {
- t.Skip("SSH key couldn't have been created - skipping...")
- }
-
- ks, err := cl.ListSSHKeys(ctx)
- if err != nil {
- t.Fatalf("while listing SSH keys: %v", err)
- }
-
- // Check that our key is part of the list.
- found := false
- for _, k := range ks {
- if k.ID == sshKeyID {
- found = true
- break
- }
- }
- if !found {
- t.Errorf("SSH key not listed.")
- }
- })
-
- var testDevice *packngo.Device
- t.Run("CreateDevice", func(t *testing.T) {
- // Find a provisionable hardware reservation the device will be created with.
- rvs, err := cl.ListReservations(ctx, ltc.apipid)
- if err != nil {
- t.Errorf("while listing hardware reservations: %v", err)
- }
- var rv *packngo.HardwareReservation
- for _, r := range rvs {
- if r.Provisionable {
- rv = &r
- break
- }
- }
- if rv == nil {
- t.Skip("could not find a provisionable hardware reservation - skipping...")
- }
-
- // nolint:SA5011
- d, err := cl.CreateDevice(ctx, &packngo.DeviceCreateRequest{
- Hostname: ltc.testDeviceHostname,
- OS: ltc.apios,
- Plan: rv.Plan.Slug,
- HardwareReservationID: rv.ID,
- ProjectID: ltc.apipid,
- })
- if err != nil {
- t.Fatalf("while creating a device: %v", err)
- }
- t.Logf("Created a new test device (ID: %s)", d.ID)
- testDevice = d
- })
- t.Run("GetDevice", func(t *testing.T) {
- if testDevice == nil {
- t.Skip("the test device couldn't have been created - skipping...")
- }
-
- d, err := cl.GetDevice(ctx, ltc.apipid, testDevice.ID, nil)
- if err != nil {
- t.Fatalf("while fetching device info: %v", err)
- }
- if d == nil {
- t.Fatalf("expected the test device (ID: %s) to exist.", testDevice.ID)
- return
- }
- if d.ID != testDevice.ID {
- t.Errorf("got device ID that doesn't match the original.")
- return
- }
- })
- t.Run("ListDevices", func(t *testing.T) {
- if testDevice == nil {
- t.Skip("the test device couldn't have been created - skipping...")
- }
-
- ds, err := cl.ListDevices(ctx, ltc.apipid)
- if err != nil {
- t.Errorf("while listing devices: %v", err)
- }
- if len(ds) == 0 {
- t.Errorf("expected at least one device.")
- }
- })
- t.Run("DeleteDevice", func(t *testing.T) {
- if testDevice == nil {
- t.Skip("the test device couldn't have been created - skipping...")
- }
-
- // Devices currently being provisioned can't be deleted. After provisioning,
- // the device's state will match either "active" or "failed".
- if err := ltc.awaitDeviceState(t, testDevice.ID, "active", "failed"); err != nil {
- t.Fatalf("while waiting for device to be provisioned: %v", err)
- }
- t.Logf("Deleting the test device (ID: %s)", testDevice.ID)
- if err := cl.deleteDevice(ctx, testDevice.ID); err != nil {
- t.Fatalf("while deleting a device: %v", err)
- }
- d, err := cl.GetDevice(ctx, ltc.apipid, testDevice.ID, nil)
- if err != nil && !IsNotFound(err) {
- t.Fatalf("while fetching device info: %v", err)
- }
- if d != nil {
- t.Fatalf("device should not exist.")
- }
- t.Logf("Deleted the test device (ID: %s)", testDevice.ID)
- })
- t.Run("DeleteSSHKey", func(t *testing.T) {
- if sshKeyID == "" {
- t.Skip("SSH key couldn't have been created - skipping...")
- }
-
- t.Logf("Deleting the test SSH key (ID: %s)", sshKeyID)
- if err := cl.deleteSSHKey(ctx, sshKeyID); err != nil {
- t.Fatalf("couldn't delete an SSH key: %v", err)
- }
- _, err := cl.getSSHKey(ctx, sshKeyID)
- if err == nil {
- t.Fatalf("SSH key should not exist")
- }
- t.Logf("Deleted the test SSH key (ID: %s)", sshKeyID)
- })
-
- ltc.cleanup(t)
-}
diff --git a/cloud/lib/component/BUILD.bazel b/cloud/lib/component/BUILD.bazel
deleted file mode 100644
index e751aaf..0000000
--- a/cloud/lib/component/BUILD.bazel
+++ /dev/null
@@ -1,28 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
-
-go_library(
- name = "component",
- srcs = [
- "component.go",
- "crdb.go",
- "devcerts.go",
- ],
- importpath = "source.monogon.dev/cloud/lib/component",
- visibility = ["//visibility:public"],
- deps = [
- "//osbase/pki",
- "@com_github_adrg_xdg//:xdg",
- "@com_github_cockroachdb_cockroach_go_v2//testserver",
- "@com_github_golang_migrate_migrate_v4//:migrate",
- "@com_github_golang_migrate_migrate_v4//database/cockroachdb",
- "@com_github_golang_migrate_migrate_v4//source",
- "@com_github_lib_pq//:pq",
- "@com_github_prometheus_client_golang//prometheus",
- "@com_github_prometheus_client_golang//prometheus/collectors",
- "@com_github_prometheus_client_golang//prometheus/promhttp",
- "@io_bazel_rules_go//go/runfiles",
- "@io_k8s_klog_v2//:klog",
- "@org_golang_google_grpc//:grpc",
- "@org_golang_google_grpc//credentials",
- ],
-)
diff --git a/cloud/lib/component/component.go b/cloud/lib/component/component.go
deleted file mode 100644
index 46fa58a..0000000
--- a/cloud/lib/component/component.go
+++ /dev/null
@@ -1,214 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-// Package component implements reusable bits for cloud service components. Each
-// component is currently defined as being a standalone Go binary with its own
-// internal gRPC listener. Subsequent listeners (eg. public gRPC or HTTP) can be
-// defined by users of this library.
-package component
-
-import (
- "context"
- "crypto/tls"
- "crypto/x509"
- "flag"
- "net"
- "net/http"
- "os"
- "path/filepath"
-
- "github.com/adrg/xdg"
- "github.com/prometheus/client_golang/prometheus"
- "github.com/prometheus/client_golang/prometheus/collectors"
- "github.com/prometheus/client_golang/prometheus/promhttp"
- "google.golang.org/grpc"
- "google.golang.org/grpc/credentials"
- "k8s.io/klog/v2"
-)
-
-// ComponentConfig is the common configuration of a component. It's
-// supposed to be instantiated within a Configuration struct of a component.
-//
-// It can be configured by flags (via RegisterFlags) or manually (eg. in tests).
-type ComponentConfig struct {
- // GRPCKeyPath is the filesystem path of the x509 key used to serve internal
- // gRPC traffic.
- GRPCKeyPath string
- // GRPCCertificatePath is the filesystem path of the x509 certificate used to
- // serve internal gRPC traffic.
- GRPCCertificatePath string
- // GRPCCAPath is the filesystem path of the x509 CA certificate used to
- // verify incoming connections on internal gRPC traffic.
- GRPCCAPath string
- // GRPCListenAddress is the address on which the component should serve
- // internal gRPC traffic.
- GRPCListenAddress string
-
- // DevCerts, if enabled, automatically generates development CA and component
- // certificates/keys at DevCertsPath, and uses these to serve traffic.
- DevCerts bool
- // DevCertsPath sets the prefix in which DevCerts are generated. All components
- // should have the same path set so that they reuse the CA certificate.
- DevCertsPath string
-
- // ComponentName is the name of this component, which should be [a-z0-9+]. It's
- // used to prefix all flags set by the Configuration.
- ComponentName string
-
- // PrometheusListenAddress is the address on which the component should serve
- // Prometheus metrics.
- PrometheusListenAddress string
- // PrometheusInsecure enables serving Prometheus metrics without any TLS, running
- // a plain HTTP listener. If disabled, Prometheus metrics are served using the
- // same PKI setup as the components' gRPC server.
- PrometheusInsecure bool
-
- prometheusRegistry *prometheus.Registry
-}
-
-// RegisterFlags registers the component configuration to be provided by flags.
-// This must be called exactly once, before calling flag.Parse().
-func (c *ComponentConfig) RegisterFlags(componentName string) {
- flag.StringVar(&c.GRPCKeyPath, componentName+"_grpc_key_path", "", "Path to gRPC server/client key for "+componentName)
- flag.StringVar(&c.GRPCCertificatePath, componentName+"_grpc_certificate_path", "", "Path to gRPC server/client certificate for "+componentName)
- flag.StringVar(&c.GRPCCAPath, componentName+"_grpc_ca_certificate_path", "", "Path to gRPC CA certificate for "+componentName)
- flag.StringVar(&c.GRPCListenAddress, componentName+"_grpc_listen_address", ":4242", "Address to listen at for gRPC connections for "+componentName)
- flag.StringVar(&c.PrometheusListenAddress, componentName+"_prometheus_listen_address", ":4243", "Address to listen at for Prometheus connections for "+componentName)
- flag.BoolVar(&c.PrometheusInsecure, componentName+"_prometheus_insecure", false, "Serve plain HTTP prometheus without mTLS. If not set, main gRPC TLS credentials/certificates are used")
-
- flag.BoolVar(&c.DevCerts, componentName+"_dev_certs", false, "Use developer certificates (autogenerated) for "+componentName)
- flag.StringVar(&c.DevCertsPath, componentName+"_dev_certs_path", filepath.Join(xdg.ConfigHome, "monogon-dev-certs"), "Path for storing developer certificates")
-
- c.ComponentName = componentName
-}
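-
-// A typical wiring sketch for a component binary (hypothetical main; names
-// like "mycomponent" are placeholders, not part of this package):
-//
-//	var cfg ComponentConfig
-//	cfg.RegisterFlags("mycomponent")
-//	flag.Parse()
-//	ctx := context.Background()
-//	cfg.StartPrometheus(ctx)
-//	srv := grpc.NewServer(cfg.GRPCServerOptions()...)
-//	// ... register services, then serve on cfg.GRPCListenAddress ...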
-
-func (c *ComponentConfig) getTLSConfig() *tls.Config {
- var certPath, keyPath, caPath string
- if c.DevCerts {
- // Use devcerts if requested.
- certPath, keyPath, caPath = c.GetDevCerts()
- } else {
- // Otherwise, use data from flags.
- if c.GRPCKeyPath == "" {
- klog.Exitf("-grpc_key_path must be set")
- }
- if c.GRPCCertificatePath == "" {
- klog.Exitf("-grpc_certificate_path must be set")
- }
- if c.GRPCCAPath == "" {
- klog.Exitf("-grpc_ca_certificate_path must be set")
- }
- keyPath = c.GRPCKeyPath
- certPath = c.GRPCCertificatePath
- caPath = c.GRPCCAPath
- }
-
- ca, err := os.ReadFile(caPath)
- if err != nil {
- klog.Exitf("Could not read GRPC CA: %v", err)
- }
- certPool := x509.NewCertPool()
- if !certPool.AppendCertsFromPEM(ca) {
- klog.Exitf("Could not load GRPC CA: %v", err)
- }
-
- pair, err := tls.LoadX509KeyPair(certPath, keyPath)
- if err != nil {
- klog.Exitf("Could not load GRPC TLS keypair: %v", err)
- }
- return &tls.Config{
- Certificates: []tls.Certificate{pair},
- ClientAuth: tls.RequireAndVerifyClientCert,
- ClientCAs: certPool,
- }
-}
-
-// PrometheusRegistry returns this component's singleton Prometheus registry,
-// creating it as needed. This method is not goroutine-safe, and should only be
-// called during the setup process of the Component.
-func (c *ComponentConfig) PrometheusRegistry() *prometheus.Registry {
- if c.prometheusRegistry == nil {
- c.prometheusRegistry = prometheus.NewRegistry()
- c.prometheusRegistry.Register(collectors.NewGoCollector())
- c.prometheusRegistry.Register(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))
- }
- return c.prometheusRegistry
-}
-
-// StartPrometheus starts a Prometheus metrics server in a goroutine. It will
-// serve any metrics that have been registered with the registry returned by
-// PrometheusRegistry.
-func (c *ComponentConfig) StartPrometheus(ctx context.Context) {
- reg := c.PrometheusRegistry()
-
- mux := http.NewServeMux()
- mux.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
-
- var lis net.Listener
- var err error
-
- if c.PrometheusInsecure {
- lis, err = net.Listen("tcp", c.PrometheusListenAddress)
- } else {
- lis, err = tls.Listen("tcp", c.PrometheusListenAddress, c.getTLSConfig())
- }
- if err != nil {
- klog.Exitf("Could not listen on prometheus address: %v", err)
- }
-
- srv := http.Server{
- Handler: mux,
- }
- go func() {
- klog.Infof("Prometheus listening on %s", lis.Addr())
- if err := srv.Serve(lis); err != nil && ctx.Err() == nil {
- klog.Exitf("Prometheus serve failed: %v", err)
- }
- }()
- go func() {
- <-ctx.Done()
- srv.Close()
- }()
-}
-
-// GRPCServerOptions returns pre-built grpc.ServerOptions that this component
-// should use to serve internal gRPC.
-func (c *ComponentConfig) GRPCServerOptions() []grpc.ServerOption {
- return []grpc.ServerOption{
- grpc.Creds(credentials.NewTLS(c.getTLSConfig())),
- }
-}
-
-// GRPCServerOptionsPublic returns pre-built grpc.ServerOptions that this
-// component should use to serve public gRPC. Any client will be allowed to
-// connect, and it's up to the server implementation to authenticate incoming
-// requests.
-func (c *ComponentConfig) GRPCServerOptionsPublic() []grpc.ServerOption {
- var certPath, keyPath string
- if c.DevCerts {
- // Use devcerts if requested.
- certPath, keyPath, _ = c.GetDevCerts()
- } else {
- // Otherwise, use data from flags.
- if c.GRPCKeyPath == "" {
- klog.Exitf("-grpc_key_path must be set")
- }
- if c.GRPCCertificatePath == "" {
- klog.Exitf("-grpc_certificate_path must be set")
- }
- keyPath = c.GRPCKeyPath
- certPath = c.GRPCCertificatePath
- }
-
- pair, err := tls.LoadX509KeyPair(certPath, keyPath)
- if err != nil {
- klog.Exitf("Could not load GRPC TLS keypair: %v", err)
- }
- tlsConf := &tls.Config{
- Certificates: []tls.Certificate{pair},
- ClientAuth: tls.RequestClientCert,
- }
- return []grpc.ServerOption{
- grpc.Creds(credentials.NewTLS(tlsConf)),
- }
-}
diff --git a/cloud/lib/component/crdb.go b/cloud/lib/component/crdb.go
deleted file mode 100644
index 07306d1..0000000
--- a/cloud/lib/component/crdb.go
+++ /dev/null
@@ -1,202 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package component
-
-import (
- "database/sql"
- "errors"
- "flag"
- "fmt"
- "net/url"
- "os"
- "sync"
-
- "github.com/bazelbuild/rules_go/go/runfiles"
- "github.com/cockroachdb/cockroach-go/v2/testserver"
- "github.com/golang-migrate/migrate/v4"
- _ "github.com/golang-migrate/migrate/v4/database/cockroachdb"
- "github.com/golang-migrate/migrate/v4/source"
- _ "github.com/lib/pq"
- "k8s.io/klog/v2"
-)
-
-// CockroachConfig is the common configuration of a component's connection to
-// CockroachDB. It's supposed to be instantiated within a Configuration struct
-// of a component.
-//
-// It can be configured by flags (via RegisterFlags) or manually (eg. in tests).
-type CockroachConfig struct {
- // Migrations is the go-migrate source of migrations for this database. Usually
- // this can be taken from a go-embedded set of migration files.
- Migrations source.Driver
-
- // EndpointHost is the host part of the endpoint address of the database server.
- EndpointHost string
- // TLSKeyPath is the filesystem path of the x509 key used to authenticate to the
- // database server.
- TLSKeyPath string
- // TLSCertificatePath is the filesystem path of the x509 certificate used to
- // authenticate to the database server.
- TLSCertificatePath string
- // TLSCACertificatePath is the filesystem path of the x509 CA certificate used
- // to verify the database server's certificate.
- TLSCACertificatePath string
- // UserName is the username to be used on the database server.
- UserName string
- // DatabaseName is the database name to be used on the database server.
- DatabaseName string
-
- // InMemory indicates that an in-memory CockroachDB instance should be used.
- // Data will be lost after the component shuts down.
- InMemory bool
-
- // mu guards inMemoryInstance.
- mu sync.Mutex
- // inMemoryInstance is populated with a CockroachDB test server handle when
- // InMemory is set and Connect()/MigrateUp() is called.
- inMemoryInstance testserver.TestServer
-}
-
-// RegisterFlags registers the connection configuration to be provided by flags.
-// This must be called exactly once, before calling flag.Parse().
-func (c *CockroachConfig) RegisterFlags(prefix string) {
- flag.StringVar(&c.EndpointHost, prefix+"_endpoint_host", "", "Host of CockroachDB endpoint for "+prefix)
- flag.StringVar(&c.TLSKeyPath, prefix+"_tls_key_path", "", "Path to CockroachDB TLS client key for "+prefix)
- flag.StringVar(&c.TLSCertificatePath, prefix+"_tls_certificate_path", "", "Path to CockroachDB TLS client certificate for "+prefix)
- flag.StringVar(&c.TLSCACertificatePath, prefix+"_tls_ca_certificate_path", "", "Path to CockroachDB CA certificate for "+prefix)
- flag.StringVar(&c.UserName, prefix+"_user_name", prefix, "CockroachDB user name for "+prefix)
- flag.StringVar(&c.DatabaseName, prefix+"_database_name", prefix, "CockroachDB database name for "+prefix)
- flag.BoolVar(&c.InMemory, prefix+"_eat_my_data", false, "Use in-memory CockroachDB for "+prefix+". Warning: Data will be lost at process shutdown!")
-}
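-
-// A sketch of the intended call sequence after flag parsing (illustrative
-// only; the "bmdb" prefix and migrationsSource are placeholders assumed to be
-// provided by the embedding component):
-//
-//	var cfg CockroachConfig
-//	cfg.RegisterFlags("bmdb")
-//	flag.Parse()
-//	cfg.Migrations = migrationsSource
-//	if err := cfg.MigrateUp(); err != nil {
-//		// handle the error
-//	}
-//	db, err := cfg.Connect()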
-
-// startInMemory starts an in-memory cockroachdb server as a subprocess, and
-// returns a DSN that connects to the newly created database.
-func (c *CockroachConfig) startInMemory(scheme string) string {
- c.mu.Lock()
- defer c.mu.Unlock()
-
- klog.Warningf("STARTING IN-MEMORY COCKROACHDB FOR TESTS")
- klog.Warningf("ALL DATA WILL BE LOST AFTER SERVER SHUTDOWN!")
-
- if c.inMemoryInstance == nil {
- opts := []testserver.TestServerOpt{
- testserver.SecureOpt(),
- }
- if path, err := runfiles.Rlocation("cockroach/cockroach"); err == nil {
- opts = append(opts, testserver.CockroachBinaryPathOpt(path))
- } else {
- if os.Getenv("TEST_TMPDIR") != "" {
- klog.Exitf("In test which requires in-memory cockroachdb, but @cockroach//:cockroach missing as a dependency. Failing.")
- }
- klog.Warningf("CockroachDB in-memory database requested, but not available as a build dependency. Trying to download it...")
- }
-
- inst, err := testserver.NewTestServer(opts...)
- if err != nil {
- klog.Exitf("Failed to create crdb test server: %v", err)
- }
- c.inMemoryInstance = inst
- }
-
- u := *c.inMemoryInstance.PGURL()
- u.Scheme = scheme
- return u.String()
-}
-
-// buildDSN returns a DSN to the configured database connection with a given DSN
-// scheme. The scheme will usually be 'postgres' or 'cockroachdb', depending on
-// whether it's used for lib/pq or for golang-migrate.
-func (c *CockroachConfig) buildDSN(scheme string) string {
- if c.InMemory {
- return c.startInMemory(scheme)
- }
-
- query := make(url.Values)
- query.Set("sslmode", "verify-full")
- query.Set("sslcert", c.TLSCertificatePath)
- query.Set("sslkey", c.TLSKeyPath)
- query.Set("sslrootcert", c.TLSCACertificatePath)
- u := url.URL{
- Scheme: scheme,
- User: url.User(c.UserName),
- Host: c.EndpointHost,
- Path: c.DatabaseName,
- RawQuery: query.Encode(),
- }
- return u.String()
-}
-
-// Connect returns a working *sql.DB handle to the database described by this
-// CockroachConfig.
-func (c *CockroachConfig) Connect() (*sql.DB, error) {
- dsn := c.buildDSN("postgres")
- klog.Infof("Connecting to %s...", dsn)
- return sql.Open("postgres", c.buildDSN("postgres"))
-}
-
-// MigrateUp performs all possible migrations upwards for the database described
-// by this CockroachConfig.
-func (c *CockroachConfig) MigrateUp() error {
- dsn := c.buildDSN("cockroachdb")
- klog.Infof("Running migrations up...")
- m, err := migrate.NewWithSourceInstance("iofs", c.Migrations, dsn)
- if err != nil {
- return err
- }
- err = m.Up()
- switch {
- case err == nil:
- return nil
- case errors.Is(err, migrate.ErrNoChange):
- return nil
- default:
- return err
- }
-}
-
-func (c *CockroachConfig) MigrateUpToIncluding(ver uint) error {
- dsn := c.buildDSN("cockroachdb")
- klog.Infof("Running migrations up to %d...", ver)
- m, err := migrate.NewWithSourceInstance("iofs", c.Migrations, dsn)
- if err != nil {
- return err
- }
-
- return m.Migrate(ver)
-}
-
-// MigrateDownDangerDanger removes all data from the database by performing a
-// full migration down.
-//
-// Let me reiterate: this function, by design, DESTROYS YOUR DATA.
-//
-// Obviously, this is a dangerous method. Thus, to prevent accidental nuking of
-// production data, we currently only allow this to be performed on InMemory
-// databases.
-func (c *CockroachConfig) MigrateDownDangerDanger() error {
- if !c.InMemory {
- return fmt.Errorf("refusing to migrate down a non-in-memory database")
- }
- // Sneaky extra check to make sure the caller didn't just set InMemory after
- // connecting to an external database. We really need to be safe here.
- if c.inMemoryInstance == nil {
- return fmt.Errorf("no really, this cannot be run on non-in-memory databases")
- }
- dsn := c.buildDSN("cockroachdb")
- klog.Infof("Running migrations down...")
- m, err := migrate.NewWithSourceInstance("iofs", c.Migrations, dsn)
- if err != nil {
- return err
- }
- // Final sneaky check, make sure the remote schema version is our maximum locally
- // supported version.
- v, _, err := m.Version()
- if err != nil {
- return fmt.Errorf("could not retrieve remote version: %w", err)
- }
- if v2, err := c.Migrations.Next(v); !os.IsNotExist(err) {
- return fmt.Errorf("remote running version %d, but we know %d which is newer", v, v2)
- }
- return m.Down()
-}
diff --git a/cloud/lib/component/devcerts.go b/cloud/lib/component/devcerts.go
deleted file mode 100644
index 837ad98..0000000
--- a/cloud/lib/component/devcerts.go
+++ /dev/null
@@ -1,191 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package component
-
-import (
- "crypto/ed25519"
- "crypto/rand"
- "crypto/tls"
- "crypto/x509"
- "encoding/pem"
- "fmt"
- "math/big"
- "os"
- "time"
-
- "k8s.io/klog/v2"
-
- "source.monogon.dev/osbase/pki"
-)
-
-// GetDevCerts returns paths to this component's development certificate, key
-// and CA, or exits the process if they cannot be created or loaded.
-func (c *ComponentConfig) GetDevCerts() (certPath, keyPath, caPath string) {
- klog.Infof("Using developer certificates at %s", c.DevCertsPath)
-
- caPath = c.ensureDevCA()
- certPath, keyPath = c.ensureDevComponent()
- return
-}
-
-// ensureDevComponent ensures that a development certificate/key exists for this
-// component and returns paths to them. This data is either read from disk if it
-// already exists, or is generated when this function is called. If any problem
-// occurs, the process exits.
-func (c *ComponentConfig) ensureDevComponent() (certPath, keyPath string) {
- caKeyPath := c.DevCertsPath + "/ca.key"
- caCertPath := c.DevCertsPath + "/ca.cert"
-
- // Load CA. By convention, we are always called after ensureDevCA.
- ca, err := tls.LoadX509KeyPair(caCertPath, caKeyPath)
- if err != nil {
- klog.Exitf("Could not load Dev CA: %v", err)
- }
- caCert, err := x509.ParseCertificate(ca.Certificate[0])
- if err != nil {
- klog.Exitf("Could not parse Dev CA: %v", err)
- }
-
- // Check if we have keys already.
- keyPath = c.DevCertsPath + fmt.Sprintf("/%s.key", c.ComponentName)
- certPath = c.DevCertsPath + fmt.Sprintf("/%s.crt", c.ComponentName)
- noKey := false
- if _, err := os.Stat(keyPath); os.IsNotExist(err) {
- noKey = true
- }
- noCert := false
- if _, err := os.Stat(certPath); os.IsNotExist(err) {
- noCert = true
- }
-
- if noKey || noCert {
- klog.Infof("Generating developer %s certificate...", c.ComponentName)
- } else {
- return
- }
-
- // Generate key/certificate.
- cert := pki.Server([]string{
- fmt.Sprintf("%s.local", c.ComponentName),
- }, nil)
-
- serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 127)
- serialNumber, err := rand.Int(rand.Reader, serialNumberLimit)
- if err != nil {
- klog.Exitf("Failed to generate %s serial number: %v", c.ComponentName, err)
- }
- cert.ExtKeyUsage = append(cert.ExtKeyUsage, x509.ExtKeyUsageClientAuth)
- cert.SerialNumber = serialNumber
- cert.NotBefore = time.Now()
- cert.NotAfter = pki.UnknownNotAfter
- cert.BasicConstraintsValid = true
-
- pub, priv, err := ed25519.GenerateKey(rand.Reader)
- if err != nil {
- klog.Exitf("Failed to generate %s key: %v", c.ComponentName, err)
- }
- certBytes, err := x509.CreateCertificate(rand.Reader, &cert, caCert, pub, ca.PrivateKey)
- if err != nil {
- klog.Exitf("Failed to generate %s certificate: %v", c.ComponentName, err)
- }
-
- // And marshal them to disk.
- privPKCS, err := x509.MarshalPKCS8PrivateKey(priv)
- if err != nil {
- klog.Exitf("Failed to marshal %s private key: %v", c.ComponentName, err)
- }
- err = os.WriteFile(keyPath, pem.EncodeToMemory(&pem.Block{
- Type: "PRIVATE KEY",
- Bytes: privPKCS,
- }), 0600)
- if err != nil {
- klog.Exitf("Failed to write %s private key: %v", c.ComponentName, err)
- }
- err = os.WriteFile(certPath, pem.EncodeToMemory(&pem.Block{
- Type: "CERTIFICATE",
- Bytes: certBytes,
- }), 0644)
- if err != nil {
- klog.Exitf("Failed to write %s certificate: %v", c.ComponentName, err)
- }
-
- return
-}
-
-// ensureDevCA ensures that a development CA certificate/key exists and returns
-// paths to them. This data is either read from disk if it already exists, or is
-// generated when this function is called. If any problem occurs, the process
-// exits.
-func (c *ComponentConfig) ensureDevCA() (caCertPath string) {
- caKeyPath := c.DevCertsPath + "/ca.key"
- caCertPath = c.DevCertsPath + "/ca.cert"
-
- if err := os.MkdirAll(c.DevCertsPath, 0700); err != nil {
- klog.Exitf("Failed to make developer certificate directory: %v", err)
- }
-
- // Check if we already have a key/certificate.
- noKey := false
- if _, err := os.Stat(caKeyPath); os.IsNotExist(err) {
- noKey = true
- }
- noCert := false
- if _, err := os.Stat(caCertPath); os.IsNotExist(err) {
- noCert = true
- }
-
- if noKey || noCert {
- klog.Infof("Generating developer CA certificate...")
- } else {
- return
- }
- hostname, err := os.Hostname()
- if err != nil {
- hostname = "unknown"
- }
-
- // No key/certificate, generate them.
- ca := pki.CA(fmt.Sprintf("monogon dev certs CA (%s)", hostname))
-
- serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 127)
- serialNumber, err := rand.Int(rand.Reader, serialNumberLimit)
- if err != nil {
- klog.Exitf("Failed to generate CA serial number: %v", err)
- }
- ca.SerialNumber = serialNumber
- ca.NotBefore = time.Now()
- ca.NotAfter = pki.UnknownNotAfter
- ca.BasicConstraintsValid = true
-
- caPub, caPriv, err := ed25519.GenerateKey(rand.Reader)
- if err != nil {
- klog.Exitf("Failed to generate CA key: %v", err)
- }
- caBytes, err := x509.CreateCertificate(rand.Reader, &ca, &ca, caPub, caPriv)
- if err != nil {
- klog.Exitf("Failed to generate CA certificate: %v", err)
- }
-
- // And marshal them to disk.
- caPrivPKCS, err := x509.MarshalPKCS8PrivateKey(caPriv)
- if err != nil {
- klog.Exitf("Failed to marshal %s private key: %v", c.ComponentName, err)
- }
- err = os.WriteFile(caKeyPath, pem.EncodeToMemory(&pem.Block{
- Type: "PRIVATE KEY",
- Bytes: caPrivPKCS,
- }), 0600)
- if err != nil {
- klog.Exitf("Failed to write CA private key: %v", err)
- }
- err = os.WriteFile(caCertPath, pem.EncodeToMemory(&pem.Block{
- Type: "CERTIFICATE",
- Bytes: caBytes,
- }), 0644)
- if err != nil {
- klog.Exitf("Failed to write CA certificate: %v", err)
- }
-
- return
-}
diff --git a/cloud/lib/sinbin/BUILD.bazel b/cloud/lib/sinbin/BUILD.bazel
deleted file mode 100644
index df9203f..0000000
--- a/cloud/lib/sinbin/BUILD.bazel
+++ /dev/null
@@ -1,14 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
-
-go_library(
- name = "sinbin",
- srcs = ["sinbin.go"],
- importpath = "source.monogon.dev/cloud/lib/sinbin",
- visibility = ["//visibility:public"],
-)
-
-go_test(
- name = "sinbin_test",
- srcs = ["sinbin_test.go"],
- embed = [":sinbin"],
-)
diff --git a/cloud/lib/sinbin/sinbin.go b/cloud/lib/sinbin/sinbin.go
deleted file mode 100644
index b1c905b..0000000
--- a/cloud/lib/sinbin/sinbin.go
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-// Package sinbin implements a sin bin for naughty elements that we wish
-// to time out for a while. This is kept in memory, and effectively implements a
-// simplified version of the Circuit Breaker pattern.
-//
-// “sin bin”, noun, informal: (in sport) a box or bench to which offending
-// players can be sent for a period as a penalty during a game, especially in ice
-// hockey.
-package sinbin
-
-import (
- "sync"
- "time"
-)
-
-type entry struct {
- until time.Time
-}
-
-// A Sinbin contains a set of entries T which are added with a deadline, and will
-// be automatically collected when that deadline expires.
-//
-// The zero value of a Sinbin is ready to use, and can be called from multiple
-// goroutines.
-type Sinbin[T comparable] struct {
- mu sync.RWMutex
- bench map[T]*entry
-
- lastSweep time.Time
-}
-
-func (s *Sinbin[T]) initializeUnlocked() {
- if s.bench == nil {
- s.bench = make(map[T]*entry)
- }
-}
-
-func (s *Sinbin[T]) sweepUnlocked() {
- if s.lastSweep.Add(time.Minute).After(time.Now()) {
- return
- }
- now := time.Now()
- for k, e := range s.bench {
- if now.After(e.until) {
- delete(s.bench, k)
- }
- }
- s.lastSweep = now
-}
-
-// Add an element 't' to a Sinbin with a given deadline. From now until that
-// deadline Penalized(t) will return true.
-func (s *Sinbin[T]) Add(t T, until time.Time) {
- s.mu.Lock()
- defer s.mu.Unlock()
-
- s.initializeUnlocked()
- s.sweepUnlocked()
-
- existing, ok := s.bench[t]
- if ok {
- if until.After(existing.until) {
- existing.until = until
- }
- return
- }
- s.bench[t] = &entry{
- until: until,
- }
-}
-
-// Penalized returns whether the given element is currently sitting on the
-// time-out bench after having been Added previously.
-func (s *Sinbin[T]) Penalized(t T) bool {
- s.mu.RLock()
- defer s.mu.RUnlock()
-
- if s.bench == nil {
- return false
- }
-
- existing, ok := s.bench[t]
- if !ok {
- return false
- }
- if time.Now().After(existing.until) {
- // Expired entries are removed by sweepUnlocked (called from Add); deleting
- // here would write to the map while only the read lock is held.
- return false
- }
- return true
-}
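-
-// A short usage sketch (the string key and penalty duration below are
-// illustrative assumptions only):
-//
-//	var penalties Sinbin[string]
-//	penalties.Add("machine-1234", time.Now().Add(time.Hour))
-//	if penalties.Penalized("machine-1234") {
-//		// Skip this element until the penalty expires.
-//	}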
diff --git a/cloud/lib/sinbin/sinbin_test.go b/cloud/lib/sinbin/sinbin_test.go
deleted file mode 100644
index 715ac94..0000000
--- a/cloud/lib/sinbin/sinbin_test.go
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package sinbin
-
-import (
- "testing"
- "time"
-)
-
-// TestSinbinBasics performs some basic tests on the Sinbin.
-func TestSinbinBasics(t *testing.T) {
- var s Sinbin[string]
-
- if s.Penalized("foo") {
- t.Errorf("'foo' should not be penalized as it hasn't yet been added")
- }
- s.Add("foo", time.Now().Add(-1*time.Second))
- if s.Penalized("foo") {
- t.Errorf("'foo' should not be penalized as it has been added with an expired time")
- }
- s.Add("bar", time.Now().Add(time.Hour))
- if !s.Penalized("bar") {
- t.Errorf("'bar' should be penalized as it has been added with an hour long penalty")
- }
-
- // Force sweep.
- s.lastSweep = time.Now().Add(-1 * time.Hour)
- s.sweepUnlocked()
-
- if len(s.bench) != 1 {
- t.Errorf("there should only be one element on the bench")
- }
- if _, ok := s.bench["bar"]; !ok {
- t.Errorf("the bench should contain 'bar'")
- }
-}
diff --git a/cloud/shepherd/BUILD.bazel b/cloud/shepherd/BUILD.bazel
deleted file mode 100644
index 512ebed..0000000
--- a/cloud/shepherd/BUILD.bazel
+++ /dev/null
@@ -1,12 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library")
-
-go_library(
- name = "shepherd",
- srcs = ["shepherd.go"],
- importpath = "source.monogon.dev/cloud/shepherd",
- visibility = ["//visibility:public"],
- deps = [
- "//cloud/bmaas/bmdb",
- "//cloud/bmaas/bmdb/model",
- ],
-)
diff --git a/cloud/shepherd/manager/BUILD.bazel b/cloud/shepherd/manager/BUILD.bazel
deleted file mode 100644
index 92a2ffe..0000000
--- a/cloud/shepherd/manager/BUILD.bazel
+++ /dev/null
@@ -1,54 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
-
-go_library(
- name = "manager",
- srcs = [
- "control_loop.go",
- "fake_ssh_client.go",
- "initializer.go",
- "manager.go",
- "provisioner.go",
- "recoverer.go",
- "ssh_key_signer.go",
- ],
- importpath = "source.monogon.dev/cloud/shepherd/manager",
- visibility = ["//visibility:public"],
- deps = [
- "//cloud/agent/api",
- "//cloud/bmaas/bmdb",
- "//cloud/bmaas/bmdb/metrics",
- "//cloud/bmaas/bmdb/model",
- "//cloud/shepherd",
- "//go/mflags",
- "//osbase/net/sshtakeover",
- "@com_github_google_uuid//:uuid",
- "@io_k8s_klog_v2//:klog",
- "@org_golang_google_protobuf//proto",
- "@org_golang_x_crypto//ssh",
- "@org_golang_x_sync//errgroup",
- "@org_golang_x_time//rate",
- ],
-)
-
-go_test(
- name = "manager_test",
- srcs = [
- "initializer_test.go",
- "provider_test.go",
- "provisioner_test.go",
- ],
- data = [
- "@cockroach",
- ],
- embed = [":manager"],
- deps = [
- "//cloud/bmaas/bmdb",
- "//cloud/bmaas/bmdb/model",
- "//cloud/lib/component",
- "//cloud/shepherd",
- "@com_github_google_uuid//:uuid",
- "@io_k8s_klog_v2//:klog",
- "@org_golang_x_crypto//ssh",
- "@org_golang_x_time//rate",
- ],
-)
diff --git a/cloud/shepherd/manager/README.md b/cloud/shepherd/manager/README.md
deleted file mode 100644
index d5a17c3..0000000
--- a/cloud/shepherd/manager/README.md
+++ /dev/null
@@ -1,54 +0,0 @@
-Equinix Shepherd
-===
-
-Manages Equinix machines in sync with BMDB contents. Made up of two components:
-
-Provisioner
----
-
-Brings up machines from hardware reservations and populates BMDB with new Provided machines.
-
-Initializer
----
-
-Starts the Agent over SSH (wherever necessary per the BMDB) and reports success into the BMDB.
-
-
-Running
-===
-
-Unit Tests
----
-
-The Shepherd has some basic smoke tests which run against a Fakequinix.
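-
-They can be run via Bazel, for example (using the test target defined in this directory's BUILD file):
-
-```
-$ bazel test //cloud/shepherd/manager:manager_test
-```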
-
-Manual Testing
----
-
-If you have Equinix credentials, you can run:
-
-```
-$ bazel build //cloud/shepherd/provider/equinix
-$ bazel build //cloud/shepherd/manager/test_agent
-$ bazel-bin/cloud/shepherd/provider/equinix/equinix_/equinix \
- -bmdb_eat_my_data \
- -equinix_project_id FIXME \
- -equinix_api_username FIXME \
- -equinix_api_key FIXME \
- -agent_executable_path bazel-bin/cloud/shepherd/manager/test_agent/test_agent_/test_agent \
- -agent_endpoint example.com \
- -equinix_ssh_key_label $USER-FIXME \
- -equinix_device_prefix $USER-FIXME- \
- -provisioner_assimilate -provisioner_max_machines 10
-```
-
-Replace $USER-FIXME with `<your username>-test` or some other unique name/prefix.
-
-This will start a single instance of the provisioner accompanied by a single instance of the initializer.
-
-A persistent SSH key will be created in your current working directory.
-
-Prod Deployment
----
-
-TODO(q3k): split server binary into separate provisioner/initializer for initializer scalability, as that's the main bottleneck.
\ No newline at end of file
diff --git a/cloud/shepherd/manager/control_loop.go b/cloud/shepherd/manager/control_loop.go
deleted file mode 100644
index 138b6cb..0000000
--- a/cloud/shepherd/manager/control_loop.go
+++ /dev/null
@@ -1,224 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package manager
-
-import (
- "context"
- "errors"
- "flag"
- "fmt"
- "time"
-
- "github.com/google/uuid"
- "golang.org/x/sync/errgroup"
- "golang.org/x/time/rate"
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/metrics"
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/go/mflags"
-)
-
-// task describes a single server currently being processed by a control loop.
-type task struct {
- // machine is the machine data (including provider and provider ID) retrieved
- // from the BMDB.
- machine *model.MachineProvided
- // work is a machine lock facilitated by BMDB that prevents machines from
- // being processed by multiple workers at the same time.
- work *bmdb.Work
- // backoff is configured from processInfo.defaultBackoff but can be overridden by
- // processMachine to set a different backoff policy for specific failure modes.
- backoff bmdb.Backoff
-}
-
-// controlLoop is implemented by any component which should act as a BMDB-based
-// control loop. Implementing these methods allows the given component to be
-// started using RunControlLoop.
-type controlLoop interface {
- getProcessInfo() processInfo
-
- // getMachines must return the list of machines ready to be processed by the
- // control loop for a given control loop implementation.
- getMachines(ctx context.Context, q *model.Queries, limit int32) ([]model.MachineProvided, error)
- // processMachine will be called within the scope of an active task/BMDB work by
- // the control loop logic.
- processMachine(ctx context.Context, t *task) error
-
- // getControlLoopConfig is implemented by ControlLoopConfig which should be
- // embedded by the control loop component. If not embedded, this method will have
- // to be implemented, too.
- getControlLoopConfig() *ControlLoopConfig
-}
-
-type processInfo struct {
- process model.Process
- processor metrics.Processor
- defaultBackoff bmdb.Backoff
-}
-
-// ControlLoopConfig should be embedded by every component which acts as a
-// control loop. RegisterFlags should be called by the component whenever it is
-// registering its own flags. Check should be called whenever the component is
-// instantiated, after RegisterFlags has been called.
-type ControlLoopConfig struct {
- // DBQueryLimiter limits the rate at which BMDB is queried for servers ready
- // for BMaaS agent initialization. Must be set.
- DBQueryLimiter *rate.Limiter
-
- // Parallelism is how many instances of the Initializer will be allowed to run in
- // parallel against the BMDB. This speeds up the process of starting/restarting
- // agents significantly, as one initializer instance can handle at most one agent
- // (re)starting process.
- //
- // If not set (ie. 0), default to 1. A good starting value for production
- // deployments is 10 or so.
- Parallelism int
-}
-
-func (c *ControlLoopConfig) getControlLoopConfig() *ControlLoopConfig {
- return c
-}
-
-// RegisterFlags should be called on this configuration whenever the embedding
-// component/configuration is registering its own flags. The prefix should be the
-// name of the component.
-func (c *ControlLoopConfig) RegisterFlags(prefix string) {
- mflags.Limiter(&c.DBQueryLimiter, prefix+"_db_query_rate", "250ms,8", "Rate limiting for BMDB queries")
- flag.IntVar(&c.Parallelism, prefix+"_loop_parallelism", 1, "How many initializer instances to run in parallel, ie. how many agents to attempt to (re)start at once")
-}
-
-// Check should be called after RegisterFlags but before the control loop is run.
-// If an error is returned, the control loop cannot start.
-func (c *ControlLoopConfig) Check() error {
- if c.DBQueryLimiter == nil {
- return fmt.Errorf("DBQueryLimiter must be configured")
- }
- if c.Parallelism == 0 {
- c.Parallelism = 1
- }
- return nil
-}
-
-// RunControlLoop runs the given controlLoop implementation against the BMDB. The
-// loop will be run with the parallelism and rate configured by the
-// ControlLoopConfig embedded or otherwise returned by the controlLoop.
-func RunControlLoop(ctx context.Context, conn *bmdb.Connection, loop controlLoop) error {
- clr := &controlLoopRunner{
- loop: loop,
- config: loop.getControlLoopConfig(),
- }
- return clr.run(ctx, conn)
-}
-
-// controlLoopRunner is a configured control loop with an underlying control loop
-// implementation.
-type controlLoopRunner struct {
- config *ControlLoopConfig
- loop controlLoop
-}
-
-// run the control loop(s) (as configured by Parallelism), blocking the current
-// goroutine until the given context expires and all loop runners quit.
-func (r *controlLoopRunner) run(ctx context.Context, conn *bmdb.Connection) error {
- pinfo := r.loop.getProcessInfo()
-
- var eg errgroup.Group
- for j := 0; j < r.config.Parallelism; j += 1 {
- eg.Go(func() error {
- return r.runOne(ctx, conn, &pinfo)
- })
- }
- return eg.Wait()
-}
-
-// run the control loop blocking the current goroutine until the given context
-// expires.
-func (r *controlLoopRunner) runOne(ctx context.Context, conn *bmdb.Connection, pinfo *processInfo) error {
- var err error
-
- // Maintain a BMDB session as long as possible.
- var sess *bmdb.Session
- for {
- if sess == nil {
- sess, err = conn.StartSession(ctx, bmdb.SessionOption{Processor: pinfo.processor})
- if err != nil {
- return fmt.Errorf("could not start BMDB session: %w", err)
- }
- }
- // Inside that session, run the main logic.
- err := r.runInSession(ctx, sess, pinfo)
-
- switch {
- case err == nil:
- case errors.Is(err, ctx.Err()):
- return err
- case errors.Is(err, bmdb.ErrSessionExpired):
- klog.Errorf("Session expired, restarting...")
- sess = nil
- time.Sleep(time.Second)
- default:
- klog.Errorf("Processing failed: %v", err)
- // TODO(q3k): close session
- time.Sleep(time.Second)
- }
- }
-}
-
-// runInSession executes one iteration of the control loop within a BMDB session.
-// This control loop attempts to start or re-start the agent on any machines that
-// need this per the BMDB.
-func (r *controlLoopRunner) runInSession(ctx context.Context, sess *bmdb.Session, pinfo *processInfo) error {
- t, err := r.source(ctx, sess, pinfo)
- if err != nil {
- return fmt.Errorf("could not source machine: %w", err)
- }
- if t == nil {
- return nil
- }
- defer t.work.Cancel(ctx)
-
- if err := r.loop.processMachine(ctx, t); err != nil {
- klog.Errorf("Failed to process machine %s: %v", t.machine.MachineID, err)
- err = t.work.Fail(ctx, &t.backoff, fmt.Sprintf("failed to process: %v", err))
- return err
- }
- return nil
-}
-
-// source returns a BMDB-locked server ready for processing by the control loop,
-// locked by a work item. If both the returned task and error are nil, there are
-// no machines that need to be processed. The returned work item in task
-// _must_ be canceled or finished by the caller.
-func (r *controlLoopRunner) source(ctx context.Context, sess *bmdb.Session, pinfo *processInfo) (*task, error) {
- r.config.DBQueryLimiter.Wait(ctx)
-
- var machine *model.MachineProvided
- work, err := sess.Work(ctx, pinfo.process, func(q *model.Queries) ([]uuid.UUID, error) {
- machines, err := r.loop.getMachines(ctx, q, 1)
- if err != nil {
- return nil, err
- }
- if len(machines) < 1 {
- return nil, bmdb.ErrNothingToDo
- }
- machine = &machines[0]
- return []uuid.UUID{machines[0].MachineID}, nil
- })
-
- if errors.Is(err, bmdb.ErrNothingToDo) {
- return nil, nil
- }
-
- if err != nil {
- return nil, fmt.Errorf("while querying BMDB agent candidates: %w", err)
- }
-
- return &task{
- machine: machine,
- work: work,
- backoff: pinfo.defaultBackoff,
- }, nil
-}
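For readers skimming the deletion: the control loop plumbing above is what the Initializer and Recoverer further down plugged into. A minimal sketch of such a component follows; the noopLoop name and its body are hypothetical, everything else uses symbols from the deleted manager package and assumes the code sits inside it, sharing its imports.

```go
// Hypothetical sketch, not part of the deleted tree: a no-op control loop
// implementation living inside the manager package.
type noopLoop struct {
	ControlLoopConfig // provides getControlLoopConfig, RegisterFlags, Check
	p                 shepherd.Provider
}

func (l *noopLoop) getProcessInfo() processInfo {
	return processInfo{
		process:   model.ProcessShepherdRecovery, // reuses an existing BMDB process for the sketch
		processor: metrics.ProcessorShepherdRecoverer,
		defaultBackoff: bmdb.Backoff{
			Initial:  time.Minute,
			Maximum:  time.Hour,
			Exponent: 1.2,
		},
	}
}

func (l *noopLoop) getMachines(ctx context.Context, q *model.Queries, limit int32) ([]model.MachineProvided, error) {
	return q.GetMachinesForAgentStart(ctx, model.GetMachinesForAgentStartParams{
		Limit:    limit,
		Provider: l.p.Type(),
	})
}

func (l *noopLoop) processMachine(ctx context.Context, t *task) error {
	klog.Infof("Would process machine %s (PID %s) here.", t.machine.MachineID, t.machine.ProviderID)
	return nil
}
```

After RegisterFlags("noop"), flag.Parse() and a successful Check(), RunControlLoop(ctx, conn, loop) would run Parallelism copies of the loop against a bmdb.Connection, with work locking and backoff handled by the code above.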
diff --git a/cloud/shepherd/manager/fake_ssh_client.go b/cloud/shepherd/manager/fake_ssh_client.go
deleted file mode 100644
index 97de575..0000000
--- a/cloud/shepherd/manager/fake_ssh_client.go
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package manager
-
-import (
- "context"
- "crypto/ed25519"
- "crypto/rand"
- "fmt"
- "io"
-
- "golang.org/x/crypto/ssh"
- "google.golang.org/protobuf/proto"
-
- apb "source.monogon.dev/cloud/agent/api"
-)
-
-type fakeSSHClient struct{}
-
-// FakeSSHDial pretends to start an agent, but in reality just responds with
-// what an agent would respond on every execution attempt.
-func FakeSSHDial(ctx context.Context, address string, config *ssh.ClientConfig) (SSHClient, error) {
- return &fakeSSHClient{}, nil
-}
-
-func (f *fakeSSHClient) Execute(ctx context.Context, command string, stdin []byte) (stdout []byte, stderr []byte, err error) {
- var aim apb.TakeoverInit
- if err := proto.Unmarshal(stdin, &aim); err != nil {
- return nil, nil, fmt.Errorf("while unmarshaling TakeoverInit message: %w", err)
- }
-
- // Agent should send back apb.TakeoverResponse on its standard output.
- pub, _, err := ed25519.GenerateKey(rand.Reader)
- if err != nil {
- return nil, nil, fmt.Errorf("while generating agent public key: %w", err)
- }
- arsp := apb.TakeoverResponse{
- Result: &apb.TakeoverResponse_Success{Success: &apb.TakeoverSuccess{
- InitMessage: &aim,
- Key: pub,
- }},
- }
- arspb, err := proto.Marshal(&arsp)
- if err != nil {
- return nil, nil, fmt.Errorf("while marshaling TakeoverResponse message: %w", err)
- }
- return arspb, nil, nil
-}
-
-func (f *fakeSSHClient) UploadExecutable(ctx context.Context, targetPath string, _ io.Reader) error {
- if targetPath != "/fake/path" {
- return fmt.Errorf("unexpected target path in test")
- }
- return nil
-}
-
-func (f *fakeSSHClient) Close() error {
- return nil
-}
diff --git a/cloud/shepherd/manager/initializer.go b/cloud/shepherd/manager/initializer.go
deleted file mode 100644
index d9e3579..0000000
--- a/cloud/shepherd/manager/initializer.go
+++ /dev/null
@@ -1,288 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package manager
-
-import (
- "bytes"
- "context"
- "crypto/ed25519"
- "crypto/x509"
- "encoding/hex"
- "encoding/pem"
- "flag"
- "fmt"
- "io"
- "net"
- "os"
- "strings"
- "time"
-
- "github.com/google/uuid"
- "golang.org/x/crypto/ssh"
- "google.golang.org/protobuf/proto"
- "k8s.io/klog/v2"
-
- apb "source.monogon.dev/cloud/agent/api"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/metrics"
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/cloud/shepherd"
- "source.monogon.dev/osbase/net/sshtakeover"
-)
-
-// InitializerConfig configures how the Initializer will deploy Agents on
-// machines. In CLI scenarios, this should be populated from flags via
-// RegisterFlags.
-type InitializerConfig struct {
- ControlLoopConfig
-
- // Executable is the contents of the agent binary uploaded to and run
- // on the provisioned servers. Must be set.
- Executable []byte
-
- // TargetPath is a filesystem destination path used while uploading the BMaaS
- // agent executable to hosts as part of the initialization process. Must be set.
- TargetPath string
-
- // Endpoint is the address Agent will use to contact the BMaaS
- // infrastructure. Must be set.
- Endpoint string
-
- // EndpointCACertificate is an optional DER-encoded (but not PEM-armored) X509
- // certificate used to populate the trusted CA store of the agent. It should be
- // set to the CA certificate of the endpoint if not using a system-trusted CA
- // certificate.
- EndpointCACertificate []byte
-
- SSHConfig ssh.ClientConfig
- // SSHExecTimeout is the amount of time set aside for executing the agent and
- // getting its output once the SSH connection has been established. Upon timeout,
- // the iteration is declared a failure. Must be set.
- SSHExecTimeout time.Duration
-
- // DialSSH can be set in tests to override how ssh connections are started.
- DialSSH func(ctx context.Context, address string, config *ssh.ClientConfig) (SSHClient, error)
-}
-
-type SSHClient interface {
- Execute(ctx context.Context, command string, stdin []byte) (stdout []byte, stderr []byte, err error)
- UploadExecutable(ctx context.Context, targetPath string, src io.Reader) error
- Close() error
-}
-
-func (ic *InitializerConfig) RegisterFlags() {
- ic.ControlLoopConfig.RegisterFlags("initializer")
-
- flag.Func("agent_executable_path", "Local filesystem path of agent binary to be uploaded", func(val string) error {
- if val == "" {
- return nil
- }
- data, err := os.ReadFile(val)
- if err != nil {
- return fmt.Errorf("could not read: %w", err)
- }
- ic.Executable = data
- return nil
- })
- flag.StringVar(&ic.TargetPath, "agent_target_path", "/root/agent", "Filesystem path where the agent will be uploaded to and run from")
- flag.StringVar(&ic.Endpoint, "agent_endpoint", "", "Address of BMDB Server to which the agent will attempt to connect")
- flag.Func("agent_endpoint_ca_certificate_path", "Path to PEM X509 CA certificate that the agent endpoint is serving with. If not set, the agent will attempt to use system CA certificates to authenticate the endpoint.", func(val string) error {
- if val == "" {
- return nil
- }
- data, err := os.ReadFile(val)
- if err != nil {
- return fmt.Errorf("could not read: %w", err)
- }
- block, _ := pem.Decode(data)
- if block.Type != "CERTIFICATE" {
- return fmt.Errorf("not a certificate")
- }
- _, err = x509.ParseCertificate(block.Bytes)
- if err != nil {
- return fmt.Errorf("invalid certificate: %w", err)
- }
- ic.EndpointCACertificate = block.Bytes
- return nil
- })
- flag.DurationVar(&ic.SSHConfig.Timeout, "agent_ssh_connect_timeout", 2*time.Second, "Timeout for connecting over SSH to a machine")
- flag.DurationVar(&ic.SSHExecTimeout, "agent_ssh_exec_timeout", 60*time.Second, "Timeout for executing the agent over SSH and receiving its output")
-}
-
-func (ic *InitializerConfig) Check() error {
- if err := ic.ControlLoopConfig.Check(); err != nil {
- return err
- }
-
- if len(ic.Executable) == 0 {
- return fmt.Errorf("agent executable not configured")
- }
- if ic.TargetPath == "" {
- return fmt.Errorf("agent target path must be set")
- }
- if ic.Endpoint == "" {
- return fmt.Errorf("agent endpoint must be set")
- }
- if ic.SSHConfig.Timeout == 0 {
- return fmt.Errorf("agent SSH connection timeout must be set")
- }
- if ic.SSHExecTimeout == 0 {
- return fmt.Errorf("agent SSH execution timeout must be set")
- }
-
- return nil
-}
-
-// The Initializer starts the agent on machines that aren't yet running it.
-type Initializer struct {
- InitializerConfig
-
- p shepherd.Provider
-}
-
-// NewInitializer creates an Initializer instance, checking the
-// InitializerConfig for errors.
-func NewInitializer(p shepherd.Provider, ic InitializerConfig) (*Initializer, error) {
- if err := ic.Check(); err != nil {
- return nil, err
- }
-
- return &Initializer{
- InitializerConfig: ic,
-
- p: p,
- }, nil
-}
-
-func (i *Initializer) getProcessInfo() processInfo {
- return processInfo{
- process: model.ProcessShepherdAgentStart,
- defaultBackoff: bmdb.Backoff{
- Initial: 5 * time.Minute,
- Maximum: 4 * time.Hour,
- Exponent: 1.2,
- },
- processor: metrics.ProcessorShepherdInitializer,
- }
-}
-
-func (i *Initializer) getMachines(ctx context.Context, q *model.Queries, limit int32) ([]model.MachineProvided, error) {
- return q.GetMachinesForAgentStart(ctx, model.GetMachinesForAgentStartParams{
- Limit: limit,
- Provider: i.p.Type(),
- })
-}
-
-func (i *Initializer) processMachine(ctx context.Context, t *task) error {
- machine, err := i.p.GetMachine(ctx, shepherd.ProviderID(t.machine.ProviderID))
- if err != nil {
- return fmt.Errorf("while fetching machine %q: %w", t.machine.ProviderID, err)
- }
-
- // Start the agent.
- klog.Infof("Starting agent on machine (ID: %s, PID %s)", t.machine.MachineID, t.machine.ProviderID)
- apk, err := i.startAgent(ctx, machine, t.machine.MachineID)
- if err != nil {
- return fmt.Errorf("while starting the agent: %w", err)
- }
-
- // Agent startup succeeded. Set the appropriate BMDB tag, and release the
- // lock.
- klog.Infof("Setting AgentStarted (ID: %s, PID: %s, Agent public key: %s).", t.machine.MachineID, t.machine.ProviderID, hex.EncodeToString(apk))
- err = t.work.Finish(ctx, func(q *model.Queries) error {
- return q.MachineSetAgentStarted(ctx, model.MachineSetAgentStartedParams{
- MachineID: t.machine.MachineID,
- AgentStartedAt: time.Now(),
- AgentPublicKey: apk,
- })
- })
- if err != nil {
- return fmt.Errorf("while setting AgentStarted tag: %w", err)
- }
- return nil
-}
-
-// startAgent runs the agent executable on the target machine m, returning the
-// agent's public key on success.
-func (i *Initializer) startAgent(ctx context.Context, m shepherd.Machine, mid uuid.UUID) ([]byte, error) {
- // Provide a bound on execution time in case we get stuck after the SSH
- // connection is established.
- sctx, sctxC := context.WithTimeout(ctx, i.SSHExecTimeout)
- defer sctxC()
-
- // Use the machine's IP address
- ni := m.Addr()
- if !ni.IsValid() {
- return nil, fmt.Errorf("machine (machine ID: %s) has no available addresses", mid)
- }
-
- addr := net.JoinHostPort(ni.String(), "22")
- klog.V(1).Infof("Dialing machine (machine ID: %s, addr: %s).", mid, addr)
-
- var conn SSHClient
- var err error
- if i.DialSSH != nil {
- conn, err = i.DialSSH(sctx, addr, &i.SSHConfig)
- } else {
- conn, err = sshtakeover.Dial(sctx, addr, &i.SSHConfig)
- }
- if err != nil {
- return nil, fmt.Errorf("while dialing the machine: %w", err)
- }
- defer conn.Close()
-
- // Upload the agent executable.
-
- klog.Infof("Uploading the agent executable (machine ID: %s, addr: %s).", mid, addr)
- if err := conn.UploadExecutable(sctx, i.TargetPath, bytes.NewReader(i.Executable)); err != nil {
- return nil, fmt.Errorf("while uploading agent executable: %w", err)
- }
- klog.V(1).Infof("Upload successful (machine ID: %s, addr: %s).", mid, addr)
-
- // The initialization protobuf message will be sent to the agent on its
- // standard input.
- imsg := apb.TakeoverInit{
- MachineId: mid.String(),
- BmaasEndpoint: i.Endpoint,
- CaCertificate: i.EndpointCACertificate,
- }
- imsgb, err := proto.Marshal(&imsg)
- if err != nil {
- return nil, fmt.Errorf("while marshaling agent message: %w", err)
- }
-
- // Start the agent and wait for the agent's output to arrive.
- klog.V(1).Infof("Starting the agent executable at path %q (machine ID: %s).", i.TargetPath, mid)
- stdout, stderr, err := conn.Execute(sctx, i.TargetPath, imsgb)
- stderrStr := strings.TrimSpace(string(stderr))
- if stderrStr != "" {
- klog.Warningf("Agent stderr: %q", stderrStr)
- }
- if err != nil {
- return nil, fmt.Errorf("while starting the agent executable: %w", err)
- }
-
- var arsp apb.TakeoverResponse
- if err := proto.Unmarshal(stdout, &arsp); err != nil {
- return nil, fmt.Errorf("agent reply couldn't be unmarshaled: %w", err)
- }
- var successResp *apb.TakeoverSuccess
- switch r := arsp.Result.(type) {
- case *apb.TakeoverResponse_Error:
- return nil, fmt.Errorf("agent returned error: %v", r.Error.Message)
- case *apb.TakeoverResponse_Success:
- successResp = r.Success
- default:
- return nil, fmt.Errorf("agent returned unknown result of type %T", arsp.Result)
- }
- if !proto.Equal(&imsg, successResp.InitMessage) {
- return nil, fmt.Errorf("agent did not send back the init message")
- }
- if len(successResp.Key) != ed25519.PublicKeySize {
- return nil, fmt.Errorf("agent key length mismatch")
- }
- klog.Infof("Started the agent (machine ID: %s, key: %s).", mid, hex.EncodeToString(successResp.Key))
- return successResp.Key, nil
-}
diff --git a/cloud/shepherd/manager/initializer_test.go b/cloud/shepherd/manager/initializer_test.go
deleted file mode 100644
index 3b41044..0000000
--- a/cloud/shepherd/manager/initializer_test.go
+++ /dev/null
@@ -1,100 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package manager
-
-import (
- "context"
- "testing"
- "time"
-
- "golang.org/x/crypto/ssh"
- "golang.org/x/time/rate"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/cloud/lib/component"
-)
-
-// TestInitializerSmokes makes sure the Initializer doesn't go up in flames on
-// the happy path.
-func TestInitializerSmokes(t *testing.T) {
- provider := newDummyProvider(100)
-
- ic := InitializerConfig{
- ControlLoopConfig: ControlLoopConfig{
- DBQueryLimiter: rate.NewLimiter(rate.Every(time.Second), 10),
- },
- Executable: []byte("beep boop i'm a real program"),
- TargetPath: "/fake/path",
- Endpoint: "example.com:1234",
- SSHConfig: ssh.ClientConfig{
- Timeout: time.Second,
- },
- SSHExecTimeout: time.Second,
- DialSSH: provider.FakeSSHDial,
- }
-
- i, err := NewInitializer(provider, ic)
- if err != nil {
- t.Fatalf("Could not create Initializer: %v", err)
- }
-
- b := bmdb.BMDB{
- Config: bmdb.Config{
- Database: component.CockroachConfig{
- InMemory: true,
- },
- ComponentName: "test",
- RuntimeInfo: "test",
- },
- }
- conn, err := b.Open(true)
- if err != nil {
- t.Fatalf("Could not create in-memory BMDB: %v", err)
- }
-
- ctx, ctxC := context.WithCancel(context.Background())
- t.Cleanup(ctxC)
-
- go RunControlLoop(ctx, conn, i)
-
- sess, err := conn.StartSession(ctx)
- if err != nil {
- t.Fatalf("Failed to create BMDB session for verifiaction: %v", err)
- }
-
- // Create 10 provided machines for testing.
- if _, err := provider.createDummyMachines(ctx, sess, 10); err != nil {
- t.Fatalf("Failed to create dummy machines: %v", err)
- }
-
- // Expect to find 0 machines needing start.
- for {
- time.Sleep(100 * time.Millisecond)
-
- var machines []model.MachineProvided
- err = sess.Transact(ctx, func(q *model.Queries) error {
- var err error
- machines, err = q.GetMachinesForAgentStart(ctx, model.GetMachinesForAgentStartParams{
- Limit: 100,
- Provider: provider.Type(),
- })
- return err
- })
- if err != nil {
- t.Fatalf("Failed to run Transaction: %v", err)
- }
- if len(machines) == 0 {
- break
- }
- }
-
- provider.muMachines.RLock()
- defer provider.muMachines.RUnlock()
- for _, m := range provider.machines {
- if !m.agentStarted {
- t.Fatalf("Initializer didn't start agent on machine %q", m.id)
- }
- }
-}
diff --git a/cloud/shepherd/manager/manager.go b/cloud/shepherd/manager/manager.go
deleted file mode 100644
index ae8115e..0000000
--- a/cloud/shepherd/manager/manager.go
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-// Package manager, itself a part of the BMaaS project, provides an implementation
-// governing the Equinix bare metal server lifecycle according to conditions set
-// by the Bare Metal Database (BMDB).
-//
-// The implementation will attempt to provide as many machines as possible and
-// register them with BMDB. This is limited by the count of Hardware
-// Reservations available in the Equinix Metal project used. The BMaaS agent
-// will then be started on these machines as soon as they become ready.
-//
-// The implementation is provided in the form of a library whose interface is
-// exported through the Provisioner and Initializer types, each taking servers
-// through a single stage of their lifecycle.
-//
-// See the included test code for usage examples.
-//
-// The terms "device" and "machine" are used interchangeably throughout this
-// package due to differences in Equinix Metal and BMDB nomenclature.
-package manager
diff --git a/cloud/shepherd/manager/provider_test.go b/cloud/shepherd/manager/provider_test.go
deleted file mode 100644
index 2dc3fa9..0000000
--- a/cloud/shepherd/manager/provider_test.go
+++ /dev/null
@@ -1,190 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package manager
-
-import (
- "context"
- "fmt"
- "net/netip"
- "sync"
-
- "github.com/google/uuid"
- "golang.org/x/crypto/ssh"
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/cloud/shepherd"
-)
-
-type dummyMachine struct {
- id shepherd.ProviderID
- addr netip.Addr
- availability shepherd.Availability
- agentStarted bool
-}
-
-func (dm *dummyMachine) Failed() bool {
- return false
-}
-
-func (dm *dummyMachine) ID() shepherd.ProviderID {
- return dm.id
-}
-
-func (dm *dummyMachine) Addr() netip.Addr {
- return dm.addr
-}
-
-func (dm *dummyMachine) Availability() shepherd.Availability {
- return dm.availability
-}
-
-type dummySSHClient struct {
- SSHClient
- m *dummyMachine
-}
-
-func (dsc *dummySSHClient) Execute(ctx context.Context, command string, stdin []byte) ([]byte, []byte, error) {
- stdout, stderr, err := dsc.SSHClient.Execute(ctx, command, stdin)
- if err != nil {
- return nil, nil, err
- }
-
- dsc.m.agentStarted = true
- return stdout, stderr, nil
-}
-
-func (dp *dummyProvider) FakeSSHDial(ctx context.Context, address string, config *ssh.ClientConfig) (SSHClient, error) {
- conn, err := FakeSSHDial(ctx, address, config)
- if err != nil {
- return nil, err
- }
-
- addrPort := netip.MustParseAddrPort(address)
- uid, err := uuid.FromBytes(addrPort.Addr().AsSlice())
- if err != nil {
- return nil, err
- }
-
- dp.muMachines.RLock()
- m := dp.machines[shepherd.ProviderID(uid.String())]
- dp.muMachines.RUnlock()
- if m == nil {
- return nil, fmt.Errorf("failed finding machine in map")
- }
-
- return &dummySSHClient{conn, m}, nil
-}
-
-func newDummyProvider(cap int) *dummyProvider {
- return &dummyProvider{
- capacity: cap,
- machines: make(map[shepherd.ProviderID]*dummyMachine),
- }
-}
-
-type dummyProvider struct {
- capacity int
- machines map[shepherd.ProviderID]*dummyMachine
- muMachines sync.RWMutex
-}
-
-func (dp *dummyProvider) createDummyMachines(ctx context.Context, session *bmdb.Session, count int) ([]shepherd.Machine, error) {
- dp.muMachines.RLock()
- if len(dp.machines)+count > dp.capacity {
- dp.muMachines.RUnlock()
- return nil, fmt.Errorf("no capacity left")
- }
- dp.muMachines.RUnlock()
-
- var machines []shepherd.Machine
- for i := 0; i < count; i++ {
- uid := uuid.Must(uuid.NewRandom())
- m, err := dp.CreateMachine(ctx, session, shepherd.CreateMachineRequest{
- UnusedMachine: &dummyMachine{
- id: shepherd.ProviderID(uid.String()),
- availability: shepherd.AvailabilityKnownUsed,
- addr: netip.AddrFrom16(uid),
- },
- })
- if err != nil {
- return nil, err
- }
- machines = append(machines, m)
- }
-
- return machines, nil
-}
-
-func (dp *dummyProvider) ListMachines(ctx context.Context) ([]shepherd.Machine, error) {
- var machines []shepherd.Machine
- dp.muMachines.RLock()
- for _, m := range dp.machines {
- machines = append(machines, m)
- }
- dp.muMachines.RUnlock()
-
- unusedMachineCount := dp.capacity - len(machines)
- for i := 0; i < unusedMachineCount; i++ {
- uid := uuid.Must(uuid.NewRandom())
- machines = append(machines, &dummyMachine{
- id: shepherd.ProviderID(uid.String()),
- availability: shepherd.AvailabilityKnownUnused,
- addr: netip.AddrFrom16(uid),
- })
- }
-
- return machines, nil
-}
-
-func (dp *dummyProvider) GetMachine(ctx context.Context, id shepherd.ProviderID) (shepherd.Machine, error) {
- dp.muMachines.RLock()
- defer dp.muMachines.RUnlock()
- for _, m := range dp.machines {
- if m.ID() == id {
- return m, nil
- }
- }
-
- return nil, shepherd.ErrMachineNotFound
-}
-
-func (dp *dummyProvider) CreateMachine(ctx context.Context, session *bmdb.Session, request shepherd.CreateMachineRequest) (shepherd.Machine, error) {
- dm := request.UnusedMachine.(*dummyMachine)
-
- err := session.Transact(ctx, func(q *model.Queries) error {
- // Create a new machine record within BMDB.
- m, err := q.NewMachine(ctx)
- if err != nil {
- return fmt.Errorf("while creating a new machine record in BMDB: %w", err)
- }
-
- p := model.MachineAddProvidedParams{
- MachineID: m.MachineID,
- ProviderID: string(dm.id),
- Provider: dp.Type(),
- }
- klog.Infof("Setting \"provided\" tag (ID: %s, PID: %s, Provider: %s).", p.MachineID, p.ProviderID, p.Provider)
- if err := q.MachineAddProvided(ctx, p); err != nil {
- return fmt.Errorf("while tagging machine active: %w", err)
- }
- return nil
- })
-
- if err != nil {
- return nil, err
- }
-
- dm.availability = shepherd.AvailabilityKnownUsed
- dp.muMachines.Lock()
- dp.machines[dm.id] = dm
- dp.muMachines.Unlock()
-
- return dm, nil
-}
-
-func (dp *dummyProvider) Type() model.Provider {
- return model.ProviderNone
-}
diff --git a/cloud/shepherd/manager/provisioner.go b/cloud/shepherd/manager/provisioner.go
deleted file mode 100644
index e13c63e..0000000
--- a/cloud/shepherd/manager/provisioner.go
+++ /dev/null
@@ -1,424 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package manager
-
-import (
- "context"
- "errors"
- "flag"
- "fmt"
- "net/netip"
- "sort"
- "time"
-
- "github.com/google/uuid"
- "golang.org/x/time/rate"
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/metrics"
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/cloud/shepherd"
- "source.monogon.dev/go/mflags"
-)
-
-// Provisioner implements the server provisioning logic. Provisioning entails
-// bringing all available machines (subject to limits) into BMDB.
-type Provisioner struct {
- ProvisionerConfig
- p shepherd.Provider
-}
-
-// ProvisionerConfig configures the provisioning process.
-type ProvisionerConfig struct {
- // MaxCount is the maximum count of managed servers. No new devices will be
- // created after reaching the limit. No attempt will be made to reduce the
- // server count.
- MaxCount uint
-
- // ReconcileLoopLimiter limits the rate of the main reconciliation loop
- // iterating.
- ReconcileLoopLimiter *rate.Limiter
-
- // DeviceCreationLimiter limits the rate at which devices are created.
- DeviceCreationLimiter *rate.Limiter
-
- // ChunkSize is how many machines the provisioner will attempt to spawn in a
- // single reconciliation loop iteration. Higher numbers allow for faster initial
- // provisioning, but lower numbers decrease potential raciness with other systems
- // and make sure that other parts of the reconciliation logic are run regularly.
- //
- // 20 is a decent starting point.
- ChunkSize uint
-}
-
-func (pc *ProvisionerConfig) RegisterFlags() {
- flag.UintVar(&pc.MaxCount, "provisioner_max_machines", 50, "Limit of machines that the provisioner will attempt to pull into the BMDB. Zero for no limit.")
- mflags.Limiter(&pc.ReconcileLoopLimiter, "provisioner_reconciler_rate", "1m,1", "Rate limiting for main provisioner reconciliation loop")
- mflags.Limiter(&pc.DeviceCreationLimiter, "provisioner_device_creation_rate", "5s,1", "Rate limiting for machine creation")
- flag.UintVar(&pc.ChunkSize, "provisioner_reservation_chunk_size", 20, "How many machines will the provisioner attempt to create in a single reconciliation loop iteration")
-}
-
-func (pc *ProvisionerConfig) check() error {
- // If these are unset, it's probably because someone is using us as a library.
- // Provide error messages useful to code users instead of flag names.
- if pc.ReconcileLoopLimiter == nil {
- return fmt.Errorf("ReconcileLoopLimiter must be set")
- }
- if pc.DeviceCreationLimiter == nil {
- return fmt.Errorf("DeviceCreationLimiter must be set")
- }
- if pc.ChunkSize == 0 {
- return fmt.Errorf("ChunkSize must be set")
- }
- return nil
-}
-
-// NewProvisioner creates a Provisioner instance, checking the ProvisionerConfig
-// for errors.
-func NewProvisioner(p shepherd.Provider, pc ProvisionerConfig) (*Provisioner, error) {
- if err := pc.check(); err != nil {
- return nil, err
- }
-
- return &Provisioner{
- ProvisionerConfig: pc,
- p: p,
- }, nil
-}
-
-// Run the provisioner blocking the current goroutine until the given context
-// expires.
-func (p *Provisioner) Run(ctx context.Context, conn *bmdb.Connection) error {
-
- var sess *bmdb.Session
- var err error
- for {
- if sess == nil {
- sess, err = conn.StartSession(ctx, bmdb.SessionOption{Processor: metrics.ProcessorShepherdProvisioner})
- if err != nil {
- return fmt.Errorf("could not start BMDB session: %w", err)
- }
- }
- err = p.runInSession(ctx, sess)
-
- switch {
- case err == nil:
- case errors.Is(err, ctx.Err()):
- return err
- case errors.Is(err, bmdb.ErrSessionExpired):
- klog.Errorf("Session expired, restarting...")
- sess = nil
- time.Sleep(time.Second)
- default:
- klog.Errorf("Processing failed: %v", err)
- // TODO(q3k): close session
- time.Sleep(time.Second)
- }
- }
-}
-
-type machineListing struct {
- machines []shepherd.Machine
- err error
-}
-
-// runInSession executes one iteration of the provisioner's control loop within a
-// BMDB session. This control loop attempts to bring all available provider
-// capacity into the BMDB as machines, subject to limits.
-func (p *Provisioner) runInSession(ctx context.Context, sess *bmdb.Session) error {
- if err := p.ReconcileLoopLimiter.Wait(ctx); err != nil {
- return err
- }
-
- providerC := make(chan *machineListing, 1)
- bmdbC := make(chan *machineListing, 1)
-
- klog.Infof("Getting provider and bmdb machines...")
-
- // Make sub-context for two parallel operations, and so that we can cancel one
- // immediately if the other fails.
- subCtx, subCtxC := context.WithCancel(ctx)
- defer subCtxC()
-
- go func() {
- machines, err := p.listInProvider(subCtx)
- providerC <- &machineListing{
- machines: machines,
- err: err,
- }
- }()
- go func() {
- machines, err := p.listInBMDB(subCtx, sess)
- bmdbC <- &machineListing{
- machines: machines,
- err: err,
- }
- }()
- var inProvider, inBMDB *machineListing
- for {
- select {
- case inProvider = <-providerC:
- if err := inProvider.err; err != nil {
- return fmt.Errorf("listing provider machines failed: %w", err)
- }
- klog.Infof("Got %d machines in provider.", len(inProvider.machines))
- case inBMDB = <-bmdbC:
- if err := inBMDB.err; err != nil {
- return fmt.Errorf("listing BMDB machines failed: %w", err)
- }
- klog.Infof("Got %d machines in BMDB.", len(inBMDB.machines))
- }
- if inProvider != nil && inBMDB != nil {
- break
- }
- }
-
- subCtxC()
- if err := p.reconcile(ctx, sess, inProvider.machines, inBMDB.machines); err != nil {
- return fmt.Errorf("reconciliation failed: %w", err)
- }
- return nil
-}
-
-// listInProvider returns all machines that the provider thinks we should be
-// managing.
-func (p *Provisioner) listInProvider(ctx context.Context) ([]shepherd.Machine, error) {
- machines, err := p.p.ListMachines(ctx)
- if err != nil {
- return nil, fmt.Errorf("while fetching managed machines: %w", err)
- }
- sort.Slice(machines, func(i, j int) bool {
- return machines[i].ID() < machines[j].ID()
- })
- return machines, nil
-}
-
-type providedMachine struct {
- model.MachineProvided
-}
-
-func (p providedMachine) Failed() bool {
- if !p.MachineProvided.ProviderStatus.Valid {
- // If we don't have any ProviderStatus to check for, return false
- // to trigger the validation inside the reconciler loop.
- return false
- }
- switch p.MachineProvided.ProviderStatus.ProviderStatus {
- case model.ProviderStatusProvisioningFailedPermanent:
- return true
- }
- return false
-}
-
-func (p providedMachine) ID() shepherd.ProviderID {
- return shepherd.ProviderID(p.ProviderID)
-}
-
-func (p providedMachine) Addr() netip.Addr {
- if !p.ProviderIpAddress.Valid {
- return netip.Addr{}
- }
-
- addr, err := netip.ParseAddr(p.ProviderIpAddress.String)
- if err != nil {
- return netip.Addr{}
- }
- return addr
-}
-
-func (p providedMachine) Availability() shepherd.Availability {
- return shepherd.AvailabilityKnownUsed
-}
-
-// listInBMDB returns all the machines that the BMDB thinks we should be managing.
-func (p *Provisioner) listInBMDB(ctx context.Context, sess *bmdb.Session) ([]shepherd.Machine, error) {
- var res []shepherd.Machine
- err := sess.Transact(ctx, func(q *model.Queries) error {
- machines, err := q.GetProvidedMachines(ctx, p.p.Type())
- if err != nil {
- return err
- }
- res = make([]shepherd.Machine, 0, len(machines))
- for _, machine := range machines {
- _, err := uuid.Parse(machine.ProviderID)
- if err != nil {
- klog.Errorf("BMDB machine %s has unparseable provider ID %q", machine.MachineID, machine.ProviderID)
- continue
- }
-
- res = append(res, providedMachine{machine})
- }
- return nil
- })
- if err != nil {
- return nil, err
- }
- sort.Slice(res, func(i, j int) bool {
- return res[i].ID() < res[j].ID()
- })
- return res, nil
-}
-
-// resolvePossiblyUsed checks if the availability is set to possibly used and
-// resolves it to either known used or known unused.
-func (p *Provisioner) resolvePossiblyUsed(machine shepherd.Machine, providedMachines map[shepherd.ProviderID]shepherd.Machine) shepherd.Availability {
- state, id := machine.Availability(), machine.ID()
-
- // Bail out if this isn't possibly used.
- if state != shepherd.AvailabilityPossiblyUsed {
- return state
- }
-
- // If a machine does not have a valid ID, it's always seen as unused.
- if !id.IsValid() {
- return shepherd.AvailabilityKnownUnused
- }
-
- // If the machine is not inside the BMDB, it's seen as unused.
- if _, ok := providedMachines[id]; !ok {
- return shepherd.AvailabilityKnownUnused
- }
-
- return shepherd.AvailabilityKnownUsed
-}
-
-// reconcile takes a list of machines that the provider thinks we should be
-// managing and that the BMDB thinks we should be managing, and tries to make
-// sense of that. First, some checks are performed across the two lists to make
-// sure we haven't dropped anything. Then, additional machines are deployed from
-// hardware reservations as needed.
-func (p *Provisioner) reconcile(ctx context.Context, sess *bmdb.Session, inProvider, bmdbMachines []shepherd.Machine) error {
- klog.Infof("Reconciling...")
-
- bmdb := make(map[shepherd.ProviderID]shepherd.Machine)
- for _, machine := range bmdbMachines {
- // Don't check the availability here as it's hardcoded to be known used.
- bmdb[machine.ID()] = machine
- }
-
- var availableMachines []shepherd.Machine
- provider := make(map[shepherd.ProviderID]shepherd.Machine)
- for _, machine := range inProvider {
- state := p.resolvePossiblyUsed(machine, bmdb)
-
- switch state {
- case shepherd.AvailabilityKnownUnused:
- availableMachines = append(availableMachines, machine)
-
- case shepherd.AvailabilityKnownUsed:
- provider[machine.ID()] = machine
-
- default:
- return fmt.Errorf("machine has invalid availability (ID: %s, Addr: %s): %s", machine.ID(), machine.Addr(), state)
- }
- }
-
- managed := make(map[shepherd.ProviderID]bool)
-
- // We discovered that a machine mostly fails either when provisioning or
- // deprovisioning. An already deployed and running machine can only switch
- // into a failed state after some API interaction happened, e.g. rebooting the
- // machine into recovery mode. If such a machine is returned to the
- // reconciliation loop, it will trigger the badbadnotgood safety switch and
- // return with an error. To reduce the manual intervention required, we
- // filter out these machines on both sides (BMDB and provider).
- isBadBadNotGood := func(known map[shepherd.ProviderID]shepherd.Machine, machine shepherd.Machine) bool {
- // If the machine is missing and not failed, it's a bad case.
- if known[machine.ID()] == nil && !machine.Failed() {
- return true
- }
- return false
- }
-
- // Some desynchronization between the BMDB and Provider point of view might be so
- // bad we shouldn't attempt to do any work, at least not any time soon.
- badbadnotgood := false
-
- // Find any machines supposedly managed by us in the provider, but not in the
- // BMDB.
- for id, machine := range provider {
- if isBadBadNotGood(bmdb, machine) {
- klog.Errorf("Provider machine has no corresponding machine in BMDB. (PID: %s)", id)
- badbadnotgood = true
- continue
- }
-
- managed[id] = true
- }
-
- // Find any machines in the BMDB but not in the provider.
- for id, machine := range bmdb {
- if isBadBadNotGood(provider, machine) {
- klog.Errorf("Provider machine referred to in BMDB but missing in provider. (PID: %s)", id)
- badbadnotgood = true
- }
- }
-
- // Bail if things are weird.
- if badbadnotgood {
- klog.Errorf("Something's very wrong. Bailing early and refusing to do any work.")
- return fmt.Errorf("fatal discrepency between BMDB and provider")
- }
-
- // Summarize all managed machines, which is the intersection of BMDB and
- // Provisioner machines; usually these two sets are equal.
- nmanaged := len(managed)
- klog.Infof("Total managed machines: %d", nmanaged)
-
- if p.MaxCount != 0 && p.MaxCount <= uint(nmanaged) {
- klog.Infof("Not bringing up more machines (at limit of %d machines)", p.MaxCount)
- return nil
- }
-
- limitName := "no limit"
- if p.MaxCount != 0 {
- limitName = fmt.Sprintf("%d", p.MaxCount)
- }
- klog.Infof("Below managed machine limit (%s), bringing up more...", limitName)
-
- if len(availableMachines) == 0 {
- klog.Infof("No more capacity available.")
- return nil
- }
-
- toProvision := availableMachines
- // Limit them to MaxCount, if applicable.
- if p.MaxCount != 0 {
- needed := int(p.MaxCount) - nmanaged
- if len(toProvision) < needed {
- needed = len(toProvision)
- }
- toProvision = toProvision[:needed]
- }
-
- // Limit them to an arbitrary 'chunk' size so that we don't do too many things in
- // a single reconciliation operation.
- if uint(len(toProvision)) > p.ChunkSize {
- toProvision = toProvision[:p.ChunkSize]
- }
-
- if len(toProvision) == 0 {
- klog.Infof("No more unused machines available, or all filtered out.")
- return nil
- }
-
- klog.Infof("Bringing up %d machines...", len(toProvision))
- for _, machine := range toProvision {
- if err := p.DeviceCreationLimiter.Wait(ctx); err != nil {
- return err
- }
-
- nd, err := p.p.CreateMachine(ctx, sess, shepherd.CreateMachineRequest{
- UnusedMachine: machine,
- })
- if err != nil {
- klog.Errorf("while creating new device (ID: %s, Addr: %s, Availability: %s): %v", machine.ID(), machine.Addr(), machine.Availability(), err)
- continue
- }
- klog.Infof("Created new machine with ID: %s", nd.ID())
- }
-
- return nil
-}
diff --git a/cloud/shepherd/manager/provisioner_test.go b/cloud/shepherd/manager/provisioner_test.go
deleted file mode 100644
index eba178c..0000000
--- a/cloud/shepherd/manager/provisioner_test.go
+++ /dev/null
@@ -1,143 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package manager
-
-import (
- "context"
- "testing"
- "time"
-
- "golang.org/x/time/rate"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/cloud/lib/component"
- "source.monogon.dev/cloud/shepherd"
-)
-
-// TestProvisionerSmokes makes sure the Provisioner doesn't go up in flames on
-// the happy path.
-func TestProvisionerSmokes(t *testing.T) {
- pc := ProvisionerConfig{
- MaxCount: 10,
- // We need 3 iterations to provide 10 machines with a chunk size of 4.
- ReconcileLoopLimiter: rate.NewLimiter(rate.Every(10*time.Second), 3),
- DeviceCreationLimiter: rate.NewLimiter(rate.Every(time.Second), 10),
- ChunkSize: 4,
- }
-
- provider := newDummyProvider(100)
-
- p, err := NewProvisioner(provider, pc)
- if err != nil {
- t.Fatalf("Could not create Provisioner: %v", err)
- }
-
- ctx, ctxC := context.WithCancel(context.Background())
- defer ctxC()
-
- b := bmdb.BMDB{
- Config: bmdb.Config{
- Database: component.CockroachConfig{
- InMemory: true,
- },
- ComponentName: "test",
- RuntimeInfo: "test",
- },
- }
- conn, err := b.Open(true)
- if err != nil {
- t.Fatalf("Could not create in-memory BMDB: %v", err)
- }
-
- go p.Run(ctx, conn)
-
- sess, err := conn.StartSession(ctx)
- if err != nil {
- t.Fatalf("Failed to create BMDB session for verification: %v", err)
- }
- for {
- time.Sleep(100 * time.Millisecond)
-
- var provided []model.MachineProvided
- err = sess.Transact(ctx, func(q *model.Queries) error {
- var err error
- provided, err = q.GetProvidedMachines(ctx, provider.Type())
- return err
- })
- if err != nil {
- t.Fatalf("Transact failed: %v", err)
- }
- if len(provided) < 10 {
- continue
- }
- if len(provided) > 10 {
- t.Fatalf("%d machines provided (limit: 10)", len(provided))
- }
-
- for _, mp := range provided {
- provider.muMachines.RLock()
- if provider.machines[shepherd.ProviderID(mp.ProviderID)] == nil {
- t.Fatalf("BMDB machine %q has unknown provider ID %q", mp.MachineID, mp.ProviderID)
- }
- provider.muMachines.RUnlock()
- }
-
- return
- }
-}
-
-// TestProvisioner_resolvePossiblyUsed makes sure the PossiblyUsed availability is
-// resolved correctly.
-func TestProvisioner_resolvePossiblyUsed(t *testing.T) {
- const providedMachineID = "provided-machine"
-
- providedMachines := map[shepherd.ProviderID]shepherd.Machine{
- providedMachineID: nil,
- }
-
- tests := []struct {
- name string
- machineID shepherd.ProviderID
- machineAvailability shepherd.Availability
- wantedAvailability shepherd.Availability
- }{
- {
- name: "skip KnownUsed",
- machineAvailability: shepherd.AvailabilityKnownUsed,
- wantedAvailability: shepherd.AvailabilityKnownUsed,
- },
- {
- name: "skip KnownUnused",
- machineAvailability: shepherd.AvailabilityKnownUnused,
- wantedAvailability: shepherd.AvailabilityKnownUnused,
- },
- {
- name: "invalid ID",
- machineID: shepherd.InvalidProviderID,
- machineAvailability: shepherd.AvailabilityPossiblyUsed,
- wantedAvailability: shepherd.AvailabilityKnownUnused,
- },
- {
- name: "valid ID, not in providedMachines",
- machineID: "unused-machine",
- machineAvailability: shepherd.AvailabilityPossiblyUsed,
- wantedAvailability: shepherd.AvailabilityKnownUnused,
- },
- {
- name: "valid ID, in providedMachines",
- machineID: providedMachineID,
- machineAvailability: shepherd.AvailabilityPossiblyUsed,
- wantedAvailability: shepherd.AvailabilityKnownUsed,
- },
- }
- for _, tt := range tests {
- t.Run(tt.name, func(t *testing.T) {
- p := &Provisioner{}
- if got := p.resolvePossiblyUsed(&dummyMachine{id: tt.machineID, availability: tt.machineAvailability}, providedMachines); got != tt.wantedAvailability {
- t.Fatalf("resolvePossiblyUsed() = %v, want %v", got, tt.wantedAvailability)
- }
- })
- }
-}
diff --git a/cloud/shepherd/manager/recoverer.go b/cloud/shepherd/manager/recoverer.go
deleted file mode 100644
index 8925f17..0000000
--- a/cloud/shepherd/manager/recoverer.go
+++ /dev/null
@@ -1,84 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package manager
-
-import (
- "context"
- "fmt"
- "time"
-
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/metrics"
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/cloud/shepherd"
-)
-
-type RecovererConfig struct {
- ControlLoopConfig
-}
-
-func (r *RecovererConfig) RegisterFlags() {
- r.ControlLoopConfig.RegisterFlags("recoverer")
-}
-
-// The Recoverer reboots machines whose agent has stopped sending heartbeats or
-// has not sent any heartbeats at all.
-type Recoverer struct {
- RecovererConfig
- r shepherd.Recoverer
-}
-
-func NewRecoverer(r shepherd.Recoverer, rc RecovererConfig) (*Recoverer, error) {
- if err := rc.ControlLoopConfig.Check(); err != nil {
- return nil, err
- }
- return &Recoverer{
- RecovererConfig: rc,
- r: r,
- }, nil
-}
-
-func (r *Recoverer) getProcessInfo() processInfo {
- return processInfo{
- process: model.ProcessShepherdRecovery,
- defaultBackoff: bmdb.Backoff{
- Initial: 1 * time.Minute,
- Maximum: 1 * time.Hour,
- Exponent: 1.2,
- },
- processor: metrics.ProcessorShepherdRecoverer,
- }
-}
-
-func (r *Recoverer) getMachines(ctx context.Context, q *model.Queries, limit int32) ([]model.MachineProvided, error) {
- return q.GetMachineForAgentRecovery(ctx, model.GetMachineForAgentRecoveryParams{
- Limit: limit,
- Provider: r.r.Type(),
- })
-}
-
-func (r *Recoverer) processMachine(ctx context.Context, t *task) error {
- klog.Infof("Starting recovery of machine (ID: %s, PID %s)", t.machine.MachineID, t.machine.ProviderID)
-
- if err := r.r.RebootMachine(ctx, shepherd.ProviderID(t.machine.ProviderID)); err != nil {
- return fmt.Errorf("failed to reboot machine: %w", err)
- }
-
- klog.Infof("Removing AgentStarted/AgentHeartbeat (ID: %s, PID: %s)...", t.machine.MachineID, t.machine.ProviderID)
- err := t.work.Finish(ctx, func(q *model.Queries) error {
- if err := q.MachineDeleteAgentStarted(ctx, t.machine.MachineID); err != nil {
- return fmt.Errorf("while deleting AgentStarted: %w", err)
- }
- if err := q.MachineDeleteAgentHeartbeat(ctx, t.machine.MachineID); err != nil {
- return fmt.Errorf("while deleting AgentHeartbeat: %w", err)
- }
- return nil
- })
- if err != nil {
- return fmt.Errorf("while deleting AgentStarted/AgentHeartbeat tags: %w", err)
- }
- return nil
-}
diff --git a/cloud/shepherd/manager/ssh_key_signer.go b/cloud/shepherd/manager/ssh_key_signer.go
deleted file mode 100644
index b28b0b5..0000000
--- a/cloud/shepherd/manager/ssh_key_signer.go
+++ /dev/null
@@ -1,111 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package manager
-
-import (
- "crypto/ed25519"
- "crypto/rand"
- "flag"
- "fmt"
- "os"
- "sync"
-
- "golang.org/x/crypto/ssh"
- "k8s.io/klog/v2"
-)
-
-type SSHKey struct {
- // muKey guards Key.
- muKey sync.Mutex
-
- // SSH key to use when creating machines and then connecting to them. If not
- // provided, it will be automatically loaded from KeyPersistPath, and if that
- // doesn't exist either, it will be first generated and persisted there.
- Key ed25519.PrivateKey
-
- // Path from which the SSH key will be loaded and to which it will be persisted,
- // if Key is not explicitly set. Either KeyPersistPath or Key must be set.
- KeyPersistPath string
-}
-
-func (c *SSHKey) RegisterFlags() {
- flag.StringVar(&c.KeyPersistPath, "ssh_key_path", "", "Local filesystem path to read SSH key from, and save generated key to")
-}
-
-// sshKey returns the SSH key as defined by the Key and KeyPersistPath options,
-// loading/generating/persisting it as necessary.
-func (c *SSHKey) sshKey() (ed25519.PrivateKey, error) {
- c.muKey.Lock()
- defer c.muKey.Unlock()
-
- if c.Key != nil {
- return c.Key, nil
- }
- if c.KeyPersistPath == "" {
- return nil, fmt.Errorf("-ssh_key_path must be set")
- }
-
- data, err := os.ReadFile(c.KeyPersistPath)
- switch {
- case err == nil:
- if len(data) != ed25519.PrivateKeySize {
- return nil, fmt.Errorf("%s is not a valid ed25519 private key", c.KeyPersistPath)
- }
- c.Key = data
- klog.Infof("Loaded SSH key from %s", c.KeyPersistPath)
- return c.Key, nil
- case os.IsNotExist(err):
- if err := c.sshGenerateUnlocked(); err != nil {
- return nil, err
- }
- if err := os.WriteFile(c.KeyPersistPath, c.Key, 0400); err != nil {
- return nil, fmt.Errorf("could not persist key: %w", err)
- }
- return c.Key, nil
- default:
- return nil, fmt.Errorf("could not load peristed key: %w", err)
- }
-}
-
-// PublicKey returns the SSH public key marshaled for use, based on sshKey.
-func (c *SSHKey) PublicKey() (string, error) {
- private, err := c.sshKey()
- if err != nil {
- return "", err
- }
- // Marshal the public key part in OpenSSH authorized_keys format.
- sshpub, err := ssh.NewPublicKey(private.Public())
- if err != nil {
- return "", fmt.Errorf("while building SSH public key: %w", err)
- }
- return string(ssh.MarshalAuthorizedKey(sshpub)), nil
-}
-
-// Signer builds an ssh.Signer (for use in SSH connections) based on sshKey.
-func (c *SSHKey) Signer() (ssh.Signer, error) {
- private, err := c.sshKey()
- if err != nil {
- return nil, err
- }
- // Set up the internal ssh.Signer to be later used to initiate SSH
- // connections with newly provided hosts.
- signer, err := ssh.NewSignerFromKey(private)
- if err != nil {
- return nil, fmt.Errorf("while building SSH signer: %w", err)
- }
- return signer, nil
-}
-
-// sshGenerateUnlocked generates a new private key and saves it into SSHKey.Key.
-func (c *SSHKey) sshGenerateUnlocked() error {
- if c.Key != nil {
- return nil
- }
- _, priv, err := ed25519.GenerateKey(rand.Reader)
- if err != nil {
- return fmt.Errorf("while generating SSH key: %w", err)
- }
- c.Key = priv
- return nil
-}
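A sketch of how such a key could be wired into the Initializer's ssh.ClientConfig, for readers reviewing the removal. This is hypothetical and assumed to live inside the same package, sharing its imports; the "root" user and the insecure host key policy are assumptions, not taken from the source.

```go
// Hypothetical usage sketch, not part of the deleted tree.
var key SSHKey
key.RegisterFlags() // adds -ssh_key_path
flag.Parse()

signer, err := key.Signer() // loads, or generates and persists, the ed25519 key
if err != nil {
	klog.Exitf("Could not build SSH signer: %v", err)
}
clientConfig := ssh.ClientConfig{
	User:            "root",                                   // assumption: provider boots a root rescue image
	Auth:            []ssh.AuthMethod{ssh.PublicKeys(signer)},
	HostKeyCallback: ssh.InsecureIgnoreHostKey(),              // assumption: fresh host keys on every rescue boot
	Timeout:         2 * time.Second,
}
// clientConfig would then populate InitializerConfig.SSHConfig, and
// key.PublicKey() would be registered with the provider so that newly
// provisioned machines accept the connection.
```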
diff --git a/cloud/shepherd/manager/test_agent/BUILD.bazel b/cloud/shepherd/manager/test_agent/BUILD.bazel
deleted file mode 100644
index 7636cdd..0000000
--- a/cloud/shepherd/manager/test_agent/BUILD.bazel
+++ /dev/null
@@ -1,28 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library")
-load("//build/static_binary_tarball:def.bzl", "static_binary_tarball")
-
-go_binary(
- name = "test_agent",
- embed = [":test_agent_lib"],
- visibility = [
- "//cloud/shepherd/manager:__pkg__",
- ],
-)
-
-go_library(
- name = "test_agent_lib",
- srcs = ["main.go"],
- importpath = "source.monogon.dev/cloud/shepherd/manager/test_agent",
- visibility = ["//visibility:private"],
- deps = [
- "//cloud/agent/api",
- "@org_golang_google_protobuf//proto",
- ],
-)
-
-# Used by container_images, forces a static build of the test_agent.
-static_binary_tarball(
- name = "test_agent_layer",
- executable = ":test_agent",
- visibility = ["//visibility:public"],
-)
diff --git a/cloud/shepherd/manager/test_agent/main.go b/cloud/shepherd/manager/test_agent/main.go
deleted file mode 100644
index 98daf1b..0000000
--- a/cloud/shepherd/manager/test_agent/main.go
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-// test_agent is used by the Equinix Metal Manager test code. Its only role
-// is to ensure successful delivery of the BMaaS agent executable to the test
-// hosts, together with its subsequent execution.
-package main
-
-import (
- "crypto/ed25519"
- "crypto/rand"
- "fmt"
- "io"
- "os"
-
- "google.golang.org/protobuf/proto"
-
- apb "source.monogon.dev/cloud/agent/api"
-)
-
-func main() {
- // The agent initialization message will arrive from Shepherd on Agent's
- // standard input.
- aimb, err := io.ReadAll(os.Stdin)
- if err != nil {
- fmt.Fprintf(os.Stderr, "while reading AgentInit message: %v\n", err)
- return
- }
- var aim apb.TakeoverInit
- if err := proto.Unmarshal(aimb, &aim); err != nil {
- fmt.Fprintf(os.Stderr, "while unmarshaling TakeoverInit message: %v\n", err)
- return
- }
-
- // Agent should send back apb.TakeoverResponse on its standard output.
- pub, _, err := ed25519.GenerateKey(rand.Reader)
- if err != nil {
- fmt.Fprintf(os.Stderr, "while generating agent public key: %v\n", err)
- return
- }
- arsp := apb.TakeoverResponse{
- Result: &apb.TakeoverResponse_Success{Success: &apb.TakeoverSuccess{
- InitMessage: &aim,
- Key: pub,
- }},
- }
- arspb, err := proto.Marshal(&arsp)
- if err != nil {
- fmt.Fprintf(os.Stderr, "while marshaling TakeoverResponse message: %v\n", err)
- return
- }
- if _, err := os.Stdout.Write(arspb); err != nil {
- fmt.Fprintf(os.Stderr, "while writing TakeoverResponse message: %v\n", err)
- }
- // The agent must detach and/or terminate after sending back the reply.
- // Failure to do so will leave the session hanging.
-}
diff --git a/cloud/shepherd/mini/BUILD.bazel b/cloud/shepherd/mini/BUILD.bazel
deleted file mode 100644
index 7b1a7ad..0000000
--- a/cloud/shepherd/mini/BUILD.bazel
+++ /dev/null
@@ -1,47 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library")
-load("@rules_oci//oci:defs.bzl", "oci_image")
-load("@rules_pkg//pkg:tar.bzl", "pkg_tar")
-
-go_library(
- name = "mini_lib",
- srcs = [
- "main.go",
- "provider.go",
- "ssh.go",
- ],
- importpath = "source.monogon.dev/cloud/shepherd/mini",
- visibility = ["//visibility:public"],
- deps = [
- "//cloud/bmaas/bmdb",
- "//cloud/bmaas/bmdb/model",
- "//cloud/bmaas/bmdb/webug",
- "//cloud/lib/component",
- "//cloud/shepherd",
- "//cloud/shepherd/manager",
- "@io_k8s_klog_v2//:klog",
- "@org_golang_x_crypto//ssh",
- ],
-)
-
-go_binary(
- name = "mini",
- embed = [":mini_lib"],
- visibility = ["//visibility:public"],
-)
-
-pkg_tar(
- name = "mini_layer",
- srcs = [":mini"],
-)
-
-oci_image(
- name = "mini_image",
- base = "@distroless_base",
- entrypoint = ["/mini"],
- tars = [
- ":mini_layer",
- "//cloud/agent/takeover:takeover_layer",
- ],
- visibility = ["//visibility:public"],
- workdir = "/app",
-)
diff --git a/cloud/shepherd/mini/main.go b/cloud/shepherd/mini/main.go
deleted file mode 100644
index b056d20..0000000
--- a/cloud/shepherd/mini/main.go
+++ /dev/null
@@ -1,195 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package main
-
-import (
- "context"
- "encoding/json"
- "errors"
- "flag"
- "fmt"
- "io"
- "net/http"
- "net/url"
- "os"
- "os/signal"
- "strings"
-
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/cloud/bmaas/bmdb/webug"
- "source.monogon.dev/cloud/lib/component"
- "source.monogon.dev/cloud/shepherd"
- "source.monogon.dev/cloud/shepherd/manager"
-)
-
-type Config struct {
- Component component.ComponentConfig
- BMDB bmdb.BMDB
- WebugConfig webug.Config
-
- InitializerConfig manager.InitializerConfig
- ProvisionerConfig manager.ProvisionerConfig
- RecovererConfig manager.RecovererConfig
-
- SSHConfig sshConfig
- DeviceListSource string
- ProviderType model.Provider
-}
-
-// TODO(q3k): factor this out to BMDB library?
-func runtimeInfo() string {
- hostname, _ := os.Hostname()
- if hostname == "" {
- hostname = "UNKNOWN"
- }
- return fmt.Sprintf("host %s", hostname)
-}
-
-func (c *Config) RegisterFlags() {
- c.Component.RegisterFlags("shepherd")
- c.BMDB.ComponentName = "shepherd-mini"
- c.BMDB.RuntimeInfo = runtimeInfo()
- c.BMDB.Database.RegisterFlags("bmdb")
- c.WebugConfig.RegisterFlags()
-
- c.InitializerConfig.RegisterFlags()
- c.ProvisionerConfig.RegisterFlags()
- c.RecovererConfig.RegisterFlags()
-
- c.SSHConfig.RegisterFlags()
- flag.StringVar(&c.DeviceListSource, "mini_device_list_url", "", "The URL from which to fetch the device list. For local paths, use file:// as the scheme")
- flag.Func("mini_provider", "The provider this mini shepherd should emulate. Supported values are: lumen,equinix", func(s string) error {
- switch s {
- case strings.ToLower(string(model.ProviderEquinix)):
- c.ProviderType = model.ProviderEquinix
- case strings.ToLower(string(model.ProviderLumen)):
- c.ProviderType = model.ProviderLumen
- default:
- return fmt.Errorf("invalid provider name")
- }
- return nil
- })
-}
-
-type deviceList []machine
-
-func (dl deviceList) asMap() map[shepherd.ProviderID]machine {
- mm := make(map[shepherd.ProviderID]machine)
- for _, m := range dl {
- mm[m.ProviderID] = m
- }
- return mm
-}
-
-func fetchDeviceList(s string) (deviceList, error) {
- var r io.Reader
- u, err := url.Parse(s)
- if err != nil {
- return nil, fmt.Errorf("failed parsing device list url: %w", err)
- }
-
- if u.Scheme != "file" {
- resp, err := http.Get(u.String())
- if err != nil {
- return nil, err
- }
- defer resp.Body.Close()
-
- if resp.StatusCode != http.StatusOK {
- return nil, fmt.Errorf("invalid status code: %d != %v", http.StatusOK, resp.StatusCode)
- }
- r = resp.Body
- } else {
- f, err := os.Open(u.Path)
- if err != nil {
- return nil, err
- }
- defer f.Close()
- r = f
- }
-
- var d deviceList
- dec := json.NewDecoder(r)
- dec.DisallowUnknownFields()
- if err := dec.Decode(&d); err != nil {
- return nil, err
- }
-
- klog.Infof("Fetched device list with %d entries", len(d))
-
- return d, nil
-}
-
-func main() {
- var c Config
- c.RegisterFlags()
-
- flag.Parse()
- if flag.NArg() > 0 {
- klog.Exitf("unexpected positional arguments: %v", flag.Args())
- }
-
- registry := c.Component.PrometheusRegistry()
- c.BMDB.EnableMetrics(registry)
-
- ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
- c.Component.StartPrometheus(ctx)
-
- conn, err := c.BMDB.Open(true)
- if err != nil {
- klog.Exitf("Failed to open BMDB connection: %v", err)
- }
-
- err = c.SSHConfig.Configure(&c.InitializerConfig.SSHConfig)
- if err != nil {
- klog.Exitf("Failed to create SSH client: %v", err)
- }
-
- if c.DeviceListSource == "" {
- klog.Exitf("-mini_device_list_source must be set")
- }
-
- list, err := fetchDeviceList(c.DeviceListSource)
- if err != nil {
- klog.Exitf("Failed to fetch device list: %v", err)
- }
-
- mini := &provider{
- providerType: c.ProviderType,
- machines: list.asMap(),
- }
-
- provisioner, err := manager.NewProvisioner(mini, c.ProvisionerConfig)
- if err != nil {
- klog.Exitf("%v", err)
- }
-
- initializer, err := manager.NewInitializer(mini, c.InitializerConfig)
- if err != nil {
- klog.Exitf("%v", err)
- }
-
- go func() {
- err = provisioner.Run(ctx, conn)
- if err != nil {
- klog.Exit(err)
- }
- }()
- go func() {
- err = manager.RunControlLoop(ctx, conn, initializer)
- if err != nil {
- klog.Exit(err)
- }
- }()
- go func() {
- if err := c.WebugConfig.Start(ctx, conn); err != nil && !errors.Is(err, ctx.Err()) {
- klog.Exitf("Failed to start webug: %v", err)
- }
- }()
-
- <-ctx.Done()
-}
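For reference, the device list fetched from -mini_device_list_url is a JSON array decoded into the machine struct defined in provider.go below (JSON fields ID, Addr, Location; unknown fields are rejected). A hypothetical two-entry example follows; the UUIDs and addresses are made up, and UUIDs are used for ID because the provisioner's listInBMDB parses provider IDs as UUIDs.

```json
[
  {"ID": "1b4e28ba-2fa1-11d2-883f-0016d3cca427", "Addr": "192.0.2.10", "Location": "lab-rack-1"},
  {"ID": "6fa459ea-ee8a-3ca4-894e-db77e160355e", "Addr": "192.0.2.11", "Location": "lab-rack-1"}
]
```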
diff --git a/cloud/shepherd/mini/provider.go b/cloud/shepherd/mini/provider.go
deleted file mode 100644
index 1ee55c4..0000000
--- a/cloud/shepherd/mini/provider.go
+++ /dev/null
@@ -1,133 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package main
-
-import (
- "context"
- "database/sql"
- "fmt"
- "net/netip"
-
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/cloud/shepherd"
-)
-
-// provider represents a shepherd.Provider that works entirely on a
-// static device list. It requires a provider type and a device list.
-type provider struct {
- providerType model.Provider
- machines map[shepherd.ProviderID]machine
-}
-
-type machine struct {
- ProviderID shepherd.ProviderID `json:"ID"`
- Address netip.Addr `json:"Addr"`
- Location string `json:"Location"`
-}
-
-func (d machine) Failed() bool {
- return false
-}
-
-func (d machine) ID() shepherd.ProviderID {
- return d.ProviderID
-}
-
-func (d machine) Addr() netip.Addr {
- return d.Address
-}
-
-func (d machine) Availability() shepherd.Availability {
- return shepherd.AvailabilityPossiblyUsed
-}
-
-func (p *provider) ListMachines(ctx context.Context) ([]shepherd.Machine, error) {
- machines := make([]shepherd.Machine, 0, len(p.machines))
- for _, m := range p.machines {
- machines = append(machines, m)
- }
-
- return machines, nil
-}
-
-func (p *provider) GetMachine(ctx context.Context, id shepherd.ProviderID) (shepherd.Machine, error) {
- // If the provided machine is not inside our known machines,
- // bail-out early as this is unsupported.
- if _, ok := p.machines[id]; !ok {
- return nil, fmt.Errorf("unknown provided machine requested")
- }
-
- return p.machines[id], nil
-}
-
-func (p *provider) CreateMachine(ctx context.Context, session *bmdb.Session, request shepherd.CreateMachineRequest) (shepherd.Machine, error) {
- if request.UnusedMachine == nil {
- return nil, fmt.Errorf("parameter UnusedMachine is missing")
- }
-
- //TODO: Do we just trust the implementation to be correct?
- m, ok := request.UnusedMachine.(machine)
- if !ok {
- return nil, fmt.Errorf("invalid type for parameter UnusedMachine")
- }
-
- if err := p.assimilate(ctx, session, m); err != nil {
- klog.Errorf("Failed to provision machine %s: %v", m.ProviderID, err)
- return nil, err
- }
-
- return m, nil
-}
-
-func (p *provider) assimilate(ctx context.Context, sess *bmdb.Session, machine machine) error {
- return sess.Transact(ctx, func(q *model.Queries) error {
- // Create a new machine record within BMDB.
- m, err := q.NewMachine(ctx)
- if err != nil {
- return fmt.Errorf("while creating a new machine record in BMDB: %w", err)
- }
-
- // Link the new machine with the device, and tag it "provided".
- addParams := model.MachineAddProvidedParams{
- MachineID: m.MachineID,
- ProviderID: string(machine.ProviderID),
- Provider: p.providerType,
- }
- klog.Infof("Setting \"provided\" tag (ID: %s, PID: %s, Provider: %s).", addParams.MachineID, addParams.ProviderID, addParams.Provider)
- if err := q.MachineAddProvided(ctx, addParams); err != nil {
- return fmt.Errorf("while tagging machine active: %w", err)
- }
-
- upParams := model.MachineUpdateProviderStatusParams{
- ProviderID: string(machine.ProviderID),
- Provider: p.providerType,
- ProviderIpAddress: sql.NullString{
- String: machine.Address.String(),
- Valid: true,
- },
- ProviderLocation: sql.NullString{
- String: machine.Location,
- Valid: machine.Location != "",
- },
- ProviderStatus: model.NullProviderStatus{
- ProviderStatus: model.ProviderStatusUnknown,
- Valid: true,
- },
- }
-
- klog.Infof("Setting \"provided\" tag status parameter (ID: %s, PID: %s, Provider: %s).", addParams.MachineID, upParams.ProviderID, upParams.Provider)
- if err := q.MachineUpdateProviderStatus(ctx, upParams); err != nil {
- return fmt.Errorf("while setting machine params: %w", err)
- }
-
- return nil
- })
-}
-
-func (p *provider) Type() model.Provider {
- return p.providerType
-}
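For context on the file format the deleted mini shepherd consumed: the device list is a JSON array whose field names follow the json tags on the machine struct above (ID, Addr, Location), decoded strictly via DisallowUnknownFields in fetchDeviceList. Below is a minimal, self-contained sketch of such a list and of decoding it the same way; the IDs, addresses and locations are made-up example values.

package main

import (
    "encoding/json"
    "fmt"
    "net/netip"
    "strings"
)

// entry mirrors the json tags of the deleted machine struct.
type entry struct {
    ID       string     `json:"ID"`
    Addr     netip.Addr `json:"Addr"`
    Location string     `json:"Location"`
}

func main() {
    // Hypothetical device list; the values are examples only.
    const deviceList = `[
        {"ID": "machine-01", "Addr": "203.0.113.10", "Location": "rack-a"},
        {"ID": "machine-02", "Addr": "203.0.113.11", "Location": "rack-b"}
    ]`

    var entries []entry
    dec := json.NewDecoder(strings.NewReader(deviceList))
    // Reject unknown fields, matching the strict decoding in fetchDeviceList.
    dec.DisallowUnknownFields()
    if err := dec.Decode(&entries); err != nil {
        panic(err)
    }
    fmt.Printf("loaded %d devices, first: %s at %s\n", len(entries), entries[0].ID, entries[0].Addr)
}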
diff --git a/cloud/shepherd/mini/ssh.go b/cloud/shepherd/mini/ssh.go
deleted file mode 100644
index 59cefca..0000000
--- a/cloud/shepherd/mini/ssh.go
+++ /dev/null
@@ -1,73 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package main
-
-import (
- "flag"
- "fmt"
-
- "golang.org/x/crypto/ssh"
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/shepherd/manager"
-)
-
-type sshConfig struct {
- User string
- Pass string
- SSHKey manager.SSHKey
-}
-
-func (sc *sshConfig) check() error {
- if sc.User == "" {
- return fmt.Errorf("-ssh_user must be set")
- }
-
- if sc.Pass == "" && sc.SSHKey.KeyPersistPath == "" {
- //TODO: The flag name -ssh_key_path could change, which would make this
- // error very confusing.
- return fmt.Errorf("-ssh_pass or -ssh_key_path must be set")
- }
-
- return nil
-}
-
-func (sc *sshConfig) RegisterFlags() {
- flag.StringVar(&sc.User, "ssh_user", "", "SSH username to log into the machines")
- flag.StringVar(&sc.Pass, "ssh_pass", "", "SSH password to log into the machines")
- sc.SSHKey.RegisterFlags()
-}
-
-func (sc *sshConfig) Configure(config *ssh.ClientConfig) error {
- if err := sc.check(); err != nil {
- return err
- }
-
- config.User = sc.User
-
- switch {
- case sc.Pass != "":
- config.Auth = []ssh.AuthMethod{ssh.Password(sc.Pass)}
- case sc.SSHKey.KeyPersistPath != "":
- signer, err := sc.SSHKey.Signer()
- if err != nil {
- return err
- }
-
- pubKey, err := sc.SSHKey.PublicKey()
- if err != nil {
- return err
- }
-
- klog.Infof("Using ssh key auth with public key: %s", pubKey)
-
- config.Auth = []ssh.AuthMethod{ssh.PublicKeys(signer)}
- }
-
- // Ignore the host key, since it's likely the first time anything logs into
- // this device, and also because there's no way of knowing its fingerprint.
- config.HostKeyCallback = ssh.InsecureIgnoreHostKey()
-
- return nil
-}
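As a usage note, a short sketch of how the deleted sshConfig fits together after RegisterFlags and flag.Parse have run: Configure fills in User, Auth (password if -ssh_pass is set, otherwise the managed key) and an InsecureIgnoreHostKey callback, after which the config can be used to dial a machine. dialMachine and its addr parameter are hypothetical helpers, not part of the original code.

package main

import (
    "fmt"

    "golang.org/x/crypto/ssh"
)

// dialMachine builds an ssh.ClientConfig from the flag-driven sshConfig and
// dials the given address (the mini shepherd took addresses from the device
// list).
func dialMachine(sc *sshConfig, addr string) (*ssh.Client, error) {
    var cc ssh.ClientConfig
    if err := sc.Configure(&cc); err != nil {
        return nil, fmt.Errorf("invalid SSH configuration: %w", err)
    }
    return ssh.Dial("tcp", addr, &cc)
}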
diff --git a/cloud/shepherd/provider/equinix/BUILD.bazel b/cloud/shepherd/provider/equinix/BUILD.bazel
deleted file mode 100644
index e0333b1..0000000
--- a/cloud/shepherd/provider/equinix/BUILD.bazel
+++ /dev/null
@@ -1,77 +0,0 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library", "go_test")
-load("@rules_oci//oci:defs.bzl", "oci_image")
-load("@rules_pkg//pkg:tar.bzl", "pkg_tar")
-
-go_library(
- name = "equinix_lib",
- srcs = [
- "main.go",
- "provider.go",
- "provider_config.go",
- "updater.go",
- ],
- importpath = "source.monogon.dev/cloud/shepherd/provider/equinix",
- visibility = ["//visibility:private"],
- deps = [
- "//cloud/bmaas/bmdb",
- "//cloud/bmaas/bmdb/metrics",
- "//cloud/bmaas/bmdb/model",
- "//cloud/bmaas/bmdb/webug",
- "//cloud/equinix/wrapngo",
- "//cloud/lib/component",
- "//cloud/lib/sinbin",
- "//cloud/shepherd",
- "//cloud/shepherd/manager",
- "@com_github_packethost_packngo//:packngo",
- "@io_k8s_klog_v2//:klog",
- "@org_golang_x_crypto//ssh",
- ],
-)
-
-go_test(
- name = "equinix_test",
- srcs = [
- "fakequinix_test.go",
- "initializer_test.go",
- "provisioner_test.go",
- "recoverer_test.go",
- "updater_test.go",
- ],
- data = [
- "@cockroach",
- ],
- embed = [":equinix_lib"],
- deps = [
- "//cloud/bmaas/bmdb",
- "//cloud/bmaas/bmdb/model",
- "//cloud/lib/component",
- "//cloud/shepherd/manager",
- "@com_github_google_uuid//:uuid",
- "@com_github_packethost_packngo//:packngo",
- "@org_golang_x_crypto//ssh",
- "@org_golang_x_time//rate",
- ],
-)
-
-go_binary(
- name = "equinix",
- embed = [":equinix_lib"],
- visibility = ["//visibility:public"],
-)
-
-pkg_tar(
- name = "equinix_layer",
- srcs = [":equinix"],
-)
-
-oci_image(
- name = "equinix_image",
- base = "@distroless_base",
- entrypoint = ["/equinix"],
- tars = [
- ":equinix_layer",
- "//cloud/agent/takeover:takeover_layer",
- ],
- visibility = ["//visibility:public"],
- workdir = "/app",
-)
diff --git a/cloud/shepherd/provider/equinix/fakequinix_test.go b/cloud/shepherd/provider/equinix/fakequinix_test.go
deleted file mode 100644
index aa0b234..0000000
--- a/cloud/shepherd/provider/equinix/fakequinix_test.go
+++ /dev/null
@@ -1,228 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package main
-
-import (
- "context"
- "fmt"
- "net/http"
- "sync"
-
- "github.com/google/uuid"
- "github.com/packethost/packngo"
-)
-
-// fakequinix implements a wrapngo.Client for testing. It starts out with a
-// number of made up hardware reservations, and allows for creating devices and
-// SSH keys.
-type fakequinix struct {
- mu sync.Mutex
-
- pid string
- devices map[string]*packngo.Device
- reservations map[string]*packngo.HardwareReservation
- sshKeys map[string]*packngo.SSHKey
- reboots map[string]int
-}
-
-func (f *fakequinix) ListOrganizationReservations(ctx context.Context, oid string) ([]packngo.HardwareReservation, error) {
- return nil, fmt.Errorf("not implemented")
-}
-
-// newFakequinix makes a fakequinix with a given fake project ID and number of
-// hardware reservations to create.
-func newFakequinix(pid string, numReservations int) *fakequinix {
- f := fakequinix{
- pid: pid,
- devices: make(map[string]*packngo.Device),
- reservations: make(map[string]*packngo.HardwareReservation),
- sshKeys: make(map[string]*packngo.SSHKey),
- reboots: make(map[string]int),
- }
-
- for i := 0; i < numReservations; i++ {
- uid := uuid.New()
- f.reservations[uid.String()] = &packngo.HardwareReservation{
- ID: uid.String(),
- ShortID: uid.String(),
- Provisionable: true,
- }
- }
-
- return &f
-}
-
-func (f *fakequinix) notFound() error {
- return &packngo.ErrorResponse{
- Response: &http.Response{
- StatusCode: http.StatusNotFound,
- },
- }
-}
-
-func (f *fakequinix) GetDevice(_ context.Context, pid, did string, _ *packngo.ListOptions) (*packngo.Device, error) {
- f.mu.Lock()
- defer f.mu.Unlock()
-
- val := f.devices[did]
- if val == nil {
- return nil, f.notFound()
- }
- return val, nil
-}
-
-func (f *fakequinix) ListDevices(_ context.Context, pid string) ([]packngo.Device, error) {
- f.mu.Lock()
- defer f.mu.Unlock()
-
- if pid != f.pid {
- return nil, nil
- }
- var res []packngo.Device
- for _, dev := range f.devices {
- res = append(res, *dev)
- }
- return res, nil
-}
-
-func (f *fakequinix) UpdateDevice(ctx context.Context, id string, r *packngo.DeviceUpdateRequest) (*packngo.Device, error) {
- return nil, fmt.Errorf("not implemented")
-}
-
-// MoveReservation is not implemented in fakequinix
-func (f *fakequinix) MoveReservation(_ context.Context, hardwareReservationDID, projectID string) (*packngo.HardwareReservation, error) {
- return nil, &packngo.ErrorResponse{
- Response: &http.Response{
- StatusCode: http.StatusNotImplemented,
- },
- }
-}
-
-func (f *fakequinix) DeleteDevice(_ context.Context, id string) error {
- f.mu.Lock()
- defer f.mu.Unlock()
-
- if _, ok := f.devices[id]; !ok {
- return f.notFound()
- }
-
- delete(f.devices, id)
-
- return nil
-}
-
-func (f *fakequinix) CreateDevice(_ context.Context, request *packngo.DeviceCreateRequest) (*packngo.Device, error) {
- f.mu.Lock()
- defer f.mu.Unlock()
-
- rid := request.HardwareReservationID
- res := f.reservations[rid]
- if res == nil {
- return nil, f.notFound()
- }
- if res.Device != nil {
- return nil, f.notFound()
- }
-
- dev := &packngo.Device{
- ID: uuid.New().String(),
- State: "active",
- HardwareReservation: &packngo.HardwareReservation{
- ID: rid,
- },
- Network: []*packngo.IPAddressAssignment{
- {
- IpAddressCommon: packngo.IpAddressCommon{
- Public: true,
- Address: "1.2.3.4",
- },
- },
- },
- Facility: &packngo.Facility{
- Code: "wad",
- },
- Hostname: request.Hostname,
- OS: &packngo.OS{
- Name: request.OS,
- Slug: request.OS,
- },
- }
- res.Device = dev
- res.Provisionable = false
-
- f.devices[dev.ID] = dev
- return dev, nil
-}
-
-func (f *fakequinix) ListReservations(_ context.Context, pid string) ([]packngo.HardwareReservation, error) {
- f.mu.Lock()
- defer f.mu.Unlock()
-
- var res []packngo.HardwareReservation
- for _, r := range f.reservations {
- res = append(res, *r)
- }
-
- return res, nil
-}
-
-func (f *fakequinix) ListSSHKeys(_ context.Context) ([]packngo.SSHKey, error) {
- f.mu.Lock()
- defer f.mu.Unlock()
-
- var res []packngo.SSHKey
- for _, key := range f.sshKeys {
- res = append(res, *key)
- }
-
- return res, nil
-}
-
-func (f *fakequinix) CreateSSHKey(_ context.Context, req *packngo.SSHKeyCreateRequest) (*packngo.SSHKey, error) {
- f.mu.Lock()
- defer f.mu.Unlock()
-
- for _, k := range f.sshKeys {
- if k.Key == req.Key {
- return nil, f.notFound()
- }
- if k.Label == req.Label {
- return nil, f.notFound()
- }
- }
-
- uid := uuid.New().String()
- f.sshKeys[uid] = &packngo.SSHKey{
- ID: uid,
- Label: req.Label,
- Key: req.Key,
- }
-
- return f.sshKeys[uid], nil
-}
-
-func (f *fakequinix) UpdateSSHKey(_ context.Context, kid string, req *packngo.SSHKeyUpdateRequest) (*packngo.SSHKey, error) {
- f.mu.Lock()
- defer f.mu.Unlock()
-
- key := f.sshKeys[kid]
- if key == nil {
- return nil, f.notFound()
- }
- key.Key = *req.Key
-
- return key, nil
-}
-
-func (f *fakequinix) RebootDevice(_ context.Context, did string) error {
- f.mu.Lock()
- defer f.mu.Unlock()
-
- f.reboots[did]++
-
- return nil
-}
-
-func (f *fakequinix) Close() {
-}
diff --git a/cloud/shepherd/provider/equinix/initializer_test.go b/cloud/shepherd/provider/equinix/initializer_test.go
deleted file mode 100644
index fc34a10..0000000
--- a/cloud/shepherd/provider/equinix/initializer_test.go
+++ /dev/null
@@ -1,178 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package main
-
-import (
- "context"
- "crypto/ed25519"
- "crypto/rand"
- "fmt"
- "testing"
- "time"
-
- "github.com/packethost/packngo"
- "golang.org/x/crypto/ssh"
- "golang.org/x/time/rate"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/cloud/lib/component"
- "source.monogon.dev/cloud/shepherd/manager"
-)
-
-type initializerDut struct {
- f *fakequinix
- i *manager.Initializer
- bmdb *bmdb.Connection
- ctx context.Context
- provider *equinixProvider
-}
-
-func newInitializerDut(t *testing.T) *initializerDut {
- t.Helper()
-
- sc := providerConfig{
- ProjectId: "noproject",
- KeyLabel: "somekey",
- DevicePrefix: "test-",
- }
- _, key, _ := ed25519.GenerateKey(rand.Reader)
- k := manager.SSHKey{
- Key: key,
- }
-
- f := newFakequinix(sc.ProjectId, 100)
- provider, err := sc.New(&k, f)
- if err != nil {
- t.Fatalf("Could not create Provider: %v", err)
- }
-
- ic := manager.InitializerConfig{
- ControlLoopConfig: manager.ControlLoopConfig{
- DBQueryLimiter: rate.NewLimiter(rate.Every(time.Second), 10),
- },
- Executable: []byte("beep boop i'm a real program"),
- TargetPath: "/fake/path",
- Endpoint: "example.com:1234",
- SSHConfig: ssh.ClientConfig{
- Timeout: time.Second,
- },
- SSHExecTimeout: time.Second,
- DialSSH: manager.FakeSSHDial,
- }
-
- i, err := manager.NewInitializer(provider, ic)
- if err != nil {
- t.Fatalf("Could not create Initializer: %v", err)
- }
-
- b := bmdb.BMDB{
- Config: bmdb.Config{
- Database: component.CockroachConfig{
- InMemory: true,
- },
- ComponentName: "test",
- RuntimeInfo: "test",
- },
- }
- conn, err := b.Open(true)
- if err != nil {
- t.Fatalf("Could not create in-memory BMDB: %v", err)
- }
-
- ctx, ctxC := context.WithCancel(context.Background())
- t.Cleanup(ctxC)
-
- if err := provider.SSHEquinixEnsure(ctx); err != nil {
- t.Fatalf("Failed to ensure SSH key: %v", err)
- }
- go manager.RunControlLoop(ctx, conn, i)
-
- return &initializerDut{
- f: f,
- i: i,
- bmdb: conn,
- ctx: ctx,
- provider: provider,
- }
-}
-
-// TestInitializerSmokes makes sure the Initializer doesn't go up in flames on
-// the happy path.
-func TestInitializerSmokes(t *testing.T) {
- dut := newInitializerDut(t)
- f := dut.f
- ctx := dut.ctx
- conn := dut.bmdb
-
- reservations, _ := f.ListReservations(ctx, f.pid)
- kid, err := dut.provider.sshEquinixId(ctx)
- if err != nil {
- t.Fatalf("Failed to retrieve equinix key ID: %v", err)
- }
- sess, err := conn.StartSession(ctx)
- if err != nil {
- t.Fatalf("Failed to create BMDB session for verifiaction: %v", err)
- }
-
- // Create 10 provided machines for testing.
- for i := 0; i < 10; i++ {
- res := reservations[i]
- dev, _ := f.CreateDevice(ctx, &packngo.DeviceCreateRequest{
- Hostname: fmt.Sprintf("test-%d", i),
- OS: "fake",
- ProjectID: f.pid,
- HardwareReservationID: res.ID,
- ProjectSSHKeys: []string{kid},
- })
- f.mu.Lock()
- f.devices[dev.ID].Network = []*packngo.IPAddressAssignment{
- {
- IpAddressCommon: packngo.IpAddressCommon{
- ID: "fake",
- Address: "1.2.3.4",
- Management: true,
- AddressFamily: 4,
- Public: true,
- },
- },
- }
- f.mu.Unlock()
- err = sess.Transact(ctx, func(q *model.Queries) error {
- machine, err := q.NewMachine(ctx)
- if err != nil {
- return err
- }
- return q.MachineAddProvided(ctx, model.MachineAddProvidedParams{
- MachineID: machine.MachineID,
- Provider: model.ProviderEquinix,
- ProviderID: dev.ID,
- })
- })
- if err != nil {
- t.Fatalf("Failed to create BMDB machine: %v", err)
- }
- }
-
- // Wait until no machines need agent start.
- for {
- time.Sleep(100 * time.Millisecond)
-
- var machines []model.MachineProvided
- err = sess.Transact(ctx, func(q *model.Queries) error {
- var err error
- machines, err = q.GetMachinesForAgentStart(ctx, model.GetMachinesForAgentStartParams{
- Limit: 100,
- Provider: model.ProviderEquinix,
- })
- return err
- })
- if err != nil {
- t.Fatalf("Failed to run Transaction: %v", err)
- }
- if len(machines) == 0 {
- break
- }
- }
-}
diff --git a/cloud/shepherd/provider/equinix/main.go b/cloud/shepherd/provider/equinix/main.go
deleted file mode 100644
index 9903b04..0000000
--- a/cloud/shepherd/provider/equinix/main.go
+++ /dev/null
@@ -1,159 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package main
-
-import (
- "context"
- "errors"
- "flag"
- "fmt"
- "os"
- "os/signal"
-
- "golang.org/x/crypto/ssh"
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/webug"
- "source.monogon.dev/cloud/equinix/wrapngo"
- "source.monogon.dev/cloud/lib/component"
- "source.monogon.dev/cloud/shepherd/manager"
-)
-
-type Config struct {
- Component component.ComponentConfig
- BMDB bmdb.BMDB
- WebugConfig webug.Config
-
- SSHKey manager.SSHKey
- InitializerConfig manager.InitializerConfig
- ProvisionerConfig manager.ProvisionerConfig
- RecovererConfig manager.RecovererConfig
-
- API wrapngo.Opts
- Provider providerConfig
- UpdaterConfig UpdaterConfig
-}
-
-// TODO(q3k): factor this out to BMDB library?
-func runtimeInfo() string {
- hostname, _ := os.Hostname()
- if hostname == "" {
- hostname = "UNKNOWN"
- }
- return fmt.Sprintf("host %s", hostname)
-}
-
-func (c *Config) RegisterFlags() {
- c.Component.RegisterFlags("shepherd")
- c.BMDB.ComponentName = "shepherd-equinix"
- c.BMDB.RuntimeInfo = runtimeInfo()
- c.BMDB.Database.RegisterFlags("bmdb")
- c.WebugConfig.RegisterFlags()
-
- c.SSHKey.RegisterFlags()
- c.InitializerConfig.RegisterFlags()
- c.ProvisionerConfig.RegisterFlags()
- c.RecovererConfig.RegisterFlags()
-
- c.API.RegisterFlags()
- c.Provider.RegisterFlags()
- c.UpdaterConfig.RegisterFlags()
-}
-
-func main() {
- var c Config
- c.RegisterFlags()
-
- flag.Parse()
- if flag.NArg() > 0 {
- klog.Exitf("unexpected positional arguments: %v", flag.Args())
- }
-
- registry := c.Component.PrometheusRegistry()
- c.BMDB.EnableMetrics(registry)
-
- ctx, _ := signal.NotifyContext(context.Background(), os.Interrupt)
- c.Component.StartPrometheus(ctx)
-
- if c.API.APIKey == "" || c.API.User == "" {
- klog.Exitf("-equinix_api_username and -equinix_api_key must be set")
- }
- c.API.MetricsRegistry = registry
- api := wrapngo.New(&c.API)
-
- provider, err := c.Provider.New(&c.SSHKey, api)
- if err != nil {
- klog.Exitf("%v", err)
- }
-
- sshSigner, err := c.SSHKey.Signer()
- if err != nil {
- klog.Exitf("%v", err)
- }
-
- c.InitializerConfig.SSHConfig.Auth = []ssh.AuthMethod{ssh.PublicKeys(sshSigner)}
- // Equinix OS installations always use root.
- c.InitializerConfig.SSHConfig.User = "root"
- // Ignore the host key, since it's likely the first time anything logs into
- // this device, and also because there's no way of knowing its fingerprint.
- c.InitializerConfig.SSHConfig.HostKeyCallback = ssh.InsecureIgnoreHostKey()
-
- provisioner, err := manager.NewProvisioner(provider, c.ProvisionerConfig)
- if err != nil {
- klog.Exitf("%v", err)
- }
-
- initializer, err := manager.NewInitializer(provider, c.InitializerConfig)
- if err != nil {
- klog.Exitf("%v", err)
- }
-
- recoverer, err := manager.NewRecoverer(provider, c.RecovererConfig)
- if err != nil {
- klog.Exitf("%v", err)
- }
-
- updater, err := c.UpdaterConfig.New(api)
- if err != nil {
- klog.Exitf("%v", err)
- }
-
- conn, err := c.BMDB.Open(true)
- if err != nil {
- klog.Exitf("Failed to open BMDB connection: %v", err)
- }
-
- go func() {
- err = provisioner.Run(ctx, conn)
- if err != nil {
- klog.Exit(err)
- }
- }()
- go func() {
- err = manager.RunControlLoop(ctx, conn, initializer)
- if err != nil {
- klog.Exit(err)
- }
- }()
- go func() {
- err = manager.RunControlLoop(ctx, conn, recoverer)
- if err != nil {
- klog.Exit(err)
- }
- }()
- go func() {
- err = updater.Run(ctx, conn)
- if err != nil {
- klog.Exit(err)
- }
- }()
- go func() {
- if err := c.WebugConfig.Start(ctx, conn); err != nil && !errors.Is(err, ctx.Err()) {
- klog.Exitf("Failed to start webug: %v", err)
- }
- }()
-
- <-ctx.Done()
-}
diff --git a/cloud/shepherd/provider/equinix/provider.go b/cloud/shepherd/provider/equinix/provider.go
deleted file mode 100644
index 16dedf0..0000000
--- a/cloud/shepherd/provider/equinix/provider.go
+++ /dev/null
@@ -1,381 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package main
-
-import (
- "context"
- "errors"
- "fmt"
- "net/netip"
- "slices"
- "strings"
- "time"
-
- "github.com/packethost/packngo"
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/cloud/equinix/wrapngo"
- "source.monogon.dev/cloud/lib/sinbin"
- "source.monogon.dev/cloud/shepherd"
- "source.monogon.dev/cloud/shepherd/manager"
-)
-
-type equinixProvider struct {
- config *providerConfig
- api wrapngo.Client
- sshKey *manager.SSHKey
-
- // badReservations is a holiday resort for Equinix hardware reservations which
- // failed to be provisioned for some reason or another. We keep a list of them in
- // memory just so that we don't repeatedly try to provision the same known bad
- // machines.
- badReservations sinbin.Sinbin[string]
-
- reservationDeadline time.Time
- reservationCache []packngo.HardwareReservation
-}
-
-func (ep *equinixProvider) RebootMachine(ctx context.Context, id shepherd.ProviderID) error {
- if err := ep.api.RebootDevice(ctx, string(id)); err != nil {
- return fmt.Errorf("failed to reboot device: %w", err)
- }
-
- // TODO(issue/215): replace this
- // This is required as Equinix doesn't reboot the machines synchronously
- // during the API call.
- select {
- case <-time.After(time.Duration(ep.config.RebootWaitSeconds) * time.Second):
- case <-ctx.Done():
- return fmt.Errorf("while waiting for reboot: %w", ctx.Err())
- }
- return nil
-}
-
-func (ep *equinixProvider) ReinstallMachine(ctx context.Context, id shepherd.ProviderID) error {
- return shepherd.ErrNotImplemented
-}
-
-func (ep *equinixProvider) GetMachine(ctx context.Context, id shepherd.ProviderID) (shepherd.Machine, error) {
- machines, err := ep.ListMachines(ctx)
- if err != nil {
- return nil, err
- }
-
- for _, machine := range machines {
- if machine.ID() == id {
- return machine, nil
- }
- }
-
- return nil, shepherd.ErrMachineNotFound
-}
-
-func (ep *equinixProvider) ListMachines(ctx context.Context) ([]shepherd.Machine, error) {
- if ep.reservationDeadline.Before(time.Now()) {
- reservations, err := ep.listReservations(ctx)
- if err != nil {
- return nil, err
- }
- ep.reservationCache = reservations
- ep.reservationDeadline = time.Now().Add(ep.config.ReservationCacheTimeout)
- }
-
- devices, err := ep.managedDevices(ctx)
- if err != nil {
- return nil, err
- }
-
- machines := make([]shepherd.Machine, 0, len(ep.reservationCache)+len(devices))
- for _, device := range devices {
- machines = append(machines, &machine{device})
- }
-
- for _, res := range ep.reservationCache {
- machines = append(machines, reservation{res})
- }
-
- return machines, nil
-}
-
-func (ep *equinixProvider) CreateMachine(ctx context.Context, session *bmdb.Session, request shepherd.CreateMachineRequest) (shepherd.Machine, error) {
- if request.UnusedMachine == nil {
- return nil, fmt.Errorf("parameter UnusedMachine is missing")
- }
-
- //TODO: Do we just trust the implementation to be correct?
- res, ok := request.UnusedMachine.(reservation)
- if !ok {
- return nil, fmt.Errorf("invalid type for parameter UnusedMachine")
- }
-
- d, err := ep.provision(ctx, session, res.HardwareReservation)
- if err != nil {
- klog.Errorf("Failed to provision reservation %s: %v", res.HardwareReservation.ID, err)
- until := time.Now().Add(time.Hour)
- klog.Errorf("Adding hardware reservation %s to sinbin until %s", res.HardwareReservation.ID, until)
- ep.badReservations.Add(res.HardwareReservation.ID, until)
- return nil, err
- }
-
- return &machine{*d}, nil
-}
-
-func (ep *equinixProvider) Type() model.Provider {
- return model.ProviderEquinix
-}
-
-type reservation struct {
- packngo.HardwareReservation
-}
-
-func (e reservation) Failed() bool {
- return false
-}
-
-func (e reservation) ID() shepherd.ProviderID {
- return shepherd.InvalidProviderID
-}
-
-func (e reservation) Addr() netip.Addr {
- return netip.Addr{}
-}
-
-func (e reservation) Availability() shepherd.Availability {
- return shepherd.AvailabilityKnownUnused
-}
-
-type machine struct {
- packngo.Device
-}
-
-func (e *machine) Failed() bool {
- return e.State == "failed"
-}
-
-func (e *machine) ID() shepherd.ProviderID {
- return shepherd.ProviderID(e.Device.ID)
-}
-
-func (e *machine) Addr() netip.Addr {
- ni := e.GetNetworkInfo()
-
- var addr string
- if ni.PublicIPv4 != "" {
- addr = ni.PublicIPv4
- } else if ni.PublicIPv6 != "" {
- addr = ni.PublicIPv6
- } else {
- klog.Errorf("missing address for machine: %v", e.ID())
- return netip.Addr{}
- }
-
- a, err := netip.ParseAddr(addr)
- if err != nil {
- klog.Errorf("failed parsing address %q: %v", addr, err)
- return netip.Addr{}
- }
-
- return a
-}
-
-func (e *machine) Availability() shepherd.Availability {
- return shepherd.AvailabilityKnownUsed
-}
-
- // listReservations returns hardware reservations that are provisionable, not in use, and not penalized.
-func (ep *equinixProvider) listReservations(ctx context.Context) ([]packngo.HardwareReservation, error) {
- klog.Infof("Retrieving hardware reservations, this will take a while...")
- reservations, err := ep.api.ListReservations(ctx, ep.config.ProjectId)
- if err != nil {
- return nil, fmt.Errorf("failed to list reservations: %w", err)
- }
-
- var available []packngo.HardwareReservation
- var inUse, notProvisionable, penalized int
- for _, reservation := range reservations {
- if reservation.Device != nil {
- inUse++
- continue
- }
- if !reservation.Provisionable {
- notProvisionable++
- continue
- }
- if ep.badReservations.Penalized(reservation.ID) {
- penalized++
- continue
- }
- available = append(available, reservation)
- }
- klog.Infof("Retrieved hardware reservations: %d (total), %d (available), %d (in use), %d (not provisionable), %d (penalized)", len(reservations), len(available), inUse, notProvisionable, penalized)
-
- return available, nil
-}
-
-// provision attempts to create a device within Equinix using given Hardware
-// Reservation rsv. The resulting device is registered with BMDB, and tagged as
-// "provided" in the process.
-func (ep *equinixProvider) provision(ctx context.Context, sess *bmdb.Session, rsv packngo.HardwareReservation) (*packngo.Device, error) {
- klog.Infof("Creating a new device using reservation ID %s.", rsv.ID)
- hostname := ep.config.DevicePrefix + rsv.ID[:18]
- kid, err := ep.sshEquinixId(ctx)
- if err != nil {
- return nil, err
- }
- req := &packngo.DeviceCreateRequest{
- Hostname: hostname,
- OS: ep.config.OS,
- Plan: rsv.Plan.Slug,
- ProjectID: ep.config.ProjectId,
- HardwareReservationID: rsv.ID,
- ProjectSSHKeys: []string{kid},
- }
- if ep.config.UseProjectKeys {
- klog.Warningf("INSECURE: Machines will be created with ALL PROJECT SSH KEYS!")
- req.ProjectSSHKeys = nil
- }
-
- nd, err := ep.api.CreateDevice(ctx, req)
- if err != nil {
- return nil, fmt.Errorf("while creating new device within Equinix: %w", err)
- }
- klog.Infof("Created a new device within Equinix (RID: %s, PID: %s, HOST: %s)", rsv.ID, nd.ID, hostname)
-
- ep.reservationCache = slices.DeleteFunc(ep.reservationCache, func(v packngo.HardwareReservation) bool {
- return rsv.ID == v.ID
- })
-
- err = ep.assimilate(ctx, sess, nd.ID)
- if err != nil {
- // TODO(serge@monogon.tech) at this point the device at Equinix isn't
- // matched by a BMDB record. Schedule device deletion or make sure this
- // case is being handled elsewhere.
- return nil, err
- }
- return nd, nil
-}
-
- // assimilate registers an already existing Equinix device in the BMDB,
- // tagging it "provided".
-func (ep *equinixProvider) assimilate(ctx context.Context, sess *bmdb.Session, deviceID string) error {
- return sess.Transact(ctx, func(q *model.Queries) error {
- // Create a new machine record within BMDB.
- m, err := q.NewMachine(ctx)
- if err != nil {
- return fmt.Errorf("while creating a new machine record in BMDB: %w", err)
- }
-
- // Link the new machine with the Equinix device, and tag it "provided".
- p := model.MachineAddProvidedParams{
- MachineID: m.MachineID,
- ProviderID: deviceID,
- Provider: model.ProviderEquinix,
- }
- klog.Infof("Setting \"provided\" tag (ID: %s, PID: %s, Provider: %s).", p.MachineID, p.ProviderID, p.Provider)
- if err := q.MachineAddProvided(ctx, p); err != nil {
- return fmt.Errorf("while tagging machine active: %w", err)
- }
- return nil
- })
-}
-
- // sshEquinix looks up the Equinix key matching providerConfig.KeyLabel,
-// returning its packngo.SSHKey instance.
-func (ep *equinixProvider) sshEquinix(ctx context.Context) (*packngo.SSHKey, error) {
- ks, err := ep.api.ListSSHKeys(ctx)
- if err != nil {
- return nil, fmt.Errorf("while listing SSH keys: %w", err)
- }
-
- for _, k := range ks {
- if k.Label == ep.config.KeyLabel {
- return &k, nil
- }
- }
- return nil, ErrNoSuchKey
-}
-
-// sshEquinixId looks up the Equinix key identified by providerConfig.KeyLabel,
-// returning its Equinix-assigned UUID.
-func (ep *equinixProvider) sshEquinixId(ctx context.Context) (string, error) {
- k, err := ep.sshEquinix(ctx)
- if err != nil {
- return "", err
- }
- return k.ID, nil
-}
-
-// sshEquinixUpdate makes sure the existing SSH key registered with Equinix
- // matches the locally managed key.
-func (ep *equinixProvider) sshEquinixUpdate(ctx context.Context, kid string) error {
- pub, err := ep.sshKey.PublicKey()
- if err != nil {
- return err
- }
- _, err = ep.api.UpdateSSHKey(ctx, kid, &packngo.SSHKeyUpdateRequest{
- Key: &pub,
- })
- if err != nil {
- return fmt.Errorf("while updating the SSH key: %w", err)
- }
- return nil
-}
-
- // sshEquinixUpload registers the locally managed public key with Equinix.
-func (ep *equinixProvider) sshEquinixUpload(ctx context.Context) error {
- pub, err := ep.sshKey.PublicKey()
- if err != nil {
- return fmt.Errorf("while generating public key: %w", err)
- }
- _, err = ep.api.CreateSSHKey(ctx, &packngo.SSHKeyCreateRequest{
- Label: ep.config.KeyLabel,
- Key: pub,
- ProjectID: ep.config.ProjectId,
- })
- if err != nil {
- return fmt.Errorf("while creating an SSH key: %w", err)
- }
- return nil
-}
-
-// SSHEquinixEnsure initializes the locally managed SSH key (from a persistence
-// path or explicitly set key) and updates or uploads it to Equinix. The key is
- // generated as needed.
-func (ep *equinixProvider) SSHEquinixEnsure(ctx context.Context) error {
- k, err := ep.sshEquinix(ctx)
- switch {
- case errors.Is(err, ErrNoSuchKey):
- if err := ep.sshEquinixUpload(ctx); err != nil {
- return fmt.Errorf("while uploading key: %w", err)
- }
- return nil
- case err == nil:
- if err := ep.sshEquinixUpdate(ctx, k.ID); err != nil {
- return fmt.Errorf("while updating key: %w", err)
- }
- return nil
- default:
- return err
- }
-}
-
-// managedDevices provides a map of device provider IDs to matching
-// packngo.Device instances. It calls Equinix API's ListDevices. The returned
- // devices are filtered according to DevicePrefix from providerConfig. The
-// returned error value, if not nil, will originate in wrapngo.
-func (ep *equinixProvider) managedDevices(ctx context.Context) (map[string]packngo.Device, error) {
- ds, err := ep.api.ListDevices(ctx, ep.config.ProjectId)
- if err != nil {
- return nil, err
- }
- dm := map[string]packngo.Device{}
- for _, d := range ds {
- if strings.HasPrefix(d.Hostname, ep.config.DevicePrefix) {
- dm[d.ID] = d
- }
- }
- return dm, nil
-}
diff --git a/cloud/shepherd/provider/equinix/provider_config.go b/cloud/shepherd/provider/equinix/provider_config.go
deleted file mode 100644
index 146b010..0000000
--- a/cloud/shepherd/provider/equinix/provider_config.go
+++ /dev/null
@@ -1,100 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package main
-
-import (
- "errors"
- "flag"
- "fmt"
- "strings"
- "time"
-
- "source.monogon.dev/cloud/equinix/wrapngo"
- "source.monogon.dev/cloud/shepherd/manager"
-)
-
-var (
- ErrNoSuchKey = errors.New("no such key")
-)
-
-// providerConfig contains configuration options used by both the Initializer and
-// Provisioner components of the Shepherd. In CLI scenarios, RegisterFlags should
-// be called to configure this struct from CLI flags. Otherwise, this structure
-// should be explicitly configured, as the default values are not valid.
-type providerConfig struct {
- // ProjectId is the Equinix project UUID used by the manager. See Equinix API
- // documentation for details. Must be set.
- ProjectId string
-
- // KeyLabel specifies the label to use when handling the Equinix-registered SSH
- // key used to authenticate to newly created servers. Must be set.
- KeyLabel string
-
- // DevicePrefix is applied to all devices (machines) created by the Provisioner,
- // and used by the Provisioner to identify machines which it manages.
- // Must be set.
- DevicePrefix string
-
- // OS defines the operating system new devices are created with. Its format
- // is specified by Equinix API.
- OS string
-
- // UseProjectKeys defines whether the provisioner adds all SSH keys defined in
- // the project to every new machine. This is only used for debugging.
- UseProjectKeys bool
-
- // RebootWaitSeconds defines how many seconds to sleep after a reboot call
- // to ensure a reboot actually happened.
- RebootWaitSeconds int
-
- // ReservationCacheTimeout defines after which time the cached hardware
- // reservations are refreshed.
- ReservationCacheTimeout time.Duration
-}
-
-func (pc *providerConfig) check() error {
- if pc.ProjectId == "" {
- return fmt.Errorf("-equinix_project_id must be set")
- }
- if pc.KeyLabel == "" {
- return fmt.Errorf("-equinix_ssh_key_label must be set")
- }
- if pc.DevicePrefix == "" {
- return fmt.Errorf("-equinix_device_prefix must be set")
- }
-
- // These variables are _very_ important to configure correctly, otherwise someone
- // running this locally with prod creds will actually destroy production
- // data.
- if strings.Contains(pc.KeyLabel, "FIXME") {
- return fmt.Errorf("refusing to run with -equinix_ssh_key_label %q, please set it to something unique", pc.KeyLabel)
- }
- if strings.Contains(pc.DevicePrefix, "FIXME") {
- return fmt.Errorf("refusing to run with -equinix_device_prefix %q, please set it to something unique", pc.DevicePrefix)
- }
-
- return nil
-}
-
-func (pc *providerConfig) RegisterFlags() {
- flag.StringVar(&pc.ProjectId, "equinix_project_id", "", "Equinix project ID where resources will be managed")
- flag.StringVar(&pc.KeyLabel, "equinix_ssh_key_label", "shepherd-FIXME", "Label used to identify managed SSH key in Equinix project")
- flag.StringVar(&pc.DevicePrefix, "equinix_device_prefix", "shepherd-FIXME-", "Prefix applied to all devices (machines) in Equinix project, used to identify managed machines")
- flag.StringVar(&pc.OS, "equinix_os", "ubuntu_20_04", "OS that provisioner will deploy on Equinix machines. Not the target OS for cluster customers.")
- flag.BoolVar(&pc.UseProjectKeys, "equinix_use_project_keys", false, "Add all Equinix project keys to newly provisioned machines, not just the provisioner's managed key. Debug/development only.")
- flag.IntVar(&pc.RebootWaitSeconds, "equinix_reboot_wait_seconds", 30, "How many seconds to sleep to ensure a reboot happened")
- flag.DurationVar(&pc.ReservationCacheTimeout, "equinix_reservation_cache_timeout", time.Minute*15, "Reservation cache validity timeout")
-}
-
-func (pc *providerConfig) New(sshKey *manager.SSHKey, api wrapngo.Client) (*equinixProvider, error) {
- if err := pc.check(); err != nil {
- return nil, err
- }
-
- return &equinixProvider{
- config: pc,
- sshKey: sshKey,
- api: api,
- }, nil
-}
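As the comment on providerConfig notes, outside of CLI use the struct has to be populated explicitly because the defaults are not valid. A minimal sketch of that path follows; newDevProvider and all concrete values (project ID, key label, device prefix) are hypothetical placeholders, and the caller is assumed to already hold a manager.SSHKey and a wrapngo.Client.

package main

import (
    "log"
    "time"

    "source.monogon.dev/cloud/equinix/wrapngo"
    "source.monogon.dev/cloud/shepherd/manager"
)

// newDevProvider shows the explicit (non-flag) configuration path.
func newDevProvider(sshKey *manager.SSHKey, api wrapngo.Client) *equinixProvider {
    cfg := providerConfig{
        ProjectId:               "00000000-0000-0000-0000-000000000000", // placeholder
        KeyLabel:                "shepherd-dev",
        DevicePrefix:            "shepherd-dev-",
        OS:                      "ubuntu_20_04",
        RebootWaitSeconds:       30,
        ReservationCacheTimeout: 15 * time.Minute,
    }
    p, err := cfg.New(sshKey, api)
    if err != nil {
        // check() rejects empty values and FIXME-containing labels/prefixes.
        log.Fatalf("invalid provider config: %v", err)
    }
    return p
}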
diff --git a/cloud/shepherd/provider/equinix/provisioner_test.go b/cloud/shepherd/provider/equinix/provisioner_test.go
deleted file mode 100644
index 6a5b0c6..0000000
--- a/cloud/shepherd/provider/equinix/provisioner_test.go
+++ /dev/null
@@ -1,106 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package main
-
-import (
- "context"
- "crypto/ed25519"
- "crypto/rand"
- "testing"
- "time"
-
- "golang.org/x/time/rate"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/cloud/lib/component"
- "source.monogon.dev/cloud/shepherd/manager"
-)
-
-// TestProvisionerSmokes makes sure the Provisioner doesn't go up in flames on
-// the happy path.
-func TestProvisionerSmokes(t *testing.T) {
- pc := manager.ProvisionerConfig{
- MaxCount: 10,
- // We need 3 iterations to provide 10 machines with a chunk size of 4.
- ReconcileLoopLimiter: rate.NewLimiter(rate.Every(10*time.Second), 3),
- DeviceCreationLimiter: rate.NewLimiter(rate.Every(time.Second), 10),
- ChunkSize: 4,
- }
- sc := providerConfig{
- ProjectId: "noproject",
- KeyLabel: "somekey",
- DevicePrefix: "test-",
- }
-
- _, key, _ := ed25519.GenerateKey(rand.Reader)
- k := manager.SSHKey{
- Key: key,
- }
-
- f := newFakequinix(sc.ProjectId, 100)
- provider, err := sc.New(&k, f)
- if err != nil {
- t.Fatalf("Could not create Provider: %v", err)
- }
-
- p, err := manager.NewProvisioner(provider, pc)
- if err != nil {
- t.Fatalf("Could not create Provisioner: %v", err)
- }
-
- ctx, ctxC := context.WithCancel(context.Background())
- defer ctxC()
-
- b := bmdb.BMDB{
- Config: bmdb.Config{
- Database: component.CockroachConfig{
- InMemory: true,
- },
- ComponentName: "test",
- RuntimeInfo: "test",
- },
- }
- conn, err := b.Open(true)
- if err != nil {
- t.Fatalf("Could not create in-memory BMDB: %v", err)
- }
-
- if err := provider.SSHEquinixEnsure(ctx); err != nil {
- t.Fatalf("Failed to ensure SSH key: %v", err)
- }
- go p.Run(ctx, conn)
-
- sess, err := conn.StartSession(ctx)
- if err != nil {
- t.Fatalf("Failed to create BMDB session for verification: %v", err)
- }
- for {
- time.Sleep(100 * time.Millisecond)
-
- var provided []model.MachineProvided
- err = sess.Transact(ctx, func(q *model.Queries) error {
- var err error
- provided, err = q.GetProvidedMachines(ctx, model.ProviderEquinix)
- return err
- })
- if err != nil {
- t.Errorf("Transact failed: %v", err)
- }
- if len(provided) < 10 {
- continue
- }
- if len(provided) > 10 {
- t.Errorf("%d machines provided (limit: 10)", len(provided))
- }
-
- for _, mp := range provided {
- if f.devices[mp.ProviderID] == nil {
- t.Errorf("BMDB machine %q has unknown provider ID %q", mp.MachineID, mp.ProviderID)
- }
- }
-
- return
- }
-}
diff --git a/cloud/shepherd/provider/equinix/recoverer_test.go b/cloud/shepherd/provider/equinix/recoverer_test.go
deleted file mode 100644
index 6d1447f..0000000
--- a/cloud/shepherd/provider/equinix/recoverer_test.go
+++ /dev/null
@@ -1,184 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package main
-
-import (
- "context"
- "crypto/ed25519"
- "crypto/rand"
- "testing"
- "time"
-
- "github.com/packethost/packngo"
- "golang.org/x/time/rate"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/cloud/lib/component"
- "source.monogon.dev/cloud/shepherd/manager"
-)
-
-type recovererDut struct {
- f *fakequinix
- r *manager.Recoverer
- bmdb *bmdb.Connection
- ctx context.Context
-}
-
-func newRecovererDut(t *testing.T) *recovererDut {
- t.Helper()
-
- rc := manager.RecovererConfig{
- ControlLoopConfig: manager.ControlLoopConfig{
- DBQueryLimiter: rate.NewLimiter(rate.Every(time.Second), 10),
- },
- }
-
- sc := providerConfig{
- ProjectId: "noproject",
- KeyLabel: "somekey",
- DevicePrefix: "test-",
- }
-
- _, key, _ := ed25519.GenerateKey(rand.Reader)
- k := manager.SSHKey{
- Key: key,
- }
-
- f := newFakequinix(sc.ProjectId, 100)
- provider, err := sc.New(&k, f)
- if err != nil {
- t.Fatalf("Could not create Provider: %v", err)
- }
-
- r, err := manager.NewRecoverer(provider, rc)
- if err != nil {
- t.Fatalf("Could not create Initializer: %v", err)
- }
-
- b := bmdb.BMDB{
- Config: bmdb.Config{
- Database: component.CockroachConfig{
- InMemory: true,
- },
- ComponentName: "test",
- RuntimeInfo: "test",
- },
- }
- conn, err := b.Open(true)
- if err != nil {
- t.Fatalf("Could not create in-memory BMDB: %v", err)
- }
-
- ctx, ctxC := context.WithCancel(context.Background())
- t.Cleanup(ctxC)
-
- go manager.RunControlLoop(ctx, conn, r)
-
- return &recovererDut{
- f: f,
- r: r,
- bmdb: conn,
- ctx: ctx,
- }
-}
-
- // TestRecoverySmokes makes sure that the Recoverer doesn't go
-// up in flames on the happy path.
-func TestRecoverySmokes(t *testing.T) {
- dut := newRecovererDut(t)
- f := dut.f
- ctx := dut.ctx
- conn := dut.bmdb
-
- reservations, _ := f.ListReservations(ctx, "fake")
-
- sess, err := conn.StartSession(ctx)
- if err != nil {
- t.Fatalf("Failed to create BMDB session: %v", err)
- }
-
- // Create test machine that should be selected for recovery.
- // First in Fakequinix...
- dev, _ := f.CreateDevice(ctx, &packngo.DeviceCreateRequest{
- Hostname: "test-devices",
- OS: "fake",
- ProjectID: "fake",
- HardwareReservationID: reservations[0].ID,
- ProjectSSHKeys: []string{},
- })
- // ... and in BMDB.
- err = sess.Transact(ctx, func(q *model.Queries) error {
- machine, err := q.NewMachine(ctx)
- if err != nil {
- return err
- }
- err = q.MachineAddProvided(ctx, model.MachineAddProvidedParams{
- MachineID: machine.MachineID,
- Provider: model.ProviderEquinix,
- ProviderID: dev.ID,
- })
- if err != nil {
- return err
- }
- return q.MachineSetAgentStarted(ctx, model.MachineSetAgentStartedParams{
- MachineID: machine.MachineID,
- AgentStartedAt: time.Now().Add(time.Hour * -10),
- AgentPublicKey: []byte("fakefakefakefake"),
- })
- })
- if err != nil {
- t.Fatalf("Failed to create test machine: %v", err)
- }
-
- // Wait until no machines need recovery.
- deadline := time.Now().Add(10 * time.Second)
- for {
- if time.Now().After(deadline) {
- t.Fatalf("Machines did not get processed in time")
- }
- time.Sleep(100 * time.Millisecond)
-
- var machines []model.MachineProvided
- err = sess.Transact(ctx, func(q *model.Queries) error {
- var err error
- machines, err = q.GetMachineForAgentRecovery(ctx, model.GetMachineForAgentRecoveryParams{
- Limit: 100,
- Provider: model.ProviderEquinix,
- })
- return err
- })
- if err != nil {
- t.Fatalf("Failed to run Transaction: %v", err)
- }
- if len(machines) == 0 {
- break
- }
- }
-
- // Expect the target machine to have been rebooted.
- dut.f.mu.Lock()
- reboots := dut.f.reboots[dev.ID]
- dut.f.mu.Unlock()
- if want, got := 1, reboots; want != got {
- t.Fatalf("Wanted %d reboot, got %d", want, got)
- }
-
- // Expect machine to now be available again for agent start.
- var machines []model.MachineProvided
- err = sess.Transact(ctx, func(q *model.Queries) error {
- var err error
- machines, err = q.GetMachinesForAgentStart(ctx, model.GetMachinesForAgentStartParams{
- Limit: 100,
- Provider: model.ProviderEquinix,
- })
- return err
- })
- if err != nil {
- t.Fatalf("Failed to run Transaction: %v", err)
- }
- if want, got := 1, len(machines); want != got {
- t.Fatalf("Wanted %d machine ready for agent start, got %d", want, got)
- }
-}
diff --git a/cloud/shepherd/provider/equinix/updater.go b/cloud/shepherd/provider/equinix/updater.go
deleted file mode 100644
index 5b6e089..0000000
--- a/cloud/shepherd/provider/equinix/updater.go
+++ /dev/null
@@ -1,266 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package main
-
-import (
- "context"
- "database/sql"
- "errors"
- "flag"
- "fmt"
- "time"
-
- "github.com/packethost/packngo"
- "k8s.io/klog/v2"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/metrics"
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- ecl "source.monogon.dev/cloud/equinix/wrapngo"
- "source.monogon.dev/cloud/lib/sinbin"
-)
-
-type UpdaterConfig struct {
- // Enable starts the updater.
- Enable bool
- // IterationRate is the minimum time taken between subsequent iterations of the
- // updater.
- IterationRate time.Duration
-}
-
-func (u *UpdaterConfig) RegisterFlags() {
- flag.BoolVar(&u.Enable, "updater_enable", true, "Enable the updater, which periodically scans equinix machines and updates their status in the BMDB")
- flag.DurationVar(&u.IterationRate, "updater_iteration_rate", time.Minute, "Rate limiting for updater iteration loop")
-}
-
- // The Updater periodically scans all machines backed by the Equinix provider and
- // updates their Provided status fields based on data retrieved from the Equinix
-// API.
-type Updater struct {
- config *UpdaterConfig
- sinbin sinbin.Sinbin[string]
-
- cl ecl.Client
-}
-
-func (u *UpdaterConfig) New(cl ecl.Client) (*Updater, error) {
- return &Updater{
- config: u,
- cl: cl,
- }, nil
-}
-
-func (u *Updater) Run(ctx context.Context, conn *bmdb.Connection) error {
- var sess *bmdb.Session
- var err error
-
- if !u.config.Enable {
- return nil
- }
-
- for {
- if sess == nil {
- sess, err = conn.StartSession(ctx, bmdb.SessionOption{Processor: metrics.ProcessorShepherdUpdater})
- if err != nil {
- return fmt.Errorf("could not start BMDB session: %w", err)
- }
- }
- limit := time.After(u.config.IterationRate)
-
- err = u.runInSession(ctx, sess)
- switch {
- case err == nil:
- <-limit
- case errors.Is(err, ctx.Err()):
- return err
- case errors.Is(err, bmdb.ErrSessionExpired):
- klog.Errorf("Session expired, restarting...")
- sess = nil
- time.Sleep(time.Second)
- default:
- klog.Errorf("Processing failed: %v", err)
- // TODO(q3k): close session
- time.Sleep(time.Second)
- }
- }
-}
-
-// applyNullStringUpdate returns true if 'up' supersedes 'cur'. Otherwise, it
-// returns false and zeroes out up.
-func applyNullStringUpdate(up, cur *sql.NullString) bool {
- if up.Valid {
- if !cur.Valid {
- return true
- }
- if up.String != cur.String {
- return true
- }
- }
- up.String = ""
- up.Valid = false
- return false
-}
-
-// applyNullProviderStatusUpdate returns true if 'up' supersedes 'cur'.
-// Otherwise, it returns false and zeroes out up.
-func applyNullProviderStatusUpdate(up, cur *model.NullProviderStatus) bool {
- if up.Valid {
- if !cur.Valid {
- return true
- }
- if up.ProviderStatus != cur.ProviderStatus {
- return true
- }
- }
- up.ProviderStatus = model.ProviderStatusUnknown
- up.Valid = false
- return false
-}
-
-// applyUpdate returns true if 'up' supersedes 'cur'. Otherwise, it returns false
-// and zeroes out up.
-func applyUpdate(up *model.MachineUpdateProviderStatusParams, cur *model.MachineProvided) bool {
- // Evaluate every field (no short-circuiting) so that non-superseding fields
- // are always zeroed out before logging and writing.
- reservation := applyNullStringUpdate(&up.ProviderReservationID, &cur.ProviderReservationID)
- address := applyNullStringUpdate(&up.ProviderIpAddress, &cur.ProviderIpAddress)
- location := applyNullStringUpdate(&up.ProviderLocation, &cur.ProviderLocation)
- status := applyNullProviderStatusUpdate(&up.ProviderStatus, &cur.ProviderStatus)
- return reservation || address || location || status
-}
-
-// updateLog logs information about the given update as calculated by applyUpdate.
-func updateLog(up *model.MachineUpdateProviderStatusParams) {
- if up.ProviderReservationID.Valid {
- klog.Infof(" Machine %s: new reservation ID %s", up.ProviderID, up.ProviderReservationID.String)
- }
- if up.ProviderIpAddress.Valid {
- klog.Infof(" Machine %s: new IP address %s", up.ProviderID, up.ProviderIpAddress.String)
- }
- if up.ProviderLocation.Valid {
- klog.Infof(" Machine %s: new location %s", up.ProviderID, up.ProviderLocation.String)
- }
- if up.ProviderStatus.Valid {
- klog.Infof(" Machine %s: new status %s", up.ProviderID, up.ProviderStatus.ProviderStatus)
- }
-}
-
-func (u *Updater) runInSession(ctx context.Context, sess *bmdb.Session) error {
- // Get all machines provided by us into the BMDB.
- // TODO(q3k): do not load all machines into memory.
-
- var machines []model.MachineProvided
- err := sess.Transact(ctx, func(q *model.Queries) error {
- var err error
- machines, err = q.GetProvidedMachines(ctx, model.ProviderEquinix)
- return err
- })
- if err != nil {
- return fmt.Errorf("when fetching provided machines: %w", err)
- }
-
- // Limit how many machines we check by timing them out if they're likely to not
- // get updated soon.
- penalized := 0
- var check []model.MachineProvided
- for _, m := range machines {
- if u.sinbin.Penalized(m.ProviderID) {
- penalized += 1
- } else {
- check = append(check, m)
- }
- }
-
- klog.Infof("Machines to check %d, skipping: %d", len(check), penalized)
- for _, m := range check {
- dev, err := u.cl.GetDevice(ctx, "", m.ProviderID, &packngo.ListOptions{
- Includes: []string{
- "hardware_reservation",
- },
- Excludes: []string{
- "created_by", "customdata", "network_ports", "operating_system", "actions",
- "plan", "provisioning_events", "ssh_keys", "tags", "volumes",
- },
- })
- if err != nil {
- klog.Warningf("Fetching device %s failed: %v", m.ProviderID, err)
- continue
- }
-
- // nextCheck will be used to sinbin the machine for some given time if there is
- // no difference between the current state and new state.
- //
- // Some conditions override this to be shorter (when the machine doesn't yet have
- // all data available or is in an otherwise unstable state).
- nextCheck := time.Minute * 30
-
- up := model.MachineUpdateProviderStatusParams{
- Provider: m.Provider,
- ProviderID: m.ProviderID,
- }
-
- if dev.HardwareReservation != nil {
- up.ProviderReservationID.Valid = true
- up.ProviderReservationID.String = dev.HardwareReservation.ID
- } else {
- nextCheck = time.Minute
- }
-
- for _, addr := range dev.Network {
- if !addr.Public {
- continue
- }
- up.ProviderIpAddress.Valid = true
- up.ProviderIpAddress.String = addr.Address
- break
- }
- if !up.ProviderIpAddress.Valid {
- nextCheck = time.Minute
- }
-
- if dev.Facility != nil {
- up.ProviderLocation.Valid = true
- up.ProviderLocation.String = dev.Facility.Code
- } else {
- nextCheck = time.Minute
- }
-
- up.ProviderStatus.Valid = true
- switch dev.State {
- case "active":
- up.ProviderStatus.ProviderStatus = model.ProviderStatusRunning
- case "deleted":
- up.ProviderStatus.ProviderStatus = model.ProviderStatusMissing
- case "failed":
- up.ProviderStatus.ProviderStatus = model.ProviderStatusProvisioningFailedPermanent
- case "inactive":
- up.ProviderStatus.ProviderStatus = model.ProviderStatusStopped
- case "powering_on", "powering_off":
- nextCheck = time.Minute
- up.ProviderStatus.ProviderStatus = model.ProviderStatusStopped
- case "queued", "provisioning", "reinstalling", "post_provisioning":
- nextCheck = time.Minute
- up.ProviderStatus.ProviderStatus = model.ProviderStatusProvisioning
- default:
- klog.Warningf("Device %s has unexpected status: %q", m.ProviderID, dev.State)
- nextCheck = time.Minute
- up.ProviderStatus.ProviderStatus = model.ProviderStatusUnknown
- }
-
- if !applyUpdate(&up, &m) {
- u.sinbin.Add(m.ProviderID, time.Now().Add(nextCheck))
- continue
- }
-
- klog.Infof("Device %s has new data:", m.ProviderID)
- updateLog(&up)
- err = sess.Transact(ctx, func(q *model.Queries) error {
- return q.MachineUpdateProviderStatus(ctx, up)
- })
- if err != nil {
- klog.Warningf("Device %s failed to update: %v", m.ProviderID, err)
- }
- u.sinbin.Add(m.ProviderID, time.Now().Add(time.Minute))
- }
- return nil
-}
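To make the "supersedes" semantics above concrete, here is a small standalone sketch of the same null-string comparison; supersedes is a hypothetical stand-in for applyNullStringUpdate, using only database/sql types.

package main

import (
    "database/sql"
    "fmt"
)

// supersedes mirrors applyNullStringUpdate: an update wins only if it is
// valid and either the current value is missing or differs; otherwise the
// update is zeroed so it is neither logged nor written.
func supersedes(up, cur *sql.NullString) bool {
    if up.Valid && (!cur.Valid || up.String != cur.String) {
        return true
    }
    *up = sql.NullString{}
    return false
}

func main() {
    cur := sql.NullString{String: "1.2.3.4", Valid: true}

    same := sql.NullString{String: "1.2.3.4", Valid: true}
    fmt.Println(supersedes(&same, &cur)) // false: identical value, update is dropped

    changed := sql.NullString{String: "5.6.7.8", Valid: true}
    fmt.Println(supersedes(&changed, &cur)) // true: differing value wins
}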
diff --git a/cloud/shepherd/provider/equinix/updater_test.go b/cloud/shepherd/provider/equinix/updater_test.go
deleted file mode 100644
index ed3da7a..0000000
--- a/cloud/shepherd/provider/equinix/updater_test.go
+++ /dev/null
@@ -1,146 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package main
-
-import (
- "context"
- "testing"
- "time"
-
- "github.com/packethost/packngo"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/model"
- "source.monogon.dev/cloud/lib/component"
-)
-
-type updaterDut struct {
- f *fakequinix
- u *Updater
- bmdb *bmdb.Connection
- ctx context.Context
-}
-
-func newUpdaterDut(t *testing.T) *updaterDut {
- t.Helper()
-
- uc := UpdaterConfig{
- Enable: true,
- IterationRate: time.Second,
- }
-
- f := newFakequinix("fake", 100)
- u, err := uc.New(f)
- if err != nil {
- t.Fatalf("Could not create Updater: %v", err)
- }
-
- b := bmdb.BMDB{
- Config: bmdb.Config{
- Database: component.CockroachConfig{
- InMemory: true,
- },
- ComponentName: "test",
- RuntimeInfo: "test",
- },
- }
- conn, err := b.Open(true)
- if err != nil {
- t.Fatalf("Could not create in-memory BMDB: %v", err)
- }
-
- ctx, ctxC := context.WithCancel(context.Background())
- t.Cleanup(ctxC)
-
- go u.Run(ctx, conn)
-
- return &updaterDut{
- f: f,
- u: u,
- bmdb: conn,
- ctx: ctx,
- }
-}
-
-func TestUpdater(t *testing.T) {
- dut := newUpdaterDut(t)
- f := dut.f
- ctx := dut.ctx
- conn := dut.bmdb
-
- reservations, _ := f.ListReservations(ctx, "fake")
-
- sess, err := conn.StartSession(ctx)
- if err != nil {
- t.Fatalf("Failed to create BMDB session: %v", err)
- }
-
- // Create test machine that should be selected for updating.
- // First in Fakequinix...
- dev, _ := f.CreateDevice(ctx, &packngo.DeviceCreateRequest{
- Hostname: "test-devices",
- OS: "fake",
- ProjectID: "fake",
- HardwareReservationID: reservations[0].ID,
- ProjectSSHKeys: []string{},
- })
- // ... and in BMDB.
- err = sess.Transact(ctx, func(q *model.Queries) error {
- machine, err := q.NewMachine(ctx)
- if err != nil {
- return err
- }
- err = q.MachineAddProvided(ctx, model.MachineAddProvidedParams{
- MachineID: machine.MachineID,
- Provider: model.ProviderEquinix,
- ProviderID: dev.ID,
- })
- if err != nil {
- return err
- }
- return q.MachineSetAgentStarted(ctx, model.MachineSetAgentStartedParams{
- MachineID: machine.MachineID,
- AgentStartedAt: time.Now().Add(time.Hour * -10),
- AgentPublicKey: []byte("fakefakefakefake"),
- })
- })
- if err != nil {
- t.Fatalf("failed to execute bmdb transaction: %v", err)
- }
-
- deadline := time.Now().Add(time.Second * 10)
- for {
- time.Sleep(100 * time.Millisecond)
- if time.Now().After(deadline) {
- t.Fatalf("Deadline exceeded")
- }
-
- var provided []model.MachineProvided
- err = sess.Transact(ctx, func(q *model.Queries) error {
- var err error
- provided, err = q.GetProvidedMachines(ctx, model.ProviderEquinix)
- return err
- })
- if err != nil {
- t.Fatalf("Transact: %v", err)
- }
- if len(provided) < 1 {
- continue
- }
- p := provided[0]
- if p.ProviderStatus.ProviderStatus != model.ProviderStatusRunning {
- continue
- }
- if p.ProviderLocation.String != "wad" {
- continue
- }
- if p.ProviderIpAddress.String != "1.2.3.4" {
- continue
- }
- if p.ProviderReservationID.String != reservations[0].ID {
- continue
- }
- break
- }
-}
diff --git a/cloud/shepherd/shepherd.go b/cloud/shepherd/shepherd.go
deleted file mode 100644
index aeaab46..0000000
--- a/cloud/shepherd/shepherd.go
+++ /dev/null
@@ -1,126 +0,0 @@
-// Copyright The Monogon Project Authors.
-// SPDX-License-Identifier: Apache-2.0
-
-package shepherd
-
-import (
- "context"
- "fmt"
- "net/netip"
-
- "source.monogon.dev/cloud/bmaas/bmdb"
- "source.monogon.dev/cloud/bmaas/bmdb/model"
-)
-
-var ErrMachineNotFound = fmt.Errorf("machine not found")
-var ErrNotImplemented = fmt.Errorf("not implemented")
-
-// ProviderID is an opaque unique identifier for a machine within a single
-// provider instance. It is generated by the Provider and usually the same
- // as the ID of the machine within the system that the Provider manages.
-// The Shepherd (and BMaaS in general) requires these IDs to be unique
-// within a provider and stable.
-type ProviderID string
-
-const InvalidProviderID ProviderID = "invalid"
-
-// IsValid reports whether the ProviderID is valid.
-func (p ProviderID) IsValid() bool {
- return p != InvalidProviderID
-}
-
-// Availability defines the availability state according to the provider.
-// See the different states for more information.
-type Availability int
-
-const (
-	// AvailabilityUndefined is a placeholder that prevents the zero value from
-	// being mistaken for a meaningful state.
- AvailabilityUndefined Availability = iota
-	// AvailabilityPossiblyUsed defines the state where a machine might be in use.
-	// It is intended for stateless providers, where the shepherd has to check
-	// against the BMDB whether the Machine.ID is already provisioned.
-	// These machines must have a valid ID and Addr.
- AvailabilityPossiblyUsed
-	// AvailabilityKnownUnused defines the state where a machine is known to be free,
-	// e.g. a hardware reservation at Equinix. These machines may not have an
-	// ID or Addr.
- AvailabilityKnownUnused
-	// AvailabilityKnownUsed defines the state where a machine is known to be in
-	// use, e.g. a deployed machine. These machines must have a valid ID and
-	// Addr.
- AvailabilityKnownUsed
-)
-
-func (a Availability) String() string {
- switch a {
- case AvailabilityUndefined:
- return "Undefined"
- case AvailabilityKnownUnused:
- return "KnownUnused"
- case AvailabilityKnownUsed:
- return "KnownUsed"
- case AvailabilityPossiblyUsed:
- return "PossiblyUsed"
- default:
- return fmt.Sprintf("<invalid value %d>", a)
- }
-}
-
-type Machine interface {
-	// ID returns the provider ID; see ProviderID for more information.
- ID() ProviderID
-	// Addr returns the machine's IP address that is reachable from the
-	// shepherd. It is used to connect to the machine via SSH to execute
-	// all takeover tasks, etc.
- Addr() netip.Addr
- // Availability returns the availability of the machine.
- Availability() Availability
-	// Failed reports whether the machine is in a failed state and should be
-	// ignored when reconciling inconsistencies between the provider and the
-	// BMDB.
- Failed() bool
-}
-
-type CreateMachineRequest struct {
-	// UnusedMachine represents a machine to use as the deployment target.
- UnusedMachine Machine
-}
-
-// Provider abstracts the interaction between the shepherd and machine
-// providers such as Equinix. Methods of this interface must not be called
-// concurrently.
-type Provider interface {
- // ListMachines returns all existing machines for a provider. Machines
- // that are still in the state of being created by CreateMachine should
- // not be returned.
- ListMachines(context.Context) ([]Machine, error)
-
-	// GetMachine returns an existing machine for a provider. Machines
-	// that are still in the state of being created by CreateMachine should
-	// not be returned. If no machine is found after applying these filters,
-	// an error should be returned.
- GetMachine(context.Context, ProviderID) (Machine, error)
-
-	// CreateMachine creates a new machine with the given parameters and
-	// returns the created instance. The provider is required to create the
-	// entry in the machine table and the MachineProvided tag. If there are no
-	// more machines available, an error should be returned.
- CreateMachine(context.Context, *bmdb.Session, CreateMachineRequest) (Machine, error)
-
- // Type returns the value that represents this provider inside the database.
- Type() model.Provider
-}
-
-type Recoverer interface {
- Provider
-
-	// RebootMachine tries to bring a machine back from the dead, e.g. by rebooting it.
- RebootMachine(context.Context, ProviderID) error
-
-	// ReinstallMachine reinstalls the given machine. If the provider does not
-	// support reinstallation, the function should return an error stating
-	// this. After a reinstall, the installed tag should be updated to allow
-	// the reconcile loop to restart the takeover process.
- ReinstallMachine(context.Context, ProviderID) error
-}
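For reference, the interface removed above could be satisfied by a provider as small as the following sketch. This is illustrative only: the shepherd package (and the bmdb/model packages it depends on) are deleted by this change, so the sketch would only compile against the pre-deletion tree. The names staticprovider, staticProvider and staticMachine are hypothetical, and model.ProviderEquinix is returned from Type() solely because it is the only provider enum value visible in this diff.

// Package staticprovider is a hypothetical, minimal shepherd.Provider
// implementation backed by a fixed, in-memory machine inventory.
package staticprovider

import (
	"context"
	"net/netip"

	"source.monogon.dev/cloud/bmaas/bmdb"
	"source.monogon.dev/cloud/bmaas/bmdb/model"
	"source.monogon.dev/cloud/shepherd"
)

// staticMachine is a machine with a fixed ID and address that is always
// reported as known to be in use.
type staticMachine struct {
	id   shepherd.ProviderID
	addr netip.Addr
}

func (m *staticMachine) ID() shepherd.ProviderID { return m.id }

func (m *staticMachine) Addr() netip.Addr { return m.addr }

func (m *staticMachine) Availability() shepherd.Availability {
	return shepherd.AvailabilityKnownUsed
}

func (m *staticMachine) Failed() bool { return false }

// staticProvider serves a fixed set of machines and cannot create new ones.
type staticProvider struct {
	machines map[shepherd.ProviderID]*staticMachine
}

func (p *staticProvider) ListMachines(ctx context.Context) ([]shepherd.Machine, error) {
	out := make([]shepherd.Machine, 0, len(p.machines))
	for _, m := range p.machines {
		out = append(out, m)
	}
	return out, nil
}

func (p *staticProvider) GetMachine(ctx context.Context, id shepherd.ProviderID) (shepherd.Machine, error) {
	if m, ok := p.machines[id]; ok {
		return m, nil
	}
	return nil, shepherd.ErrMachineNotFound
}

func (p *staticProvider) CreateMachine(ctx context.Context, sess *bmdb.Session, req shepherd.CreateMachineRequest) (shepherd.Machine, error) {
	// A fixed inventory cannot provision additional machines.
	return nil, shepherd.ErrNotImplemented
}

func (p *staticProvider) Type() model.Provider {
	// Stand-in only: ProviderEquinix is the sole provider value visible in
	// this diff; a real implementation would return its own value.
	return model.ProviderEquinix
}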