cloud/shepherd: skip machines in failed state
Currently we are not ignoring machines that are in failed state, which
brings the provider to a stop as soon as a provision request fails. We
should ignore such machines.
Change-Id: I7be64c710cc15428f4d8d4e75b6df1e816d453f8
Reviewed-on: https://review.monogon.dev/c/monogon/+/2417
Reviewed-by: Serge Bazanski <serge@monogon.tech>
Tested-by: Jenkins CI
diff --git a/cloud/shepherd/manager/provider_test.go b/cloud/shepherd/manager/provider_test.go
index d1c6361..b08c8b7 100644
--- a/cloud/shepherd/manager/provider_test.go
+++ b/cloud/shepherd/manager/provider_test.go
@@ -21,6 +21,10 @@
agentStarted bool
}
+func (dm *dummyMachine) Failed() bool {
+ return false
+}
+
func (dm *dummyMachine) ID() shepherd.ProviderID {
return dm.id
}
diff --git a/cloud/shepherd/manager/provisioner.go b/cloud/shepherd/manager/provisioner.go
index a77f241..5f19c6d 100644
--- a/cloud/shepherd/manager/provisioner.go
+++ b/cloud/shepherd/manager/provisioner.go
@@ -196,6 +196,19 @@
model.MachineProvided
}
+func (p providedMachine) Failed() bool {
+ if !p.MachineProvided.ProviderStatus.Valid {
+ // If we don't have any ProviderStatus to check for, return false
+ // to trigger the validation inside the reconciler loop.
+ return false
+ }
+ switch p.MachineProvided.ProviderStatus.ProviderStatus {
+ case model.ProviderStatusProvisioningFailedPermanent:
+ return true
+ }
+ return false
+}
+
func (p providedMachine) ID() shepherd.ProviderID {
return shepherd.ProviderID(p.ProviderID)
}
@@ -247,7 +260,7 @@
// resolvePossiblyUsed checks if the state is set to possibly used and finds out
// which state is the correct one.
-func (p *Provisioner) resolvePossiblyUsed(machine shepherd.Machine, providedMachines map[shepherd.ProviderID]bool) shepherd.State {
+func (p *Provisioner) resolvePossiblyUsed(machine shepherd.Machine, providedMachines map[shepherd.ProviderID]shepherd.Machine) shepherd.State {
state, id := machine.State(), machine.ID()
// Bail out if this isn't a possibly used state.
@@ -276,14 +289,14 @@
func (p *Provisioner) reconcile(ctx context.Context, sess *bmdb.Session, inProvider, bmdbMachines []shepherd.Machine) error {
klog.Infof("Reconciling...")
- bmdb := make(map[shepherd.ProviderID]bool)
+ bmdb := make(map[shepherd.ProviderID]shepherd.Machine)
for _, machine := range bmdbMachines {
// Dont check the state here as its hardcoded to be known used.
- bmdb[machine.ID()] = true
+ bmdb[machine.ID()] = machine
}
var availableMachines []shepherd.Machine
- provider := make(map[shepherd.ProviderID]bool)
+ provider := make(map[shepherd.ProviderID]shepherd.Machine)
for _, machine := range inProvider {
state := p.resolvePossiblyUsed(machine, bmdb)
@@ -292,7 +305,7 @@
availableMachines = append(availableMachines, machine)
case shepherd.StateKnownUsed:
- provider[machine.ID()] = true
+ provider[machine.ID()] = machine
default:
return fmt.Errorf("machine has invalid state (ID: %s, Addr: %s): %s", machine.ID(), machine.Addr(), state)
@@ -301,25 +314,41 @@
managed := make(map[shepherd.ProviderID]bool)
+ // We discovered that a machine mostly fails either when provisioning or
+ // deprovisioning. A already deployed and running machine can only switch
+ // into failed state if any api interaction happend, e.g. rebooting the
+ // machine into recovery mode. If such a machine is returned to the
+ // reconciling loop, it will trigger the badbadnotgood safety switch and
+ // return with an error. To reduce the manual intervention required we
+ // filter out these machines on both sides (bmdb and provider).
+ isBadBadNotGood := func(known map[shepherd.ProviderID]shepherd.Machine, machine shepherd.Machine) bool {
+ // If the machine is missing and not failed, its a bad case.
+ if known[machine.ID()] == nil && !machine.Failed() {
+ return true
+ }
+ return false
+ }
+
// Some desynchronization between the BMDB and Provider point of view might be so
// bad we shouldn't attempt to do any work, at least not any time soon.
badbadnotgood := false
// Find any machines supposedly managed by us in the provider, but not in the
// BMDB.
- for machine, _ := range provider {
- if bmdb[machine] {
- managed[machine] = true
+ for id, machine := range provider {
+ if isBadBadNotGood(bmdb, machine) {
+ klog.Errorf("Provider machine has no corresponding machine in BMDB. (PID: %s)", id)
+ badbadnotgood = true
continue
}
- klog.Errorf("Provider machine %s has no corresponding machine in BMDB.", machine)
- badbadnotgood = true
+
+ managed[id] = true
}
// Find any machines in the BMDB but not in the provider.
- for machine, _ := range bmdb {
- if !provider[machine] {
- klog.Errorf("Provider device ID %s referred to in BMDB (from TODO) but missing in provider.", machine)
+ for id, machine := range bmdb {
+ if isBadBadNotGood(provider, machine) {
+ klog.Errorf("Provider machine referred to in BMDB but missing in provider. (PID: %s)", id)
badbadnotgood = true
}
}
diff --git a/cloud/shepherd/manager/provisioner_test.go b/cloud/shepherd/manager/provisioner_test.go
index 5adc408..a145bdc 100644
--- a/cloud/shepherd/manager/provisioner_test.go
+++ b/cloud/shepherd/manager/provisioner_test.go
@@ -88,8 +88,8 @@
func TestProvisioner_resolvePossiblyUsed(t *testing.T) {
const providedMachineID = "provided-machine"
- providedMachines := map[shepherd.ProviderID]bool{
- providedMachineID: true,
+ providedMachines := map[shepherd.ProviderID]shepherd.Machine{
+ providedMachineID: nil,
}
tests := []struct {
diff --git a/cloud/shepherd/mini/provider.go b/cloud/shepherd/mini/provider.go
index 05b628f..42b6348 100644
--- a/cloud/shepherd/mini/provider.go
+++ b/cloud/shepherd/mini/provider.go
@@ -26,6 +26,10 @@
Location string `json:"Location"`
}
+func (d machine) Failed() bool {
+ return false
+}
+
func (d machine) ID() shepherd.ProviderID {
return d.ProviderID
}
diff --git a/cloud/shepherd/provider/equinix/provider.go b/cloud/shepherd/provider/equinix/provider.go
index edc8f3f..7d3c0d2 100644
--- a/cloud/shepherd/provider/equinix/provider.go
+++ b/cloud/shepherd/provider/equinix/provider.go
@@ -127,6 +127,10 @@
packngo.HardwareReservation
}
+func (e reservation) Failed() bool {
+ return false
+}
+
func (e reservation) ID() shepherd.ProviderID {
return shepherd.InvalidProviderID
}
@@ -143,6 +147,10 @@
packngo.Device
}
+func (e *machine) Failed() bool {
+ return e.Device.State == "failed"
+}
+
func (e *machine) ID() shepherd.ProviderID {
return shepherd.ProviderID(e.Device.ID)
}
diff --git a/cloud/shepherd/shepherd.go b/cloud/shepherd/shepherd.go
index 3504eb7..db5e75d 100644
--- a/cloud/shepherd/shepherd.go
+++ b/cloud/shepherd/shepherd.go
@@ -73,6 +73,10 @@
Addr() netip.Addr
// State returns the state in which the machine is
State() State
+ // Failed should return true if the machine is in a failed state and
+ // should be ignored if there are inconsistencies between the provider
+ // and BMDB.
+ Failed() bool
}
type CreateMachineRequest struct {