cloud/shepherd: skip machines in failed state

Currently we are not ignoring machines that are in failed state, which
brings the provider to a stop as soon as a provision request fails. We
should ignore such machines.

Change-Id: I7be64c710cc15428f4d8d4e75b6df1e816d453f8
Reviewed-on: https://review.monogon.dev/c/monogon/+/2417
Reviewed-by: Serge Bazanski <serge@monogon.tech>
Tested-by: Jenkins CI
diff --git a/cloud/shepherd/manager/provider_test.go b/cloud/shepherd/manager/provider_test.go
index d1c6361..b08c8b7 100644
--- a/cloud/shepherd/manager/provider_test.go
+++ b/cloud/shepherd/manager/provider_test.go
@@ -21,6 +21,10 @@
 	agentStarted bool
 }
 
+func (dm *dummyMachine) Failed() bool {
+	return false
+}
+
 func (dm *dummyMachine) ID() shepherd.ProviderID {
 	return dm.id
 }
diff --git a/cloud/shepherd/manager/provisioner.go b/cloud/shepherd/manager/provisioner.go
index a77f241..5f19c6d 100644
--- a/cloud/shepherd/manager/provisioner.go
+++ b/cloud/shepherd/manager/provisioner.go
@@ -196,6 +196,19 @@
 	model.MachineProvided
 }
 
+func (p providedMachine) Failed() bool {
+	if !p.MachineProvided.ProviderStatus.Valid {
+		// If we don't have any ProviderStatus to check for, return false
+		// to trigger the validation inside the reconciler loop.
+		return false
+	}
+	switch p.MachineProvided.ProviderStatus.ProviderStatus {
+	case model.ProviderStatusProvisioningFailedPermanent:
+		return true
+	}
+	return false
+}
+
 func (p providedMachine) ID() shepherd.ProviderID {
 	return shepherd.ProviderID(p.ProviderID)
 }
@@ -247,7 +260,7 @@
 
 // resolvePossiblyUsed checks if the state is set to possibly used and finds out
 // which state is the correct one.
-func (p *Provisioner) resolvePossiblyUsed(machine shepherd.Machine, providedMachines map[shepherd.ProviderID]bool) shepherd.State {
+func (p *Provisioner) resolvePossiblyUsed(machine shepherd.Machine, providedMachines map[shepherd.ProviderID]shepherd.Machine) shepherd.State {
 	state, id := machine.State(), machine.ID()
 
 	// Bail out if this isn't a possibly used state.
@@ -276,14 +289,14 @@
 func (p *Provisioner) reconcile(ctx context.Context, sess *bmdb.Session, inProvider, bmdbMachines []shepherd.Machine) error {
 	klog.Infof("Reconciling...")
 
-	bmdb := make(map[shepherd.ProviderID]bool)
+	bmdb := make(map[shepherd.ProviderID]shepherd.Machine)
 	for _, machine := range bmdbMachines {
 		// Dont check the state here as its hardcoded to be known used.
-		bmdb[machine.ID()] = true
+		bmdb[machine.ID()] = machine
 	}
 
 	var availableMachines []shepherd.Machine
-	provider := make(map[shepherd.ProviderID]bool)
+	provider := make(map[shepherd.ProviderID]shepherd.Machine)
 	for _, machine := range inProvider {
 		state := p.resolvePossiblyUsed(machine, bmdb)
 
@@ -292,7 +305,7 @@
 			availableMachines = append(availableMachines, machine)
 
 		case shepherd.StateKnownUsed:
-			provider[machine.ID()] = true
+			provider[machine.ID()] = machine
 
 		default:
 			return fmt.Errorf("machine has invalid state (ID: %s, Addr: %s): %s", machine.ID(), machine.Addr(), state)
@@ -301,25 +314,41 @@
 
 	managed := make(map[shepherd.ProviderID]bool)
 
+	// We discovered that a machine mostly fails either when provisioning or
+	// deprovisioning. A already deployed and running machine can only switch
+	// into failed state if any api interaction happend, e.g. rebooting the
+	// machine into recovery mode. If such a machine is returned to the
+	// reconciling loop, it will trigger the badbadnotgood safety switch and
+	// return with an error. To reduce the manual intervention required we
+	// filter out these machines on both sides (bmdb and provider).
+	isBadBadNotGood := func(known map[shepherd.ProviderID]shepherd.Machine, machine shepherd.Machine) bool {
+		// If the machine is missing and not failed, its a bad case.
+		if known[machine.ID()] == nil && !machine.Failed() {
+			return true
+		}
+		return false
+	}
+
 	// Some desynchronization between the BMDB and Provider point of view might be so
 	// bad we shouldn't attempt to do any work, at least not any time soon.
 	badbadnotgood := false
 
 	// Find any machines supposedly managed by us in the provider, but not in the
 	// BMDB.
-	for machine, _ := range provider {
-		if bmdb[machine] {
-			managed[machine] = true
+	for id, machine := range provider {
+		if isBadBadNotGood(bmdb, machine) {
+			klog.Errorf("Provider machine has no corresponding machine in BMDB. (PID: %s)", id)
+			badbadnotgood = true
 			continue
 		}
-		klog.Errorf("Provider machine %s has no corresponding machine in BMDB.", machine)
-		badbadnotgood = true
+
+		managed[id] = true
 	}
 
 	// Find any machines in the BMDB but not in the provider.
-	for machine, _ := range bmdb {
-		if !provider[machine] {
-			klog.Errorf("Provider device ID %s referred to in BMDB (from TODO) but missing in provider.", machine)
+	for id, machine := range bmdb {
+		if isBadBadNotGood(provider, machine) {
+			klog.Errorf("Provider machine referred to in BMDB but missing in provider. (PID: %s)", id)
 			badbadnotgood = true
 		}
 	}
diff --git a/cloud/shepherd/manager/provisioner_test.go b/cloud/shepherd/manager/provisioner_test.go
index 5adc408..a145bdc 100644
--- a/cloud/shepherd/manager/provisioner_test.go
+++ b/cloud/shepherd/manager/provisioner_test.go
@@ -88,8 +88,8 @@
 func TestProvisioner_resolvePossiblyUsed(t *testing.T) {
 	const providedMachineID = "provided-machine"
 
-	providedMachines := map[shepherd.ProviderID]bool{
-		providedMachineID: true,
+	providedMachines := map[shepherd.ProviderID]shepherd.Machine{
+		providedMachineID: nil,
 	}
 
 	tests := []struct {
diff --git a/cloud/shepherd/mini/provider.go b/cloud/shepherd/mini/provider.go
index 05b628f..42b6348 100644
--- a/cloud/shepherd/mini/provider.go
+++ b/cloud/shepherd/mini/provider.go
@@ -26,6 +26,10 @@
 	Location   string              `json:"Location"`
 }
 
+func (d machine) Failed() bool {
+	return false
+}
+
 func (d machine) ID() shepherd.ProviderID {
 	return d.ProviderID
 }
diff --git a/cloud/shepherd/provider/equinix/provider.go b/cloud/shepherd/provider/equinix/provider.go
index edc8f3f..7d3c0d2 100644
--- a/cloud/shepherd/provider/equinix/provider.go
+++ b/cloud/shepherd/provider/equinix/provider.go
@@ -127,6 +127,10 @@
 	packngo.HardwareReservation
 }
 
+func (e reservation) Failed() bool {
+	return false
+}
+
 func (e reservation) ID() shepherd.ProviderID {
 	return shepherd.InvalidProviderID
 }
@@ -143,6 +147,10 @@
 	packngo.Device
 }
 
+func (e *machine) Failed() bool {
+	return e.Device.State == "failed"
+}
+
 func (e *machine) ID() shepherd.ProviderID {
 	return shepherd.ProviderID(e.Device.ID)
 }
diff --git a/cloud/shepherd/shepherd.go b/cloud/shepherd/shepherd.go
index 3504eb7..db5e75d 100644
--- a/cloud/shepherd/shepherd.go
+++ b/cloud/shepherd/shepherd.go
@@ -73,6 +73,10 @@
 	Addr() netip.Addr
 	// State returns the state in which the machine is
 	State() State
+	// Failed should return true if the machine is in a failed state and
+	// should be ignored if there are inconsistencies between the provider
+	// and BMDB.
+	Failed() bool
 }
 
 type CreateMachineRequest struct {