| Tim Windelschmidt | 6d33a43 | 2025-02-04 14:34:25 +0100 | [diff] [blame] | 1 | // Copyright The Monogon Project Authors. |
| 2 | // SPDX-License-Identifier: Apache-2.0 |
| 3 | |
| Jan Schär | d20ddcc | 2024-05-08 14:18:29 +0200 | [diff] [blame] | 4 | package reconciler |
| 5 | |
| 6 | import ( |
| 7 | "context" |
| 8 | "testing" |
| 9 | "time" |
| 10 | |
| Lorenz Brun | 62229cf | 2025-07-07 12:47:31 +0200 | [diff] [blame] | 11 | "go.etcd.io/etcd/tests/v3/framework/integration" |
| Jan Schär | d20ddcc | 2024-05-08 14:18:29 +0200 | [diff] [blame] | 12 | "google.golang.org/protobuf/proto" |
| 13 | "k8s.io/client-go/kubernetes/fake" |
| 14 | |
| 15 | "source.monogon.dev/metropolis/node/core/consensus/client" |
| 16 | "source.monogon.dev/metropolis/node/core/curator" |
| 17 | ppb "source.monogon.dev/metropolis/node/core/curator/proto/private" |
| Jan Schär | b86917b | 2025-05-14 16:31:08 +0000 | [diff] [blame] | 18 | "source.monogon.dev/metropolis/node/core/productinfo" |
| Jan Schär | d20ddcc | 2024-05-08 14:18:29 +0200 | [diff] [blame] | 19 | cpb "source.monogon.dev/metropolis/proto/common" |
| Tim Windelschmidt | 9f21f53 | 2024-05-07 15:14:20 +0200 | [diff] [blame] | 20 | "source.monogon.dev/osbase/supervisor" |
| Jan Schär | d20ddcc | 2024-05-08 14:18:29 +0200 | [diff] [blame] | 21 | "source.monogon.dev/version" |
| 22 | vpb "source.monogon.dev/version/spec" |
| 23 | ) |
| 24 | |
| Jan Schär | b86917b | 2025-05-14 16:31:08 +0000 | [diff] [blame] | 25 | var productInfo = productinfo.Get() |
| 26 | |
| Jan Schär | d20ddcc | 2024-05-08 14:18:29 +0200 | [diff] [blame] | 27 | // TestMinimumReleasesNotAboveMetropolisRelease tests that minimum releases |
| 28 | // are not above the metropolis release itself, because that would cause |
| 29 | // things to get stuck. |
| 30 | func TestMinimumReleasesNotAboveMetropolisRelease(t *testing.T) { |
| Jan Schär | b86917b | 2025-05-14 16:31:08 +0000 | [diff] [blame] | 31 | if version.ReleaseLessThan(productInfo.Version.Release, minReconcilerRelease) { |
| Jan Schär | d20ddcc | 2024-05-08 14:18:29 +0200 | [diff] [blame] | 32 | t.Errorf("Metropolis release %s is below the minimum reconciler release %s", |
| Jan Schär | b86917b | 2025-05-14 16:31:08 +0000 | [diff] [blame] | 33 | version.Semver(productInfo.Version), |
| Jan Schär | d20ddcc | 2024-05-08 14:18:29 +0200 | [diff] [blame] | 34 | version.Release(minReconcilerRelease), |
| 35 | ) |
| 36 | } |
| Jan Schär | b86917b | 2025-05-14 16:31:08 +0000 | [diff] [blame] | 37 | if version.ReleaseLessThan(productInfo.Version.Release, minApiserverRelease) { |
| Jan Schär | d20ddcc | 2024-05-08 14:18:29 +0200 | [diff] [blame] | 38 | t.Errorf("Metropolis release %s is below the minimum apiserver release %s", |
| Jan Schär | b86917b | 2025-05-14 16:31:08 +0000 | [diff] [blame] | 39 | version.Semver(productInfo.Version), |
| Jan Schär | d20ddcc | 2024-05-08 14:18:29 +0200 | [diff] [blame] | 40 | version.Release(minApiserverRelease), |
| 41 | ) |
| 42 | } |
| 43 | } |
| 44 | |
| 45 | // startEtcd creates an etcd cluster and client for testing. |
| 46 | func startEtcd(t *testing.T) client.Namespaced { |
| 47 | t.Helper() |
| 48 | // Start a single-node etcd cluster. |
| 49 | integration.BeforeTestExternal(t) |
| Lorenz Brun | 62229cf | 2025-07-07 12:47:31 +0200 | [diff] [blame] | 50 | cluster := integration.NewCluster(t, &integration.ClusterConfig{Size: 1}) |
| Jan Schär | d20ddcc | 2024-05-08 14:18:29 +0200 | [diff] [blame] | 51 | t.Cleanup(func() { |
| 52 | cluster.Terminate(t) |
| 53 | }) |
| 54 | // Create etcd client to test cluster. |
| 55 | curEtcd, _ := client.NewLocal(cluster.Client(0)).Sub("curator") |
| 56 | return curEtcd |
| 57 | } |
| 58 | |
| 59 | func setStatus(t *testing.T, cl client.Namespaced, status *ppb.KubernetesReconcilerStatus) { |
| 60 | t.Helper() |
| 61 | ctx := context.Background() |
| 62 | |
| 63 | statusBytes, err := proto.Marshal(status) |
| 64 | if err != nil { |
| 65 | t.Fatalf("Failed to marshal status: %v", err) |
| 66 | } |
| 67 | |
| 68 | _, err = cl.Put(ctx, statusKey, string(statusBytes)) |
| 69 | if err != nil { |
| 70 | t.Fatalf("Put: %v", err) |
| 71 | } |
| 72 | } |
| 73 | |
| 74 | func makeNode(isController bool, release *vpb.Version_Release) *ppb.Node { |
| 75 | node := &ppb.Node{ |
| 76 | Roles: &cpb.NodeRoles{}, |
| 77 | Status: &cpb.NodeStatus{ |
| 78 | Version: &vpb.Version{Release: release}, |
| 79 | }, |
| 80 | } |
| 81 | if isController { |
| 82 | node.Roles.KubernetesController = &cpb.NodeRoles_KubernetesController{} |
| 83 | } |
| 84 | return node |
| 85 | } |
| 86 | |
| 87 | // putNode puts the node into etcd, or deletes if nil. |
| 88 | // It returns the etcd revision of the operation. |
| 89 | func putNode(t *testing.T, cl client.Namespaced, id string, node *ppb.Node) int64 { |
| 90 | t.Helper() |
| 91 | ctx := context.Background() |
| 92 | |
| 93 | nkey, err := curator.NodeEtcdPrefix.Key(id) |
| 94 | if err != nil { |
| 95 | t.Fatal(err) |
| 96 | } |
| 97 | if node != nil { |
| 98 | nodeBytes, err := proto.Marshal(node) |
| 99 | if err != nil { |
| 100 | t.Fatalf("Failed to marshal node: %v", err) |
| 101 | } |
| 102 | resp, err := cl.Put(ctx, nkey, string(nodeBytes)) |
| 103 | if err != nil { |
| 104 | t.Fatalf("Put: %v", err) |
| 105 | } |
| 106 | return resp.Header.Revision |
| 107 | } else { |
| 108 | resp, err := cl.Delete(ctx, nkey) |
| 109 | if err != nil { |
| 110 | t.Fatalf("Delete: %v", err) |
| 111 | } |
| 112 | return resp.Header.Revision |
| 113 | } |
| 114 | } |
| 115 | |
| 116 | // TestWaitReady tests that WaitReady does not return too early, and the test |
| 117 | // will time out if WaitReady fails to return when it is supposed to. |
| 118 | func TestWaitReady(t *testing.T) { |
| 119 | cl := startEtcd(t) |
| Jan Schär | b86917b | 2025-05-14 16:31:08 +0000 | [diff] [blame] | 120 | s := Service{ |
| 121 | Etcd: cl, |
| 122 | } |
| Jan Schär | d20ddcc | 2024-05-08 14:18:29 +0200 | [diff] [blame] | 123 | |
| 124 | isReady := make(chan struct{}) |
| 125 | supervisor.TestHarness(t, func(ctx context.Context) error { |
| Jan Schär | b86917b | 2025-05-14 16:31:08 +0000 | [diff] [blame] | 126 | err := s.WaitReady(ctx) |
| Jan Schär | d20ddcc | 2024-05-08 14:18:29 +0200 | [diff] [blame] | 127 | if err != nil { |
| 128 | t.Error(err) |
| 129 | } |
| 130 | close(isReady) |
| 131 | supervisor.Signal(ctx, supervisor.SignalHealthy) |
| 132 | supervisor.Signal(ctx, supervisor.SignalDone) |
| 133 | return nil |
| 134 | }) |
| 135 | |
| 136 | // status does not exist. |
| 137 | time.Sleep(10 * time.Millisecond) |
| 138 | |
| 139 | // Version is too old. |
| 140 | setStatus(t, cl, &ppb.KubernetesReconcilerStatus{ |
| 141 | State: ppb.KubernetesReconcilerStatus_STATE_DONE, |
| 142 | Version: &vpb.Version{ |
| 143 | Release: &vpb.Version_Release{Major: 0, Minor: 0, Patch: 0}, |
| 144 | }, |
| 145 | MinimumCompatibleRelease: &vpb.Version_Release{Major: 0, Minor: 0, Patch: 0}, |
| 146 | }) |
| 147 | time.Sleep(10 * time.Millisecond) |
| 148 | |
| 149 | // MinimumCompatibleRelease is too new. |
| 150 | setStatus(t, cl, &ppb.KubernetesReconcilerStatus{ |
| 151 | State: ppb.KubernetesReconcilerStatus_STATE_DONE, |
| 152 | Version: &vpb.Version{ |
| 153 | Release: &vpb.Version_Release{Major: 10000, Minor: 0, Patch: 0}, |
| 154 | }, |
| 155 | MinimumCompatibleRelease: &vpb.Version_Release{Major: 10000, Minor: 0, Patch: 0}, |
| 156 | }) |
| 157 | time.Sleep(10 * time.Millisecond) |
| 158 | |
| 159 | select { |
| 160 | case <-isReady: |
| 161 | t.Fatal("WaitReady returned too early.") |
| 162 | default: |
| 163 | } |
| 164 | |
| 165 | // Now set the status to something compatible. |
| 166 | setStatus(t, cl, &ppb.KubernetesReconcilerStatus{ |
| 167 | State: ppb.KubernetesReconcilerStatus_STATE_DONE, |
| 168 | Version: &vpb.Version{ |
| 169 | Release: &vpb.Version_Release{Major: 10000, Minor: 0, Patch: 0}, |
| 170 | }, |
| Jan Schär | b86917b | 2025-05-14 16:31:08 +0000 | [diff] [blame] | 171 | MinimumCompatibleRelease: productInfo.Version.Release, |
| Jan Schär | d20ddcc | 2024-05-08 14:18:29 +0200 | [diff] [blame] | 172 | }) |
| 173 | |
| 174 | <-isReady |
| 175 | } |
| 176 | |
| 177 | // TestWatchNodes ensures that WatchNodes always updates releases correctly |
| 178 | // as nodes are changed in various ways. |
| 179 | func TestWatchNodes(t *testing.T) { |
| 180 | ctx := context.Background() |
| 181 | cl := startEtcd(t) |
| 182 | s := Service{ |
| 183 | Etcd: cl, |
| 184 | } |
| 185 | w := s.releases.Watch() |
| 186 | defer w.Close() |
| 187 | |
| 188 | expectReleases := func(expectMin, expectMax string, expectRev int64) { |
| 189 | t.Helper() |
| 190 | releases, err := w.Get(ctx) |
| 191 | if err != nil { |
| 192 | t.Fatal(err) |
| 193 | } |
| 194 | if actualMin := version.Release(releases.minRelease); actualMin != expectMin { |
| 195 | t.Fatalf("Expected minimum release %s, got %s", expectMin, actualMin) |
| 196 | } |
| 197 | if actualMax := version.Release(releases.maxRelease); actualMax != expectMax { |
| 198 | t.Fatalf("Expected maximum release %s, got %s", expectMax, actualMax) |
| 199 | } |
| 200 | if releases.revision != expectRev { |
| 201 | t.Fatalf("Expected revision %v, got %v", expectRev, releases.revision) |
| 202 | } |
| 203 | } |
| 204 | |
| 205 | putNode(t, cl, "a1", makeNode(true, &vpb.Version_Release{Major: 0, Minor: 0, Patch: 2})) |
| 206 | putNode(t, cl, "a2", makeNode(true, &vpb.Version_Release{Major: 0, Minor: 0, Patch: 2})) |
| 207 | putNode(t, cl, "a3", makeNode(true, &vpb.Version_Release{Major: 0, Minor: 0, Patch: 2})) |
| 208 | putNode(t, cl, "b", makeNode(true, &vpb.Version_Release{Major: 0, Minor: 0, Patch: 3})) |
| 209 | rev := putNode(t, cl, "c", makeNode(true, &vpb.Version_Release{Major: 10000, Minor: 0, Patch: 0})) |
| 210 | |
| 211 | supervisor.TestHarness(t, s.watchNodes) |
| 212 | expectReleases("0.0.2", "10000.0.0", rev) |
| 213 | // Node a1 is no longer a Kubernetes controller. |
| 214 | rev = putNode(t, cl, "a1", makeNode(false, &vpb.Version_Release{Major: 0, Minor: 0, Patch: 2})) |
| 215 | expectReleases("0.0.2", "10000.0.0", rev) |
| 216 | // Node a2 is deleted. |
| 217 | rev = putNode(t, cl, "a2", nil) |
| 218 | expectReleases("0.0.2", "10000.0.0", rev) |
| 219 | // Node a3 changes release. Now, the minimum should change. |
| 220 | rev = putNode(t, cl, "a3", makeNode(true, &vpb.Version_Release{Major: 0, Minor: 0, Patch: 4})) |
| 221 | expectReleases("0.0.3", "10000.0.0", rev) |
| 222 | } |
| 223 | |
| 224 | // TestService tests the entire service, checking that it reconciles |
| 225 | // only in situations where it should. |
| 226 | func TestService(t *testing.T) { |
| 227 | reconcileWait = 10 * time.Millisecond |
| 228 | cl := startEtcd(t) |
| 229 | clientset := fake.NewSimpleClientset() |
| 230 | s := Service{ |
| 231 | Etcd: cl, |
| 232 | ClientSet: clientset, |
| 233 | NodeID: "testnode", |
| 234 | } |
| 235 | |
| 236 | // This node is newer than the local node, election should not start. |
| 237 | putNode(t, cl, "a", makeNode(true, &vpb.Version_Release{Major: 10000, Minor: 0, Patch: 0})) |
| 238 | |
| 239 | cancelService, _ := supervisor.TestHarness(t, s.Run) |
| 240 | |
| 241 | time.Sleep(50 * time.Millisecond) |
| 242 | if len(clientset.Actions()) != 0 { |
| 243 | t.Fatal("Actions shouldn't have been performed yet.") |
| 244 | } |
| 245 | |
| 246 | // The status allows a too old node to start. |
| 247 | setStatus(t, cl, &ppb.KubernetesReconcilerStatus{ |
| 248 | State: ppb.KubernetesReconcilerStatus_STATE_DONE, |
| 249 | Version: &vpb.Version{ |
| 250 | Release: &vpb.Version_Release{Major: 0, Minor: 0, Patch: 2}, |
| 251 | }, |
| 252 | MinimumCompatibleRelease: &vpb.Version_Release{Major: 0, Minor: 0, Patch: 2}, |
| 253 | }) |
| 254 | |
| 255 | // This node is too old, before minApiserverRelease. |
| 256 | putNode(t, cl, "a", makeNode(true, &vpb.Version_Release{Major: 0, Minor: 0, Patch: 2})) |
| 257 | |
| 258 | // watch-releases restarts with 500 ms backoff + randomization, so wait 1s. |
| 259 | time.Sleep(time.Second) |
| 260 | if len(clientset.Actions()) != 0 { |
| 261 | t.Fatal("Actions shouldn't have been performed yet.") |
| 262 | } |
| 263 | |
| 264 | // Upgrade the node. |
| 265 | putNode(t, cl, "a", makeNode(true, minApiserverRelease)) |
| 266 | |
| 267 | // Wait for status to be set. |
| 268 | waitForActions := func() { |
| 269 | isReady := make(chan struct{}) |
| 270 | supervisor.TestHarness(t, func(ctx context.Context) error { |
| Jan Schär | b86917b | 2025-05-14 16:31:08 +0000 | [diff] [blame] | 271 | err := s.WaitReady(ctx) |
| Jan Schär | d20ddcc | 2024-05-08 14:18:29 +0200 | [diff] [blame] | 272 | if err != nil { |
| 273 | t.Error(err) |
| 274 | } |
| 275 | close(isReady) |
| 276 | supervisor.Signal(ctx, supervisor.SignalHealthy) |
| 277 | supervisor.Signal(ctx, supervisor.SignalDone) |
| 278 | return nil |
| 279 | }) |
| 280 | <-isReady |
| 281 | |
| 282 | if len(clientset.Actions()) == 0 { |
| 283 | t.Fatal("Actions should have been performed.") |
| 284 | } |
| 285 | clientset.ClearActions() |
| 286 | } |
| 287 | waitForActions() |
| 288 | |
| 289 | // The status does not allow a too old node to start. |
| 290 | setStatus(t, cl, &ppb.KubernetesReconcilerStatus{ |
| 291 | State: ppb.KubernetesReconcilerStatus_STATE_DONE, |
| 292 | Version: &vpb.Version{ |
| 293 | Release: &vpb.Version_Release{Major: 0, Minor: 0, Patch: 2}, |
| 294 | }, |
| 295 | MinimumCompatibleRelease: &vpb.Version_Release{Major: 10000, Minor: 0, Patch: 0}, |
| 296 | }) |
| 297 | |
| 298 | // This node is too old, before minApiserverRelease. But because it is not |
| 299 | // allowed to start, the reconciler is not blocked. |
| 300 | putNode(t, cl, "a", makeNode(true, &vpb.Version_Release{Major: 0, Minor: 0, Patch: 2})) |
| 301 | |
| 302 | // Start another instance. The old node is still leader. |
| 303 | supervisor.TestHarness(t, s.Run) |
| 304 | |
| 305 | time.Sleep(50 * time.Millisecond) |
| 306 | if len(clientset.Actions()) != 0 { |
| 307 | t.Fatal("Actions shouldn't have been performed yet.") |
| 308 | } |
| 309 | |
| 310 | // Stop the first instance. Now the second instance should get elected. |
| 311 | cancelService() |
| 312 | waitForActions() |
| 313 | } |