blob: 1f17e6986a8d47e95452bc7feddf692f9c1e3f60 [file] [log] [blame]
Tim Windelschmidt6d33a432025-02-04 14:34:25 +01001// Copyright The Monogon Project Authors.
2// SPDX-License-Identifier: Apache-2.0
3
Jan Schärd20ddcc2024-05-08 14:18:29 +02004package reconciler
5
6import (
7 "context"
8 "testing"
9 "time"
10
11 "go.etcd.io/etcd/tests/v3/integration"
12 "google.golang.org/protobuf/proto"
13 "k8s.io/client-go/kubernetes/fake"
14
15 "source.monogon.dev/metropolis/node/core/consensus/client"
16 "source.monogon.dev/metropolis/node/core/curator"
17 ppb "source.monogon.dev/metropolis/node/core/curator/proto/private"
Jan Schärb86917b2025-05-14 16:31:08 +000018 "source.monogon.dev/metropolis/node/core/productinfo"
Jan Schärd20ddcc2024-05-08 14:18:29 +020019 cpb "source.monogon.dev/metropolis/proto/common"
Tim Windelschmidt9f21f532024-05-07 15:14:20 +020020 "source.monogon.dev/osbase/supervisor"
Jan Schärd20ddcc2024-05-08 14:18:29 +020021 "source.monogon.dev/version"
22 vpb "source.monogon.dev/version/spec"
23)
24
// productInfo is the product information returned by productinfo.Get. The
// tests in this file read its Version field to compare the running release
// against minimum releases.
var productInfo = productinfo.Get()
26
Jan Schärd20ddcc2024-05-08 14:18:29 +020027// TestMinimumReleasesNotAboveMetropolisRelease tests that minimum releases
28// are not above the metropolis release itself, because that would cause
29// things to get stuck.
30func TestMinimumReleasesNotAboveMetropolisRelease(t *testing.T) {
Jan Schärb86917b2025-05-14 16:31:08 +000031 if version.ReleaseLessThan(productInfo.Version.Release, minReconcilerRelease) {
Jan Schärd20ddcc2024-05-08 14:18:29 +020032 t.Errorf("Metropolis release %s is below the minimum reconciler release %s",
Jan Schärb86917b2025-05-14 16:31:08 +000033 version.Semver(productInfo.Version),
Jan Schärd20ddcc2024-05-08 14:18:29 +020034 version.Release(minReconcilerRelease),
35 )
36 }
Jan Schärb86917b2025-05-14 16:31:08 +000037 if version.ReleaseLessThan(productInfo.Version.Release, minApiserverRelease) {
Jan Schärd20ddcc2024-05-08 14:18:29 +020038 t.Errorf("Metropolis release %s is below the minimum apiserver release %s",
Jan Schärb86917b2025-05-14 16:31:08 +000039 version.Semver(productInfo.Version),
Jan Schärd20ddcc2024-05-08 14:18:29 +020040 version.Release(minApiserverRelease),
41 )
42 }
43}
44
45// startEtcd creates an etcd cluster and client for testing.
46func startEtcd(t *testing.T) client.Namespaced {
47 t.Helper()
48 // Start a single-node etcd cluster.
49 integration.BeforeTestExternal(t)
50 cluster := integration.NewClusterV3(t, &integration.ClusterConfig{Size: 1})
51 t.Cleanup(func() {
52 cluster.Terminate(t)
53 })
54 // Create etcd client to test cluster.
55 curEtcd, _ := client.NewLocal(cluster.Client(0)).Sub("curator")
56 return curEtcd
57}
58
59func setStatus(t *testing.T, cl client.Namespaced, status *ppb.KubernetesReconcilerStatus) {
60 t.Helper()
61 ctx := context.Background()
62
63 statusBytes, err := proto.Marshal(status)
64 if err != nil {
65 t.Fatalf("Failed to marshal status: %v", err)
66 }
67
68 _, err = cl.Put(ctx, statusKey, string(statusBytes))
69 if err != nil {
70 t.Fatalf("Put: %v", err)
71 }
72}
73
74func makeNode(isController bool, release *vpb.Version_Release) *ppb.Node {
75 node := &ppb.Node{
76 Roles: &cpb.NodeRoles{},
77 Status: &cpb.NodeStatus{
78 Version: &vpb.Version{Release: release},
79 },
80 }
81 if isController {
82 node.Roles.KubernetesController = &cpb.NodeRoles_KubernetesController{}
83 }
84 return node
85}
86
87// putNode puts the node into etcd, or deletes if nil.
88// It returns the etcd revision of the operation.
89func putNode(t *testing.T, cl client.Namespaced, id string, node *ppb.Node) int64 {
90 t.Helper()
91 ctx := context.Background()
92
93 nkey, err := curator.NodeEtcdPrefix.Key(id)
94 if err != nil {
95 t.Fatal(err)
96 }
97 if node != nil {
98 nodeBytes, err := proto.Marshal(node)
99 if err != nil {
100 t.Fatalf("Failed to marshal node: %v", err)
101 }
102 resp, err := cl.Put(ctx, nkey, string(nodeBytes))
103 if err != nil {
104 t.Fatalf("Put: %v", err)
105 }
106 return resp.Header.Revision
107 } else {
108 resp, err := cl.Delete(ctx, nkey)
109 if err != nil {
110 t.Fatalf("Delete: %v", err)
111 }
112 return resp.Header.Revision
113 }
114}
115
116// TestWaitReady tests that WaitReady does not return too early, and the test
117// will time out if WaitReady fails to return when it is supposed to.
118func TestWaitReady(t *testing.T) {
119 cl := startEtcd(t)
Jan Schärb86917b2025-05-14 16:31:08 +0000120 s := Service{
121 Etcd: cl,
122 }
Jan Schärd20ddcc2024-05-08 14:18:29 +0200123
124 isReady := make(chan struct{})
125 supervisor.TestHarness(t, func(ctx context.Context) error {
Jan Schärb86917b2025-05-14 16:31:08 +0000126 err := s.WaitReady(ctx)
Jan Schärd20ddcc2024-05-08 14:18:29 +0200127 if err != nil {
128 t.Error(err)
129 }
130 close(isReady)
131 supervisor.Signal(ctx, supervisor.SignalHealthy)
132 supervisor.Signal(ctx, supervisor.SignalDone)
133 return nil
134 })
135
136 // status does not exist.
137 time.Sleep(10 * time.Millisecond)
138
139 // Version is too old.
140 setStatus(t, cl, &ppb.KubernetesReconcilerStatus{
141 State: ppb.KubernetesReconcilerStatus_STATE_DONE,
142 Version: &vpb.Version{
143 Release: &vpb.Version_Release{Major: 0, Minor: 0, Patch: 0},
144 },
145 MinimumCompatibleRelease: &vpb.Version_Release{Major: 0, Minor: 0, Patch: 0},
146 })
147 time.Sleep(10 * time.Millisecond)
148
149 // MinimumCompatibleRelease is too new.
150 setStatus(t, cl, &ppb.KubernetesReconcilerStatus{
151 State: ppb.KubernetesReconcilerStatus_STATE_DONE,
152 Version: &vpb.Version{
153 Release: &vpb.Version_Release{Major: 10000, Minor: 0, Patch: 0},
154 },
155 MinimumCompatibleRelease: &vpb.Version_Release{Major: 10000, Minor: 0, Patch: 0},
156 })
157 time.Sleep(10 * time.Millisecond)
158
159 select {
160 case <-isReady:
161 t.Fatal("WaitReady returned too early.")
162 default:
163 }
164
165 // Now set the status to something compatible.
166 setStatus(t, cl, &ppb.KubernetesReconcilerStatus{
167 State: ppb.KubernetesReconcilerStatus_STATE_DONE,
168 Version: &vpb.Version{
169 Release: &vpb.Version_Release{Major: 10000, Minor: 0, Patch: 0},
170 },
Jan Schärb86917b2025-05-14 16:31:08 +0000171 MinimumCompatibleRelease: productInfo.Version.Release,
Jan Schärd20ddcc2024-05-08 14:18:29 +0200172 })
173
174 <-isReady
175}
176
177// TestWatchNodes ensures that WatchNodes always updates releases correctly
178// as nodes are changed in various ways.
179func TestWatchNodes(t *testing.T) {
180 ctx := context.Background()
181 cl := startEtcd(t)
182 s := Service{
183 Etcd: cl,
184 }
185 w := s.releases.Watch()
186 defer w.Close()
187
188 expectReleases := func(expectMin, expectMax string, expectRev int64) {
189 t.Helper()
190 releases, err := w.Get(ctx)
191 if err != nil {
192 t.Fatal(err)
193 }
194 if actualMin := version.Release(releases.minRelease); actualMin != expectMin {
195 t.Fatalf("Expected minimum release %s, got %s", expectMin, actualMin)
196 }
197 if actualMax := version.Release(releases.maxRelease); actualMax != expectMax {
198 t.Fatalf("Expected maximum release %s, got %s", expectMax, actualMax)
199 }
200 if releases.revision != expectRev {
201 t.Fatalf("Expected revision %v, got %v", expectRev, releases.revision)
202 }
203 }
204
205 putNode(t, cl, "a1", makeNode(true, &vpb.Version_Release{Major: 0, Minor: 0, Patch: 2}))
206 putNode(t, cl, "a2", makeNode(true, &vpb.Version_Release{Major: 0, Minor: 0, Patch: 2}))
207 putNode(t, cl, "a3", makeNode(true, &vpb.Version_Release{Major: 0, Minor: 0, Patch: 2}))
208 putNode(t, cl, "b", makeNode(true, &vpb.Version_Release{Major: 0, Minor: 0, Patch: 3}))
209 rev := putNode(t, cl, "c", makeNode(true, &vpb.Version_Release{Major: 10000, Minor: 0, Patch: 0}))
210
211 supervisor.TestHarness(t, s.watchNodes)
212 expectReleases("0.0.2", "10000.0.0", rev)
213 // Node a1 is no longer a Kubernetes controller.
214 rev = putNode(t, cl, "a1", makeNode(false, &vpb.Version_Release{Major: 0, Minor: 0, Patch: 2}))
215 expectReleases("0.0.2", "10000.0.0", rev)
216 // Node a2 is deleted.
217 rev = putNode(t, cl, "a2", nil)
218 expectReleases("0.0.2", "10000.0.0", rev)
219 // Node a3 changes release. Now, the minimum should change.
220 rev = putNode(t, cl, "a3", makeNode(true, &vpb.Version_Release{Major: 0, Minor: 0, Patch: 4}))
221 expectReleases("0.0.3", "10000.0.0", rev)
222}
223
224// TestService tests the entire service, checking that it reconciles
225// only in situations where it should.
226func TestService(t *testing.T) {
227 reconcileWait = 10 * time.Millisecond
228 cl := startEtcd(t)
229 clientset := fake.NewSimpleClientset()
230 s := Service{
231 Etcd: cl,
232 ClientSet: clientset,
233 NodeID: "testnode",
234 }
235
236 // This node is newer than the local node, election should not start.
237 putNode(t, cl, "a", makeNode(true, &vpb.Version_Release{Major: 10000, Minor: 0, Patch: 0}))
238
239 cancelService, _ := supervisor.TestHarness(t, s.Run)
240
241 time.Sleep(50 * time.Millisecond)
242 if len(clientset.Actions()) != 0 {
243 t.Fatal("Actions shouldn't have been performed yet.")
244 }
245
246 // The status allows a too old node to start.
247 setStatus(t, cl, &ppb.KubernetesReconcilerStatus{
248 State: ppb.KubernetesReconcilerStatus_STATE_DONE,
249 Version: &vpb.Version{
250 Release: &vpb.Version_Release{Major: 0, Minor: 0, Patch: 2},
251 },
252 MinimumCompatibleRelease: &vpb.Version_Release{Major: 0, Minor: 0, Patch: 2},
253 })
254
255 // This node is too old, before minApiserverRelease.
256 putNode(t, cl, "a", makeNode(true, &vpb.Version_Release{Major: 0, Minor: 0, Patch: 2}))
257
258 // watch-releases restarts with 500 ms backoff + randomization, so wait 1s.
259 time.Sleep(time.Second)
260 if len(clientset.Actions()) != 0 {
261 t.Fatal("Actions shouldn't have been performed yet.")
262 }
263
264 // Upgrade the node.
265 putNode(t, cl, "a", makeNode(true, minApiserverRelease))
266
267 // Wait for status to be set.
268 waitForActions := func() {
269 isReady := make(chan struct{})
270 supervisor.TestHarness(t, func(ctx context.Context) error {
Jan Schärb86917b2025-05-14 16:31:08 +0000271 err := s.WaitReady(ctx)
Jan Schärd20ddcc2024-05-08 14:18:29 +0200272 if err != nil {
273 t.Error(err)
274 }
275 close(isReady)
276 supervisor.Signal(ctx, supervisor.SignalHealthy)
277 supervisor.Signal(ctx, supervisor.SignalDone)
278 return nil
279 })
280 <-isReady
281
282 if len(clientset.Actions()) == 0 {
283 t.Fatal("Actions should have been performed.")
284 }
285 clientset.ClearActions()
286 }
287 waitForActions()
288
289 // The status does not allow a too old node to start.
290 setStatus(t, cl, &ppb.KubernetesReconcilerStatus{
291 State: ppb.KubernetesReconcilerStatus_STATE_DONE,
292 Version: &vpb.Version{
293 Release: &vpb.Version_Release{Major: 0, Minor: 0, Patch: 2},
294 },
295 MinimumCompatibleRelease: &vpb.Version_Release{Major: 10000, Minor: 0, Patch: 0},
296 })
297
298 // This node is too old, before minApiserverRelease. But because it is not
299 // allowed to start, the reconciler is not blocked.
300 putNode(t, cl, "a", makeNode(true, &vpb.Version_Release{Major: 0, Minor: 0, Patch: 2}))
301
302 // Start another instance. The old node is still leader.
303 supervisor.TestHarness(t, s.Run)
304
305 time.Sleep(50 * time.Millisecond)
306 if len(clientset.Actions()) != 0 {
307 t.Fatal("Actions shouldn't have been performed yet.")
308 }
309
310 // Stop the first instance. Now the second instance should get elected.
311 cancelService()
312 waitForActions()
313}