blob: 7288c84f111b600dbc1a752ce682741097a036a2 [file] [log] [blame]
Lorenz Brunb15abad2020-04-16 11:17:12 +02001// Copyright 2020 The Monogon Project Authors.
2//
3// SPDX-License-Identifier: Apache-2.0
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
17package kubernetes
18
19import (
20 "context"
21 "errors"
22 "fmt"
Lorenz Brunb15abad2020-04-16 11:17:12 +020023 "os"
24 "path/filepath"
25
Lorenz Brun37050122021-03-30 14:00:27 +020026 "golang.org/x/sys/unix"
Lorenz Brunb15abad2020-04-16 11:17:12 +020027 v1 "k8s.io/api/core/v1"
28 storagev1 "k8s.io/api/storage/v1"
29 apierrs "k8s.io/apimachinery/pkg/api/errors"
30 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
31 "k8s.io/client-go/informers"
32 coreinformers "k8s.io/client-go/informers/core/v1"
33 storageinformers "k8s.io/client-go/informers/storage/v1"
34 "k8s.io/client-go/kubernetes"
35 "k8s.io/client-go/kubernetes/scheme"
36 typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
37 "k8s.io/client-go/tools/cache"
38 "k8s.io/client-go/tools/record"
39 ref "k8s.io/client-go/tools/reference"
40 "k8s.io/client-go/util/workqueue"
Serge Bazanskic2c7ad92020-07-13 17:20:09 +020041
Serge Bazanski31370b02021-01-07 16:31:14 +010042 "source.monogon.dev/metropolis/node/core/localstorage"
Serge Bazanski31370b02021-01-07 16:31:14 +010043 "source.monogon.dev/metropolis/pkg/fsquota"
Lorenz Brun37050122021-03-30 14:00:27 +020044 "source.monogon.dev/metropolis/pkg/logtree"
Serge Bazanski31370b02021-01-07 16:31:14 +010045 "source.monogon.dev/metropolis/pkg/supervisor"
Lorenz Brunb15abad2020-04-16 11:17:12 +020046)
47
// ONCHANGE(//metropolis/node/kubernetes/reconciler:resources_csi.go): needs to
// match the csiProvisionerServerName declared there.
//
// csiProvisionerServerName is the name under which this provisioner
// identifies itself: it is compared against the provisioner annotations on
// PVCs and PVs and against StorageClass.Provisioner, and it is written as the
// CSI driver name and event source component of everything this provisioner
// creates.
const csiProvisionerServerName = "dev.monogon.metropolis.vfs"
Lorenz Brunb15abad2020-04-16 11:17:12 +020051
// csiProvisionerServer is responsible for the provisioning and deprovisioning
// of CSI-based container volumes. It runs on all nodes and watches PVCs for
// ones assigned to the node it's running on and fulfills the provisioning
// request by creating a directory, applying a quota and creating the
// corresponding PV. When the PV is released and its retention policy is
// Delete, the directory and the PV resource are deleted.
//
// The exported fields must be populated by the caller before Run is invoked;
// the unexported fields are initialized by Run.
type csiProvisionerServer struct {
	// NodeName is the name of the node this provisioner runs on. Only PVCs
	// selected for, and PVs pinned to, this node are handled.
	NodeName string
	// Kubernetes is the API client used to emit events and to create and
	// delete PersistentVolume objects.
	Kubernetes kubernetes.Interface
	// InformerFactory provides the PV, PVC and StorageClass informers.
	InformerFactory informers.SharedInformerFactory
	// VolumesDirectory is the directory under which each volume is stored,
	// named after its volume ID.
	VolumesDirectory *localstorage.DataVolumesDirectory

	// claimQueue holds keys of PVCs that were added or updated.
	claimQueue workqueue.RateLimitingInterface
	// pvQueue holds keys of PVs that were added or updated.
	pvQueue workqueue.RateLimitingInterface
	// recorder emits Kubernetes events about provisioning results.
	recorder record.EventRecorder
	// Informers backing the listers used while processing queue items.
	pvcInformer          coreinformers.PersistentVolumeClaimInformer
	pvInformer           coreinformers.PersistentVolumeInformer
	storageClassInformer storageinformers.StorageClassInformer
	// logger is the supervisor-provided logger of this runnable.
	logger logtree.LeveledLogger
}
72
Serge Bazanski216fe7b2021-05-21 18:36:16 +020073// runCSIProvisioner runs the main provisioning machinery. It consists of a
74// bunch of informers which keep track of the events happening on the
75// Kubernetes control plane and informs us when something happens. If anything
76// happens to PVCs or PVs, we enqueue the identifier of that resource in a work
77// queue. Queues are being worked on by only one worker to limit load and avoid
78// complicated locking infrastructure. Failed items are requeued.
Serge Bazanskic2c7ad92020-07-13 17:20:09 +020079func (p *csiProvisionerServer) Run(ctx context.Context) error {
Serge Bazanski216fe7b2021-05-21 18:36:16 +020080 // The recorder is used to log Kubernetes events for successful or failed
81 // volume provisions. These events then show up in `kubectl describe pvc`
82 // and can be used by admins to debug issues with this provisioner.
Serge Bazanskic2c7ad92020-07-13 17:20:09 +020083 eventBroadcaster := record.NewBroadcaster()
84 eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: p.Kubernetes.CoreV1().Events("")})
85 p.recorder = eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: csiProvisionerServerName, Host: p.NodeName})
Lorenz Brunb15abad2020-04-16 11:17:12 +020086
Serge Bazanskic2c7ad92020-07-13 17:20:09 +020087 p.pvInformer = p.InformerFactory.Core().V1().PersistentVolumes()
88 p.pvcInformer = p.InformerFactory.Core().V1().PersistentVolumeClaims()
89 p.storageClassInformer = p.InformerFactory.Storage().V1().StorageClasses()
Lorenz Brunb15abad2020-04-16 11:17:12 +020090
Serge Bazanskic2c7ad92020-07-13 17:20:09 +020091 p.claimQueue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
92 p.pvQueue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
Lorenz Brunb15abad2020-04-16 11:17:12 +020093
Serge Bazanskic2c7ad92020-07-13 17:20:09 +020094 p.pvcInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
95 AddFunc: p.enqueueClaim,
96 UpdateFunc: func(old, new interface{}) {
97 p.enqueueClaim(new)
98 },
99 })
100 p.pvInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
101 AddFunc: p.enqueuePV,
102 UpdateFunc: func(old, new interface{}) {
103 p.enqueuePV(new)
104 },
105 })
106 p.logger = supervisor.Logger(ctx)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200107
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200108 go p.pvcInformer.Informer().Run(ctx.Done())
109 go p.pvInformer.Informer().Run(ctx.Done())
110 go p.storageClassInformer.Informer().Run(ctx.Done())
Lorenz Brunb15abad2020-04-16 11:17:12 +0200111
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200112 // These will self-terminate once the queues are shut down
113 go p.processQueueItems(p.claimQueue, func(key string) error {
114 return p.processPVC(key)
115 })
116 go p.processQueueItems(p.pvQueue, func(key string) error {
117 return p.processPV(key)
118 })
Lorenz Brunb15abad2020-04-16 11:17:12 +0200119
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200120 supervisor.Signal(ctx, supervisor.SignalHealthy)
121 <-ctx.Done()
122 p.claimQueue.ShutDown()
123 p.pvQueue.ShutDown()
124 return nil
Lorenz Brunb15abad2020-04-16 11:17:12 +0200125}
126
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200127// isOurPVC checks if the given PVC is is to be provisioned by this provisioner
128// and has been scheduled onto this node
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200129func (p *csiProvisionerServer) isOurPVC(pvc *v1.PersistentVolumeClaim) bool {
130 if pvc.ObjectMeta.Annotations["volume.beta.kubernetes.io/storage-provisioner"] != csiProvisionerServerName {
131 return false
132 }
133 if pvc.ObjectMeta.Annotations["volume.kubernetes.io/selected-node"] != p.NodeName {
134 return false
135 }
136 return true
Lorenz Brunb15abad2020-04-16 11:17:12 +0200137}
138
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200139// isOurPV checks if the given PV has been provisioned by this provisioner and
140// has been scheduled onto this node
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200141func (p *csiProvisionerServer) isOurPV(pv *v1.PersistentVolume) bool {
142 if pv.ObjectMeta.Annotations["pv.kubernetes.io/provisioned-by"] != csiProvisionerServerName {
143 return false
144 }
145 if pv.Spec.NodeAffinity.Required.NodeSelectorTerms[0].MatchExpressions[0].Values[0] != p.NodeName {
146 return false
147 }
148 return true
Lorenz Brunb15abad2020-04-16 11:17:12 +0200149}
150
151// enqueueClaim adds an added/changed PVC to the work queue
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200152func (p *csiProvisionerServer) enqueueClaim(obj interface{}) {
Lorenz Brunb15abad2020-04-16 11:17:12 +0200153 key, err := cache.MetaNamespaceKeyFunc(obj)
154 if err != nil {
Serge Bazanskic7359672020-10-30 16:38:57 +0100155 p.logger.Errorf("Not queuing PVC because key could not be derived: %v", err)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200156 return
157 }
158 p.claimQueue.Add(key)
159}
160
161// enqueuePV adds an added/changed PV to the work queue
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200162func (p *csiProvisionerServer) enqueuePV(obj interface{}) {
Lorenz Brunb15abad2020-04-16 11:17:12 +0200163 key, err := cache.MetaNamespaceKeyFunc(obj)
164 if err != nil {
Serge Bazanskic7359672020-10-30 16:38:57 +0100165 p.logger.Errorf("Not queuing PV because key could not be derived: %v", err)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200166 return
167 }
168 p.pvQueue.Add(key)
169}
170
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200171// processQueueItems gets items from the given work queue and calls the process
172// function for each of them. It self- terminates once the queue is shut down.
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200173func (p *csiProvisionerServer) processQueueItems(queue workqueue.RateLimitingInterface, process func(key string) error) {
Lorenz Brunb15abad2020-04-16 11:17:12 +0200174 for {
175 obj, shutdown := queue.Get()
176 if shutdown {
177 return
178 }
179
180 func(obj interface{}) {
181 defer queue.Done(obj)
182 key, ok := obj.(string)
183 if !ok {
184 queue.Forget(obj)
Serge Bazanskic7359672020-10-30 16:38:57 +0100185 p.logger.Errorf("Expected string in workqueue, got %+v", obj)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200186 return
187 }
188
189 if err := process(key); err != nil {
Serge Bazanskic7359672020-10-30 16:38:57 +0100190 p.logger.Warningf("Failed processing item %q, requeueing (numrequeues: %d): %v", key, queue.NumRequeues(obj), err)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200191 queue.AddRateLimited(obj)
192 }
193
194 queue.Forget(obj)
195 }(obj)
196 }
197}
198
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200199// volumePath gets the path where the volume is stored.
200func (p *csiProvisionerServer) volumePath(volumeID string) string {
201 return filepath.Join(p.VolumesDirectory.FullPath(), volumeID)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200202}
203
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200204// processPVC looks at a single PVC item from the queue, determines if it needs
205// to be provisioned and logs the provisioning result to the recorder
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200206func (p *csiProvisionerServer) processPVC(key string) error {
Lorenz Brunb15abad2020-04-16 11:17:12 +0200207 namespace, name, err := cache.SplitMetaNamespaceKey(key)
208 if err != nil {
209 return fmt.Errorf("invalid resource key: %s", key)
210 }
211 pvc, err := p.pvcInformer.Lister().PersistentVolumeClaims(namespace).Get(name)
212 if apierrs.IsNotFound(err) {
213 return nil // nothing to do, no error
214 } else if err != nil {
215 return fmt.Errorf("failed to get PVC for processing: %w", err)
216 }
217
218 if !p.isOurPVC(pvc) {
219 return nil
220 }
221
222 if pvc.Status.Phase != "Pending" {
223 // If the PVC is not pending, we don't need to provision anything
224 return nil
225 }
226
227 storageClass, err := p.storageClassInformer.Lister().Get(*pvc.Spec.StorageClassName)
228 if err != nil {
229 return fmt.Errorf("")
230 }
231
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200232 if storageClass.Provisioner != csiProvisionerServerName {
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200233 // We're not responsible for this PVC. Can only happen if
234 // controller-manager makes a mistake setting the annotations, but
235 // we're bailing here anyways for safety.
Lorenz Brunb15abad2020-04-16 11:17:12 +0200236 return nil
237 }
238
239 err = p.provisionPVC(pvc, storageClass)
240
241 if err != nil {
242 p.recorder.Eventf(pvc, v1.EventTypeWarning, "ProvisioningFailed", "Failed to provision PV: %v", err)
243 return err
244 }
245 p.recorder.Eventf(pvc, v1.EventTypeNormal, "Provisioned", "Successfully provisioned PV")
246
247 return nil
248}
249
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200250// provisionPVC creates the directory where the volume lives, sets a quota for
251// the requested amount of storage and creates the PV object representing this
252// new volume
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200253func (p *csiProvisionerServer) provisionPVC(pvc *v1.PersistentVolumeClaim, storageClass *storagev1.StorageClass) error {
Lorenz Brunb15abad2020-04-16 11:17:12 +0200254 claimRef, err := ref.GetReference(scheme.Scheme, pvc)
255 if err != nil {
256 return fmt.Errorf("failed to get reference to PVC: %w", err)
257 }
258
259 storageReq := pvc.Spec.Resources.Requests[v1.ResourceStorage]
260 if storageReq.IsZero() {
261 return fmt.Errorf("PVC is not requesting any storage, this is not supported")
262 }
263 capacity, ok := storageReq.AsInt64()
264 if !ok {
265 return fmt.Errorf("PVC requesting more than 2^63 bytes of storage, this is not supported")
266 }
267
Lorenz Brunb15abad2020-04-16 11:17:12 +0200268 volumeID := "pvc-" + string(pvc.ObjectMeta.UID)
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200269 volumePath := p.volumePath(volumeID)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200270
Serge Bazanskic7359672020-10-30 16:38:57 +0100271 p.logger.Infof("Creating local PV %s", volumeID)
Lorenz Brun37050122021-03-30 14:00:27 +0200272
273 switch *pvc.Spec.VolumeMode {
274 case "", v1.PersistentVolumeFilesystem:
275 if err := os.Mkdir(volumePath, 0644); err != nil && !os.IsExist(err) {
276 return fmt.Errorf("failed to create volume directory: %w", err)
277 }
Lorenz Brun764a2de2021-11-22 16:26:36 +0100278 files, err := os.ReadDir(volumePath)
Lorenz Brun37050122021-03-30 14:00:27 +0200279 if err != nil {
280 return fmt.Errorf("failed to list files in newly-created volume: %w", err)
281 }
282 if len(files) > 0 {
283 return errors.New("newly-created volume already contains data, bailing")
284 }
285 if err := fsquota.SetQuota(volumePath, uint64(capacity), 100000); err != nil {
286 return fmt.Errorf("failed to update quota: %v", err)
287 }
288 case v1.PersistentVolumeBlock:
289 imageFile, err := os.OpenFile(volumePath, os.O_CREATE|os.O_RDWR, 0644)
290 if err != nil {
291 return fmt.Errorf("failed to create volume image: %w", err)
292 }
293 defer imageFile.Close()
294 if err := unix.Fallocate(int(imageFile.Fd()), 0, 0, capacity); err != nil {
295 return fmt.Errorf("failed to fallocate() volume image: %w", err)
296 }
297 default:
298 return fmt.Errorf("VolumeMode \"%s\" is unsupported", *pvc.Spec.VolumeMode)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200299 }
300
301 vol := &v1.PersistentVolume{
302 ObjectMeta: metav1.ObjectMeta{
303 Name: volumeID,
304 Annotations: map[string]string{
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200305 "pv.kubernetes.io/provisioned-by": csiProvisionerServerName},
Lorenz Brunb15abad2020-04-16 11:17:12 +0200306 },
307 Spec: v1.PersistentVolumeSpec{
308 AccessModes: []v1.PersistentVolumeAccessMode{v1.ReadWriteOnce},
309 Capacity: v1.ResourceList{
310 v1.ResourceStorage: storageReq, // We're always giving the exact amount
311 },
312 PersistentVolumeSource: v1.PersistentVolumeSource{
313 CSI: &v1.CSIPersistentVolumeSource{
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200314 Driver: csiProvisionerServerName,
Lorenz Brunb15abad2020-04-16 11:17:12 +0200315 VolumeHandle: volumeID,
316 },
317 },
Lorenz Brun37050122021-03-30 14:00:27 +0200318 ClaimRef: claimRef,
319 VolumeMode: pvc.Spec.VolumeMode,
Lorenz Brunb15abad2020-04-16 11:17:12 +0200320 NodeAffinity: &v1.VolumeNodeAffinity{
321 Required: &v1.NodeSelector{
322 NodeSelectorTerms: []v1.NodeSelectorTerm{
323 {
324 MatchExpressions: []v1.NodeSelectorRequirement{
325 {
326 Key: "kubernetes.io/hostname",
327 Operator: v1.NodeSelectorOpIn,
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200328 Values: []string{p.NodeName},
Lorenz Brunb15abad2020-04-16 11:17:12 +0200329 },
330 },
331 },
332 },
333 },
334 },
335 StorageClassName: *pvc.Spec.StorageClassName,
336 PersistentVolumeReclaimPolicy: *storageClass.ReclaimPolicy,
337 },
338 }
339
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200340 _, err = p.Kubernetes.CoreV1().PersistentVolumes().Create(context.Background(), vol, metav1.CreateOptions{})
341 if err != nil && !apierrs.IsAlreadyExists(err) {
Lorenz Brunb15abad2020-04-16 11:17:12 +0200342 return fmt.Errorf("failed to create PV object: %w", err)
343 }
344 return nil
345}
346
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200347// processPV looks at a single PV item from the queue and checks if it has been
348// released and needs to be deleted. If yes it deletes the associated quota,
349// directory and the PV object and logs the result to the recorder.
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200350func (p *csiProvisionerServer) processPV(key string) error {
Lorenz Brunb15abad2020-04-16 11:17:12 +0200351 _, name, err := cache.SplitMetaNamespaceKey(key)
352 if err != nil {
353 return fmt.Errorf("invalid resource key: %s", key)
354 }
355 pv, err := p.pvInformer.Lister().Get(name)
356 if apierrs.IsNotFound(err) {
357 return nil // nothing to do, no error
358 } else if err != nil {
359 return fmt.Errorf("failed to get PV for processing: %w", err)
360 }
361
362 if !p.isOurPV(pv) {
363 return nil
364 }
365 if pv.Spec.PersistentVolumeReclaimPolicy != v1.PersistentVolumeReclaimDelete || pv.Status.Phase != "Released" {
366 return nil
367 }
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200368 volumePath := p.volumePath(pv.Spec.CSI.VolumeHandle)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200369
370 // Log deletes for auditing purposes
Serge Bazanskic7359672020-10-30 16:38:57 +0100371 p.logger.Infof("Deleting persistent volume %s", pv.Spec.CSI.VolumeHandle)
Lorenz Brun37050122021-03-30 14:00:27 +0200372 switch *pv.Spec.VolumeMode {
373 case "", v1.PersistentVolumeFilesystem:
374 if err := fsquota.SetQuota(volumePath, 0, 0); err != nil {
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200375 // We record these here manually since a successful deletion
376 // removes the PV we'd be attaching them to.
Lorenz Brun37050122021-03-30 14:00:27 +0200377 p.recorder.Eventf(pv, v1.EventTypeWarning, "DeprovisioningFailed", "Failed to remove quota: %v", err)
378 return fmt.Errorf("failed to remove quota: %w", err)
379 }
380 if err := os.RemoveAll(volumePath); err != nil && !os.IsNotExist(err) {
381 p.recorder.Eventf(pv, v1.EventTypeWarning, "DeprovisioningFailed", "Failed to delete volume: %v", err)
382 return fmt.Errorf("failed to delete volume: %w", err)
383 }
384 case v1.PersistentVolumeBlock:
385 if err := os.Remove(volumePath); err != nil && !os.IsNotExist(err) {
386 p.recorder.Eventf(pv, v1.EventTypeWarning, "DeprovisioningFailed", "Failed to delete volume: %v", err)
387 return fmt.Errorf("failed to delete volume: %w", err)
388 }
389 default:
390 p.recorder.Eventf(pv, v1.EventTypeWarning, "DeprovisioningFailed", "Invalid volume mode \"%v\"", *pv.Spec.VolumeMode)
391 return fmt.Errorf("invalid volume mode \"%v\"", *pv.Spec.VolumeMode)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200392 }
393
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200394 err = p.Kubernetes.CoreV1().PersistentVolumes().Delete(context.Background(), pv.Name, metav1.DeleteOptions{})
Lorenz Brunb15abad2020-04-16 11:17:12 +0200395 if err != nil && !apierrs.IsNotFound(err) {
396 p.recorder.Eventf(pv, v1.EventTypeWarning, "DeprovisioningFailed", "Failed to delete PV object from K8s API: %v", err)
397 return fmt.Errorf("failed to delete PV object: %w", err)
398 }
399 return nil
400}