blob: 42edf77137ae46cc2472b3e9fb510223b009ef4f [file] [log] [blame]
// Copyright 2020 The Monogon Project Authors.
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
16
17package kubernetes
18
19import (
20 "context"
21 "errors"
22 "fmt"
23 "io/ioutil"
24 "os"
25 "path/filepath"
26
Lorenz Brun37050122021-03-30 14:00:27 +020027 "golang.org/x/sys/unix"
Lorenz Brunb15abad2020-04-16 11:17:12 +020028 v1 "k8s.io/api/core/v1"
29 storagev1 "k8s.io/api/storage/v1"
30 apierrs "k8s.io/apimachinery/pkg/api/errors"
31 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
32 "k8s.io/client-go/informers"
33 coreinformers "k8s.io/client-go/informers/core/v1"
34 storageinformers "k8s.io/client-go/informers/storage/v1"
35 "k8s.io/client-go/kubernetes"
36 "k8s.io/client-go/kubernetes/scheme"
37 typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
38 "k8s.io/client-go/tools/cache"
39 "k8s.io/client-go/tools/record"
40 ref "k8s.io/client-go/tools/reference"
41 "k8s.io/client-go/util/workqueue"
Serge Bazanskic2c7ad92020-07-13 17:20:09 +020042
Serge Bazanski31370b02021-01-07 16:31:14 +010043 "source.monogon.dev/metropolis/node/core/localstorage"
Serge Bazanski31370b02021-01-07 16:31:14 +010044 "source.monogon.dev/metropolis/pkg/fsquota"
Lorenz Brun37050122021-03-30 14:00:27 +020045 "source.monogon.dev/metropolis/pkg/logtree"
Serge Bazanski31370b02021-01-07 16:31:14 +010046 "source.monogon.dev/metropolis/pkg/supervisor"
Lorenz Brunb15abad2020-04-16 11:17:12 +020047)
48
// csiProvisionerServerName is the identifier this provisioner uses for itself:
// it is the event source component, the value of the provisioned-by
// annotation and the CSI driver name on PVs it creates, and the provisioner
// name it expects on storage classes.
//
// ONCHANGE(//metropolis/node/kubernetes/reconciler:resources_csi.go): needs to
// match csiProvisionerServerName declared.
const (
	csiProvisionerServerName = "dev.monogon.metropolis.vfs"
)
Lorenz Brunb15abad2020-04-16 11:17:12 +020052
Serge Bazanski216fe7b2021-05-21 18:36:16 +020053// csiProvisionerServer is responsible for the provisioning and deprovisioning
54// of CSI-based container volumes. It runs on all nodes and watches PVCs for
55// ones assigned to the node it's running on and fulfills the provisioning
56// request by creating a directory, applying a quota and creating the
57// corresponding PV. When the PV is released and its retention policy is
58// Delete, the directory and the PV resource are deleted.
Serge Bazanskic2c7ad92020-07-13 17:20:09 +020059type csiProvisionerServer struct {
60 NodeName string
61 Kubernetes kubernetes.Interface
62 InformerFactory informers.SharedInformerFactory
63 VolumesDirectory *localstorage.DataVolumesDirectory
64
Lorenz Brunb15abad2020-04-16 11:17:12 +020065 claimQueue workqueue.RateLimitingInterface
66 pvQueue workqueue.RateLimitingInterface
67 recorder record.EventRecorder
68 pvcInformer coreinformers.PersistentVolumeClaimInformer
69 pvInformer coreinformers.PersistentVolumeInformer
70 storageClassInformer storageinformers.StorageClassInformer
Serge Bazanskic7359672020-10-30 16:38:57 +010071 logger logtree.LeveledLogger
Lorenz Brunb15abad2020-04-16 11:17:12 +020072}
73
Serge Bazanski216fe7b2021-05-21 18:36:16 +020074// runCSIProvisioner runs the main provisioning machinery. It consists of a
75// bunch of informers which keep track of the events happening on the
76// Kubernetes control plane and informs us when something happens. If anything
77// happens to PVCs or PVs, we enqueue the identifier of that resource in a work
78// queue. Queues are being worked on by only one worker to limit load and avoid
79// complicated locking infrastructure. Failed items are requeued.
Serge Bazanskic2c7ad92020-07-13 17:20:09 +020080func (p *csiProvisionerServer) Run(ctx context.Context) error {
Serge Bazanski216fe7b2021-05-21 18:36:16 +020081 // The recorder is used to log Kubernetes events for successful or failed
82 // volume provisions. These events then show up in `kubectl describe pvc`
83 // and can be used by admins to debug issues with this provisioner.
Serge Bazanskic2c7ad92020-07-13 17:20:09 +020084 eventBroadcaster := record.NewBroadcaster()
85 eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: p.Kubernetes.CoreV1().Events("")})
86 p.recorder = eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: csiProvisionerServerName, Host: p.NodeName})
Lorenz Brunb15abad2020-04-16 11:17:12 +020087
Serge Bazanskic2c7ad92020-07-13 17:20:09 +020088 p.pvInformer = p.InformerFactory.Core().V1().PersistentVolumes()
89 p.pvcInformer = p.InformerFactory.Core().V1().PersistentVolumeClaims()
90 p.storageClassInformer = p.InformerFactory.Storage().V1().StorageClasses()
Lorenz Brunb15abad2020-04-16 11:17:12 +020091
Serge Bazanskic2c7ad92020-07-13 17:20:09 +020092 p.claimQueue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
93 p.pvQueue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
Lorenz Brunb15abad2020-04-16 11:17:12 +020094
Serge Bazanskic2c7ad92020-07-13 17:20:09 +020095 p.pvcInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
96 AddFunc: p.enqueueClaim,
97 UpdateFunc: func(old, new interface{}) {
98 p.enqueueClaim(new)
99 },
100 })
101 p.pvInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
102 AddFunc: p.enqueuePV,
103 UpdateFunc: func(old, new interface{}) {
104 p.enqueuePV(new)
105 },
106 })
107 p.logger = supervisor.Logger(ctx)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200108
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200109 go p.pvcInformer.Informer().Run(ctx.Done())
110 go p.pvInformer.Informer().Run(ctx.Done())
111 go p.storageClassInformer.Informer().Run(ctx.Done())
Lorenz Brunb15abad2020-04-16 11:17:12 +0200112
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200113 // These will self-terminate once the queues are shut down
114 go p.processQueueItems(p.claimQueue, func(key string) error {
115 return p.processPVC(key)
116 })
117 go p.processQueueItems(p.pvQueue, func(key string) error {
118 return p.processPV(key)
119 })
Lorenz Brunb15abad2020-04-16 11:17:12 +0200120
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200121 supervisor.Signal(ctx, supervisor.SignalHealthy)
122 <-ctx.Done()
123 p.claimQueue.ShutDown()
124 p.pvQueue.ShutDown()
125 return nil
Lorenz Brunb15abad2020-04-16 11:17:12 +0200126}
127
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200128// isOurPVC checks if the given PVC is is to be provisioned by this provisioner
129// and has been scheduled onto this node
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200130func (p *csiProvisionerServer) isOurPVC(pvc *v1.PersistentVolumeClaim) bool {
131 if pvc.ObjectMeta.Annotations["volume.beta.kubernetes.io/storage-provisioner"] != csiProvisionerServerName {
132 return false
133 }
134 if pvc.ObjectMeta.Annotations["volume.kubernetes.io/selected-node"] != p.NodeName {
135 return false
136 }
137 return true
Lorenz Brunb15abad2020-04-16 11:17:12 +0200138}
139
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200140// isOurPV checks if the given PV has been provisioned by this provisioner and
141// has been scheduled onto this node
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200142func (p *csiProvisionerServer) isOurPV(pv *v1.PersistentVolume) bool {
143 if pv.ObjectMeta.Annotations["pv.kubernetes.io/provisioned-by"] != csiProvisionerServerName {
144 return false
145 }
146 if pv.Spec.NodeAffinity.Required.NodeSelectorTerms[0].MatchExpressions[0].Values[0] != p.NodeName {
147 return false
148 }
149 return true
Lorenz Brunb15abad2020-04-16 11:17:12 +0200150}
151
152// enqueueClaim adds an added/changed PVC to the work queue
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200153func (p *csiProvisionerServer) enqueueClaim(obj interface{}) {
Lorenz Brunb15abad2020-04-16 11:17:12 +0200154 key, err := cache.MetaNamespaceKeyFunc(obj)
155 if err != nil {
Serge Bazanskic7359672020-10-30 16:38:57 +0100156 p.logger.Errorf("Not queuing PVC because key could not be derived: %v", err)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200157 return
158 }
159 p.claimQueue.Add(key)
160}
161
162// enqueuePV adds an added/changed PV to the work queue
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200163func (p *csiProvisionerServer) enqueuePV(obj interface{}) {
Lorenz Brunb15abad2020-04-16 11:17:12 +0200164 key, err := cache.MetaNamespaceKeyFunc(obj)
165 if err != nil {
Serge Bazanskic7359672020-10-30 16:38:57 +0100166 p.logger.Errorf("Not queuing PV because key could not be derived: %v", err)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200167 return
168 }
169 p.pvQueue.Add(key)
170}
171
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200172// processQueueItems gets items from the given work queue and calls the process
173// function for each of them. It self- terminates once the queue is shut down.
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200174func (p *csiProvisionerServer) processQueueItems(queue workqueue.RateLimitingInterface, process func(key string) error) {
Lorenz Brunb15abad2020-04-16 11:17:12 +0200175 for {
176 obj, shutdown := queue.Get()
177 if shutdown {
178 return
179 }
180
181 func(obj interface{}) {
182 defer queue.Done(obj)
183 key, ok := obj.(string)
184 if !ok {
185 queue.Forget(obj)
Serge Bazanskic7359672020-10-30 16:38:57 +0100186 p.logger.Errorf("Expected string in workqueue, got %+v", obj)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200187 return
188 }
189
190 if err := process(key); err != nil {
Serge Bazanskic7359672020-10-30 16:38:57 +0100191 p.logger.Warningf("Failed processing item %q, requeueing (numrequeues: %d): %v", key, queue.NumRequeues(obj), err)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200192 queue.AddRateLimited(obj)
193 }
194
195 queue.Forget(obj)
196 }(obj)
197 }
198}
199
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200200// volumePath gets the path where the volume is stored.
201func (p *csiProvisionerServer) volumePath(volumeID string) string {
202 return filepath.Join(p.VolumesDirectory.FullPath(), volumeID)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200203}
204
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200205// processPVC looks at a single PVC item from the queue, determines if it needs
206// to be provisioned and logs the provisioning result to the recorder
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200207func (p *csiProvisionerServer) processPVC(key string) error {
Lorenz Brunb15abad2020-04-16 11:17:12 +0200208 namespace, name, err := cache.SplitMetaNamespaceKey(key)
209 if err != nil {
210 return fmt.Errorf("invalid resource key: %s", key)
211 }
212 pvc, err := p.pvcInformer.Lister().PersistentVolumeClaims(namespace).Get(name)
213 if apierrs.IsNotFound(err) {
214 return nil // nothing to do, no error
215 } else if err != nil {
216 return fmt.Errorf("failed to get PVC for processing: %w", err)
217 }
218
219 if !p.isOurPVC(pvc) {
220 return nil
221 }
222
223 if pvc.Status.Phase != "Pending" {
224 // If the PVC is not pending, we don't need to provision anything
225 return nil
226 }
227
228 storageClass, err := p.storageClassInformer.Lister().Get(*pvc.Spec.StorageClassName)
229 if err != nil {
230 return fmt.Errorf("")
231 }
232
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200233 if storageClass.Provisioner != csiProvisionerServerName {
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200234 // We're not responsible for this PVC. Can only happen if
235 // controller-manager makes a mistake setting the annotations, but
236 // we're bailing here anyways for safety.
Lorenz Brunb15abad2020-04-16 11:17:12 +0200237 return nil
238 }
239
240 err = p.provisionPVC(pvc, storageClass)
241
242 if err != nil {
243 p.recorder.Eventf(pvc, v1.EventTypeWarning, "ProvisioningFailed", "Failed to provision PV: %v", err)
244 return err
245 }
246 p.recorder.Eventf(pvc, v1.EventTypeNormal, "Provisioned", "Successfully provisioned PV")
247
248 return nil
249}
250
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200251// provisionPVC creates the directory where the volume lives, sets a quota for
252// the requested amount of storage and creates the PV object representing this
253// new volume
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200254func (p *csiProvisionerServer) provisionPVC(pvc *v1.PersistentVolumeClaim, storageClass *storagev1.StorageClass) error {
Lorenz Brunb15abad2020-04-16 11:17:12 +0200255 claimRef, err := ref.GetReference(scheme.Scheme, pvc)
256 if err != nil {
257 return fmt.Errorf("failed to get reference to PVC: %w", err)
258 }
259
260 storageReq := pvc.Spec.Resources.Requests[v1.ResourceStorage]
261 if storageReq.IsZero() {
262 return fmt.Errorf("PVC is not requesting any storage, this is not supported")
263 }
264 capacity, ok := storageReq.AsInt64()
265 if !ok {
266 return fmt.Errorf("PVC requesting more than 2^63 bytes of storage, this is not supported")
267 }
268
Lorenz Brunb15abad2020-04-16 11:17:12 +0200269 volumeID := "pvc-" + string(pvc.ObjectMeta.UID)
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200270 volumePath := p.volumePath(volumeID)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200271
Serge Bazanskic7359672020-10-30 16:38:57 +0100272 p.logger.Infof("Creating local PV %s", volumeID)
Lorenz Brun37050122021-03-30 14:00:27 +0200273
274 switch *pvc.Spec.VolumeMode {
275 case "", v1.PersistentVolumeFilesystem:
276 if err := os.Mkdir(volumePath, 0644); err != nil && !os.IsExist(err) {
277 return fmt.Errorf("failed to create volume directory: %w", err)
278 }
279 files, err := ioutil.ReadDir(volumePath)
280 if err != nil {
281 return fmt.Errorf("failed to list files in newly-created volume: %w", err)
282 }
283 if len(files) > 0 {
284 return errors.New("newly-created volume already contains data, bailing")
285 }
286 if err := fsquota.SetQuota(volumePath, uint64(capacity), 100000); err != nil {
287 return fmt.Errorf("failed to update quota: %v", err)
288 }
289 case v1.PersistentVolumeBlock:
290 imageFile, err := os.OpenFile(volumePath, os.O_CREATE|os.O_RDWR, 0644)
291 if err != nil {
292 return fmt.Errorf("failed to create volume image: %w", err)
293 }
294 defer imageFile.Close()
295 if err := unix.Fallocate(int(imageFile.Fd()), 0, 0, capacity); err != nil {
296 return fmt.Errorf("failed to fallocate() volume image: %w", err)
297 }
298 default:
299 return fmt.Errorf("VolumeMode \"%s\" is unsupported", *pvc.Spec.VolumeMode)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200300 }
301
302 vol := &v1.PersistentVolume{
303 ObjectMeta: metav1.ObjectMeta{
304 Name: volumeID,
305 Annotations: map[string]string{
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200306 "pv.kubernetes.io/provisioned-by": csiProvisionerServerName},
Lorenz Brunb15abad2020-04-16 11:17:12 +0200307 },
308 Spec: v1.PersistentVolumeSpec{
309 AccessModes: []v1.PersistentVolumeAccessMode{v1.ReadWriteOnce},
310 Capacity: v1.ResourceList{
311 v1.ResourceStorage: storageReq, // We're always giving the exact amount
312 },
313 PersistentVolumeSource: v1.PersistentVolumeSource{
314 CSI: &v1.CSIPersistentVolumeSource{
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200315 Driver: csiProvisionerServerName,
Lorenz Brunb15abad2020-04-16 11:17:12 +0200316 VolumeHandle: volumeID,
317 },
318 },
Lorenz Brun37050122021-03-30 14:00:27 +0200319 ClaimRef: claimRef,
320 VolumeMode: pvc.Spec.VolumeMode,
Lorenz Brunb15abad2020-04-16 11:17:12 +0200321 NodeAffinity: &v1.VolumeNodeAffinity{
322 Required: &v1.NodeSelector{
323 NodeSelectorTerms: []v1.NodeSelectorTerm{
324 {
325 MatchExpressions: []v1.NodeSelectorRequirement{
326 {
327 Key: "kubernetes.io/hostname",
328 Operator: v1.NodeSelectorOpIn,
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200329 Values: []string{p.NodeName},
Lorenz Brunb15abad2020-04-16 11:17:12 +0200330 },
331 },
332 },
333 },
334 },
335 },
336 StorageClassName: *pvc.Spec.StorageClassName,
337 PersistentVolumeReclaimPolicy: *storageClass.ReclaimPolicy,
338 },
339 }
340
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200341 _, err = p.Kubernetes.CoreV1().PersistentVolumes().Create(context.Background(), vol, metav1.CreateOptions{})
342 if err != nil && !apierrs.IsAlreadyExists(err) {
Lorenz Brunb15abad2020-04-16 11:17:12 +0200343 return fmt.Errorf("failed to create PV object: %w", err)
344 }
345 return nil
346}
347
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200348// processPV looks at a single PV item from the queue and checks if it has been
349// released and needs to be deleted. If yes it deletes the associated quota,
350// directory and the PV object and logs the result to the recorder.
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200351func (p *csiProvisionerServer) processPV(key string) error {
Lorenz Brunb15abad2020-04-16 11:17:12 +0200352 _, name, err := cache.SplitMetaNamespaceKey(key)
353 if err != nil {
354 return fmt.Errorf("invalid resource key: %s", key)
355 }
356 pv, err := p.pvInformer.Lister().Get(name)
357 if apierrs.IsNotFound(err) {
358 return nil // nothing to do, no error
359 } else if err != nil {
360 return fmt.Errorf("failed to get PV for processing: %w", err)
361 }
362
363 if !p.isOurPV(pv) {
364 return nil
365 }
366 if pv.Spec.PersistentVolumeReclaimPolicy != v1.PersistentVolumeReclaimDelete || pv.Status.Phase != "Released" {
367 return nil
368 }
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200369 volumePath := p.volumePath(pv.Spec.CSI.VolumeHandle)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200370
371 // Log deletes for auditing purposes
Serge Bazanskic7359672020-10-30 16:38:57 +0100372 p.logger.Infof("Deleting persistent volume %s", pv.Spec.CSI.VolumeHandle)
Lorenz Brun37050122021-03-30 14:00:27 +0200373 switch *pv.Spec.VolumeMode {
374 case "", v1.PersistentVolumeFilesystem:
375 if err := fsquota.SetQuota(volumePath, 0, 0); err != nil {
Serge Bazanski216fe7b2021-05-21 18:36:16 +0200376 // We record these here manually since a successful deletion
377 // removes the PV we'd be attaching them to.
Lorenz Brun37050122021-03-30 14:00:27 +0200378 p.recorder.Eventf(pv, v1.EventTypeWarning, "DeprovisioningFailed", "Failed to remove quota: %v", err)
379 return fmt.Errorf("failed to remove quota: %w", err)
380 }
381 if err := os.RemoveAll(volumePath); err != nil && !os.IsNotExist(err) {
382 p.recorder.Eventf(pv, v1.EventTypeWarning, "DeprovisioningFailed", "Failed to delete volume: %v", err)
383 return fmt.Errorf("failed to delete volume: %w", err)
384 }
385 case v1.PersistentVolumeBlock:
386 if err := os.Remove(volumePath); err != nil && !os.IsNotExist(err) {
387 p.recorder.Eventf(pv, v1.EventTypeWarning, "DeprovisioningFailed", "Failed to delete volume: %v", err)
388 return fmt.Errorf("failed to delete volume: %w", err)
389 }
390 default:
391 p.recorder.Eventf(pv, v1.EventTypeWarning, "DeprovisioningFailed", "Invalid volume mode \"%v\"", *pv.Spec.VolumeMode)
392 return fmt.Errorf("invalid volume mode \"%v\"", *pv.Spec.VolumeMode)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200393 }
394
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200395 err = p.Kubernetes.CoreV1().PersistentVolumes().Delete(context.Background(), pv.Name, metav1.DeleteOptions{})
Lorenz Brunb15abad2020-04-16 11:17:12 +0200396 if err != nil && !apierrs.IsNotFound(err) {
397 p.recorder.Eventf(pv, v1.EventTypeWarning, "DeprovisioningFailed", "Failed to delete PV object from K8s API: %v", err)
398 return fmt.Errorf("failed to delete PV object: %w", err)
399 }
400 return nil
401}