// Copyright 2020 The Monogon Project Authors.
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
16
17package kubernetes
18
import (
	"context"
	"errors"
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"

	"golang.org/x/sys/unix"
	v1 "k8s.io/api/core/v1"
	storagev1 "k8s.io/api/storage/v1"
	apierrs "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/informers"
	coreinformers "k8s.io/client-go/informers/core/v1"
	storageinformers "k8s.io/client-go/informers/storage/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/scheme"
	typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/record"
	ref "k8s.io/client-go/tools/reference"
	"k8s.io/client-go/util/workqueue"

	"source.monogon.dev/metropolis/node/core/localstorage"
	"source.monogon.dev/metropolis/pkg/fsquota"
	"source.monogon.dev/metropolis/pkg/logtree"
	"source.monogon.dev/metropolis/pkg/supervisor"
)
48
// csiProvisionerServerName is the identity of this provisioner. Within this file it is used as the event source
// component, compared against the provisioner annotations of PVCs/PVs and the provisioner of StorageClasses, and set
// as the CSI driver name of every PV created here.
//
// ONCHANGE(//metropolis/node/kubernetes/reconciler:resources_csi.go): needs to match csiProvisionerServerName declared.
const csiProvisionerServerName = "dev.monogon.metropolis.vfs"
Lorenz Brunb15abad2020-04-16 11:17:12 +020051
Serge Bazanskic2c7ad92020-07-13 17:20:09 +020052// csiProvisionerServer is responsible for the provisioning and deprovisioning of CSI-based container volumes. It runs on all
Lorenz Brunb15abad2020-04-16 11:17:12 +020053// nodes and watches PVCs for ones assigned to the node it's running on and fulfills the provisioning request by
54// creating a directory, applying a quota and creating the corresponding PV. When the PV is released and its retention
55// policy is Delete, the directory and the PV resource are deleted.
Serge Bazanskic2c7ad92020-07-13 17:20:09 +020056type csiProvisionerServer struct {
57 NodeName string
58 Kubernetes kubernetes.Interface
59 InformerFactory informers.SharedInformerFactory
60 VolumesDirectory *localstorage.DataVolumesDirectory
61
Lorenz Brunb15abad2020-04-16 11:17:12 +020062 claimQueue workqueue.RateLimitingInterface
63 pvQueue workqueue.RateLimitingInterface
64 recorder record.EventRecorder
65 pvcInformer coreinformers.PersistentVolumeClaimInformer
66 pvInformer coreinformers.PersistentVolumeInformer
67 storageClassInformer storageinformers.StorageClassInformer
Serge Bazanskic7359672020-10-30 16:38:57 +010068 logger logtree.LeveledLogger
Lorenz Brunb15abad2020-04-16 11:17:12 +020069}
70
71// runCSIProvisioner runs the main provisioning machinery. It consists of a bunch of informers which keep track of
72// the events happening on the Kubernetes control plane and informs us when something happens. If anything happens to
73// PVCs or PVs, we enqueue the identifier of that resource in a work queue. Queues are being worked on by only one
74// worker to limit load and avoid complicated locking infrastructure. Failed items are requeued.
Serge Bazanskic2c7ad92020-07-13 17:20:09 +020075func (p *csiProvisionerServer) Run(ctx context.Context) error {
76 // The recorder is used to log Kubernetes events for successful or failed volume provisions. These events then
77 // show up in `kubectl describe pvc` and can be used by admins to debug issues with this provisioner.
78 eventBroadcaster := record.NewBroadcaster()
79 eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: p.Kubernetes.CoreV1().Events("")})
80 p.recorder = eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: csiProvisionerServerName, Host: p.NodeName})
Lorenz Brunb15abad2020-04-16 11:17:12 +020081
Serge Bazanskic2c7ad92020-07-13 17:20:09 +020082 p.pvInformer = p.InformerFactory.Core().V1().PersistentVolumes()
83 p.pvcInformer = p.InformerFactory.Core().V1().PersistentVolumeClaims()
84 p.storageClassInformer = p.InformerFactory.Storage().V1().StorageClasses()
Lorenz Brunb15abad2020-04-16 11:17:12 +020085
Serge Bazanskic2c7ad92020-07-13 17:20:09 +020086 p.claimQueue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
87 p.pvQueue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
Lorenz Brunb15abad2020-04-16 11:17:12 +020088
Serge Bazanskic2c7ad92020-07-13 17:20:09 +020089 p.pvcInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
90 AddFunc: p.enqueueClaim,
91 UpdateFunc: func(old, new interface{}) {
92 p.enqueueClaim(new)
93 },
94 })
95 p.pvInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
96 AddFunc: p.enqueuePV,
97 UpdateFunc: func(old, new interface{}) {
98 p.enqueuePV(new)
99 },
100 })
101 p.logger = supervisor.Logger(ctx)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200102
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200103 go p.pvcInformer.Informer().Run(ctx.Done())
104 go p.pvInformer.Informer().Run(ctx.Done())
105 go p.storageClassInformer.Informer().Run(ctx.Done())
Lorenz Brunb15abad2020-04-16 11:17:12 +0200106
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200107 // These will self-terminate once the queues are shut down
108 go p.processQueueItems(p.claimQueue, func(key string) error {
109 return p.processPVC(key)
110 })
111 go p.processQueueItems(p.pvQueue, func(key string) error {
112 return p.processPV(key)
113 })
Lorenz Brunb15abad2020-04-16 11:17:12 +0200114
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200115 supervisor.Signal(ctx, supervisor.SignalHealthy)
116 <-ctx.Done()
117 p.claimQueue.ShutDown()
118 p.pvQueue.ShutDown()
119 return nil
Lorenz Brunb15abad2020-04-16 11:17:12 +0200120}
121
122// isOurPVC checks if the given PVC is is to be provisioned by this provisioner and has been scheduled onto this node
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200123func (p *csiProvisionerServer) isOurPVC(pvc *v1.PersistentVolumeClaim) bool {
124 if pvc.ObjectMeta.Annotations["volume.beta.kubernetes.io/storage-provisioner"] != csiProvisionerServerName {
125 return false
126 }
127 if pvc.ObjectMeta.Annotations["volume.kubernetes.io/selected-node"] != p.NodeName {
128 return false
129 }
130 return true
Lorenz Brunb15abad2020-04-16 11:17:12 +0200131}
132
133// isOurPV checks if the given PV has been provisioned by this provisioner and has been scheduled onto this node
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200134func (p *csiProvisionerServer) isOurPV(pv *v1.PersistentVolume) bool {
135 if pv.ObjectMeta.Annotations["pv.kubernetes.io/provisioned-by"] != csiProvisionerServerName {
136 return false
137 }
138 if pv.Spec.NodeAffinity.Required.NodeSelectorTerms[0].MatchExpressions[0].Values[0] != p.NodeName {
139 return false
140 }
141 return true
Lorenz Brunb15abad2020-04-16 11:17:12 +0200142}
143
144// enqueueClaim adds an added/changed PVC to the work queue
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200145func (p *csiProvisionerServer) enqueueClaim(obj interface{}) {
Lorenz Brunb15abad2020-04-16 11:17:12 +0200146 key, err := cache.MetaNamespaceKeyFunc(obj)
147 if err != nil {
Serge Bazanskic7359672020-10-30 16:38:57 +0100148 p.logger.Errorf("Not queuing PVC because key could not be derived: %v", err)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200149 return
150 }
151 p.claimQueue.Add(key)
152}
153
154// enqueuePV adds an added/changed PV to the work queue
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200155func (p *csiProvisionerServer) enqueuePV(obj interface{}) {
Lorenz Brunb15abad2020-04-16 11:17:12 +0200156 key, err := cache.MetaNamespaceKeyFunc(obj)
157 if err != nil {
Serge Bazanskic7359672020-10-30 16:38:57 +0100158 p.logger.Errorf("Not queuing PV because key could not be derived: %v", err)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200159 return
160 }
161 p.pvQueue.Add(key)
162}
163
164// processQueueItems gets items from the given work queue and calls the process function for each of them. It self-
165// terminates once the queue is shut down.
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200166func (p *csiProvisionerServer) processQueueItems(queue workqueue.RateLimitingInterface, process func(key string) error) {
Lorenz Brunb15abad2020-04-16 11:17:12 +0200167 for {
168 obj, shutdown := queue.Get()
169 if shutdown {
170 return
171 }
172
173 func(obj interface{}) {
174 defer queue.Done(obj)
175 key, ok := obj.(string)
176 if !ok {
177 queue.Forget(obj)
Serge Bazanskic7359672020-10-30 16:38:57 +0100178 p.logger.Errorf("Expected string in workqueue, got %+v", obj)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200179 return
180 }
181
182 if err := process(key); err != nil {
Serge Bazanskic7359672020-10-30 16:38:57 +0100183 p.logger.Warningf("Failed processing item %q, requeueing (numrequeues: %d): %v", key, queue.NumRequeues(obj), err)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200184 queue.AddRateLimited(obj)
185 }
186
187 queue.Forget(obj)
188 }(obj)
189 }
190}
191
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200192// volumePath gets the path where the volume is stored.
193func (p *csiProvisionerServer) volumePath(volumeID string) string {
194 return filepath.Join(p.VolumesDirectory.FullPath(), volumeID)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200195}
196
197// processPVC looks at a single PVC item from the queue, determines if it needs to be provisioned and logs the
198// provisioning result to the recorder
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200199func (p *csiProvisionerServer) processPVC(key string) error {
Lorenz Brunb15abad2020-04-16 11:17:12 +0200200 namespace, name, err := cache.SplitMetaNamespaceKey(key)
201 if err != nil {
202 return fmt.Errorf("invalid resource key: %s", key)
203 }
204 pvc, err := p.pvcInformer.Lister().PersistentVolumeClaims(namespace).Get(name)
205 if apierrs.IsNotFound(err) {
206 return nil // nothing to do, no error
207 } else if err != nil {
208 return fmt.Errorf("failed to get PVC for processing: %w", err)
209 }
210
211 if !p.isOurPVC(pvc) {
212 return nil
213 }
214
215 if pvc.Status.Phase != "Pending" {
216 // If the PVC is not pending, we don't need to provision anything
217 return nil
218 }
219
220 storageClass, err := p.storageClassInformer.Lister().Get(*pvc.Spec.StorageClassName)
221 if err != nil {
222 return fmt.Errorf("")
223 }
224
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200225 if storageClass.Provisioner != csiProvisionerServerName {
Lorenz Brunb15abad2020-04-16 11:17:12 +0200226 // We're not responsible for this PVC. Can only happen if controller-manager makes a mistake
227 // setting the annotations, but we're bailing here anyways for safety.
228 return nil
229 }
230
231 err = p.provisionPVC(pvc, storageClass)
232
233 if err != nil {
234 p.recorder.Eventf(pvc, v1.EventTypeWarning, "ProvisioningFailed", "Failed to provision PV: %v", err)
235 return err
236 }
237 p.recorder.Eventf(pvc, v1.EventTypeNormal, "Provisioned", "Successfully provisioned PV")
238
239 return nil
240}
241
242// provisionPVC creates the directory where the volume lives, sets a quota for the requested amount of storage and
243// creates the PV object representing this new volume
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200244func (p *csiProvisionerServer) provisionPVC(pvc *v1.PersistentVolumeClaim, storageClass *storagev1.StorageClass) error {
Lorenz Brunb15abad2020-04-16 11:17:12 +0200245 claimRef, err := ref.GetReference(scheme.Scheme, pvc)
246 if err != nil {
247 return fmt.Errorf("failed to get reference to PVC: %w", err)
248 }
249
250 storageReq := pvc.Spec.Resources.Requests[v1.ResourceStorage]
251 if storageReq.IsZero() {
252 return fmt.Errorf("PVC is not requesting any storage, this is not supported")
253 }
254 capacity, ok := storageReq.AsInt64()
255 if !ok {
256 return fmt.Errorf("PVC requesting more than 2^63 bytes of storage, this is not supported")
257 }
258
Lorenz Brunb15abad2020-04-16 11:17:12 +0200259 volumeID := "pvc-" + string(pvc.ObjectMeta.UID)
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200260 volumePath := p.volumePath(volumeID)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200261
Serge Bazanskic7359672020-10-30 16:38:57 +0100262 p.logger.Infof("Creating local PV %s", volumeID)
Lorenz Brun37050122021-03-30 14:00:27 +0200263
264 switch *pvc.Spec.VolumeMode {
265 case "", v1.PersistentVolumeFilesystem:
266 if err := os.Mkdir(volumePath, 0644); err != nil && !os.IsExist(err) {
267 return fmt.Errorf("failed to create volume directory: %w", err)
268 }
269 files, err := ioutil.ReadDir(volumePath)
270 if err != nil {
271 return fmt.Errorf("failed to list files in newly-created volume: %w", err)
272 }
273 if len(files) > 0 {
274 return errors.New("newly-created volume already contains data, bailing")
275 }
276 if err := fsquota.SetQuota(volumePath, uint64(capacity), 100000); err != nil {
277 return fmt.Errorf("failed to update quota: %v", err)
278 }
279 case v1.PersistentVolumeBlock:
280 imageFile, err := os.OpenFile(volumePath, os.O_CREATE|os.O_RDWR, 0644)
281 if err != nil {
282 return fmt.Errorf("failed to create volume image: %w", err)
283 }
284 defer imageFile.Close()
285 if err := unix.Fallocate(int(imageFile.Fd()), 0, 0, capacity); err != nil {
286 return fmt.Errorf("failed to fallocate() volume image: %w", err)
287 }
288 default:
289 return fmt.Errorf("VolumeMode \"%s\" is unsupported", *pvc.Spec.VolumeMode)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200290 }
291
292 vol := &v1.PersistentVolume{
293 ObjectMeta: metav1.ObjectMeta{
294 Name: volumeID,
295 Annotations: map[string]string{
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200296 "pv.kubernetes.io/provisioned-by": csiProvisionerServerName},
Lorenz Brunb15abad2020-04-16 11:17:12 +0200297 },
298 Spec: v1.PersistentVolumeSpec{
299 AccessModes: []v1.PersistentVolumeAccessMode{v1.ReadWriteOnce},
300 Capacity: v1.ResourceList{
301 v1.ResourceStorage: storageReq, // We're always giving the exact amount
302 },
303 PersistentVolumeSource: v1.PersistentVolumeSource{
304 CSI: &v1.CSIPersistentVolumeSource{
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200305 Driver: csiProvisionerServerName,
Lorenz Brunb15abad2020-04-16 11:17:12 +0200306 VolumeHandle: volumeID,
307 },
308 },
Lorenz Brun37050122021-03-30 14:00:27 +0200309 ClaimRef: claimRef,
310 VolumeMode: pvc.Spec.VolumeMode,
Lorenz Brunb15abad2020-04-16 11:17:12 +0200311 NodeAffinity: &v1.VolumeNodeAffinity{
312 Required: &v1.NodeSelector{
313 NodeSelectorTerms: []v1.NodeSelectorTerm{
314 {
315 MatchExpressions: []v1.NodeSelectorRequirement{
316 {
317 Key: "kubernetes.io/hostname",
318 Operator: v1.NodeSelectorOpIn,
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200319 Values: []string{p.NodeName},
Lorenz Brunb15abad2020-04-16 11:17:12 +0200320 },
321 },
322 },
323 },
324 },
325 },
326 StorageClassName: *pvc.Spec.StorageClassName,
327 PersistentVolumeReclaimPolicy: *storageClass.ReclaimPolicy,
328 },
329 }
330
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200331 _, err = p.Kubernetes.CoreV1().PersistentVolumes().Create(context.Background(), vol, metav1.CreateOptions{})
332 if err != nil && !apierrs.IsAlreadyExists(err) {
Lorenz Brunb15abad2020-04-16 11:17:12 +0200333 return fmt.Errorf("failed to create PV object: %w", err)
334 }
335 return nil
336}
337
338// processPV looks at a single PV item from the queue and checks if it has been released and needs to be deleted. If yes
339// it deletes the associated quota, directory and the PV object and logs the result to the recorder.
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200340func (p *csiProvisionerServer) processPV(key string) error {
Lorenz Brunb15abad2020-04-16 11:17:12 +0200341 _, name, err := cache.SplitMetaNamespaceKey(key)
342 if err != nil {
343 return fmt.Errorf("invalid resource key: %s", key)
344 }
345 pv, err := p.pvInformer.Lister().Get(name)
346 if apierrs.IsNotFound(err) {
347 return nil // nothing to do, no error
348 } else if err != nil {
349 return fmt.Errorf("failed to get PV for processing: %w", err)
350 }
351
352 if !p.isOurPV(pv) {
353 return nil
354 }
355 if pv.Spec.PersistentVolumeReclaimPolicy != v1.PersistentVolumeReclaimDelete || pv.Status.Phase != "Released" {
356 return nil
357 }
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200358 volumePath := p.volumePath(pv.Spec.CSI.VolumeHandle)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200359
360 // Log deletes for auditing purposes
Serge Bazanskic7359672020-10-30 16:38:57 +0100361 p.logger.Infof("Deleting persistent volume %s", pv.Spec.CSI.VolumeHandle)
Lorenz Brun37050122021-03-30 14:00:27 +0200362 switch *pv.Spec.VolumeMode {
363 case "", v1.PersistentVolumeFilesystem:
364 if err := fsquota.SetQuota(volumePath, 0, 0); err != nil {
365 // We record these here manually since a successful deletion removes the PV we'd be attaching them to
366 p.recorder.Eventf(pv, v1.EventTypeWarning, "DeprovisioningFailed", "Failed to remove quota: %v", err)
367 return fmt.Errorf("failed to remove quota: %w", err)
368 }
369 if err := os.RemoveAll(volumePath); err != nil && !os.IsNotExist(err) {
370 p.recorder.Eventf(pv, v1.EventTypeWarning, "DeprovisioningFailed", "Failed to delete volume: %v", err)
371 return fmt.Errorf("failed to delete volume: %w", err)
372 }
373 case v1.PersistentVolumeBlock:
374 if err := os.Remove(volumePath); err != nil && !os.IsNotExist(err) {
375 p.recorder.Eventf(pv, v1.EventTypeWarning, "DeprovisioningFailed", "Failed to delete volume: %v", err)
376 return fmt.Errorf("failed to delete volume: %w", err)
377 }
378 default:
379 p.recorder.Eventf(pv, v1.EventTypeWarning, "DeprovisioningFailed", "Invalid volume mode \"%v\"", *pv.Spec.VolumeMode)
380 return fmt.Errorf("invalid volume mode \"%v\"", *pv.Spec.VolumeMode)
Lorenz Brunb15abad2020-04-16 11:17:12 +0200381 }
382
Serge Bazanskic2c7ad92020-07-13 17:20:09 +0200383 err = p.Kubernetes.CoreV1().PersistentVolumes().Delete(context.Background(), pv.Name, metav1.DeleteOptions{})
Lorenz Brunb15abad2020-04-16 11:17:12 +0200384 if err != nil && !apierrs.IsNotFound(err) {
385 p.recorder.Eventf(pv, v1.EventTypeWarning, "DeprovisioningFailed", "Failed to delete PV object from K8s API: %v", err)
386 return fmt.Errorf("failed to delete PV object: %w", err)
387 }
388 return nil
389}