// Copyright 2020 The Monogon Project Authors.
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package kubernetes
import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"golang.org/x/sys/unix"
v1 "k8s.io/api/core/v1"
storagev1 "k8s.io/api/storage/v1"
apierrs "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/informers"
coreinformers "k8s.io/client-go/informers/core/v1"
storageinformers "k8s.io/client-go/informers/storage/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/record"
ref "k8s.io/client-go/tools/reference"
"k8s.io/client-go/util/workqueue"
"source.monogon.dev/metropolis/node/core/localstorage"
"source.monogon.dev/metropolis/pkg/fsquota"
"source.monogon.dev/metropolis/pkg/logtree"
"source.monogon.dev/metropolis/pkg/supervisor"
)
// ONCHANGE(//metropolis/node/kubernetes/reconciler:resources_csi.go): needs to
// match the csiProvisionerServerName declared there.
const csiProvisionerServerName = "dev.monogon.metropolis.vfs"
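// For illustration only (the authoritative StorageClass is managed by the
// reconciler referenced in the ONCHANGE note above), a StorageClass served by
// this provisioner would look roughly like:
//
//   apiVersion: storage.k8s.io/v1
//   kind: StorageClass
//   metadata:
//     name: local           # name illustrative
//   provisioner: dev.monogon.metropolis.vfs
//   reclaimPolicy: Delete
//   volumeBindingMode: WaitForFirstConsumer
//
// WaitForFirstConsumer is an assumption here; it is what makes the scheduler
// set the selected-node annotation checked in isOurPVC below.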
// csiProvisionerServer is responsible for the provisioning and deprovisioning
// of CSI-based container volumes. It runs on all nodes, watches PVCs assigned
// to the node it's running on, and fulfills provisioning requests by creating
// a directory, applying a quota and creating the corresponding PV. When the PV
// is released and its reclaim policy is Delete, the directory and the PV
// resource are deleted.
type csiProvisionerServer struct {
NodeName string
Kubernetes kubernetes.Interface
InformerFactory informers.SharedInformerFactory
VolumesDirectory *localstorage.DataVolumesDirectory
claimQueue workqueue.RateLimitingInterface
pvQueue workqueue.RateLimitingInterface
recorder record.EventRecorder
pvcInformer coreinformers.PersistentVolumeClaimInformer
pvInformer coreinformers.PersistentVolumeInformer
storageClassInformer storageinformers.StorageClassInformer
logger logtree.LeveledLogger
}
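// A csiProvisionerServer is meant to be run as a supervisor runnable. A
// minimal sketch of the wiring (field values and the informer resync period
// are illustrative, not the exact setup used by the Kubernetes service):
//
//   p := &csiProvisionerServer{
//       NodeName:         nodeName,
//       Kubernetes:       clientSet,
//       InformerFactory:  informers.NewSharedInformerFactory(clientSet, 5*time.Minute),
//       VolumesDirectory: volumesDir,
//   }
//   err := supervisor.Run(ctx, "csi-provisioner", p.Run)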
// Run runs the main provisioning machinery. It consists of a set of informers
// which keep track of events happening on the Kubernetes control plane. If
// anything happens to a PVC or PV, the identifier of that resource is enqueued
// on a work queue. Each queue is worked on by a single worker to limit load
// and avoid complicated locking infrastructure. Failed items are requeued.
func (p *csiProvisionerServer) Run(ctx context.Context) error {
// The recorder is used to log Kubernetes events for successful or failed
// volume provisions. These events then show up in `kubectl describe pvc`
// and can be used by admins to debug issues with this provisioner.
eventBroadcaster := record.NewBroadcaster()
eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: p.Kubernetes.CoreV1().Events("")})
p.recorder = eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: csiProvisionerServerName, Host: p.NodeName})
p.pvInformer = p.InformerFactory.Core().V1().PersistentVolumes()
p.pvcInformer = p.InformerFactory.Core().V1().PersistentVolumeClaims()
p.storageClassInformer = p.InformerFactory.Storage().V1().StorageClasses()
p.claimQueue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
p.pvQueue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
p.pvcInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: p.enqueueClaim,
UpdateFunc: func(old, new interface{}) {
p.enqueueClaim(new)
},
})
p.pvInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: p.enqueuePV,
UpdateFunc: func(old, new interface{}) {
p.enqueuePV(new)
},
})
p.logger = supervisor.Logger(ctx)
go p.pvcInformer.Informer().Run(ctx.Done())
go p.pvInformer.Informer().Run(ctx.Done())
go p.storageClassInformer.Informer().Run(ctx.Done())
// These will self-terminate once the queues are shut down
go p.processQueueItems(p.claimQueue, func(key string) error {
return p.processPVC(key)
})
go p.processQueueItems(p.pvQueue, func(key string) error {
return p.processPV(key)
})
supervisor.Signal(ctx, supervisor.SignalHealthy)
<-ctx.Done()
p.claimQueue.ShutDown()
p.pvQueue.ShutDown()
return nil
}
// isOurPVC checks if the given PVC is to be provisioned by this provisioner
// and has been scheduled onto this node
func (p *csiProvisionerServer) isOurPVC(pvc *v1.PersistentVolumeClaim) bool {
if pvc.ObjectMeta.Annotations["volume.beta.kubernetes.io/storage-provisioner"] != csiProvisionerServerName {
return false
}
if pvc.ObjectMeta.Annotations["volume.kubernetes.io/selected-node"] != p.NodeName {
return false
}
return true
}
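// For reference, a PVC that passes isOurPVC carries annotations of roughly
// this shape (values illustrative), set by kube-controller-manager and by the
// scheduler's volume binding step respectively:
//
//   metadata:
//     annotations:
//       volume.beta.kubernetes.io/storage-provisioner: dev.monogon.metropolis.vfs
//       volume.kubernetes.io/selected-node: metropolis-node-xyz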
// isOurPV checks if the given PV has been provisioned by this provisioner and
// has been scheduled onto this node
func (p *csiProvisionerServer) isOurPV(pv *v1.PersistentVolume) bool {
if pv.ObjectMeta.Annotations["pv.kubernetes.io/provisioned-by"] != csiProvisionerServerName {
return false
}
if pv.Spec.NodeAffinity.Required.NodeSelectorTerms[0].MatchExpressions[0].Values[0] != p.NodeName {
return false
}
return true
}
// enqueueClaim adds an added/changed PVC to the work queue
func (p *csiProvisionerServer) enqueueClaim(obj interface{}) {
key, err := cache.MetaNamespaceKeyFunc(obj)
if err != nil {
p.logger.Errorf("Not queuing PVC because key could not be derived: %v", err)
return
}
p.claimQueue.Add(key)
}
// enqueuePV adds an added/changed PV to the work queue
func (p *csiProvisionerServer) enqueuePV(obj interface{}) {
key, err := cache.MetaNamespaceKeyFunc(obj)
if err != nil {
p.logger.Errorf("Not queuing PV because key could not be derived: %v", err)
return
}
p.pvQueue.Add(key)
}
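// Keys produced by cache.MetaNamespaceKeyFunc are of the form
// "<namespace>/<name>" for namespaced objects such as PVCs and just "<name>"
// for cluster-scoped objects such as PVs; processPVC and processPV split them
// back apart with cache.SplitMetaNamespaceKey.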
// processQueueItems gets items from the given work queue and calls the process
// function for each of them. It self-terminates once the queue is shut down.
func (p *csiProvisionerServer) processQueueItems(queue workqueue.RateLimitingInterface, process func(key string) error) {
for {
obj, shutdown := queue.Get()
if shutdown {
return
}
func(obj interface{}) {
defer queue.Done(obj)
key, ok := obj.(string)
if !ok {
queue.Forget(obj)
p.logger.Errorf("Expected string in workqueue, got %+v", obj)
return
}
if err := process(key); err != nil {
p.logger.Warningf("Failed processing item %q, requeueing (numrequeues: %d): %v", key, queue.NumRequeues(obj), err)
queue.AddRateLimited(obj)
return
}
queue.Forget(obj)
}(obj)
}
}
// volumePath gets the path where the volume is stored.
func (p *csiProvisionerServer) volumePath(volumeID string) string {
return filepath.Join(p.VolumesDirectory.FullPath(), volumeID)
}
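// For example, a volume with ID "pvc-<uid>" lives at
// <VolumesDirectory>/pvc-<uid>; the exact root path is determined by
// localstorage and not spelled out here.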
// processPVC looks at a single PVC item from the queue, determines if it needs
// to be provisioned and logs the provisioning result to the recorder
func (p *csiProvisionerServer) processPVC(key string) error {
namespace, name, err := cache.SplitMetaNamespaceKey(key)
if err != nil {
return fmt.Errorf("invalid resource key: %s", key)
}
pvc, err := p.pvcInformer.Lister().PersistentVolumeClaims(namespace).Get(name)
if apierrs.IsNotFound(err) {
return nil // nothing to do, no error
} else if err != nil {
return fmt.Errorf("failed to get PVC for processing: %w", err)
}
if !p.isOurPVC(pvc) {
return nil
}
if pvc.Status.Phase != "Pending" {
// If the PVC is not pending, we don't need to provision anything
return nil
}
storageClass, err := p.storageClassInformer.Lister().Get(*pvc.Spec.StorageClassName)
if err != nil {
return fmt.Errorf("")
}
if storageClass.Provisioner != csiProvisionerServerName {
// We're not responsible for this PVC. Can only happen if
// controller-manager makes a mistake setting the annotations, but
// we're bailing here anyway for safety.
return nil
}
err = p.provisionPVC(pvc, storageClass)
if err != nil {
p.recorder.Eventf(pvc, v1.EventTypeWarning, "ProvisioningFailed", "Failed to provision PV: %v", err)
return err
}
p.recorder.Eventf(pvc, v1.EventTypeNormal, "Provisioned", "Successfully provisioned PV")
return nil
}
// provisionPVC creates the directory where the volume lives, sets a quota for
// the requested amount of storage and creates the PV object representing this
// new volume
func (p *csiProvisionerServer) provisionPVC(pvc *v1.PersistentVolumeClaim, storageClass *storagev1.StorageClass) error {
claimRef, err := ref.GetReference(scheme.Scheme, pvc)
if err != nil {
return fmt.Errorf("failed to get reference to PVC: %w", err)
}
storageReq := pvc.Spec.Resources.Requests[v1.ResourceStorage]
if storageReq.IsZero() {
return fmt.Errorf("PVC is not requesting any storage, this is not supported")
}
capacity, ok := storageReq.AsInt64()
if !ok {
return fmt.Errorf("PVC requesting more than 2^63 bytes of storage, this is not supported")
}
volumeID := "pvc-" + string(pvc.ObjectMeta.UID)
volumePath := p.volumePath(volumeID)
p.logger.Infof("Creating local PV %s", volumeID)
switch *pvc.Spec.VolumeMode {
case "", v1.PersistentVolumeFilesystem:
if err := os.Mkdir(volumePath, 0644); err != nil && !os.IsExist(err) {
return fmt.Errorf("failed to create volume directory: %w", err)
}
files, err := os.ReadDir(volumePath)
if err != nil {
return fmt.Errorf("failed to list files in newly-created volume: %w", err)
}
if len(files) > 0 {
return errors.New("newly-created volume already contains data, bailing")
}
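// The inode limit of 100000 below is a fixed cap (assumption: it exists to
// bound runaway file creation, since PVC requests only express bytes).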
if err := fsquota.SetQuota(volumePath, uint64(capacity), 100000); err != nil {
return fmt.Errorf("failed to update quota: %v", err)
}
case v1.PersistentVolumeBlock:
imageFile, err := os.OpenFile(volumePath, os.O_CREATE|os.O_RDWR, 0644)
if err != nil {
return fmt.Errorf("failed to create volume image: %w", err)
}
defer imageFile.Close()
if err := unix.Fallocate(int(imageFile.Fd()), 0, 0, capacity); err != nil {
return fmt.Errorf("failed to fallocate() volume image: %w", err)
}
default:
return fmt.Errorf("VolumeMode \"%s\" is unsupported", *pvc.Spec.VolumeMode)
}
vol := &v1.PersistentVolume{
ObjectMeta: metav1.ObjectMeta{
Name: volumeID,
Annotations: map[string]string{
"pv.kubernetes.io/provisioned-by": csiProvisionerServerName},
},
Spec: v1.PersistentVolumeSpec{
AccessModes: []v1.PersistentVolumeAccessMode{v1.ReadWriteOnce},
Capacity: v1.ResourceList{
v1.ResourceStorage: storageReq, // We're always giving the exact amount
},
PersistentVolumeSource: v1.PersistentVolumeSource{
CSI: &v1.CSIPersistentVolumeSource{
Driver: csiProvisionerServerName,
VolumeHandle: volumeID,
},
},
ClaimRef: claimRef,
VolumeMode: pvc.Spec.VolumeMode,
NodeAffinity: &v1.VolumeNodeAffinity{
Required: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: "kubernetes.io/hostname",
Operator: v1.NodeSelectorOpIn,
Values: []string{p.NodeName},
},
},
},
},
},
},
StorageClassName: *pvc.Spec.StorageClassName,
PersistentVolumeReclaimPolicy: *storageClass.ReclaimPolicy,
},
}
_, err = p.Kubernetes.CoreV1().PersistentVolumes().Create(context.Background(), vol, metav1.CreateOptions{})
if err != nil && !apierrs.IsAlreadyExists(err) {
return fmt.Errorf("failed to create PV object: %w", err)
}
return nil
}
// processPV looks at a single PV item from the queue and checks if it has been
// released and needs to be deleted. If so, it deletes the associated quota,
// directory and the PV object and logs the result to the recorder.
func (p *csiProvisionerServer) processPV(key string) error {
_, name, err := cache.SplitMetaNamespaceKey(key)
if err != nil {
return fmt.Errorf("invalid resource key: %s", key)
}
pv, err := p.pvInformer.Lister().Get(name)
if apierrs.IsNotFound(err) {
return nil // nothing to do, no error
} else if err != nil {
return fmt.Errorf("failed to get PV for processing: %w", err)
}
if !p.isOurPV(pv) {
return nil
}
if pv.Spec.PersistentVolumeReclaimPolicy != v1.PersistentVolumeReclaimDelete || pv.Status.Phase != "Released" {
return nil
}
volumePath := p.volumePath(pv.Spec.CSI.VolumeHandle)
// Log deletes for auditing purposes
p.logger.Infof("Deleting persistent volume %s", pv.Spec.CSI.VolumeHandle)
switch *pv.Spec.VolumeMode {
case "", v1.PersistentVolumeFilesystem:
if err := fsquota.SetQuota(volumePath, 0, 0); err != nil {
// We record these here manually since a successful deletion
// removes the PV we'd be attaching them to.
p.recorder.Eventf(pv, v1.EventTypeWarning, "DeprovisioningFailed", "Failed to remove quota: %v", err)
return fmt.Errorf("failed to remove quota: %w", err)
}
if err := os.RemoveAll(volumePath); err != nil && !os.IsNotExist(err) {
p.recorder.Eventf(pv, v1.EventTypeWarning, "DeprovisioningFailed", "Failed to delete volume: %v", err)
return fmt.Errorf("failed to delete volume: %w", err)
}
case v1.PersistentVolumeBlock:
if err := os.Remove(volumePath); err != nil && !os.IsNotExist(err) {
p.recorder.Eventf(pv, v1.EventTypeWarning, "DeprovisioningFailed", "Failed to delete volume: %v", err)
return fmt.Errorf("failed to delete volume: %w", err)
}
default:
p.recorder.Eventf(pv, v1.EventTypeWarning, "DeprovisioningFailed", "Invalid volume mode \"%v\"", *pv.Spec.VolumeMode)
return fmt.Errorf("invalid volume mode \"%v\"", *pv.Spec.VolumeMode)
}
err = p.Kubernetes.CoreV1().PersistentVolumes().Delete(context.Background(), pv.Name, metav1.DeleteOptions{})
if err != nil && !apierrs.IsNotFound(err) {
p.recorder.Eventf(pv, v1.EventTypeWarning, "DeprovisioningFailed", "Failed to delete PV object from K8s API: %v", err)
return fmt.Errorf("failed to delete PV object: %w", err)
}
return nil
}