// Copyright The Monogon Project Authors.
// SPDX-License-Identifier: Apache-2.0

package kubernetes

import (
"context"
"errors"
"fmt"
"net"
"os"
"path/filepath"
"regexp"
"github.com/container-storage-interface/spec/lib/go/csi"
"golang.org/x/sys/unix"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"google.golang.org/protobuf/types/known/wrapperspb"
pluginregistration "k8s.io/kubelet/pkg/apis/pluginregistration/v1"
"source.monogon.dev/go/logging"
"source.monogon.dev/metropolis/node/core/localstorage"
"source.monogon.dev/osbase/fsquota"
"source.monogon.dev/osbase/loop"
"source.monogon.dev/osbase/supervisor"
)

// Derived from the K8s spec for acceptable names, but shortened to 130
// characters to avoid issues with the maximum path length. We never
// provision longer names, so this only matters if a volume is created
// manually with a name longer than 130 characters.
var acceptableNames = regexp.MustCompile("^[a-z][a-z0-9-.]{0,128}[a-z0-9]$")
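
// csiPluginServer implements the CSI Identity and Node services for the
// Metropolis VFS CSI plugin. It serves volumes from the node's local volumes
// directory to the kubelet over a local plugin socket.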
type csiPluginServer struct {
	// Embedded by value so that the default implementations of CSI node
	// RPCs not overridden here (NodeStageVolume, NodeUnstageVolume) are
	// always safe to call.
	csi.UnimplementedNodeServer
KubeletDirectory *localstorage.DataKubernetesKubeletDirectory
VolumesDirectory *localstorage.DataVolumesDirectory
logger logging.Leveled
}
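
// Run cleans up and listens on the kubelet CSI plugin socket, serves the
// Identity and Node services on it, and starts the plugin registration
// server as a sub-runnable.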
func (s *csiPluginServer) Run(ctx context.Context) error {
s.logger = supervisor.Logger(ctx)
	// Try to remove the socket in case an unclean shutdown left it behind.
os.Remove(s.KubeletDirectory.Plugins.VFS.FullPath())
pluginListener, err := net.ListenUnix("unix", &net.UnixAddr{Name: s.KubeletDirectory.Plugins.VFS.FullPath(), Net: "unix"})
if err != nil {
return fmt.Errorf("failed to listen on CSI socket: %w", err)
}
pluginServer := grpc.NewServer()
csi.RegisterIdentityServer(pluginServer, s)
csi.RegisterNodeServer(pluginServer, s)
	// Enable graceful shutdown; there are no long-running RPCs here, and
	// most of them cannot (and should not) be cancelled anyway.
if err := supervisor.Run(ctx, "csi-node", supervisor.GRPCServer(pluginServer, pluginListener, true)); err != nil {
return err
}
r := pluginRegistrationServer{
regErr: make(chan error, 1),
KubeletDirectory: s.KubeletDirectory,
}
if err := supervisor.Run(ctx, "registration", r.Run); err != nil {
return err
}
supervisor.Signal(ctx, supervisor.SignalHealthy)
supervisor.Signal(ctx, supervisor.SignalDone)
return nil
}
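
// NodePublishVolume makes a volume available at the requested target path,
// either by bind-mounting its backing directory (mount volumes) or by
// attaching its backing file to a loop device and creating a device node at
// the target path (block volumes).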
func (s *csiPluginServer) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (*csi.NodePublishVolumeResponse, error) {
if !acceptableNames.MatchString(req.VolumeId) {
return nil, status.Error(codes.InvalidArgument, "invalid characters in volume id")
}
// TODO(q3k): move this logic to localstorage?
volumePath := filepath.Join(s.VolumesDirectory.FullPath(), req.VolumeId)
	switch req.VolumeCapability.AccessMode.Mode {
	case csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER,
		csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY:
		// Both single-node access modes are supported.
	default:
		return nil, status.Error(codes.InvalidArgument, "unsupported access mode")
	}
switch req.VolumeCapability.AccessType.(type) {
case *csi.VolumeCapability_Mount:
if err := os.MkdirAll(req.TargetPath, 0700); err != nil {
return nil, status.Errorf(codes.Internal, "unable to create requested target path: %v", err)
}
err := unix.Mount(volumePath, req.TargetPath, "", unix.MS_BIND, "")
switch {
case errors.Is(err, unix.ENOENT):
return nil, status.Error(codes.NotFound, "volume not found")
case err != nil:
return nil, status.Errorf(codes.Unavailable, "failed to bind-mount volume: %v", err)
}
var flags uintptr = unix.MS_REMOUNT | unix.MS_BIND | unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
if req.Readonly {
flags |= unix.MS_RDONLY
}
if err := unix.Mount("", req.TargetPath, "", flags, ""); err != nil {
_ = unix.Unmount(req.TargetPath, 0) // Best-effort
return nil, status.Errorf(codes.Internal, "unable to set mount-point flags: %v", err)
}
case *csi.VolumeCapability_Block:
f, err := os.OpenFile(volumePath, os.O_RDWR, 0)
if err != nil {
return nil, status.Errorf(codes.Unavailable, "failed to open block volume: %v", err)
}
defer f.Close()
var flags uint32 = loop.FlagDirectIO
if req.Readonly {
flags |= loop.FlagReadOnly
}
loopdev, err := loop.Create(f, loop.Config{Flags: flags})
if err != nil {
return nil, status.Errorf(codes.Unavailable, "failed to create loop device: %v", err)
}
loopdevNum, err := loopdev.Dev()
if err != nil {
loopdev.Remove()
return nil, status.Errorf(codes.Internal, "device number not available: %v", err)
}
if err := unix.Mknod(req.TargetPath, unix.S_IFBLK|0640, int(loopdevNum)); err != nil {
loopdev.Remove()
return nil, status.Errorf(codes.Unavailable, "failed to create device node at target path: %v", err)
}
loopdev.Close()
default:
return nil, status.Error(codes.InvalidArgument, "unsupported access type")
}
return &csi.NodePublishVolumeResponse{}, nil
}
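
// NodeUnpublishVolume undoes NodePublishVolume: for block volumes it
// detaches the loop device and removes the device node, for mount volumes
// it unmounts the bind mount.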
func (s *csiPluginServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpublishVolumeRequest) (*csi.NodeUnpublishVolumeResponse, error) {
loopdev, err := loop.Open(req.TargetPath)
if err == nil {
defer loopdev.Close()
// We have a block device
if err := loopdev.Remove(); err != nil {
return nil, status.Errorf(codes.Unavailable, "failed to remove loop device: %v", err)
}
if err := os.Remove(req.TargetPath); err != nil && !os.IsNotExist(err) {
return nil, status.Errorf(codes.Unavailable, "failed to remove device inode: %v", err)
}
return &csi.NodeUnpublishVolumeResponse{}, nil
}
// Otherwise try a normal unmount
if err := unix.Unmount(req.TargetPath, 0); err != nil {
return nil, status.Errorf(codes.Unavailable, "failed to unmount volume: %v", err)
}
return &csi.NodeUnpublishVolumeResponse{}, nil
}
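
// NodeGetVolumeStats reports capacity and usage of a published volume based
// on its filesystem quota, in both bytes and inodes.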
func (*csiPluginServer) NodeGetVolumeStats(ctx context.Context, req *csi.NodeGetVolumeStatsRequest) (*csi.NodeGetVolumeStatsResponse, error) {
quota, err := fsquota.GetQuota(req.VolumePath)
if os.IsNotExist(err) {
return nil, status.Error(codes.NotFound, "volume does not exist at this path")
} else if err != nil {
return nil, status.Errorf(codes.Unavailable, "failed to get quota: %v", err)
}
return &csi.NodeGetVolumeStatsResponse{
Usage: []*csi.VolumeUsage{
{
Total: int64(quota.Bytes),
Unit: csi.VolumeUsage_BYTES,
Used: int64(quota.BytesUsed),
Available: int64(quota.Bytes - quota.BytesUsed),
},
{
Total: int64(quota.Inodes),
Unit: csi.VolumeUsage_INODES,
Used: int64(quota.InodesUsed),
Available: int64(quota.Inodes - quota.InodesUsed),
},
},
}, nil
}
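
// NodeExpandVolume performs the node-side part of a volume expansion. Mount
// volumes need no work here; block volumes need their loop device refreshed
// to pick up the new size of the backing file.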
func (s *csiPluginServer) NodeExpandVolume(ctx context.Context, req *csi.NodeExpandVolumeRequest) (*csi.NodeExpandVolumeResponse, error) {
info, err := os.Stat(req.VolumePath)
if err != nil {
return nil, status.Errorf(codes.Unavailable, "failed to stat volume: %v", err)
}
	if !info.IsDir() {
		// Block volume: refresh the loop device so that it picks up the new
		// size of the backing file. Mount volumes need no work here.
		loopdev, err := loop.Open(req.VolumePath)
		if err != nil {
			return nil, status.Errorf(codes.Unavailable, "failed to open loop device: %v", err)
		}
		defer loopdev.Close()
		if err := loopdev.RefreshSize(); err != nil {
			return nil, status.Errorf(codes.Unavailable, "failed to refresh loop device size: %v", err)
		}
	}
	// Kubelet passes the requested new size as RequiredBytes; LimitBytes is
	// typically left unset for expansion requests.
	return &csi.NodeExpandVolumeResponse{CapacityBytes: req.CapacityRange.RequiredBytes}, nil
}
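
// rpcCapability wraps an RPC capability type into the NodeServiceCapability
// message returned by NodeGetCapabilities.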
func rpcCapability(cap csi.NodeServiceCapability_RPC_Type) *csi.NodeServiceCapability {
return &csi.NodeServiceCapability{
Type: &csi.NodeServiceCapability_Rpc{
Rpc: &csi.NodeServiceCapability_RPC{Type: cap},
},
}
}
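
// NodeGetCapabilities advertises support for volume expansion and volume
// statistics.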
func (*csiPluginServer) NodeGetCapabilities(ctx context.Context, req *csi.NodeGetCapabilitiesRequest) (*csi.NodeGetCapabilitiesResponse, error) {
return &csi.NodeGetCapabilitiesResponse{
Capabilities: []*csi.NodeServiceCapability{
rpcCapability(csi.NodeServiceCapability_RPC_EXPAND_VOLUME),
rpcCapability(csi.NodeServiceCapability_RPC_GET_VOLUME_STATS),
},
}, nil
}
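
// NodeGetInfo identifies this node to the container orchestrator by its
// hostname.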
func (*csiPluginServer) NodeGetInfo(ctx context.Context, req *csi.NodeGetInfoRequest) (*csi.NodeGetInfoResponse, error) {
hostname, err := os.Hostname()
if err != nil {
return nil, status.Errorf(codes.Unavailable, "failed to get node identity: %v", err)
}
return &csi.NodeGetInfoResponse{
NodeId: hostname,
}, nil
}

// CSI Identity endpoints.
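
// GetPluginInfo reports the plugin's name and vendor version.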
func (*csiPluginServer) GetPluginInfo(ctx context.Context, req *csi.GetPluginInfoRequest) (*csi.GetPluginInfoResponse, error) {
return &csi.GetPluginInfoResponse{
Name: "dev.monogon.metropolis.vfs",
VendorVersion: "0.0.1", // TODO(lorenz): Maybe stamp?
}, nil
}
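
// GetPluginCapabilities advertises support for online volume expansion.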
func (*csiPluginServer) GetPluginCapabilities(ctx context.Context, req *csi.GetPluginCapabilitiesRequest) (*csi.GetPluginCapabilitiesResponse, error) {
return &csi.GetPluginCapabilitiesResponse{
Capabilities: []*csi.PluginCapability{
{
Type: &csi.PluginCapability_VolumeExpansion_{
VolumeExpansion: &csi.PluginCapability_VolumeExpansion{
Type: csi.PluginCapability_VolumeExpansion_ONLINE,
},
},
},
},
}, nil
}
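
// Probe reports plugin health; this plugin is always ready once it is
// serving.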
func (s *csiPluginServer) Probe(ctx context.Context, req *csi.ProbeRequest) (*csi.ProbeResponse, error) {
return &csi.ProbeResponse{Ready: &wrapperspb.BoolValue{Value: true}}, nil
}

// pluginRegistrationServer implements the pluginregistration.Registration
// service. It has a special restart mechanic to accommodate a design issue
// in the kubelet which requires the plugin to remove and recreate its gRPC
// socket for every new registration attempt.
type pluginRegistrationServer struct {
	// regErr has a buffer of 1 so that at least one error can always be
	// sent into it without blocking. There is a race if
	// NotifyRegistrationStatus is called twice with an error, as the
	// buffered item might have been received but not yet fully processed.
	// Since distinguishing between calls from different socket iterations
	// is hard, this errs on the side of caution, i.e. it may generate too
	// many restarts. That is preferable: if such an error were missed, the
	// registration would stay unavailable until the node is restarted.
regErr chan error
KubeletDirectory *localstorage.DataKubernetesKubeletDirectory
}
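
// Run recreates the registration socket, serves the Registration service on
// it, and returns (triggering a supervisor restart with a fresh socket)
// whenever a registration attempt fails.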
func (r *pluginRegistrationServer) Run(ctx context.Context) error {
	// Remove the registration socket if it already exists.
os.Remove(r.KubeletDirectory.PluginsRegistry.VFSReg.FullPath())
registrationListener, err := net.ListenUnix("unix", &net.UnixAddr{Name: r.KubeletDirectory.PluginsRegistry.VFSReg.FullPath(), Net: "unix"})
if err != nil {
return fmt.Errorf("failed to listen on CSI registration socket: %w", err)
}
defer registrationListener.Close()
grpcS := grpc.NewServer()
pluginregistration.RegisterRegistrationServer(grpcS, r)
	if err := supervisor.Run(ctx, "rpc", supervisor.GRPCServer(grpcS, registrationListener, true)); err != nil {
		return err
	}
supervisor.Signal(ctx, supervisor.SignalHealthy)
select {
case <-ctx.Done():
return ctx.Err()
case err = <-r.regErr:
return err
}
}
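
// GetInfo describes this CSI plugin and the socket it serves to the kubelet.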
func (r *pluginRegistrationServer) GetInfo(ctx context.Context, req *pluginregistration.InfoRequest) (*pluginregistration.PluginInfo, error) {
return &pluginregistration.PluginInfo{
Type: pluginregistration.CSIPlugin,
Name: "dev.monogon.metropolis.vfs",
Endpoint: r.KubeletDirectory.Plugins.VFS.FullPath(),
SupportedVersions: []string{"1.2"}, // Keep in sync with container-storage-interface/spec package version
}, nil
}
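
// NotifyRegistrationStatus receives the outcome of a registration attempt
// from the kubelet. On failure it sends to regErr (non-blocking) so that Run
// restarts and retries with a fresh socket.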
func (r *pluginRegistrationServer) NotifyRegistrationStatus(ctx context.Context, req *pluginregistration.RegistrationStatus) (*pluginregistration.RegistrationStatusResponse, error) {
if !req.PluginRegistered {
select {
case r.regErr <- fmt.Errorf("registration failed: %v", req.Error):
default:
}
}
return &pluginregistration.RegistrationStatusResponse{}, nil
}