m/node/kubernetes: remove local-strict storage class
It turns out that the local-strict storage class did not have an effect
on readonly volumes, or on gVisor. And after updating runc to 1.2.0, it
no longer has an effect anywhere. It appears that setting noexec and
similar flags in the CSI server, using a storage class, is the wrong
approach and just happened to work by accident. Instead, this should
probably be implemented as a Kubernetes feature to set per-mount-point
flags on the VolumeMount.
This commit thus removes the local-strict storage class and the mount
options processing in the provisioner and CSI server. This will allow
updating runc.
Additionally, the StatefulSet end-to-end test is extended to also run
tests with gVisor. gVisor apparently does not support block volumes.
See: https://github.com/monogon-dev/monogon/issues/361
Change-Id: Ic2f50aa3bc9442ca1dbb9e8742d5b8fecbfc3614
Reviewed-on: https://review.monogon.dev/c/monogon/+/3658
Tested-by: Jenkins CI
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
diff --git a/metropolis/node/kubernetes/csi.go b/metropolis/node/kubernetes/csi.go
index 40d54d0..f7ff00a 100644
--- a/metropolis/node/kubernetes/csi.go
+++ b/metropolis/node/kubernetes/csi.go
@@ -107,12 +107,7 @@
return nil, status.Errorf(codes.Internal, "unable to create requested target path: %v", err)
}
- var mountFlags uintptr = unix.MS_BIND
- if req.Readonly {
- mountFlags |= unix.MS_RDONLY
- }
-
- err := unix.Mount(volumePath, req.TargetPath, "", mountFlags, "")
+ err := unix.Mount(volumePath, req.TargetPath, "", unix.MS_BIND, "")
switch {
case errors.Is(err, unix.ENOENT):
return nil, status.Error(codes.NotFound, "volume not found")
@@ -120,48 +115,13 @@
return nil, status.Errorf(codes.Unavailable, "failed to bind-mount volume: %v", err)
}
- flagSet := make(map[string]bool)
- for _, flag := range req.VolumeCapability.GetMount().GetMountFlags() {
- flagSet[flag] = true
+ var flags uintptr = unix.MS_REMOUNT | unix.MS_BIND
+ if req.Readonly {
+ flags |= unix.MS_RDONLY
}
-
- flagPairs := map[string]string{
- "exec": "noexec",
- "dev": "nodev",
- "suid": "nosuid",
- }
- for pFlag, nFlag := range flagPairs {
- if flagSet[pFlag] && flagSet[nFlag] {
- return nil, status.Errorf(codes.InvalidArgument, "contradictory flag pair found. can't have both %q and %q set", pFlag, nFlag)
- } else if !flagSet[pFlag] && !flagSet[nFlag] {
- // If neither of a flag pair is found, add the negative flag as default.
- flagSet[nFlag] = true
- }
- }
-
- var mountAttr unix.MountAttr
- for flag := range flagSet {
- switch flag {
- case "exec":
- mountAttr.Attr_clr |= unix.MOUNT_ATTR_NOEXEC
- case "noexec":
- mountAttr.Attr_set |= unix.MOUNT_ATTR_NOEXEC
- case "dev":
- mountAttr.Attr_clr |= unix.MOUNT_ATTR_NODEV
- case "nodev":
- mountAttr.Attr_set |= unix.MOUNT_ATTR_NODEV
- case "suid":
- mountAttr.Attr_clr |= unix.MOUNT_ATTR_NOSUID
- case "nosuid":
- mountAttr.Attr_set |= unix.MOUNT_ATTR_NOSUID
- default:
- return nil, status.Errorf(codes.InvalidArgument, "unknown mount flag: %s", flag)
- }
- }
-
- if err := unix.MountSetattr(-1, req.TargetPath, 0, &mountAttr); err != nil {
+ if err := unix.Mount("", req.TargetPath, "", flags, ""); err != nil {
_ = unix.Unmount(req.TargetPath, 0) // Best-effort
- return nil, status.Errorf(codes.Internal, "unable to set mount attributes: %v", err)
+ return nil, status.Errorf(codes.Internal, "unable to set mount-point flags: %v", err)
}
case *csi.VolumeCapability_Block:
f, err := os.OpenFile(volumePath, os.O_RDWR, 0)
diff --git a/metropolis/node/kubernetes/provisioner.go b/metropolis/node/kubernetes/provisioner.go
index 0c5f972..aacb949 100644
--- a/metropolis/node/kubernetes/provisioner.go
+++ b/metropolis/node/kubernetes/provisioner.go
@@ -283,7 +283,6 @@
volumeID := "pvc-" + string(pvc.ObjectMeta.UID)
volumePath := p.volumePath(volumeID)
- var mountOptions []string
p.logger.Infof("Creating local PV %s", volumeID)
@@ -302,7 +301,6 @@
if err := fsquota.SetQuota(volumePath, uint64(capacity), uint64(capacity)/inodeCapacityRatio); err != nil {
return fmt.Errorf("failed to update quota: %w", err)
}
- mountOptions = storageClass.MountOptions
case v1.PersistentVolumeBlock:
imageFile, err := os.OpenFile(volumePath, os.O_CREATE|os.O_RDWR, 0644)
if err != nil {
@@ -351,7 +349,6 @@
},
},
StorageClassName: *pvc.Spec.StorageClassName,
- MountOptions: mountOptions,
PersistentVolumeReclaimPolicy: *storageClass.ReclaimPolicy,
},
}
diff --git a/metropolis/node/kubernetes/reconciler/resources_storageclass.go b/metropolis/node/kubernetes/reconciler/resources_storageclass.go
index 36dee1c..9be81fb 100644
--- a/metropolis/node/kubernetes/reconciler/resources_storageclass.go
+++ b/metropolis/node/kubernetes/reconciler/resources_storageclass.go
@@ -68,8 +68,7 @@
"storageclass.kubernetes.io/is-default-class": "true",
"kubernetes.io/description": "local is the default storage class on Metropolis. " +
"It stores data on the node root disk and supports space limits, resizing and oversubscription but no snapshots. " +
- "It is backed by XFS and uses permissive mounting options (exec,dev,suid). " +
- "If you want more strict mounting options, chose the `local-strict` storage class.",
+ "It is backed by XFS.",
},
},
AllowVolumeExpansion: True(),
@@ -82,25 +81,5 @@
"suid",
},
},
- &storage.StorageClass{
- ObjectMeta: meta.ObjectMeta{
- Name: "local-strict",
- Labels: builtinLabels(nil),
- Annotations: map[string]string{
- "storageclass.kubernetes.io/is-default-class": "false",
- "kubernetes.io/description": "local-strict is the same as local (see its description) but uses strict mount options (noexec, nodev, nosuid). " +
- "It is best used together with readOnlyRoot to restrict exploitation vectors.",
- },
- },
- AllowVolumeExpansion: True(),
- Provisioner: csiProvisionerName,
- ReclaimPolicy: &reclaimPolicyDelete,
- VolumeBindingMode: &waitForConsumerBinding,
- MountOptions: []string{
- "noexec",
- "nodev",
- "nosuid",
- },
- },
}
}
diff --git a/metropolis/test/e2e/persistentvolume/main.go b/metropolis/test/e2e/persistentvolume/main.go
index 577c343..d9ff958 100644
--- a/metropolis/test/e2e/persistentvolume/main.go
+++ b/metropolis/test/e2e/persistentvolume/main.go
@@ -9,6 +9,7 @@
import (
"errors"
+ "flag"
"fmt"
"os"
"path/filepath"
@@ -23,6 +24,8 @@
// This is a copy of the constant in metropolis/node/kubernetes/provisioner.go.
const inodeCapacityRatio = 4 * 512
+var runtimeClass = flag.String("runtimeclass", "", "Name of the runtime class")
+
// checkFilesystemVolume checks that the filesystem containing path has the
// given mount flags and capacity.
func checkFilesystemVolume(path string, expectedFlags int64, expectedBytes uint64) error {
@@ -86,20 +89,21 @@
if err := checkFilesystemVolume("/vol/default", 0, 1*1024*1024); err != nil {
return err
}
- if err := checkFilesystemVolume("/vol/local-strict", unix.ST_NOSUID|unix.ST_NODEV|unix.ST_NOEXEC, 5*1024*1024); err != nil {
- return err
- }
if err := checkFilesystemVolume("/vol/readonly", unix.ST_RDONLY, 1*1024*1024); err != nil {
return err
}
- if err := checkBlockVolume("/vol/block", 1*1024*1024); err != nil {
- return err
+ // Block volumes are not supported on gVisor.
+ if *runtimeClass != "gvisor" {
+ if err := checkBlockVolume("/vol/block", 1*1024*1024); err != nil {
+ return err
+ }
}
return nil
}
func main() {
- fmt.Println("PersistentVolume tests starting...")
+ flag.Parse()
+ fmt.Printf("PersistentVolume tests starting on %s...\n", *runtimeClass)
if err := testPersistentVolume(); err != nil {
fmt.Println(err.Error())
diff --git a/metropolis/test/e2e/suites/kubernetes/kubernetes_helpers.go b/metropolis/test/e2e/suites/kubernetes/kubernetes_helpers.go
index 25a785d..9c67117 100644
--- a/metropolis/test/e2e/suites/kubernetes/kubernetes_helpers.go
+++ b/metropolis/test/e2e/suites/kubernetes/kubernetes_helpers.go
@@ -156,7 +156,7 @@
}
// makeTestStatefulSet generates a StatefulSet spec
-func makeTestStatefulSet(name string) *appsv1.StatefulSet {
+func makeTestStatefulSet(name string, runtimeClass string) *appsv1.StatefulSet {
return &appsv1.StatefulSet{
ObjectMeta: metav1.ObjectMeta{Name: name},
Spec: appsv1.StatefulSetSpec{
@@ -175,17 +175,6 @@
},
},
{
- ObjectMeta: metav1.ObjectMeta{Name: "vol-local-strict"},
- Spec: corev1.PersistentVolumeClaimSpec{
- AccessModes: []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce},
- Resources: corev1.VolumeResourceRequirements{
- Requests: map[corev1.ResourceName]resource.Quantity{corev1.ResourceStorage: resource.MustParse("5Mi")},
- },
- StorageClassName: ptr.To("local-strict"),
- VolumeMode: ptr.To(corev1.PersistentVolumeFilesystem),
- },
- },
- {
ObjectMeta: metav1.ObjectMeta{Name: "vol-readonly"},
Spec: corev1.PersistentVolumeClaimSpec{
AccessModes: []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce},
@@ -218,16 +207,13 @@
Name: "test",
ImagePullPolicy: corev1.PullIfNotPresent,
Image: "test.monogon.internal/metropolis/test/e2e/persistentvolume/persistentvolume_image",
+ Args: []string{"-runtimeclass", runtimeClass},
VolumeMounts: []corev1.VolumeMount{
{
Name: "vol-default",
MountPath: "/vol/default",
},
{
- Name: "vol-local-strict",
- MountPath: "/vol/local-strict",
- },
- {
Name: "vol-readonly",
ReadOnly: true,
MountPath: "/vol/readonly",
@@ -241,6 +227,7 @@
},
},
},
+ RuntimeClassName: ptr.To(runtimeClass),
},
},
},
diff --git a/metropolis/test/e2e/suites/kubernetes/run_test.go b/metropolis/test/e2e/suites/kubernetes/run_test.go
index 5a4be60..baaa235 100644
--- a/metropolis/test/e2e/suites/kubernetes/run_test.go
+++ b/metropolis/test/e2e/suites/kubernetes/run_test.go
@@ -370,33 +370,36 @@
return fmt.Errorf("pod is not ready: %s", errorMsg.String())
}
})
- util.TestEventual(t, "StatefulSet with PersistentVolume tests", ctx, smallTestTimeout, func(ctx context.Context) error {
- _, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-1"), metav1.CreateOptions{})
- return err
- })
- util.TestEventual(t, "StatefulSet with PersistentVolume tests successful", ctx, smallTestTimeout, func(ctx context.Context) error {
- res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-1"})
- if err != nil {
+ for _, runtimeClass := range []string{"runc", "gvisor"} {
+ statefulSetName := fmt.Sprintf("test-statefulset-%s", runtimeClass)
+ util.TestEventual(t, fmt.Sprintf("StatefulSet with %s tests", runtimeClass), ctx, smallTestTimeout, func(ctx context.Context) error {
+ _, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet(statefulSetName, runtimeClass), metav1.CreateOptions{})
return err
- }
- if len(res.Items) == 0 {
- return errors.New("pod didn't get created")
- }
- pod := res.Items[0]
- lines, err := getPodLogLines(ctx, clientSet, pod.Name, 50)
- if err != nil {
- return fmt.Errorf("could not get logs: %w", err)
- }
- if len(lines) > 0 {
- switch lines[len(lines)-1] {
- case "[TESTS-PASSED]":
- return nil
- case "[TESTS-FAILED]":
- return util.Permanent(fmt.Errorf("tests failed, log:\n %s", strings.Join(lines, "\n ")))
+ })
+ util.TestEventual(t, fmt.Sprintf("StatefulSet with %s tests successful", runtimeClass), ctx, smallTestTimeout, func(ctx context.Context) error {
+ res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: fmt.Sprintf("name=%s", statefulSetName)})
+ if err != nil {
+ return err
}
- }
- return fmt.Errorf("pod is not ready: %v, log:\n %s", pod.Status.Phase, strings.Join(lines, "\n "))
- })
+ if len(res.Items) == 0 {
+ return errors.New("pod didn't get created")
+ }
+ pod := res.Items[0]
+ lines, err := getPodLogLines(ctx, clientSet, pod.Name, 50)
+ if err != nil {
+ return fmt.Errorf("could not get logs: %w", err)
+ }
+ if len(lines) > 0 {
+ switch lines[len(lines)-1] {
+ case "[TESTS-PASSED]":
+ return nil
+ case "[TESTS-FAILED]":
+ return util.Permanent(fmt.Errorf("tests failed, log:\n %s", strings.Join(lines, "\n ")))
+ }
+ }
+ return fmt.Errorf("pod is not ready: %v, log:\n %s", pod.Status.Phase, strings.Join(lines, "\n "))
+ })
+ }
util.TestEventual(t, "In-cluster self-test job", ctx, smallTestTimeout, func(ctx context.Context) error {
_, err := clientSet.BatchV1().Jobs("default").Create(ctx, makeSelftestSpec("selftest"), metav1.CreateOptions{})
return err