m/node/kubernetes: fix PV mount flags and add e2e test

Mount flags did not work because of two problems:
- The provisioner did not copy them from the StorageClass to the
  PersistentVolume.
- The CSI server used = instead of |= when adding flags, so only one of
  the flags was added or removed.

There was an existing e2e test for PVs, however this only created the
PVC/PV without even attaching it to a container. I extended this test to
attach the PV and check from inside the container that it has the
expected mount flags and quota.

The existing e2e test also created a block PV, however attaching a block
PV to a container was not tested and is apparently broken, so I removed
this test for now.

Change-Id: Ie14adfafd333eab38d2b5f1b4ce8a2aa8795eae0
Reviewed-on: https://review.monogon.dev/c/monogon/+/3613
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/node/kubernetes/csi.go b/metropolis/node/kubernetes/csi.go
index 5ca6885..eb43ec0 100644
--- a/metropolis/node/kubernetes/csi.go
+++ b/metropolis/node/kubernetes/csi.go
@@ -142,17 +142,17 @@
 		for flag := range flagSet {
 			switch flag {
 			case "exec":
-				mountAttr.Attr_clr = unix.MOUNT_ATTR_NOEXEC
+				mountAttr.Attr_clr |= unix.MOUNT_ATTR_NOEXEC
 			case "noexec":
-				mountAttr.Attr_set = unix.MOUNT_ATTR_NOEXEC
+				mountAttr.Attr_set |= unix.MOUNT_ATTR_NOEXEC
 			case "dev":
-				mountAttr.Attr_clr = unix.MOUNT_ATTR_NODEV
+				mountAttr.Attr_clr |= unix.MOUNT_ATTR_NODEV
 			case "nodev":
-				mountAttr.Attr_set = unix.MOUNT_ATTR_NODEV
+				mountAttr.Attr_set |= unix.MOUNT_ATTR_NODEV
 			case "suid":
-				mountAttr.Attr_clr = unix.MOUNT_ATTR_NOSUID
+				mountAttr.Attr_clr |= unix.MOUNT_ATTR_NOSUID
 			case "nosuid":
-				mountAttr.Attr_set = unix.MOUNT_ATTR_NOSUID
+				mountAttr.Attr_set |= unix.MOUNT_ATTR_NOSUID
 			default:
 				return nil, status.Errorf(codes.InvalidArgument, "unknown mount flag: %s", flag)
 			}
diff --git a/metropolis/node/kubernetes/provisioner.go b/metropolis/node/kubernetes/provisioner.go
index aacb949..0c5f972 100644
--- a/metropolis/node/kubernetes/provisioner.go
+++ b/metropolis/node/kubernetes/provisioner.go
@@ -283,6 +283,7 @@
 
 	volumeID := "pvc-" + string(pvc.ObjectMeta.UID)
 	volumePath := p.volumePath(volumeID)
+	var mountOptions []string
 
 	p.logger.Infof("Creating local PV %s", volumeID)
 
@@ -301,6 +302,7 @@
 		if err := fsquota.SetQuota(volumePath, uint64(capacity), uint64(capacity)/inodeCapacityRatio); err != nil {
 			return fmt.Errorf("failed to update quota: %w", err)
 		}
+		mountOptions = storageClass.MountOptions
 	case v1.PersistentVolumeBlock:
 		imageFile, err := os.OpenFile(volumePath, os.O_CREATE|os.O_RDWR, 0644)
 		if err != nil {
@@ -349,6 +351,7 @@
 				},
 			},
 			StorageClassName:              *pvc.Spec.StorageClassName,
+			MountOptions:                  mountOptions,
 			PersistentVolumeReclaimPolicy: *storageClass.ReclaimPolicy,
 		},
 	}
diff --git a/metropolis/test/e2e/BUILD.bazel b/metropolis/test/e2e/BUILD.bazel
index ecbdc97..883dcb8 100644
--- a/metropolis/test/e2e/BUILD.bazel
+++ b/metropolis/test/e2e/BUILD.bazel
@@ -4,6 +4,7 @@
     name = "testimages_manifest",
     images = [
         "//metropolis/test/e2e/selftest:selftest_image",
+        "//metropolis/test/e2e/persistentvolume:persistentvolume_image",
         "//metropolis/test/e2e/httpserver:httpserver_image",
     ],
     visibility = [
diff --git a/metropolis/test/e2e/persistentvolume/BUILD.bazel b/metropolis/test/e2e/persistentvolume/BUILD.bazel
new file mode 100644
index 0000000..fec0886
--- /dev/null
+++ b/metropolis/test/e2e/persistentvolume/BUILD.bazel
@@ -0,0 +1,44 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library")
+
+go_library(
+    name = "persistentvolume_lib",
+    srcs = ["main.go"],
+    importpath = "source.monogon.dev/metropolis/test/e2e/persistentvolume",
+    visibility = ["//visibility:private"],
+    deps = ["@org_golang_x_sys//unix"],
+)
+
+go_binary(
+    name = "persistentvolume",
+    embed = [":persistentvolume_lib"],
+    pure = "on",
+    visibility = ["//visibility:private"],
+)
+
+load("@aspect_bazel_lib//lib:transitions.bzl", "platform_transition_binary")
+
+platform_transition_binary(
+    name = "persistentvolume_transitioned",
+    binary = ":persistentvolume",
+    target_platform = "//build/platforms:linux_amd64_static",
+    visibility = ["//visibility:private"],
+)
+
+load("@rules_pkg//pkg:tar.bzl", "pkg_tar")
+
+pkg_tar(
+    name = "persistentvolume_layer",
+    srcs = [":persistentvolume_transitioned"],
+    visibility = ["//visibility:private"],
+)
+
+load("@rules_oci//oci:defs.bzl", "oci_image")
+
+oci_image(
+    name = "persistentvolume_image",
+    base = "@distroless_base",
+    entrypoint = ["/persistentvolume"],
+    tars = [":persistentvolume_layer"],
+    visibility = ["//metropolis/test/e2e:__pkg__"],
+    workdir = "/app",
+)
diff --git a/metropolis/test/e2e/persistentvolume/main.go b/metropolis/test/e2e/persistentvolume/main.go
new file mode 100644
index 0000000..38cf329
--- /dev/null
+++ b/metropolis/test/e2e/persistentvolume/main.go
@@ -0,0 +1,98 @@
+// This is a test for PersistentVolumes provided by our provisioner. It tests
+// that volumes have the right mount flags, and the expected quotas.
+//
+// The package here is a binary which will run in a Pod in our Kubernetes
+// end-to-end test. See the function makeTestStatefulSet in
+// metropolis/test/e2e/suites/kubernetes/kubernetes_helpers.go for how the Pod
+// is created.
+package main
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"syscall"
+	"time"
+
+	"golang.org/x/sys/unix"
+)
+
+// This is a copy of the constant in metropolis/node/kubernetes/provisioner.go.
+const inodeCapacityRatio = 4 * 512
+
+// checkFilesystemVolume checks that the filesystem containing path has the
+// given mount flags and capacity.
+func checkFilesystemVolume(path string, expectedFlags int64, expectedBytes uint64) error {
+	var statfs unix.Statfs_t
+	err := unix.Statfs(path, &statfs)
+	if err != nil {
+		return fmt.Errorf("failed to statfs volume %q: %w", path, err)
+	}
+
+	if statfs.Flags&unix.ST_RDONLY != expectedFlags&unix.ST_RDONLY {
+		return fmt.Errorf("volume %q has readonly flag %v, expected the opposite", path, statfs.Flags&unix.ST_RDONLY != 0)
+	}
+	if statfs.Flags&unix.ST_NOSUID != expectedFlags&unix.ST_NOSUID {
+		return fmt.Errorf("volume %q has nosuid flag %v, expected the opposite", path, statfs.Flags&unix.ST_NOSUID != 0)
+	}
+	if statfs.Flags&unix.ST_NODEV != expectedFlags&unix.ST_NODEV {
+		return fmt.Errorf("volume %q has nodev flag %v, expected the opposite", path, statfs.Flags&unix.ST_NODEV != 0)
+	}
+	if statfs.Flags&unix.ST_NOEXEC != expectedFlags&unix.ST_NOEXEC {
+		return fmt.Errorf("volume %q has noexec flag %v, expected the opposite", path, statfs.Flags&unix.ST_NOEXEC != 0)
+	}
+
+	sizeBytes := statfs.Blocks * uint64(statfs.Bsize)
+	if sizeBytes != expectedBytes {
+		return fmt.Errorf("volume %q has capacity %v bytes, expected %v bytes", path, sizeBytes, expectedBytes)
+	}
+	expectedFiles := expectedBytes / inodeCapacityRatio
+	if statfs.Files != expectedFiles {
+		return fmt.Errorf("volume %q has capacity for %v files, expected %v files", path, statfs.Files, expectedFiles)
+	}
+
+	// Try writing a file. This should only work if the volume is not read-only.
+	err = os.WriteFile(filepath.Join(path, "test.txt"), []byte("hello"), 0o644)
+	if expectedFlags&unix.ST_RDONLY != 0 {
+		if err == nil {
+			return fmt.Errorf("write did not fail in read-only volume %q", path)
+		} else if !errors.Is(err, syscall.EROFS) {
+			return fmt.Errorf("write failed with unexpected error in read-only volume %q: %w", path, err)
+		}
+	} else if err != nil {
+		return fmt.Errorf("failed to write file in volume %q: %w", path, err)
+	}
+
+	return nil
+}
+
+func testPersistentVolume() error {
+	if err := checkFilesystemVolume("/vol/default", 0, 1*1024*1024); err != nil {
+		return err
+	}
+	if err := checkFilesystemVolume("/vol/local-strict", unix.ST_NOSUID|unix.ST_NODEV|unix.ST_NOEXEC, 5*1024*1024); err != nil {
+		return err
+	}
+	if err := checkFilesystemVolume("/vol/readonly", unix.ST_RDONLY, 1*1024*1024); err != nil {
+		return err
+	}
+	return nil
+}
+
+func main() {
+	fmt.Println("PersistentVolume tests starting...")
+
+	if err := testPersistentVolume(); err != nil {
+		fmt.Println(err.Error())
+		// The final log line communicates the test outcome to the e2e test.
+		fmt.Println("[TESTS-FAILED]")
+	} else {
+		fmt.Println("[TESTS-PASSED]")
+	}
+
+	// Sleep forever, because if the process exits, Kubernetes will restart it.
+	for {
+		time.Sleep(time.Hour)
+	}
+}
diff --git a/metropolis/test/e2e/suites/kubernetes/BUILD.bazel b/metropolis/test/e2e/suites/kubernetes/BUILD.bazel
index a5ec530..318b7cf 100644
--- a/metropolis/test/e2e/suites/kubernetes/BUILD.bazel
+++ b/metropolis/test/e2e/suites/kubernetes/BUILD.bazel
@@ -13,6 +13,7 @@
         "@io_k8s_apimachinery//pkg/apis/meta/v1:meta",
         "@io_k8s_apimachinery//pkg/util/intstr",
         "@io_k8s_client_go//kubernetes",
+        "@io_k8s_utils//ptr",
     ],
 )
 
diff --git a/metropolis/test/e2e/suites/kubernetes/kubernetes_helpers.go b/metropolis/test/e2e/suites/kubernetes/kubernetes_helpers.go
index 60d611b..85f8909 100644
--- a/metropolis/test/e2e/suites/kubernetes/kubernetes_helpers.go
+++ b/metropolis/test/e2e/suites/kubernetes/kubernetes_helpers.go
@@ -30,6 +30,7 @@
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/intstr"
 	"k8s.io/client-go/kubernetes"
+	"k8s.io/utils/ptr"
 )
 
 // makeTestDeploymentSpec generates a Deployment spec for a single pod running
@@ -155,7 +156,7 @@
 }
 
 // makeTestStatefulSet generates a StatefulSet spec
-func makeTestStatefulSet(name string, volumeMode corev1.PersistentVolumeMode) *appsv1.StatefulSet {
+func makeTestStatefulSet(name string) *appsv1.StatefulSet {
 	return &appsv1.StatefulSet{
 		ObjectMeta: metav1.ObjectMeta{Name: name},
 		Spec: appsv1.StatefulSetSpec{
@@ -164,13 +165,34 @@
 			}},
 			VolumeClaimTemplates: []corev1.PersistentVolumeClaim{
 				{
-					ObjectMeta: metav1.ObjectMeta{Name: "www"},
+					ObjectMeta: metav1.ObjectMeta{Name: "vol-default"},
 					Spec: corev1.PersistentVolumeClaimSpec{
 						AccessModes: []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce},
 						Resources: corev1.VolumeResourceRequirements{
-							Requests: map[corev1.ResourceName]resource.Quantity{corev1.ResourceStorage: resource.MustParse("50Mi")},
+							Requests: map[corev1.ResourceName]resource.Quantity{corev1.ResourceStorage: resource.MustParse("1Mi")},
 						},
-						VolumeMode: &volumeMode,
+						VolumeMode: ptr.To(corev1.PersistentVolumeFilesystem),
+					},
+				},
+				{
+					ObjectMeta: metav1.ObjectMeta{Name: "vol-local-strict"},
+					Spec: corev1.PersistentVolumeClaimSpec{
+						AccessModes: []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce},
+						Resources: corev1.VolumeResourceRequirements{
+							Requests: map[corev1.ResourceName]resource.Quantity{corev1.ResourceStorage: resource.MustParse("5Mi")},
+						},
+						StorageClassName: ptr.To("local-strict"),
+						VolumeMode:       ptr.To(corev1.PersistentVolumeFilesystem),
+					},
+				},
+				{
+					ObjectMeta: metav1.ObjectMeta{Name: "vol-readonly"},
+					Spec: corev1.PersistentVolumeClaimSpec{
+						AccessModes: []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce},
+						Resources: corev1.VolumeResourceRequirements{
+							Requests: map[corev1.ResourceName]resource.Quantity{corev1.ResourceStorage: resource.MustParse("1Mi")},
+						},
+						VolumeMode: ptr.To(corev1.PersistentVolumeFilesystem),
 					},
 				},
 			},
@@ -184,11 +206,21 @@
 					Containers: []corev1.Container{
 						{
 							Name:            "test",
-							ImagePullPolicy: corev1.PullNever,
-							Image:           "bazel/metropolis/test/e2e/preseedtest:preseedtest_image",
-							ReadinessProbe: &corev1.Probe{
-								ProbeHandler: corev1.ProbeHandler{
-									HTTPGet: &corev1.HTTPGetAction{Port: intstr.FromInt(80)},
+							ImagePullPolicy: corev1.PullIfNotPresent,
+							Image:           "test.monogon.internal/metropolis/test/e2e/persistentvolume/persistentvolume_image",
+							VolumeMounts: []corev1.VolumeMount{
+								corev1.VolumeMount{
+									Name:      "vol-default",
+									MountPath: "/vol/default",
+								},
+								corev1.VolumeMount{
+									Name:      "vol-local-strict",
+									MountPath: "/vol/local-strict",
+								},
+								corev1.VolumeMount{
+									Name:      "vol-readonly",
+									ReadOnly:  true,
+									MountPath: "/vol/readonly",
 								},
 							},
 						},
@@ -214,6 +246,8 @@
 	}
 	lineStr := strings.Trim(buf.String(), "\n")
 	lines := strings.Split(lineStr, "\n")
-	lines = lines[len(lines)-int(nlines):]
+	if len(lines) > int(nlines) {
+		lines = lines[len(lines)-int(nlines):]
+	}
 	return lines, nil
 }
diff --git a/metropolis/test/e2e/suites/kubernetes/run_test.go b/metropolis/test/e2e/suites/kubernetes/run_test.go
index 0d53e07..5a4be60 100644
--- a/metropolis/test/e2e/suites/kubernetes/run_test.go
+++ b/metropolis/test/e2e/suites/kubernetes/run_test.go
@@ -370,11 +370,11 @@
 			return fmt.Errorf("pod is not ready: %s", errorMsg.String())
 		}
 	})
-	util.TestEventual(t, "Simple StatefulSet with PVC", ctx, largeTestTimeout, func(ctx context.Context) error {
-		_, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-1", corev1.PersistentVolumeFilesystem), metav1.CreateOptions{})
+	util.TestEventual(t, "StatefulSet with PersistentVolume tests", ctx, smallTestTimeout, func(ctx context.Context) error {
+		_, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-1"), metav1.CreateOptions{})
 		return err
 	})
-	util.TestEventual(t, "Simple StatefulSet with PVC is running", ctx, largeTestTimeout, func(ctx context.Context) error {
+	util.TestEventual(t, "StatefulSet with PersistentVolume tests successful", ctx, smallTestTimeout, func(ctx context.Context) error {
 		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-1"})
 		if err != nil {
 			return err
@@ -383,38 +383,19 @@
 			return errors.New("pod didn't get created")
 		}
 		pod := res.Items[0]
-		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
-			return nil
-		}
-		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
-		if err != nil || len(events.Items) == 0 {
-			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
-		} else {
-			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
-		}
-	})
-	util.TestEventual(t, "Simple StatefulSet with Block PVC", ctx, largeTestTimeout, func(ctx context.Context) error {
-		_, err := clientSet.AppsV1().StatefulSets("default").Create(ctx, makeTestStatefulSet("test-statefulset-2", corev1.PersistentVolumeBlock), metav1.CreateOptions{})
-		return err
-	})
-	util.TestEventual(t, "Simple StatefulSet with Block PVC is running", ctx, largeTestTimeout, func(ctx context.Context) error {
-		res, err := clientSet.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "name=test-statefulset-2"})
+		lines, err := getPodLogLines(ctx, clientSet, pod.Name, 50)
 		if err != nil {
-			return err
+			return fmt.Errorf("could not get logs: %w", err)
 		}
-		if len(res.Items) == 0 {
-			return errors.New("pod didn't get created")
+		if len(lines) > 0 {
+			switch lines[len(lines)-1] {
+			case "[TESTS-PASSED]":
+				return nil
+			case "[TESTS-FAILED]":
+				return util.Permanent(fmt.Errorf("tests failed, log:\n  %s", strings.Join(lines, "\n  ")))
+			}
 		}
-		pod := res.Items[0]
-		if podv1.IsPodAvailable(&pod, 1, metav1.NewTime(time.Now())) {
-			return nil
-		}
-		events, err := clientSet.CoreV1().Events("default").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s,involvedObject.namespace=default", pod.Name)})
-		if err != nil || len(events.Items) == 0 {
-			return fmt.Errorf("pod is not ready: %v", pod.Status.Phase)
-		} else {
-			return fmt.Errorf("pod is not ready: %v", events.Items[0].Message)
-		}
+		return fmt.Errorf("pod is not ready: %v, log:\n  %s", pod.Status.Phase, strings.Join(lines, "\n  "))
 	})
 	util.TestEventual(t, "In-cluster self-test job", ctx, smallTestTimeout, func(ctx context.Context) error {
 		_, err := clientSet.BatchV1().Jobs("default").Create(ctx, makeSelftestSpec("selftest"), metav1.CreateOptions{})