Add Kubernetes Worker and infrastructure
Adds Kubernetes Kubelet with patches for syscall-based mounting and
syscall-based (and much faster) metrics. fsquota patches have been
deferred to a further revision (for robust emptyDir capacity isolation).
Changes encoding of the node ID to hex since Base64-URL is not supported
as a character set for K8s names. Also adds `/etc/machine-id` and
`/etc/os-release` since Kubernetes wants them. `os-release` is generated
by stamping, `machine-id` is the hex-encoded node ID derived from the
public key.
Also includes a primitive reconciler which automatically ensures that a
set of built-in Kubernetes objects is always present. Currently this
includes a PodSecurityPolicy (PSP) and some basic RBAC policies that are
essential to proper cluster operation.
Adds an additional gRPC service (NodeDebugService) to cleanly
communicate with external debug and test tooling. It supports reading
from log buffers for all externally-run components, checking conditions
(as a replacement for log matching in testing and debugging), and getting
debug credentials for the Kubernetes cluster.
A small utility (dbg) is provided that interfaces with NodeDebugService
and provides access to its functions from the CLI. It also incorporates
a kubectl wrapper which directly grabs credentials from the Debug API
and passes them to kubectl
(e.g. `bazel run //core/cmd/dbg -- kubectl describe node`).
Test Plan:
Manually tested.
Kubernetes:
`bazel run //core/cmd/dbg -- kubectl create -f test.yml`
Checked that pods run, logs are accessible and exec works.
Reading buffers:
`bazel run //core/cmd/dbg -- logs containerd`
Outputs containerd logs in the right order.
Automated testing is in the works, but has been deferred to a future
revision because this one is already too big again.
X-Origin-Diff: phab/D525
GitOrigin-RevId: 0fbfa0c433de405526c7f09ef10c466896331328
diff --git a/core/internal/node/main.go b/core/internal/node/main.go
index b0674d2..4041cb8 100644
--- a/core/internal/node/main.go
+++ b/core/internal/node/main.go
@@ -25,14 +25,16 @@
"crypto/tls"
"crypto/x509"
"crypto/x509/pkix"
- "encoding/base64"
+ "encoding/hex"
"errors"
"flag"
"fmt"
+ "git.monogon.dev/source/nexantic.git/core/internal/containerd"
"io/ioutil"
"math/big"
"net"
"os"
+ "strings"
"time"
apipb "git.monogon.dev/source/nexantic.git/core/generated/api"
@@ -43,6 +45,7 @@
"git.monogon.dev/source/nexantic.git/core/internal/kubernetes"
"git.monogon.dev/source/nexantic.git/core/internal/network"
"git.monogon.dev/source/nexantic.git/core/internal/storage"
+ "golang.org/x/sys/unix"
"github.com/cenkalti/backoff/v4"
"github.com/gogo/protobuf/proto"
@@ -62,12 +65,15 @@
Consensus *consensus.Service
Storage *storage.Manager
Kubernetes *kubernetes.Service
+ Containerd *containerd.Service
Network *network.Service
logger *zap.Logger
state common.SmalltownState
hostname string
enrolmentConfig *apipb.EnrolmentConfig
+
+ debugServer *grpc.Server
}
)
@@ -101,12 +107,18 @@
return nil, err
}
+ containerdService, err := containerd.New()
+ if err != nil {
+ return nil, err
+ }
+
s := &SmalltownNode{
- Consensus: consensusService,
- Storage: strg,
- Network: ntwk,
- logger: logger,
- hostname: hostname,
+ Consensus: consensusService,
+ Containerd: containerdService,
+ Storage: strg,
+ Network: ntwk,
+ logger: logger,
+ hostname: hostname,
}
apiService, err := api.NewApiServer(&api.Config{}, logger.With(zap.String("module", "api")), s.Consensus)
@@ -118,6 +130,9 @@
s.Kubernetes = kubernetes.New(logger.With(zap.String("module", "kubernetes")), consensusService)
+ s.debugServer = grpc.NewServer()
+ apipb.RegisterNodeDebugServiceServer(s.debugServer, s)
+
logger.Info("Created SmalltownNode")
return s, nil
@@ -126,6 +141,8 @@
func (s *SmalltownNode) Start(ctx context.Context) error {
s.logger.Info("Starting Smalltown node")
+ s.startDebugSvc()
+
// TODO(lorenz): Abstracting enrolment sounds like a good idea, but ends up being painful
// because of things like storage access. I'm keeping it this way until the more complex
// enrolment procedures are fleshed out. This is also a bit panic()-happy, but there is really
@@ -157,6 +174,30 @@
panic("Unreachable")
}
+// startDebugSvc opens a TCP listener on common.DebugServicePort (empty host,
+// i.e. all local addresses) and serves s.debugServer on it. Serving happens
+// in a separate goroutine; startDebugSvc returns right after spawning it.
+// A failure to listen or to serve is reported through s.logger.Fatal.
+func (s *SmalltownNode) startDebugSvc() {
+	lis, err := net.Listen("tcp", fmt.Sprintf(":%v", common.DebugServicePort))
+	if err != nil {
+		s.logger.Fatal("failed to listen", zap.Error(err))
+	}
+
+	serve := func() {
+		serveErr := s.debugServer.Serve(lis)
+		if serveErr != nil {
+			s.logger.Fatal("failed to serve", zap.Error(serveErr))
+		}
+	}
+	go serve()
+}
+
+// initHostname applies s.hostname to the running system. It sets the kernel
+// hostname, writes an /etc/hosts entry mapping 127.0.0.1 to that hostname and
+// writes /etc/machine-id with the hostname minus its "smalltown-" prefix.
+//
+// Both files are written newline-terminated; machine-id(5) specifies a single
+// newline-terminated ID. The returned error, if any, is wrapped with the step
+// that failed.
+func (s *SmalltownNode) initHostname() error {
+	if err := unix.Sethostname([]byte(s.hostname)); err != nil {
+		return fmt.Errorf("failed to set hostname: %w", err)
+	}
+
+	hosts := fmt.Sprintf("%v %v\n", "127.0.0.1", s.hostname)
+	if err := ioutil.WriteFile("/etc/hosts", []byte(hosts), 0644); err != nil {
+		return fmt.Errorf("failed to write /etc/hosts: %w", err)
+	}
+
+	machineID := strings.TrimPrefix(s.hostname, "smalltown-") + "\n"
+	if err := ioutil.WriteFile("/etc/machine-id", []byte(machineID), 0644); err != nil {
+		return fmt.Errorf("failed to write /etc/machine-id: %w", err)
+	}
+	return nil
+}
+
func (s *SmalltownNode) startEnrolling(ctx context.Context) error {
s.logger.Info("Initializing subsystems for enrolment")
s.state = common.StateEnrollMode
@@ -166,6 +207,11 @@
return err
}
+ s.hostname = nodeID
+ if err := s.initHostname(); err != nil {
+ return err
+ }
+
// We only support TPM2 at the moment, any abstractions here would be premature
trustAgent := tpm2.TPM2Agent{}
@@ -207,11 +253,18 @@
if err != nil {
return err
}
+ s.hostname = nodeID
+ if err := s.initHostname(); err != nil {
+ return err
+ }
if err := s.initNodeAPI(); err != nil {
return err
}
+ // TODO: Use supervisor.Run for this
+ go s.Containerd.Run()(context.TODO())
+
dataPath, err := s.Storage.GetPathInPlace(storage.PlaceData, "etcd")
if err != nil {
return err
@@ -314,7 +367,7 @@
return []byte{}, "", fmt.Errorf("failed to write node key: %w", err)
}
- name := "smalltown-" + base64.RawStdEncoding.EncodeToString([]byte(pubKey))
+ name := "smalltown-" + hex.EncodeToString([]byte(pubKey[:16]))
// This has no SANs because it authenticates by public key, not by name
nodeCert := &x509.Certificate{
@@ -429,6 +482,11 @@
s.logger.Info("Initializing subsystems for production")
s.state = common.StateJoined
+ s.hostname = s.enrolmentConfig.NodeId
+ if err := s.initHostname(); err != nil {
+ return err
+ }
+
trustAgent := tpm2.TPM2Agent{}
unlockOp := func() error {
unlockKey, err := trustAgent.Unlock(*s.enrolmentConfig)
@@ -449,6 +507,9 @@
s.initNodeAPI()
+ // TODO: Use supervisor.Run for this
+ go s.Containerd.Run()(context.TODO())
+
err := s.Consensus.Start()
if err != nil {
return err