blob: b0674d2b7c50e8c3015fc160358b419559ee924f [file] [log] [blame]
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +02001// Copyright 2020 The Monogon Project Authors.
2//
3// SPDX-License-Identifier: Apache-2.0
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
17package node
18
19import (
Lorenz Brunaa6b7342019-12-12 02:55:02 +010020 "bytes"
Serge Bazanskicdb8c782020-02-17 12:34:02 +010021 "context"
Lorenz Brunaa6b7342019-12-12 02:55:02 +010022 "crypto/ed25519"
23 "crypto/rand"
24 "crypto/sha512"
25 "crypto/tls"
26 "crypto/x509"
27 "crypto/x509/pkix"
28 "encoding/base64"
29 "errors"
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +020030 "flag"
Lorenz Brunaa6b7342019-12-12 02:55:02 +010031 "fmt"
32 "io/ioutil"
33 "math/big"
34 "net"
Lorenz Brunaa6b7342019-12-12 02:55:02 +010035 "os"
Serge Bazanskicdb8c782020-02-17 12:34:02 +010036 "time"
Lorenz Brunaa6b7342019-12-12 02:55:02 +010037
38 apipb "git.monogon.dev/source/nexantic.git/core/generated/api"
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +020039 "git.monogon.dev/source/nexantic.git/core/internal/api"
40 "git.monogon.dev/source/nexantic.git/core/internal/common"
41 "git.monogon.dev/source/nexantic.git/core/internal/consensus"
Lorenz Brunaa6b7342019-12-12 02:55:02 +010042 "git.monogon.dev/source/nexantic.git/core/internal/integrity/tpm2"
Lorenz Brun6e8f69c2019-11-18 10:44:24 +010043 "git.monogon.dev/source/nexantic.git/core/internal/kubernetes"
Lorenz Brunaa6b7342019-12-12 02:55:02 +010044 "git.monogon.dev/source/nexantic.git/core/internal/network"
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +020045 "git.monogon.dev/source/nexantic.git/core/internal/storage"
46
Serge Bazanskicdb8c782020-02-17 12:34:02 +010047 "github.com/cenkalti/backoff/v4"
Lorenz Brunaa6b7342019-12-12 02:55:02 +010048 "github.com/gogo/protobuf/proto"
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +020049 "go.uber.org/zap"
Serge Bazanskicdb8c782020-02-17 12:34:02 +010050 "google.golang.org/grpc"
51 "google.golang.org/grpc/credentials"
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +020052)
53
// unknownNotAfter is the NotAfter timestamp used for certificates that have no
// well-defined expiry: 9999-12-31T23:59:59Z, the value given by RFC 5280
// Section 4.1.2.5.
var unknownNotAfter = time.Unix(253402300799, 0)
58
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +020059type (
60 SmalltownNode struct {
Lorenz Brun6e8f69c2019-11-18 10:44:24 +010061 Api *api.Server
62 Consensus *consensus.Service
63 Storage *storage.Manager
64 Kubernetes *kubernetes.Service
Lorenz Brunaa6b7342019-12-12 02:55:02 +010065 Network *network.Service
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +020066
Lorenz Brunaa6b7342019-12-12 02:55:02 +010067 logger *zap.Logger
68 state common.SmalltownState
69 hostname string
70 enrolmentConfig *apipb.EnrolmentConfig
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +020071 }
72)
73
Serge Bazanskicdb8c782020-02-17 12:34:02 +010074func NewSmalltownNode(logger *zap.Logger, ntwk *network.Service, strg *storage.Manager) (*SmalltownNode, error) {
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +020075 flag.Parse()
76 logger.Info("Creating Smalltown node")
Serge Bazanskicdb8c782020-02-17 12:34:02 +010077 ctx := context.Background()
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +020078
Leopold Schabel68c58752019-11-14 21:00:59 +010079 hostname, err := os.Hostname()
80 if err != nil {
81 panic(err)
82 }
83
Serge Bazanskicdb8c782020-02-17 12:34:02 +010084 // Wait for IP adddress...
85 ctxT, ctxTC := context.WithTimeout(ctx, time.Second*10)
86 defer ctxTC()
87 externalIP, err := ntwk.GetIP(ctxT, true)
Lorenz Brunaa6b7342019-12-12 02:55:02 +010088 if err != nil {
Serge Bazanskicdb8c782020-02-17 12:34:02 +010089 logger.Panic("Could not get IP address", zap.Error(err))
Lorenz Brunaa6b7342019-12-12 02:55:02 +010090 }
91
92 // Important to know if the GetIP above hangs
93 logger.Info("Node has IP", zap.String("ip", externalIP.String()))
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +020094
95 consensusService, err := consensus.NewConsensusService(consensus.Config{
Lorenz Brun6e8f69c2019-11-18 10:44:24 +010096 Name: hostname,
Lorenz Brun6e8f69c2019-11-18 10:44:24 +010097 ListenHost: "0.0.0.0",
Lorenz Brunaa6b7342019-12-12 02:55:02 +010098 ExternalHost: externalIP.String(),
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +020099 }, logger.With(zap.String("module", "consensus")))
100 if err != nil {
101 return nil, err
102 }
103
104 s := &SmalltownNode{
105 Consensus: consensusService,
Serge Bazanskicdb8c782020-02-17 12:34:02 +0100106 Storage: strg,
107 Network: ntwk,
Leopold Schabel68c58752019-11-14 21:00:59 +0100108 logger: logger,
109 hostname: hostname,
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200110 }
111
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100112 apiService, err := api.NewApiServer(&api.Config{}, logger.With(zap.String("module", "api")), s.Consensus)
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200113 if err != nil {
114 return nil, err
115 }
116
117 s.Api = apiService
118
Lorenz Brun6e8f69c2019-11-18 10:44:24 +0100119 s.Kubernetes = kubernetes.New(logger.With(zap.String("module", "kubernetes")), consensusService)
120
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200121 logger.Info("Created SmalltownNode")
122
123 return s, nil
124}
125
Serge Bazanskicdb8c782020-02-17 12:34:02 +0100126func (s *SmalltownNode) Start(ctx context.Context) error {
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200127 s.logger.Info("Starting Smalltown node")
128
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100129 // TODO(lorenz): Abstracting enrolment sounds like a good idea, but ends up being painful
130 // because of things like storage access. I'm keeping it this way until the more complex
131 // enrolment procedures are fleshed out. This is also a bit panic()-happy, but there is really
132 // no good way out of an invalid enrolment configuration.
133 enrolmentPath, err := s.Storage.GetPathInPlace(storage.PlaceESP, "enrolment.pb")
134 if err != nil {
135 s.logger.Panic("ESP configuration partition not available", zap.Error(err))
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200136 }
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100137 enrolmentConfigRaw, err := ioutil.ReadFile(enrolmentPath)
138 if err == nil {
139 // We have an enrolment file, let's check its contents
140 var enrolmentConfig apipb.EnrolmentConfig
141 if err := proto.Unmarshal(enrolmentConfigRaw, &enrolmentConfig); err != nil {
142 s.logger.Panic("Invalid enrolment configuration provided", zap.Error(err))
143 }
144 s.enrolmentConfig = &enrolmentConfig
145 // The enrolment secret is only zeroed after
146 if len(enrolmentConfig.EnrolmentSecret) == 0 {
147 return s.startFull()
148 }
Serge Bazanskicdb8c782020-02-17 12:34:02 +0100149 return s.startEnrolling(ctx)
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100150 } else if os.IsNotExist(err) {
151 // This is ok like this, once a new cluster has been set up the initial node also generates
152 // its own enrolment config
Serge Bazanskicdb8c782020-02-17 12:34:02 +0100153 return s.startForSetup(ctx)
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100154 }
155 // Unknown error reading enrolment config (disk issues/invalid configuration format/...)
156 s.logger.Panic("Invalid enrolment configuration provided", zap.Error(err))
157 panic("Unreachable")
158}
159
Serge Bazanskicdb8c782020-02-17 12:34:02 +0100160func (s *SmalltownNode) startEnrolling(ctx context.Context) error {
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100161 s.logger.Info("Initializing subsystems for enrolment")
162 s.state = common.StateEnrollMode
163
Serge Bazanskicdb8c782020-02-17 12:34:02 +0100164 nodeInfo, nodeID, err := s.InitializeNode(ctx)
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100165 if err != nil {
166 return err
167 }
168
169 // We only support TPM2 at the moment, any abstractions here would be premature
170 trustAgent := tpm2.TPM2Agent{}
171
172 initializeOp := func() error {
173 if err := trustAgent.Initialize(*nodeInfo, *s.enrolmentConfig); err != nil {
174 s.logger.Warn("Failed to initialize integrity backend", zap.Error(err))
175 return err
176 }
177 return nil
178 }
179
180 if err := backoff.Retry(initializeOp, getIntegrityBackoff()); err != nil {
181 panic("invariant violated: integrity initialization retry can never fail")
182 }
183
184 enrolmentPath, err := s.Storage.GetPathInPlace(storage.PlaceESP, "enrolment.pb")
185 if err != nil {
186 panic(err)
187 }
188
189 s.enrolmentConfig.EnrolmentSecret = []byte{}
190 s.enrolmentConfig.NodeId = nodeID
191
192 enrolmentConfigRaw, err := proto.Marshal(s.enrolmentConfig)
193 if err != nil {
194 panic(err)
195 }
196 if err := ioutil.WriteFile(enrolmentPath, enrolmentConfigRaw, 0600); err != nil {
197 return err
198 }
199 s.logger.Info("Node successfully enrolled")
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200200
201 return nil
202}
203
Serge Bazanskicdb8c782020-02-17 12:34:02 +0100204func (s *SmalltownNode) startForSetup(ctx context.Context) error {
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100205 s.logger.Info("Setting up a new cluster")
Serge Bazanskicdb8c782020-02-17 12:34:02 +0100206 initData, nodeID, err := s.InitializeNode(ctx)
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200207 if err != nil {
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200208 return err
209 }
210
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100211 if err := s.initNodeAPI(); err != nil {
212 return err
213 }
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200214
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100215 dataPath, err := s.Storage.GetPathInPlace(storage.PlaceData, "etcd")
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200216 if err != nil {
217 return err
218 }
219
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100220 // Spin up etcd
221 config := s.Consensus.GetConfig()
222 config.NewCluster = true
223 config.Name = s.hostname
224 config.DataDir = dataPath
225 s.Consensus.SetConfig(config)
226
227 // Generate the cluster CA and store it to local storage.
228 if err := s.Consensus.PrecreateCA(); err != nil {
229 return err
230 }
231
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200232 err = s.Consensus.Start()
233 if err != nil {
234 return err
235 }
236
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100237 // Now that the cluster is up and running, we can persist the CA to the cluster.
238 if err := s.Consensus.InjectCA(); err != nil {
239 return err
240 }
241
242 if err := s.Api.BootstrapNewClusterHook(initData); err != nil {
243 return err
244 }
245
246 if err := s.Kubernetes.NewCluster(); err != nil {
247 return err
248 }
249
250 if err := s.Kubernetes.Start(); err != nil {
251 return err
252 }
253
254 if err := s.Api.Start(); err != nil {
255 s.logger.Error("Failed to start the API service", zap.Error(err))
256 return err
257 }
258
259 enrolmentPath, err := s.Storage.GetPathInPlace(storage.PlaceESP, "enrolment.pb")
260 if err != nil {
261 panic(err)
262 }
263
264 masterCert, err := s.Api.GetMasterCert()
265 if err != nil {
266 return err
267 }
268
Serge Bazanskicdb8c782020-02-17 12:34:02 +0100269 ip, err := s.Network.GetIP(ctx, true)
270 if err != nil {
271 return fmt.Errorf("could not get node IP: %v", err)
272 }
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100273 enrolmentConfig := &apipb.EnrolmentConfig{
274 EnrolmentSecret: []byte{}, // First node is always already enrolled
275 MastersCert: masterCert,
Serge Bazanskicdb8c782020-02-17 12:34:02 +0100276 MasterIps: [][]byte{[]byte(*ip)},
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100277 NodeId: nodeID,
278 }
279 enrolmentConfigRaw, err := proto.Marshal(enrolmentConfig)
280 if err != nil {
281 panic(err)
282 }
283 if err := ioutil.WriteFile(enrolmentPath, enrolmentConfigRaw, 0600); err != nil {
284 return err
285 }
286 masterCertFingerprint := sha512.Sum512_256(masterCert)
287 s.logger.Info("New Smalltown cluster successfully bootstrapped", zap.Binary("fingerprint", masterCertFingerprint[:]))
288
289 return nil
290}
291
292func (s *SmalltownNode) generateNodeID() ([]byte, string, error) {
293 serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 127)
294 serialNumber, err := rand.Int(rand.Reader, serialNumberLimit)
295 if err != nil {
296 return []byte{}, "", fmt.Errorf("Failed to generate serial number: %w", err)
297 }
298
299 pubKey, privKeyRaw, err := ed25519.GenerateKey(rand.Reader)
300 if err != nil {
301 return []byte{}, "", err
302 }
303 privkey, err := x509.MarshalPKCS8PrivateKey(privKeyRaw)
304 if err != nil {
305 return []byte{}, "", err
306 }
307
308 nodeKeyPath, err := s.Storage.GetPathInPlace(storage.PlaceData, "node-key.der")
309 if err != nil {
310 return []byte{}, "", err
311 }
312
313 if err := ioutil.WriteFile(nodeKeyPath, privkey, 0600); err != nil {
314 return []byte{}, "", fmt.Errorf("failed to write node key: %w", err)
315 }
316
317 name := "smalltown-" + base64.RawStdEncoding.EncodeToString([]byte(pubKey))
318
319 // This has no SANs because it authenticates by public key, not by name
320 nodeCert := &x509.Certificate{
321 SerialNumber: serialNumber,
322 Subject: pkix.Name{
323 // We identify nodes by their ID public keys (not hashed since a strong hash is longer and serves no benefit)
324 CommonName: name,
325 },
326 IsCA: false,
327 BasicConstraintsValid: true,
328 NotBefore: time.Now(),
329 NotAfter: unknownNotAfter,
330 // Certificate is used both as server & client
331 ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth, x509.ExtKeyUsageServerAuth},
332 }
333 cert, err := x509.CreateCertificate(rand.Reader, nodeCert, nodeCert, pubKey, privKeyRaw)
334 if err != nil {
335 return []byte{}, "", err
336 }
337
338 nodeCertPath, err := s.Storage.GetPathInPlace(storage.PlaceData, "node.der")
339 if err != nil {
340 return []byte{}, "", err
341 }
342
343 if err := ioutil.WriteFile(nodeCertPath, cert, 0600); err != nil {
344 return []byte{}, "", fmt.Errorf("failed to write node cert: %w", err)
345 }
346 return cert, name, nil
347}
348
349func (s *SmalltownNode) initNodeAPI() error {
350 certPath, err := s.Storage.GetPathInPlace(storage.PlaceData, "node.der")
351 if err != nil {
352 s.logger.Panic("Invariant violated: Data is available once this is called")
353 }
354 keyPath, err := s.Storage.GetPathInPlace(storage.PlaceData, "node-key.der")
355 if err != nil {
356 s.logger.Panic("Invariant violated: Data is available once this is called")
357 }
358
359 certRaw, err := ioutil.ReadFile(certPath)
360 if err != nil {
361 return err
362 }
363 privKeyRaw, err := ioutil.ReadFile(keyPath)
364 if err != nil {
365 return err
366 }
367
368 var nodeID tls.Certificate
369
370 cert, err := x509.ParseCertificate(certRaw)
371 if err != nil {
372 return err
373 }
374
375 privKey, err := x509.ParsePKCS8PrivateKey(privKeyRaw)
376 if err != nil {
377 return err
378 }
379
380 nodeID.Certificate = [][]byte{certRaw}
381 nodeID.PrivateKey = privKey
382 nodeID.Leaf = cert
383
384 secureTransport := &tls.Config{
385 Certificates: []tls.Certificate{nodeID},
386 ClientAuth: tls.RequireAndVerifyClientCert,
387 InsecureSkipVerify: true,
388 // Critical function, please review any changes with care
389 // TODO(lorenz): Actively check that this actually provides the security guarantees that we need
390 VerifyPeerCertificate: func(rawCerts [][]byte, verifiedChains [][]*x509.Certificate) error {
391 for _, cert := range rawCerts {
392 // X.509 certificates in DER can be compared like this since DER has a unique representation
393 // for each certificate.
394 if bytes.Equal(cert, s.enrolmentConfig.MastersCert) {
395 return nil
396 }
397 }
398 return errors.New("failed to find authorized NMS certificate")
399 },
400 MinVersion: tls.VersionTLS13,
401 }
402 secureTransportCreds := credentials.NewTLS(secureTransport)
403
404 masterListenHost := fmt.Sprintf(":%d", common.NodeServicePort)
405 lis, err := net.Listen("tcp", masterListenHost)
406 if err != nil {
407 s.logger.Fatal("failed to listen", zap.Error(err))
408 }
409
410 nodeGRPCServer := grpc.NewServer(grpc.Creds(secureTransportCreds))
411 apipb.RegisterNodeServiceServer(nodeGRPCServer, s)
412 go func() {
413 if err := nodeGRPCServer.Serve(lis); err != nil {
414 panic(err) // Can only happen during initialization and is always fatal
415 }
416 }()
417 return nil
418}
419
420func getIntegrityBackoff() *backoff.ExponentialBackOff {
421 unlockBackoff := backoff.NewExponentialBackOff()
422 unlockBackoff.MaxElapsedTime = time.Duration(0)
423 unlockBackoff.InitialInterval = 5 * time.Second
424 unlockBackoff.MaxInterval = 5 * time.Minute
425 return unlockBackoff
426}
427
428func (s *SmalltownNode) startFull() error {
429 s.logger.Info("Initializing subsystems for production")
430 s.state = common.StateJoined
431
432 trustAgent := tpm2.TPM2Agent{}
433 unlockOp := func() error {
434 unlockKey, err := trustAgent.Unlock(*s.enrolmentConfig)
435 if err != nil {
436 s.logger.Warn("Failed to unlock", zap.Error(err))
437 return err
438 }
439 if err := s.Storage.MountData(unlockKey); err != nil {
440 s.logger.Panic("Failed to mount storage", zap.Error(err))
441 return err
442 }
443 return nil
444 }
445
446 if err := backoff.Retry(unlockOp, getIntegrityBackoff()); err != nil {
447 s.logger.Panic("Invariant violated: Unlock retry can never fail")
448 }
449
450 s.initNodeAPI()
451
452 err := s.Consensus.Start()
453 if err != nil {
454 return err
455 }
456
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200457 err = s.Api.Start()
458 if err != nil {
459 s.logger.Error("Failed to start the API service", zap.Error(err))
460 return err
461 }
462
Lorenz Brun6e8f69c2019-11-18 10:44:24 +0100463 err = s.Kubernetes.Start()
464 if err != nil {
465 s.logger.Error("Failed to start the Kubernetes Service", zap.Error(err))
466 }
467
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200468 return nil
469}
470
471func (s *SmalltownNode) Stop() error {
472 s.logger.Info("Stopping Smalltown node")
473 return nil
474}