blob: b1d74d6b2e8d2eb193d0c8f3ce0605379bac04d4 [file] [log] [blame]
// Copyright 2020 The Monogon Project Authors.
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
16
17package node
18
19import (
Lorenz Brunaa6b7342019-12-12 02:55:02 +010020 "bytes"
21 "crypto/ed25519"
22 "crypto/rand"
23 "crypto/sha512"
24 "crypto/tls"
25 "crypto/x509"
26 "crypto/x509/pkix"
27 "encoding/base64"
28 "errors"
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +020029 "flag"
Lorenz Brunaa6b7342019-12-12 02:55:02 +010030 "fmt"
31 "io/ioutil"
32 "math/big"
33 "net"
34 "time"
Lorenz Brun6e8f69c2019-11-18 10:44:24 +010035
Lorenz Brunaa6b7342019-12-12 02:55:02 +010036 "os"
37
38 apipb "git.monogon.dev/source/nexantic.git/core/generated/api"
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +020039 "git.monogon.dev/source/nexantic.git/core/internal/api"
40 "git.monogon.dev/source/nexantic.git/core/internal/common"
41 "git.monogon.dev/source/nexantic.git/core/internal/consensus"
Lorenz Brunaa6b7342019-12-12 02:55:02 +010042 "git.monogon.dev/source/nexantic.git/core/internal/integrity/tpm2"
Lorenz Brun6e8f69c2019-11-18 10:44:24 +010043 "git.monogon.dev/source/nexantic.git/core/internal/kubernetes"
Lorenz Brunaa6b7342019-12-12 02:55:02 +010044 "git.monogon.dev/source/nexantic.git/core/internal/network"
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +020045 "git.monogon.dev/source/nexantic.git/core/internal/storage"
Lorenz Brunaa6b7342019-12-12 02:55:02 +010046 "github.com/cenkalti/backoff/v4"
47 "google.golang.org/grpc"
48 "google.golang.org/grpc/credentials"
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +020049
Lorenz Brunaa6b7342019-12-12 02:55:02 +010050 "github.com/gogo/protobuf/proto"
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +020051 "go.uber.org/zap"
52)
53
// unknownNotAfter is 9999-12-31T23:59:59Z expressed as a Unix timestamp. RFC
// 5280 section 4.1.2.5 designates this value as the notAfter date for
// certificates that have no well-defined expiration.
var unknownNotAfter = time.Unix(253402300799, 0)
58
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +020059type (
60 SmalltownNode struct {
Lorenz Brun6e8f69c2019-11-18 10:44:24 +010061 Api *api.Server
62 Consensus *consensus.Service
63 Storage *storage.Manager
64 Kubernetes *kubernetes.Service
Lorenz Brunaa6b7342019-12-12 02:55:02 +010065 Network *network.Service
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +020066
Lorenz Brunaa6b7342019-12-12 02:55:02 +010067 logger *zap.Logger
68 state common.SmalltownState
69 hostname string
70 enrolmentConfig *apipb.EnrolmentConfig
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +020071 }
72)
73
Lorenz Brunaa6b7342019-12-12 02:55:02 +010074func NewSmalltownNode(logger *zap.Logger) (*SmalltownNode, error) {
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +020075 flag.Parse()
76 logger.Info("Creating Smalltown node")
77
Leopold Schabel68c58752019-11-14 21:00:59 +010078 hostname, err := os.Hostname()
79 if err != nil {
80 panic(err)
81 }
82
Lorenz Brunaa6b7342019-12-12 02:55:02 +010083 networkService, err := network.NewNetworkService(network.Config{}, logger.With(zap.String("component", "network")))
84 if err != nil {
85 panic(err)
86 }
87
88 if err := networkService.Start(); err != nil {
89 logger.Panic("Failed to start network service", zap.Error(err))
90 }
91
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +020092 storageManager, err := storage.Initialize(logger.With(zap.String("component", "storage")))
93 if err != nil {
94 logger.Error("Failed to initialize storage manager", zap.Error(err))
95 return nil, err
96 }
Lorenz Brunaa6b7342019-12-12 02:55:02 +010097 externalIP := networkService.GetIP(true)
98 if externalIP == nil {
99 logger.Panic("Waited for IP but didn't get one")
100 }
101
102 // Important to know if the GetIP above hangs
103 logger.Info("Node has IP", zap.String("ip", externalIP.String()))
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200104
105 consensusService, err := consensus.NewConsensusService(consensus.Config{
Lorenz Brun6e8f69c2019-11-18 10:44:24 +0100106 Name: hostname,
Lorenz Brun6e8f69c2019-11-18 10:44:24 +0100107 ListenHost: "0.0.0.0",
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100108 ExternalHost: externalIP.String(),
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200109 }, logger.With(zap.String("module", "consensus")))
110 if err != nil {
111 return nil, err
112 }
113
114 s := &SmalltownNode{
115 Consensus: consensusService,
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200116 Storage: storageManager,
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100117 Network: networkService,
Leopold Schabel68c58752019-11-14 21:00:59 +0100118 logger: logger,
119 hostname: hostname,
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200120 }
121
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100122 apiService, err := api.NewApiServer(&api.Config{}, logger.With(zap.String("module", "api")), s.Consensus)
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200123 if err != nil {
124 return nil, err
125 }
126
127 s.Api = apiService
128
Lorenz Brun6e8f69c2019-11-18 10:44:24 +0100129 s.Kubernetes = kubernetes.New(logger.With(zap.String("module", "kubernetes")), consensusService)
130
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200131 logger.Info("Created SmalltownNode")
132
133 return s, nil
134}
135
136func (s *SmalltownNode) Start() error {
137 s.logger.Info("Starting Smalltown node")
138
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100139 // TODO(lorenz): Abstracting enrolment sounds like a good idea, but ends up being painful
140 // because of things like storage access. I'm keeping it this way until the more complex
141 // enrolment procedures are fleshed out. This is also a bit panic()-happy, but there is really
142 // no good way out of an invalid enrolment configuration.
143 enrolmentPath, err := s.Storage.GetPathInPlace(storage.PlaceESP, "enrolment.pb")
144 if err != nil {
145 s.logger.Panic("ESP configuration partition not available", zap.Error(err))
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200146 }
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100147 enrolmentConfigRaw, err := ioutil.ReadFile(enrolmentPath)
148 if err == nil {
149 // We have an enrolment file, let's check its contents
150 var enrolmentConfig apipb.EnrolmentConfig
151 if err := proto.Unmarshal(enrolmentConfigRaw, &enrolmentConfig); err != nil {
152 s.logger.Panic("Invalid enrolment configuration provided", zap.Error(err))
153 }
154 s.enrolmentConfig = &enrolmentConfig
155 // The enrolment secret is only zeroed after
156 if len(enrolmentConfig.EnrolmentSecret) == 0 {
157 return s.startFull()
158 }
159 return s.startEnrolling()
160 } else if os.IsNotExist(err) {
161 // This is ok like this, once a new cluster has been set up the initial node also generates
162 // its own enrolment config
163 return s.startForSetup()
164 }
165 // Unknown error reading enrolment config (disk issues/invalid configuration format/...)
166 s.logger.Panic("Invalid enrolment configuration provided", zap.Error(err))
167 panic("Unreachable")
168}
169
170func (s *SmalltownNode) startEnrolling() error {
171 s.logger.Info("Initializing subsystems for enrolment")
172 s.state = common.StateEnrollMode
173
174 nodeInfo, nodeID, err := s.InitializeNode()
175 if err != nil {
176 return err
177 }
178
179 // We only support TPM2 at the moment, any abstractions here would be premature
180 trustAgent := tpm2.TPM2Agent{}
181
182 initializeOp := func() error {
183 if err := trustAgent.Initialize(*nodeInfo, *s.enrolmentConfig); err != nil {
184 s.logger.Warn("Failed to initialize integrity backend", zap.Error(err))
185 return err
186 }
187 return nil
188 }
189
190 if err := backoff.Retry(initializeOp, getIntegrityBackoff()); err != nil {
191 panic("invariant violated: integrity initialization retry can never fail")
192 }
193
194 enrolmentPath, err := s.Storage.GetPathInPlace(storage.PlaceESP, "enrolment.pb")
195 if err != nil {
196 panic(err)
197 }
198
199 s.enrolmentConfig.EnrolmentSecret = []byte{}
200 s.enrolmentConfig.NodeId = nodeID
201
202 enrolmentConfigRaw, err := proto.Marshal(s.enrolmentConfig)
203 if err != nil {
204 panic(err)
205 }
206 if err := ioutil.WriteFile(enrolmentPath, enrolmentConfigRaw, 0600); err != nil {
207 return err
208 }
209 s.logger.Info("Node successfully enrolled")
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200210
211 return nil
212}
213
214func (s *SmalltownNode) startForSetup() error {
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100215 s.logger.Info("Setting up a new cluster")
216 initData, nodeID, err := s.InitializeNode()
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200217 if err != nil {
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200218 return err
219 }
220
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100221 if err := s.initNodeAPI(); err != nil {
222 return err
223 }
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200224
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100225 dataPath, err := s.Storage.GetPathInPlace(storage.PlaceData, "etcd")
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200226 if err != nil {
227 return err
228 }
229
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100230 // Spin up etcd
231 config := s.Consensus.GetConfig()
232 config.NewCluster = true
233 config.Name = s.hostname
234 config.DataDir = dataPath
235 s.Consensus.SetConfig(config)
236
237 // Generate the cluster CA and store it to local storage.
238 if err := s.Consensus.PrecreateCA(); err != nil {
239 return err
240 }
241
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200242 err = s.Consensus.Start()
243 if err != nil {
244 return err
245 }
246
Lorenz Brunaa6b7342019-12-12 02:55:02 +0100247 // Now that the cluster is up and running, we can persist the CA to the cluster.
248 if err := s.Consensus.InjectCA(); err != nil {
249 return err
250 }
251
252 if err := s.Api.BootstrapNewClusterHook(initData); err != nil {
253 return err
254 }
255
256 if err := s.Kubernetes.NewCluster(); err != nil {
257 return err
258 }
259
260 if err := s.Kubernetes.Start(); err != nil {
261 return err
262 }
263
264 if err := s.Api.Start(); err != nil {
265 s.logger.Error("Failed to start the API service", zap.Error(err))
266 return err
267 }
268
269 enrolmentPath, err := s.Storage.GetPathInPlace(storage.PlaceESP, "enrolment.pb")
270 if err != nil {
271 panic(err)
272 }
273
274 masterCert, err := s.Api.GetMasterCert()
275 if err != nil {
276 return err
277 }
278
279 enrolmentConfig := &apipb.EnrolmentConfig{
280 EnrolmentSecret: []byte{}, // First node is always already enrolled
281 MastersCert: masterCert,
282 MasterIps: [][]byte{[]byte(*s.Network.GetIP(true))},
283 NodeId: nodeID,
284 }
285 enrolmentConfigRaw, err := proto.Marshal(enrolmentConfig)
286 if err != nil {
287 panic(err)
288 }
289 if err := ioutil.WriteFile(enrolmentPath, enrolmentConfigRaw, 0600); err != nil {
290 return err
291 }
292 masterCertFingerprint := sha512.Sum512_256(masterCert)
293 s.logger.Info("New Smalltown cluster successfully bootstrapped", zap.Binary("fingerprint", masterCertFingerprint[:]))
294
295 return nil
296}
297
298func (s *SmalltownNode) generateNodeID() ([]byte, string, error) {
299 serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 127)
300 serialNumber, err := rand.Int(rand.Reader, serialNumberLimit)
301 if err != nil {
302 return []byte{}, "", fmt.Errorf("Failed to generate serial number: %w", err)
303 }
304
305 pubKey, privKeyRaw, err := ed25519.GenerateKey(rand.Reader)
306 if err != nil {
307 return []byte{}, "", err
308 }
309 privkey, err := x509.MarshalPKCS8PrivateKey(privKeyRaw)
310 if err != nil {
311 return []byte{}, "", err
312 }
313
314 nodeKeyPath, err := s.Storage.GetPathInPlace(storage.PlaceData, "node-key.der")
315 if err != nil {
316 return []byte{}, "", err
317 }
318
319 if err := ioutil.WriteFile(nodeKeyPath, privkey, 0600); err != nil {
320 return []byte{}, "", fmt.Errorf("failed to write node key: %w", err)
321 }
322
323 name := "smalltown-" + base64.RawStdEncoding.EncodeToString([]byte(pubKey))
324
325 // This has no SANs because it authenticates by public key, not by name
326 nodeCert := &x509.Certificate{
327 SerialNumber: serialNumber,
328 Subject: pkix.Name{
329 // We identify nodes by their ID public keys (not hashed since a strong hash is longer and serves no benefit)
330 CommonName: name,
331 },
332 IsCA: false,
333 BasicConstraintsValid: true,
334 NotBefore: time.Now(),
335 NotAfter: unknownNotAfter,
336 // Certificate is used both as server & client
337 ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth, x509.ExtKeyUsageServerAuth},
338 }
339 cert, err := x509.CreateCertificate(rand.Reader, nodeCert, nodeCert, pubKey, privKeyRaw)
340 if err != nil {
341 return []byte{}, "", err
342 }
343
344 nodeCertPath, err := s.Storage.GetPathInPlace(storage.PlaceData, "node.der")
345 if err != nil {
346 return []byte{}, "", err
347 }
348
349 if err := ioutil.WriteFile(nodeCertPath, cert, 0600); err != nil {
350 return []byte{}, "", fmt.Errorf("failed to write node cert: %w", err)
351 }
352 return cert, name, nil
353}
354
355func (s *SmalltownNode) initNodeAPI() error {
356 certPath, err := s.Storage.GetPathInPlace(storage.PlaceData, "node.der")
357 if err != nil {
358 s.logger.Panic("Invariant violated: Data is available once this is called")
359 }
360 keyPath, err := s.Storage.GetPathInPlace(storage.PlaceData, "node-key.der")
361 if err != nil {
362 s.logger.Panic("Invariant violated: Data is available once this is called")
363 }
364
365 certRaw, err := ioutil.ReadFile(certPath)
366 if err != nil {
367 return err
368 }
369 privKeyRaw, err := ioutil.ReadFile(keyPath)
370 if err != nil {
371 return err
372 }
373
374 var nodeID tls.Certificate
375
376 cert, err := x509.ParseCertificate(certRaw)
377 if err != nil {
378 return err
379 }
380
381 privKey, err := x509.ParsePKCS8PrivateKey(privKeyRaw)
382 if err != nil {
383 return err
384 }
385
386 nodeID.Certificate = [][]byte{certRaw}
387 nodeID.PrivateKey = privKey
388 nodeID.Leaf = cert
389
390 secureTransport := &tls.Config{
391 Certificates: []tls.Certificate{nodeID},
392 ClientAuth: tls.RequireAndVerifyClientCert,
393 InsecureSkipVerify: true,
394 // Critical function, please review any changes with care
395 // TODO(lorenz): Actively check that this actually provides the security guarantees that we need
396 VerifyPeerCertificate: func(rawCerts [][]byte, verifiedChains [][]*x509.Certificate) error {
397 for _, cert := range rawCerts {
398 // X.509 certificates in DER can be compared like this since DER has a unique representation
399 // for each certificate.
400 if bytes.Equal(cert, s.enrolmentConfig.MastersCert) {
401 return nil
402 }
403 }
404 return errors.New("failed to find authorized NMS certificate")
405 },
406 MinVersion: tls.VersionTLS13,
407 }
408 secureTransportCreds := credentials.NewTLS(secureTransport)
409
410 masterListenHost := fmt.Sprintf(":%d", common.NodeServicePort)
411 lis, err := net.Listen("tcp", masterListenHost)
412 if err != nil {
413 s.logger.Fatal("failed to listen", zap.Error(err))
414 }
415
416 nodeGRPCServer := grpc.NewServer(grpc.Creds(secureTransportCreds))
417 apipb.RegisterNodeServiceServer(nodeGRPCServer, s)
418 go func() {
419 if err := nodeGRPCServer.Serve(lis); err != nil {
420 panic(err) // Can only happen during initialization and is always fatal
421 }
422 }()
423 return nil
424}
425
426func getIntegrityBackoff() *backoff.ExponentialBackOff {
427 unlockBackoff := backoff.NewExponentialBackOff()
428 unlockBackoff.MaxElapsedTime = time.Duration(0)
429 unlockBackoff.InitialInterval = 5 * time.Second
430 unlockBackoff.MaxInterval = 5 * time.Minute
431 return unlockBackoff
432}
433
434func (s *SmalltownNode) startFull() error {
435 s.logger.Info("Initializing subsystems for production")
436 s.state = common.StateJoined
437
438 trustAgent := tpm2.TPM2Agent{}
439 unlockOp := func() error {
440 unlockKey, err := trustAgent.Unlock(*s.enrolmentConfig)
441 if err != nil {
442 s.logger.Warn("Failed to unlock", zap.Error(err))
443 return err
444 }
445 if err := s.Storage.MountData(unlockKey); err != nil {
446 s.logger.Panic("Failed to mount storage", zap.Error(err))
447 return err
448 }
449 return nil
450 }
451
452 if err := backoff.Retry(unlockOp, getIntegrityBackoff()); err != nil {
453 s.logger.Panic("Invariant violated: Unlock retry can never fail")
454 }
455
456 s.initNodeAPI()
457
458 err := s.Consensus.Start()
459 if err != nil {
460 return err
461 }
462
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200463 err = s.Api.Start()
464 if err != nil {
465 s.logger.Error("Failed to start the API service", zap.Error(err))
466 return err
467 }
468
Lorenz Brun6e8f69c2019-11-18 10:44:24 +0100469 err = s.Kubernetes.Start()
470 if err != nil {
471 s.logger.Error("Failed to start the Kubernetes Service", zap.Error(err))
472 }
473
Hendrik Hofstadt0d7c91e2019-10-23 21:44:47 +0200474 return nil
475}
476
477func (s *SmalltownNode) Stop() error {
478 s.logger.Info("Stopping Smalltown node")
479 return nil
480}