blob: 64867682eade22254e9fde206f9832ddef10d6fe [file] [log] [blame]
Lorenz Brun35fcf032023-06-29 04:15:58 +02001package update
2
3import (
4 "archive/zip"
5 "bytes"
6 "context"
Lorenz Brund14be0e2023-07-31 16:46:14 +02007 "debug/pe"
Lorenz Brun35fcf032023-06-29 04:15:58 +02008 "errors"
9 "fmt"
10 "io"
11 "net/http"
12 "os"
13 "path/filepath"
14 "regexp"
15 "strconv"
Lorenz Brund14be0e2023-07-31 16:46:14 +020016 "strings"
Lorenz Brun35fcf032023-06-29 04:15:58 +020017
18 "github.com/cenkalti/backoff/v4"
Lorenz Brund14be0e2023-07-31 16:46:14 +020019 "golang.org/x/sys/unix"
Lorenz Brun35fcf032023-06-29 04:15:58 +020020 "google.golang.org/grpc/codes"
21 "google.golang.org/grpc/status"
Lorenz Brun54a5a052023-10-02 16:40:11 +020022 "google.golang.org/protobuf/proto"
Lorenz Brun35fcf032023-06-29 04:15:58 +020023
24 "source.monogon.dev/metropolis/node/build/mkimage/osimage"
Lorenz Brun54a5a052023-10-02 16:40:11 +020025 abloaderpb "source.monogon.dev/metropolis/node/core/abloader/spec"
Lorenz Brun35fcf032023-06-29 04:15:58 +020026 "source.monogon.dev/metropolis/pkg/blockdev"
27 "source.monogon.dev/metropolis/pkg/efivarfs"
Tim Windelschmidt8e87a062023-07-31 01:33:10 +000028 "source.monogon.dev/metropolis/pkg/gpt"
Lorenz Brund14be0e2023-07-31 16:46:14 +020029 "source.monogon.dev/metropolis/pkg/kexec"
Lorenz Brun35fcf032023-06-29 04:15:58 +020030 "source.monogon.dev/metropolis/pkg/logtree"
31)
32
33// Service contains data and functionality to perform A/B updates on a
34// Metropolis node.
35type Service struct {
36 // Path to the mount point of the EFI System Partition (ESP).
37 ESPPath string
Tim Windelschmidt8e87a062023-07-31 01:33:10 +000038 // gpt.Partition of the ESP System Partition.
39 ESPPart *gpt.Partition
Lorenz Brun35fcf032023-06-29 04:15:58 +020040 // Partition number (1-based) of the ESP in the GPT partitions array.
41 ESPPartNumber uint32
Tim Windelschmidt8e87a062023-07-31 01:33:10 +000042
Lorenz Brun35fcf032023-06-29 04:15:58 +020043 // Logger service for the update service.
44 Logger logtree.LeveledLogger
45}
46
// Slot identifies one of the two system slots (A or B) of an A/B-updated
// node. The zero value is SlotInvalid.
type Slot int

const (
	// SlotInvalid is the zero value and does not refer to any slot.
	SlotInvalid Slot = 0
	// SlotA refers to system slot A.
	SlotA Slot = 1
	// SlotB refers to system slot B.
	SlotB Slot = 2
)

// Other returns the opposite slot: SlotB for SlotA and SlotA for SlotB. Any
// other value of s yields SlotInvalid.
func (s Slot) Other() Slot {
	if s == SlotA {
		return SlotB
	}
	if s == SlotB {
		return SlotA
	}
	return SlotInvalid
}

// String returns the single-letter name of the slot ("A" or "B"), or
// "<invalid slot>" for any other value.
func (s Slot) String() string {
	if s == SlotA {
		return "A"
	}
	if s == SlotB {
		return "B"
	}
	return "<invalid slot>"
}
78
79func (s Slot) EFIBootPath() string {
80 switch s {
81 case SlotA:
82 return osimage.EFIBootAPath
83 case SlotB:
84 return osimage.EFIBootBPath
85 default:
86 return ""
87 }
88}
89
// slotRegexp extracts the slot letter (A or B) from a
// PARTLABEL=METROPOLIS-SYSTEM-<slot> reference. CurrentlyRunningSlot applies
// it to the kernel command line.
var slotRegexp = regexp.MustCompile(`PARTLABEL=METROPOLIS-SYSTEM-([AB])`)
91
92// ProvideESP is a convenience function for providing information about the
93// ESP after the update service has been instantiated.
Tim Windelschmidt8e87a062023-07-31 01:33:10 +000094func (s *Service) ProvideESP(path string, partNum uint32, part *gpt.Partition) {
Lorenz Brun35fcf032023-06-29 04:15:58 +020095 s.ESPPath = path
96 s.ESPPartNumber = partNum
Tim Windelschmidt8e87a062023-07-31 01:33:10 +000097 s.ESPPart = part
Lorenz Brun35fcf032023-06-29 04:15:58 +020098}
99
100// CurrentlyRunningSlot returns the slot the current system is booted from.
101func (s *Service) CurrentlyRunningSlot() Slot {
102 cmdline, err := os.ReadFile("/proc/cmdline")
103 if err != nil {
104 return SlotInvalid
105 }
106 slotMatches := slotRegexp.FindStringSubmatch(string(cmdline))
107 if len(slotMatches) != 2 {
108 return SlotInvalid
109 }
110 switch slotMatches[1] {
111 case "A":
112 return SlotA
113 case "B":
114 return SlotB
115 default:
116 panic("unreachable")
117 }
118}
119
// bootVarRegexp matches EFI variable names of the form BootXXXX, where XXXX
// are exactly four hexadecimal digits, and captures those digits.
var bootVarRegexp = regexp.MustCompile(`^Boot([0-9A-Fa-f]{4})$`)
121
122func (s *Service) getAllBootEntries() (map[int]*efivarfs.LoadOption, error) {
123 res := make(map[int]*efivarfs.LoadOption)
124 varNames, err := efivarfs.List(efivarfs.ScopeGlobal)
125 if err != nil {
126 return nil, fmt.Errorf("failed to list EFI variables: %w", err)
127 }
128 for _, varName := range varNames {
129 m := bootVarRegexp.FindStringSubmatch(varName)
130 if m == nil {
131 continue
132 }
133 idx, err := strconv.ParseUint(m[1], 16, 16)
134 if err != nil {
135 // This cannot be hit as all regexp matches are parseable.
136 panic(err)
137 }
138 e, err := efivarfs.GetBootEntry(int(idx))
139 if err != nil {
Lorenz Brun95636732023-08-07 16:59:40 +0200140 s.Logger.Warningf("Unable to get boot entry %d, skipping: %v", idx, err)
141 continue
Lorenz Brun35fcf032023-06-29 04:15:58 +0200142 }
143 res[int(idx)] = e
144 }
145 return res, nil
146}
147
Lorenz Brun35fcf032023-06-29 04:15:58 +0200148// MarkBootSuccessful must be called after each boot if some implementation-
149// defined criteria for a successful boot are met. If an update has been
150// installed and booted and this function is called, the updated version is
151// marked as default. If an issue occurs during boot and so this function is
152// not called the old version will be started again on next boot.
153func (s *Service) MarkBootSuccessful() error {
154 if s.ESPPath == "" {
155 return errors.New("no ESP information provided to update service, cannot continue")
156 }
Lorenz Brun35fcf032023-06-29 04:15:58 +0200157 activeSlot := s.CurrentlyRunningSlot()
Lorenz Brun54a5a052023-10-02 16:40:11 +0200158 abState, err := s.getABState()
Lorenz Brun35fcf032023-06-29 04:15:58 +0200159 if err != nil {
Lorenz Brun54a5a052023-10-02 16:40:11 +0200160 s.Logger.Warningf("Error while getting A/B loader state, recreating: %v", err)
161 abState = &abloaderpb.ABLoaderData{
162 ActiveSlot: abloaderpb.Slot(activeSlot),
Lorenz Brun35fcf032023-06-29 04:15:58 +0200163 }
Lorenz Brun54a5a052023-10-02 16:40:11 +0200164 err := s.setABState(abState)
165 if err != nil {
166 return fmt.Errorf("while recreating A/B loader state: %w", err)
Lorenz Brun35fcf032023-06-29 04:15:58 +0200167 }
168 }
Lorenz Brun54a5a052023-10-02 16:40:11 +0200169 if Slot(abState.ActiveSlot) != activeSlot {
170 err := s.setABState(&abloaderpb.ABLoaderData{
171 ActiveSlot: abloaderpb.Slot(activeSlot),
172 })
173 if err != nil {
174 return fmt.Errorf("while setting next A/B slot: %w", err)
Lorenz Brun35fcf032023-06-29 04:15:58 +0200175 }
176 s.Logger.Infof("Permanently activated slot %v", activeSlot)
177 } else {
178 s.Logger.Infof("Normal boot from slot %v", activeSlot)
179 }
180
181 return nil
182}
183
184func openSystemSlot(slot Slot) (*blockdev.Device, error) {
185 switch slot {
186 case SlotA:
187 return blockdev.Open("/dev/system-a")
188 case SlotB:
189 return blockdev.Open("/dev/system-b")
190 default:
191 return nil, errors.New("invalid slot identifier given")
192 }
193}
194
Lorenz Brun54a5a052023-10-02 16:40:11 +0200195func (s *Service) getABState() (*abloaderpb.ABLoaderData, error) {
196 abDataRaw, err := os.ReadFile(filepath.Join(s.ESPPath, "EFI/metropolis/loader_state.pb"))
197 if err != nil {
198 return nil, err
199 }
200 var abData abloaderpb.ABLoaderData
201 if err := proto.Unmarshal(abDataRaw, &abData); err != nil {
202 return nil, err
203 }
204 return &abData, nil
205}
206
207func (s *Service) setABState(d *abloaderpb.ABLoaderData) error {
208 abDataRaw, err := proto.Marshal(d)
209 if err != nil {
210 return fmt.Errorf("while marshaling: %w", err)
211 }
212 if err := os.WriteFile(filepath.Join(s.ESPPath, "EFI/metropolis/loader_state.pb"), abDataRaw, 0666); err != nil {
213 return err
214 }
215 return nil
216}
217
Lorenz Brun35fcf032023-06-29 04:15:58 +0200218// InstallBundle installs the bundle at the given HTTP(S) URL into the currently
219// inactive slot and sets that slot to boot next. If it doesn't return an error,
220// a reboot boots into the new slot.
Lorenz Brund14be0e2023-07-31 16:46:14 +0200221func (s *Service) InstallBundle(ctx context.Context, bundleURL string, withKexec bool) error {
Lorenz Brun35fcf032023-06-29 04:15:58 +0200222 if s.ESPPath == "" {
223 return errors.New("no ESP information provided to update service, cannot continue")
224 }
225 // Download into a buffer as ZIP files cannot efficiently be read from
226 // HTTP in Go as the ReaderAt has no way of indicating continuous sections,
227 // thus a ton of small range requests would need to be used, causing
228 // a huge latency penalty as well as costing a lot of money on typical
229 // object storages. This should go away when we switch to a better bundle
230 // format which can be streamed.
231 var bundleRaw bytes.Buffer
232 b := backoff.NewExponentialBackOff()
233 err := backoff.Retry(func() error {
234 return s.tryDownloadBundle(ctx, bundleURL, &bundleRaw)
235 }, backoff.WithContext(b, ctx))
236 if err != nil {
237 return fmt.Errorf("error downloading Metropolis bundle: %v", err)
238 }
239 bundle, err := zip.NewReader(bytes.NewReader(bundleRaw.Bytes()), int64(bundleRaw.Len()))
240 if err != nil {
241 return fmt.Errorf("failed to open node bundle: %w", err)
242 }
243 efiPayload, err := bundle.Open("kernel_efi.efi")
244 if err != nil {
245 return fmt.Errorf("invalid bundle: %w", err)
246 }
247 defer efiPayload.Close()
248 systemImage, err := bundle.Open("verity_rootfs.img")
249 if err != nil {
250 return fmt.Errorf("invalid bundle: %w", err)
251 }
252 defer systemImage.Close()
253 activeSlot := s.CurrentlyRunningSlot()
254 if activeSlot == SlotInvalid {
255 return errors.New("unable to determine active slot, cannot continue")
256 }
257 targetSlot := activeSlot.Other()
258
Lorenz Brun35fcf032023-06-29 04:15:58 +0200259 systemPart, err := openSystemSlot(targetSlot)
260 if err != nil {
261 return status.Errorf(codes.Internal, "Inactive system slot unavailable: %v", err)
262 }
263 defer systemPart.Close()
264 if _, err := io.Copy(blockdev.NewRWS(systemPart), systemImage); err != nil {
265 return status.Errorf(codes.Unavailable, "Failed to copy system image: %v", err)
266 }
267
268 bootFile, err := os.Create(filepath.Join(s.ESPPath, targetSlot.EFIBootPath()))
269 if err != nil {
270 return fmt.Errorf("failed to open boot file: %w", err)
271 }
272 defer bootFile.Close()
273 if _, err := io.Copy(bootFile, efiPayload); err != nil {
274 return fmt.Errorf("failed to write boot file: %w", err)
275 }
276
Lorenz Brund14be0e2023-07-31 16:46:14 +0200277 if withKexec {
278 if err := s.stageKexec(bootFile, targetSlot); err != nil {
279 return fmt.Errorf("while kexec staging: %w", err)
280 }
281 } else {
Lorenz Brun54a5a052023-10-02 16:40:11 +0200282 err := s.setABState(&abloaderpb.ABLoaderData{
283 ActiveSlot: abloaderpb.Slot(activeSlot),
284 NextSlot: abloaderpb.Slot(targetSlot),
285 })
286 if err != nil {
287 return fmt.Errorf("while setting next A/B slot: %w", err)
Lorenz Brund14be0e2023-07-31 16:46:14 +0200288 }
Lorenz Brun35fcf032023-06-29 04:15:58 +0200289 }
290
291 return nil
292}
293
294func (*Service) tryDownloadBundle(ctx context.Context, bundleURL string, bundleRaw *bytes.Buffer) error {
295 bundleReq, err := http.NewRequestWithContext(ctx, "GET", bundleURL, nil)
296 bundleRes, err := http.DefaultClient.Do(bundleReq)
297 if err != nil {
298 return fmt.Errorf("HTTP request failed: %w", err)
299 }
300 defer bundleRes.Body.Close()
301 switch bundleRes.StatusCode {
302 case http.StatusTooEarly, http.StatusTooManyRequests,
303 http.StatusInternalServerError, http.StatusBadGateway,
304 http.StatusServiceUnavailable, http.StatusGatewayTimeout:
305 return fmt.Errorf("HTTP error %d", bundleRes.StatusCode)
306 default:
307 // Non-standard code range used for proxy-related issue by various
308 // vendors. Treat as non-permanent error.
309 if bundleRes.StatusCode >= 520 && bundleRes.StatusCode < 599 {
310 return fmt.Errorf("HTTP error %d", bundleRes.StatusCode)
311 }
312 if bundleRes.StatusCode != 200 {
313 return backoff.Permanent(fmt.Errorf("HTTP error %d", bundleRes.StatusCode))
314 }
315 }
316 if _, err := bundleRaw.ReadFrom(bundleRes.Body); err != nil {
317 bundleRaw.Reset()
318 return err
319 }
320 return nil
321}
Lorenz Brund14be0e2023-07-31 16:46:14 +0200322
323// newMemfile creates a new file which is not located on a specific filesystem,
324// but is instead backed by anonymous memory.
325func newMemfile(name string, flags int) (*os.File, error) {
326 fd, err := unix.MemfdCreate(name, flags)
327 if err != nil {
328 return nil, fmt.Errorf("memfd_create: %w", err)
329 }
330 return os.NewFile(uintptr(fd), name), nil
331}
332
333// stageKexec stages the kernel, command line and initramfs if available for
334// a future kexec. It extracts the relevant data from the EFI boot executable.
335func (s *Service) stageKexec(bootFile io.ReaderAt, targetSlot Slot) error {
336 bootPE, err := pe.NewFile(bootFile)
337 if err != nil {
338 return fmt.Errorf("unable to open bootFile as PE: %w", err)
339 }
340 var cmdlineRaw []byte
341 cmdlineSection := bootPE.Section(".cmdline")
342 if cmdlineSection == nil {
343 return fmt.Errorf("no .cmdline section in boot PE")
344 }
345 cmdlineRaw, err = cmdlineSection.Data()
346 if err != nil {
347 return fmt.Errorf("while reading .cmdline PE section: %w", err)
348 }
349 cmdline := string(bytes.TrimRight(cmdlineRaw, "\x00"))
350 cmdline = strings.ReplaceAll(cmdline, "METROPOLIS-SYSTEM-X", fmt.Sprintf("METROPOLIS-SYSTEM-%s", targetSlot))
351 kernelFile, err := newMemfile("kernel", 0)
352 if err != nil {
353 return fmt.Errorf("failed to create kernel memfile: %w", err)
354 }
355 defer kernelFile.Close()
356 kernelSection := bootPE.Section(".linux")
357 if kernelSection == nil {
358 return fmt.Errorf("no .linux section in boot PE")
359 }
360 if _, err := io.Copy(kernelFile, kernelSection.Open()); err != nil {
361 return fmt.Errorf("while copying .linux PE section: %w", err)
362 }
363
364 initramfsSection := bootPE.Section(".initrd")
365 var initramfsFile *os.File
366 if initramfsSection != nil && initramfsSection.Size > 0 {
367 initramfsFile, err = newMemfile("initramfs", 0)
368 if err != nil {
369 return fmt.Errorf("failed to create initramfs memfile: %w", err)
370 }
371 defer initramfsFile.Close()
372 if _, err := io.Copy(initramfsFile, initramfsSection.Open()); err != nil {
373 return fmt.Errorf("while copying .initrd PE section: %w", err)
374 }
375 }
376 if err := kexec.FileLoad(kernelFile, initramfsFile, cmdline); err != nil {
377 return fmt.Errorf("while staging new kexec kernel: %w", err)
378 }
379 return nil
380}