blob: 92e2e88627f15cf133bdac64ca2bf1f5ed6535f1 [file] [log] [blame]
Lorenz Brun35fcf032023-06-29 04:15:58 +02001package update
2
3import (
4 "archive/zip"
5 "bytes"
6 "context"
7 "errors"
8 "fmt"
9 "io"
10 "net/http"
11 "os"
12 "path/filepath"
13 "regexp"
14 "strconv"
15
16 "github.com/cenkalti/backoff/v4"
17 "github.com/google/uuid"
18 "google.golang.org/grpc/codes"
19 "google.golang.org/grpc/status"
20
21 "source.monogon.dev/metropolis/node/build/mkimage/osimage"
22 "source.monogon.dev/metropolis/pkg/blockdev"
23 "source.monogon.dev/metropolis/pkg/efivarfs"
24 "source.monogon.dev/metropolis/pkg/logtree"
25)
26
27// Service contains data and functionality to perform A/B updates on a
28// Metropolis node.
29type Service struct {
30 // Path to the mount point of the EFI System Partition (ESP).
31 ESPPath string
32 // UUID of the ESP System Partition.
33 ESPUUID uuid.UUID
34 // Partition number (1-based) of the ESP in the GPT partitions array.
35 ESPPartNumber uint32
36 // Logger service for the update service.
37 Logger logtree.LeveledLogger
38}
39
// Slot identifies one of the two system slots used for A/B updates.
type Slot int

const (
	// SlotInvalid is the zero value of Slot and does not refer to any slot.
	SlotInvalid Slot = iota
	// SlotA refers to the first system slot.
	SlotA
	// SlotB refers to the second system slot.
	SlotB
)
47
48// Other returns the "other" slot, i.e. returns slot A for B and B for A.
49// It returns SlotInvalid for any s which is not SlotA or SlotB.
50func (s Slot) Other() Slot {
51 switch s {
52 case SlotA:
53 return SlotB
54 case SlotB:
55 return SlotA
56 default:
57 return SlotInvalid
58 }
59}
60
61func (s Slot) String() string {
62 switch s {
63 case SlotA:
64 return "A"
65 case SlotB:
66 return "B"
67 default:
68 return "<invalid slot>"
69 }
70}
71
72func (s Slot) EFIBootPath() string {
73 switch s {
74 case SlotA:
75 return osimage.EFIBootAPath
76 case SlotB:
77 return osimage.EFIBootBPath
78 default:
79 return ""
80 }
81}
82
// slotRegexp captures the slot letter (A or B) from a system partition label
// reference. CurrentlyRunningSlot applies it to the kernel command line.
var slotRegexp = regexp.MustCompile(`PARTLABEL=METROPOLIS-SYSTEM-([AB])`)
84
85// ProvideESP is a convenience function for providing information about the
86// ESP after the update service has been instantiated.
87func (s *Service) ProvideESP(path string, partUUID uuid.UUID, partNum uint32) {
88 s.ESPPath = path
89 s.ESPPartNumber = partNum
90 s.ESPUUID = partUUID
91}
92
93// CurrentlyRunningSlot returns the slot the current system is booted from.
94func (s *Service) CurrentlyRunningSlot() Slot {
95 cmdline, err := os.ReadFile("/proc/cmdline")
96 if err != nil {
97 return SlotInvalid
98 }
99 slotMatches := slotRegexp.FindStringSubmatch(string(cmdline))
100 if len(slotMatches) != 2 {
101 return SlotInvalid
102 }
103 switch slotMatches[1] {
104 case "A":
105 return SlotA
106 case "B":
107 return SlotB
108 default:
109 panic("unreachable")
110 }
111}
112
// bootVarRegexp matches variable names of the form BootXXXX and captures the
// four hexadecimal digits.
var bootVarRegexp = regexp.MustCompile(`^Boot([0-9A-Fa-f]{4})$`)
114
115func (s *Service) getAllBootEntries() (map[int]*efivarfs.LoadOption, error) {
116 res := make(map[int]*efivarfs.LoadOption)
117 varNames, err := efivarfs.List(efivarfs.ScopeGlobal)
118 if err != nil {
119 return nil, fmt.Errorf("failed to list EFI variables: %w", err)
120 }
121 for _, varName := range varNames {
122 m := bootVarRegexp.FindStringSubmatch(varName)
123 if m == nil {
124 continue
125 }
126 idx, err := strconv.ParseUint(m[1], 16, 16)
127 if err != nil {
128 // This cannot be hit as all regexp matches are parseable.
129 panic(err)
130 }
131 e, err := efivarfs.GetBootEntry(int(idx))
132 if err != nil {
133 return nil, fmt.Errorf("failed to get boot entry %d: %w", idx, err)
134 }
135 res[int(idx)] = e
136 }
137 return res, nil
138}
139
140func (s *Service) getOrMakeBootEntry(existing map[int]*efivarfs.LoadOption, slot Slot) (int, error) {
141 for idx, e := range existing {
142 if len(e.FilePath) != 2 {
143 // Not our entry
144 continue
145 }
146 switch p := e.FilePath[0].(type) {
147 case *efivarfs.HardDrivePath:
148 gptMatch, ok := p.PartitionMatch.(*efivarfs.PartitionGPT)
149 if ok && gptMatch.PartitionUUID != s.ESPUUID {
150 // Not related to our ESP
151 continue
152 }
153 default:
154 continue
155 }
156 switch p := e.FilePath[1].(type) {
157 case efivarfs.FilePath:
158 if string(p) == slot.EFIBootPath() {
159 return idx, nil
160 }
161 default:
162 continue
163 }
164 }
165 newEntry := &efivarfs.LoadOption{
166 Description: fmt.Sprintf("Metropolis Slot %s", slot),
167 FilePath: efivarfs.DevicePath{
168 &efivarfs.HardDrivePath{
169 PartitionNumber: s.ESPPartNumber,
170 PartitionMatch: efivarfs.PartitionGPT{
171 PartitionUUID: s.ESPUUID,
172 },
173 },
174 efivarfs.FilePath(slot.EFIBootPath()),
175 },
176 }
177 newIdx, err := efivarfs.AddBootEntry(newEntry)
178 if err == nil {
179 existing[newIdx] = newEntry
180 }
181 return newIdx, err
182}
183
184// MarkBootSuccessful must be called after each boot if some implementation-
185// defined criteria for a successful boot are met. If an update has been
186// installed and booted and this function is called, the updated version is
187// marked as default. If an issue occurs during boot and so this function is
188// not called the old version will be started again on next boot.
189func (s *Service) MarkBootSuccessful() error {
190 if s.ESPPath == "" {
191 return errors.New("no ESP information provided to update service, cannot continue")
192 }
193 bootEntries, err := s.getAllBootEntries()
194 if err != nil {
195 return fmt.Errorf("while getting boot entries: %w", err)
196 }
197 aIdx, err := s.getOrMakeBootEntry(bootEntries, SlotA)
198 if err != nil {
199 return fmt.Errorf("while ensuring slot A boot entry: %w", err)
200 }
201 bIdx, err := s.getOrMakeBootEntry(bootEntries, SlotB)
202 if err != nil {
203 return fmt.Errorf("while ensuring slot B boot entry: %w", err)
204 }
205
206 activeSlot := s.CurrentlyRunningSlot()
207 firstSlot := SlotInvalid
208
209 ord, err := efivarfs.GetBootOrder()
210 if err != nil {
211 return fmt.Errorf("failed to get boot order: %w", err)
212 }
213
214 for _, e := range ord {
215 if int(e) == aIdx {
216 firstSlot = SlotA
217 break
218 }
219 if int(e) == bIdx {
220 firstSlot = SlotB
221 break
222 }
223 }
224
225 if firstSlot == SlotInvalid {
226 bootOrder := make(efivarfs.BootOrder, 2)
227 switch activeSlot {
228 case SlotA:
229 bootOrder[0], bootOrder[1] = uint16(aIdx), uint16(bIdx)
230 case SlotB:
231 bootOrder[0], bootOrder[1] = uint16(bIdx), uint16(aIdx)
232 default:
233 return fmt.Errorf("invalid active slot")
234 }
235 efivarfs.SetBootOrder(bootOrder)
236 s.Logger.Infof("Metropolis missing from boot order, recreated it")
237 } else if activeSlot != firstSlot {
238 var aPos, bPos int
239 for i, e := range ord {
240 if int(e) == aIdx {
241 aPos = i
242 }
243 if int(e) == bIdx {
244 bPos = i
245 }
246 }
247 // swap A and B slots in boot order
248 ord[aPos], ord[bPos] = ord[bPos], ord[aPos]
249 if err := efivarfs.SetBootOrder(ord); err != nil {
250 return fmt.Errorf("failed to set boot order to permanently switch slot: %w", err)
251 }
252 s.Logger.Infof("Permanently activated slot %v", activeSlot)
253 } else {
254 s.Logger.Infof("Normal boot from slot %v", activeSlot)
255 }
256
257 return nil
258}
259
260func openSystemSlot(slot Slot) (*blockdev.Device, error) {
261 switch slot {
262 case SlotA:
263 return blockdev.Open("/dev/system-a")
264 case SlotB:
265 return blockdev.Open("/dev/system-b")
266 default:
267 return nil, errors.New("invalid slot identifier given")
268 }
269}
270
271// InstallBundle installs the bundle at the given HTTP(S) URL into the currently
272// inactive slot and sets that slot to boot next. If it doesn't return an error,
273// a reboot boots into the new slot.
274func (s *Service) InstallBundle(ctx context.Context, bundleURL string) error {
275 if s.ESPPath == "" {
276 return errors.New("no ESP information provided to update service, cannot continue")
277 }
278 // Download into a buffer as ZIP files cannot efficiently be read from
279 // HTTP in Go as the ReaderAt has no way of indicating continuous sections,
280 // thus a ton of small range requests would need to be used, causing
281 // a huge latency penalty as well as costing a lot of money on typical
282 // object storages. This should go away when we switch to a better bundle
283 // format which can be streamed.
284 var bundleRaw bytes.Buffer
285 b := backoff.NewExponentialBackOff()
286 err := backoff.Retry(func() error {
287 return s.tryDownloadBundle(ctx, bundleURL, &bundleRaw)
288 }, backoff.WithContext(b, ctx))
289 if err != nil {
290 return fmt.Errorf("error downloading Metropolis bundle: %v", err)
291 }
292 bundle, err := zip.NewReader(bytes.NewReader(bundleRaw.Bytes()), int64(bundleRaw.Len()))
293 if err != nil {
294 return fmt.Errorf("failed to open node bundle: %w", err)
295 }
296 efiPayload, err := bundle.Open("kernel_efi.efi")
297 if err != nil {
298 return fmt.Errorf("invalid bundle: %w", err)
299 }
300 defer efiPayload.Close()
301 systemImage, err := bundle.Open("verity_rootfs.img")
302 if err != nil {
303 return fmt.Errorf("invalid bundle: %w", err)
304 }
305 defer systemImage.Close()
306 activeSlot := s.CurrentlyRunningSlot()
307 if activeSlot == SlotInvalid {
308 return errors.New("unable to determine active slot, cannot continue")
309 }
310 targetSlot := activeSlot.Other()
311
312 bootEntries, err := s.getAllBootEntries()
313 if err != nil {
314 return fmt.Errorf("while getting boot entries: %w", err)
315 }
316 targetSlotBootEntryIdx, err := s.getOrMakeBootEntry(bootEntries, targetSlot)
317 if err != nil {
318 return fmt.Errorf("while ensuring target slot boot entry: %w", err)
319 }
320 targetSlotBootEntry := bootEntries[targetSlotBootEntryIdx]
321
322 // Disable boot entry while the corresponding slot is being modified.
323 targetSlotBootEntry.Inactive = true
324 if err := efivarfs.SetBootEntry(targetSlotBootEntryIdx, targetSlotBootEntry); err != nil {
325 return fmt.Errorf("failed setting boot entry %d inactive: %w", targetSlotBootEntryIdx, err)
326 }
327
328 systemPart, err := openSystemSlot(targetSlot)
329 if err != nil {
330 return status.Errorf(codes.Internal, "Inactive system slot unavailable: %v", err)
331 }
332 defer systemPart.Close()
333 if _, err := io.Copy(blockdev.NewRWS(systemPart), systemImage); err != nil {
334 return status.Errorf(codes.Unavailable, "Failed to copy system image: %v", err)
335 }
336
337 bootFile, err := os.Create(filepath.Join(s.ESPPath, targetSlot.EFIBootPath()))
338 if err != nil {
339 return fmt.Errorf("failed to open boot file: %w", err)
340 }
341 defer bootFile.Close()
342 if _, err := io.Copy(bootFile, efiPayload); err != nil {
343 return fmt.Errorf("failed to write boot file: %w", err)
344 }
345
346 // Reenable target slot boot entry after boot and system have been written
347 // fully. The slot should now be bootable again.
348 targetSlotBootEntry.Inactive = false
349 if err := efivarfs.SetBootEntry(targetSlotBootEntryIdx, targetSlotBootEntry); err != nil {
350 return fmt.Errorf("failed setting boot entry %d active: %w", targetSlotBootEntryIdx, err)
351 }
352
353 if err := efivarfs.SetBootNext(uint16(targetSlotBootEntryIdx)); err != nil {
354 return fmt.Errorf("failed to set BootNext variable: %w", err)
355 }
356
357 return nil
358}
359
360func (*Service) tryDownloadBundle(ctx context.Context, bundleURL string, bundleRaw *bytes.Buffer) error {
361 bundleReq, err := http.NewRequestWithContext(ctx, "GET", bundleURL, nil)
362 bundleRes, err := http.DefaultClient.Do(bundleReq)
363 if err != nil {
364 return fmt.Errorf("HTTP request failed: %w", err)
365 }
366 defer bundleRes.Body.Close()
367 switch bundleRes.StatusCode {
368 case http.StatusTooEarly, http.StatusTooManyRequests,
369 http.StatusInternalServerError, http.StatusBadGateway,
370 http.StatusServiceUnavailable, http.StatusGatewayTimeout:
371 return fmt.Errorf("HTTP error %d", bundleRes.StatusCode)
372 default:
373 // Non-standard code range used for proxy-related issue by various
374 // vendors. Treat as non-permanent error.
375 if bundleRes.StatusCode >= 520 && bundleRes.StatusCode < 599 {
376 return fmt.Errorf("HTTP error %d", bundleRes.StatusCode)
377 }
378 if bundleRes.StatusCode != 200 {
379 return backoff.Permanent(fmt.Errorf("HTTP error %d", bundleRes.StatusCode))
380 }
381 }
382 if _, err := bundleRaw.ReadFrom(bundleRes.Body); err != nil {
383 bundleRaw.Reset()
384 return err
385 }
386 return nil
387}