blob: 16dedf0d12d2dcdf8d5e7a62269f7d40fd76f9e1 [file] [log] [blame]
Tim Windelschmidt6d33a432025-02-04 14:34:25 +01001// Copyright The Monogon Project Authors.
2// SPDX-License-Identifier: Apache-2.0
3
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +02004package main
5
6import (
7 "context"
Tim Windelschmidtd5f851b2024-04-23 14:59:37 +02008 "errors"
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +02009 "fmt"
10 "net/netip"
11 "slices"
12 "strings"
13 "time"
14
15 "github.com/packethost/packngo"
16 "k8s.io/klog/v2"
17
18 "source.monogon.dev/cloud/bmaas/bmdb"
19 "source.monogon.dev/cloud/bmaas/bmdb/model"
20 "source.monogon.dev/cloud/equinix/wrapngo"
21 "source.monogon.dev/cloud/lib/sinbin"
22 "source.monogon.dev/cloud/shepherd"
23 "source.monogon.dev/cloud/shepherd/manager"
24)
25
26type equinixProvider struct {
27 config *providerConfig
28 api wrapngo.Client
29 sshKey *manager.SSHKey
30
31 // badReservations is a holiday resort for Equinix hardware reservations which
32 // failed to be provisioned for some reason or another. We keep a list of them in
33 // memory just so that we don't repeatedly try to provision the same known bad
34 // machines.
35 badReservations sinbin.Sinbin[string]
36
37 reservationDeadline time.Time
38 reservationCache []packngo.HardwareReservation
39}
40
41func (ep *equinixProvider) RebootMachine(ctx context.Context, id shepherd.ProviderID) error {
42 if err := ep.api.RebootDevice(ctx, string(id)); err != nil {
43 return fmt.Errorf("failed to reboot device: %w", err)
44 }
45
46 // TODO(issue/215): replace this
47 // This is required as Equinix doesn't reboot the machines synchronously
48 // during the API call.
49 select {
50 case <-time.After(time.Duration(ep.config.RebootWaitSeconds) * time.Second):
51 case <-ctx.Done():
52 return fmt.Errorf("while waiting for reboot: %w", ctx.Err())
53 }
54 return nil
55}
56
57func (ep *equinixProvider) ReinstallMachine(ctx context.Context, id shepherd.ProviderID) error {
58 return shepherd.ErrNotImplemented
59}
60
61func (ep *equinixProvider) GetMachine(ctx context.Context, id shepherd.ProviderID) (shepherd.Machine, error) {
62 machines, err := ep.ListMachines(ctx)
63 if err != nil {
64 return nil, err
65 }
66
67 for _, machine := range machines {
68 if machine.ID() == id {
69 return machine, nil
70 }
71 }
72
73 return nil, shepherd.ErrMachineNotFound
74}
75
76func (ep *equinixProvider) ListMachines(ctx context.Context) ([]shepherd.Machine, error) {
77 if ep.reservationDeadline.Before(time.Now()) {
78 reservations, err := ep.listReservations(ctx)
79 if err != nil {
80 return nil, err
81 }
82 ep.reservationCache = reservations
83 ep.reservationDeadline = time.Now().Add(ep.config.ReservationCacheTimeout)
84 }
85
86 devices, err := ep.managedDevices(ctx)
87 if err != nil {
88 return nil, err
89 }
90
91 machines := make([]shepherd.Machine, 0, len(ep.reservationCache)+len(devices))
92 for _, device := range devices {
93 machines = append(machines, &machine{device})
94 }
95
96 for _, res := range ep.reservationCache {
97 machines = append(machines, reservation{res})
98 }
99
100 return machines, nil
101}
102
103func (ep *equinixProvider) CreateMachine(ctx context.Context, session *bmdb.Session, request shepherd.CreateMachineRequest) (shepherd.Machine, error) {
104 if request.UnusedMachine == nil {
105 return nil, fmt.Errorf("parameter UnusedMachine is missing")
106 }
107
108 //TODO: Do we just trust the implementation to be correct?
109 res, ok := request.UnusedMachine.(reservation)
110 if !ok {
111 return nil, fmt.Errorf("invalid type for parameter UnusedMachine")
112 }
113
114 d, err := ep.provision(ctx, session, res.HardwareReservation)
115 if err != nil {
116 klog.Errorf("Failed to provision reservation %s: %v", res.HardwareReservation.ID, err)
117 until := time.Now().Add(time.Hour)
118 klog.Errorf("Adding hardware reservation %s to sinbin until %s", res.HardwareReservation.ID, until)
119 ep.badReservations.Add(res.HardwareReservation.ID, until)
120 return nil, err
121 }
122
123 return &machine{*d}, nil
124}
125
126func (ep *equinixProvider) Type() model.Provider {
127 return model.ProviderEquinix
128}
129
130type reservation struct {
131 packngo.HardwareReservation
132}
133
Tim Windelschmidtfdd87ab2023-12-07 18:03:21 +0100134func (e reservation) Failed() bool {
135 return false
136}
137
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200138func (e reservation) ID() shepherd.ProviderID {
139 return shepherd.InvalidProviderID
140}
141
142func (e reservation) Addr() netip.Addr {
143 return netip.Addr{}
144}
145
Tim Windelschmidtc4dd0032024-02-19 13:13:31 +0100146func (e reservation) Availability() shepherd.Availability {
147 return shepherd.AvailabilityKnownUnused
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200148}
149
150type machine struct {
151 packngo.Device
152}
153
Tim Windelschmidtfdd87ab2023-12-07 18:03:21 +0100154func (e *machine) Failed() bool {
Tim Windelschmidtc4dd0032024-02-19 13:13:31 +0100155 return e.State == "failed"
Tim Windelschmidtfdd87ab2023-12-07 18:03:21 +0100156}
157
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200158func (e *machine) ID() shepherd.ProviderID {
159 return shepherd.ProviderID(e.Device.ID)
160}
161
162func (e *machine) Addr() netip.Addr {
163 ni := e.GetNetworkInfo()
164
165 var addr string
166 if ni.PublicIPv4 != "" {
167 addr = ni.PublicIPv4
168 } else if ni.PublicIPv6 != "" {
169 addr = ni.PublicIPv6
170 } else {
171 klog.Errorf("missing address for machine: %v", e.ID())
172 return netip.Addr{}
173 }
174
175 a, err := netip.ParseAddr(addr)
176 if err != nil {
177 klog.Errorf("failed parsing address %q: %v", addr, err)
178 return netip.Addr{}
179 }
180
181 return a
182}
183
Tim Windelschmidtc4dd0032024-02-19 13:13:31 +0100184func (e *machine) Availability() shepherd.Availability {
185 return shepherd.AvailabilityKnownUsed
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200186}
187
188// listReservations doesn't lock the mutex and expects the caller to lock.
189func (ep *equinixProvider) listReservations(ctx context.Context) ([]packngo.HardwareReservation, error) {
190 klog.Infof("Retrieving hardware reservations, this will take a while...")
191 reservations, err := ep.api.ListReservations(ctx, ep.config.ProjectId)
192 if err != nil {
193 return nil, fmt.Errorf("failed to list reservations: %w", err)
194 }
195
196 var available []packngo.HardwareReservation
197 var inUse, notProvisionable, penalized int
198 for _, reservation := range reservations {
199 if reservation.Device != nil {
200 inUse++
201 continue
202 }
203 if !reservation.Provisionable {
204 notProvisionable++
205 continue
206 }
207 if ep.badReservations.Penalized(reservation.ID) {
208 penalized++
209 continue
210 }
211 available = append(available, reservation)
212 }
213 klog.Infof("Retrieved hardware reservations: %d (total), %d (available), %d (in use), %d (not provisionable), %d (penalized)", len(reservations), len(available), inUse, notProvisionable, penalized)
214
215 return available, nil
216}
217
218// provision attempts to create a device within Equinix using given Hardware
219// Reservation rsv. The resulting device is registered with BMDB, and tagged as
220// "provided" in the process.
221func (ep *equinixProvider) provision(ctx context.Context, sess *bmdb.Session, rsv packngo.HardwareReservation) (*packngo.Device, error) {
222 klog.Infof("Creating a new device using reservation ID %s.", rsv.ID)
223 hostname := ep.config.DevicePrefix + rsv.ID[:18]
224 kid, err := ep.sshEquinixId(ctx)
225 if err != nil {
226 return nil, err
227 }
228 req := &packngo.DeviceCreateRequest{
229 Hostname: hostname,
230 OS: ep.config.OS,
231 Plan: rsv.Plan.Slug,
232 ProjectID: ep.config.ProjectId,
233 HardwareReservationID: rsv.ID,
234 ProjectSSHKeys: []string{kid},
235 }
236 if ep.config.UseProjectKeys {
237 klog.Warningf("INSECURE: Machines will be created with ALL PROJECT SSH KEYS!")
238 req.ProjectSSHKeys = nil
239 }
240
241 nd, err := ep.api.CreateDevice(ctx, req)
242 if err != nil {
243 return nil, fmt.Errorf("while creating new device within Equinix: %w", err)
244 }
245 klog.Infof("Created a new device within Equinix (RID: %s, PID: %s, HOST: %s)", rsv.ID, nd.ID, hostname)
246
Tim Windelschmidt62507052024-04-23 15:06:14 +0200247 ep.reservationCache = slices.DeleteFunc(ep.reservationCache, func(v packngo.HardwareReservation) bool {
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200248 return rsv.ID == v.ID
249 })
250
251 err = ep.assimilate(ctx, sess, nd.ID)
252 if err != nil {
253 // TODO(serge@monogon.tech) at this point the device at Equinix isn't
254 // matched by a BMDB record. Schedule device deletion or make sure this
255 // case is being handled elsewhere.
256 return nil, err
257 }
258 return nd, nil
259}
260
261// assimilate brings in an already existing machine from Equinix into the BMDB.
262// This is only used in manual testing.
263func (ep *equinixProvider) assimilate(ctx context.Context, sess *bmdb.Session, deviceID string) error {
264 return sess.Transact(ctx, func(q *model.Queries) error {
265 // Create a new machine record within BMDB.
266 m, err := q.NewMachine(ctx)
267 if err != nil {
268 return fmt.Errorf("while creating a new machine record in BMDB: %w", err)
269 }
270
271 // Link the new machine with the Equinix device, and tag it "provided".
272 p := model.MachineAddProvidedParams{
273 MachineID: m.MachineID,
274 ProviderID: deviceID,
275 Provider: model.ProviderEquinix,
276 }
277 klog.Infof("Setting \"provided\" tag (ID: %s, PID: %s, Provider: %s).", p.MachineID, p.ProviderID, p.Provider)
278 if err := q.MachineAddProvided(ctx, p); err != nil {
279 return fmt.Errorf("while tagging machine active: %w", err)
280 }
281 return nil
282 })
283}
284
285// sshEquinixGet looks up the Equinix key matching providerConfig.KeyLabel,
286// returning its packngo.SSHKey instance.
287func (ep *equinixProvider) sshEquinix(ctx context.Context) (*packngo.SSHKey, error) {
288 ks, err := ep.api.ListSSHKeys(ctx)
289 if err != nil {
290 return nil, fmt.Errorf("while listing SSH keys: %w", err)
291 }
292
293 for _, k := range ks {
294 if k.Label == ep.config.KeyLabel {
295 return &k, nil
296 }
297 }
Tim Windelschmidt513df182024-04-18 23:44:50 +0200298 return nil, ErrNoSuchKey
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200299}
300
301// sshEquinixId looks up the Equinix key identified by providerConfig.KeyLabel,
302// returning its Equinix-assigned UUID.
303func (ep *equinixProvider) sshEquinixId(ctx context.Context) (string, error) {
304 k, err := ep.sshEquinix(ctx)
305 if err != nil {
306 return "", err
307 }
308 return k.ID, nil
309}
310
311// sshEquinixUpdate makes sure the existing SSH key registered with Equinix
312// matches the one from sshPub.
313func (ep *equinixProvider) sshEquinixUpdate(ctx context.Context, kid string) error {
314 pub, err := ep.sshKey.PublicKey()
315 if err != nil {
316 return err
317 }
318 _, err = ep.api.UpdateSSHKey(ctx, kid, &packngo.SSHKeyUpdateRequest{
319 Key: &pub,
320 })
321 if err != nil {
322 return fmt.Errorf("while updating the SSH key: %w", err)
323 }
324 return nil
325}
326
327// sshEquinixUpload registers a new SSH key from sshPub.
328func (ep *equinixProvider) sshEquinixUpload(ctx context.Context) error {
329 pub, err := ep.sshKey.PublicKey()
330 if err != nil {
331 return fmt.Errorf("while generating public key: %w", err)
332 }
333 _, err = ep.api.CreateSSHKey(ctx, &packngo.SSHKeyCreateRequest{
334 Label: ep.config.KeyLabel,
335 Key: pub,
336 ProjectID: ep.config.ProjectId,
337 })
338 if err != nil {
339 return fmt.Errorf("while creating an SSH key: %w", err)
340 }
341 return nil
342}
343
344// SSHEquinixEnsure initializes the locally managed SSH key (from a persistence
345// path or explicitly set key) and updates or uploads it to Equinix. The key is
346// generated as needed The key is generated as needed
347func (ep *equinixProvider) SSHEquinixEnsure(ctx context.Context) error {
348 k, err := ep.sshEquinix(ctx)
Tim Windelschmidtd5f851b2024-04-23 14:59:37 +0200349 switch {
Tim Windelschmidt513df182024-04-18 23:44:50 +0200350 case errors.Is(err, ErrNoSuchKey):
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200351 if err := ep.sshEquinixUpload(ctx); err != nil {
352 return fmt.Errorf("while uploading key: %w", err)
353 }
354 return nil
Tim Windelschmidtd5f851b2024-04-23 14:59:37 +0200355 case err == nil:
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200356 if err := ep.sshEquinixUpdate(ctx, k.ID); err != nil {
357 return fmt.Errorf("while updating key: %w", err)
358 }
359 return nil
360 default:
361 return err
362 }
363}
364
365// managedDevices provides a map of device provider IDs to matching
366// packngo.Device instances. It calls Equinix API's ListDevices. The returned
367// devices are filtered according to DevicePrefix provided through Opts. The
368// returned error value, if not nil, will originate in wrapngo.
369func (ep *equinixProvider) managedDevices(ctx context.Context) (map[string]packngo.Device, error) {
370 ds, err := ep.api.ListDevices(ctx, ep.config.ProjectId)
371 if err != nil {
372 return nil, err
373 }
374 dm := map[string]packngo.Device{}
375 for _, d := range ds {
376 if strings.HasPrefix(d.Hostname, ep.config.DevicePrefix) {
377 dm[d.ID] = d
378 }
379 }
380 return dm, nil
381}