blob: ebd9699854b9e325596661a4c6bb9b765526a330 [file] [log] [blame]
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +02001package main
2
3import (
4 "context"
Tim Windelschmidtd5f851b2024-04-23 14:59:37 +02005 "errors"
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +02006 "fmt"
7 "net/netip"
8 "slices"
9 "strings"
10 "time"
11
12 "github.com/packethost/packngo"
13 "k8s.io/klog/v2"
14
15 "source.monogon.dev/cloud/bmaas/bmdb"
16 "source.monogon.dev/cloud/bmaas/bmdb/model"
17 "source.monogon.dev/cloud/equinix/wrapngo"
18 "source.monogon.dev/cloud/lib/sinbin"
19 "source.monogon.dev/cloud/shepherd"
20 "source.monogon.dev/cloud/shepherd/manager"
21)
22
23type equinixProvider struct {
24 config *providerConfig
25 api wrapngo.Client
26 sshKey *manager.SSHKey
27
28 // badReservations is a holiday resort for Equinix hardware reservations which
29 // failed to be provisioned for some reason or another. We keep a list of them in
30 // memory just so that we don't repeatedly try to provision the same known bad
31 // machines.
32 badReservations sinbin.Sinbin[string]
33
34 reservationDeadline time.Time
35 reservationCache []packngo.HardwareReservation
36}
37
38func (ep *equinixProvider) RebootMachine(ctx context.Context, id shepherd.ProviderID) error {
39 if err := ep.api.RebootDevice(ctx, string(id)); err != nil {
40 return fmt.Errorf("failed to reboot device: %w", err)
41 }
42
43 // TODO(issue/215): replace this
44 // This is required as Equinix doesn't reboot the machines synchronously
45 // during the API call.
46 select {
47 case <-time.After(time.Duration(ep.config.RebootWaitSeconds) * time.Second):
48 case <-ctx.Done():
49 return fmt.Errorf("while waiting for reboot: %w", ctx.Err())
50 }
51 return nil
52}
53
54func (ep *equinixProvider) ReinstallMachine(ctx context.Context, id shepherd.ProviderID) error {
55 return shepherd.ErrNotImplemented
56}
57
58func (ep *equinixProvider) GetMachine(ctx context.Context, id shepherd.ProviderID) (shepherd.Machine, error) {
59 machines, err := ep.ListMachines(ctx)
60 if err != nil {
61 return nil, err
62 }
63
64 for _, machine := range machines {
65 if machine.ID() == id {
66 return machine, nil
67 }
68 }
69
70 return nil, shepherd.ErrMachineNotFound
71}
72
73func (ep *equinixProvider) ListMachines(ctx context.Context) ([]shepherd.Machine, error) {
74 if ep.reservationDeadline.Before(time.Now()) {
75 reservations, err := ep.listReservations(ctx)
76 if err != nil {
77 return nil, err
78 }
79 ep.reservationCache = reservations
80 ep.reservationDeadline = time.Now().Add(ep.config.ReservationCacheTimeout)
81 }
82
83 devices, err := ep.managedDevices(ctx)
84 if err != nil {
85 return nil, err
86 }
87
88 machines := make([]shepherd.Machine, 0, len(ep.reservationCache)+len(devices))
89 for _, device := range devices {
90 machines = append(machines, &machine{device})
91 }
92
93 for _, res := range ep.reservationCache {
94 machines = append(machines, reservation{res})
95 }
96
97 return machines, nil
98}
99
100func (ep *equinixProvider) CreateMachine(ctx context.Context, session *bmdb.Session, request shepherd.CreateMachineRequest) (shepherd.Machine, error) {
101 if request.UnusedMachine == nil {
102 return nil, fmt.Errorf("parameter UnusedMachine is missing")
103 }
104
105 //TODO: Do we just trust the implementation to be correct?
106 res, ok := request.UnusedMachine.(reservation)
107 if !ok {
108 return nil, fmt.Errorf("invalid type for parameter UnusedMachine")
109 }
110
111 d, err := ep.provision(ctx, session, res.HardwareReservation)
112 if err != nil {
113 klog.Errorf("Failed to provision reservation %s: %v", res.HardwareReservation.ID, err)
114 until := time.Now().Add(time.Hour)
115 klog.Errorf("Adding hardware reservation %s to sinbin until %s", res.HardwareReservation.ID, until)
116 ep.badReservations.Add(res.HardwareReservation.ID, until)
117 return nil, err
118 }
119
120 return &machine{*d}, nil
121}
122
123func (ep *equinixProvider) Type() model.Provider {
124 return model.ProviderEquinix
125}
126
127type reservation struct {
128 packngo.HardwareReservation
129}
130
Tim Windelschmidtfdd87ab2023-12-07 18:03:21 +0100131func (e reservation) Failed() bool {
132 return false
133}
134
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200135func (e reservation) ID() shepherd.ProviderID {
136 return shepherd.InvalidProviderID
137}
138
139func (e reservation) Addr() netip.Addr {
140 return netip.Addr{}
141}
142
Tim Windelschmidtc4dd0032024-02-19 13:13:31 +0100143func (e reservation) Availability() shepherd.Availability {
144 return shepherd.AvailabilityKnownUnused
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200145}
146
147type machine struct {
148 packngo.Device
149}
150
Tim Windelschmidtfdd87ab2023-12-07 18:03:21 +0100151func (e *machine) Failed() bool {
Tim Windelschmidtc4dd0032024-02-19 13:13:31 +0100152 return e.State == "failed"
Tim Windelschmidtfdd87ab2023-12-07 18:03:21 +0100153}
154
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200155func (e *machine) ID() shepherd.ProviderID {
156 return shepherd.ProviderID(e.Device.ID)
157}
158
159func (e *machine) Addr() netip.Addr {
160 ni := e.GetNetworkInfo()
161
162 var addr string
163 if ni.PublicIPv4 != "" {
164 addr = ni.PublicIPv4
165 } else if ni.PublicIPv6 != "" {
166 addr = ni.PublicIPv6
167 } else {
168 klog.Errorf("missing address for machine: %v", e.ID())
169 return netip.Addr{}
170 }
171
172 a, err := netip.ParseAddr(addr)
173 if err != nil {
174 klog.Errorf("failed parsing address %q: %v", addr, err)
175 return netip.Addr{}
176 }
177
178 return a
179}
180
Tim Windelschmidtc4dd0032024-02-19 13:13:31 +0100181func (e *machine) Availability() shepherd.Availability {
182 return shepherd.AvailabilityKnownUsed
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200183}
184
185// listReservations doesn't lock the mutex and expects the caller to lock.
186func (ep *equinixProvider) listReservations(ctx context.Context) ([]packngo.HardwareReservation, error) {
187 klog.Infof("Retrieving hardware reservations, this will take a while...")
188 reservations, err := ep.api.ListReservations(ctx, ep.config.ProjectId)
189 if err != nil {
190 return nil, fmt.Errorf("failed to list reservations: %w", err)
191 }
192
193 var available []packngo.HardwareReservation
194 var inUse, notProvisionable, penalized int
195 for _, reservation := range reservations {
196 if reservation.Device != nil {
197 inUse++
198 continue
199 }
200 if !reservation.Provisionable {
201 notProvisionable++
202 continue
203 }
204 if ep.badReservations.Penalized(reservation.ID) {
205 penalized++
206 continue
207 }
208 available = append(available, reservation)
209 }
210 klog.Infof("Retrieved hardware reservations: %d (total), %d (available), %d (in use), %d (not provisionable), %d (penalized)", len(reservations), len(available), inUse, notProvisionable, penalized)
211
212 return available, nil
213}
214
215// provision attempts to create a device within Equinix using given Hardware
216// Reservation rsv. The resulting device is registered with BMDB, and tagged as
217// "provided" in the process.
218func (ep *equinixProvider) provision(ctx context.Context, sess *bmdb.Session, rsv packngo.HardwareReservation) (*packngo.Device, error) {
219 klog.Infof("Creating a new device using reservation ID %s.", rsv.ID)
220 hostname := ep.config.DevicePrefix + rsv.ID[:18]
221 kid, err := ep.sshEquinixId(ctx)
222 if err != nil {
223 return nil, err
224 }
225 req := &packngo.DeviceCreateRequest{
226 Hostname: hostname,
227 OS: ep.config.OS,
228 Plan: rsv.Plan.Slug,
229 ProjectID: ep.config.ProjectId,
230 HardwareReservationID: rsv.ID,
231 ProjectSSHKeys: []string{kid},
232 }
233 if ep.config.UseProjectKeys {
234 klog.Warningf("INSECURE: Machines will be created with ALL PROJECT SSH KEYS!")
235 req.ProjectSSHKeys = nil
236 }
237
238 nd, err := ep.api.CreateDevice(ctx, req)
239 if err != nil {
240 return nil, fmt.Errorf("while creating new device within Equinix: %w", err)
241 }
242 klog.Infof("Created a new device within Equinix (RID: %s, PID: %s, HOST: %s)", rsv.ID, nd.ID, hostname)
243
244 slices.DeleteFunc(ep.reservationCache, func(v packngo.HardwareReservation) bool {
245 return rsv.ID == v.ID
246 })
247
248 err = ep.assimilate(ctx, sess, nd.ID)
249 if err != nil {
250 // TODO(serge@monogon.tech) at this point the device at Equinix isn't
251 // matched by a BMDB record. Schedule device deletion or make sure this
252 // case is being handled elsewhere.
253 return nil, err
254 }
255 return nd, nil
256}
257
258// assimilate brings in an already existing machine from Equinix into the BMDB.
259// This is only used in manual testing.
260func (ep *equinixProvider) assimilate(ctx context.Context, sess *bmdb.Session, deviceID string) error {
261 return sess.Transact(ctx, func(q *model.Queries) error {
262 // Create a new machine record within BMDB.
263 m, err := q.NewMachine(ctx)
264 if err != nil {
265 return fmt.Errorf("while creating a new machine record in BMDB: %w", err)
266 }
267
268 // Link the new machine with the Equinix device, and tag it "provided".
269 p := model.MachineAddProvidedParams{
270 MachineID: m.MachineID,
271 ProviderID: deviceID,
272 Provider: model.ProviderEquinix,
273 }
274 klog.Infof("Setting \"provided\" tag (ID: %s, PID: %s, Provider: %s).", p.MachineID, p.ProviderID, p.Provider)
275 if err := q.MachineAddProvided(ctx, p); err != nil {
276 return fmt.Errorf("while tagging machine active: %w", err)
277 }
278 return nil
279 })
280}
281
282// sshEquinixGet looks up the Equinix key matching providerConfig.KeyLabel,
283// returning its packngo.SSHKey instance.
284func (ep *equinixProvider) sshEquinix(ctx context.Context) (*packngo.SSHKey, error) {
285 ks, err := ep.api.ListSSHKeys(ctx)
286 if err != nil {
287 return nil, fmt.Errorf("while listing SSH keys: %w", err)
288 }
289
290 for _, k := range ks {
291 if k.Label == ep.config.KeyLabel {
292 return &k, nil
293 }
294 }
295 return nil, NoSuchKey
296}
297
298// sshEquinixId looks up the Equinix key identified by providerConfig.KeyLabel,
299// returning its Equinix-assigned UUID.
300func (ep *equinixProvider) sshEquinixId(ctx context.Context) (string, error) {
301 k, err := ep.sshEquinix(ctx)
302 if err != nil {
303 return "", err
304 }
305 return k.ID, nil
306}
307
308// sshEquinixUpdate makes sure the existing SSH key registered with Equinix
309// matches the one from sshPub.
310func (ep *equinixProvider) sshEquinixUpdate(ctx context.Context, kid string) error {
311 pub, err := ep.sshKey.PublicKey()
312 if err != nil {
313 return err
314 }
315 _, err = ep.api.UpdateSSHKey(ctx, kid, &packngo.SSHKeyUpdateRequest{
316 Key: &pub,
317 })
318 if err != nil {
319 return fmt.Errorf("while updating the SSH key: %w", err)
320 }
321 return nil
322}
323
324// sshEquinixUpload registers a new SSH key from sshPub.
325func (ep *equinixProvider) sshEquinixUpload(ctx context.Context) error {
326 pub, err := ep.sshKey.PublicKey()
327 if err != nil {
328 return fmt.Errorf("while generating public key: %w", err)
329 }
330 _, err = ep.api.CreateSSHKey(ctx, &packngo.SSHKeyCreateRequest{
331 Label: ep.config.KeyLabel,
332 Key: pub,
333 ProjectID: ep.config.ProjectId,
334 })
335 if err != nil {
336 return fmt.Errorf("while creating an SSH key: %w", err)
337 }
338 return nil
339}
340
341// SSHEquinixEnsure initializes the locally managed SSH key (from a persistence
342// path or explicitly set key) and updates or uploads it to Equinix. The key is
343// generated as needed The key is generated as needed
344func (ep *equinixProvider) SSHEquinixEnsure(ctx context.Context) error {
345 k, err := ep.sshEquinix(ctx)
Tim Windelschmidtd5f851b2024-04-23 14:59:37 +0200346 switch {
347 case errors.Is(err, NoSuchKey):
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200348 if err := ep.sshEquinixUpload(ctx); err != nil {
349 return fmt.Errorf("while uploading key: %w", err)
350 }
351 return nil
Tim Windelschmidtd5f851b2024-04-23 14:59:37 +0200352 case err == nil:
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200353 if err := ep.sshEquinixUpdate(ctx, k.ID); err != nil {
354 return fmt.Errorf("while updating key: %w", err)
355 }
356 return nil
357 default:
358 return err
359 }
360}
361
362// managedDevices provides a map of device provider IDs to matching
363// packngo.Device instances. It calls Equinix API's ListDevices. The returned
364// devices are filtered according to DevicePrefix provided through Opts. The
365// returned error value, if not nil, will originate in wrapngo.
366func (ep *equinixProvider) managedDevices(ctx context.Context) (map[string]packngo.Device, error) {
367 ds, err := ep.api.ListDevices(ctx, ep.config.ProjectId)
368 if err != nil {
369 return nil, err
370 }
371 dm := map[string]packngo.Device{}
372 for _, d := range ds {
373 if strings.HasPrefix(d.Hostname, ep.config.DevicePrefix) {
374 dm[d.ID] = d
375 }
376 }
377 return dm, nil
378}