blob: 7d3c0d2b4b4b65d9b4fae5f4f74e0473cace9287 [file] [log] [blame]
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +02001package main
2
3import (
4 "context"
5 "fmt"
6 "net/netip"
7 "slices"
8 "strings"
9 "time"
10
11 "github.com/packethost/packngo"
12 "k8s.io/klog/v2"
13
14 "source.monogon.dev/cloud/bmaas/bmdb"
15 "source.monogon.dev/cloud/bmaas/bmdb/model"
16 "source.monogon.dev/cloud/equinix/wrapngo"
17 "source.monogon.dev/cloud/lib/sinbin"
18 "source.monogon.dev/cloud/shepherd"
19 "source.monogon.dev/cloud/shepherd/manager"
20)
21
22type equinixProvider struct {
23 config *providerConfig
24 api wrapngo.Client
25 sshKey *manager.SSHKey
26
27 // badReservations is a holiday resort for Equinix hardware reservations which
28 // failed to be provisioned for some reason or another. We keep a list of them in
29 // memory just so that we don't repeatedly try to provision the same known bad
30 // machines.
31 badReservations sinbin.Sinbin[string]
32
33 reservationDeadline time.Time
34 reservationCache []packngo.HardwareReservation
35}
36
37func (ep *equinixProvider) RebootMachine(ctx context.Context, id shepherd.ProviderID) error {
38 if err := ep.api.RebootDevice(ctx, string(id)); err != nil {
39 return fmt.Errorf("failed to reboot device: %w", err)
40 }
41
42 // TODO(issue/215): replace this
43 // This is required as Equinix doesn't reboot the machines synchronously
44 // during the API call.
45 select {
46 case <-time.After(time.Duration(ep.config.RebootWaitSeconds) * time.Second):
47 case <-ctx.Done():
48 return fmt.Errorf("while waiting for reboot: %w", ctx.Err())
49 }
50 return nil
51}
52
53func (ep *equinixProvider) ReinstallMachine(ctx context.Context, id shepherd.ProviderID) error {
54 return shepherd.ErrNotImplemented
55}
56
57func (ep *equinixProvider) GetMachine(ctx context.Context, id shepherd.ProviderID) (shepherd.Machine, error) {
58 machines, err := ep.ListMachines(ctx)
59 if err != nil {
60 return nil, err
61 }
62
63 for _, machine := range machines {
64 if machine.ID() == id {
65 return machine, nil
66 }
67 }
68
69 return nil, shepherd.ErrMachineNotFound
70}
71
72func (ep *equinixProvider) ListMachines(ctx context.Context) ([]shepherd.Machine, error) {
73 if ep.reservationDeadline.Before(time.Now()) {
74 reservations, err := ep.listReservations(ctx)
75 if err != nil {
76 return nil, err
77 }
78 ep.reservationCache = reservations
79 ep.reservationDeadline = time.Now().Add(ep.config.ReservationCacheTimeout)
80 }
81
82 devices, err := ep.managedDevices(ctx)
83 if err != nil {
84 return nil, err
85 }
86
87 machines := make([]shepherd.Machine, 0, len(ep.reservationCache)+len(devices))
88 for _, device := range devices {
89 machines = append(machines, &machine{device})
90 }
91
92 for _, res := range ep.reservationCache {
93 machines = append(machines, reservation{res})
94 }
95
96 return machines, nil
97}
98
99func (ep *equinixProvider) CreateMachine(ctx context.Context, session *bmdb.Session, request shepherd.CreateMachineRequest) (shepherd.Machine, error) {
100 if request.UnusedMachine == nil {
101 return nil, fmt.Errorf("parameter UnusedMachine is missing")
102 }
103
104 //TODO: Do we just trust the implementation to be correct?
105 res, ok := request.UnusedMachine.(reservation)
106 if !ok {
107 return nil, fmt.Errorf("invalid type for parameter UnusedMachine")
108 }
109
110 d, err := ep.provision(ctx, session, res.HardwareReservation)
111 if err != nil {
112 klog.Errorf("Failed to provision reservation %s: %v", res.HardwareReservation.ID, err)
113 until := time.Now().Add(time.Hour)
114 klog.Errorf("Adding hardware reservation %s to sinbin until %s", res.HardwareReservation.ID, until)
115 ep.badReservations.Add(res.HardwareReservation.ID, until)
116 return nil, err
117 }
118
119 return &machine{*d}, nil
120}
121
122func (ep *equinixProvider) Type() model.Provider {
123 return model.ProviderEquinix
124}
125
126type reservation struct {
127 packngo.HardwareReservation
128}
129
Tim Windelschmidtfdd87ab2023-12-07 18:03:21 +0100130func (e reservation) Failed() bool {
131 return false
132}
133
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200134func (e reservation) ID() shepherd.ProviderID {
135 return shepherd.InvalidProviderID
136}
137
138func (e reservation) Addr() netip.Addr {
139 return netip.Addr{}
140}
141
142func (e reservation) State() shepherd.State {
143 return shepherd.StateKnownUnused
144}
145
146type machine struct {
147 packngo.Device
148}
149
Tim Windelschmidtfdd87ab2023-12-07 18:03:21 +0100150func (e *machine) Failed() bool {
151 return e.Device.State == "failed"
152}
153
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +0200154func (e *machine) ID() shepherd.ProviderID {
155 return shepherd.ProviderID(e.Device.ID)
156}
157
158func (e *machine) Addr() netip.Addr {
159 ni := e.GetNetworkInfo()
160
161 var addr string
162 if ni.PublicIPv4 != "" {
163 addr = ni.PublicIPv4
164 } else if ni.PublicIPv6 != "" {
165 addr = ni.PublicIPv6
166 } else {
167 klog.Errorf("missing address for machine: %v", e.ID())
168 return netip.Addr{}
169 }
170
171 a, err := netip.ParseAddr(addr)
172 if err != nil {
173 klog.Errorf("failed parsing address %q: %v", addr, err)
174 return netip.Addr{}
175 }
176
177 return a
178}
179
180func (e *machine) State() shepherd.State {
181 return shepherd.StateKnownUsed
182}
183
184// listReservations doesn't lock the mutex and expects the caller to lock.
185func (ep *equinixProvider) listReservations(ctx context.Context) ([]packngo.HardwareReservation, error) {
186 klog.Infof("Retrieving hardware reservations, this will take a while...")
187 reservations, err := ep.api.ListReservations(ctx, ep.config.ProjectId)
188 if err != nil {
189 return nil, fmt.Errorf("failed to list reservations: %w", err)
190 }
191
192 var available []packngo.HardwareReservation
193 var inUse, notProvisionable, penalized int
194 for _, reservation := range reservations {
195 if reservation.Device != nil {
196 inUse++
197 continue
198 }
199 if !reservation.Provisionable {
200 notProvisionable++
201 continue
202 }
203 if ep.badReservations.Penalized(reservation.ID) {
204 penalized++
205 continue
206 }
207 available = append(available, reservation)
208 }
209 klog.Infof("Retrieved hardware reservations: %d (total), %d (available), %d (in use), %d (not provisionable), %d (penalized)", len(reservations), len(available), inUse, notProvisionable, penalized)
210
211 return available, nil
212}
213
214// provision attempts to create a device within Equinix using given Hardware
215// Reservation rsv. The resulting device is registered with BMDB, and tagged as
216// "provided" in the process.
217func (ep *equinixProvider) provision(ctx context.Context, sess *bmdb.Session, rsv packngo.HardwareReservation) (*packngo.Device, error) {
218 klog.Infof("Creating a new device using reservation ID %s.", rsv.ID)
219 hostname := ep.config.DevicePrefix + rsv.ID[:18]
220 kid, err := ep.sshEquinixId(ctx)
221 if err != nil {
222 return nil, err
223 }
224 req := &packngo.DeviceCreateRequest{
225 Hostname: hostname,
226 OS: ep.config.OS,
227 Plan: rsv.Plan.Slug,
228 ProjectID: ep.config.ProjectId,
229 HardwareReservationID: rsv.ID,
230 ProjectSSHKeys: []string{kid},
231 }
232 if ep.config.UseProjectKeys {
233 klog.Warningf("INSECURE: Machines will be created with ALL PROJECT SSH KEYS!")
234 req.ProjectSSHKeys = nil
235 }
236
237 nd, err := ep.api.CreateDevice(ctx, req)
238 if err != nil {
239 return nil, fmt.Errorf("while creating new device within Equinix: %w", err)
240 }
241 klog.Infof("Created a new device within Equinix (RID: %s, PID: %s, HOST: %s)", rsv.ID, nd.ID, hostname)
242
243 slices.DeleteFunc(ep.reservationCache, func(v packngo.HardwareReservation) bool {
244 return rsv.ID == v.ID
245 })
246
247 err = ep.assimilate(ctx, sess, nd.ID)
248 if err != nil {
249 // TODO(serge@monogon.tech) at this point the device at Equinix isn't
250 // matched by a BMDB record. Schedule device deletion or make sure this
251 // case is being handled elsewhere.
252 return nil, err
253 }
254 return nd, nil
255}
256
257// assimilate brings in an already existing machine from Equinix into the BMDB.
258// This is only used in manual testing.
259func (ep *equinixProvider) assimilate(ctx context.Context, sess *bmdb.Session, deviceID string) error {
260 return sess.Transact(ctx, func(q *model.Queries) error {
261 // Create a new machine record within BMDB.
262 m, err := q.NewMachine(ctx)
263 if err != nil {
264 return fmt.Errorf("while creating a new machine record in BMDB: %w", err)
265 }
266
267 // Link the new machine with the Equinix device, and tag it "provided".
268 p := model.MachineAddProvidedParams{
269 MachineID: m.MachineID,
270 ProviderID: deviceID,
271 Provider: model.ProviderEquinix,
272 }
273 klog.Infof("Setting \"provided\" tag (ID: %s, PID: %s, Provider: %s).", p.MachineID, p.ProviderID, p.Provider)
274 if err := q.MachineAddProvided(ctx, p); err != nil {
275 return fmt.Errorf("while tagging machine active: %w", err)
276 }
277 return nil
278 })
279}
280
281// sshEquinixGet looks up the Equinix key matching providerConfig.KeyLabel,
282// returning its packngo.SSHKey instance.
283func (ep *equinixProvider) sshEquinix(ctx context.Context) (*packngo.SSHKey, error) {
284 ks, err := ep.api.ListSSHKeys(ctx)
285 if err != nil {
286 return nil, fmt.Errorf("while listing SSH keys: %w", err)
287 }
288
289 for _, k := range ks {
290 if k.Label == ep.config.KeyLabel {
291 return &k, nil
292 }
293 }
294 return nil, NoSuchKey
295}
296
297// sshEquinixId looks up the Equinix key identified by providerConfig.KeyLabel,
298// returning its Equinix-assigned UUID.
299func (ep *equinixProvider) sshEquinixId(ctx context.Context) (string, error) {
300 k, err := ep.sshEquinix(ctx)
301 if err != nil {
302 return "", err
303 }
304 return k.ID, nil
305}
306
307// sshEquinixUpdate makes sure the existing SSH key registered with Equinix
308// matches the one from sshPub.
309func (ep *equinixProvider) sshEquinixUpdate(ctx context.Context, kid string) error {
310 pub, err := ep.sshKey.PublicKey()
311 if err != nil {
312 return err
313 }
314 _, err = ep.api.UpdateSSHKey(ctx, kid, &packngo.SSHKeyUpdateRequest{
315 Key: &pub,
316 })
317 if err != nil {
318 return fmt.Errorf("while updating the SSH key: %w", err)
319 }
320 return nil
321}
322
323// sshEquinixUpload registers a new SSH key from sshPub.
324func (ep *equinixProvider) sshEquinixUpload(ctx context.Context) error {
325 pub, err := ep.sshKey.PublicKey()
326 if err != nil {
327 return fmt.Errorf("while generating public key: %w", err)
328 }
329 _, err = ep.api.CreateSSHKey(ctx, &packngo.SSHKeyCreateRequest{
330 Label: ep.config.KeyLabel,
331 Key: pub,
332 ProjectID: ep.config.ProjectId,
333 })
334 if err != nil {
335 return fmt.Errorf("while creating an SSH key: %w", err)
336 }
337 return nil
338}
339
340// SSHEquinixEnsure initializes the locally managed SSH key (from a persistence
341// path or explicitly set key) and updates or uploads it to Equinix. The key is
342// generated as needed The key is generated as needed
343func (ep *equinixProvider) SSHEquinixEnsure(ctx context.Context) error {
344 k, err := ep.sshEquinix(ctx)
345 switch err {
346 case NoSuchKey:
347 if err := ep.sshEquinixUpload(ctx); err != nil {
348 return fmt.Errorf("while uploading key: %w", err)
349 }
350 return nil
351 case nil:
352 if err := ep.sshEquinixUpdate(ctx, k.ID); err != nil {
353 return fmt.Errorf("while updating key: %w", err)
354 }
355 return nil
356 default:
357 return err
358 }
359}
360
361// managedDevices provides a map of device provider IDs to matching
362// packngo.Device instances. It calls Equinix API's ListDevices. The returned
363// devices are filtered according to DevicePrefix provided through Opts. The
364// returned error value, if not nil, will originate in wrapngo.
365func (ep *equinixProvider) managedDevices(ctx context.Context) (map[string]packngo.Device, error) {
366 ds, err := ep.api.ListDevices(ctx, ep.config.ProjectId)
367 if err != nil {
368 return nil, err
369 }
370 dm := map[string]packngo.Device{}
371 for _, d := range ds {
372 if strings.HasPrefix(d.Hostname, ep.config.DevicePrefix) {
373 dm[d.ID] = d
374 }
375 }
376 return dm, nil
377}