blob: edc8f3f6b740b6a0c7648852364ee0953c25313e [file] [log] [blame]
Tim Windelschmidtb6308cd2023-10-10 21:19:03 +02001package main
2
3import (
4 "context"
5 "fmt"
6 "net/netip"
7 "slices"
8 "strings"
9 "time"
10
11 "github.com/packethost/packngo"
12 "k8s.io/klog/v2"
13
14 "source.monogon.dev/cloud/bmaas/bmdb"
15 "source.monogon.dev/cloud/bmaas/bmdb/model"
16 "source.monogon.dev/cloud/equinix/wrapngo"
17 "source.monogon.dev/cloud/lib/sinbin"
18 "source.monogon.dev/cloud/shepherd"
19 "source.monogon.dev/cloud/shepherd/manager"
20)
21
22type equinixProvider struct {
23 config *providerConfig
24 api wrapngo.Client
25 sshKey *manager.SSHKey
26
27 // badReservations is a holiday resort for Equinix hardware reservations which
28 // failed to be provisioned for some reason or another. We keep a list of them in
29 // memory just so that we don't repeatedly try to provision the same known bad
30 // machines.
31 badReservations sinbin.Sinbin[string]
32
33 reservationDeadline time.Time
34 reservationCache []packngo.HardwareReservation
35}
36
37func (ep *equinixProvider) RebootMachine(ctx context.Context, id shepherd.ProviderID) error {
38 if err := ep.api.RebootDevice(ctx, string(id)); err != nil {
39 return fmt.Errorf("failed to reboot device: %w", err)
40 }
41
42 // TODO(issue/215): replace this
43 // This is required as Equinix doesn't reboot the machines synchronously
44 // during the API call.
45 select {
46 case <-time.After(time.Duration(ep.config.RebootWaitSeconds) * time.Second):
47 case <-ctx.Done():
48 return fmt.Errorf("while waiting for reboot: %w", ctx.Err())
49 }
50 return nil
51}
52
53func (ep *equinixProvider) ReinstallMachine(ctx context.Context, id shepherd.ProviderID) error {
54 return shepherd.ErrNotImplemented
55}
56
57func (ep *equinixProvider) GetMachine(ctx context.Context, id shepherd.ProviderID) (shepherd.Machine, error) {
58 machines, err := ep.ListMachines(ctx)
59 if err != nil {
60 return nil, err
61 }
62
63 for _, machine := range machines {
64 if machine.ID() == id {
65 return machine, nil
66 }
67 }
68
69 return nil, shepherd.ErrMachineNotFound
70}
71
72func (ep *equinixProvider) ListMachines(ctx context.Context) ([]shepherd.Machine, error) {
73 if ep.reservationDeadline.Before(time.Now()) {
74 reservations, err := ep.listReservations(ctx)
75 if err != nil {
76 return nil, err
77 }
78 ep.reservationCache = reservations
79 ep.reservationDeadline = time.Now().Add(ep.config.ReservationCacheTimeout)
80 }
81
82 devices, err := ep.managedDevices(ctx)
83 if err != nil {
84 return nil, err
85 }
86
87 machines := make([]shepherd.Machine, 0, len(ep.reservationCache)+len(devices))
88 for _, device := range devices {
89 machines = append(machines, &machine{device})
90 }
91
92 for _, res := range ep.reservationCache {
93 machines = append(machines, reservation{res})
94 }
95
96 return machines, nil
97}
98
99func (ep *equinixProvider) CreateMachine(ctx context.Context, session *bmdb.Session, request shepherd.CreateMachineRequest) (shepherd.Machine, error) {
100 if request.UnusedMachine == nil {
101 return nil, fmt.Errorf("parameter UnusedMachine is missing")
102 }
103
104 //TODO: Do we just trust the implementation to be correct?
105 res, ok := request.UnusedMachine.(reservation)
106 if !ok {
107 return nil, fmt.Errorf("invalid type for parameter UnusedMachine")
108 }
109
110 d, err := ep.provision(ctx, session, res.HardwareReservation)
111 if err != nil {
112 klog.Errorf("Failed to provision reservation %s: %v", res.HardwareReservation.ID, err)
113 until := time.Now().Add(time.Hour)
114 klog.Errorf("Adding hardware reservation %s to sinbin until %s", res.HardwareReservation.ID, until)
115 ep.badReservations.Add(res.HardwareReservation.ID, until)
116 return nil, err
117 }
118
119 return &machine{*d}, nil
120}
121
122func (ep *equinixProvider) Type() model.Provider {
123 return model.ProviderEquinix
124}
125
126type reservation struct {
127 packngo.HardwareReservation
128}
129
130func (e reservation) ID() shepherd.ProviderID {
131 return shepherd.InvalidProviderID
132}
133
134func (e reservation) Addr() netip.Addr {
135 return netip.Addr{}
136}
137
138func (e reservation) State() shepherd.State {
139 return shepherd.StateKnownUnused
140}
141
142type machine struct {
143 packngo.Device
144}
145
146func (e *machine) ID() shepherd.ProviderID {
147 return shepherd.ProviderID(e.Device.ID)
148}
149
150func (e *machine) Addr() netip.Addr {
151 ni := e.GetNetworkInfo()
152
153 var addr string
154 if ni.PublicIPv4 != "" {
155 addr = ni.PublicIPv4
156 } else if ni.PublicIPv6 != "" {
157 addr = ni.PublicIPv6
158 } else {
159 klog.Errorf("missing address for machine: %v", e.ID())
160 return netip.Addr{}
161 }
162
163 a, err := netip.ParseAddr(addr)
164 if err != nil {
165 klog.Errorf("failed parsing address %q: %v", addr, err)
166 return netip.Addr{}
167 }
168
169 return a
170}
171
172func (e *machine) State() shepherd.State {
173 return shepherd.StateKnownUsed
174}
175
176// listReservations doesn't lock the mutex and expects the caller to lock.
177func (ep *equinixProvider) listReservations(ctx context.Context) ([]packngo.HardwareReservation, error) {
178 klog.Infof("Retrieving hardware reservations, this will take a while...")
179 reservations, err := ep.api.ListReservations(ctx, ep.config.ProjectId)
180 if err != nil {
181 return nil, fmt.Errorf("failed to list reservations: %w", err)
182 }
183
184 var available []packngo.HardwareReservation
185 var inUse, notProvisionable, penalized int
186 for _, reservation := range reservations {
187 if reservation.Device != nil {
188 inUse++
189 continue
190 }
191 if !reservation.Provisionable {
192 notProvisionable++
193 continue
194 }
195 if ep.badReservations.Penalized(reservation.ID) {
196 penalized++
197 continue
198 }
199 available = append(available, reservation)
200 }
201 klog.Infof("Retrieved hardware reservations: %d (total), %d (available), %d (in use), %d (not provisionable), %d (penalized)", len(reservations), len(available), inUse, notProvisionable, penalized)
202
203 return available, nil
204}
205
206// provision attempts to create a device within Equinix using given Hardware
207// Reservation rsv. The resulting device is registered with BMDB, and tagged as
208// "provided" in the process.
209func (ep *equinixProvider) provision(ctx context.Context, sess *bmdb.Session, rsv packngo.HardwareReservation) (*packngo.Device, error) {
210 klog.Infof("Creating a new device using reservation ID %s.", rsv.ID)
211 hostname := ep.config.DevicePrefix + rsv.ID[:18]
212 kid, err := ep.sshEquinixId(ctx)
213 if err != nil {
214 return nil, err
215 }
216 req := &packngo.DeviceCreateRequest{
217 Hostname: hostname,
218 OS: ep.config.OS,
219 Plan: rsv.Plan.Slug,
220 ProjectID: ep.config.ProjectId,
221 HardwareReservationID: rsv.ID,
222 ProjectSSHKeys: []string{kid},
223 }
224 if ep.config.UseProjectKeys {
225 klog.Warningf("INSECURE: Machines will be created with ALL PROJECT SSH KEYS!")
226 req.ProjectSSHKeys = nil
227 }
228
229 nd, err := ep.api.CreateDevice(ctx, req)
230 if err != nil {
231 return nil, fmt.Errorf("while creating new device within Equinix: %w", err)
232 }
233 klog.Infof("Created a new device within Equinix (RID: %s, PID: %s, HOST: %s)", rsv.ID, nd.ID, hostname)
234
235 slices.DeleteFunc(ep.reservationCache, func(v packngo.HardwareReservation) bool {
236 return rsv.ID == v.ID
237 })
238
239 err = ep.assimilate(ctx, sess, nd.ID)
240 if err != nil {
241 // TODO(serge@monogon.tech) at this point the device at Equinix isn't
242 // matched by a BMDB record. Schedule device deletion or make sure this
243 // case is being handled elsewhere.
244 return nil, err
245 }
246 return nd, nil
247}
248
249// assimilate brings in an already existing machine from Equinix into the BMDB.
250// This is only used in manual testing.
251func (ep *equinixProvider) assimilate(ctx context.Context, sess *bmdb.Session, deviceID string) error {
252 return sess.Transact(ctx, func(q *model.Queries) error {
253 // Create a new machine record within BMDB.
254 m, err := q.NewMachine(ctx)
255 if err != nil {
256 return fmt.Errorf("while creating a new machine record in BMDB: %w", err)
257 }
258
259 // Link the new machine with the Equinix device, and tag it "provided".
260 p := model.MachineAddProvidedParams{
261 MachineID: m.MachineID,
262 ProviderID: deviceID,
263 Provider: model.ProviderEquinix,
264 }
265 klog.Infof("Setting \"provided\" tag (ID: %s, PID: %s, Provider: %s).", p.MachineID, p.ProviderID, p.Provider)
266 if err := q.MachineAddProvided(ctx, p); err != nil {
267 return fmt.Errorf("while tagging machine active: %w", err)
268 }
269 return nil
270 })
271}
272
273// sshEquinixGet looks up the Equinix key matching providerConfig.KeyLabel,
274// returning its packngo.SSHKey instance.
275func (ep *equinixProvider) sshEquinix(ctx context.Context) (*packngo.SSHKey, error) {
276 ks, err := ep.api.ListSSHKeys(ctx)
277 if err != nil {
278 return nil, fmt.Errorf("while listing SSH keys: %w", err)
279 }
280
281 for _, k := range ks {
282 if k.Label == ep.config.KeyLabel {
283 return &k, nil
284 }
285 }
286 return nil, NoSuchKey
287}
288
289// sshEquinixId looks up the Equinix key identified by providerConfig.KeyLabel,
290// returning its Equinix-assigned UUID.
291func (ep *equinixProvider) sshEquinixId(ctx context.Context) (string, error) {
292 k, err := ep.sshEquinix(ctx)
293 if err != nil {
294 return "", err
295 }
296 return k.ID, nil
297}
298
299// sshEquinixUpdate makes sure the existing SSH key registered with Equinix
300// matches the one from sshPub.
301func (ep *equinixProvider) sshEquinixUpdate(ctx context.Context, kid string) error {
302 pub, err := ep.sshKey.PublicKey()
303 if err != nil {
304 return err
305 }
306 _, err = ep.api.UpdateSSHKey(ctx, kid, &packngo.SSHKeyUpdateRequest{
307 Key: &pub,
308 })
309 if err != nil {
310 return fmt.Errorf("while updating the SSH key: %w", err)
311 }
312 return nil
313}
314
315// sshEquinixUpload registers a new SSH key from sshPub.
316func (ep *equinixProvider) sshEquinixUpload(ctx context.Context) error {
317 pub, err := ep.sshKey.PublicKey()
318 if err != nil {
319 return fmt.Errorf("while generating public key: %w", err)
320 }
321 _, err = ep.api.CreateSSHKey(ctx, &packngo.SSHKeyCreateRequest{
322 Label: ep.config.KeyLabel,
323 Key: pub,
324 ProjectID: ep.config.ProjectId,
325 })
326 if err != nil {
327 return fmt.Errorf("while creating an SSH key: %w", err)
328 }
329 return nil
330}
331
332// SSHEquinixEnsure initializes the locally managed SSH key (from a persistence
333// path or explicitly set key) and updates or uploads it to Equinix. The key is
334// generated as needed The key is generated as needed
335func (ep *equinixProvider) SSHEquinixEnsure(ctx context.Context) error {
336 k, err := ep.sshEquinix(ctx)
337 switch err {
338 case NoSuchKey:
339 if err := ep.sshEquinixUpload(ctx); err != nil {
340 return fmt.Errorf("while uploading key: %w", err)
341 }
342 return nil
343 case nil:
344 if err := ep.sshEquinixUpdate(ctx, k.ID); err != nil {
345 return fmt.Errorf("while updating key: %w", err)
346 }
347 return nil
348 default:
349 return err
350 }
351}
352
353// managedDevices provides a map of device provider IDs to matching
354// packngo.Device instances. It calls Equinix API's ListDevices. The returned
355// devices are filtered according to DevicePrefix provided through Opts. The
356// returned error value, if not nil, will originate in wrapngo.
357func (ep *equinixProvider) managedDevices(ctx context.Context) (map[string]packngo.Device, error) {
358 ds, err := ep.api.ListDevices(ctx, ep.config.ProjectId)
359 if err != nil {
360 return nil, err
361 }
362 dm := map[string]packngo.Device{}
363 for _, d := range ds {
364 if strings.HasPrefix(d.Hostname, ep.config.DevicePrefix) {
365 dm[d.ID] = d
366 }
367 }
368 return dm, nil
369}