cloud: split shepherd up

Change-Id: I8e386d9eaaf17543743e1e8a37a8d71426910d59
Reviewed-on: https://review.monogon.dev/c/monogon/+/2213
Reviewed-by: Serge Bazanski <serge@monogon.tech>
Tested-by: Jenkins CI
diff --git a/cloud/equinix/wrapngo/BUILD.bazel b/cloud/equinix/wrapngo/BUILD.bazel
new file mode 100644
index 0000000..1574a6a
--- /dev/null
+++ b/cloud/equinix/wrapngo/BUILD.bazel
@@ -0,0 +1,31 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+go_library(
+    name = "wrapngo",
+    srcs = [
+        "duct_tape.go",
+        "metrics.go",
+        "wrapn.go",
+    ],
+    importpath = "source.monogon.dev/cloud/equinix/wrapngo",
+    visibility = ["//visibility:public"],
+    deps = [
+        "@com_github_cenkalti_backoff_v4//:backoff",
+        "@com_github_google_uuid//:uuid",
+        "@com_github_packethost_packngo//:packngo",
+        "@com_github_prometheus_client_golang//prometheus",
+        "@io_k8s_klog_v2//:klog",
+    ],
+)
+
+go_test(
+    name = "wrapngo_test",
+    timeout = "eternal",
+    srcs = ["wrapngo_live_test.go"],
+    args = ["-test.v"],
+    embed = [":wrapngo"],
+    deps = [
+        "@com_github_packethost_packngo//:packngo",
+        "@org_golang_x_crypto//ssh",
+    ],
+)
diff --git a/cloud/equinix/wrapngo/duct_tape.go b/cloud/equinix/wrapngo/duct_tape.go
new file mode 100644
index 0000000..d5dab7c
--- /dev/null
+++ b/cloud/equinix/wrapngo/duct_tape.go
@@ -0,0 +1,126 @@
+package wrapngo
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net/http"
+	"time"
+
+	"github.com/cenkalti/backoff/v4"
+	"github.com/packethost/packngo"
+	"k8s.io/klog/v2"
+)
+
+// wrap runs fn inside some reliability-increasing duct tape: context support
+// and backoff retries for intermittent connectivity issues. This lets us use
+// packngo code instead of writing our own API stub for Equinix Metal.
+//
+// A call slot is first taken from cl.serializer (returning the context error
+// if ctx expires while waiting) and released when wrap returns. fn is then
+// retried using cl.o.BackOff() until it succeeds, returns a 'permanent' error
+// (see isPermanentEquinixError) or the given context expires. fn is called
+// with a brand new packngo client tied to ctx, whose HTTP transport feeds
+// cl.metrics and logs requests/responses at verbosity 5 (or greater).
+//
+// The wrapped fn can be either just a plain packngo method or some complicated
+// idempotent logic, as long as it cooperates with the above contract.
+func wrap[U any](ctx context.Context, cl *client, fn func(*packngo.Client) (U, error)) (U, error) {
+	var zero U
+	if err := cl.serializer.up(ctx); err != nil {
+		return zero, err
+	}
+	defer cl.serializer.down()
+
+	bc := backoff.WithContext(cl.o.BackOff(), ctx)
+	pngo, err := cl.clientForContext(ctx)
+	if err != nil {
+		// Generally this shouldn't happen other than with programming errors, so we
+		// don't back this off.
+		return zero, fmt.Errorf("could not create equinix client: %w", err)
+	}
+
+	var res U
+	err = backoff.Retry(func() error {
+		res, err = fn(pngo)
+		if isPermanentEquinixError(err) {
+			return backoff.Permanent(err)
+		}
+		return err
+	}, bc)
+	if err != nil {
+		return zero, err
+	}
+	return res, nil
+}
+
+type injectContextRoundTripper struct {
+	ctx      context.Context   // context attached to every outgoing request
+	original http.RoundTripper // transport that actually performs the request
+	metrics  *metricsSet       // notified after each round trip; may be nil
+}
+
+// RoundTrip sends req through the original transport with r.ctx as its
+// context, reporting the outcome to r.metrics and logging it at verbosity 5.
+func (r *injectContextRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
+	klog.V(5).Infof("Request -> %v", req.URL.String())
+	begin := time.Now()
+	resp, rtErr := r.original.RoundTrip(req.WithContext(r.ctx))
+	r.metrics.onAPIRequestDone(req, resp, rtErr, time.Since(begin))
+	if rtErr != nil {
+		klog.V(5).Infof("HTTP error <- %v", rtErr)
+		return resp, rtErr
+	}
+	klog.V(5).Infof("Response <- %v", resp.Status)
+	return resp, nil
+}
+
+// clientForContext returns a new packngo client authenticated with c's
+// credentials. Its HTTP transport binds every request to ctx and reports
+// request results to c.metrics.
+func (c *client) clientForContext(ctx context.Context) (*packngo.Client, error) {
+	rt := &injectContextRoundTripper{ctx: ctx, original: http.DefaultTransport, metrics: c.metrics}
+	return packngo.NewClient(
+		packngo.WithAuth(c.username, c.token),
+		packngo.WithHTTPClient(&http.Client{Transport: rt}),
+	)
+}
+
+// httpStatusCode extracts the HTTP status code from error values returned by
+// packngo methods, or -1 if err carries no packngo error with a response.
+func httpStatusCode(err error) int {
+	var er *packngo.ErrorResponse
+	if errors.As(err, &er) && er != nil && er.Response != nil {
+		return er.Response.StatusCode
+	}
+	return -1
+}
+
+// IsNotFound returns true if the given error is an Equinix packngo/wrapngo 'not
+// found' error.
+func IsNotFound(err error) bool {
+	return httpStatusCode(err) == http.StatusNotFound
+}
+
+// isPermanentEquinixError reports whether err is final, ie. whether the retry
+// loop in wrap should stop immediately and hand err back to the caller rather
+// than back off and call the wrapped function again.
+func isPermanentEquinixError(err error) bool {
+	// Invalid argument/state errors produced by the wrapping code itself.
+	for _, sentinel := range []error{ErrRaceLost, ErrNoReservationProvided} {
+		if errors.Is(err, sentinel) {
+			return true
+		}
+	}
+	// Real errors returned from Equinix. Only these statuses are treated as
+	// final; any other status (or a non-HTTP error) is considered transient.
+	switch httpStatusCode(err) {
+	case http.StatusUnauthorized,
+		http.StatusForbidden,
+		http.StatusNotFound,
+		http.StatusUnprocessableEntity:
+		return true
+	default:
+		return false
+	}
+}
diff --git a/cloud/equinix/wrapngo/metrics.go b/cloud/equinix/wrapngo/metrics.go
new file mode 100644
index 0000000..fef506b
--- /dev/null
+++ b/cloud/equinix/wrapngo/metrics.go
@@ -0,0 +1,129 @@
+package wrapngo
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net/http"
+	"regexp"
+	"strings"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"k8s.io/klog/v2"
+)
+
+// metricsSet contains all the Prometheus metrics collected by wrapngo.
+type metricsSet struct {
+	requestLatencies *prometheus.HistogramVec // request latency by endpoint and status code
+	waiting          prometheus.GaugeFunc     // reports the serializer's waiting count
+	inFlight         prometheus.GaugeFunc     // reports the serializer's usage count
+}
+
+func newMetricsSet(ser *serializer) *metricsSet {
+	return &metricsSet{
+		requestLatencies: prometheus.NewHistogramVec(
+			prometheus.HistogramOpts{
+				Name: "equinix_api_latency",
+				Help: "Equinix API request latency in seconds, partitioned by endpoint status code",
+			},
+			[]string{"endpoint", "status_code"},
+		),
+		waiting: prometheus.NewGaugeFunc(
+			prometheus.GaugeOpts{
+				Name: "equinix_api_waiting",
+				Help: "Number of API requests pending to be sent to Equinix but waiting on semaphore",
+			},
+			func() float64 {
+				_, waiting := ser.stats()
+				return float64(waiting)
+			},
+		),
+		inFlight: prometheus.NewGaugeFunc(
+			prometheus.GaugeOpts{
+				Name: "equinix_api_in_flight",
+				Help: "Number of API requests currently being processed by Equinix",
+			},
+			func() float64 {
+				inFlight, _ := ser.stats()
+				return float64(inFlight)
+			},
+		),
+	}
+}
+
+// getEndpointForPath converts from an Equinix API method and path (eg.
+// /metal/v1/devices/deadbeef) into an 'endpoint' name, which is an imaginary,
+// Monogon-specific name for the API endpoint accessed by this call.
+//
+// The /metal/v1 prefix is stripped before the path is compared against the
+// endpointNames table. If nothing matches, 'Unknown' is returned and a warning
+// is logged.
+//
+// We partition request statistics per API 'endpoint' rather than per
+// high-level packngo function name because one packngo function call might
+// actually emit multiple HTTP API requests - so the low-level requests are the
+// only place where statistics can be gathered reliably.
+func getEndpointForPath(method, path string) string {
+	trimmed := strings.TrimPrefix(path, "/metal/v1")
+	for endpoint := range endpointNames {
+		if m := endpointNames[endpoint]; m.matches(method, trimmed) {
+			return endpoint
+		}
+	}
+	klog.Warningf("Unknown Equinix API %s %s - cannot determine metric endpoint name", method, trimmed)
+	return "Unknown"
+}
+
+// requestMatch describes one class of HTTP requests: an exact method plus a
+// regular expression applied to the request path.
+type requestMatch struct {
+	method string
+	regexp *regexp.Regexp
+}
+
+// matches reports whether the given method equals r.method and the given path
+// satisfies r.regexp. The regexp is not consulted when the method differs.
+func (r *requestMatch) matches(method, path string) bool {
+	return r.method == method && r.regexp.MatchString(path)
+}
+
+// endpointNames maps Monogon-specific endpoint names to the HTTP method and
+// path (relative to /metal/v1) of the Equinix API request they stand for.
+var endpointNames = map[string]requestMatch{
+	"GetDevice":           {"GET", regexp.MustCompile(`^/devices/[^/]+$`)},
+	"ListDevices":         {"GET", regexp.MustCompile(`^/(organizations|projects)/[^/]+/devices$`)},
+	"CreateDevice":        {"POST", regexp.MustCompile(`^/projects/[^/]+/devices$`)},
+	"ListReservations":    {"GET", regexp.MustCompile(`^/projects/[^/]+/hardware-reservations$`)},
+	"ListSSHKeys":         {"GET", regexp.MustCompile(`^/ssh-keys$`)},
+	"CreateSSHKey":        {"POST", regexp.MustCompile(`^(/projects/[^/]+)?/ssh-keys$`)},
+	"GetSSHKey":           {"GET", regexp.MustCompile(`^/ssh-keys/[^/]+$`)},
+	"UpdateSSHKey":        {"PATCH", regexp.MustCompile(`^/ssh-keys/[^/]+$`)},
+	"PerformDeviceAction": {"POST", regexp.MustCompile(`^/devices/[^/]+/actions$`)},
+}
+
+// onAPIRequestDone is called by the wrapngo code on every API response from
+// Equinix, and records the given parameters into metrics. It is a no-op on a
+// nil metricsSet. The status_code label is the numeric HTTP status, or "ctx
+// canceled" / "deadline exceeded" / "unknown" when no usable response exists.
+func (m *metricsSet) onAPIRequestDone(req *http.Request, res *http.Response, err error, latency time.Duration) {
+	if m == nil {
+		return
+	}
+
+	code := "unknown"
+	switch {
+	case err == nil && res != nil:
+		code = fmt.Sprintf("%d", res.StatusCode)
+	case errors.Is(err, context.Canceled):
+		code = "ctx canceled"
+	case errors.Is(err, context.DeadlineExceeded):
+		code = "deadline exceeded"
+	default:
+		// Log the transport error itself; the response carries no detail here.
+		klog.Warningf("Unexpected HTTP result: req %s %s, error: %v", req.Method, req.URL.Path, err)
+	}
+
+	endpoint := getEndpointForPath(req.Method, req.URL.Path)
+	m.requestLatencies.With(prometheus.Labels{"endpoint": endpoint, "status_code": code}).Observe(latency.Seconds())
+}
diff --git a/cloud/equinix/wrapngo/wrapn.go b/cloud/equinix/wrapngo/wrapn.go
new file mode 100644
index 0000000..7bd4522
--- /dev/null
+++ b/cloud/equinix/wrapngo/wrapn.go
@@ -0,0 +1,433 @@
+// Package wrapngo wraps packngo methods providing the following usability
+// enhancements:
+// - API call rate limiting
+// - resource-aware call retries
+// - use of a configurable back-off algorithm implementation
+// - context awareness
+//
+// The implementation is provided with the following caveats:
+//
+// There can be only one call in flight. Concurrent calls to API-related
+// methods of the same client will block. Calls returning packngo structs will
+// return nil data when a non-nil error value is returned. An
+// os.ErrDeadlineExceeded will be returned after the underlying API calls time
+// out beyond the chosen back-off algorithm implementation's maximum allowed
+// retry interval. Other errors, excluding context.Canceled and
+// context.DeadlineExceeded, indicate either an error originating at Equinix'
+// API endpoint (which may still stem from invalid call inputs), or a network
+// error.
+//
+// Packngo wrappers included below may return timeout errors even after the
+// wrapped calls succeed in the event server reply could not have been
+// received.
+//
+// This implies that effects of mutating calls can't always be verified
+// atomically, requiring explicit synchronization between API users, regardless
+// of the retry/recovery logic used.
+//
+// Having that in mind, some call wrappers exposed by this package will attempt
+// to recover from this kind of situations by requesting information on any
+// resources created, and retrying the call if needed. This approach assumes
+// any concurrent mutating API users will be synchronized, as it should be in
+// any case.
+//
+// Another way of handling this problem would be to leave it up to the user to
+// retry calls if needed, though this would leak Equinix Metal API, and
+// complicate implementations depending on this package. Due to that, the prior
+// approach was chosen.
+package wrapngo
+
+import (
+	"context"
+	"errors"
+	"flag"
+	"fmt"
+	"net/http"
+	"sync/atomic"
+	"time"
+
+	"github.com/cenkalti/backoff/v4"
+	"github.com/google/uuid"
+	"github.com/packethost/packngo"
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// Opts conveys configurable Client parameters.
+type Opts struct {
+	// User and APIKey are the credentials used to authenticate with
+	// Metal API.
+	User   string
+	APIKey string
+
+	// Optional parameters:
+
+	// BackOff controls the client's behavior in the event of API calls failing
+	// due to IO timeouts by adjusting the lower bound on time taken between
+	// subsequent calls. If nil, an exponential backoff is used.
+	BackOff func() backoff.BackOff
+
+	// APIRate is the minimum time taken between subsequent API calls.
+	APIRate time.Duration
+
+	// Parallelism defines how many calls to the Equinix API will be issued in
+	// parallel. When this limit is reached, subsequent attempts to call the API will
+	// block. The order of serving of pending calls is currently undefined.
+	//
+	// If not defined (ie. 0), defaults to 1.
+	Parallelism int
+
+	// MetricsRegistry, if non-nil, is where the client's metrics get registered.
+	MetricsRegistry *prometheus.Registry
+}
+
+func (o *Opts) RegisterFlags() {
+	flag.StringVar(&o.User, "equinix_api_username", "", "Username for Equinix API")
+	flag.StringVar(&o.APIKey, "equinix_api_key", "", "Key/token/password for Equinix API")
+	flag.IntVar(&o.Parallelism, "equinix_parallelism", 3, "How many parallel connections to the Equinix API will be allowed")
+}
+
+// Client is a limited interface of methods that the Shepherd uses on Equinix. It
+// is provided to allow for dependency injection of a fake equinix API for tests.
+type Client interface {
+	// GetDevice wraps packngo's cl.Devices.Get.
+	//
+	// TODO(q3k): remove unused pid parameter.
+	GetDevice(ctx context.Context, pid, did string, opts *packngo.ListOptions) (*packngo.Device, error)
+	// ListDevices wraps packngo's cl.Device.List.
+	ListDevices(ctx context.Context, pid string) ([]packngo.Device, error)
+	// CreateDevice attempts to create a new device according to the provided
+	// request. The request _must_ configure a HardwareReservationID. This call
+	// attempts to be as idempotent as possible, and will return ErrRaceLost if a
+	// retry was needed but in the meantime the requested hardware reservation from
+	// which this machine was requested got lost.
+	CreateDevice(ctx context.Context, request *packngo.DeviceCreateRequest) (*packngo.Device, error)
+
+	UpdateDevice(ctx context.Context, id string, request *packngo.DeviceUpdateRequest) (*packngo.Device, error)
+	RebootDevice(ctx context.Context, did string) error
+	DeleteDevice(ctx context.Context, id string) error
+
+	// ListReservations returns a complete list of hardware reservations associated
+	// with project pid. This is an expensive method that takes a while to execute,
+	// handle with care.
+	ListReservations(ctx context.Context, pid string) ([]packngo.HardwareReservation, error)
+	// MoveReservation moves a reserved device to the given project.
+	MoveReservation(ctx context.Context, hardwareReservationDID, projectID string) (*packngo.HardwareReservation, error)
+
+	// ListSSHKeys wraps packngo's cl.Keys.List.
+	ListSSHKeys(ctx context.Context) ([]packngo.SSHKey, error)
+	// CreateSSHKey is idempotent - the key label can be used only once. Further
+	// calls referring to the same label and key will not yield errors. See the
+	// package comment for more info on this method's behavior and returned error
+	// values.
+	CreateSSHKey(ctx context.Context, req *packngo.SSHKeyCreateRequest) (*packngo.SSHKey, error)
+	// UpdateSSHKey is idempotent - values included in r can be applied only once,
+	// while subsequent updates using the same data don't produce errors. See the
+	// package comment for information on this method's behavior and returned error
+	// values.
+	UpdateSSHKey(ctx context.Context, kid string, req *packngo.SSHKeyUpdateRequest) (*packngo.SSHKey, error)
+
+	Close()
+}
+
+// client implements the Client interface.
+type client struct {
+	username string
+	token    string
+	o        *Opts
+	rlt      *time.Ticker
+
+	serializer *serializer
+	metrics    *metricsSet
+}
+
+// serializer is an N-semaphore channel (configured by opts.Parallelism) which is
+// used to limit the number of concurrent calls to the Equinix API.
+//
+// In addition, it implements some simple waiting/usage statistics for
+// metrics/introspection.
+type serializer struct {
+	sem     chan struct{} // semaphore; each element sitting in it is a held slot
+	usage   int64         // number of slots currently held, accessed atomically
+	waiting int64         // number of callers blocked in up, accessed atomically
+}
+
+// up blocks until the serializer has at least one available concurrent call
+// slot. If the given context expires before such a slot is available, the
+// context error is returned and no slot is held.
+func (s *serializer) up(ctx context.Context) error {
+	atomic.AddInt64(&s.waiting, 1)
+	select {
+	case s.sem <- struct{}{}: // a slot was free; it is now held by the caller
+		atomic.AddInt64(&s.waiting, -1)
+		atomic.AddInt64(&s.usage, 1)
+		return nil
+	case <-ctx.Done(): // gave up waiting
+		atomic.AddInt64(&s.waiting, -1)
+		return ctx.Err()
+	}
+}
+
+// down releases a previously acquired concurrent call slot.
+func (s *serializer) down() {
+	atomic.AddInt64(&s.usage, -1)
+	<-s.sem
+}
+
+// stats returns the number of in-flight and waiting-for-semaphore requests.
+func (s *serializer) stats() (usage, waiting int64) {
+	usage = atomic.LoadInt64(&s.usage)
+	waiting = atomic.LoadInt64(&s.waiting)
+	return
+}
+
+// New creates a Client instance based on Opts. PACKNGO_DEBUG environment
+// variable can be set prior to the below call to enable verbose packngo
+// debug logs.
+func New(opts *Opts) Client {
+	return new(opts)
+}
+
+// new is the concrete-typed constructor behind New. Unset or non-positive
+// fields of opts are overwritten in place with their defaults.
+func new(opts *Opts) *client {
+	// Apply the defaults; negative values would otherwise panic further below.
+	if opts.APIRate <= 0 {
+		opts.APIRate = 2 * time.Second
+	}
+	if opts.BackOff == nil {
+		opts.BackOff = func() backoff.BackOff { return backoff.NewExponentialBackOff() }
+	}
+	if opts.Parallelism <= 0 {
+		opts.Parallelism = 1
+	}
+
+	cl := &client{
+		username: opts.User,
+		token:    opts.APIKey,
+		o:        opts,
+		rlt:      time.NewTicker(opts.APIRate),
+
+		serializer: &serializer{
+			sem: make(chan struct{}, opts.Parallelism),
+		},
+	}
+	if opts.MetricsRegistry != nil {
+		ms := newMetricsSet(cl.serializer)
+		opts.MetricsRegistry.MustRegister(ms.inFlight, ms.waiting, ms.requestLatencies)
+		cl.metrics = ms
+	}
+	return cl
+}
+
+func (c *client) Close() {
+	c.rlt.Stop()
+}
+
+var (
+	ErrRaceLost              = errors.New("race lost with another API user")
+	ErrNoReservationProvided = errors.New("hardware reservation must be set")
+)
+
+// PowerOffDevice wraps packngo's cl.Devices.PowerOff for the device ID pid.
+func (e *client) PowerOffDevice(ctx context.Context, pid string) error {
+	_, err := wrap(ctx, e, func(cl *packngo.Client) (resp *packngo.Response, err error) {
+		if resp, err = cl.Devices.PowerOff(pid); err != nil {
+			return nil, fmt.Errorf("Devices.PowerOff: %w", err)
+		}
+		return resp, nil
+	})
+	return err
+}
+
+// PowerOnDevice wraps packngo's cl.Devices.PowerOn for the device ID pid.
+func (e *client) PowerOnDevice(ctx context.Context, pid string) error {
+	_, err := wrap(ctx, e, func(cl *packngo.Client) (resp *packngo.Response, err error) {
+		if resp, err = cl.Devices.PowerOn(pid); err != nil {
+			return nil, fmt.Errorf("Devices.PowerOn: %w", err)
+		}
+		return resp, nil
+	})
+	return err
+}
+
+// DeleteDevice wraps packngo's cl.Devices.Delete, without forcing deletion.
+func (e *client) DeleteDevice(ctx context.Context, id string) error {
+	_, err := wrap(ctx, e, func(cl *packngo.Client) (resp *packngo.Response, err error) {
+		if resp, err = cl.Devices.Delete(id, false); err != nil {
+			return nil, fmt.Errorf("Devices.Delete: %w", err)
+		}
+		return resp, nil
+	})
+	return err
+}
+
+// CreateDevice creates a device from r on the hardware reservation it names.
+func (e *client) CreateDevice(ctx context.Context, r *packngo.DeviceCreateRequest) (*packngo.Device, error) {
+	if r.HardwareReservationID == "" {
+		return nil, ErrNoReservationProvided
+	}
+	// Add a tag to the request to detect if someone snatches a hardware reservation
+	// from under us.
+	witnessTag := "wrapngo-idempotency-" + uuid.New().String()
+	r.Tags = append(r.Tags, witnessTag)
+
+	return wrap(ctx, e, func(cl *packngo.Client) (*packngo.Device, error) {
+		// Does the device already exist?
+		res, _, err := cl.HardwareReservations.Get(r.HardwareReservationID, nil)
+		if err != nil {
+			return nil, fmt.Errorf("couldn't check if device already exists: %w", err)
+		}
+		if res == nil {
+			return nil, fmt.Errorf("unexpected nil response")
+		}
+		if dev := res.Device; dev != nil {
+			// A device carrying our witness tag was created by an earlier attempt of
+			// this very call; any other device means we lost the race.
+			for _, tag := range dev.Tags {
+				if tag == witnessTag {
+					return dev, nil
+				}
+			}
+			return nil, ErrRaceLost
+		}
+
+		// No device yet. Try to create it.
+		created, _, err := cl.Devices.Create(r)
+		if err != nil {
+			// In case of a transient failure (eg. network issue), we retry the whole
+			// operation, which means we first check again if the device already exists. If
+			// it's a permanent error from the API, the backoff logic will fail immediately.
+			return nil, fmt.Errorf("couldn't create device: %w", err)
+		}
+		return created, nil
+	})
+}
+
+func (e *client) UpdateDevice(ctx context.Context, id string, r *packngo.DeviceUpdateRequest) (*packngo.Device, error) {
+	return wrap(ctx, e, func(cl *packngo.Client) (*packngo.Device, error) {
+		dev, _, err := cl.Devices.Update(id, r)
+		return dev, err
+	})
+}
+
+func (e *client) ListDevices(ctx context.Context, pid string) ([]packngo.Device, error) {
+	return wrap(ctx, e, func(cl *packngo.Client) ([]packngo.Device, error) {
+		// to increase the chances of a stable pagination, we sort the devices by hostname
+		res, _, err := cl.Devices.List(pid, &packngo.GetOptions{SortBy: "hostname"})
+		return res, err
+	})
+}
+
+func (e *client) GetDevice(ctx context.Context, pid, did string, opts *packngo.ListOptions) (*packngo.Device, error) {
+	return wrap(ctx, e, func(cl *packngo.Client) (*packngo.Device, error) {
+		d, _, err := cl.Devices.Get(did, opts)
+		return d, err
+	})
+}
+
+// Currently unexported, only used in tests.
+func (e *client) deleteDevice(ctx context.Context, did string) error {
+	_, err := wrap(ctx, e, func(cl *packngo.Client) (*struct{}, error) {
+		_, err := cl.Devices.Delete(did, false)
+		if httpStatusCode(err) == http.StatusNotFound {
+			// 404s may pop up as an after effect of running the back-off
+			// algorithm, and as such should not be propagated.
+			return nil, nil
+		}
+		return nil, err
+	})
+	return err
+}
+
+func (e *client) ListReservations(ctx context.Context, pid string) ([]packngo.HardwareReservation, error) {
+	return wrap(ctx, e, func(cl *packngo.Client) ([]packngo.HardwareReservation, error) {
+		res, _, err := cl.HardwareReservations.List(pid, &packngo.ListOptions{Includes: []string{"facility", "device"}})
+		return res, err
+	})
+}
+
+func (e *client) MoveReservation(ctx context.Context, hardwareReservationDID, projectID string) (*packngo.HardwareReservation, error) {
+	return wrap(ctx, e, func(cl *packngo.Client) (*packngo.HardwareReservation, error) {
+		hr, _, err := cl.HardwareReservations.Move(hardwareReservationDID, projectID)
+		if err != nil {
+			return nil, fmt.Errorf("HardwareReservations.Move: %w", err)
+		}
+		return hr, err
+	})
+}
+
+func (e *client) CreateSSHKey(ctx context.Context, r *packngo.SSHKeyCreateRequest) (*packngo.SSHKey, error) {
+	return wrap(ctx, e, func(cl *packngo.Client) (*packngo.SSHKey, error) {
+		// Does the key already exist? A label clash is final, so it isn't retried.
+		ks, _, err := cl.SSHKeys.List()
+		if err != nil {
+			return nil, fmt.Errorf("SSHKeys.List: %w", err)
+		}
+		for _, k := range ks {
+			if k.Label == r.Label {
+				if k.Key != r.Key {
+					return nil, backoff.Permanent(fmt.Errorf("key label already in use for a different key"))
+				}
+				return &k, nil
+			}
+		}
+
+		// No key yet. Try to create it.
+		k, _, err := cl.SSHKeys.Create(r)
+		if err != nil {
+			return nil, fmt.Errorf("SSHKeys.Create: %w", err)
+		}
+		return k, nil
+	})
+}
+
+func (e *client) UpdateSSHKey(ctx context.Context, id string, r *packngo.SSHKeyUpdateRequest) (*packngo.SSHKey, error) {
+	return wrap(ctx, e, func(cl *packngo.Client) (*packngo.SSHKey, error) {
+		k, _, err := cl.SSHKeys.Update(id, r)
+		if err != nil {
+			return nil, fmt.Errorf("SSHKeys.Update: %w", err)
+		}
+		return k, err
+	})
+}
+
+// Currently unexported, only used in tests.
+func (e *client) deleteSSHKey(ctx context.Context, id string) error {
+	_, err := wrap(ctx, e, func(cl *packngo.Client) (struct{}, error) {
+		_, err := cl.SSHKeys.Delete(id)
+		if err != nil {
+			return struct{}{}, fmt.Errorf("SSHKeys.Delete: %w", err)
+		}
+		return struct{}{}, err
+	})
+	return err
+}
+
+func (e *client) ListSSHKeys(ctx context.Context) ([]packngo.SSHKey, error) {
+	return wrap(ctx, e, func(cl *packngo.Client) ([]packngo.SSHKey, error) {
+		ks, _, err := cl.SSHKeys.List()
+		if err != nil {
+			return nil, fmt.Errorf("SSHKeys.List: %w", err)
+		}
+		return ks, nil
+	})
+}
+
+// Currently unexported, only used in tests.
+func (e *client) getSSHKey(ctx context.Context, id string) (*packngo.SSHKey, error) {
+	return wrap(ctx, e, func(cl *packngo.Client) (*packngo.SSHKey, error) {
+		k, _, err := cl.SSHKeys.Get(id, nil)
+		if err != nil {
+			return nil, fmt.Errorf("SSHKeys.Get: %w", err)
+		}
+		return k, nil
+	})
+}
+
+func (e *client) RebootDevice(ctx context.Context, did string) error {
+	_, err := wrap(ctx, e, func(cl *packngo.Client) (struct{}, error) {
+		_, err := cl.Devices.Reboot(did)
+		return struct{}{}, err
+	})
+	return err
+}
diff --git a/cloud/equinix/wrapngo/wrapngo_live_test.go b/cloud/equinix/wrapngo/wrapngo_live_test.go
new file mode 100644
index 0000000..549071a
--- /dev/null
+++ b/cloud/equinix/wrapngo/wrapngo_live_test.go
@@ -0,0 +1,344 @@
+package wrapngo
+
+import (
+	"context"
+	"crypto/ed25519"
+	"crypto/rand"
+	"errors"
+	"fmt"
+	"log"
+	"os"
+	"testing"
+	"time"
+
+	"github.com/packethost/packngo"
+	"golang.org/x/crypto/ssh"
+)
+
+type liveTestClient struct {
+	cl  *client
+	ctx context.Context
+
+	apipid string
+	apios  string
+
+	sshKeyLabel        string
+	testDeviceHostname string
+}
+
+func newLiveTestClient(t *testing.T) *liveTestClient {
+	t.Helper()
+
+	apiuser := os.Getenv("EQUINIX_USER")
+	apikey := os.Getenv("EQUINIX_APIKEY")
+	apipid := os.Getenv("EQUINIX_PROJECT_ID")
+	apios := os.Getenv("EQUINIX_DEVICE_OS")
+
+	if apiuser == "" {
+		t.Skip("EQUINIX_USER must be set.")
+	}
+	if apikey == "" {
+		t.Skip("EQUINIX_APIKEY must be set.")
+	}
+	if apipid == "" {
+		t.Skip("EQUINIX_PROJECT_ID must be set.")
+	}
+	if apios == "" {
+		t.Skip("EQUINIX_DEVICE_OS must be set.")
+	}
+	ctx, ctxC := context.WithCancel(context.Background())
+	t.Cleanup(ctxC)
+	return &liveTestClient{
+		cl: new(&Opts{
+			User:   apiuser,
+			APIKey: apikey,
+		}),
+		ctx: ctx,
+
+		apipid: apipid,
+		apios:  apios,
+
+		sshKeyLabel:        "shepherd-livetest-client",
+		testDeviceHostname: "shepherd-livetest-device",
+	}
+}
+
+// awaitDeviceState returns nil after device matching the id reaches one of the
+// provided states. It will return a non-nil value in case of an API error, and
+// particularly if there exists no device matching id.
+func (l *liveTestClient) awaitDeviceState(t *testing.T, id string, states ...string) error {
+	t.Helper()
+
+	for {
+		d, err := l.cl.GetDevice(l.ctx, l.apipid, id, nil)
+		if err != nil {
+			if errors.Is(err, os.ErrDeadlineExceeded) {
+				continue
+			}
+			return fmt.Errorf("while fetching device info: %w", err)
+		}
+		if d == nil {
+			return fmt.Errorf("expected the test device (ID: %s) to exist.", id)
+		}
+		for _, s := range states {
+			if d.State == s {
+				return nil
+			}
+		}
+		t.Logf("Waiting for device to be provisioned (ID: %s, current state: %q)", id, d.State)
+		time.Sleep(time.Second)
+	}
+}
+
+// cleanup ensures both the test device and the test key are deleted at
+// Equinix.
+func (l *liveTestClient) cleanup(t *testing.T) {
+	t.Helper()
+
+	t.Logf("Cleaning up.")
+
+	// Ensure the device matching testDeviceHostname is deleted.
+	ds, err := l.cl.ListDevices(l.ctx, l.apipid)
+	if err != nil {
+		log.Fatalf("while listing devices: %v", err)
+	}
+	var td *packngo.Device
+	for _, d := range ds {
+		if d.Hostname == l.testDeviceHostname {
+			td = &d
+			break
+		}
+	}
+	if td != nil {
+		t.Logf("Found a test device (ID: %s) that needs to be deleted before progressing further.", td.ID)
+
+		// Devices currently being provisioned can't be deleted. After it's
+		// provisioned, device's state will match either "active", or "failed".
+		if err := l.awaitDeviceState(t, td.ID, "active", "failed"); err != nil {
+			t.Fatalf("while waiting for device to be provisioned: %v", err)
+		}
+		if err := l.cl.deleteDevice(l.ctx, td.ID); err != nil {
+			t.Fatalf("while deleting test device: %v", err)
+		}
+	}
+
+	// Ensure the key matching sshKeyLabel is deleted.
+	ks, err := l.cl.ListSSHKeys(l.ctx)
+	if err != nil {
+		t.Fatalf("while listing SSH keys: %v", err)
+	}
+	for _, k := range ks {
+		if k.Label == l.sshKeyLabel {
+			t.Logf("Found a SSH test key (ID: %s) - deleting...", k.ID)
+			if err := l.cl.deleteSSHKey(l.ctx, k.ID); err != nil {
+				t.Fatalf("while deleting an SSH key: %v", err)
+			}
+			t.Logf("Deleted a SSH test key (ID: %s).", k.ID)
+		}
+	}
+}
+
+// createSSHAuthKey returns a freshly generated ed25519 SSH public key in
+// OpenSSH authorized_keys format, aborting the test if generation fails.
+func createSSHAuthKey(t *testing.T) string {
+	t.Helper()
+	pub, _, err := ed25519.GenerateKey(rand.Reader)
+	if err != nil {
+		t.Fatalf("while generating SSH key: %v", err)
+	}
+
+	sshpub, err := ssh.NewPublicKey(pub)
+	if err != nil {
+		t.Fatalf("while generating SSH public key: %v", err)
+	}
+	return string(ssh.MarshalAuthorizedKey(sshpub))
+}
+
+// TestLiveAPI performs smoke tests of wrapngo against the real Equinix API. See
+// newLiveTestClient to see which environment variables need to be provided in
+// order for this test to run.
+func TestLiveAPI(t *testing.T) {
+	ltc := newLiveTestClient(t)
+	ltc.cleanup(t)
+
+	cl := ltc.cl
+	ctx := ltc.ctx
+
+	t.Run("ListReservations", func(t *testing.T) {
+		_, err := cl.ListReservations(ctx, ltc.apipid)
+		if err != nil {
+			t.Errorf("while listing hardware reservations: %v", err)
+		}
+	})
+
+	var sshKeyID string
+	t.Run("CreateSSHKey", func(t *testing.T) {
+		nk, err := cl.CreateSSHKey(ctx, &packngo.SSHKeyCreateRequest{
+			Label:     ltc.sshKeyLabel,
+			Key:       createSSHAuthKey(t),
+			ProjectID: ltc.apipid,
+		})
+		if err != nil {
+			t.Fatalf("while creating an SSH key: %v", err)
+		}
+		if nk.Label != ltc.sshKeyLabel {
+			t.Errorf("key labels don't match.")
+		}
+		t.Logf("Created an SSH key (ID: %s)", nk.ID)
+		sshKeyID = nk.ID
+	})
+
+	var dummySSHPK2 string
+	t.Run("UpdateSSHKey", func(t *testing.T) {
+		if sshKeyID == "" {
+			t.Skip("SSH key couldn't have been created - skipping...")
+		}
+
+		dummySSHPK2 = createSSHAuthKey(t)
+		k, err := cl.UpdateSSHKey(ctx, sshKeyID, &packngo.SSHKeyUpdateRequest{
+			Key: &dummySSHPK2,
+		})
+		if err != nil {
+			t.Fatalf("while updating an SSH key: %v", err)
+		}
+		if k.Key != dummySSHPK2 {
+			t.Errorf("updated SSH key doesn't match the original.")
+		}
+	})
+	t.Run("GetSSHKey", func(t *testing.T) {
+		if sshKeyID == "" {
+			t.Skip("SSH key couldn't have been created - skipping...")
+		}
+
+		k, err := cl.getSSHKey(ctx, sshKeyID)
+		if err != nil {
+			t.Fatalf("while getting an SSH key: %v", err)
+		}
+		if k.Key != dummySSHPK2 {
+			t.Errorf("got key contents that don't match the original.")
+		}
+	})
+	t.Run("ListSSHKeys", func(t *testing.T) {
+		if sshKeyID == "" {
+			t.Skip("SSH key couldn't have been created - skipping...")
+		}
+
+		ks, err := cl.ListSSHKeys(ctx)
+		if err != nil {
+			t.Fatalf("while listing SSH keys: %v", err)
+		}
+
+		// Check that our key is part of the list.
+		found := false
+		for _, k := range ks {
+			if k.ID == sshKeyID {
+				found = true
+				break
+			}
+		}
+		if !found {
+			t.Errorf("SSH key not listed.")
+		}
+	})
+
+	var testDevice *packngo.Device
+	t.Run("CreateDevice", func(t *testing.T) {
+		// Find a provisionable hardware reservation the device will be created with.
+		rvs, err := cl.ListReservations(ctx, ltc.apipid)
+		if err != nil {
+			t.Errorf("while listing hardware reservations: %v", err)
+		}
+		var rv *packngo.HardwareReservation
+		for _, r := range rvs {
+			if r.Provisionable {
+				rv = &r
+				break
+			}
+		}
+		if rv == nil {
+			t.Skip("could not find a provisionable hardware reservation - skipping...")
+		}
+
+		d, err := cl.CreateDevice(ctx, &packngo.DeviceCreateRequest{
+			Hostname:              ltc.testDeviceHostname,
+			OS:                    ltc.apios,
+			Plan:                  rv.Plan.Slug,
+			HardwareReservationID: rv.ID,
+			ProjectID:             ltc.apipid,
+		})
+		if err != nil {
+			t.Fatalf("while creating a device: %v", err)
+		}
+		t.Logf("Created a new test device (ID: %s)", d.ID)
+		testDevice = d
+	})
+	t.Run("GetDevice", func(t *testing.T) {
+		if testDevice == nil {
+			t.Skip("the test device couldn't have been created - skipping...")
+		}
+
+		d, err := cl.GetDevice(ctx, ltc.apipid, testDevice.ID, nil)
+		if err != nil {
+			t.Fatalf("while fetching device info: %v", err)
+		}
+		if d == nil {
+			t.Fatalf("expected the test device (ID: %s) to exist.", testDevice.ID)
+		}
+		if d.ID != testDevice.ID {
+			t.Errorf("got device ID that doesn't match the original.")
+		}
+	})
+	t.Run("ListDevices", func(t *testing.T) {
+		if testDevice == nil {
+			t.Skip("the test device couldn't have been created - skipping...")
+		}
+
+		ds, err := cl.ListDevices(ctx, ltc.apipid)
+		if err != nil {
+			t.Errorf("while listing devices: %v", err)
+		}
+		if len(ds) == 0 {
+			t.Errorf("expected at least one device.")
+		}
+	})
+	t.Run("DeleteDevice", func(t *testing.T) {
+		if testDevice == nil {
+			t.Skip("the test device couldn't have been created - skipping...")
+		}
+
+		// Devices currently being provisioned can't be deleted. After it's
+		// provisioned, device's state will match either "active", or "failed".
+		if err := ltc.awaitDeviceState(t, testDevice.ID, "active", "failed"); err != nil {
+			t.Fatalf("while waiting for device to be provisioned: %v", err)
+		}
+		t.Logf("Deleting the test device (ID: %s)", testDevice.ID)
+		if err := cl.deleteDevice(ctx, testDevice.ID); err != nil {
+			t.Fatalf("while deleting a device: %v", err)
+		}
+		d, err := cl.GetDevice(ctx, ltc.apipid, testDevice.ID, nil)
+		if err != nil && !IsNotFound(err) {
+			t.Fatalf("while fetching device info: %v", err)
+		}
+		if d != nil {
+			t.Fatalf("device should not exist.")
+		}
+		t.Logf("Deleted the test device (ID: %s)", testDevice.ID)
+	})
+	t.Run("DeleteSSHKey", func(t *testing.T) {
+		if sshKeyID == "" {
+			t.Skip("SSH key couldn't have been created - skipping...")
+		}
+
+		t.Logf("Deleting the test SSH key (ID: %s)", sshKeyID)
+		if err := cl.deleteSSHKey(ctx, sshKeyID); err != nil {
+			t.Fatalf("couldn't delete an SSH key: %v", err)
+		}
+		_, err := cl.getSSHKey(ctx, sshKeyID)
+		if err == nil {
+			t.Fatalf("SSH key should not exist")
+		}
+		t.Logf("Deleted the test SSH key (ID: %s)", sshKeyID)
+	})
+
+	ltc.cleanup(t)
+}