cloud/bmaas: test and document upgrades across migrations

Change-Id: I1c405b0f2ecc10331b79d02deb8a63f3b148f502
Reviewed-on: https://review.monogon.dev/c/monogon/+/1566
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
Tested-by: Jenkins CI
diff --git a/cloud/bmaas/DEPLOYING.md b/cloud/bmaas/DEPLOYING.md
new file mode 100644
index 0000000..0dacf0b
--- /dev/null
+++ b/cloud/bmaas/DEPLOYING.md
@@ -0,0 +1,39 @@
+Schema/Version compatibility
+===
+
+Live migration
+---
+
+BMaaS supports live migrating schemas. On startup, every component using the
+BMDB will attempt to migrate the database up to the newest version of the
+schema it was built with.
+
+Components are implemented to support a range of schemas, and operators should
+sequence upgrades in the following way:
+
+1. Make sure that all components are at the newest possible CL, but not so new
+   that they ship a newer version of the schema than is currently running.
+2. Upgrade components in a rolling fashion to a CL version that ships the newest
+   possible schema version which is still compatible with the previous CL
+   versions of the components.
+3. Repeat from point 1 until the newest wanted CL version is running.
+
+| ID | Schema range  | CL range | Notes                        |
+|----|---------------|----------|------------------------------|
+|  0 | < 1672749980  | >= 0     | Initial production schema.   |
+|  1 | >= 1672768890 | >= 1565  | Exponential backoff support. |
+
+For example, if the cluster is at version 1200, it should first be upgraded to a CL below 1565 (to reach row 0), then to CL 1565 or newer (to reach row 1).
+
+Offline migration
+---
+
+For simple deployments, an offline migration is easiest. To perform an offline migration:
+
+1. Turn down all BMaaS components that communicate with the BMDB.
+2. Upgrade all components to the newer version (either newest or otherwise
+   wanted, but all components have to be at the same CL version).
+3. Turn up a single one of the BMaaS components turned down in step 1, making sure the database is migrated.
+4. Turn up the rest of the components.
+
+This allows migrating across many incompatible schema migrations, but requires downtime.
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/migrations_test.go b/cloud/bmaas/bmdb/migrations_test.go
index 052d5e1..9ee4ae2 100644
--- a/cloud/bmaas/bmdb/migrations_test.go
+++ b/cloud/bmaas/bmdb/migrations_test.go
@@ -1,7 +1,10 @@
 package bmdb
 
 import (
+	"context"
 	"testing"
+
+	"source.monogon.dev/cloud/bmaas/bmdb/model"
 )
 
 // TestMigrateUpDown performs a full-up and full-down migration test on an
@@ -57,3 +60,95 @@
 		t.Fatalf("Initial up migration failed: %v", err)
 	}
 }
+
+// TestMigration1681826233 exercises the migration that adds a new nullable
+// field to backoffs. It checks that a backoff written before the migration can
+// still be read through the model afterwards, and that old-style statements
+// which do not know about the new column keep working on the migrated schema.
+func TestMigration1681826233(t *testing.T) {
+	// The two schema versions on either side of the migration. This test
+	// guarantees that versions [minVer, maxVer] can run concurrently in a cluster.
+	minVer := uint(1672749980)
+	maxVer := uint(1681826233)
+
+	ctx, ctxC := context.WithCancel(context.Background())
+	defer ctxC()
+
+	b := dut()
+	conn, err := b.Open(false)
+	if err != nil {
+		t.Fatalf("Starting empty database failed: %v", err)
+	}
+
+	// First, make sure the change can actually progress if we have some backoffs
+	// already.
+	if err := b.Database.MigrateUpToIncluding(minVer); err != nil {
+		t.Fatalf("Migration to minimum version failed: %v", err)
+	}
+
+	// Create machine and old-style backoff.
+	q := model.New(conn.db)
+	machine, err := q.NewMachine(ctx)
+	if err != nil {
+		t.Fatalf("Could not create machine: %v", err)
+	}
+	_, err = conn.db.Exec(`
+		INSERT INTO work_backoff
+		    (machine_id, process, until, cause)
+		VALUES
+		    ($1, 'UnitTest1', now(), 'test');
+	`, machine.MachineID)
+	if err != nil {
+		t.Fatalf("Could not create old-style backoff on old version: %v", err)
+	}
+
+	// Migrate to newer version.
+	if err := b.Database.MigrateUpToIncluding(maxVer); err != nil {
+		t.Fatalf("Migration to maximum version failed: %v", err)
+	}
+
+	// The backoff created before the migration should be read successfully, with
+	// the new field left unset.
+	boffs, err := q.WorkBackoffOf(ctx, model.WorkBackoffOfParams{
+		MachineID: machine.MachineID,
+		Process:   "UnitTest1",
+	})
+	if err != nil {
+		t.Fatalf("Reading backoff failed: %v", err)
+	}
+	if len(boffs) != 1 {
+		t.Errorf("Expected exactly one backoff, got %d", len(boffs))
+	} else if boffs[0].LastIntervalSeconds.Valid {
+		t.Errorf("Expected interval to be NULL")
+	}
+
+	// Simultaneously, any concurrently running bmdb user on an older version should
+	// still be able to insert and read backoffs old style.
+	_, err = conn.db.Exec(`
+		INSERT INTO work_backoff
+		    (machine_id, process, until, cause)
+		VALUES
+		    ($1, 'UnitTest2', now(), 'test');
+	`, machine.MachineID)
+	if err != nil {
+		t.Fatalf("Could not create old-style backoff on new version: %v", err)
+	}
+	rows, err := conn.db.Query(`
+		SELECT machine_id, process, until, cause FROM work_backoff
+	`)
+	if err != nil {
+		t.Fatalf("Could not read old-style backoffs on new version: %v", err)
+	}
+	// Release the result set (and its connection) once the test is done.
+	defer rows.Close()
+	for rows.Next() {
+		var mid, process, until, cause string
+		if err := rows.Scan(&mid, &process, &until, &cause); err != nil {
+			t.Errorf("Scan failed: %v", err)
+		}
+	}
+	// Next returning false can also mean iteration failed half-way through.
+	if err := rows.Err(); err != nil {
+		t.Errorf("Iterating over old-style backoffs failed: %v", err)
+	}
+}
diff --git a/cloud/lib/component/crdb.go b/cloud/lib/component/crdb.go
index 4ba9470..7c9c1a8 100644
--- a/cloud/lib/component/crdb.go
+++ b/cloud/lib/component/crdb.go
@@ -137,7 +137,7 @@
 // by this CockroachConfig.
 func (d *CockroachConfig) MigrateUp() error {
 	dsn := d.buildDSN("cockroachdb")
-	klog.Infof("Running migrations on %s...", dsn)
+	klog.Infof("Running migrations up...")
 	m, err := migrate.NewWithSourceInstance("iofs", d.Migrations, dsn)
 	if err != nil {
 		return err
@@ -153,6 +153,30 @@
 	}
 }
 
+// MigrateUpToIncluding migrates the database described by this CockroachConfig
+// to schema version ver (eg. 1672749980), running every migration up to and
+// including that version.
+//
+// A database which already is at ver is treated as success. Note that the
+// underlying migrate call targets ver exactly: per its documentation, a
+// database currently at a newer version would be migrated down to ver.
+func (d *CockroachConfig) MigrateUpToIncluding(ver uint) error {
+	dsn := d.buildDSN("cockroachdb")
+	klog.Infof("Running migrations up to %d...", ver)
+	m, err := migrate.NewWithSourceInstance("iofs", d.Migrations, dsn)
+	if err != nil {
+		return err
+	}
+
+	// migrate reports "already at this version" as ErrNoChange. That is not a
+	// failure for callers, which only care that the schema ends up at ver.
+	err = m.Migrate(ver)
+	if err == migrate.ErrNoChange {
+		return nil
+	}
+	return err
+}
+
 // MigrateDownDangerDanger removes all data from the database by performing a
 // full migration down.
 //
@@ -171,7 +195,7 @@
 		return fmt.Errorf("no really, this cannot be run on non-in-memory databases")
 	}
 	dsn := d.buildDSN("cockroachdb")
-	klog.Infof("Running migrations on %s...", dsn)
+	klog.Infof("Running migrations down...")
 	m, err := migrate.NewWithSourceInstance("iofs", d.Migrations, dsn)
 	if err != nil {
 		return err