cloud/bmaas: test and document upgrades across migrations
Change-Id: I1c405b0f2ecc10331b79d02deb8a63f3b148f502
Reviewed-on: https://review.monogon.dev/c/monogon/+/1566
Reviewed-by: Tim Windelschmidt <tim@monogon.tech>
Tested-by: Jenkins CI
diff --git a/cloud/bmaas/DEPLOYING.md b/cloud/bmaas/DEPLOYING.md
new file mode 100644
index 0000000..0dacf0b
--- /dev/null
+++ b/cloud/bmaas/DEPLOYING.md
@@ -0,0 +1,39 @@
+Schema/Version compatibility
+===
+
+Live migration
+---
+
+BMaaS supports live migrating schemas. On startup, every component using the BMDB
+will attempt to migrate the database up to the newest version of the schema it
+was built with.
+
+Components are implemented to support a range of schemas, and operators should
+sequence upgrades in the following way:
+
+1. Make sure that all components are at the newest possible CL, but not so new
+ that they ship a newer version of the schema than is currently running.
+2. Upgrade components in a rolling fashion to a CL version that ships the newest
+ possible schema version which is still compatible with the previous CL
+ versions of the components.
+3. Repeat from point 1 until the newest wanted CL version is running.
+
+| ID | Schema range | CL range | Notes |
+|----|--------------|----------|-------|
+| 0 | < 1672749980 | >= 0 | Initial production schema. |
+| 1 | >= 1672768890 | >= 1565 | Exponential backoff support. |
+
+For example, if the cluster is at CL 1200, it should first be upgraded to the
+newest CL below 1565 (the last CL in row 0), then to CL 1565 or newer (to reach
+row 1).
+
+Offline migration
+---
+
+For simple deployments, an offline migration is easiest. To perform an offline migration:
+
+1. Turn down all BMaaS components that communicate with the BMDB.
+2. Upgrade all components to the newer version (either newest or otherwise
+ wanted, but all components have to be at the same CL version).
+3. Turn up a single one of the BMaaS components turned down in step 1, making sure the database is migrated.
+4. Turn up the rest of the components.
+
+This allows migrating across many incompatible schema migrations, but requires downtime.
\ No newline at end of file
diff --git a/cloud/bmaas/bmdb/migrations_test.go b/cloud/bmaas/bmdb/migrations_test.go
index 052d5e1..9ee4ae2 100644
--- a/cloud/bmaas/bmdb/migrations_test.go
+++ b/cloud/bmaas/bmdb/migrations_test.go
@@ -1,7 +1,10 @@
package bmdb
import (
+ "context"
"testing"
+
+ "source.monogon.dev/cloud/bmaas/bmdb/model"
)
// TestMigrateUpDown performs a full-up and full-down migration test on an
@@ -57,3 +60,85 @@
t.Fatalf("Initial up migration failed: %v", err)
}
}
+
+// TestMigration1681826233 exercises the schema migration 1681826233, which
+// adds a new nullable field to backoffs.
+//
+// It checks that a backoff row written at the older schema version survives
+// the migration and is readable through the model with the new field unset,
+// and that old-style SQL (which does not know about the new field) can still
+// insert and read backoffs after the migration.
+func TestMigration1681826233(t *testing.T) {
+	// Schema versions under test. Data is first written at minVer, then the
+	// database is migrated to maxVer. The test is meant to show that users
+	// built against either version can run concurrently in a cluster.
+	minVer := uint(1672749980)
+	maxVer := uint(1681826233)
+
+	ctx, ctxC := context.WithCancel(context.Background())
+	defer ctxC()
+
+	b := dut()
+	conn, err := b.Open(false)
+	if err != nil {
+		t.Fatalf("Starting empty database failed: %v", err)
+	}
+
+	// First, make sure the change can actually progress if we have some backoffs
+	// already.
+	if err := b.Database.MigrateUpToIncluding(minVer); err != nil {
+		t.Fatalf("Migration to minimum version failed: %v", err)
+	}
+
+	// Create machine and old-style backoff.
+	q := model.New(conn.db)
+	machine, err := q.NewMachine(ctx)
+	if err != nil {
+		t.Fatalf("Could not create machine: %v", err)
+	}
+	_, err = conn.db.Exec(`
+		INSERT INTO work_backoff
+			(machine_id, process, until, cause)
+		VALUES
+			($1, 'UnitTest1', now(), 'test');
+	`, machine.MachineID)
+	if err != nil {
+		t.Fatalf("Could not create old-style backoff on old version: %v", err)
+	}
+
+	// Migrate to newer version.
+	if err := b.Database.MigrateUpToIncluding(maxVer); err != nil {
+		t.Fatalf("Migration to maximum version failed: %v", err)
+	}
+
+	// The backoff created before the migration should be read successfully
+	// through the new-style model.
+	boffs, err := q.WorkBackoffOf(ctx, model.WorkBackoffOfParams{
+		MachineID: machine.MachineID,
+		Process:   "UnitTest1",
+	})
+	if err != nil {
+		t.Fatalf("Reading backoff failed: %v", err)
+	}
+	if len(boffs) != 1 {
+		t.Errorf("Expected exactly one backoff, got %d", len(boffs))
+	} else if boffs[0].LastIntervalSeconds.Valid {
+		// The row predates the migration, so the new field must be unset.
+		t.Errorf("Expected interval to be NULL")
+	}
+
+	// Simultaneously, any concurrently running bmdb user on an older version should
+	// still be able to insert and read backoffs old style.
+	_, err = conn.db.Exec(`
+		INSERT INTO work_backoff
+			(machine_id, process, until, cause)
+		VALUES
+			($1, 'UnitTest2', now(), 'test');
+	`, machine.MachineID)
+	if err != nil {
+		t.Fatalf("Could not create old-style backoff on new version: %v", err)
+	}
+	rows, err := conn.db.Query(`
+		SELECT machine_id, process, until, cause FROM work_backoff
+	`)
+	if err != nil {
+		// Without this check a failed query would leave rows nil and the loop
+		// below would panic instead of reporting the failure.
+		t.Fatalf("Could not read old-style backoffs on new version: %v", err)
+	}
+	defer rows.Close()
+
+	count := 0
+	for rows.Next() {
+		var mid, process, until, cause string
+		if err := rows.Scan(&mid, &process, &until, &cause); err != nil {
+			t.Errorf("Scan failed: %v", err)
+		}
+		count++
+	}
+	// rows.Next returning false can also mean iteration was cut short.
+	if err := rows.Err(); err != nil {
+		t.Errorf("Iterating over old-style backoffs failed: %v", err)
+	}
+	// Both backoffs inserted above (UnitTest1 and UnitTest2) must be visible.
+	if count < 2 {
+		t.Errorf("Expected at least 2 old-style backoffs, got %d", count)
+	}
+}
diff --git a/cloud/lib/component/crdb.go b/cloud/lib/component/crdb.go
index 4ba9470..7c9c1a8 100644
--- a/cloud/lib/component/crdb.go
+++ b/cloud/lib/component/crdb.go
@@ -137,7 +137,7 @@
// by this CockroachConfig.
func (d *CockroachConfig) MigrateUp() error {
dsn := d.buildDSN("cockroachdb")
- klog.Infof("Running migrations on %s...", dsn)
+ klog.Infof("Running migrations up...")
m, err := migrate.NewWithSourceInstance("iofs", d.Migrations, dsn)
if err != nil {
return err
@@ -153,6 +153,17 @@
}
}
+// MigrateUpToIncluding runs the migrations in d.Migrations against the database
+// described by this CockroachConfig, asking the migrate library to bring the
+// schema to version ver. Any error from setting up or running the migration is
+// returned unchanged.
+//
+// NOTE(review): presumably migrate is golang-migrate, whose Migrate can also
+// move the schema down if the database is already past ver, and reports
+// ErrNoChange if it is already at ver - confirm callers are fine with both.
+func (d *CockroachConfig) MigrateUpToIncluding(ver uint) error {
+	target := d.buildDSN("cockroachdb")
+	klog.Infof("Running migrations up to %d...", ver)
+
+	migrator, err := migrate.NewWithSourceInstance("iofs", d.Migrations, target)
+	if err != nil {
+		return err
+	}
+	return migrator.Migrate(ver)
+}
+
// MigrateDownDangerDanger removes all data from the database by performing a
// full migration down.
//
@@ -171,7 +182,7 @@
return fmt.Errorf("no really, this cannot be run on non-in-memory databases")
}
dsn := d.buildDSN("cockroachdb")
- klog.Infof("Running migrations on %s...", dsn)
+ klog.Infof("Running migrations down...")
m, err := migrate.NewWithSourceInstance("iofs", d.Migrations, dsn)
if err != nil {
return err