a curated list of database news from authoritative sources

March 31, 2026

Graceful degradation in Postgres

Not all traffic is created equal.When a database is overwhelmed, you want the important queries to keep executing, even if that means shedding lower-priority work.This is a much better outcome than the alternative: a total database outage.

March 30, 2026

Read Concern "snapshot" for snapshot isolation outside explicit transactions

TL;DR: I a previous post I explained Why isn't "majority" the default read concern in MongoDB. If you’re used to SQL databases, you’ll likely prefer the snapshot read concern, which guarantees all rows are read from the same snapshot. Alternatively, run your operations in an explicit transaction, as transactions are ACID.

With majority, you avoid dirty or uncommitted reads because you only see data acknowledged by a majority. However, a scan can yield while the majority state advances, so some rows may come from a newer majority snapshot.

By default, MongoDB’s consistency boundary is the document—similar to an aggregate in domain‑driven design. For multi‑document consistency, use explicit transactions or the snapshot read concern. This is MongoDB’s tunable consistency model, whose defaults often suit event‑driven architectures more than traditional SQL workloads.

To illustrate this, here’s a demo.

I start by setting up the collection, the document count, and other parameters:


const NUM_ACCOUNTS = 1_000_000;
const INITIAL_BALANCE = 10000;  // cents
const BATCH_SIZE = 10_000;
const TOTAL_TRANSFERS = 100_000;
const NUM_WRITERS = 5;
const mydb = db.getSiblingDB("demo");
const coll = mydb.accounts;

I load one million accounts, all with the same amount:


for (let i = 0; i < NUM_ACCOUNTS; i += BATCH_SIZE) {
    const batch = [];
    for (let j = i; j < Math.min(i + BATCH_SIZE, NUM_ACCOUNTS); j++) {
        batch.push({ _id: j, balance: INITIAL_BALANCE });
    }
    coll.insertMany(batch, { ordered: false });
    if (i % 100_000 === 0) print("  " + (i / 1000) + "k...");
}
print("✅ Loaded " + NUM_ACCOUNTS.toLocaleString() + " accounts\n");

This function checks the total balance every second:


async function periodicReader(readConcern) {
    while (readingActive) {
        const result = coll.aggregate([
            { $group: { _id: null, total: { $sum: "$balance" } } }
        ], { readConcern: { level: readConcern } }
                                   // -> majority or snapshot
        ).toArray();
        const total = result.length ? result[0].total : "N/A";
        print("  📊 Aggregate read — total balance: " + total +
              " | transfers so far: " + transfersDone);
        // Wait ~1 second before next read
        await new Promise(r => setTimeout(r, 1000));
    }
}
// run it:
let transfersDone = 0;
let readingActive = true;
const readerPromise = periodicReader("majority");

With no writes and one million accounts initialized with ten thousand each, the total balance is 10,000,000,000:

  📊 Aggregate read — total balance: 10000000000 | transfers so far: 0
  📊 Aggregate read — total balance: 10000000000 | transfers so far: 0
  📊 Aggregate read — total balance: 10000000000 | transfers so far: 0
  📊 Aggregate read — total balance: 10000000000 | transfers so far: 0
  📊 Aggregate read — total balance: 10000000000 | transfers so far: 0
  📊 Aggregate read — total balance: 10000000000 | transfers so far: 0

I perform random account-to-account transfers that debit one account and credit another for the same amount in a single transaction, so the total balance stays the same, and run it in five threads:


print("⏳ Launching " + NUM_WRITERS + " writers + 1 reader...\n");
let writingDone = false;
let transfersDone = 0;
async function writer(id, count) {
    const s = mydb.getMongo().startSession();
    const sc = s.getDatabase("demo").accounts;
    for (let t = 0; t < count; t++) {
        // pick two random accounts
        const from = Math.floor(Math.random() * NUM_ACCOUNTS);
        let to =     Math.floor(Math.random() * NUM_ACCOUNTS);
        while (to === from) to = Math.floor(Math.random() * NUM_ACCOUNTS);
        // same amount to debit on one, credit on the other
        const amount = Math.floor(Math.random() * 1000);
        // do that in a transaction
        try {
            s.startTransaction();
            sc.updateOne({ _id: from }, { $inc: { balance: -amount } });
            sc.updateOne({ _id: to },   { $inc: { balance:  amount } });
            s.commitTransaction();
            transfersDone++;
        } catch (e) {
            try { s.abortTransaction(); } catch (_) {}
        }
    }
    s.endSession();
    print("  ✅ Writer " + id + " done");
}
// Run in threads:
const writerPromises = [];
for (let i = 0; i < NUM_WRITERS; i++) {
    writerPromises.push(writer(i + 1, TOTAL_TRANSFERS / NUM_WRITERS));
}

The reader thread is still running, but it now reports an inconsistent total balance due to the "majority" read concern:

  📊 Aggregate read — total balance: 10000002595 | transfers so far: 634
  📊 Aggregate read — total balance: 10000003902 | transfers so far: 1177
  📊 Aggregate read — total balance: 9999999180 | transfers so far: 1742
  📊 Aggregate read — total balance: 10000002564 | transfers so far: 2325
  📊 Aggregate read — total balance: 9999995030 | transfers so far: 2900
  📊 Aggregate read — total balance: 10000001154 | transfers so far: 3462
  📊 Aggregate read — total balance: 9999996910 | transfers so far: 4029
  📊 Aggregate read — total balance: 9999992085 | transfers so far: 4655
  📊 Aggregate read — total balance: 9999995372 | transfers so far: 5215
  📊 Aggregate read — total balance: 9999999916 | transfers so far: 5792
  📊 Aggregate read — total balance: 9999998316 | transfers so far: 6396
  📊 Aggregate read — total balance: 9999997128 | transfers so far: 6976
  📊 Aggregate read — total balance: 10000006447 | transfers so far: 7516
  📊 Aggregate read — total balance: 9999998330 | transfers so far: 8091
  📊 Aggregate read — total balance: 10000001286 | transfers so far: 8656
  📊 Aggregate read — total balance: 10000001899 | transfers so far: 9240
  📊 Aggregate read — total balance: 9999996708 | transfers so far: 9845
  📊 Aggregate read — total balance: 10000005159 | transfers so far: 10444
  📊 Aggregate read — total balance: 10000002749 | transfers so far: 11012
  📊 Aggregate read — total balance: 9999999925 | transfers so far: 11623

If you’re coming from SQL databases, it may be surprising: in SQL, every statement runs in an explicit transaction with a defined isolation level. MongoDB instead offers several consistency boundaries:

  • Document-level consistency by default
  • Statement-level consistency with the snapshot read concern
  • Transaction-level consistency with explicit transactions

I’ve been using the default document-level consistency, so the total was inconsistent, and I’ll now show the other levels that provide stronger read-time consistency.

I stop the reading thread:

readingActive = false;

Then I restart it with a snapshot read concern:

let readingActive = true;
const readerPromise = periodicReader("snapshot");

Now the results are consistent: the total balance stays the same while money is being transferred between accounts.

  📊 Aggregate read — total balance: 10000000000 | transfers so far: 92845
  📊 Aggregate read — total balance: 10000000000 | transfers so far: 93439
  📊 Aggregate read — total balance: 10000000000 | transfers so far: 94022
  📊 Aggregate read — total balance: 10000000000 | transfers so far: 94590
  📊 Aggregate read — total balance: 10000000000 | transfers so far: 95161
  📊 Aggregate read — total balance: 10000000000 | transfers so far: 95737
  📊 Aggregate read — total balance: 10000000000 | transfers so far: 96307
  📊 Aggregate read — total balance: 10000000000 | transfers so far: 96835
  📊 Aggregate read — total balance: 10000000000 | transfers so far: 97353
  📊 Aggregate read — total balance: 10000000000 | transfers so far: 97920
  📊 Aggregate read — total balance: 10000000000 | transfers so far: 98478
  📊 Aggregate read — total balance: 10000000000 | transfers so far: 99038
  📊 Aggregate read — total balance: 10000000000 | transfers so far: 99621
  ✅ Writer 3 done
  ✅ Writer 1 done
  ✅ Writer 4 done
  ✅ Writer 2 done
  ✅ Writer 5 done
  📊 Aggregate read — total balance: 10000000000 | transfers so far: 99999
  📊 Aggregate read — total balance: 10000000000 | transfers so far: 99999
  📊 Aggregate read — total balance: 10000000000 | transfers so far: 99999
  📊 Aggregate read — total balance: 10000000000 | transfers so far: 99999
  📊 Aggregate read — total balance: 10000000000 | transfers so far: 99999

This doesn't lock anything because it uses WiredTiger Multi-Version Concurrency Control (MVCC) and may even be faster as it doesn't have to wait for the majority committed snapshot that may be delayed if secondaries are not available.

I start the writes again, as they ended, without starting the reader thread as I'll run it manually:

let readingActive = false;
for (let i = 0; i < NUM_WRITERS; i++) {
    writerPromises.push(writer(i + 1, TOTAL_TRANSFERS / NUM_WRITERS));
}

I run a manual aggregate with the default read concern, and it shows an inconsistent balance:

const s = mydb.getMongo().startSession();
const sc = s.getDatabase("demo").accounts;
sc.aggregate([
        { $group: { _id: null, total: { $sum: "$balance" } } }
        ] 
)

test> sc.aggregate([
...         { $group: { _id: null, total: { $sum: "$balance" } } }
...         ] 
... )
[ { _id: null, total: 9999997810 } ]

Instead of specifying "snapshot" read concern, I run it in a transaction:

s.startTransaction();
sc.aggregate([
        { $group: { _id: null, total: { $sum: "$balance" } } }
        ] 
)
s.commitTransaction();

test> s.startTransaction();

test> sc.aggregate([
...         { $group: { _id: null, total: { $sum: "$balance" } } }
...         ] 
... )
[ { _id: null, total: 10000000000 } ]
test> s.commitTransaction();

As you would expect, transactions are ACID and consistent. It's another way to get snapshot isolation.

Before giving more explanation I stop all:

(async () => {
await Promise.all(writerPromises);
readingActive = false;
await readerPromise;
})();

All databases provide different levels of consistency, and “consistency” actually means two things:

  • Snapshot consistency: all data is read from a single commit point in time. The result is then stale because the read timestamp is fixed at the start of the statement (e.g., SQL Read Committed) or at the start of the transaction (e.g., SQL Repeatable Read). This is the consistency as defined by the ACID properties.

  • Read-your-writes consistency: you see the latest committed state, and the read time can advance during the scan to reduce staleness. This is the consistency as defined by the CAP theorem.

SQL databases don’t always use the first model. Isolation levels guarantee that you see committed changes, but not that all rows were committed at the same instant. MVCC databases usually provide snapshot consistency by setting a read timestamp in the past, but some may re-read rows later if they change and are about to be updated, to avoid restarting the whole query at a later point in time.

To validate a total balance, you need a consistent snapshot. In some other cases, reading the latest values is preferable to using a stale snapshot. Neither approach is always best.

In the SQL bubble, inconsistent snapshots often seem unacceptable, whereas in event-driven NoSQL systems, reading the latest writes makes more sense. Any database can work well if you understand it, and poorly if you ignore its behavior under race conditions.

Non-MVCC databases can return the latest writes within a single, consistent state but require heavy locking to do so. In contrast, MVCC databases avoid blocking reads, but must choose between a stable, timeline-consistent snapshot that may be stale or a latest-write view whose read times keep moving.

For SQL databases, you need to understand the isolation levels:

  • Per-statement read time in Read Committed Snapshot Isolation.
  • Per-transaction read time in Snapshot Isolation

For MongoDB, you should understand the consistency boundaries:

  • Document-level consistency by default
  • Statement-level consistency with the "snapshot" read concern
  • Transaction-level consistency with explicit transactions

High memory usage in Postgres is good, actually

A high memory percentage in PlanetScale Postgres is not necessarily a problem. Let's compare how memory and CPU usage are different, how not all memory usage is created equal, and which signals actually require attention.

March 29, 2026

The insert benchmark on a small server : Postgres 12.22 through 18.3

This has results for Postgres versions 12.22 through 18.3 with the Insert Benchmark on a small server. My previous post for the same hardware with results up to Postgres 18.1 is here. This post also has results for:

  • all 17.x releases from 17.0 through 17.9 
  • 18.2 with and without full page writes enabled
  • both 1 and 4 users

Postgres continues to be boring in a good way. It is hard to find performance regressions. Performance wasn't always stable, but I am reluctant to expect it to show no changes because there are sources of variance beyond the DBMS, especially HW (a too-hot SSD or CPU will run slower). Sometimes perf changes because there are obvious perf bugs, sometimes it changes for other reasons.

 tl;dr for a  CPU-bound workload

  • performance is stable from Postgres 12 through 18
  • performance is stable from Postgres 17.0 through 17.9
  • disabling full-page writes improves throughput on write-heavy benchmark steps
tl;dr for an IO-bound workload
  • performance is mostly stable from Postgres 12 through 18
  • performance is stable from Postgres 17.0 through 17.9
  • disabling full-page writes improves throughput on write-heavy benchmark steps
  • in a few cases there are large improvements to point-query throughput on the qp1000 benchmark step. I will try to explain that soon.
Builds, configuration and hardware

I compiled Postgres from source using -O2 -fno-omit-frame-pointer for versions 12.22, 13.23, 14.22, 15.17, 16.13, 17.0 to 17.9, 18.2 and 18.3.

The server is an Beelink SER7 with a Ryzen 7 7840HS CPU with 8 cores and AMD SMT disabled, 32G of RAM. Storage is one SSD for the OS and an NVMe SSD for the database using ext-4 with discard enabled. The OS is Ubuntu 24.04.

For versions prior to 18, the config file is named conf.diff.cx10a_c8r32 and they are as similar as possible and here for versions 1213141516 and 17.

For Postgres 18 in most cases I used a config named conf.diff.cx10b_c8r32 (aka cx10b) which is as similar as possible to the configs for versions 17 and earlier. But for tests with full-page writes disabled I used additional configs to compare with results from the cx10b config.
The Benchmark

The benchmark is explained here and is run with 1 client 4 clients. In each case each client uses a separate table. I repeated it with two workloads:
  • CPU-bound
    • for 1 user the values for X, Y, Z are 30M, 40M, 10M
    • for 4 users the values for X, Y, Z are 10M, 16M, 4M
  • IO-bound
    • for 1 user the values for X, Y, Z are 800M, 4M, 1M
    • for 4 users the values for X, Y, Z are 200M, 4M, 1M
The point query (qp100, qp500, qp1000) and range query (qr100, qr500, qr1000) steps are run for 1800 seconds each.

The benchmark steps are:

  • l.i0
    • insert X rows per table in PK order. The table has a PK index but no secondary indexes. There is one connection per client.
  • l.x
    • create 3 secondary indexes per table. There is one connection per client.
  • l.i1
    • use 2 connections/client. One inserts Y rows per table and the other does deletes at the same rate as the inserts. Each transaction modifies 50 rows (big transactions). This step is run for a fixed number of inserts, so the run time varies depending on the insert rate.
  • l.i2
    • like l.i1 but each transaction modifies 5 rows (small transactions) and Z rows are inserted and deleted per table.
    • Wait for S seconds after the step finishes to reduce variance during the read-write benchmark steps that follow. The value of S is a function of the table size.
  • qr100
    • use 3 connections/client. One does range queries and performance is reported for this. The second does does 100 inserts/s and the third does 100 deletes/s. The second and third are less busy than the first. The range queries use covering secondary indexes. If the target insert rate is not sustained then that is considered to be an SLA failure. If the target insert rate is sustained then the step does the same number of inserts for all systems tested. This step is frequently not IO-bound for the IO-bound workload.
  • qp100
    • like qr100 except uses point queries on the PK index
  • qr500
    • like qr100 but the insert and delete rates are increased from 100/s to 500/s
  • qp500
    • like qp100 but the insert and delete rates are increased from 100/s to 500/s
  • qr1000
    • like qr100 but the insert and delete rates are increased from 100/s to 1000/s
  • qp1000
    • like qp100 but the insert and delete rates are increased from 100/s to 1000/s
Results: overview

The performance reports are here for:
The summary sections from the performances report have 3 tables. The first shows absolute throughput by DBMS tested X benchmark step. The second has throughput relative to the version from the first row of the table. The third shows the background insert rate for benchmark steps with background inserts. The second table makes it easy to see how performance changes over time. The third table makes it easy to see which DBMS+configs failed to meet the SLA.

Below I use relative QPS to explain how performance changes. It is: (QPS for $me / QPS for $base) where $me is the result for some version. The base version is Postgres 12.22 for the latest point releases comparison, 17.0 for the 17.x releases comparison and 18.2 with the cx10b config for the full-page writes comparison. 

When relative QPS is > 1.0 then performance improved over time. When it is < 1.0 then there are regressions. The Q in relative QPS measures: 
  • insert/s for l.i0, l.i1, l.i2
  • indexed rows/s for l.x
  • range queries/s for qr100, qr500, qr1000
  • point queries/s for qp100, qp500, qp1000
This statement doesn't apply to this blog post, but I keep it here for copy/paste into future posts. Below I use colors to highlight the relative QPS values with red for <= 0.95, green for >= 1.05 and grey for values between 0.95 and 1.05.

Results: CPU-bound

The performance summaries are here for:
For latest point releases at 1 user
  • there is either no change or a small improvement for l.i0 (load in PK order), l.x (create indexes) and the read-write tests (qr*, qp*).
  • for l.i1 and l.i2 (random write-only) throughput drops by 5% to 10% from 12.22 to 13.23 and has been stable since then (throughput in 18.2 is similar to 13.23. The CPU per operation overhead (cpupq here) increases after 12.22 for the l.i2 step but there wasn't an obvious increase for the l.i1 step - but the way I measure this is far from perfect. The results I share here are worse than what I measured in December 2025.
For latest point releases at 4 users
  • there might be a small (3%) regression for l.i0 (load in PK order) in 18.2 vs 12.22. Perhaps this is noise. From vmstat and iostat metrics there aren't obvious changes.
  • throughput in 18.2 is better than 12.22 for all other benchmark steps
For all 17.x releases at 1 user
  • throughput is stable from 17.0 to 17.9 for all benchmark steps except l.i1 and l.i2 (random writes) where there might be a 5% regression late in 17.x. This might be from new CPU overhead - see cpupq here.
For all 17.x releases at 4 users
  • throughput is stable with small improvements from 17.0 to 17.9
For full-page writes at 1 user
  • throughput improves by ~5% for l.i1 and l.i2 (random writes) when full-page writes are disabled  and KB written to storage per commit drops by ~20% -- see wkbpi here.
  • enabling wal_compression=lz4 decreases write throughput for all write-heavy steps when full-page writes are enabled. The impact is smaller when full page writes are disabled.
For full-page writes at 4 users
  • throughput improves by <= 5% for all write-heavy steps when full-page writes are disabled
  • the impact from wal_compression=lz4 isn't obvious
Results: IO-bound

The performance summaries are here for:
For latest point releases at 1 user
  • there are small (<= 10%) improvements for l.i0 (load in PK order) and l.x (create index). I don't see anything obvious in vmstat and iostat metrics to explain this.
  • there are small (<= 10%) regressions for l.i1 and l.i2 (random writes) that might be from a sequence of small regressions from 13.x through 18.x. I don't see anything obvious in vmstat and iostat metrics to explain this.
  • throughput is unchanged for the range-query read+write tests (qr*)
  • throughput improves by ~1.4X for the point-query read+write tests (qp*). This improvement arrived in 13.x. This can be explained by large drops in CPU overhead (cpupq) and context switch rates (cspq) -- see here.
  • the results here are similar to what I measured in December 2025
For latest point releases at 4 users
  • there are small (~10%) regressions for l.i0 (load in PK order) that arrived in 17.x. The context switch rate (cspq) increases in 17.x
  • there are small (<= 20%) improvements for l.x (create index) that arrived in 13.x
  • there are large regressions for l.i1 and l.i2 (random writes) that arrive in 15.x through 18.x. There are large increases in CPU overhead (cpupq) -- see here.
  • throughput is unchanged for the range-query read+write tests (qr*)
  • throughput improves for the point-query read+write tests (qp*) at higher write rates (qp500, qp1000).
For all 17.x releases at 1 user
  • throughput is stable with a few exceptions
  • for qp1000 (point-query, read+write) it improves by ~5% in 17.1 and is then stable to 17.9
  • in 17.9 there are large (~1.4x) improvements for all of the point-query, read+write tests
  • the changes in throughput for qp1000 might be explained by a small drop in CPU overhead per query (cpupq) that arrived in 17.1 and a large drop that arrived in 17.9 -- see here.
For all 17.x releases at 4 users
  • throughput for most steps (l.i0, l.x, qr*, qp100, qp500) is stable
  • throughput for l.i1 and l.i2 (random writes) has more variance
  • throughput for qp1000 drops by up to 10% from 17.3 through 17.8 and in those cases the CPU overhead increased -- see cpupq here.
For full-page writes at 1 user
  • throughput improves by 6% for l.i1 (random writes) when full-page writes are disabled
  • throughput improved for qp* tests when either full-page writes were disabled or lz4 was used for log_compression. That is harder to explain, perhaps it is noise.
For full-page writes at 4 users
  • throughput improves by 20% for l.i1 (random writes) when full-page writes are disabled
  • throughput improved for qp* tests when either full-page writes were disabled or lz4 was used for log_compression. That is harder to explain, perhaps it is noise.

n_dead_tup vs n_live_tup

The tables below show the ratio: n_dead_tup / (n_dead_tup + n_live_tup) for the CPU-bound and IO-bound workloads using 1 user (and one table). These were measured at the end of each benchmark step.

CPU-bound
        12.22   18.3
l.i0    0.000   0.000
l.x     0.000   0.000
l.i1    0.065   0.035
l.i2    0.045   0.020
qr100   0.006   0.006
qp100   0.012   0.012
qr500   0.040   0.040
qp500   0.021   0.024
qr1000  0.031   0.036
qp1000  0.040   0.003

IO-bound
        12.22   18.3
l.i0    0.000   0.000
l.x     0.000   0.000
l.i1    0.005   0.005
l.i2    0.006   0.006
qr100   0.000   0.000
qp100   0.000   0.000
qr500   0.002   0.002
qp500   0.003   0.003
qr1000  0.005   0.005
qp1000  0.007   0.007

















 

SQL Assertions, ANSI join, and ORA-08697

In the previous post on consistency boundaries, we saw that an updatable join view can hide a write-skew anomaly when the developer assumes the consistency boundary is a single row, even though there are actually two underlying rows and only one is locked. I wanted to see how the new SQL Assertions introduced in "Oracle AI Database 26ai Release 23.26.1.0.0" handle this, since they use a different locking mechanism.

TL;DR: they handle it correctly, but without ANSI joins.

Without assertions, for an employee with a salary of 1000 and a commission of 100, two concurrent users could each add 42 to both values because they couldn’t see each other’s changes, and updating one column didn’t lock the other. I then added a SQL assertion that doesn’t allow the commission to exceed 1150, so that only one session can add 42. This assertion works as expected, waiting on the other transaction before checking the sum:

CREATE ASSERTION salary_plus_commission_le_1150
CHECK (
  NOT EXISTS (
    SELECT 'violation'
    FROM emp_salary s, emp_commission c
    WHERE s.empno = c.empno
      AND s.salary + c.commission > 1150
  )
);

When the first transaction committed, the second transaction detected the write skew:

ERROR at line 1:
ORA-08601: SQL assertion (FRANCK.SALARY_PLUS_COMMISSION_LE_1150) violated.

I'm writing this blog post because I had trouble finding the correct declaration for this assertion, and I hope it helps others. My first attempt was simply:

CREATE ASSERTION salary_plus_commission_le_1150
CHECK (
  NOT EXISTS (
    SELECT 'violation'
    FROM emp_salary s join emp_commission c USING (empno)
    WHERE s.salary + c.commission > 1150
  )
);

This joins the two tables and raises a violation is the sum of salary and commission is higher than 1150, but the CREATE ASSERTION failed with:

ORA-08689: CREATE ASSERTION failed
ORA-08697: SYS owned tables are not supported.

I'm not using SYS, and cannot use SYS because I'm on the managed service "Autonomous Database" which doesn't give SYSDBA privileges.

I reproduced this in a local environment to trace the SQL statements (ALTER SESSION SET EVENTS 'sql_trace bind=false, wait=false') and found that Oracle internally checked this constraint with:

/* SQL Analyze(250,0) */
SELECT /*+ ALL_ROWS BYPASS_RECURSIVE_CHECK */
1 FROM "SYS"."DUAL" WHERE (
  NOT EXISTS (
    SELECT 1
    FROM emp_salary s
    JOIN emp_commission c ON s.empno = c.empno
    WHERE s.salary + c.commission > 1150
  )
)

The good old DUAL table is owned by SYS and cannot be used in the assertion.

After trying many variations, I realized that SQL assertions are always evaluated with a SELECT from "SYS"."DUAL", even for cases that work, like in my previous post on SQL Assertions.

Then, I remembered what I’d learned from 30 years of working with Oracle Databases: if something fails with an ANSI join in Oracle, try the legacy join syntax (no JOIN clauses, only WHERE conditions). When ANSI joins were added in 9i, Oracle implemented them with internal transformations that affected other features. Most of those issues were fixed over the 20 years that followed, but SQL assertions seem to have fallen into that trap.

With the legacy join syntax, the internal query still reads "SYS"."DUAL", but it works:

=====================
PARSING IN CURSOR #140737366953320 len=208 dep=0 uid=136 oct=290 lid=136 tim=6202261308 hv=3501011950 ad='7ffff8be4960'
 sqlid='fzcfcfv8auczf'
CREATE ASSERTION salary_plus_commission_le_1150
CHECK (
  NOT EXISTS (
    SELECT 'violation'
    FROM emp_salary s, emp_commission c
    WHERE s.empno = c.empno
      AND s.salary + c.commission > 1150
  )
)
END OF STMT

...

=====================
PARSING IN CURSOR #140737364327008 len=286 dep=1 uid=136 oct=3 lid=0 tim=6202334328 hv=225567679 ad='783b9f10' sqlid='fv1va6n6r3sxz'
 /* SQL_ASSERTION obj#=72600 */ 
SELECT /*+  ALL_ROWS BYPASS_RECURSIVE_CHECK */ 1 
FROM "SYS"."DUAL" 
WHERE (   NOT EXISTS (SELECT 'violation' "'VIOLATION'" FROM "FRANCK"."EMP_SALARY" "S","FRANCK"."EMP_COMMISSION" "C" WHERE "S"."EMPNO"="C"."EMPNO" AND "S"."SALARY"+"C"."COMMISSION">1150) )
END OF STMT

Two things to remember:

  • SQL Assertions provide correct consistency boundaries to avoid write skew in the absence of a serializable isolation level.
  • ANSI joins are nice, but Oracle was designed before this standard, with another syntax, and some bugs persist. Falling back to the old syntax may be safer.

Consistency boundaries in SQL databases vs. MongoDB

Two common myths surround database consistency:

  • Myth 1: Only SQL databases provide strong consistency. In fact, MongoDB—a NoSQL database—supports ACID transactions with strong consistency, even in sharded clusters, with clear serializable boundaries: the document.
  • Myth 2: SQL consistency is simple and fully abstracted, so you can treat views like tables and run transactions as if race-condition anomalies never occur. In reality, no RDBMS fully achieves this ideal, so you must understand each system’s consistency boundaries (column?, scanned rows?, returned rowset?, range?, partition?, table?).

With MongoDB, the consistency boundaries follow domain driven design aggregates: concurrent writes to the same document conflict, even if they effectively read or write different fields. This guarantees that the database cannot break the business invariants set by the application. I've detailed an example in Serializable Transactions in MongoDB: The Doctor's On-Call Shift example. Document-level operations provide stronger integrity constraints, since schema validation can cover the entire aggregate, including one-to-many relationships. SQL databases need assertions to provide the same guarantees, but these are rarely implemented in RDBMSs, so only a limited subset of integrity constraints are available, like foreign keys.

Relational databases and SQL are meant to hide locking internals from developers, and full support for the serializable isolation level would make this feasible. In practice, though, serializable is often incomplete or hurts scalability, so developers use weaker isolation levels and must consider what is locked. They commonly add explicit locking, like SELECT FOR UPDATE, to prevent race-condition anomalies.

Before looking at an example, ask yourself whether you understand the consistency boundaries of your databases. Updating a single column often locks the entire row of that column. Some databases lock at a finer granularity (for example, YugabyteDB’s column-level locking), while others may lock more via lock escalation.

Normalization also affects this by spreading a logical aggregate—which in Domain-Driven Design corresponds to a consistency boundary—across multiple tables and rows. If you run a SELECT ... FOR UPDATE with a JOIN, do you expect all the tables to be locked? The FOR UPDATE clause’s OF option lets PostgreSQL specify tables and Oracle Database specify columns, but in both cases, the underlying effect is to lock all the rows involved.

Defaults may be inconsistent. SELECT FOR UPDATE without OF locks all joined rows, but UPDATE on a join behaves differently as we will demonstrate below. Column-level locking also raises another issue without joins: check constraints must see a consistent state, even for columns that were not updated (see the now-fixed YugabyteDB issue: https://github.com/yugabyte/yugabyte-db/issues/15196).

SQL is an abstraction where the tables referenced in queries can also be views. In Oracle Database, join views can be updatable, which raises an important question about consistency: when you update a single column from one underlying table through a join view, should locks be taken on both underlying tables, or only on the one that’s actually being updated?

Demonstration

I'll use Oracle Database for this demonstration because it’s the only one I know that supports updates on joins. Those who say “this wouldn’t happen on PostgreSQL” are right that PostgreSQL has a true serializable isolation level, but updatable views are more limited and would not allow to build such an example. There’s no single best database—only different ones with different trade-offs. Expertise means understanding those differences.

Here is an example. I create two tables to store the employees salary and commission separately, and build a view on top of them to expose it as one:

create table emp_salary ( 
  empno      number primary key,
  salary     number
);

create table emp_commission ( 
  empno      number primary key,
  commission number
);

insert into emp_salary     values (99, 1000);
insert into emp_commission values (99,  100);
commit;

create view emp as 
 select * from emp_salary 
 join emp_commission 
 using ( empno )
;

This view is updateable and exposes all information per employee:

21:44:07 SQL> select * from emp;

   EMPNO    SALARY    COMMISSION
________ _________ _____________
      99     1,000           100

Now, an instruction is sent to the Human Resources department to add 42 to this employee, and the HR agent can choose to add it either to the salary or to the commission.

Alice, in HR, starts a transaction (using the serializable isolation level, which does not change the behavior here because in Oracle it only achieves Snapshot Isolation). She checks the employee’s salary and commission and decides to add 42 to the commission. As an extra safety measure, to ensure the operation is idempotent, she adds a condition on the salary and commission so that the update occurs only if neither has already been increased:

21:43:58 SQL> set transaction isolation level serializable;

Transaction ISOLATION succeeded.

21:44:07 SQL> select * from emp;

   EMPNO    SALARY    COMMISSION
________ _________ _____________
      99     1,000           100

21:44:11 SQL> update emp
  2   set commission=commission+42
  3   where empno=99
  4   and salary=1000 and commission=100
  5* ;

1 row updated.

21:44:19 SQL>

Alice didn't commit yet, and her colleague, Bob, in another office got the same instruction. He does the same, but decides to add 42 to the salary:

21:43:58 SQL> set transaction isolation level serializable;

Transaction ISOLATION succeeded.

21:44:26 SQL> select * from emp;

   EMPNO    SALARY    COMMISSION
________ _________ _____________
      99     1,000           100

21:44:29 SQL> update emp
  2   set salary=salary+42
  3   where empno=99
  4   and salary=1000 and commission=100
  5* ;

1 row updated.

21:44:37 SQL>

As both check the current state (salary=1000 and commission=100) only one should actually update a row. However, nothing blocks, and nothing fails. Alice is able to commit:

21:44:19 SQL> commit;

Commit complete.

21:44:45 SQL> select * from emp;

   EMPNO    SALARY    COMMISSION
________ _________ _____________
      99     1,000           142

Bob is also allowed to commit:

21:44:37 SQL> commit;

Commit complete.

21:44:50 SQL> select * from emp;

   EMPNO    SALARY    COMMISSION
________ _________ _____________
      99     1,042           142

The employee incorrectly received a raise of 42 in both salary and commission because the database failed to prevent a race condition.

Explanation

This is a Multi-Version Concurrency Control (MVCC) anomaly: the WHERE clause was evaluated on a consistent snapshot, stale, but the UPDATE was applied to the current state, and the resulting conflict went undetected. You can reproduce this in db<>fiddle by using an autonomous transaction to simulate a second user.

Neither Bob or Alice are responsible for this error because both of them checked the initial salary and commission in the same statement that updated one of them. It's the database that lied to them, or rather didn't expose that it was not one consistnt state. Users are not supposed to know that they query a view where data is physically stored in different tables and that the database locked only one of the two rows that appeared as one after the join.

Solutions

Write to lock. For safety, users should update both columns, even when values do not change, so that write conflicts protect against concurrent updates. Hibernate does this by default, updating all mapped columns. It is less efficient, but this consistency boundary is one reason dynamic update is not the default. Object–Relational Mappers (ORMs) and document databases such as MongoDB not only map domain model entities but also help ensure their consistency when those entities are mapped to a relational model that splits aggregates behind the abstraction layer.

Explicit locking. Another option is to add FOR UPDATE to a first SELECT in the transaction. Without an OF clause, it locks all rows in the join. Bob’s session then waits, sees Alice’s changes, and skips its own update. However, unless this is wrapped in a stored procedure, it adds an extra database round trip and holds locks across context switches and network calls, which harms scalability.

Avoid abstraction. A third solution is to avoid using views on joins, as they create an ambiguous consistency boundary where the user can’t tell what is locked. Still, the user could update on a join and experience the same:

update (
  select * from emp_salary join emp_commission using ( empno )
) set salary=salary+42
  where empno=99 and salary=1000 and commission=100
;

You might then ask how JSON-relational Duality views work, since they are also updatable views over multiple tables. The following example is similar to a join:

CREATE OR REPLACE JSON RELATIONAL DUALITY VIEW emp_dv AS
  emp_salary @insert @update @delete
  {
    _id      : empno,
    salary     : salary @update,
    commission : emp_commission @update
    @link ( from: empno, to: empno ) {
     empno : empno
     commission: commission
    }
  }
/

If I try to update different fields concurrently, the conflict is detected:

UPDATE emp_dv
  SET data = JSON_TRANSFORM(data, 
             SET '$.commission.commission' = 10)
;

declare
 -- using autonomous transaction to simulate a concurrent transaction 
 pragma autonomous_transaction;
begin
 UPDATE emp_dv
  SET data = JSON_TRANSFORM(data, 
             SET '$.salary' = 100)
;
 commit;
end;
/

declare
*
ERROR at line 1:
ORA-40896: Cannot update JSON Relational Duality View 'EMP_DV': Concurrent modification detected to document with ID 'FB03C16400'.
ORA-06512: at line 5
Help: https://docs.oracle.com/error-help/db/ora-40896/

It fails, so you must implement retry logic (MongoDB would automatically retry here). JSON-Relational Duality Views use ETAG to detect conflicts on the same document rather than using row locks.

You may also wonder how SQL assertions work, since they require consistency across multiple rows. They use a different locking mechanism to ensure a consistent state. I’ve included an example in the next post.

Conclusion

Relational databases aim to hide physical storage and locking so developers don’t have to manage them directly. But to achieve good performance, and avoid consistency anomalies, developers still need to understand locking boundaries. Not all RDBMSs support updatable join views (SQL-92 only specifies updatability for single-table views), though all provide explicit locking mechanisms such as SELECT FOR UPDATE. Row-store databases usually lock at the row level, but views may aggregate multiple rows without the developer knowing it.

By contrast, MongoDB, as a NoSQL database, exposes a simpler model: the document is the consistency boundary, like the aggregate in Domain-Driven Design. Developers update documents, not logical views, and write consistency applies to the entire document rather than a portion of it.

Selecting a character set for MySQL and MariaDB clients

 MySQL and MariaDB have many character-set related options, perhaps too many:

  1. character_set_client
  2. character_set_connection
  3. character_set_database
  4. character_set_filesystem
  5. character_set_results
  6. character_set_server
  7. character_set_system
This is a topic that I don't know much about and I am still far from an expert. My focus has been other DBMS topics. But I spent time recently on this topic while explaining what looked like a performance regression, but really was just a new release of MySQL using a charset that is less CPU-efficient than the previous charset that was used.

Debugging

The intial sequence to understand what was going on was:
  1. mysql -e 'SHOW GLOBAL VARIABLES like "character_set_%"
  2. mysql -e 'SHOW SESSION VARIABLES like "character_set_%"
  3. run "SHOW SESSION VARIABLES" from my benchmark client
Note:
  • the output from steps 1 and 2 was different
    • with SHOW GLOBAL VARIABLES I got character_set_client =latin1 but with SHOW SESSION VARIABLES I got character_set_client =utf8mb3. This happens. One reason is that some MySQL client binaries autodetect the charset based on the value of LANG or LC_TYPE from your Linux env. Another reason is that if autodetection isn't done then the clients can use the default charset that was set at compile time. That charset is then passed to the server during connection handshake (see thd_init_client_charset). So it is likely that character_set_client as displayed by SHOW GLOBAL VARIABLES isn't what your client will use.
  • the output from steps 2 and 3 was different
    • autodetection is only done when mysql_options() is called with a certain flag (see below). And that is not done by the MySQL driver in sysbench, nor is it done by Python's MySQLdb. So my benchmark clients are likely selecting the default charset and don't do autodetection. And that default is determined by the version of the MySQL client library, meaning that default can change over the years. For the source that implements this, search for MYSQL_AUTODETECT_CHARSET_NAME and read sql-common/client.c.
The following enables autodetection and should be called before calling mysql_real_connect():
    mysql_options(..., 
                  MYSQL_SET_CHARSET_NAME,
                  MYSQL_AUTODETECT_CHARSET_NAME);

Note that adding the following into my.cnf isn't a workaround for clients that don't do autodetect.
    [client]
    default-character-set=...

Notes

These are from my usage of MySQL 5.7.44, 8.0.45 and 8.4.8 along with MariaDB 10.6.25, 10.11.16 and 11.4.10. All were compiled from source as was sysbench. I installed MySQLdb and the MySQL client library via apt for Ubuntu 24.04.

The values for character_set_client, character_set_results and character_set_connection were measured via the MySQL command-line client running SHOW GLOBAL VARIABLES and SHOW SESSION VARIABLES and then the benchmark clients running SHOW SESSION VARIABLES.

The reason for sharing this is to explain the many possible values your session might use for character_set_client, character_set_results and character_set_connection. And using the wrong value might waste CPU.

What per-session values are used for character_set_client|results|connection?
* my.cnf has character_set_server=latin1
* per SHOW GLOBAL VARIABLES each is set to =latin1
* values below measured via SHOW SESSION VARIABLES
Values for character_set_client|results|connection
... with "mysql" command line client
... this is easy to change with --default-character-set command line option or equivalent option in my.cnf
dbms
5.7.44 utf8
8.0.45 utf8mb4
8.4.8 utf8mb4
10.6.25 utf8mb3
10.11.16 utf8mb3
11.4.10 utf8mb3
Values for character_set_client|results|connection
... with sysbench

client library version
dbms 5.7 8.0 8.4 10.6 10.11 11.4
5.7.44 latin1 latin1 latin1 NA NA NA
8.0.45 latin1 utf8mb4 utf8mb4 NA NA NA
8.4.8 latin1 utf8mb4 utf8mb4 NA NA NA
10.6.25 latin1 latin1 latin1 utf8mb4 utf8mb4 utf8mb4
10.11.16 latin1 latin1 latin1 utf8mb4 utf8mb4 utf8mb4
11.4.10 latin1 utf8mb4 utf8mb4 utf8mb4 utf8mb4 utf8mb4
Values for character_set_client|results|connection
... with insert benchmark (Python MySQLdb and /lib/x86_64-linux-gnu/libmysqlclient.so.21
... I am not what version is libmysqlclient.so.21, this is on Ubuntu 24.04

dbms
5.7.44 latin1
8.0.45 utf8mb4
8.4.8 utf8mb4
10.6.25 latin1
10.11.16 latin1
11.4.10 utf8mb4

March 28, 2026

Measuring AI Ability to Complete Long Software Tasks

This paper from METR (Model Evaluation & Threat Research) introduces a new metric for tracking AI progress: the "50%-task-completion time horizon". This denotes the length of software engineering task (measured by how long a skilled human developer takes to complete it) that the AI model can finish with 50% success rate. The researchers evaluated 12 frontier AI models on 170 tasks across three benchmarks: HCAST (97 diverse software tasks ranging from 1 minute to 30 hours), RE-Bench (7 difficult ML research engineering tasks, each 8 hours), and SWAA (66 short software actions taking 1-30 seconds). To calibrate task difficulty, they collected over 800 human baselines from professional developers, totaling 2,529 hours of work.

The headline finding is that this time horizon has been doubling every 7 months since 2019. GPT-2 could handle tasks that take humans about 2 seconds. The o3 model reached 110 minutes. Extrapolating this trend, AI reaches a one-month time horizon (167 working hours) between mid-2028 and mid-2031, with a central estimate of mid-2029. To be precise about what this means: the "167 working hours" refers to task complexity as measured by human effort, not AI execution time. An AI agent with a one-month time horizon could complete a task that takes a human developer a month of work within hours; it works continuously, doesn't context-switch, and executes steps faster. What the one-month time horizon means is that, AI would be able to autonomously build a SaaS MVP, migrate a 200k-line legacy codebase to microservices, or implement complex features end-to-end.

That said, "50% success rate" is doing the heavy lifting in this claim. There is a significant reliability gap: the 80% success-rate time horizon is 4-6x shorter than the 50% horizon. A model that can sometimes complete a month-long task can only reliably complete week-long ones. Models also struggle in messy environments that lack clear feedback loops (like automated tests) and require proactive information gathering. This means well-structured well-tested codebases will benefit far more than legacy systems with poor documentation. That is, greenfield work will see more gains than deep maintenance of complex existing systems.

Another caveat is that, experiments with pull requests showed that AI performance aligns more closely with low-context contractors (who take 5-18x longer than expert maintainers) than with experienced developers who know the codebase. This matters because the extrapolation is calibrated against general-purpose developers, not domain experts. When the paper says "one-month time horizon", it means one month of contractor-level work, not one month of work by someone who built the system. For tasks that require deep institutional knowledge (debugging subtle production issues, evolving tightly coupled legacy systems) the effective time horizon is much shorter.

I think the external validity checks were a strong part of the paper. The paper shows that the trend holds on SWE-bench (with an even faster 70-day doubling time). Again, messy tasks show lower absolute success but the same rate of improvement. The methodology is clean and reproducible. The main weakness is the task distribution. These 170 tasks are mostly software tasks with high inter-model task correlation (r=0.73), and they largely represent isolated coding challenges rather than the full breadth of real-world software engineering. True full-stack ownership (gathering specifications, architectural decision-making, deployment, and live operations) remains untested here. Furthermore, while AI excels at isolated prototyping, engineering end-to-end production-grade distributed systems requires a high level of fault tolerance/reliability that remains incredibly challenging to automate.


Discussion: What does a one-month AI time horizon world look like?

If the extrapolation holds and AI systems can automate month-long software tasks by ~2029, what are the implications?

This changes the unit economics of software fundamentally. Tasks that currently require a team of developers working for weeks (building integrations, standing up new services, migrating databases, writing and testing features) become something you can delegate to an AI agent for a fraction of the cost and time. The AI agent operates at the level of a competent junior-to-mid engineer working on a well-scoped project. Software goes from expensive and slow to cheap and fast.

But something tells me we will find ways to squander this. When hardware kept getting faster through Moore's law, we produced bloated software that felt just as slow. Wirth's law ate Moore's law. When cloud computing made infrastructure cheap, we built sprawling microservice architectures that ended up like a ball of yarn, tangled, and sprawling, and hard to unravel. Every time we reduce one cost, we expand scope until the new bottleneck hurts just as much as the old one. I expect AI-generated software to follow the same pattern: more software would mean more complexity, and new categories of problems we don't yet have names for. 50 shades of metastability?


What does the month-long task horizon mean for big tech, enterprise software, and startups? How the hell should I know? But let me speculate anyway, since you are still reading (or at least your LLMs are). 

Big tech companies employ tens of thousands of engineers specifically for the month-long projects this extrapolation targets: internal tools, service migrations, API implementations, data pipelines. AI will accelerate all of this. But these companies are structured around large engineering orgs. Promotion ladders, team hierarchies, planning processes all assume that software requires massive human headcount. If one engineer with AI agents does the work of five, the traditional organizational model breaks. Middle management's role become muddled. We may see flattening hierarchies and small high-leverage teams. Whether the behemoth organizations can handle such a rapid restructuring is unclear.

For infrastructure and database companies, the Jevons Paradox applies. Cheaper software means more of it: more applications, more databases, more demand for managed services. The total addressable market could grow substantially. But the flip side is that AI agents that can build month-long projects can also evaluate and switch infrastructure. Vendor lock-in weakens when migration is cheap, and customers become more price-sensitive. There's also the question of whether AI agents will develop strong default preferences shaped by their training data or their creator companies. This is a brand new market force, which we didn't have to contend with before.

I think the most dramatic impact is on software startups. Today, the primary cost of a startup is engineering talent. If AI handles month-long implementation tasks, that cost drops by an order of magnitude. The barrier to entry collapses. Many more products get built.  That may mean that differentiation shifts to domain expertise, data moats, and taste. I forecasted a similar trend earlier for academic papers.

The reliability gap tells us the world of 2029-2031 is probably not "AI replaces developers" but "developers who use AI effectively are 5-10x more productive". The scarce resource shifts from the ability to write code to the ability to specify what to build, evaluate whether it is correct, and manage the complexity of systems that grow much faster than before. What worries me more is whether organizations can restructure fast enough to keep up.

March 27, 2026

Tournament of Databases: Round 2

Round 2 is set: Oracle vs. MongoDB, MySQL vs. DuckDB, PostgreSQL vs. Snowflake, SQL Server vs. Databricks. DuckDB's run continues. Who makes the Final Four?

PostgreSQL: Bye-Bye MD5 Authentication. What’s Next?

Introduction MD5 has been the most popular algorithm for encoding passwords in PostgreSQL and other database systems. It is simple to implement, causes less overhead and less latency for completing the authentication, and this is why it has been the most preferred method. However, a discussion thread in the PG community has given a signal […]

March 26, 2026

OSTEP Chapter 13: The Abstraction of Address Spaces

Chapter 13 of OSTEP provides a primer on how and why modern operating systems abstract physical hardware.

This is part of our series going through OSTEP book chapters. The OSTEP textbook is freely available at Remzi's website if you like to follow along.


Multiprogramming and Time Sharing

In the early days of computing, machines didn't provide much of a memory abstraction to users. The operating system was essentially a library of routines sitting at the bottom of physical memory, while a single running process occupied the rest of the available space. 

I am not that old, but I got to experience this in 2002 through programming sensor nodes with TinyOS. These "motes" were operating on extremely constrained hardware with just 64KB of memory, so there was no complex virtualization. The entire OS and the application were compiled together as a single process, and all memory was statically preallocated. Ah, the joy of debugging a physically distributed multi-node deployment using three LEDs per node, with everyone shouting "red fast-blinking, yellow on". Here’s a snapshot from two mote deployments we ran at UB. Most of my pre-2010 work was on sensor networks. I realize I never wrote about that period. Maybe there’s a reason.

The shift away from the simple early computer systems began with multiprogramming. The OS switched among ready processes to increase CPU utilization on expensive machines. Later, the demand for interactivity gave rise to time sharing. Initially, implementing time sharing involved saving a process's entire memory state to disk and loading another, but this had terrible overhead. Thus, operating systems began keeping multiple processes resident in memory, enabling fast context switches.



The Address Space and Virtualization

To solve the protection problems caused by sharing memory, the OS introduced the address space. This abstraction is a running program's view of memory, containing three components. 

  • Program Code: The static instructions, placed at the top of the address space.
  • Heap: Dynamically allocated, user-managed memory that grows downward.
  • Stack: Used for local variables and function calls, growing upward from the bottom.

The crux of virtual memory is how the OS translates virtual addresses into physical addresses. The OS aims for three primary goals. Transparency: The program should act as if it has its own private physical memory. Efficiency: The OS must minimize time and space overhead. Protection: The OS must isolate processes, ensuring a program cannot affect the memory of the OS or other applications.


GPUs and Beyond

So, do these classic principles still apply to modern hardware like GPUs? Yes, but the scale and complexity is larger. Early CPUs handled a few processes; GPUs handle thousands of threads. Yet, the foundational abstractions (virtual address spaces, hardware-assisted translation, and memory isolation) mostly stay the same (for now). CUDA Unified Memory pushes things further with a single address space across system RAM and GPU VRAM.

A major challenge in modern machine learning is the so-called memory wall. AI accelerators perform computations so fast that they waste most of their time waiting for data to arrive. Conventional system RAM cannot deliver the required bandwidth, and this forced the adoption of High Bandwidth Memory (HBM): stacks of memory chips placed adjacent to compute cores, connected with ultra-fine wiring capable of moving terabytes of data per second.

The memory wall is so severe that the main bottleneck in scaling LLMs today is the memory. Generating text relies on the attention mechanism, which keeps KeyValue representations of all previous tokens to avoid recomputing context. When we present an AI with a huge codebase, the GPU’s ultra-fast but limited HBM will quickly be exhausted. To avoid running out of memory, engineers come up with new designs. They use tiered memory, and keep hot data while spilling overflow to cheaper CXL-attached system RAM. Cache offloading is another key technique: it moves inactive users' KV caches to the host CPU or fast NVMe SSDs to free GPU space.  

2026 – MySQL Ecosystem Performance Benchmark Report

By Percona Lab Results  ·  2026  ·  MySQL MariaDB Percona Benchmark Database MySQL Ecosystem Performance Benchmark Report 2026 Comparative Analysis of InnoDB-Compatible Engines — Percona Lab Results Repository: github.com/Percona-Lab-results/2026-interactive-metrics Interactive graphs available: Explore the full dataset dynamically — click any graph below to open the interactive version. OLTP Read-Write Local — interactive benchmark graph OLTP […]

Navigating Regional Network Blocks

A look at recent ISP and government-directed blocks of Supabase domains in three regions—what triggered them, how we worked with authorities and customers to restore access, and what multi-tenant platforms can do to prepare.

March 25, 2026

Non-First Normal Forms ( 1NF) and MongoDB: an alternative to 4NF to address 3NF anomalies

SQL databases are grounded in relational algebra, but they are not the only databases with a strong theoretical basis. MongoDB, designed as a document database to solve practical engineering problems and improve developer experience, also builds on theory. It supports non–first-normal-form schemas and extends relational operators to work with them. This foundation is described in a 1986 paper, published 23 years earlier: Theory of Non-First Normal Form Relational Databases

MongoDB's aggregation pipeline operators ($unwind, $group, $lookup, set operations) are concrete implementations of the paper's abstract algebraic operators.

I've build the following examples while reading the paper to illustrate the concepts with practical examples.

The Basic ¬1NF Data Model

The paper defines nested relations where attributes can be atomic or relation-valued:

This structure records facts about employees, like some information about their children, their skills, and exams they passed.

This maps directly to a MongoDB document schema:

// This IS the paper's example, expressed as MongoDB documents
db.employees.insertMany([
  {
    ename: "Smith",
    Children: [
      { name: "Sam", dob: "2/10/84" },
      { name: "Sue", dob: "1/20/85" }
    ],
    Skills: [
      {
        type: "typing",
        Exams: [
          { year: 1984, city: "Atlanta" },
          { year: 1985, city: "Dallas" }
        ]
      },
      {
        type: "dictation",
        Exams: [
          { year: 1984, city: "Atlanta" }
        ]
      }
    ]
  },
  {
    ename: "Watson",
    Children: [
      { name: "Sam", dob: "3/12/78" }
    ],
    Skills: [
      {
        type: "filing",
        Exams: [
          { year: 1984, city: "Atlanta" },
          { year: 1975, city: "Austin" },
          { year: 1971, city: "Austin" }
        ]
      },
      {
        type: "typing",
        Exams: [
          { year: 1962, city: "Waco" }
        ]
      }
    ]
  }
])

MongoDB's document model is essentially what Roth called a "database scheme" — a collection of rules where attributes can be zero-order (scalar fields) or higher-order (embedded arrays of documents). The paper's Figure 3-1 is literally a MongoDB collection:

When using an object-oriented language, this model looks natural but this paper is from 1986 so there was another motivation.

Motivation: ¬1NF as an alternative to 4NF decomposition

The paper's example (Figure 1-1) shows an employee relation in 1NF requires 10 rows with massive redundancy:

employee | child | skill
-----------------------------
Smith    | Sam   | typing
Smith    | Sue   | typing
Smith    | Sam   | filing
Smith    | Sue   | filing
Jones    | Joe   | typing
Jones    | Mike  | typing
Jones    | Joe   | dictation
Jones    | Mike  | dictation
Jones    | Joe   | data entry
Jones    | Mike  | data entry

The ¬1NF version needs only 2 tuples. The paper notes three problems with 1NF:

  • Insert anomaly: Adding a child for Jones requires adding 3 tuples (one per skill)
  • Update anomaly: Changing Smith's "typing" to "word processing" requires updating multiple rows
  • Decomposition cost: The 1NF solution requires splitting into two tables and joining them back

Today, these anomalies are not a major problem because a single updateMany() or insertMany operation in a MongoDB transaction is atomic, just like an UPDATE or INSERT in SQL databases. More importantly, the document model makes multi-document operations rare—the Cartesian explosion of redundant data remains undesirable.

The first four normal forms (1NF, 2NF, 3NF, BCNF) do not address this, because there are no non-trivial functional dependencies in this relation: all three attributes together form the only candidate key. This relation is therefore already in BCNF. This is precisely why Fagin introduced 4NF in 1977.

This example illustrates a multivalued dependency (MVD) violation: children and skills are independent multivalued attributes of an employee, but they’re stored together. This creates a redundant Cartesian product (10 rows for 2 employees). Fourth Normal Form (4NF) addresses this by splitting the independent dependencies into separate tables to remove these anomalies, but at the cost of more tables touched per transaction and more joins per query.

Even a Third Normal Form (3NF) schema can still suffer from insert and update anomalies and data redundancy. MongoDB’s document model can offer many of 4NF’s benefits without incurring the join overhead.

Here is the MongoDB Equivalent:

// The ¬1NF representation — exactly what MongoDB does naturally
db.employees.insertMany([
  {
    employee: "Smith",
    Children: [{ child: "Sam" }, { child: "Sue" }],
    Skills: [{ skill: "typing" }, { skill: "filing" }]
  },
  {
    employee: "Jones",
    Children: [{ child: "Joe" }, { child: "Mike" }],
    Skills: [
      { skill: "typing" },
      { skill: "dictation" },
      { skill: "data entry" }
    ]
  }
])

Adding a new child for Jones is just one operation, no anomaly:


db.employees.updateOne(
  { employee: "Jones" },
  { $push: { Children: { child: "Sara" } } }
)

Changing Smith's "typing" to "word processing" — one operation:


db.employees.updateOne(
  { employee: "Smith", "Skills.skill": "typing" },
  { $set: { "Skills.$.skill": "word processing" } }
)

Querying everything about an employee doesnt need a join:


db.employees.findOne({ employee: "Smith" })

MongoDB exists for a fundamental reason: its document model eliminates the need for decomposition and joins, as well as the update anomalies introduced by the 1NF constraint. Most workloads rely on single-document operations, which MongoDB physically optimizes as single-shard transactions, a single disk read or write, and a single replication operation. These inserts and updates lock the document, which serves as the consistency boundary, but modify only the fields whose values change and update only the corresponding index entries.

Nest and Unnest Operators

The paper defines nest (ν) to aggregate flat rows into nested structures, and unnest (μ) to flatten nested structures back:

  • ν_{B=(C,D)}(r): nest attributes C,D into a new nested relation B
  • μ_{B}(r): unnest nested relation B

MongoDB's aggregation pipeline has direct equivalents.

The paper's unnest (μ) operator is $unwind:

db.employees.aggregate([
  { $unwind: "$Skills" }
])

Each skill becomes its own document, duplicating employee info.

Deep unnest (μ* in the paper) is possible with multiple $unnest stages:

db.employees.aggregate([
  { $unwind: "$Children" },
  { $unwind: "$Skills" }
])

This produces the full 1NF Cartesian expansion, flattening both Children and Skills.

When starting with a flat collection, the paper's nest operator (ν) is $group:

db.flat_employees.aggregate([
  {
    $group: {
      _id: "$employee",
      Children: { $addToSet: { child: "$child" } },
      Skills:   { $addToSet: { skill: "$skill" } }
    }
  }
])

The paper proves that the order of unnesting doesn't matter (Thomas & Fischer's result). In MongoDB, these produce the same fully-flat result:

db.employees.aggregate([
  { $unwind: "$Children" },
  { $unwind: "$Skills" }
])

db.employees.aggregate([
  { $unwind: "$Skills" },
  { $unwind: "$Children" }
])

The paper also notes that nest is not always an inverse for unnest in general, but IS an inverse for Partitioned Normal Form (PNF) relations.

Partitioned Normal Form (PNF)

PNF requires that the atomic attributes of each relation (and each nested relation) form a key. The paper shows a pathological relation (Figure 1-3) that violates PNF:

Smith, the same employee, has two different skill sets, and Jones has a duplicate across sets.

In MongoDB, PNF corresponds to the fundamental design principle that _id determines the document. The pathological case above would mean having two documents for "Smith" with different Skills arrays — which is exactly what MongoDB's _id uniqueness constraint prevents at the collection level.

Without using _id as the employee identifier, we could have two employees with the same name:

db.bad_design.insertMany([
  { employee: "Smith", Skills: ["typing", "filing"] },
  { employee: "Smith", Skills: ["sorting", "mailing"] }
])

The correct design if we don't want to use _id is enforcing PNF with a unique index:

db.good_design.createIndex({ employee: 1 }, { unique: true })

db.good_design.insertOne({
  employee: "Smith",
  Skills: ["typing", "filing", "sorting", "mailing"]
})

Like in many papers, the employee name is used as an identifier (two "Smith" is one person) but obviously in real life there's a generated identifier to identify the physical person.

The paper, in theorem 5-1, proves that PNF is closed under unnesting. In MongoDB terms: if your documents are well-designed (one document per logical entity), then $unwind won't create ambiguous or inconsistent flat results.

For nested relations, PNF also applies recursively.

MongoDB schema validation can enforce the PNF constraint by comparing the size of an array with the size when duplicates are ignored (using $setUnion with an empty array):

db.createCollection("employees", {
  validator: {
    $and: [
      // JSON Schema for structure and types
      {
        $jsonSchema: {
          bsonType: "object",
          required: ["ename"],
          properties: {
            ename: { bsonType: "string" },
            Children: { bsonType: "array", items: { bsonType: "object", required: ["name"], properties: { name: { bsonType: "string" }, dob: { bsonType: "string" } } } },
            Skills: { bsonType: "array", items: { bsonType: "object", required: ["type"], properties: { type: { bsonType: "string" },
             Exams: { bsonType: "array", items: { bsonType: "object", required: ["year"], properties: { year: { bsonType: "int" }, city: { bsonType: "string" } } } } } } }
          }
        }
      },
      // PNF uniqueness constraints using $expr
      {
        $expr: {
          $and: [
            // Level 1: Children.name must be unique within the document
            {
              $eq: [
                { $size: { $ifNull: ["$Children.name", []] } },
                { $size: { $setUnion: [{ $ifNull: ["$Children.name", []] }] } }
              ]
            },
            // Level 1: Skills.type must be unique within the document
            {
              $eq: [
                { $size: { $ifNull: ["$Skills.type", []] } },
                { $size: { $setUnion: [{ $ifNull: ["$Skills.type", []] }] } }
              ]
            },
            // Level 2: Within EACH Skills element, Exams.year must be unique
            {
              $allElementsTrue: {
                $map: {
                  input: { $ifNull: ["$Skills", []] },
                  as: "skill",
                  in: { $eq: [ { $size: { $ifNull: ["$$skill.Exams.year", []] } }, { $size: { $setUnion: [{ $ifNull: ["$$skill.Exams.year", []] }] } } ] }
                }
              }
            }
          ]
        }
      }
    ]
  },
  validationLevel: "strict",
  validationAction: "error"
});

The fact that nest is not always the inverse of unnest is a well-known MongoDB pitfall:

db.employees.drop()  
db.employees.insertMany([  
  {  
    _id: 1,  
    employee: "Smith",  
    Children: [{ child: "Sam" }, { child: "Sue" }],  
    Skills: [{ skill: "typing" }]  
  },  
  {  
    _id: 2,  
    employee: "Smith",  
    Children: [{ child: "Tom" }],  
    Skills: [{ skill: "filing" }, { skill: "sorting" }]  
  },  
  {  
    _id: 3,  
    employee: "Jones",  
    Children: [{ child: "Joe" }],  
    Skills: [{ skill: "typing" }]  
  }  
])  

db.employees.aggregate([  
  // unnest (μ) 
  { $unwind: "$Children" },  
  // nest (ν), grouping by employee, which is NOT a key  
  {  
    $group: {  
      _id: "$employee",  
      Children: { $addToSet: "$Children" },  
      Skills: { $first: "$Skills" }  
    }  
  },  
  // Clean up  
  { $project: { _id: 0, employee: "$_id", Children: 1, Skills: 1 } },  
  { $sort: { employee: 1 } }  
])  

This results in two Smiths collapsed into one:

[
  {
    Children: [ { child: 'Joe' } ],
    Skills: [ { skill: 'typing' } ],
    employee: 'Jones'
  },
  {
    Children: [ { child: 'Tom' }, { child: 'Sam' }, { child: 'Sue' } ],
    Skills: [ { skill: 'typing' } ],
    employee: 'Smith'
  }
]

The correct way is grouping by the proper key:

db.employees.aggregate([  
  // unnest (μ)
  { $unwind: "$Children" },  
  // nest (ν), grouping by _id, which IS a key (PNF)  
  {  
    $group: {  
      _id: "$_id",  
      employee: { $first: "$employee" },  
      Children: { $push: "$Children" },  
      Skills: { $first: "$Skills" }  
    }  
  },  
  // Clean up  
  { $project: { _id: 1, employee: 1, Children: 1, Skills: 1 } },  
  { $sort: { _id: 1 } }  
])  

This is the paper's Theorem 5-2 demonstrated in MongoDB: nest inverts unnest if and only if the grouping key satisfies PNF.

Extended Algebra Operators

I a previous post, From Relational Algebra to Document Semantics I explained how MongoDB extended the semantics of the relational selection Selection (σ) to non-1NF schemas, and mentioned that other relational operations are available in MongoDB. They are covered by the ¬1NF paper.

The paper defines extended union (∪ᵉ) to merge nested relations for tuples that agree on atomic attributes, rather than treating them as separate tuples. In MongoDB, this is achieved with $merge or with aggregation. Suppose we want to merge new course data into existing student records:

db.students.insertMany([
  {
    sname: "Jones",
    Courses: [
      { cname: "Math", grade: "A" },
      { cname: "Science", grade: "B" }
    ]
  },
  {
    sname: "Smith",
    Courses: [
      { cname: "Math", grade: "A" },
      { cname: "Physics", grade: "C" },
      { cname: "Science", grade: "A" }
    ]
  }
])

db.students.updateOne(
  { sname: "Jones" },
  { $addToSet: { Courses: { cname: "Physics", grade: "B" } } }
)

db.students.updateOne(
  { sname: "Smith" },
  {
    $addToSet: {
      Courses: {
        $each: [
          { cname: "Chemistry", grade: "A" },
          { cname: "English", grade: "B" }
        ]
      }
    }
  }
)

This added "Physics:B" to Jones, and "Chemistry:A", "English:B" to Smith without adding two tuples with different course sets like a standard union would do:

db.students.find()

[
  {
    _id: ObjectId('69c3a72344fc089068d4b0c2'),
    sname: 'Jones',
    Courses: [
      { cname: 'Math', grade: 'A' },
      { cname: 'Science', grade: 'B' },
      { cname: 'Physics', grade: 'B' }
    ]
  },
  {
    _id: ObjectId('69c3a72344fc089068d4b0c3'),
    sname: 'Smith',
    Courses: [
      { cname: 'Math', grade: 'A' },
      { cname: 'Physics', grade: 'C' },
      { cname: 'Science', grade: 'A' },
      { cname: 'Chemistry', grade: 'A' },
      { cname: 'English', grade: 'B' }
    ]
  }
]

The paper emphasizes that standard set union treats entire tuples as atomic — if two tuples differ at all, both appear in the result. Extended union is ... (truncated)

Announcing VillageSQL Server 0.0.3

VillageSQL 0.0.3 adds parametrized types, MySQL 8.4.8 LTS support, expanded SQL compatibility, and ChatGPT and Ollama inference via the vsql-ai extension.