Commit 1bc5468

stats: fix "non-nullable col with no value" for multiple column families
This commit fixes an edge case in the timestamp-advancing mechanism of the inconsistent scans used by table statistics collection. Previously, on a table with multiple column families, the scan could stop in the middle of a SQL row; if that row existed at the old timestamp but no longer exists at the new one, we'd hit an internal error.

The bug is fixed by setting the WholeRowsOfSize option on the BatchRequest whenever we're scanning a table with multiple column families, which guarantees that the scan at each timestamp stops only at SQL row boundaries. To reproduce this behavior, I extended an existing test that stresses the timestamp-advancing mechanism.

Release note (bug fix): Previously, on a table with multiple column families, CockroachDB could in rare cases encounter a "Non-nullable column "‹×›:‹×›" with no value" error during table statistics collection. The bug had been present since v19.2 and is now fixed.
1 parent 420663f commit 1bc5468
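
To make the failure mode concrete, below is a self-contained toy sketch in Go. It is not CockroachDB code: kvPair, scanBatch, and the snapshot contents are invented for illustration. It shows why a batch-limited scan that advances its read timestamp between batches can assemble a partial SQL row when rows span multiple column families, and how a WholeRowsOfSize-style trim avoids that:

package main

import "fmt"

// kvPair is one KV entry: a SQL row key plus a column-family ID. With
// multiple column families, one SQL row maps to several adjacent KV pairs.
type kvPair struct {
	row, family int
}

func (a kvPair) less(b kvPair) bool {
	return a.row < b.row || (a.row == b.row && a.family < b.family)
}

// scanBatch returns up to limit pairs from the ordered snapshot snap,
// starting at the key start. If wholeRows is set, a trailing partial row
// is trimmed off (mimicking the WholeRowsOfSize option) so that the batch
// ends exactly on a SQL-row boundary.
func scanBatch(snap []kvPair, start kvPair, limit, lastFamily int, wholeRows bool) []kvPair {
	var out []kvPair
	for _, kv := range snap {
		if kv.less(start) {
			continue
		}
		if len(out) == limit {
			break
		}
		out = append(out, kv)
	}
	if wholeRows && len(out) > 0 && out[len(out)-1].family != lastFamily {
		// Drop the incomplete trailing row; the next batch re-reads it whole.
		partial := out[len(out)-1].row
		for len(out) > 0 && out[len(out)-1].row == partial {
			out = out[:len(out)-1]
		}
	}
	return out
}

func main() {
	// Snapshot at the old timestamp: rows 0 and 1, three families each.
	oldTS := []kvPair{{0, 0}, {0, 1}, {0, 2}, {1, 0}, {1, 1}, {1, 2}}
	// Snapshot at the advanced timestamp: row 1 was deleted and GCed.
	newTS := []kvPair{{0, 0}, {0, 1}, {0, 2}}

	for _, wholeRows := range []bool{false, true} {
		// A 4-key batch limit stops the first batch mid-row unless trimmed.
		first := scanBatch(oldTS, kvPair{0, 0}, 4, 2, wholeRows)
		// The second batch runs at the new timestamp and resumes right
		// after the last key returned by the first batch.
		last := first[len(first)-1]
		second := scanBatch(newTS, kvPair{last.row, last.family + 1}, 4, 2, wholeRows)
		fmt.Printf("wholeRows=%v: first=%v second=%v\n", wholeRows, first, second)
	}
}

With wholeRows=false, the assembled result keeps row 1 with only its first family, a partial SQL row whose remaining non-nullable columns have no values, which is exactly the shape of the error above. With wholeRows=true, the first batch is trimmed back to the row boundary, so the deleted row is simply absent from the result.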

2 files changed: +70 −26

pkg/sql/create_stats_test.go

Lines changed: 62 additions & 26 deletions
@@ -38,14 +38,20 @@ func TestStatsWithLowTTL(t *testing.T) {
 	// The test depends on reasonable timings, so don't run under race.
 	skip.UnderRace(t)
 
+	rng, _ := randutil.NewTestRand()
 	var blockTableReader atomic.Bool
 	blockCh := make(chan struct{})
 
 	s, db, _ := serverutils.StartServer(t, base.TestServerArgs{
 		Knobs: base.TestingKnobs{
 			DistSQL: &execinfra.TestingKnobs{
-				// Set the batch size small to avoid having to use a large number of rows.
-				TableReaderBatchBytesLimit: 100,
+				// Set the batch size small to avoid having to use a large
+				// number of rows.
+				//
+				// We use a random bytes limit so that the scans have a chance
+				// to stop at different points within the SQL row (in case of
+				// multiple column families).
+				TableReaderBatchBytesLimit: 50 + int64(rng.Intn(100)),
 				TableReaderStartScanCb: func() {
 					if blockTableReader.Load() {
 						<-blockCh
@@ -57,51 +63,80 @@ func TestStatsWithLowTTL(t *testing.T) {
 	defer s.Stopper().Stop(context.Background())
 
 	r := sqlutils.MakeSQLRunner(db)
-	r.Exec(t, `
-SET CLUSTER SETTING sql.stats.automatic_collection.enabled = false;
-`)
-	r.Exec(t, `
-CREATE DATABASE test;
-USE test;
-CREATE TABLE t (k INT PRIMARY KEY, a INT, b INT);
-`)
-	const numRows = 20
-	r.Exec(t, `INSERT INTO t SELECT k, 2*k, 3*k FROM generate_series(0, $1) AS g(k)`, numRows-1)
+	r.Exec(t, `SET CLUSTER SETTING sql.stats.automatic_collection.enabled = false;`)
+	// Sometimes use single column family, sometimes use multiple.
+	if rng.Float64() < 0.5 {
+		r.Exec(t, `CREATE TABLE t (k INT PRIMARY KEY, a INT NOT NULL, b INT NOT NULL, FAMILY (k, a, b));`)
+	} else {
+		r.Exec(t, `CREATE TABLE t (k INT PRIMARY KEY, a INT NOT NULL, b INT NOT NULL, FAMILY (k), FAMILY (a), FAMILY (b));`)
+	}
+	const initialNumRows, maxNumRows = 20, 100
+	r.Exec(t, `INSERT INTO t SELECT k, 2*k, 3*k FROM generate_series(0, $1) AS g(k)`, initialNumRows-1)
 
-	// Start a goroutine that keeps updating rows in the table and issues
-	// GCRequests simulating a 2 second TTL. While this is running, reading at a
-	// timestamp older than 2 seconds will likely error out.
+	// Start a goroutine that keeps modifying rows (updating, deleting,
+	// inserting new ones) in the table and issues GCRequests simulating a 2
+	// second TTL. While this is running, reading at a timestamp older than 2
+	// seconds will likely error out.
 	var goroutineErr error
 	var wg sync.WaitGroup
 	wg.Add(1)
 	stopCh := make(chan struct{})
+	// onlyUpdates determines whether only UPDATE stmts are issued in the
+	// goroutine.
+	var onlyUpdates atomic.Bool
+	onlyUpdates.Store(true)
 
 	go func() {
 		defer wg.Done()
 
 		// Open a separate connection to the database.
 		db2 := s.SQLConn(t)
 
-		_, err := db2.Exec("USE test")
-		if err != nil {
-			goroutineErr = err
-			return
-		}
-		rng, _ := randutil.NewTestRand()
+		nextPK := initialNumRows
 		for {
 			select {
 			case <-stopCh:
 				return
 			default:
 			}
-			k := rng.Intn(numRows)
-			if _, err := db2.Exec(`UPDATE t SET a=a+1, b=b+2 WHERE k=$1`, k); err != nil {
-				goroutineErr = err
-				return
+			switch rng.Intn(4) {
+			case 0, 1:
+				// In 50% cases try to update an existing row (it's ok if the
+				// row doesn't exist).
+				k := rng.Intn(nextPK)
+				if _, err := db2.Exec(`UPDATE t SET a=a+1, b=b+2 WHERE k=$1`, k); err != nil {
+					goroutineErr = err
+					return
+				}
+			case 2:
+				if onlyUpdates.Load() {
+					continue
+				}
+				// In 25% cases try to delete a row (it's ok if the row doesn't
+				// exist).
+				k := rng.Intn(nextPK)
+				if _, err := db2.Exec(`DELETE FROM t WHERE k=$1`, k); err != nil {
+					goroutineErr = err
+					return
+				}
+			case 3:
+				if onlyUpdates.Load() {
+					continue
+				}
+				// In 25% cases insert a new row, but don't insert too many rows
+				// to allow for the stats collection to complete.
+				if nextPK == maxNumRows {
+					continue
+				}
+				if _, err := db2.Exec(`INSERT INTO t SELECT $1, 2*$1, 3*$1`, nextPK); err != nil {
+					goroutineErr = err
+					return
+				}
+				nextPK++
 			}
 			// Force a table GC of values older than 2 seconds.
 			if err := s.ForceTableGC(
-				context.Background(), "test", "t", s.Clock().Now().Add(-int64(2*time.Second), 0),
+				context.Background(), "defaultdb", "t", s.Clock().Now().Add(-int64(2*time.Second), 0),
 			); err != nil {
 				goroutineErr = err
 				return
@@ -139,6 +174,7 @@ SET CLUSTER SETTING sql.stats.automatic_collection.enabled = false;
 
 	// Set up timestamp advance to keep timestamps no older than 1s.
 	r.Exec(t, `SET CLUSTER SETTING sql.stats.max_timestamp_age = '1s'`)
+	onlyUpdates.Store(false)
 
 	// Block start of the inconsistent scan for 2s so that the initial timestamp
 	// becomes way too old.

pkg/sql/row/fetcher.go

Lines changed: 8 additions & 0 deletions
@@ -702,6 +702,14 @@ func (rf *Fetcher) StartInconsistentScan(
 		}
 	}
 
+	if rf.args.Spec.MaxKeysPerRow > 1 {
+		// If the table has multiple column families, we need to ensure that
+		// the scan stops at the end of the full SQL row - otherwise, the
+		// row might be deleted between two timestamps leading to incorrect
+		// results (or internal errors).
+		ba.Header.WholeRowsOfSize = int32(rf.args.Spec.MaxKeysPerRow)
+	}
+
 	log.VEventf(ctx, 2, "inconsistent scan: sending a batch with %d requests", len(ba.Requests))
 	res, err := txn.Send(ctx, ba)
 	if err != nil {
