Skip to content

Commit acdf8e7

Browse files
PS-9683 : Enable CHECK TABLE EXTENDED to detect InnoDB LOB corruptions
https://perconadev.atlassian.net/browse/PS-9683 Problem: -------- In some customer environments, it was found that an external LOB's first page is shared between two records. This shouldn't be possible, but it can happen in rare cases. The root cause is not known yet. Using a table in this state can lead to corruption and assertion failures. Fix: --- We can detect such a scenario by scanning all records and the external LOB's first page. The EXTENDED keyword is currently ignored by InnoDB. We use it to enable the LOB checks and mark the index as corrupted if an external LOB's first page is shared between two records. A thread local blob map is used to identify the duplicate user record that has the same external LOB page. Usage: CHECK TABLE t1 EXTENDED A sample error log when such corruption is detected: 2025-04-11T10:30:28.078607Z 9 [ERROR] [MY-011825] [InnoDB] Invalid record! External LOB first page cannot be shared between two records 2025-04-11T10:30:28.078625Z 9 [ERROR] [MY-011825] [InnoDB] The external LOB first page is [page id: space=6, page number=347] 2025-04-11T10:30:28.078631Z 9 [ERROR] [MY-011825] [InnoDB] The first occurence of the external LOB first page is in record : page_no: 3 with heap_no: 6 2025-04-11T10:30:28.078638Z 9 [ERROR] [MY-011825] [InnoDB] The second occurence of the external LOB first page is in record: page_no: 4 with heap no: 7 2025-04-11T10:30:28.078646Z 9 [ERROR] [MY-012738] [InnoDB] Apparent corruption in space 6 page 4 index `PRIMARY` 2025-04-11T10:30:28.078663Z 9 [ERROR] [MY-013050] [InnoDB] In page 4 of index `PRIMARY` of table `test`.`t1` 2025-04-11T10:30:28.088156Z 9 [Warning] [MY-012382] [InnoDB] Cannot open table test/t1Please refer to http://dev.mysql.com/doc/refman/8.0/en/innodb-troubleshooting.html for how to resolve the issue.
1 parent 3f1fccb commit acdf8e7

File tree

6 files changed

+271
-5
lines changed

6 files changed

+271
-5
lines changed
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#
2+
# PS-9638 - Enable CHECK TABLE EXTENDED to detect InnoDB LOB corruptions
3+
#
4+
call mtr.add_suppression("\\[ERROR\\] .* Invalid record! External LOB first page cannot be shared between two records");
5+
call mtr.add_suppression("\\[ERROR\\] .* The external LOB first page is \\[page id: space=\\d+, page number=\\d+\\]");
6+
call mtr.add_suppression("\\[ERROR\\] .* The first occurence of the external LOB first page is in record : page_no: \\d+ with heap_no: \\d+");
7+
call mtr.add_suppression("\\[ERROR\\] .* The second occurence of the external LOB first page is in record: page_no: \\d+ with heap no: \\d+");
8+
call mtr.add_suppression("\\[ERROR\\] .* Apparent corruption in space \\d+ page \\d+ index `PRIMARY`");
9+
call mtr.add_suppression("\\[ERROR\\] .* In page \\d+ of index `PRIMARY` of table `test`.`t1`");
10+
call mtr.add_suppression("\\[Warning\\] .* Cannot open table test/t1Please refer to .*innodb-troubleshooting.html for how to resolve the issue.");
11+
CREATE TABLE t1 (id INT PRIMARY KEY, make_big CHAR(200), val LONGBLOB, INDEX idx1(val(50)));
12+
INSERT INTO t1 (id,val) VALUES (1,REPEAT('a',1000000));
13+
INSERT INTO t1 (id,val) VALUES (2,REPEAT('b',1000000));
14+
INSERT INTO t1 (id,val) VALUES (3,REPEAT('c',1000000));
15+
INSERT INTO t1 (id,val) VALUES (4,REPEAT('d',1000000));
16+
INSERT INTO t1 (id,val) VALUES (5,REPEAT('e',1000000));
17+
INSERT INTO t1 (id,val) VALUES (6,REPEAT('f',1000000));
18+
SET DEBUG='+d, simulate_lob_corruption';
19+
CHECK TABLE t1;
20+
Table Op Msg_type Msg_text
21+
test.t1 check status OK
22+
CHECK TABLE t1 EXTENDED;
23+
Table Op Msg_type Msg_text
24+
test.t1 check Warning InnoDB: The B-tree of index PRIMARY is corrupted.
25+
test.t1 check error Corrupt
26+
SELECT * FROM t1;
27+
ERROR 42S02: Table 'test.t1' doesn't exist
28+
DROP TABLE t1;
29+
case 2: compressed table
30+
CREATE TABLE t1 (id INT PRIMARY KEY, make_big CHAR(200), val LONGBLOB, INDEX idx1(val(50))) ROW_FORMAT=COMPRESSED;
31+
INSERT INTO t1 (id,val) VALUES (1,REPEAT('a',1000000));
32+
INSERT INTO t1 (id,val) VALUES (2,REPEAT('b',1000000));
33+
INSERT INTO t1 (id,val) VALUES (3,REPEAT('c',1000000));
34+
INSERT INTO t1 (id,val) VALUES (4,REPEAT('d',1000000));
35+
INSERT INTO t1 (id,val) VALUES (5,REPEAT('e',1000000));
36+
INSERT INTO t1 (id,val) VALUES (6,REPEAT('f',1000000));
37+
SET DEBUG='+d, simulate_lob_corruption';
38+
CHECK TABLE t1;
39+
Table Op Msg_type Msg_text
40+
test.t1 check status OK
41+
CHECK TABLE t1 EXTENDED;
42+
Table Op Msg_type Msg_text
43+
test.t1 check Warning InnoDB: The B-tree of index PRIMARY is corrupted.
44+
test.t1 check error Corrupt
45+
SELECT * FROM t1;
46+
ERROR 42S02: Table 'test.t1' doesn't exist
47+
DROP TABLE t1;
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
--source include/have_debug.inc
2+
3+
--echo #
4+
--echo # PS-9638 - Enable CHECK TABLE EXTENDED to detect InnoDB LOB corruptions
5+
--echo #
6+
call mtr.add_suppression("\\[ERROR\\] .* Invalid record! External LOB first page cannot be shared between two records");
7+
call mtr.add_suppression("\\[ERROR\\] .* The external LOB first page is \\[page id: space=\\d+, page number=\\d+\\]");
8+
call mtr.add_suppression("\\[ERROR\\] .* The first occurence of the external LOB first page is in record : page_no: \\d+ with heap_no: \\d+");
9+
call mtr.add_suppression("\\[ERROR\\] .* The second occurence of the external LOB first page is in record: page_no: \\d+ with heap no: \\d+");
10+
call mtr.add_suppression("\\[ERROR\\] .* Apparent corruption in space \\d+ page \\d+ index `PRIMARY`");
11+
call mtr.add_suppression("\\[ERROR\\] .* In page \\d+ of index `PRIMARY` of table `test`.`t1`");
12+
call mtr.add_suppression("\\[Warning\\] .* Cannot open table test/t1Please refer to .*innodb-troubleshooting.html for how to resolve the issue.");
13+
14+
CREATE TABLE t1 (id INT PRIMARY KEY, make_big CHAR(200), val LONGBLOB, INDEX idx1(val(50)));
15+
16+
INSERT INTO t1 (id,val) VALUES (1,REPEAT('a',1000000));
17+
INSERT INTO t1 (id,val) VALUES (2,REPEAT('b',1000000));
18+
INSERT INTO t1 (id,val) VALUES (3,REPEAT('c',1000000));
19+
INSERT INTO t1 (id,val) VALUES (4,REPEAT('d',1000000));
20+
INSERT INTO t1 (id,val) VALUES (5,REPEAT('e',1000000));
21+
INSERT INTO t1 (id,val) VALUES (6,REPEAT('f',1000000));
22+
23+
SET DEBUG='+d, simulate_lob_corruption';
24+
CHECK TABLE t1;
25+
26+
CHECK TABLE t1 EXTENDED;
27+
28+
--error ER_NO_SUCH_TABLE
29+
SELECT * FROM t1;
30+
31+
DROP TABLE t1;
32+
33+
--echo case 2: compressed table
34+
CREATE TABLE t1 (id INT PRIMARY KEY, make_big CHAR(200), val LONGBLOB, INDEX idx1(val(50))) ROW_FORMAT=COMPRESSED;
35+
36+
INSERT INTO t1 (id,val) VALUES (1,REPEAT('a',1000000));
37+
INSERT INTO t1 (id,val) VALUES (2,REPEAT('b',1000000));
38+
INSERT INTO t1 (id,val) VALUES (3,REPEAT('c',1000000));
39+
INSERT INTO t1 (id,val) VALUES (4,REPEAT('d',1000000));
40+
INSERT INTO t1 (id,val) VALUES (5,REPEAT('e',1000000));
41+
INSERT INTO t1 (id,val) VALUES (6,REPEAT('f',1000000));
42+
43+
SET DEBUG='+d, simulate_lob_corruption';
44+
CHECK TABLE t1;
45+
46+
CHECK TABLE t1 EXTENDED;
47+
48+
--error ER_NO_SUCH_TABLE
49+
SELECT * FROM t1;
50+
51+
DROP TABLE t1;

storage/innobase/fsp/fsp0fsp.cc

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3566,11 +3566,13 @@ bool fseg_page_is_free(fseg_header_t *seg_header, /*!< in: segment header */
35663566

35673567
const page_size_t page_size(space->flags);
35683568

3569-
seg_inode = fseg_inode_get(seg_header, space_id, page_size, &mtr);
3569+
if (seg_header != nullptr) {
3570+
seg_inode = fseg_inode_get(seg_header, space_id, page_size, &mtr);
35703571

3571-
ut_a(seg_inode);
3572-
ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
3573-
ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
3572+
ut_a(seg_inode);
3573+
ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
3574+
ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
3575+
}
35743576

35753577
descr = xdes_get_descriptor(space_id, page, page_size, &mtr);
35763578
ut_a(descr);

storage/innobase/handler/ha_innodb.cc

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18929,11 +18929,25 @@ int ha_innobase::check(THD *thd, /*!< in: user thread handle */
1892918929
continue;
1893018930
}
1893118931

18932+
/* true if user uses CHECK TABLE t1 EXTENDED */
18933+
const bool is_extended = check_opt->flags & T_EXTEND;
18934+
1893218935
if (!(check_opt->flags & T_QUICK) && !index->is_corrupted()) {
1893318936
/* Enlarge the fatal lock wait timeout during
1893418937
CHECK TABLE. */
1893518938
srv_fatal_semaphore_wait_extend.fetch_add(1);
1893618939

18940+
if (is_extended && index->is_clustered()) {
18941+
// Setup the thread local map for clustered index only
18942+
thread_local_blob_map = new blob_ref_map();
18943+
}
18944+
18945+
auto blob_ref_clear_guard = create_scope_guard([]() {
18946+
if (!thread_local_blob_map) return;
18947+
delete thread_local_blob_map;
18948+
thread_local_blob_map = nullptr;
18949+
});
18950+
1893718951
bool valid = btr_validate_index(index, m_prebuilt->trx, false);
1893818952

1893918953
/* Restore the fatal lock wait timeout after
@@ -18947,7 +18961,16 @@ int ha_innobase::check(THD *thd, /*!< in: user thread handle */
1894718961
"InnoDB: The B-tree of"
1894818962
" index %s is corrupted.",
1894918963
index->name());
18950-
continue;
18964+
18965+
// with extended mode, if clustered index is corrupted, it is marked
18966+
// as corrupted. We skip checking other indexes. The table is not
18967+
// repairable and user has to drop it
18968+
if (is_extended && index->is_clustered()) {
18969+
dict_set_corrupted(index);
18970+
break;
18971+
} else {
18972+
continue;
18973+
}
1895118974
}
1895218975
}
1895318976

storage/innobase/include/page0page.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -802,6 +802,24 @@ param[in] index index
802802
@return true if ok */
803803
bool page_is_spatial_non_leaf(const rec_t *rec, dict_index_t *index);
804804

805+
/** A blob map to track the first page no of external LOB and its parent record
806+
which is the <page_no, heap_no>. This is used to find duplicate external LOB
807+
pages that are shared between two records. This can happen only on corruption
808+
(cause unknown yet). CHECK TABLE t1 EXTENDED will use this map to report
809+
corruption and mark the table as corrupted */
810+
using blob_ref_map = std::unordered_map<page_no_t, std::pair<page_no_t, ulint>>;
811+
extern thread_local blob_ref_map *thread_local_blob_map;
812+
813+
/** Validate that the external LOB's first page is not shared between records of
814+
a clustered index
815+
@param[in] rec physical record
816+
@param[in] index index of the table
817+
@param[in] offsets the record offset array
818+
@return true If OK else false if external LOB is found to be shared between two
819+
records, ie false on failure */
820+
bool page_rec_blob_validate(const rec_t *rec, const dict_index_t *index,
821+
const ulint *offsets);
822+
805823
#include "page0page.ic"
806824

807825
#endif

storage/innobase/page/page0page.cc

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ this program; if not, write to the Free Software Foundation, Inc.,
4646
#include "lock0lock.h"
4747
#include "srv0srv.h"
4848
#endif /* !UNIV_HOTBACKUP */
49+
#include "lob0lob.h"
50+
51+
thread_local blob_ref_map *thread_local_blob_map = nullptr;
4952

5053
/* THE INDEX PAGE
5154
==============
@@ -1721,6 +1724,124 @@ bool page_rec_validate(
17211724
return true;
17221725
}
17231726

1727+
/** Validate that the external LOB's first page is not shared between records of
1728+
a clustered index
1729+
@param[in] rec physical record
1730+
@param[in] index index of the table
1731+
@param[in] offsets the record offset array
1732+
@return true If OK else false if external LOB is found to be shared between two
1733+
records, ie false on failure */
1734+
bool page_rec_blob_validate(const rec_t *rec, const dict_index_t *index,
1735+
const ulint *offsets) {
1736+
// this means reference check is not enabled. Enabled only via
1737+
// CHECK TABLE path
1738+
if (thread_local_blob_map == nullptr) {
1739+
return true;
1740+
}
1741+
1742+
// if index is not PRIMARY, return true
1743+
if (!index->is_clustered()) {
1744+
return true;
1745+
}
1746+
1747+
// if page-level is not zero, return true because blob exists only on leaf
1748+
// level
1749+
const page_t *page = page_align(rec);
1750+
if (!page_is_leaf(page)) {
1751+
return true;
1752+
}
1753+
1754+
// if rec is not user record, blobs dont exist, return true
1755+
if (!page_rec_is_user_rec(rec)) {
1756+
return true;
1757+
}
1758+
1759+
// if rec doesn't have any external LOB, return true
1760+
if (!rec_offs_any_extern(offsets)) {
1761+
return true;
1762+
}
1763+
1764+
// if rec is deleted marked, return true, we cannot validate the blob. the
1765+
// blob pages in the deleted marked records could be freed
1766+
if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
1767+
return true;
1768+
}
1769+
1770+
// if rec is not the owner of the blob, we cannot validate if blob page state
1771+
// now validate that the blob first page is not marked as free from page
1772+
// bitmap
1773+
1774+
ulint n_fields = rec_offs_n_fields(offsets);
1775+
1776+
for (ulint i = 0; i < n_fields; i++) {
1777+
if (rec_offs_nth_extern(index, offsets, i)) {
1778+
// We do const_cast to remove constness because lob::ref_t doesn't have a
1779+
// variant that takes const record pointer
1780+
byte *field_ref = const_cast<byte *>(
1781+
lob::btr_rec_get_field_ref(index, rec, offsets, i));
1782+
1783+
lob::ref_t ref(field_ref);
1784+
if (!ref.is_owner() || ref.is_null() || ref.is_null_relaxed() ||
1785+
ref.is_being_modified()) {
1786+
continue;
1787+
}
1788+
1789+
if (ref.length() == 0) {
1790+
// LOB purged
1791+
continue;
1792+
}
1793+
1794+
space_id_t blob_space_id = ref.space_id();
1795+
page_no_t blob_page_no = ref.page_no();
1796+
1797+
page_id_t blob_page_id(blob_space_id, blob_page_no);
1798+
bool is_free = fseg_page_is_free(nullptr, blob_space_id, blob_page_no);
1799+
if (is_free) {
1800+
// This should not be possible. A record that owns the BLOB shouldn't
1801+
// have the first page marked as free in page bitmap
1802+
ut_ad(0);
1803+
ib::error() << "Invalid record. The record's blob reference is marked"
1804+
<< " as free although the record owns it "
1805+
<< " page_no: " << page_get_page_no(page)
1806+
<< " heap_no: " << page_rec_get_heap_no(rec);
1807+
ib::error() << "BLOB reference that is marked free " << blob_page_id;
1808+
1809+
return false;
1810+
}
1811+
1812+
DBUG_EXECUTE_IF(
1813+
"simulate_lob_corruption",
1814+
// introduce corruption after 5 external LOB entries
1815+
if (thread_local_blob_map->size() >= 5) {
1816+
// we introduce a fake entry in the map
1817+
(*thread_local_blob_map)[blob_page_no] = std::make_pair(
1818+
page_get_page_no(page) - 1, page_rec_get_heap_no(rec) - 1);
1819+
});
1820+
1821+
auto it = thread_local_blob_map->find(blob_page_no);
1822+
if (it == thread_local_blob_map->end()) {
1823+
(*thread_local_blob_map)[blob_page_no] =
1824+
std::make_pair(page_get_page_no(page), page_rec_get_heap_no(rec));
1825+
} else {
1826+
auto val = it->second;
1827+
ib::error() << "Invalid record! External LOB first page cannot be "
1828+
"shared between "
1829+
"two records";
1830+
ib::error() << "The external LOB first page is " << blob_page_id;
1831+
ib::error() << "The first occurence of the external LOB first page is "
1832+
"in record : page_no: "
1833+
<< val.first << " with heap_no: " << val.second;
1834+
ib::error() << "The second occurence of the external LOB first page is "
1835+
"in record: page_no: "
1836+
<< page_get_page_no(page)
1837+
<< " with heap no: " << page_rec_get_heap_no(rec);
1838+
return false;
1839+
}
1840+
}
1841+
}
1842+
return true;
1843+
}
1844+
17241845
#ifndef UNIV_HOTBACKUP
17251846
#ifdef UNIV_DEBUG
17261847
/** Checks that the first directory slot points to the infimum record and
@@ -2234,6 +2355,10 @@ bool page_validate(const page_t *page, dict_index_t *index) {
22342355
goto func_exit;
22352356
}
22362357

2358+
if (!page_rec_blob_validate(const_cast<byte *>(rec), index, offsets)) {
2359+
goto func_exit;
2360+
}
2361+
22372362
DBUG_EXECUTE_IF(
22382363
"check_table_set_wrong_min_bit",
22392364
if (page_rec_is_user_rec(rec) &&

0 commit comments

Comments
 (0)