Skip to content

Commit ed94777

Browse files
mmeller-wikiaadpaste
authored and committed
feat(PLATFORM-10268): optimize migrateLinksTable.php for pagelinks (#112)
* feat(PLATFORM-10268): optimize migrateLinksTable.php for pagelinks
* chore(PLATFORM-10268): improve migrateRevisionCommentTemp migration script performance
* fix(PLATFORM-10268): skip corrupted rows of revision when migrating revision_comment_temp

Note: this change might be dropped from the fork after all wikis are updated to 1.43.
1 parent 2756a5a commit ed94777

File tree

3 files changed

+92
-39
lines changed

3 files changed

+92
-39
lines changed

maintenance/includes/LoggedUpdateMaintenance.php

+9
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,18 @@ public function execute() {
4848
return true;
4949
}
5050

51+
// Fandom-start PLATFORM-10268
52+
// log execution time of each migration script, so we could easily find migration bottlenecks
53+
$start = microtime( true );
54+
$this->output( "Running '$key'\n" );
5155
if ( !$this->doDBUpdates() ) {
56+
$elapsed = microtime( true ) - $start;
57+
$this->output( "'$key' failed after {$elapsed}s\n" );
5258
return false;
5359
}
60+
$elapsed = microtime( true ) - $start;
61+
$this->output( "'$key' finished after {$elapsed}s\n" );
62+
// Fandom-end
5463

5564
$db->newInsertQueryBuilder()
5665
->insertInto( 'updatelog' )

maintenance/migrateLinksTable.php

+53
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,18 @@ protected function doDBUpdates() {
7575
}
7676
$highestPageId = $highestPageId[0];
7777
$pageId = 0;
78+
79+
if ( $table === 'pagelinks' ) {
80+
$createdLinkTargetRows = $this->fillLinkTargetTable();
81+
$this->output( "In total created $createdLinkTargetRows linktarget rows\n" );
82+
83+
$updatedPageLinksRows = $this->handlePagelinksUpdate();
84+
$this->output( "In total updated $updatedPageLinksRows pagelinks rows\n" );
85+
86+
$updated = $updatedPageLinksRows + $createdLinkTargetRows;
87+
$this->output( "Completed normalization of $table, $updated rows updated.\n" );
88+
return true;
89+
}
7890
while ( $pageId <= $highestPageId ) {
7991
// Given the indexes and the structure of links tables,
8092
// we need to split the update into batches of pages.
@@ -88,6 +100,47 @@ protected function doDBUpdates() {
88100
return true;
89101
}
90102

103+
/**
 * Populate linktarget with every (namespace, title) pair that pagelinks
 * references but that has no linktarget row yet.
 *
 * One INSERT ... SELECT statement is issued per batch, capped at the
 * maintenance batch size, followed by a wait for replication. The loop ends
 * after the first batch that inserts fewer rows than the batch size.
 *
 * @return int Total number of linktarget rows created
 */
private function fillLinkTargetTable(): int {
	$batchSize = $this->getBatchSize();
	$dbw = $this->getPrimaryDB();
	// Raw SQL bypasses the query builders, so resolve the table names
	// explicitly; otherwise a configured table prefix would be ignored.
	$linkTargetTable = $dbw->tableName( 'linktarget' );
	$pageLinksTable = $dbw->tableName( 'pagelinks' );
	// GROUP BY collapses duplicate targets so each pair is inserted once per batch;
	// NOT EXISTS skips pairs inserted by earlier batches.
	$query = "INSERT INTO $linkTargetTable (lt_namespace, lt_title)
		SELECT pl_namespace, pl_title FROM $pageLinksTable
		WHERE NOT EXISTS (SELECT * FROM $linkTargetTable WHERE pl_namespace = lt_namespace AND pl_title = lt_title)
		GROUP BY pl_namespace, pl_title
		LIMIT $batchSize";
	$createdRows = 0;
	while ( true ) {
		$dbw->query( $query, __METHOD__ );
		$affectedRows = $dbw->affectedRows();
		$createdRows += $affectedRows;
		$this->output( "Created $affectedRows linktarget rows\n" );
		$this->waitForReplication();
		// A short batch means there was nothing left to insert.
		if ( $batchSize > $affectedRows ) {
			return $createdRows;
		}
	}
}
123+
124+
/**
 * Fill pl_target_id on pagelinks rows that do not have it set yet (NULL or 0)
 * by looking up the matching linktarget row by namespace and title.
 *
 * One UPDATE statement is issued per batch, capped at the maintenance batch
 * size, followed by a wait for replication. The loop ends after the first
 * batch that affects fewer rows than the batch size.
 *
 * NOTE(review): a row with no matching linktarget row gets NULL from the
 * subquery and still matches the WHERE clause afterwards. doDBUpdates() calls
 * fillLinkTargetTable() first, which presumably makes this impossible — confirm.
 *
 * NOTE(review): UPDATE ... LIMIT is not portable SQL; presumably only
 * MySQL/MariaDB is targeted by this script — confirm.
 *
 * @return int Total number of pagelinks rows reported as affected
 */
private function handlePagelinksUpdate(): int {
	$batchSize = $this->getBatchSize();
	$dbw = $this->getPrimaryDB();
	// Raw SQL bypasses the query builders, so resolve the table names
	// explicitly; otherwise a configured table prefix would be ignored.
	$linkTargetTable = $dbw->tableName( 'linktarget' );
	$pageLinksTable = $dbw->tableName( 'pagelinks' );
	$query = "UPDATE $pageLinksTable
		SET pl_target_id = (SELECT lt_id FROM $linkTargetTable WHERE pl_namespace = lt_namespace AND pl_title = lt_title)
		WHERE pl_target_id IS NULL OR pl_target_id = 0
		LIMIT $batchSize";
	$updatedRows = 0;
	while ( true ) {
		$dbw->query( $query, __METHOD__ );
		$affectedRows = $dbw->affectedRows();
		$updatedRows += $affectedRows;
		$this->output( "Updated $affectedRows pagelinks rows\n" );
		$this->waitForReplication();
		// A short batch means no further rows were waiting for a target id.
		if ( $batchSize > $affectedRows ) {
			return $updatedRows;
		}
	}
}
143+
91144
private function handlePageBatch( $lowPageId, $mapping, $table ) {
92145
$batchSize = $this->getBatchSize();
93146
$targetColumn = $mapping[$table]['target_id'];

maintenance/migrateRevisionCommentTemp.php

+30-39
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,6 @@ protected function getUpdateKey() {
4848
}
4949

5050
protected function doDBUpdates() {
51-
$batchSize = $this->getBatchSize();
52-
5351
$dbw = $this->getDB( DB_PRIMARY );
5452
if ( !$dbw->fieldExists( 'revision', 'rev_comment_id', __METHOD__ ) ) {
5553
$this->output( "Run update.php to create rev_comment_id.\n" );
@@ -61,48 +59,41 @@ protected function doDBUpdates() {
6159
}
6260

6361
$this->output( "Merging the revision_comment_temp table into the revision table...\n" );
64-
$conds = [];
6562
$updated = 0;
66-
$sleep = (int)$this->getOption( 'sleep', 0 );
63+
$highestRevId = (int)$dbw->newSelectQueryBuilder()
64+
->select( 'rev_id' )
65+
->from( 'revision' )
66+
->limit( 1 )
67+
->caller( __METHOD__ )
68+
->orderBy( 'rev_id', 'DESC' )
69+
->fetchField();
70+
$this->output( "Max rev_id $highestRevId.\n" );
71+
// Default batchSize from "$this->getBatchSize()" is 200, use 1000 to speed migration up
72+
// There is "$this->waitForReplication()" after each batch anyway
73+
$batchSize = 1000;
74+
$lowId = -1;
75+
$highId = $batchSize;
6776
while ( true ) {
68-
$res = $dbw->newSelectQueryBuilder()
69-
->select( [ 'rev_id', 'revcomment_comment_id' ] )
70-
->from( 'revision' )
71-
->join( 'revision_comment_temp', null, 'rev_id=revcomment_rev' )
72-
->where( [ 'rev_comment_id' => 0 ] )
73-
->andWhere( $conds )
74-
->limit( $batchSize )
75-
->orderBy( 'rev_id' )
76-
->caller( __METHOD__ )
77-
->fetchResultSet();
78-
79-
$numRows = $res->numRows();
80-
81-
$last = null;
82-
foreach ( $res as $row ) {
83-
$last = $row->rev_id;
84-
$dbw->newUpdateQueryBuilder()
85-
->update( 'revision' )
86-
->set( [ 'rev_comment_id' => $row->revcomment_comment_id ] )
87-
->where( [ 'rev_id' => $row->rev_id ] )
88-
->caller( __METHOD__ )->execute();
89-
$updated += $dbw->affectedRows();
90-
}
77+
// `coalesce` covers case when some row is missing in revision_comment_temp.
78+
// Original script used `join` which skipped revision row when `revision_comment_temp` was null.
79+
//
80+
// Not sure whether we should try to fix the data first
81+
// RevisionSelectQueryBuilder::joinComment suggests that all revisions should have rev_comment_id set
82+
$query = "UPDATE revision
83+
SET rev_comment_id = COALESCE((SELECT revcomment_comment_id FROM revision_comment_temp WHERE rev_id=revcomment_rev), rev_comment_id)
84+
WHERE rev_id > $lowId AND rev_id <= $highId";
85+
$dbw->query( $query, __METHOD__ );
86+
$affected = $dbw->affectedRows();
87+
$updated += $affected;
88+
$this->output( "Updated $affected revision rows from $lowId to $highId\n" );
89+
$this->waitForReplication();
9190

92-
if ( $numRows < $batchSize ) {
93-
// We must have reached the end
91+
if ( $highId > $highestRevId ) {
92+
// We reached the end
9493
break;
9594
}
96-
97-
// @phan-suppress-next-line PhanTypeSuspiciousStringExpression last is not-null when used
98-
$this->output( "... rev_id=$last, updated $updated\n" );
99-
$conds = [ $dbw->expr( 'rev_id', '>', $last ) ];
100-
101-
// Sleep between batches for replication to catch up
102-
$this->waitForReplication();
103-
if ( $sleep > 0 ) {
104-
sleep( $sleep );
105-
}
95+
$lowId = $highId;
96+
$highId = $lowId + $batchSize;
10697
}
10798
$this->output(
10899
"Completed merge of revision_comment_temp into the revision table, "

0 commit comments

Comments
 (0)