Skip to content

Commit

Permalink
db-dump: set sequence values when importing a database dump
Browse files Browse the repository at this point in the history
By default, the import script recreates the database schema, which
includes creating new sequences with zero values. This results in the
lazy crates.io developer occasionally receiving obscure errors when
inserting records into tables that use sequences, often not on the first
or second insert due to IDs in the database dump not always being
contiguous.

Rather than dumping the real sequence values from the database, we can
just recreate them based on the maximum ID in each table. Works well
enough, and means we don't have to tinker with the export script or ship
extra data.

This commit only configures the database tables that actually include
data in the database dump. There are other sequences, but since those
tables won't have data imported, it doesn't matter if they remain zero
after import.
  • Loading branch information
LawnGnome committed Dec 13, 2024
1 parent 560dbfe commit bd4780f
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 3 deletions.
13 changes: 11 additions & 2 deletions crates/crates_io_database_dump/src/configuration.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use serde::Deserialize;
use serde::{Deserialize, Serialize};
use std::collections::{BTreeMap, VecDeque};

/// An enum indicating whether a column is included in the database dumps.
Expand All @@ -15,7 +15,9 @@ pub enum ColumnVisibility {
/// and should list all tables the current table refers to with foreign key
/// constraints on public columns. The `filter` field is a valid SQL expression
/// used in a `WHERE` clause to filter the rows of the table. The `columns`
/// field maps column names to their respective visibilities.
/// field maps column names to their respective visibilities. The `sequence`
/// field, if present, defines the sequence used by the table when generating
/// IDs, along with the ID column.
#[derive(Clone, Debug, Default, Deserialize)]
pub struct TableConfig {
#[serde(default)]
Expand All @@ -24,6 +26,13 @@ pub struct TableConfig {
pub columns: BTreeMap<String, ColumnVisibility>,
#[serde(default)]
pub column_defaults: BTreeMap<String, String>,
pub sequence: Option<SequenceConfig>,
}

/// Identifies the sequence that generates IDs for a table.
///
/// The import script template uses this when importing a database dump to set
/// the sequence called `name` based on the maximum value of `column` in the
/// table.
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
pub struct SequenceConfig {
/// Name of the ID column whose maximum value determines the sequence value.
pub column: String,
/// Name of the database sequence to set (for example `categories_id_seq`).
pub name: String,
}

/// Maps table names to the respective configurations. Used to load `dump-db.toml`.
Expand Down
21 changes: 21 additions & 0 deletions crates/crates_io_database_dump/src/dump-db.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ description = "public"
crates_cnt = "public"
created_at = "public"
path = "public"
[categories.sequence]
column = "id"
name = "categories_id_seq"

[crate_downloads.columns]
crate_id = "public"
Expand Down Expand Up @@ -87,6 +90,9 @@ textsearchable_index_col = "private" # This Postgres specific and can be derived
repository = "public"
max_upload_size = "public"
max_features = "public"
[crates.sequence]
column = "id"
name = "packages_id_seq"

[crates_categories]
dependencies = ["categories", "crates"]
Expand Down Expand Up @@ -130,6 +136,9 @@ features = "public"
target = "public"
kind = "public"
explicit_name = "public"
[dependencies.sequence]
column = "id"
name = "dependencies_id_seq"

[__diesel_schema_migrations.columns]
version = "private"
Expand All @@ -152,6 +161,9 @@ id = "public"
keyword = "public"
crates_cnt = "public"
created_at = "public"
[keywords.sequence]
column = "id"
name = "keywords_id_seq"

[metadata.columns]
total_downloads = "public"
Expand Down Expand Up @@ -186,6 +198,9 @@ github_id = "public"
name = "public"
avatar = "public"
org_id = "public"
[teams.sequence]
column = "id"
name = "teams_id_seq"

[users]
filter = """
Expand All @@ -207,6 +222,9 @@ is_admin = "private"
publish_notifications = "private"
[users.column_defaults]
gh_access_token = "''"
[users.sequence]
column = "id"
name = "users_id_seq"

[version_downloads]
dependencies = ["versions"]
Expand Down Expand Up @@ -253,6 +271,9 @@ documentation = "public"
repository = "public"
categories = "public"
keywords = "public"
[versions.sequence]
column = "id"
name = "versions_id_seq"

[versions_published_by.columns]
version_id = "private"
Expand Down
13 changes: 13 additions & 0 deletions crates/crates_io_database_dump/src/dump-import.sql.j2
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,19 @@ BEGIN;
{% for cd in table.column_defaults %}
ALTER TABLE "{{table.name}}" ALTER COLUMN "{{cd.column}}" DROP DEFAULT;
{%- endfor %}
{%- endfor %}

-- Set sequence values from the largest imported ID in each table. For an
-- empty table the sequence is set to 1 with is_called = false, so the first
-- generated ID is 1 (two-argument setval would make it 2).
{% for table in tables -%}
{% if table.sequence %}
SELECT setval(
    '{{table.sequence.name}}',
    COALESCE(max_id, 1),
    max_id IS NOT NULL
)
FROM (
    SELECT MAX("{{table.sequence.column}}")::BIGINT AS max_id
    FROM "{{table.name}}"
) AS imported;
{% endif %}
{%- endfor %}

-- Reenable triggers on each table.
Expand Down
4 changes: 3 additions & 1 deletion crates/crates_io_database_dump/src/gen_scripts.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::configuration::{ColumnVisibility, TableConfig, VisibilityConfig};
use crate::configuration::{ColumnVisibility, SequenceConfig, TableConfig, VisibilityConfig};
use anyhow::Context;
use serde::Serialize;
use std::{fs::File, path::Path};
Expand All @@ -18,6 +18,7 @@ struct HandlebarsTableContext<'a> {
filter: Option<String>,
columns: String,
column_defaults: Vec<ColumnDefault<'a>>,
sequence: Option<&'a SequenceConfig>,
}

#[derive(Debug, Serialize)]
Expand Down Expand Up @@ -52,6 +53,7 @@ impl TableConfig {
filter,
columns,
column_defaults,
sequence: self.sequence.as_ref(),
})
}
}
Expand Down

0 comments on commit bd4780f

Please sign in to comment.