Skip to content

Commit 111023b

Browse files
Add config option to specify max file size (#118)
1 parent d4e6410 commit 111023b

File tree

10 files changed

+202
-29
lines changed

10 files changed

+202
-29
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
### Added
11+
12+
- Added config option `settings.maxFileSize` to control the maximum file size zoekt will index. ([#118](https://github.com/sourcebot-dev/sourcebot/pull/118))
13+
1014
## [2.6.0] - 2024-12-02
1115

1216
### Added

packages/backend/src/constants.ts

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import { Settings } from "./types.js";
12

23
/**
34
* The interval to reindex a given repository.
@@ -7,4 +8,11 @@ export const REINDEX_INTERVAL_MS = 1000 * 60 * 60;
78
/**
89
* The interval to re-sync the config.
910
*/
10-
export const RESYNC_CONFIG_INTERVAL_MS = 1000 * 60 * 60 * 24;
11+
export const RESYNC_CONFIG_INTERVAL_MS = 1000 * 60 * 60 * 24;
12+
13+
/**
14+
* Default settings.
15+
*/
16+
export const DEFAULT_SETTINGS: Settings = {
17+
maxFileSize: 2 * 1024 * 1024, // 2MB in bytes
18+
}

packages/backend/src/db.test.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import { expect, test } from 'vitest';
2+
import { migration_addMaxFileSize, migration_addSettings, Schema } from './db';
3+
import { DEFAULT_SETTINGS } from './constants';
4+
import { DeepPartial } from './types';
5+
6+
7+
test('migration_addSettings adds the `settings` field with defaults if it does not exist', () => {
8+
const schema: DeepPartial<Schema> = {};
9+
10+
const migratedSchema = migration_addSettings(schema as Schema);
11+
expect(migratedSchema).toStrictEqual({
12+
settings: DEFAULT_SETTINGS,
13+
});
14+
});
15+
16+
test('migration_addMaxFileSize adds the `maxFileSize` field with the default value if it does not exist', () => {
17+
const schema: DeepPartial<Schema> = {
18+
settings: {},
19+
}
20+
21+
const migratedSchema = migration_addMaxFileSize(schema as Schema);
22+
expect(migratedSchema).toStrictEqual({
23+
settings: {
24+
maxFileSize: DEFAULT_SETTINGS.maxFileSize,
25+
}
26+
});
27+
});
28+
29+
test('migration_addMaxFileSize will throw if `settings` is not defined', () => {
30+
const schema: DeepPartial<Schema> = {};
31+
expect(() => migration_addMaxFileSize(schema as Schema)).toThrow();
32+
});

packages/backend/src/db.ts

Lines changed: 57 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
11
import { JSONFilePreset } from "lowdb/node";
22
import { type Low } from "lowdb";
3-
import { AppContext, Repository } from "./types.js";
3+
import { AppContext, Repository, Settings } from "./types.js";
4+
import { DEFAULT_SETTINGS } from "./constants.js";
5+
import { createLogger } from "./logger.js";
46

5-
type Schema = {
7+
const logger = createLogger('db');
8+
9+
export type Schema = {
10+
settings: Settings,
611
repos: {
712
[key: string]: Repository;
813
}
@@ -11,9 +16,16 @@ type Schema = {
1116
export type Database = Low<Schema>;
1217

1318
export const loadDB = async (ctx: AppContext): Promise<Database> => {
14-
const db = await JSONFilePreset<Schema>(`${ctx.cachePath}/db.json`, { repos: {} });
19+
const db = await JSONFilePreset<Schema>(`${ctx.cachePath}/db.json`, {
20+
repos: {},
21+
settings: DEFAULT_SETTINGS,
22+
});
23+
24+
await applyMigrations(db);
25+
1526
return db;
1627
}
28+
1729
export const updateRepository = async (repoId: string, data: Repository, db: Database) => {
1830
db.data.repos[repoId] = {
1931
...db.data.repos[repoId],
@@ -22,7 +34,49 @@ export const updateRepository = async (repoId: string, data: Repository, db: Dat
2234
await db.write();
2335
}
2436

37+
export const updateSettings = async (settings: Settings, db: Database) => {
38+
db.data.settings = settings;
39+
await db.write();
40+
}
41+
2542
export const createRepository = async (repo: Repository, db: Database) => {
2643
db.data.repos[repo.id] = repo;
2744
await db.write();
45+
}
46+
47+
export const applyMigrations = async (db: Database) => {
48+
const log = (name: string) => {
49+
logger.info(`Applying migration '${name}'`);
50+
}
51+
52+
await db.update((schema) => {
53+
// @NOTE: please ensure new migrations are added after older ones!
54+
schema = migration_addSettings(schema, log);
55+
schema = migration_addMaxFileSize(schema, log);
56+
return schema;
57+
});
58+
}
59+
60+
/**
61+
* @see: https://github.com/sourcebot-dev/sourcebot/pull/118
62+
*/
63+
export const migration_addSettings = (schema: Schema, log?: (name: string) => void) => {
64+
if (!schema.settings) {
65+
log?.("addSettings");
66+
schema.settings = DEFAULT_SETTINGS;
67+
}
68+
69+
return schema;
70+
}
71+
72+
/**
73+
* @see: https://github.com/sourcebot-dev/sourcebot/pull/118
74+
*/
75+
export const migration_addMaxFileSize = (schema: Schema, log?: (name: string) => void) => {
76+
if (!schema.settings.maxFileSize) {
77+
log?.("addMaxFileSize");
78+
schema.settings.maxFileSize = DEFAULT_SETTINGS.maxFileSize;
79+
}
80+
81+
return schema;
2882
}

packages/backend/src/main.test.ts

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { expect, test } from 'vitest';
2-
import { isRepoReindxingRequired } from './main';
3-
import { Repository } from './types';
2+
import { isAllRepoReindexingRequired, isRepoReindexingRequired } from './main';
3+
import { Repository, Settings } from './types';
44

55
test('isRepoReindexingRequired should return false when no changes are made', () => {
66
const previous: Repository = {
@@ -15,7 +15,7 @@ test('isRepoReindexingRequired should return false when no changes are made', ()
1515
};
1616
const current = previous;
1717

18-
expect(isRepoReindxingRequired(previous, current)).toBe(false);
18+
expect(isRepoReindexingRequired(previous, current)).toBe(false);
1919
})
2020

2121
test('isRepoReindexingRequired should return true when git branches change', () => {
@@ -35,7 +35,7 @@ test('isRepoReindexingRequired should return true when git branches change', ()
3535
branches: ['main', 'feature']
3636
};
3737

38-
expect(isRepoReindxingRequired(previous, current)).toBe(true);
38+
expect(isRepoReindexingRequired(previous, current)).toBe(true);
3939
});
4040

4141
test('isRepoReindexingRequired should return true when git tags change', () => {
@@ -55,7 +55,7 @@ test('isRepoReindexingRequired should return true when git tags change', () => {
5555
tags: ['v1.0', 'v2.0']
5656
};
5757

58-
expect(isRepoReindxingRequired(previous, current)).toBe(true);
58+
expect(isRepoReindexingRequired(previous, current)).toBe(true);
5959
});
6060

6161
test('isRepoReindexingRequired should return true when local excludedPaths change', () => {
@@ -74,5 +74,26 @@ test('isRepoReindexingRequired should return true when local excludedPaths chang
7474
excludedPaths: ['node_modules', 'dist']
7575
};
7676

77-
expect(isRepoReindxingRequired(previous, current)).toBe(true);
77+
expect(isRepoReindexingRequired(previous, current)).toBe(true);
7878
});
79+
80+
test('isAllRepoReindexingRequired should return false when fileLimitSize has not changed', () => {
81+
const previous: Settings = {
82+
maxFileSize: 1000,
83+
}
84+
const current: Settings = {
85+
...previous,
86+
}
87+
expect(isAllRepoReindexingRequired(previous, current)).toBe(false);
88+
});
89+
90+
test('isAllRepoReindexingRequired should return true when fileLimitSize has changed', () => {
91+
const previous: Settings = {
92+
maxFileSize: 1000,
93+
}
94+
const current: Settings = {
95+
...previous,
96+
maxFileSize: 2000,
97+
}
98+
expect(isAllRepoReindexingRequired(previous, current)).toBe(true);
99+
});

packages/backend/src/main.ts

Lines changed: 33 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,20 @@ import { getGitHubReposFromConfig } from "./github.js";
55
import { getGitLabReposFromConfig } from "./gitlab.js";
66
import { getGiteaReposFromConfig } from "./gitea.js";
77
import { getGerritReposFromConfig } from "./gerrit.js";
8-
import { AppContext, LocalRepository, GitRepository, Repository } from "./types.js";
8+
import { AppContext, LocalRepository, GitRepository, Repository, Settings } from "./types.js";
99
import { cloneRepository, fetchRepository } from "./git.js";
1010
import { createLogger } from "./logger.js";
11-
import { createRepository, Database, loadDB, updateRepository } from './db.js';
11+
import { createRepository, Database, loadDB, updateRepository, updateSettings } from './db.js';
1212
import { arraysEqualShallow, isRemotePath, measure } from "./utils.js";
13-
import { REINDEX_INTERVAL_MS, RESYNC_CONFIG_INTERVAL_MS } from "./constants.js";
13+
import { DEFAULT_SETTINGS, REINDEX_INTERVAL_MS, RESYNC_CONFIG_INTERVAL_MS } from "./constants.js";
1414
import stripJsonComments from 'strip-json-comments';
1515
import { indexGitRepository, indexLocalRepository } from "./zoekt.js";
1616
import { getLocalRepoFromConfig, initLocalRepoFileWatchers } from "./local.js";
1717
import { captureEvent } from "./posthog.js";
1818

1919
const logger = createLogger('main');
2020

21-
const syncGitRepository = async (repo: GitRepository, ctx: AppContext) => {
21+
const syncGitRepository = async (repo: GitRepository, settings: Settings, ctx: AppContext) => {
2222
let fetchDuration_s: number | undefined = undefined;
2323
let cloneDuration_s: number | undefined = undefined;
2424

@@ -46,7 +46,7 @@ const syncGitRepository = async (repo: GitRepository, ctx: AppContext) => {
4646
}
4747

4848
logger.info(`Indexing ${repo.id}...`);
49-
const { durationMs } = await measure(() => indexGitRepository(repo, ctx));
49+
const { durationMs } = await measure(() => indexGitRepository(repo, settings, ctx));
5050
const indexDuration_s = durationMs / 1000;
5151
logger.info(`Indexed ${repo.id} in ${indexDuration_s}s`);
5252

@@ -57,18 +57,21 @@ const syncGitRepository = async (repo: GitRepository, ctx: AppContext) => {
5757
}
5858
}
5959

60-
const syncLocalRepository = async (repo: LocalRepository, ctx: AppContext, signal?: AbortSignal) => {
60+
const syncLocalRepository = async (repo: LocalRepository, settings: Settings, ctx: AppContext, signal?: AbortSignal) => {
6161
logger.info(`Indexing ${repo.id}...`);
62-
const { durationMs } = await measure(() => indexLocalRepository(repo, ctx, signal));
62+
const { durationMs } = await measure(() => indexLocalRepository(repo, settings, ctx, signal));
6363
const indexDuration_s = durationMs / 1000;
6464
logger.info(`Indexed ${repo.id} in ${indexDuration_s}s`);
6565
return {
6666
indexDuration_s,
6767
}
6868
}
6969

70-
export const isRepoReindxingRequired = (previous: Repository, current: Repository) => {
71-
70+
/**
71+
* Certain configuration changes (e.g., a branch is added) require
72+
* a reindexing of the repository.
73+
*/
74+
export const isRepoReindexingRequired = (previous: Repository, current: Repository) => {
7275
/**
7376
* Checks if any of the `revisions` properties have changed.
7477
*/
@@ -100,6 +103,16 @@ export const isRepoReindxingRequired = (previous: Repository, current: Repositor
100103
)
101104
}
102105

106+
/**
107+
* Certain settings changes (e.g., the file limit size is changed) require
108+
* a reindexing of _all_ repositories.
109+
*/
110+
export const isAllRepoReindexingRequired = (previous: Settings, current: Settings) => {
111+
return (
112+
previous?.maxFileSize !== current?.maxFileSize
113+
)
114+
}
115+
103116
const syncConfig = async (configPath: string, db: Database, signal: AbortSignal, ctx: AppContext) => {
104117
const configContent = await (async () => {
105118
if (isRemotePath(configPath)) {
@@ -121,6 +134,13 @@ const syncConfig = async (configPath: string, db: Database, signal: AbortSignal,
121134
// @todo: we should validate the configuration file's structure here.
122135
const config = JSON.parse(stripJsonComments(configContent)) as SourcebotConfigurationSchema;
123136

137+
// Update the settings
138+
const updatedSettings: Settings = {
139+
maxFileSize: config.settings?.maxFileSize ?? DEFAULT_SETTINGS.maxFileSize,
140+
}
141+
const _isAllRepoReindexingRequired = isAllRepoReindexingRequired(db.data.settings, updatedSettings);
142+
await updateSettings(updatedSettings, db);
143+
124144
// Fetch all repositories from the config file
125145
let configRepos: Repository[] = [];
126146
for (const repoConfig of config.repos ?? []) {
@@ -172,7 +192,7 @@ const syncConfig = async (configPath: string, db: Database, signal: AbortSignal,
172192
for (const newRepo of configRepos) {
173193
if (newRepo.id in db.data.repos) {
174194
const existingRepo = db.data.repos[newRepo.id];
175-
const isReindexingRequired = isRepoReindxingRequired(existingRepo, newRepo);
195+
const isReindexingRequired = _isAllRepoReindexingRequired || isRepoReindexingRequired(existingRepo, newRepo);
176196
if (isReindexingRequired) {
177197
logger.info(`Marking ${newRepo.id} for reindexing due to configuration change.`);
178198
}
@@ -244,7 +264,7 @@ export const main = async (context: AppContext) => {
244264
const localRepos = Object.values(db.data.repos).filter(repo => repo.vcs === 'local');
245265
initLocalRepoFileWatchers(localRepos, async (repo, signal) => {
246266
logger.info(`Change detected to local repository ${repo.id}. Re-syncing...`);
247-
await syncLocalRepository(repo, context, signal);
267+
await syncLocalRepository(repo, db.data.settings, context, signal);
248268
await db.update(({ repos }) => repos[repo.id].lastIndexedDate = new Date().toUTCString());
249269
});
250270
}
@@ -285,12 +305,12 @@ export const main = async (context: AppContext) => {
285305
let cloneDuration_s: number | undefined;
286306

287307
if (repo.vcs === 'git') {
288-
const stats = await syncGitRepository(repo, context);
308+
const stats = await syncGitRepository(repo, db.data.settings, context);
289309
indexDuration_s = stats.indexDuration_s;
290310
fetchDuration_s = stats.fetchDuration_s;
291311
cloneDuration_s = stats.cloneDuration_s;
292312
} else if (repo.vcs === 'local') {
293-
const stats = await syncLocalRepository(repo, context);
313+
const stats = await syncLocalRepository(repo, db.data.settings, context);
294314
indexDuration_s = stats.indexDuration_s;
295315
}
296316

packages/backend/src/schemas/v2.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,21 @@ export type Repos = GitHubConfig | GitLabConfig | GiteaConfig | GerritConfig | L
77
*/
88
export interface SourcebotConfigurationSchema {
99
$schema?: string;
10+
settings?: Settings;
1011
/**
1112
* Defines a collection of repositories from varying code hosts that Sourcebot should sync with.
1213
*/
1314
repos?: Repos[];
1415
}
16+
/**
17+
* Global settings. These settings are applied to all repositories.
18+
*/
19+
export interface Settings {
20+
/**
21+
* The maximum size of a file (in bytes) to be indexed. Files that exceed this maximum will not be indexed. Defaults to 2MB (2097152 bytes).
22+
*/
23+
maxFileSize?: number;
24+
}
1525
export interface GitHubConfig {
1626
/**
1727
* GitHub Configuration

packages/backend/src/types.ts

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
interface BaseRepository {
32
vcs: 'git' | 'local';
43
id: string;
@@ -42,3 +41,12 @@ export type AppContext = {
4241

4342
configPath: string;
4443
}
44+
45+
export type Settings = {
46+
maxFileSize: number;
47+
}
48+
49+
// @see : https://stackoverflow.com/a/61132308
50+
export type DeepPartial<T> = T extends object ? {
51+
[P in keyof T]?: DeepPartial<T[P]>;
52+
} : T;

0 commit comments

Comments
 (0)