Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion cmd/kubesolo/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ type kubesolo struct {
localStorageSharedPath string
fullMode bool
disableIPv6 bool
dbWALRepair bool
embedded types.Embedded
}

Expand Down Expand Up @@ -78,6 +79,7 @@ func service() (*kubesolo, error) {
localStorageSharedPath: *flags.LocalStorageSharedPath,
fullMode: *flags.Full,
disableIPv6: *flags.DisableIPv6,
dbWALRepair: *flags.DBWALRepair,
}, nil
}

Expand Down Expand Up @@ -162,7 +164,7 @@ func (s *kubesolo) run() {
{
name: "kine",
start: func() {
kineService := kine.NewService(ctx, cancel, s.embedded.KineDir, kineReadyCh)
kineService := kine.NewService(ctx, cancel, s.embedded.KineDir, kineReadyCh, s.dbWALRepair)
s.wg.Go(func() {
kineService.Run()
})
Expand Down
1 change: 1 addition & 0 deletions internal/config/flags/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,6 @@ var (
Debug = Application.Flag("debug", "Enable debug logging. Defaults to false.").Envar("KUBESOLO_DEBUG").Default("false").Bool()
PprofServer = Application.Flag("pprof-server", "Enable pprof server. Defaults to false.").Envar("KUBESOLO_PPROF_SERVER").Default("false").Bool()
Full = Application.Flag("full", "Disable memory-saving overrides and use upstream Kubernetes defaults. Kubesolo still uses NodeSetter in favour of the scheduler. Recommended for CI and developer environments where memory is not constrained. Leave unset for edge deployments.").Envar("KUBESOLO_FULL").Default("false").Bool()
DBWALRepair = Application.Flag("db-wal-repair", "On startup, run an integrity check against the SQLite database and remove WAL artefacts (state.db-wal, state.db-shm) if corruption is detected. Recovers from unclean shutdowns caused by power loss. Defaults to false.").Envar("KUBESOLO_DB_WAL_REPAIR").Default("false").Bool()
DisableIPv6 = Application.Flag("disable-ipv6", "Disable IPv6 support. When set, CoreDNS will not serve ip6.arpa reverse zones and kubelet will register with an explicit IPv4 node address. Defaults to false.").Envar("KUBESOLO_DISABLE_IPV6").Default("false").Bool()
)
2 changes: 2 additions & 0 deletions pkg/kine/executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ func (s *service) Run() error {
return err
}

s.repairWALIfCorrupt()

if err := kubesoloservice.RunServiceWithStartupCheck(func() error {
log.Debug().Str("component", "kine").Msg("starting kine server...")
_, err := endpoint.Listen(s.ctx, s.generateKineConfig())
Expand Down
78 changes: 78 additions & 0 deletions pkg/kine/repair.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
package kine

import (
"context"
"database/sql"
"os"
"path/filepath"
"time"

_ "github.com/mattn/go-sqlite3"
"github.com/rs/zerolog/log"
)

// repairWALIfCorrupt runs "PRAGMA quick_check" against the kine SQLite
// database (state.db inside s.databaseDir) before kine itself is started.
//
// If state.db does not exist yet there is nothing to verify and the function
// returns immediately. Otherwise every failure on the way to an "ok" result
// (opening the handle, running the query, iterating or scanning the result
// row, or a result other than "ok") is passed to handleCorruption together
// with the concrete cause, so the emitted log line tells the operator why the
// check was considered failed.
//
// NOTE(review): the query runs under a 10 second timeout and a timeout is
// reported through the same path as genuine corruption; confirm that this is
// acceptable for large databases on slow storage.
func (s *service) repairWALIfCorrupt() {
	dbPath := filepath.Join(s.databaseDir, "state.db")

	// No database file yet: nothing to check.
	if _, err := os.Stat(dbPath); os.IsNotExist(err) {
		return
	}

	db, err := sql.Open("sqlite3", dbPath)
	if err != nil {
		s.handleCorruption(dbPath, "cannot open SQLite DB for integrity check: %v", err)
		return
	}
	defer db.Close()

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	rows, err := db.QueryContext(ctx, "PRAGMA quick_check")
	if err != nil {
		s.handleCorruption(dbPath, "SQLite quick_check query failed: %v", err)
		return
	}
	defer rows.Close()

	if !rows.Next() {
		// Next reports false both for an empty result set and for an
		// iteration error; Err distinguishes the two.
		if err = rows.Err(); err != nil {
			s.handleCorruption(dbPath, "SQLite integrity check failed: %v", err)
		} else {
			s.handleCorruption(dbPath, "SQLite integrity check failed: quick_check returned no rows")
		}
		return
	}

	var result string
	if err = rows.Scan(&result); err != nil {
		s.handleCorruption(dbPath, "SQLite integrity check failed: cannot read quick_check result: %v", err)
		return
	}

	// Only the first row is inspected: anything other than "ok" there
	// already means the check did not pass.
	if result != "ok" {
		s.handleCorruption(dbPath, "SQLite integrity check failed: quick_check reported %q", result)
		return
	}

	log.Debug().Str("component", "kine").Msg("SQLite integrity check passed")
}

// handleCorruption reports a failed integrity check for the database at
// dbPath. format and args describe the failure in Printf style.
//
// When s.dbWALRepair is set, a warning is logged and the WAL artefacts next
// to dbPath are removed via removeWALArtefacts. Otherwise the failure is
// logged at fatal level together with manual recovery instructions.
func (s *service) handleCorruption(dbPath string, format string, args ...any) {
	if !s.dbWALRepair {
		// Repair is opt-in: without the flag, tell the operator how to
		// recover instead of deleting anything.
		const recoveryHint = ". The SQLite WAL artefacts may be corrupt after an unclean shutdown. " +
			"Remove %s-wal and %s-shm manually, or restart with --db-wal-repair to remove them automatically."

		fatalEvent := log.Fatal().Str("component", "kine")
		fatalEvent.Msgf(format+recoveryHint, append(args, dbPath, dbPath)...)
		return
	}

	warnEvent := log.Warn().Str("component", "kine")
	warnEvent.Msgf(format+", removing WAL artefacts to recover from unclean shutdown", args...)
	removeWALArtefacts(dbPath)
}

// removeWALArtefacts deletes the "-wal" and "-shm" companion files of the
// SQLite database at dbPath. A file that does not exist is skipped silently;
// any other removal error is logged as a warning and does not stop the
// remaining file from being processed.
func removeWALArtefacts(dbPath string) {
	artefacts := []string{dbPath + "-wal", dbPath + "-shm"}

	for _, artefact := range artefacts {
		err := os.Remove(artefact)

		switch {
		case err == nil:
			log.Info().Str("component", "kine").Msgf("removed %s", artefact)
		case !os.IsNotExist(err):
			log.Warn().Str("component", "kine").Msgf("failed to remove %s: %v", artefact, err)
		}
	}
}
4 changes: 3 additions & 1 deletion pkg/kine/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,16 @@ type service struct {
kineReady chan struct{}
ctx context.Context
cancel context.CancelFunc
dbWALRepair bool
}

// NewService creates a new kine service
func NewService(ctx context.Context, cancel context.CancelFunc, databaseDir string, kineReady chan struct{}) *service {
func NewService(ctx context.Context, cancel context.CancelFunc, databaseDir string, kineReady chan struct{}, dbWALRepair bool) *service {
return &service{
databaseDir: databaseDir,
kineReady: kineReady,
ctx: ctx,
cancel: cancel,
dbWALRepair: dbWALRepair,
}
}