diff --git a/.gitignore b/.gitignore index c9b2fcee..cfb5bfc2 100644 --- a/.gitignore +++ b/.gitignore @@ -39,3 +39,5 @@ delete_old_cluster.sh **/isolation/output **/isolation/isolation.conf pg_lake_iceberg/logs/polaris.log +.volume/ +Dockerfile.alpine \ No newline at end of file diff --git a/docker/.dockerignore b/docker/.dockerignore new file mode 100644 index 00000000..5c14dcb8 --- /dev/null +++ b/docker/.dockerignore @@ -0,0 +1,38 @@ +# Git +.git +.gitignore +.gitmodules + +# Documentation +*.md +!README.md +TASKFILE.md + +# Task files +Taskfile.yml + +# Docker files +docker-compose.yml +.dockerignore + +# CI/CD +../.github + +# Build artifacts +*.tar.gz +*.zip + +# Logs +*.log + +# IDE +.vscode +.idea +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + diff --git a/docker/.env b/docker/.env index 3dd323d9..b232ad66 100644 --- a/docker/.env +++ b/docker/.env @@ -1,2 +1,2 @@ PG_LAKE_REF=main -PG_MAJOR=18 +PG_MAJOR=18 \ No newline at end of file diff --git a/docker/Dockerfile b/docker/Dockerfile index 12a375b0..d2367fa7 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -9,7 +9,6 @@ ARG BASE_IMAGE_TAG # Set environment variables for non-interactive installations and PostgreSQL/PostGIS versions ENV PG_LAKE_REF=main -ENV PG_MAJOR=18 ARG PG16_VERSION=16.10 ARG PG17_VERSION=17.6 ARG PG18_VERSION=18.0 @@ -20,115 +19,115 @@ ARG PGAUDIT18_VERSION=REL_18_STABLE # Install build dependencies RUN if [ "$BASE_IMAGE_OS" = "almalinux" ]; then \ - dnf -y update && \ - dnf -y install epel-release && \ - dnf config-manager --enable crb && \ - dnf -y install \ - ca-certificates \ - cmake \ - ninja-build \ - wget \ - git \ - readline-devel \ - zlib-devel \ - flex \ - bison \ - sudo \ - nano \ - libxml2-devel \ - libxslt-devel \ - libicu-devel \ - openssl-devel \ - geos-devel \ - proj-devel \ - gdal-devel \ - json-c-devel \ - protobuf-c-devel \ - uuid-devel \ - lz4-devel \ - xz-devel \ - snappy-devel \ - perl \ - perl-IPC-Run \ - perl-IPC-Cmd \ - libtool \ - jansson-devel \ - 
jq \ - diffutils \ - libcurl-devel \ - patch \ - which \ - gcc-c++ \ - nodejs \ - python3.11 \ - pip \ - java-21-openjdk-devel \ - postgresql-jdbc.noarch \ - unzip \ - zip && \ - alternatives --auto java && alternatives --auto javac && java -version && \ - dnf clean all; \ + dnf -y update && \ + dnf -y install epel-release && \ + dnf config-manager --enable crb && \ + dnf -y install \ + ca-certificates \ + cmake \ + ninja-build \ + wget \ + git \ + readline-devel \ + zlib-devel \ + flex \ + bison \ + sudo \ + nano \ + libxml2-devel \ + libxslt-devel \ + libicu-devel \ + openssl-devel \ + geos-devel \ + proj-devel \ + gdal-devel \ + json-c-devel \ + protobuf-c-devel \ + uuid-devel \ + lz4-devel \ + xz-devel \ + snappy-devel \ + perl \ + perl-IPC-Run \ + perl-IPC-Cmd \ + libtool \ + jansson-devel \ + jq \ + diffutils \ + libcurl-devel \ + patch \ + which \ + gcc-c++ \ + nodejs \ + python3.11 \ + pip \ + java-21-openjdk-devel \ + postgresql-jdbc.noarch \ + unzip \ + zip && \ + alternatives --auto java && alternatives --auto javac && java -version && \ + dnf clean all; \ fi RUN if [ "$BASE_IMAGE_OS" = "debian" ]; then \ - apt-get update \ - && apt-get install -y \ - build-essential \ - cmake \ - ninja-build \ - libreadline-dev \ - zlib1g-dev \ - flex \ - bison \ - libxml2-dev \ - libxslt1-dev \ - libicu-dev \ - libssl-dev \ - libgeos-dev \ - libproj-dev \ - libgdal-dev \ - libjson-c-dev \ - libprotobuf-c-dev \ - protobuf-c-compiler \ - diffutils \ - uuid-dev \ - libossp-uuid-dev \ - liblz4-dev \ - liblzma-dev \ - libsnappy-dev \ - perl \ - libtool \ - libjansson-dev \ - libcurl4-openssl-dev \ - curl \ - patch \ - g++ \ - unzip \ - zip \ - python3.11 \ - pip \ - npm \ - nodejs \ - libipc-run-perl \ - wget \ - git \ - jq \ - sudo \ - libpostgresql-jdbc-java \ - && apt-get clean && rm -rf /var/lib/apt/lists/*; \ + apt-get update \ + && apt-get install -y \ + build-essential \ + cmake \ + ninja-build \ + libreadline-dev \ + zlib1g-dev \ + flex \ + bison \ + libxml2-dev \ + 
libxslt1-dev \ + libicu-dev \ + libssl-dev \ + libgeos-dev \ + libproj-dev \ + libgdal-dev \ + libjson-c-dev \ + libprotobuf-c-dev \ + protobuf-c-compiler \ + diffutils \ + uuid-dev \ + libossp-uuid-dev \ + liblz4-dev \ + liblzma-dev \ + libsnappy-dev \ + perl \ + libtool \ + libjansson-dev \ + libcurl4-openssl-dev \ + curl \ + patch \ + g++ \ + unzip \ + zip \ + python3.11 \ + pip \ + npm \ + nodejs \ + libipc-run-perl \ + wget \ + git \ + jq \ + sudo \ + libpostgresql-jdbc-java \ + && apt-get clean && rm -rf /var/lib/apt/lists/*; \ fi # Install jdk21 for debian RUN if [ "$BASE_IMAGE_OS" = "debian" ]; then \ - if [ "$(dpkg --print-architecture)" = "arm64" ]; then \ - ARCH=aarch64; \ - else \ - ARCH=x64; \ - fi && \ - wget https://download.oracle.com/java/21/latest/jdk-21_linux-${ARCH}_bin.tar.gz && \ - mkdir -p /usr/lib/jvm && \ - tar -xzf jdk-21_linux-${ARCH}_bin.tar.gz -C /usr/lib/jvm/ && \ - mv /usr/lib/jvm/jdk-21.0.9 /usr/lib/jvm/java-21-openjdk && \ - rm jdk-21_linux-${ARCH}_bin.tar.gz; \ + if [ "$(dpkg --print-architecture)" = "arm64" ]; then \ + ARCH=aarch64; \ + else \ + ARCH=x64; \ + fi && \ + wget https://download.oracle.com/java/21/latest/jdk-21_linux-${ARCH}_bin.tar.gz && \ + mkdir -p /usr/lib/jvm && \ + tar -xzf jdk-21_linux-${ARCH}_bin.tar.gz -C /usr/lib/jvm/ && \ + mv /usr/lib/jvm/jdk-21.0.9 /usr/lib/jvm/java-21-openjdk && \ + rm jdk-21_linux-${ARCH}_bin.tar.gz; \ fi ENV JAVA_HOME=/usr/lib/jvm/java-21-openjdk @@ -136,15 +135,15 @@ ENV PATH=$JAVA_HOME/bin:$PATH # Install pipenv and pre-commit RUN if [ "$BASE_IMAGE_OS" = "almalinux" ]; then \ - python3.11 -m ensurepip --upgrade && \ - python3.11 -m pip install --upgrade pip && \ - python3.11 -m pip install pipenv && \ - python3.11 -m pip install pre-commit="=4.3.0"; \ + python3.11 -m ensurepip --upgrade && \ + python3.11 -m pip install --upgrade pip && \ + python3.11 -m pip install pipenv && \ + python3.11 -m pip install pre-commit="=4.3.0"; \ fi RUN if [ "$BASE_IMAGE_OS" = "debian" ]; then \ - 
python3.11 -m pip install --upgrade pip --break-system-packages && \ - python3.11 -m pip install pipenv --break-system-packages && \ - python3.11 -m pip install pre-commit="=4.3.0" --break-system-packages; \ + python3.11 -m pip install --upgrade pip --break-system-packages && \ + python3.11 -m pip install pipenv --break-system-packages && \ + python3.11 -m pip install pre-commit="=4.3.0" --break-system-packages; \ fi # Install azurite @@ -243,13 +242,11 @@ RUN git clone https://github.com/citusdata/pg_cron.git && \ make install PG_CONFIG=$PGBASEDIR/pgsql-18/bin/pg_config && make clean PG_CONFIG=$PGBASEDIR/pgsql-18/bin/pg_config && \ cd .. && rm -rf pg_cron -ENV PATH=/home/postgres/pgsql-$PG_MAJOR/bin:$PATH - # Install vcpkg as postgres user ARG VCPKG_VERSION=2025.01.13 RUN git clone https://github.com/Microsoft/vcpkg.git -b $VCPKG_VERSION /home/postgres/vcpkg && \ - ./vcpkg/bootstrap-vcpkg.sh && \ + ./vcpkg/bootstrap-vcpkg.sh && \ ./vcpkg/vcpkg install azure-identity-cpp azure-storage-blobs-cpp azure-storage-files-datalake-cpp openssl ENV VCPKG_TOOLCHAIN_PATH="/home/postgres/vcpkg/scripts/buildsystems/vcpkg.cmake" @@ -259,32 +256,159 @@ FROM dev_base AS base # Clone pg_lake project RUN git clone https://github.com/snowflake-labs/pg_lake.git \ - --branch ${PG_LAKE_REF} --recurse-submodules /home/postgres/pg_lake + --branch ${PG_LAKE_REF} --recurse-submodules /home/postgres/pg_lake + +############## pg_lake_builder - Build all pg_lake extensions ############## +FROM base AS pg_lake_builder -############## pg_lake_postgres ############## -FROM base AS pg_lake_postgres +# need to redefine ARGs in each stage +ARG PG_MAJOR=18 -# Install pg_lake +# Set environment variables for the selected PostgreSQL version +ENV PG_MAJOR=${PG_MAJOR} +ENV PATH=/home/postgres/pgsql-${PG_MAJOR}/bin:$PATH + +# Install pg_lake extensions (build happens here, not in final image) RUN cd pg_lake && \ - make install-pg_extension_base && \ - make install-pg_map && \ - make 
install-pg_extension_updater && \ - make install-pg_lake_engine && \ - make install-avro && \ - make install-pg_lake_iceberg && \ - make install-pg_lake_table && \ - make install-pg_lake_spatial && \ - make install-pg_lake_copy && \ - make install-pg_lake && \ - make install-pg_lake_benchmark - -RUN initdb -D $PGBASEDIR/pgsql-$PG_MAJOR/data -U postgres --locale=C.UTF-8 --data-checksums - -############## pg_duck_server ############## -FROM base AS pgduck_server + PG_CONFIG=/home/postgres/pgsql-${PG_MAJOR}/bin/pg_config make install-pg_extension_base && \ + PG_CONFIG=/home/postgres/pgsql-${PG_MAJOR}/bin/pg_config make install-pg_map && \ + PG_CONFIG=/home/postgres/pgsql-${PG_MAJOR}/bin/pg_config make install-pg_extension_updater && \ + PG_CONFIG=/home/postgres/pgsql-${PG_MAJOR}/bin/pg_config make install-pg_lake_engine && \ + PG_CONFIG=/home/postgres/pgsql-${PG_MAJOR}/bin/pg_config make install-avro && \ + PG_CONFIG=/home/postgres/pgsql-${PG_MAJOR}/bin/pg_config make install-pg_lake_iceberg && \ + PG_CONFIG=/home/postgres/pgsql-${PG_MAJOR}/bin/pg_config make install-pg_lake_table && \ + PG_CONFIG=/home/postgres/pgsql-${PG_MAJOR}/bin/pg_config make install-pg_lake_spatial && \ + PG_CONFIG=/home/postgres/pgsql-${PG_MAJOR}/bin/pg_config make install-pg_lake_copy && \ + PG_CONFIG=/home/postgres/pgsql-${PG_MAJOR}/bin/pg_config make install-pg_lake && \ + PG_CONFIG=/home/postgres/pgsql-${PG_MAJOR}/bin/pg_config make install-pg_lake_benchmark + +############## pgduck_builder - Build duckdb_pglake and pgduck_server ############## +FROM base AS pgduck_builder +# need to redefine ARGs in each stage +ARG PG_MAJOR=18 ARG PGCOMPAT_BUILD_CONFIG=Release -# Install pgduck_server -RUN cd pg_lake/duckdb_pglake && make && make install && rm -r build -RUN cd pg_lake/pgduck_server && make && make install +# Set environment variables for the selected PostgreSQL version +ENV PG_MAJOR=${PG_MAJOR} +ENV PATH=/home/postgres/pgsql-${PG_MAJOR}/bin:$PATH + +# Install pgduck_server (build happens 
here, not in final image) +RUN cd pg_lake/duckdb_pglake && PG_CONFIG=/home/postgres/pgsql-${PG_MAJOR}/bin/pg_config make && make install && rm -r build +RUN cd pg_lake/pgduck_server && PG_CONFIG=/home/postgres/pgsql-${PG_MAJOR}/bin/pg_config make && make install + +############## runtime_base - Minimal runtime environment ############## +ARG BASE_IMAGE_OS="almalinux" +ARG BASE_IMAGE_TAG="9" +FROM ${BASE_IMAGE_OS}:${BASE_IMAGE_TAG} AS runtime_base + +# need to redefine ARGs in each stage +ARG BASE_IMAGE_OS +ARG BASE_IMAGE_TAG + +# Install ONLY runtime libraries (no -devel packages, no build tools) +RUN if [ "$BASE_IMAGE_OS" = "almalinux" ]; then \ + dnf -y update && \ + dnf -y install epel-release && \ + dnf config-manager --enable crb && \ + dnf -y install --allowerasing \ + ca-certificates \ + readline \ + zlib \ + sudo \ + nano \ + libxml2 \ + libxslt \ + libicu \ + openssl \ + geos \ + proj \ + gdal \ + json-c \ + protobuf-c \ + uuid \ + lz4 \ + xz \ + snappy \ + perl \ + jansson \ + libcurl && \ + dnf clean all; \ + fi + +RUN if [ "$BASE_IMAGE_OS" = "debian" ]; then \ + apt-get update \ + && apt-get install -y \ + libreadline8 \ + zlib1g \ + libxml2 \ + libxslt1.1 \ + libicu72 \ + libssl3 \ + libgeos-c1v5 \ + libproj25 \ + libgdal32 \ + libjson-c5 \ + libprotobuf-c1 \ + uuid-runtime \ + libossp-uuid16 \ + liblz4-1 \ + liblzma5 \ + libsnappy1v5 \ + perl \ + libjansson4 \ + libcurl4 \ + curl \ + sudo \ + && apt-get clean && rm -rf /var/lib/apt/lists/*; \ + fi + +# Create the postgres user with UID 1001 +RUN useradd -u 1001 -m -s /bin/bash postgres +RUN echo "postgres ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers.d/postgres +USER 1001:1001 +WORKDIR /home/postgres + +ENV PGBASEDIR=/home/postgres + +############## pg_lake_postgres - Final runtime image (COPY ONLY) ############## +FROM runtime_base AS pg_lake_postgres + +# need to redefine ARGs in each stage +ARG PG_MAJOR=18 + +# Set environment variables for runtime +ENV PG_MAJOR=${PG_MAJOR} +ENV 
PATH=/home/postgres/pgsql-${PG_MAJOR}/bin:$PATH + +# Copy PostgreSQL binaries and libraries for the selected version from dev_base +COPY --from=dev_base --chown=postgres:postgres /home/postgres/pgsql-${PG_MAJOR} /home/postgres/pgsql-${PG_MAJOR} + +# Copy pg_lake extensions from pg_lake_builder (they're already installed to pgsql-* directories) +# We need to copy the updated lib and share directories with pg_lake extensions +COPY --from=pg_lake_builder --chown=postgres:postgres /home/postgres/pgsql-${PG_MAJOR}/lib /home/postgres/pgsql-${PG_MAJOR}/lib +COPY --from=pg_lake_builder --chown=postgres:postgres /home/postgres/pgsql-${PG_MAJOR}/share /home/postgres/pgsql-${PG_MAJOR}/share + +# Initialize database (lightweight operation, doesn't compile anything) +RUN initdb -D $PGBASEDIR/pgsql-${PG_MAJOR}/data -U postgres --locale=C.UTF-8 --data-checksums + +############## pgduck_server - Final runtime image (COPY ONLY) ############## +FROM runtime_base AS pgduck_server + +# need to redefine ARGs in each stage +ARG PG_MAJOR=18 + +# Set environment variables for runtime +ENV PG_MAJOR=${PG_MAJOR} +ENV PATH=/home/postgres/pgsql-${PG_MAJOR}/bin:$PATH + +# Copy PostgreSQL binaries and libraries for the selected version +COPY --from=dev_base --chown=postgres:postgres /home/postgres/pgsql-${PG_MAJOR}/bin /home/postgres/pgsql-${PG_MAJOR}/bin +COPY --from=dev_base --chown=postgres:postgres /home/postgres/pgsql-${PG_MAJOR}/lib /home/postgres/pgsql-${PG_MAJOR}/lib +COPY --from=dev_base --chown=postgres:postgres /home/postgres/pgsql-${PG_MAJOR}/share /home/postgres/pgsql-${PG_MAJOR}/share + +# Copy pgduck_server binaries and libraries from pgduck_builder +# Note: duckdb_pglake installs libduckdb.so (not duckdb_pglake.so) +COPY --from=pgduck_builder --chown=postgres:postgres /home/postgres/pgsql-${PG_MAJOR}/bin/ /home/postgres/pgsql-${PG_MAJOR}/bin/ +COPY --from=pgduck_builder --chown=postgres:postgres /home/postgres/pgsql-${PG_MAJOR}/lib/ /home/postgres/pgsql-${PG_MAJOR}/lib/ +COPY 
--from=pgduck_builder --chown=postgres:postgres /home/postgres/pgsql-${PG_MAJOR}/share/ /home/postgres/pgsql-${PG_MAJOR}/share/ diff --git a/docker/LOCAL_DEV.md b/docker/LOCAL_DEV.md new file mode 100644 index 00000000..69c8815c --- /dev/null +++ b/docker/LOCAL_DEV.md @@ -0,0 +1,601 @@ +# pg_lake Local Development Guide + +Complete guide for building, running, and developing pg_lake locally. + +## 🚀 Quick Start (5 Steps) + +### 1. Prerequisites + +- Docker Desktop (with Docker Compose) + - **Minimum**: 8GB RAM allocated to Docker + - **Recommended**: 16GB RAM allocated to Docker (required for pgduck_server compilation) + - Configure in: Docker Desktop → Settings → Resources → Memory +- [Task](https://taskfile.dev/installation/) - Task runner + +```bash +# macOS +brew install go-task + +# Linux +sh -c "$(curl --location https://taskfile.dev/install.sh)" -- -d -b /usr/local/bin + +# Windows (PowerShell - run as Administrator) +# Using Chocolatey +choco install go-task + +# Or using Scoop +scoop install task + +# Or download binary from https://github.com/go-task/task/releases +# Extract and add to PATH +``` + +**Note**: Tasks run in silent mode by default for cleaner output. Use `task -v <task-name>` to see verbose output when debugging. + +### 2. 
Set Environment Variables (Optional but Recommended) + +Setting these environment variables simplifies your workflow: + +```bash +# Set in your shell (add to ~/.bashrc, ~/.zshrc, or use direnv) +export PG_MAJOR=18 # Your preferred PostgreSQL version +export COMPOSE_PROJECT_NAME=pg_lake # Compose project name + +# Or use different project names to run multiple versions simultaneously +export COMPOSE_PROJECT_NAME=pg_lake_pg17 # For PG 17 instance +export COMPOSE_PROJECT_NAME=pg_lake_pg16 # For PG 16 instance +``` + +**Benefits:** + +- **No need to specify `PG_MAJOR` with every command** - `task compose:up` instead of `task compose:up PG_MAJOR=17` +- **Run multiple versions simultaneously** - Use different `COMPOSE_PROJECT_NAME` for each version to avoid conflicts +- **Consistent development environment** - All tools use the same PostgreSQL version + +**Using direnv (recommended):** + +```bash +# Install direnv (if not already installed) +brew install direnv # macOS +apt install direnv # Ubuntu/Debian + +# Enable direnv in your shell (add to ~/.bashrc or ~/.zshrc) +eval "$(direnv hook bash)" # For bash +eval "$(direnv hook zsh)" # For zsh + +# Create .envrc in the docker directory +cd docker +cat > .envrc << 'EOF' +export PG_MAJOR=17 +export COMPOSE_PROJECT_NAME=pg_lake_pg17 +EOF + +# Allow direnv to load the file +direnv allow +``` + +### 3. Available Build Variables + +You can customize builds using these variables (can be set as environment variables or passed to Task): + +| Variable | Default | Description | Example | +|----------|---------|-------------|---------| +| `PG_MAJOR` | `18` | PostgreSQL major version (16, 17, or 18) | `PG_MAJOR=17` | +| `BASE_IMAGE_OS` | `almalinux` | Base OS (almalinux or debian) | `BASE_IMAGE_OS=debian` | +| `BASE_IMAGE_TAG` | `9` | Base OS version tag | `BASE_IMAGE_TAG=12` | +| `VERSION` | `latest` | Image version tag (for registry) | `VERSION=v1.0.0` | + +### 4. 
Build and Start Everything + +```bash +cd docker + +# Build images and start all services (default: PostgreSQL 18) +# If PG_MAJOR is set as env var, it will use that version automatically +task compose:up + +# Build images and start all services for PostgreSQL 17 +task compose:up PG_MAJOR=17 + +# Build images and start all services for PostgreSQL 16 +task compose:up PG_MAJOR=16 + +# Build with Debian base OS +task compose:up BASE_IMAGE_OS=debian BASE_IMAGE_TAG=12 + +# Build PostgreSQL 17 with Debian +task compose:up PG_MAJOR=17 BASE_IMAGE_OS=debian BASE_IMAGE_TAG=12 +``` + +This single command will: + +- Build `pg_lake:local` and `pgduck-server:local` images for your architecture +- Start PostgreSQL with pg_lake extensions +- Start pgduck_server (DuckDB integration) +- Start LocalStack (S3-compatible storage) + +### 5. Connect and Test + +```bash +# Connect to PostgreSQL from your host +psql -h localhost -p 5432 -U postgres + +-- Inside psql: verify the PostgreSQL version (should match your PG_MAJOR) +SELECT version(); +-- Example output for PG 18: PostgreSQL 18.0 on ... +-- Example output for PG 17: PostgreSQL 17.6 on ... +-- Example output for PG 16: PostgreSQL 16.10 on ... + +-- Create a test Iceberg table +CREATE TABLE test(id int, name text) USING iceberg; + +-- Insert some data +INSERT INTO test VALUES (1, 'Alice'), (2, 'Bob'); + +-- Query it +SELECT * FROM test; +``` + +**Verify Iceberg files in S3:** + +```bash +# View the Iceberg table files stored in LocalStack S3 (no AWS CLI needed!) 
+task s3:list + +# You should see files like: +# 📦 S3 Bucket Contents (s3://testbucket/pg_lake/): +# +# ├── data_0.parquet +# ├── 00000-6f561147-24ab-449d-922a-713d6adbb4ff.metadata.json +# ├── 00001-bf29575f-3fbd-4fe0-96c7-8666706d4625.metadata.json +# ├── 9f6a9c61-76ab-49ed-b336-3a27e786d1e4-m0.avro +# ├── snap-745562050065240723-1-9f6a9c61-76ab-49ed-b336-3a27e786d1e4.avro +``` + +--- + +## 📋 Common Tasks + +### Docker Compose Management + +```bash +# Start everything (builds if needed, default PG 18) +task compose:up + +# Start with specific PostgreSQL version +task compose:up PG_MAJOR=17 + +# Stop all services +task compose:down + +# Stop services and remove volumes (complete cleanup) +task compose:teardown + +# Restart services (use same PG_MAJOR as when started) +task compose:restart PG_MAJOR=17 + +# View logs (all services) +task compose:logs + +# View logs (specific service) +task compose:logs SERVICE=pg_lake-postgres +task compose:logs SERVICE=pgduck-server + +# Debug mode (verbose output) +task -v compose:up PG_MAJOR=17 +``` + +### Build Management + +```bash +# Build images for local docker-compose +task build:local + +# Build with specific PostgreSQL version +task build:local PG_MAJOR=17 +task build:local PG_MAJOR=16 + +# Build with different base OS +task build:local BASE_IMAGE_OS=debian BASE_IMAGE_TAG=12 + +# Rebuild after code changes +task build:local +task compose:restart +``` + +### Image Management + +```bash +# List images with architecture +task images:list + +# Clean up images +task images:clean +``` + +### S3 / LocalStack + +```bash +# View S3 bucket contents (Iceberg files) +task s3:list +``` + +--- + +## 🔌 Connecting to Databases + +### PostgreSQL (from host) + +PostgreSQL is exposed on port 5432 and accessible from your host machine: + +```bash +# Using psql from host +psql -h localhost -p 5432 -U postgres + +# Or with docker-compose +docker-compose exec pg_lake-postgres psql -U postgres +``` + +### 
DuckDB via pgduck_server + +**Important**: `pgduck_server` only listens on Unix sockets (not TCP), so you cannot connect directly from the host. + +The `pg_lake-postgres` container shares the Unix socket with `pgduck_server`: + +```bash +# Connect via Unix socket from pg_lake container +docker exec -it pg_lake psql -h /home/postgres/pgduck_socket_dir -p 5332 -U postgres + +# Or exec into container first +docker exec -it pg_lake bash +psql -h /home/postgres/pgduck_socket_dir -p 5332 -U postgres + +# Test DuckDB version +docker exec -it pg_lake psql -h /home/postgres/pgduck_socket_dir -p 5332 -U postgres -c "select version() as duckdb_version;" +``` + +Should show something like: + +```sql + duckdb_version +---------------- + v1.3.2 +(1 row) +``` + +### Connection Architecture + +```text +Host Machine + │ + ├─► Port 5432 (TCP) ───► pg_lake-postgres container + │ │ + │ └─► Unix Socket ───► pgduck-server container + │ + └─► Cannot connect directly to pgduck-server (Unix socket only) +``` + +Both containers share: + +- `pgduck-unix-socket-volume` - Unix socket for PostgreSQL protocol communication +- `pg-shared-tmp-dir-volume` - Temporary files for data exchange + +--- + +## ⚙️ Configuration + +### Default Configuration + +| Setting | Default Value | +|---------|---------------| +| PostgreSQL Version | 18 | +| Base OS | AlmaLinux 9 | +| Architecture | Your system's architecture (auto-detected) | + +### Change PostgreSQL Version + +```bash +# Recommended: Use compose:up (handles both build and start) +task compose:up PG_MAJOR=17 + +# Or manually: Build with PostgreSQL 17 then start +task build:local PG_MAJOR=17 +PG_MAJOR=17 docker-compose up -d + +# Build with PostgreSQL 16 +task compose:up PG_MAJOR=16 +``` + +### Environment Variables + +Create a `.env` file in the `docker` directory: + +```env +# PostgreSQL Version (16, 17, or 18) +PG_MAJOR=18 + +# AWS Profile for LocalStack (optional) +AWS_PROFILE=localstack +``` + +--- 
+ +## 🔧 AWS CLI with LocalStack (Optional) + +By default, use `task s3:list` to view S3 contents (no AWS CLI installation needed). + +If you prefer using AWS CLI directly from your host, configure a LocalStack profile: + +### Setup AWS Profile for LocalStack + +**1. Create/Update `~/.aws/config`:** + +```ini +[profile localstack] +region = us-east-1 +output = json +endpoint_url = http://localhost:4566 +``` + +**2. Create/Update `~/.aws/credentials`:** + +```ini +[localstack] +aws_access_key_id = test +aws_secret_access_key = test +``` + +### Usage + +```bash +# List S3 buckets +aws --profile localstack s3 ls + +# List bucket contents +aws --profile localstack s3 ls s3://testbucket/pg_lake/ --recursive + +# Upload/download files +aws --profile localstack s3 cp myfile.txt s3://testbucket/ + +# Set as default for current session +export AWS_PROFILE=localstack +aws s3 ls +``` + +**Note**: The `task s3:list` command uses `docker exec` internally, so it works without any AWS CLI setup on your host. + +--- + +## 🐛 Troubleshooting + +### Build fails with memory error + +If builds fail with out-of-memory errors (especially when building pgduck_server): + +```bash +# Increase Docker Desktop memory: +# Docker Desktop → Settings → Resources → Memory +# Set to at least 16GB for pgduck_server compilation (DuckDB is memory-intensive) + +# On macOS/Linux, check system memory +# macOS: +sysctl hw.memsize + +# Linux: +free -h + +# Alternative: Build images separately to reduce peak memory usage +cd docker # Make sure you're in the docker directory + +# Build pg_lake first +docker buildx build --target pg_lake_postgres --load -t pg_lake:local -f Dockerfile . + +# Then build pgduck_server +docker buildx build --target pgduck_server --load -t pgduck-server:local -f Dockerfile . 
+``` + +### Images not found when starting docker-compose + +```bash +# Make sure you've built the images first +task build:local + +# Verify images exist +docker images | grep "pg_lake\|pgduck-server" + +# Or list all pg_lake images with details +task images:list +``` + +### Services won't start + +```bash +# Check logs +task compose:logs + +# Check container status +docker-compose ps + +# Check if ports are already in use +lsof -i :5432 # PostgreSQL port +lsof -i :4566 # LocalStack port +``` + +### "Cannot open file" errors when creating Iceberg tables + +If you see errors like: + +```text +ERROR: IO Error: Cannot open file "/home/postgres/pgsql-18/data/base/pgsql_tmp/pgsql_tmp.pg_lake_iceberg_XXX.0": No such file or directory +``` + +This means the temp directory volume isn't properly shared between containers: + +```bash +# Stop containers and remove volumes +task compose:teardown + +# Verify docker-compose.yml uses pg-shared-tmp-dir-volume for both containers +grep pg-shared-tmp-dir-volume docker-compose.yml + +# Restart services +task compose:up +``` + +### Need to rebuild from scratch + +```bash +# Stop and remove everything (including volumes) +task compose:teardown + +# Rebuild +task build:local + +# Start fresh +task compose:up +``` + +### Clean up Docker buildx + +```bash +# If builds are failing, reset buildx for specific version +task clean:cache-version PG_MAJOR=17 +task setup PG_MAJOR=17 +task build:local PG_MAJOR=17 + +# Or complete cleanup (all versions) +task clean:all +task build:local PG_MAJOR=17 +``` + +**Note**: Multi-platform builds maintain separate caches but share the same builder. If you experience cache issues with multi-arch builds, clear the cache for the specific PostgreSQL version rather than removing the builder. + +--- + +## 💻 Development Workflows + +### Making changes to Dockerfile + +```bash +# 1. Edit Dockerfile +vim Dockerfile + +# 2. 
Rebuild images (specify PG version if not using default) +task build:local PG_MAJOR=17 + +# 3. Restart services with new images +task compose:restart PG_MAJOR=17 + +# Or rebuild and restart in one command +task compose:up PG_MAJOR=17 + +# 4. Check logs +task compose:logs +``` + +### Testing different configurations + +```bash +# Test with Debian base (instead of default AlmaLinux) +task build:local BASE_IMAGE_OS=debian BASE_IMAGE_TAG=12 +docker-compose up -d + +# Note: For testing different PostgreSQL versions, see "Change PostgreSQL Version" +# section in Configuration above +``` + +### Using Direct Docker Commands + +After building with Task, you can use standard docker-compose commands: + +```bash +cd docker + +# Start services +docker-compose up -d + +# Stop services +docker-compose down + +# View logs +docker-compose logs -f + +# Check status +docker-compose ps + +# Execute commands in containers +docker-compose exec pg_lake-postgres psql -U postgres +docker-compose exec pg_lake-postgres psql -h /home/postgres/pgduck_socket_dir -p 5332 -U postgres +``` + +--- + +## 📊 Build Details + +### What gets built? + +- **pg_lake:local** - PostgreSQL with pg_lake extensions +- **pgduck-server:local** - pgduck server with DuckDB integration + +### Build Architecture + +**Single-platform (local) builds:** + +- Builds only for your system's architecture (auto-detected) +- Uses `--load` to load images directly into Docker +- Faster and suitable for local development + +**Multi-platform builds:** + +- Builds for multiple architectures (amd64, arm64) +- Requires `--push` to registry (cannot use `--load`) +- Used for publishing images +- See [TASKFILE.md](./TASKFILE.md) for multi-platform build tasks + +**Cache Isolation:** + +Each PostgreSQL version uses its own buildx builder: + +- PG 16: `pg_lake_builder_pg16` +- PG 17: `pg_lake_builder_pg17` +- PG 18: `pg_lake_builder_pg18` + +This prevents cache conflicts when switching between versions. 
+ +--- + +## ๐Ÿ’ก Tips + +- **Always specify `PG_MAJOR`** when working with non-default PostgreSQL versions: + + ```bash + task compose:up PG_MAJOR=17 + task compose:down PG_MAJOR=17 + task compose:logs PG_MAJOR=17 + ``` + +- Images are tagged as `pg_lake:local-pg{VERSION}` and `pgduck-server:local-pg{VERSION}` + - Default: `pg_lake:local-pg18`, `pgduck-server:local-pg18` + - PG 17: `pg_lake:local-pg17`, `pgduck-server:local-pg17` + +- For publishing images to registries, see [TASKFILE.md](./TASKFILE.md) + +- Use `task -v ` for verbose output when debugging + +- Container names vs service names: + - Service name: `pg_lake-postgres` (use with `docker-compose exec`) + - Container name: `pg_lake` (use with `docker exec`) + +- Each PostgreSQL version uses its own buildx builder for isolated caching: + - PG 16: `pg_lake_builder_pg16` + - PG 17: `pg_lake_builder_pg17` + - PG 18: `pg_lake_builder_pg18` + +--- + +## ๐Ÿ“š Additional Resources + +- [TASKFILE.md](./TASKFILE.md) - Complete Task reference with publishing workflows +- [Dockerfile](./Dockerfile) - Image build configuration +- [docker-compose.yml](./docker-compose.yml) - Service configuration +- [README.md](./README.md) - Architecture overview and optimizations diff --git a/docker/README.md b/docker/README.md index b19ba413..6ed386e0 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,10 +1,233 @@ -## Prerequisites -1. Make sure to allocate >= ~20GB memory resource to docker. -2. Your `~/.aws` folder is mounted as a read only volume to pgduck-server container so that it can read/write buckets with your aws credentials. You are expected to set up your aws credentials before. (possible to read/write from/to **production** s3 buckets) +# pg_lake Docker Setup -## How to run? -- Run `cd docker && docker compose up` to create all containers (minio, pgduck-server, and pg_lake-postgres) You can take a coffee break, this will take some time (~30 minutes on mac air m3) but a one time operation. 
If you want to build against different refs (default is main branch) or different postgres versions (default is 18), you can override them in [env file](.env). -- At the end, you can connect to postgres via `docker exec -it pg_lake psql`. +Multi-stage optimized Docker builds for pg_lake with Task automation. -> [!WARNING] -> These containers are mainly used for test and development purposes. Be cautious when you run them in production environments. + +## 🚀 Quick Start + +```bash +# Install Task (if needed) +brew install go-task # macOS +sh -c "$(curl -sL https://taskfile.dev/install.sh)" -- -d # Linux +choco install go-task # Windows (Chocolatey) +scoop install task # Windows (Scoop) + +# Build and start everything +cd docker +task compose:up + +# View logs +task compose:logs +``` + +## 📁 What's Included + +- **Dockerfile** - Optimized multi-stage build + - Builds only one PostgreSQL version at a time (saves ~3x build time & space) + - Separate builder stages for compilation + - Minimal runtime images with only necessary binaries + - Fixed vcpkg network issues with retry logic + +- **docker-compose.yml** - Local development stack + - pg_lake-postgres (PostgreSQL with pg_lake extensions) + - pgduck-server (DuckDB integration) + - [localstack](https://localstack.cloud/) (S3-compatible storage) + +- **Taskfile.yml** - Build automation + - Local single-platform single PostgreSQL version builds + - Multi-platform builds for publishing + - Docker Compose integration + - Helper tasks for common operations + +## 📚 Documentation + +- **[LOCAL_DEV.md](./LOCAL_DEV.md)** - Complete local development guide ⭐ **Start here** +- **[TASKFILE.md](./TASKFILE.md)** - Detailed Task documentation + +## 🎯 Common Commands + +```bash +# Development +task compose:up # Build images and start services +task compose:logs # View logs +task compose:down # Stop services +task compose:teardown # Stop services and remove volumes + +# Building +task build:local # Build for local use 
(fast, single platform) +task build:local PG_MAJOR=17 # Build PostgreSQL 17 + +# Debugging (verbose mode) +task -v compose:up # Show all command output + +# Testing +docker-compose exec pg_lake-postgres psql -U postgres +docker-compose exec pgduck-server psql -p 5332 -h /home/postgres/pgduck_socket_dir +``` + +**Note**: Tasks run in silent mode by default. Use `-v` flag for verbose output when debugging. + +## 🏗️ Architecture + +### Multi-Stage Build Flow + +``` +┌─────────────┐ +│ dev_base │ Build tools + PostgreSQL compilation +└──────┬──────┘ + │ + ├──────────────────┬──────────────────┐ + │ │ │ +┌──────▼──────┐ ┌──────▼──────┐ ┌──────▼──────┐ +│ base │ │ base │ │ runtime_base│ +│ (pg_lake │ │ (pg_lake │ │ (minimal │ +│ source) │ │ source) │ │ runtime) │ +└──────┬──────┘ └──────┬──────┘ └──────┬──────┘ + │ │ │ +┌──────▼──────┐ ┌──────▼──────┐ │ +│ pg_lake_ │ │ pgduck_ │ │ +│ builder │ │ builder │ │ +│ (compile │ │ (compile │ │ +│ extensions) │ │ pgduck) │ │ +└──────┬──────┘ └──────┬──────┘ │ + │ │ │ + └────────┬─────────┴─────────────────┘ + │ + ┌────────┴─────────┐ + │ │ +┌──────▼──────┐ ┌──────▼──────┐ +│ pg_lake_ │ │ pgduck_ │ +│ postgres │ │ server │ +│ (FINAL) │ │ (FINAL) │
+└─────────────┘ └─────────────┘ +``` + +### Key Optimizations + +✅ **Single PostgreSQL Version per Build**: Builds only PG 16, 17, or 18 (not all 3) +✅ **Separate Build Stages**: Compilation happens in builder stages +✅ **Minimal Runtime**: Final images contain only binaries and libraries +✅ **Network Retry Logic**: Handles vcpkg download failures +✅ **Multi-Platform Support**: Can build for AMD64 and ARM64 + +### Size Comparison + +| Image | Before | After | Savings | +|-------|--------|-------|---------| +| pg_lake | ~4GB | ~1.2GB | **70%** | +| pgduck-server | ~3GB | ~800MB | **73%** | + +## 🔧 Configuration + +### PostgreSQL Version + +```bash +# Default: PostgreSQL 18 +task build:local + +# PostgreSQL 17 +task build:local PG_MAJOR=17 + +# PostgreSQL 16 +task build:local PG_MAJOR=16 +``` + +### Base OS + +```bash +# Default: AlmaLinux 9 +task build:local + +# Debian 12 +task build:local BASE_IMAGE_OS=debian BASE_IMAGE_TAG=12 +``` + +### Multi-Platform + +```bash +# Build for multiple architectures +task build:pg-lake-postgres PLATFORMS="linux/amd64,linux/arm64" + +# Build for single architecture (faster) +task build:local # Auto-detects your system + +# Memory requirement: 16GB+ recommended for pgduck_server (DuckDB compilation) +``` + +## 📦 Publishing Images + +### To Docker Hub (docker.io) + +```bash +# Login +export DOCKER_HUB_TOKEN=your_token +export DOCKER_HUB_USERNAME=your_username +task login:dockerhub +``` + +## 🐛 Troubleshooting + +### Build Failures + +```bash +# Out of memory +# → Increase Docker memory to at least 8GB (16GB recommended for pgduck_server) in Docker Desktop settings + +# Network errors (vcpkg) +# → Retry logic is built-in, just run again +# → Check your internet connection + +# Buildx issues +task clean +task setup +``` + +### Service Issues + +```bash +# Services won't start +docker-compose ps # Check status +task compose:logs # Check logs +task compose:teardown # Reset everything (removes
volumes) +task compose:up # Start fresh + +# Port conflicts +lsof -i :5432 # PostgreSQL +lsof -i :4566 # LocalStack +``` + +## 📊 Health Checks + +The docker-compose setup includes health checks: + +- **pg_lake-postgres**: Creates and drops an Iceberg table +- **pgduck-server**: Executes `SELECT 1` +- **localstack**: HTTP health endpoint + +Check status: `docker-compose ps` + +## 🔐 Security Notes + +- Images run as non-root user (UID 1001, postgres) +- Minimal runtime attack surface +- No build tools in final images +- Only necessary runtime libraries included + +## 📖 Learn More + +- [Dockerfile Multi-Stage Builds](https://docs.docker.com/build/building/multi-stage/) +- [Task Documentation](https://taskfile.dev/) +- [Docker Compose](https://docs.docker.com/compose/) +- [LocalStack](https://localstack.cloud/) + +## 🤝 Contributing + +When modifying the Dockerfile: + +1. Test locally: `task build:local && task compose:up` +2. Verify image sizes: `docker images | grep pg_lake` +3. Test services work: `task compose:logs` +4. Update documentation if needed + +## 📝 License + +See [LICENSE](../LICENSE) file in the project root. diff --git a/docker/TASKFILE.md b/docker/TASKFILE.md new file mode 100644 index 00000000..c862efaf --- /dev/null +++ b/docker/TASKFILE.md @@ -0,0 +1,551 @@ +# Docker Build Tasks + +This directory contains a Taskfile for building and pushing multi-platform Docker images. + +## Prerequisites + +1. **Install Task**: + + ```bash + # macOS + brew install go-task + + # Linux + sh -c "$(curl --location https://taskfile.dev/install.sh)" -- -d -b /usr/local/bin + ``` + +2. **Docker with buildx** (included in Docker Desktop) + +3.
**Docker Memory Requirements**: + - **Minimum**: 8GB RAM allocated to Docker + - **Recommended**: 16GB RAM for building pgduck_server + - **Why**: DuckDB compilation in pgduck_server is very memory-intensive + - **Configure**: Docker Desktop โ†’ Settings โ†’ Resources โ†’ Memory + +## Build Architecture + +This Taskfile uses **per-version buildx builders** to prevent cache conflicts: + +- **PG 16**: Uses `pg_lake_builder_pg16` +- **PG 17**: Uses `pg_lake_builder_pg17` +- **PG 18**: Uses `pg_lake_builder_pg18` + +Each builder maintains its own isolated cache, allowing you to build and switch between PostgreSQL versions without cache conflicts. Builders are created automatically on first use. + +### Cache Behavior Notes + +**Single-platform builds** (e.g., `build:local`): +- Uses `--load` to load images directly into Docker +- Cache is stored per-builder instance +- Faster for local development +- Images immediately available in `docker images` + +**Multi-platform builds** (e.g., `build:all`, `build:pg-lake-postgres`): +- Builds for multiple architectures (amd64, arm64) +- **Without PUSH=true**: Builds to cache only (images not loaded locally) +- **With PUSH=true**: Builds and pushes to registry +- Cannot use `--load` with multiple platforms (Docker limitation) +- Cache is shared across architectures within the same builder + +**Important**: +- Use `task build:local` for local development (single platform, loads to Docker) +- Use `task build:all` for testing multi-arch builds (cache only, verifies build works) +- Use `task push:all` to build and publish to registry (multi-arch, available for deployment) + +## Available Tasks + +Run `task --list` to see all available tasks: + +```bash +cd docker +task --list +``` + +**Note**: All tasks run in silent mode by default for cleaner output. 
Use the `-v` flag when you need verbose output for debugging: + +```bash +# Normal (silent) mode +task build:local + +# Verbose mode (for debugging) +task -v build:local +``` + +## Common Usage + +### Quick Start - Local Development + +The fastest way to get started: + +```bash +# Build images and start services with docker-compose +task compose:up + +# View logs (all services) +task compose:logs + +# View logs for specific service +task compose:logs SERVICE=pgduck-server + +# Stop services +task compose:down +``` + +See [LOCAL_DEV.md](./LOCAL_DEV.md) for detailed local development guide. + +### Build Images Locally + +```bash +# Build for local docker-compose (detects your architecture automatically) +task build:local + +# Build with specific PostgreSQL version +task build:local PG_MAJOR=17 + +# List all built images with architecture +task images:list + +# Clean up local images +task images:clean + +# View S3 bucket contents (verify Iceberg files) +task s3:list +``` + +### Build Images for Registry + +```bash +# Build pg_lake_postgres for registry (multi-platform) +task build:pg-lake-postgres + +# Build pgduck_server for registry +task build:pgduck-server + +# Build both images (multi-platform, builds to cache only - doesn't push or load) +task build:all + +# Build for specific PostgreSQL version +task build:all PG_MAJOR=17 + +# Note: Multi-platform builds without PUSH=true only populate the build cache. +# Images won't be available locally or in registry until you push them. 
+``` + +### Build Multi-Platform Images + +```bash +# Build for multiple platforms (amd64 + arm64) +task build:all PLATFORMS="linux/amd64,linux/arm64" + +# Build all PostgreSQL versions (16, 17, 18) +task build:all-pg-versions +``` + +### Push to Registry + +```bash +# Login to Docker Hub +export DOCKER_HUB_TOKEN=your_token +export DOCKER_HUB_USERNAME=your_username +task login:dockerhub + +# Or login to GitHub Container Registry +export GITHUB_TOKEN=your_token +export GITHUB_ACTOR=your_github_username +task login:ghcr + +# Push single image +task push:pg-lake-postgres VERSION=v3.1.0 + +# Push all images +task push:all VERSION=v3.1.0 + +# Push all images for all PostgreSQL versions +task push:all-pg-versions VERSION=v3.1.0 +``` + +### Docker Compose Commands + +```bash +# Build and start all services +task compose:up + +# Stop services +task compose:down + +# Stop services and remove volumes (complete cleanup) +task compose:teardown + +# View logs (all services or specific SERVICE) +task compose:logs +task compose:logs SERVICE=pg_lake-postgres + +# Restart services +task compose:restart +``` + +### Cache Management + +Each PostgreSQL version (16, 17, 18) uses its own isolated buildx builder to prevent cache conflicts: + +```bash +# Clear cache for specific PostgreSQL version +task clean:cache-version PG_MAJOR=17 + +# Clear all buildx caches (all versions) +task clean:cache + +# Remove builder for specific version +task clean:builder PG_MAJOR=17 + +# Remove all builders (all PG versions) +task clean + +# Nuclear option: remove everything (builders + caches + local images) +task clean:all +``` + +**When to use each:** +- `clean:cache-version` - Rebuild one PG version from scratch +- `clean:cache` - Clear all caches but keep builders +- `clean:builder` - Remove a specific version's builder +- `clean` - Remove all builders (they'll be recreated on next build) +- `clean:all` - Complete cleanup when you want to start fresh + +### Image Management + +```bash +# List all 
pg_lake images with architecture +task images:list + +# Clean up local images +task images:clean +``` + +### Test Images + +```bash +# Test locally built images +task test:pg-lake-postgres +task test:pgduck-server +``` + +## Environment Variables + +You can customize builds using these variables: + +| Variable | Default | Description | +|----------|---------|-------------| +| `REGISTRY` | `ghcr.io` | Container registry (ghcr.io, docker.io, etc.) | +| `IMAGE_OWNER` | `${DOCKER_HUB_USERNAME}` | Registry username/org | +| `VERSION` | `latest` | Image version tag | +| `PG_MAJOR` | `18` | PostgreSQL major version (16, 17, or 18) | +| `PLATFORMS` | `linux/amd64,linux/arm64` | Target platforms for registry builds | +| `BASE_IMAGE_OS` | `almalinux` | Base OS (almalinux or debian) | +| `BASE_IMAGE_TAG` | `9` | Base OS version | + +**Note**: `build:local` automatically detects your system architecture (amd64 or arm64) and builds for that platform only. + +### Authentication Environment Variables + +For pushing to registries, set these environment variables: + +**Docker Hub:** + +- `DOCKER_HUB_TOKEN` - Docker Hub access token +- `DOCKER_HUB_USERNAME` - Docker Hub username + +**GitHub Container Registry:** + +- `GITHUB_TOKEN` - GitHub personal access token with `write:packages` scope +- `GITHUB_ACTOR` - GitHub username + +### Example with Custom Variables + +```bash +# Build PostgreSQL 17 image for amd64 only +task build:pg-lake-postgres \ + PG_MAJOR=17 \ + PLATFORMS=linux/amd64 \ + VERSION=v3.1.0-rc1 + +# Push to custom registry +task push:all \ + REGISTRY=docker.io \ + IMAGE_OWNER=myusername \ + VERSION=v3.1.0 +``` + +## GitHub Container Registry (GHCR) Authentication + +To push images to ghcr.io, you need to authenticate: + +```bash +# Set environment variables +export GITHUB_ACTOR=your-github-username +export GITHUB_TOKEN=your-github-token + +# Login to GHCR +task login:ghcr +``` + +## Image Tags + +### Registry Images + +Images pushed to registries are tagged as: + +```
+REGISTRY/IMAGE_OWNER/pg_lake:VERSION-pgMAJOR +REGISTRY/IMAGE_OWNER/pg_lake:VERSION-pgMAJOR-BASE_IMAGE_OS +REGISTRY/IMAGE_OWNER/pgduck-server:VERSION-pgMAJOR +REGISTRY/IMAGE_OWNER/pgduck-server:VERSION-pgMAJOR-BASE_IMAGE_OS +``` + +Examples: + +``` +docker.io/${DOCKER_HUB_USERNAME}/pg_lake:v3.1.0-pg18 +docker.io/${DOCKER_HUB_USERNAME}/pg_lake:v3.1.0-pg18-almalinux +docker.io/${DOCKER_HUB_USERNAME}/pg_lake:latest-pg18 +docker.io/${DOCKER_HUB_USERNAME}/pgduck-server:v3.1.0-pg17 +``` + +### Local Images + +Images built with `task build:local` are tagged as: + +``` +pg_lake:local +pg_lake:local-pg18 +pgduck-server:local +pgduck-server:local-pg18 +``` + +## Pulling Images + +```bash +# Pull from Docker Hub +docker pull docker.io/${DOCKER_HUB_USERNAME}/pg_lake:latest-pg18 +docker pull docker.io/${DOCKER_HUB_USERNAME}/pgduck-server:latest-pg18 + +# Pull from GitHub Container Registry +docker pull ghcr.io/${DOCKER_HUB_USERNAME}/pg_lake:latest-pg18 + +# Pull specific version +docker pull docker.io/${DOCKER_HUB_USERNAME}/pg_lake:v3.1.0-pg18 +``` + +## Using with Docker Compose + +### Local Development + +The included `docker-compose.yml` uses local images: + +```yaml +services: + pg_lake-postgres: + image: pg_lake:local + # ... rest of config + + pgduck-server: + image: pgduck-server:local + # ... rest of config +``` + +Build and start: + +```bash +task compose:up +``` + +### Using Registry Images + +To use published registry images, update your `docker-compose.yml`: + +```yaml +services: + pg_lake-postgres: + image: docker.io/${DOCKER_HUB_USERNAME}/pg_lake:latest-pg18 + # ... rest of config + + pgduck-server: + image: docker.io/${DOCKER_HUB_USERNAME}/pgduck-server:latest-pg18 + # ... rest of config +``` + +## Troubleshooting + +### Build fails with network errors + +The Dockerfile includes retry logic for vcpkg downloads. If it still fails: + +```bash +# Try building with network host mode +docker buildx build --network=host ...
+``` + +### Out of memory during build + +pgduck_server compilation (DuckDB) requires significant memory: + +```bash +# Increase Docker Desktop memory: +# Settings โ†’ Resources โ†’ Memory โ†’ 16GB recommended + +# Check if you have enough memory +# macOS: +sysctl hw.memsize + +# Linux: +free -h + +# Alternative: Build sequentially instead of parallel +# task build:local builds both images - if memory is tight, build separately: +docker buildx build --target pg_lake_postgres --platform linux/arm64 --load -t pg_lake:local . +docker buildx build --target pgduck_server --platform linux/arm64 --load -t pgduck-server:local . +``` + +### Multi-platform build issues + +```bash +# Clean all buildx builders and recreate for specific PG version +task clean +task setup PG_MAJOR=17 + +# Or clean cache for specific version +task clean:cache-version PG_MAJOR=17 +``` + +### Cache conflicts between PostgreSQL versions + +If you get cache errors when switching between PostgreSQL versions: + +```bash +# Clear cache for specific version +task clean:cache-version PG_MAJOR=17 + +# Or clear all caches +task clean:cache + +# Nuclear option: remove everything and start fresh +task clean:all +``` + +**Note for multi-platform builds**: If building for multiple architectures and encountering cache issues, you may need to clear the specific builder's cache: + +```bash +# Clear and rebuild for multi-platform +task clean:cache-version PG_MAJOR=17 +task build:all PG_MAJOR=17 PUSH=false + +# Or push directly to registry (recommended for multi-arch) +task push:all PG_MAJOR=17 +``` + +### Authentication issues with registries + +```bash +# For Docker Hub +export DOCKER_HUB_TOKEN=your_token_here +export DOCKER_HUB_USERNAME=your_username +task login:dockerhub + +# For GitHub Container Registry +export GITHUB_TOKEN=ghp_your_token_here +export GITHUB_ACTOR=your_github_username +task login:ghcr +``` + +### Images not showing architecture + +```bash +# List images with architecture info +task images:list 
+``` + +## Local Development + +### Development & Testing + +```bash +# 1. Make changes to Dockerfile or scripts + +# 2. Build and test locally (fast, single platform) +task build:local + +# 3. Start services with docker-compose +task compose:up + +# 4. View logs and test +task compose:logs + +# 5. Make changes and rebuild +task build:local +task compose:restart + +# 6. List images to verify +task images:list +``` + +### Releasing Images + +```bash +# 1. Build multi-platform for registry +task build:all VERSION=v3.1.0-rc1 + +# 2. Push to registry +task login:dockerhub +task push:all VERSION=v3.1.0-rc1 + +# 3. Build and push all PostgreSQL versions +task push:all-pg-versions VERSION=v3.1.0 +``` + +## Task Reference + +### Build Tasks + +- `build:local` - Build images for local development (auto-detects architecture, loads to Docker) +- `build:pg-lake-postgres` - Build pg_lake_postgres for registry (multi-platform, cache only unless PUSH=true) +- `build:pgduck-server` - Build pgduck_server for registry (multi-platform, cache only unless PUSH=true) +- `build:all` - Build all images (multi-platform, cache only unless PUSH=true) +- `build:all-pg-versions` - Build for PostgreSQL 16, 17, and 18 + +### Push Tasks + +- `push:pg-lake-postgres` - Build and push pg_lake_postgres +- `push:pgduck-server` - Build and push pgduck_server +- `push:all` - Build and push all images +- `push:all-pg-versions` - Build and push all PostgreSQL versions + +### Docker Compose Tasks + +- `compose:up` - Build and start all services +- `compose:down` - Stop and remove services +- `compose:teardown` - Stop services and remove volumes (complete cleanup) +- `compose:logs` - View service logs (optionally specify `SERVICE=`) +- `compose:restart` - Restart services + +### Image Management Tasks + +- `images:list` - List all pg_lake images with architecture +- `images:clean` - Remove all local pg_lake images + +### S3 Tasks + +- `s3:list` - List S3 bucket contents (LocalStack) in tree format + +### 
Clean Tasks + +- `clean` - Remove all buildx builders (PG 16, 17, 18) +- `clean:builder` - Remove buildx builder for specific PG_MAJOR version +- `clean:cache` - Clean buildx cache for all builders +- `clean:cache-version` - Clean buildx cache for specific PG_MAJOR version +- `clean:all` - Remove all builders, caches, and local images (complete cleanup) + +### Utility Tasks + +- `setup` - Setup Docker buildx builder for specific PG_MAJOR version +- `login:dockerhub` - Login to Docker Hub +- `login:ghcr` - Login to GitHub Container Registry diff --git a/docker/Taskfile.yml b/docker/Taskfile.yml new file mode 100644 index 00000000..2d1d3c3d --- /dev/null +++ b/docker/Taskfile.yml @@ -0,0 +1,380 @@ +version: "3" + +vars: + REGISTRY: '{{.REGISTRY | default "ghcr.io"}}' + IMAGE_OWNER: '{{.IMAGE_OWNER | default (env "DOCKER_HUB_USERNAME")}}' + VERSION: '{{.VERSION | default "latest"}}' + PG_MAJOR: '{{.PG_MAJOR | default "18"}}' + PLATFORMS: '{{.PLATFORMS | default "linux/amd64,linux/arm64"}}' + BASE_IMAGE_OS: '{{.BASE_IMAGE_OS | default "almalinux"}}' + BASE_IMAGE_TAG: '{{.BASE_IMAGE_TAG | default "9"}}' + +tasks: + setup: + desc: Setup Docker buildx for multi-platform builds (per PG_MAJOR version) + silent: true + cmds: + - docker buildx create --name pg_lake_builder_pg{{.PG_MAJOR}} --use --bootstrap || docker buildx use + pg_lake_builder_pg{{.PG_MAJOR}} + - docker buildx inspect --bootstrap + status: + - docker buildx inspect pg_lake_builder_pg{{.PG_MAJOR}} + + build:pg-lake-postgres: + desc: Build pg_lake_postgres image for multiple platforms + silent: true + cmds: + - task: setup + vars: { PG_MAJOR: "{{.PG_MAJOR}}" } + - | + docker buildx build \ + --builder pg_lake_builder_pg{{.PG_MAJOR}} \ + --platform {{.PLATFORMS}} \ + --target pg_lake_postgres \ + --build-arg PG_MAJOR={{.PG_MAJOR}} \ + --build-arg BASE_IMAGE_OS={{.BASE_IMAGE_OS}} \ + --build-arg BASE_IMAGE_TAG={{.BASE_IMAGE_TAG}} \ + -t 
{{.REGISTRY}}/{{.IMAGE_OWNER}}/pg_lake:{{.VERSION}}-pg{{.PG_MAJOR}}-{{.BASE_IMAGE_OS}} \ + -t {{.REGISTRY}}/{{.IMAGE_OWNER}}/pg_lake:{{.VERSION}}-pg{{.PG_MAJOR}} \ + {{if eq .PUSH "true"}}--push{{end}} \ + -f Dockerfile \ + --progress=plain \ + . + vars: + PUSH: '{{.PUSH | default "false"}}' + + build:pgduck-server: + desc: Build pgduck_server image for multiple platforms + silent: true + cmds: + - task: setup + vars: { PG_MAJOR: "{{.PG_MAJOR}}" } + - | + docker buildx build \ + --builder pg_lake_builder_pg{{.PG_MAJOR}} \ + --platform {{.PLATFORMS}} \ + --target pgduck_server \ + --build-arg PG_MAJOR={{.PG_MAJOR}} \ + --build-arg BASE_IMAGE_OS={{.BASE_IMAGE_OS}} \ + --build-arg BASE_IMAGE_TAG={{.BASE_IMAGE_TAG}} \ + -t {{.REGISTRY}}/{{.IMAGE_OWNER}}/pgduck-server:{{.VERSION}}-pg{{.PG_MAJOR}}-{{.BASE_IMAGE_OS}} \ + -t {{.REGISTRY}}/{{.IMAGE_OWNER}}/pgduck-server:{{.VERSION}}-pg{{.PG_MAJOR}} \ + {{if eq .PUSH "true"}}--push{{end}} \ + -f Dockerfile \ + --progress=plain \ + . + vars: + PUSH: '{{.PUSH | default "false"}}' + + build:all: + desc: Build all images for multiple platforms + silent: true + cmds: + - task: setup + vars: { PG_MAJOR: "{{.PG_MAJOR}}" } + - task: build:pg-lake-postgres + vars: + { + PG_MAJOR: "{{.PG_MAJOR}}", + VERSION: "{{.VERSION}}", + PUSH: "{{.PUSH}}", + BASE_IMAGE_OS: "{{.BASE_IMAGE_OS}}", + BASE_IMAGE_TAG: "{{.BASE_IMAGE_TAG}}", + PLATFORMS: "{{.PLATFORMS}}", + } + - task: build:pgduck-server + vars: + { + PG_MAJOR: "{{.PG_MAJOR}}", + VERSION: "{{.VERSION}}", + PUSH: "{{.PUSH}}", + BASE_IMAGE_OS: "{{.BASE_IMAGE_OS}}", + BASE_IMAGE_TAG: "{{.BASE_IMAGE_TAG}}", + PLATFORMS: "{{.PLATFORMS}}", + } + + push:pg-lake-postgres: + desc: Build and push pg_lake_postgres image + silent: true + cmds: + - task: build:pg-lake-postgres + vars: + { + PG_MAJOR: "{{.PG_MAJOR}}", + VERSION: "{{.VERSION}}", + PUSH: "true", + BASE_IMAGE_OS: "{{.BASE_IMAGE_OS}}", + BASE_IMAGE_TAG: "{{.BASE_IMAGE_TAG}}", + PLATFORMS: "{{.PLATFORMS}}", + } + + push:pgduck-server: + 
desc: Build and push pgduck_server image + silent: true + cmds: + - task: build:pgduck-server + vars: + { + PG_MAJOR: "{{.PG_MAJOR}}", + VERSION: "{{.VERSION}}", + PUSH: "true", + BASE_IMAGE_OS: "{{.BASE_IMAGE_OS}}", + BASE_IMAGE_TAG: "{{.BASE_IMAGE_TAG}}", + PLATFORMS: "{{.PLATFORMS}}", + } + + push:all: + desc: Build and push all images + silent: true + cmds: + - task: build:all + vars: + { + PG_MAJOR: "{{.PG_MAJOR}}", + VERSION: "{{.VERSION}}", + PUSH: "true", + BASE_IMAGE_OS: "{{.BASE_IMAGE_OS}}", + BASE_IMAGE_TAG: "{{.BASE_IMAGE_TAG}}", + PLATFORMS: "{{.PLATFORMS}}", + } + + build:all-pg-versions: + desc: Build all images for PostgreSQL 16, 17, and 18 + silent: true + cmds: + - task: build:all + vars: { PG_MAJOR: "16", VERSION: "{{.VERSION}}", PUSH: "{{.PUSH}}" } + - task: build:all + vars: { PG_MAJOR: "17", VERSION: "{{.VERSION}}", PUSH: "{{.PUSH}}" } + - task: build:all + vars: { PG_MAJOR: "18", VERSION: "{{.VERSION}}", PUSH: "{{.PUSH}}" } + + push:all-pg-versions: + desc: Build and push all images for PostgreSQL 16, 17, and 18 + silent: true + cmds: + - task: build:all-pg-versions + vars: + { + VERSION: "{{.VERSION}}", + PUSH: "true", + BASE_IMAGE_OS: "{{.BASE_IMAGE_OS}}", + BASE_IMAGE_TAG: "{{.BASE_IMAGE_TAG}}", + PLATFORMS: "{{.PLATFORMS}}", + } + + clean: + desc: Remove all buildx builders (all PG versions) + silent: true + cmds: + - docker buildx rm pg_lake_builder_pg16 || true + - docker buildx rm pg_lake_builder_pg17 || true + - docker buildx rm pg_lake_builder_pg18 || true + - docker buildx rm pg_lake_builder || true + - echo "โœ… All buildx builders removed" + ignore_error: true + + clean:builder: + desc: Remove buildx builder for specific PG_MAJOR version + silent: true + cmds: + - docker buildx rm pg_lake_builder_pg{{.PG_MAJOR}} || true + - echo "โœ… Buildx builder for PG {{.PG_MAJOR}} removed" + ignore_error: true + + clean:cache: + desc: Clean buildx cache (all builders) + silent: true + cmds: + - docker buildx prune -af + - echo "โœ… All buildx 
caches cleared" + + clean:cache-version: + desc: Clean buildx cache for specific PG_MAJOR version + silent: true + cmds: + - docker buildx use pg_lake_builder_pg{{.PG_MAJOR}} 2>/dev/null && docker buildx prune -f || echo "Builder + pg_lake_builder_pg{{.PG_MAJOR}} not found" + - echo "โœ… Cache for PG {{.PG_MAJOR}} cleared" + + clean:all: + desc: Remove all builders, caches, and local images + silent: true + cmds: + - task: clean + - task: clean:cache + - task: images:clean + - echo "๐Ÿงน Complete cleanup finished" + + login:dockerhub: + desc: Login to Docker Hub Container Registry + silent: true + cmds: + - echo "$DOCKER_HUB_TOKEN" | docker login docker.io -u $DOCKER_HUB_USERNAME --password-stdin + preconditions: + - sh: '[ ! -z "$DOCKER_HUB_TOKEN" ]' + msg: "DOCKER_HUB_TOKEN environment variable must be set" + - sh: '[ ! -z "$DOCKER_HUB_USERNAME" ]' + msg: "DOCKER_HUB_USERNAME environment variable must be set" + + login:ghcr: + desc: Login to GitHub Container Registry + silent: true + cmds: + - echo "$GITHUB_TOKEN" | docker login ghcr.io -u $GITHUB_ACTOR --password-stdin + preconditions: + - sh: '[ ! -z "$GITHUB_TOKEN" ]' + msg: "GITHUB_TOKEN environment variable must be set" + - sh: '[ ! -z "$GITHUB_ACTOR" ]' + msg: "GITHUB_ACTOR environment variable must be set" + + build:local: + desc: Build images for local docker-compose (single platform) + silent: true + cmds: + - task: setup + vars: { PG_MAJOR: "{{.PG_MAJOR}}" } + - | + docker buildx build \ + --builder pg_lake_builder_pg{{.PG_MAJOR}} \ + --platform linux/{{ARCH}} \ + --target pg_lake_postgres \ + --build-arg PG_MAJOR={{.PG_MAJOR}} \ + --build-arg BASE_IMAGE_OS={{.BASE_IMAGE_OS}} \ + --build-arg BASE_IMAGE_TAG={{.BASE_IMAGE_TAG}} \ + -t pg_lake:local-pg{{.PG_MAJOR}}-{{.BASE_IMAGE_OS}} \ + -t pg_lake:local-pg{{.PG_MAJOR}} \ + -t pg_lake:local \ + --load \ + -f Dockerfile \ + --progress=plain \ + . 
+ - | + docker buildx build \ + --builder pg_lake_builder_pg{{.PG_MAJOR}} \ + --platform linux/{{ARCH}} \ + --target pgduck_server \ + --build-arg PG_MAJOR={{.PG_MAJOR}} \ + --build-arg BASE_IMAGE_OS={{.BASE_IMAGE_OS}} \ + --build-arg BASE_IMAGE_TAG={{.BASE_IMAGE_TAG}} \ + -t pgduck-server:local-pg{{.PG_MAJOR}}-{{.BASE_IMAGE_OS}} \ + -t pgduck-server:local-pg{{.PG_MAJOR}} \ + -t pgduck-server:local \ + --load \ + -f Dockerfile \ + --progress=plain \ + . + vars: + ARCH: + sh: uname -m | sed 's/x86_64/amd64/;s/aarch64/arm64/' + + # TODO:need to make muslc with duckdb extensions build + # build:local-alpine: + # desc: Build images with Alpine Linux (experimental - builds DuckDB extensions from source) + # silent: true + # cmds: + # - task: setup + # vars: { PG_MAJOR: "{{.PG_MAJOR}}" } + # - | + # docker buildx build \ + # --builder pg_lake_builder_pg{{.PG_MAJOR}} \ + # --platform linux/{{ARCH}} \ + # --target pg_lake_postgres \ + # --build-arg PG_MAJOR={{.PG_MAJOR}} \ + # -t pg_lake:alpine-pg{{.PG_MAJOR}} \ + # -t pg_lake:alpine \ + # -t pg_lake:local \ + # --load \ + # -f Dockerfile.alpine \ + # --progress=plain \ + # . + # - | + # docker buildx build \ + # --builder pg_lake_builder_pg{{.PG_MAJOR}} \ + # --platform linux/{{ARCH}} \ + # --target pgduck_server \ + # --build-arg PG_MAJOR={{.PG_MAJOR}} \ + # -t pgduck-server:alpine-pg{{.PG_MAJOR}} \ + # -t pgduck-server:alpine \ + # -t pgduck-server:local \ + # --load \ + # -f Dockerfile.alpine \ + # --progress=plain \ + # . + # vars: + # ARCH: + # sh: uname -m | sed 's/x86_64/amd64/;s/aarch64/arm64/' + + compose:up: + desc: Build images and start docker-compose + silent: true + cmds: + - task: build:local + vars: { PG_MAJOR: "{{.PG_MAJOR}}" } + - PG_MAJOR={{.PG_MAJOR}} docker-compose up -d + - echo "Services started! 
Check logs with 'docker-compose logs -f'" + + compose:down: + desc: Stop and remove docker-compose services + silent: true + cmds: + - PG_MAJOR={{.PG_MAJOR}} docker-compose down + + compose:logs: + desc: View docker-compose logs (optionally for specific SERVICE) + silent: true + cmds: + - PG_MAJOR={{.PG_MAJOR}} docker-compose logs -f {{.SERVICE}} + + compose:restart: + desc: Restart docker-compose services + silent: true + cmds: + - PG_MAJOR={{.PG_MAJOR}} docker-compose restart + + compose:teardown: + desc: Stop services and remove volumes (complete cleanup) + silent: true + cmds: + - PG_MAJOR={{.PG_MAJOR}} docker-compose down --volumes + - echo "โš ๏ธ All services stopped and volumes removed" + + images:list: + desc: List all built pg_lake images with architecture + silent: true + cmds: + - | + echo "๐Ÿ“ฆ pg_lake and pgduck-server images:" + echo "" + printf "%-30s %-15s %-15s %-12s %-20s\n" "REPOSITORY:TAG" "IMAGE ID" "ARCHITECTURE" "SIZE" "CREATED" + printf "%-30s %-15s %-15s %-12s %-20s\n" "------------------------------" "---------------" "---------------" "------------" "--------------------" + docker images --format "{{"{{"}}.Repository}}\t{{"{{"}}.Tag}}\t{{"{{"}}.ID}}\t{{"{{"}}.Size}}\t{{"{{"}}.CreatedSince}}" | grep -E "(pg_lake|pgduck-server)" | while IFS=$'\t' read -r repo tag id size created; do + arch=$(docker inspect "$id" --format='{{"{{"}}.Architecture}}') + printf "%-30s %-15s %-15s %-12s %-20s\n" "$repo:$tag" "${id:0:12}" "$arch" "$size" "$created" + done || echo "No pg_lake images found" + + images:clean: + desc: Remove all local pg_lake images + silent: true + cmds: + - docker rmi --force $(docker images -q pg_lake) 2>/dev/null || echo "No pg_lake images to remove" + - docker rmi --force $(docker images -q pgduck-server) 2>/dev/null || echo "No pgduck-server images to + remove" + - task: images:list + + s3:list: + desc: List S3 bucket contents (LocalStack) + silent: true + cmds: + - | + echo "๐Ÿ“ฆ S3 Bucket Contents (s3://testbucket/pg_lake/):" 
+ echo "" + docker exec localstack-main awslocal s3 ls s3://testbucket/pg_lake/ --recursive | \ + awk '{print $4}' | \ + sort | \ + awk -F/ '{ + depth = NF - 1 + for(i=1; i<depth; i++) printf "│   " + if (depth > 0) printf "├── " + print $NF + }' || echo "No objects found or LocalStack not running" diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 3865a803..d4fb5d1b 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -1,24 +1,15 @@ services: pg_lake-postgres: - build: - context: . - dockerfile: Dockerfile - target: pg_lake_postgres - args: - - BASE_IMAGE_OS=almalinux - - BASE_IMAGE_TAG=9 + image: pg_lake:local + pull_policy: if_not_present container_name: pg_lake + ports: + - "5432:5432" volumes: - - ${USERPROFILE}${HOME}/.ssh:/home/postgres/.ssh:ro - - ${USERPROFILE}${HOME}/.ssh/known_hosts:/home/postgres/.ssh/known_hosts:rw - - ${USERPROFILE}${HOME}/.gitconfig:/home/postgres/.gitconfig:ro - - ${USERPROFILE}${HOME}/.aws:/home/postgres/.aws:rw - ./scripts/entrypoint-postgres.sh:/entrypoint-postgres.sh - ./scripts/init-postgres.sql:/init-postgres.sql - pgduck-unix-socket-volume:/home/postgres/pgduck_socket_dir - - pg-18-tmp-dir-volume:/home/postgres/pgsql-18/data/base/pgsql_tmp - - pg-17-tmp-dir-volume:/home/postgres/pgsql-17/data/base/pgsql_tmp - - pg-16-tmp-dir-volume:/home/postgres/pgsql-16/data/base/pgsql_tmp + - pg-shared-tmp-dir-volume:/home/postgres/pgsql-${PG_MAJOR:-18}/data/base/pgsql_tmp entrypoint: ["/entrypoint-postgres.sh"] restart: unless-stopped healthcheck: @@ -27,33 +18,24 @@ services: timeout: 2s start_period: 20s retries: 3 - env_file: - - .env cap_add: - SYS_PTRACE depends_on: - pgduck-server pgduck-server: - build: - context: .
-      dockerfile: Dockerfile
-      target: pgduck_server
-      args:
-        - BASE_IMAGE_OS=almalinux
-        - BASE_IMAGE_TAG=9
+    image: pgduck-server:local
+    pull_policy: if_not_present
     container_name: pgduck-server
-    environment:
-      AWS_SHARED_CREDENTIALS_FILE: /home/postgres/.aws/credentials
-      AWS_CONFIG_FILE: /home/postgres/.aws/config
+    # NOTE: pgduck-server listens only on a Unix socket, not on TCP, so no port is published here.
+    # To reach DuckDB from the host, connect through the pg_lake-postgres container, which shares the Unix socket volume.
+    # Example: psql -h localhost -p 5432 -U postgres -c "SELECT * FROM duckdb_fdw.some_table"
+    # IMPORTANT: both containers must mount the same temp directory volume so pgduck_server can access temp files.
     volumes:
       - ./scripts/entrypoint-pgduck-server.sh:/entrypoint-pgduck-server.sh
       - ./scripts/init-pgduck-server.sql:/init-pgduck-server.sql
-      - ${USERPROFILE}${HOME}/.aws:/home/postgres/.aws:ro
       - pgduck-unix-socket-volume:/home/postgres/pgduck_socket_dir
-      - pg-18-tmp-dir-volume:/home/postgres/pgsql-18/data/base/pgsql_tmp
-      - pg-17-tmp-dir-volume:/home/postgres/pgsql-17/data/base/pgsql_tmp
-      - pg-16-tmp-dir-volume:/home/postgres/pgsql-16/data/base/pgsql_tmp
+      - pg-shared-tmp-dir-volume:/home/postgres/pgsql-${PG_MAJOR:-18}/data/base/pgsql_tmp
     entrypoint: ["/entrypoint-pgduck-server.sh"]
     restart: unless-stopped
     healthcheck:
@@ -62,28 +44,25 @@ services:
       timeout: 2s
       start_period: 20s
       retries: 3
-    env_file:
-      - .env
     cap_add:
       - SYS_PTRACE
     depends_on:
-      - minio
+      - localstack
 
-  minio:
-    image: minio/minio
-    container_name: minio
+  localstack:
+    container_name: "${LOCALSTACK_DOCKER_NAME:-localstack-main}"
+    image: localstack/localstack
+    pull_policy: if_not_present
+    ports:
+      - "127.0.0.1:4566:4566"  # LocalStack gateway, bound to the host loopback only
+      - "127.0.0.1:4510-4559:4510-4559"  # external services port range
+    environment:
+      # LocalStack configuration reference: https://docs.localstack.cloud/references/configuration/
+      - DEBUG=${DEBUG:-0}
     volumes:
-      - ./scripts/entrypoint-minio.sh:/entrypoint-minio.sh
-    entrypoint:
["/entrypoint-minio.sh"]
-    restart: unless-stopped
-    healthcheck:
-      test: ["CMD", "curl", "http://localhost:9000"]
-      interval: 6s
-      timeout: 2s
-      retries: 3
+      - "${LOCALSTACK_VOLUME_DIR:-./.volume}:/var/lib/localstack"
+      - "./scripts/init-s3.sh:/etc/localstack/init/ready.d/init-s3.sh"
 
 volumes:
   pgduck-unix-socket-volume:
-  pg-18-tmp-dir-volume:
-  pg-17-tmp-dir-volume:
-  pg-16-tmp-dir-volume:
+  pg-shared-tmp-dir-volume: # Shared temp directory for both PostgreSQL and pgduck_server
diff --git a/docker/scripts/entrypoint-minio.sh b/docker/scripts/entrypoint-minio.sh
deleted file mode 100755
index 4edfd45a..00000000
--- a/docker/scripts/entrypoint-minio.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-trap "echo 'Caught termination signal. Exiting...'; exit 0" SIGINT SIGTERM
-
-minio server /data &
-
-minio_pid=$!
-
-while ! curl http://localhost:9000; do
-  echo "Waiting for http://localhost:9000..."
-  sleep 1
-done
-
-# set access key and secret key
-mc alias set local http://localhost:9000 minioadmin minioadmin
-
-# create test bucket
-mc mb local/testbucket
-
-wait $minio_pid
diff --git a/docker/scripts/entrypoint-pgduck-server.sh b/docker/scripts/entrypoint-pgduck-server.sh
index 05381b29..40f3fd24 100755
--- a/docker/scripts/entrypoint-pgduck-server.sh
+++ b/docker/scripts/entrypoint-pgduck-server.sh
@@ -3,12 +3,30 @@ set -euo pipefail
 
 trap "echo 'Caught termination signal.
Exiting...'; exit 0" SIGINT SIGTERM
 
-# bind volumes have root permission at start (make this readable and writable by postgres)
-sudo chown -R postgres:postgres /home/postgres/pgduck_socket_dir
-# Start pgduck_server
-cd /home/postgres/pg_lake/pgduck_server
-./pgduck_server --cache_dir ~/cache --unix_socket_directory ~/pgduck_socket_dir --unix_socket_group postgres --init_file_path /init-pgduck-server.sql &
+# Ensure PGBASEDIR and PG_MAJOR are set (defaults keep the script usable under `set -u`)
+PGBASEDIR=${PGBASEDIR:-/home/postgres}
+PG_MAJOR=${PG_MAJOR:-18}
+
+# Create and fix permissions for directories BEFORE starting pgduck_server
+# Docker volumes are created with root ownership, but postgres user needs write access
+mkdir -p "${PGBASEDIR}/pgduck_socket_dir"
+mkdir -p "${PGBASEDIR}/pgsql-${PG_MAJOR}/data/base/pgsql_tmp"
+sudo chown -R postgres:postgres "${PGBASEDIR}/pgduck_socket_dir"
+sudo chown -R postgres:postgres "${PGBASEDIR}/pgsql-${PG_MAJOR}/data/base/pgsql_tmp"
+sudo chmod 700 "${PGBASEDIR}/pgduck_socket_dir"
+sudo chmod 700 "${PGBASEDIR}/pgsql-${PG_MAJOR}/data/base/pgsql_tmp"
+
+# Start pgduck_server using the binary from the PostgreSQL bin directory
+# NOTE: pgduck_server only listens on Unix sockets, not TCP
+# The --port is used to create the socket file name (e.g., .s.PGSQL.5332)
+# To connect from host, you need to access through the Unix socket or via the pg_lake-postgres container
+"${PGBASEDIR}/pgsql-${PG_MAJOR}/bin/pgduck_server" \
+  --cache_dir ~/cache \
+  --unix_socket_directory ~/pgduck_socket_dir \
+  --unix_socket_group postgres \
+  --port 5332 \
+  --init_file_path /init-pgduck-server.sql &
 
 pgduck_server_pid=$!
 
 wait $pgduck_server_pid
diff --git a/docker/scripts/entrypoint-postgres.sh b/docker/scripts/entrypoint-postgres.sh
index 6893aea4..5095b491 100755
--- a/docker/scripts/entrypoint-postgres.sh
+++ b/docker/scripts/entrypoint-postgres.sh
@@ -4,24 +4,35 @@ set -euo pipefail
 
 trap "echo 'Caught termination signal.
Exiting...'; exit 0" SIGINT SIGTERM
 
-# Update pg_hba.conf
-echo "local all all trust" | tee $PGBASEDIR/pgsql-$PG_MAJOR/data/pg_hba.conf
-echo "host all all 127.0.0.1/32 trust" | tee -a $PGBASEDIR/pgsql-$PG_MAJOR/data/pg_hba.conf
-echo "host all all ::1/128 trust" | tee -a $PGBASEDIR/pgsql-$PG_MAJOR/data/pg_hba.conf
-
-# Update postgresql.conf
-echo "port = 5432" | tee -a $PGBASEDIR/pgsql-$PG_MAJOR/data/postgresql.conf
-echo "shared_preload_libraries = 'pg_extension_base'" | tee -a $PGBASEDIR/pgsql-$PG_MAJOR/data/postgresql.conf
-echo "pg_lake_iceberg.default_location_prefix = 's3://testbucket/pg_lake/'" | tee -a $PGBASEDIR/pgsql-$PG_MAJOR/data/postgresql.conf
-echo "pg_lake_engine.host = 'host=/home/postgres/pgduck_socket_dir port=5332'" | tee -a $PGBASEDIR/pgsql-$PG_MAJOR/data/postgresql.conf
+# Ensure PGBASEDIR and PG_MAJOR are set (defaults keep the script usable under `set -u`)
+PGBASEDIR=${PGBASEDIR:-/home/postgres}
+PG_MAJOR=${PG_MAJOR:-18}
 
-# Start PostgreSQL server
-pg_ctl -D $PGBASEDIR/pgsql-$PG_MAJOR/data start -l $PGBASEDIR/pgsql-$PG_MAJOR/data/logfile
+# Create and fix permissions for temporary directory BEFORE starting PostgreSQL
+# Docker volumes are created with root ownership, but postgres user needs write access
+mkdir -p "${PGBASEDIR}/pgsql-${PG_MAJOR}/data/base/pgsql_tmp"
+sudo chown -R postgres:postgres "${PGBASEDIR}/pgsql-${PG_MAJOR}/data/base/pgsql_tmp"
+sudo chmod 700 "${PGBASEDIR}/pgsql-${PG_MAJOR}/data/base/pgsql_tmp"
 
-# bind volumes have root permission at start (make this readable and writable by postgres)
-sudo chown -R postgres:postgres /home/postgres/pgsql-{16,17,18}/data/base/pgsql_tmp
+# Update pg_hba.conf (first write truncates the file, the rest append)
+echo "local all all trust" | tee "${PGBASEDIR}/pgsql-${PG_MAJOR}/data/pg_hba.conf"
+echo "host all all 127.0.0.1/32 trust" | tee -a "${PGBASEDIR}/pgsql-${PG_MAJOR}/data/pg_hba.conf"
+echo "host all all ::1/128 trust" | tee -a "${PGBASEDIR}/pgsql-${PG_MAJOR}/data/pg_hba.conf"
+echo "host all all 0.0.0.0/0 trust" | tee -a "${PGBASEDIR}/pgsql-${PG_MAJOR}/data/pg_hba.conf"
 
-# Run initialization
script
-psql -U postgres -f /init-postgres.sql
+# Update postgresql.conf
+# WARNING: development-only settings, NOT RECOMMENDED FOR PRODUCTION.
+# listen_addresses = '*' accepts connections on every interface so the database can be reached from outside the container.
+echo "listen_addresses = '*'" | tee -a "${PGBASEDIR}/pgsql-${PG_MAJOR}/data/postgresql.conf"
+echo "port = 5432" | tee -a "${PGBASEDIR}/pgsql-${PG_MAJOR}/data/postgresql.conf"
+echo "shared_preload_libraries = 'pg_extension_base'" | tee -a "${PGBASEDIR}/pgsql-${PG_MAJOR}/data/postgresql.conf"
+echo "pg_lake_iceberg.default_location_prefix = 's3://testbucket/pg_lake/'" | tee -a "${PGBASEDIR}/pgsql-${PG_MAJOR}/data/postgresql.conf"
+echo "pg_lake_engine.host = 'host=${PGBASEDIR}/pgduck_socket_dir port=5332'" | tee -a "${PGBASEDIR}/pgsql-${PG_MAJOR}/data/postgresql.conf"
+
+# Start PostgreSQL server using explicit path
+"${PGBASEDIR}/pgsql-${PG_MAJOR}/bin/pg_ctl" -D "${PGBASEDIR}/pgsql-${PG_MAJOR}/data" start -l "${PGBASEDIR}/pgsql-${PG_MAJOR}/data/logfile"
+
+# Run initialization script using explicit path
+"${PGBASEDIR}/pgsql-${PG_MAJOR}/bin/psql" -U postgres -f /init-postgres.sql
 
 sleep infinity
diff --git a/docker/scripts/init-pgduck-server.sql b/docker/scripts/init-pgduck-server.sql
index 01cb3edf..425acc1c 100644
--- a/docker/scripts/init-pgduck-server.sql
+++ b/docker/scripts/init-pgduck-server.sql
@@ -1,2 +1,11 @@
--- create minio secret
-create secret s3miniosecret(type s3, scope 's3://testbucket', use_ssl false, key_id 'minioadmin', secret 'minioadmin', url_style 'path', endpoint 'minio:9000');
+-- create localstack aws secret
+CREATE SECRET s3_localstack_secret(
+    TYPE s3,
+    scope 's3://testbucket',
+    use_ssl false,
+    key_id 'test',
+    secret 'test',
+    url_style 'path',
+    endpoint 'localstack:4566',
+    region 'us-east-1'
+);
diff --git a/docker/scripts/init-s3.sh b/docker/scripts/init-s3.sh
new file mode 100755
index 00000000..81e47c08
--- /dev/null
+++ b/docker/scripts/init-s3.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+set -euo pipefail
+
+trap
"echo 'Caught termination signal. Exiting...'; exit 0" SIGINT SIGTERM
+
+# Create test bucket if it doesn't exist (probe output is discarded; only the exit status matters)
+if ! awslocal s3 ls s3://testbucket >/dev/null 2>&1; then
+  echo "Creating S3 bucket: testbucket"
+  awslocal s3 mb s3://testbucket
+  echo "Bucket created successfully"
+else
+  echo "Bucket testbucket already exists"
+fi
+
+echo "S3 initialization complete!"