lfedgeai · dmitryk-dk · May 4, 2026 · May 4, 2026 · May 4, 2026 · May 5, 2026
diff --git a/research/telemetry_storage_backend/Makefile b/research/telemetry_storage_backend/Makefile
@@ -4,7 +4,7 @@ OUT   := $(ROOT)/out
 DATA_DIR ?= $(ROOT)/telemetry_data
 BATCH ?= 5000
 
-.PHONY: up down up-compare up-otel schema load query bench bench-compare bench-otlp
+.PHONY: up down up-compare up-vm up-otel schema load query bench bench-compare bench-vm bench-otlp
 
 up:
 	docker compose up -d doris
@@ -15,7 +15,7 @@ up:
 
 up-compare:
 	mkdir -p druid_data/var druid_data/shared
-	docker compose -f docker-compose.yml -f docker-compose.druid.yml -f docker-compose.oceanbase.yml -f docker-compose.loki.yml up -d
+	docker compose -f docker-compose.yml -f docker-compose.druid.yml -f docker-compose.oceanbase.yml -f docker-compose.loki.yml -f docker-compose.victoriametrics.yml up -d
 	@echo "Waiting for Doris health..."
 	@for i in {1..60}; do \
 	  curl -sf http://localhost:8030/api/health && echo "Doris is healthy" && break || sleep 2; \
@@ -36,9 +36,21 @@ up-compare:
 	@for i in {1..30}; do \
 	  nc -z 127.0.0.1 3100 && echo "Loki is ready" && break || sleep 2; \
 	done
+	@echo "Waiting for VictoriaMetrics (port 8428)..."
+	@for i in {1..30}; do \
+	  curl -sf http://localhost:8428/-/healthy && echo "VictoriaMetrics is ready" && break || sleep 2; \
+	done
+	@echo "Waiting for VictoriaLogs (port 9428)..."
+	@for i in {1..30}; do \
+	  curl -sf http://localhost:9428/-/healthy && echo "VictoriaLogs is ready" && break || sleep 2; \
+	done
+	@echo "Waiting for VictoriaTraces (port 10428)..."
+	@for i in {1..30}; do \
+	  curl -sf http://localhost:10428/-/healthy && echo "VictoriaTraces is ready" && break || sleep 2; \
+	done
 
 up-otel:
-	docker compose -f docker-compose.yml -f docker-compose.druid.yml -f docker-compose.oceanbase.yml -f docker-compose.otel.yml up -d
+	docker compose -f docker-compose.yml -f docker-compose.druid.yml -f docker-compose.oceanbase.yml -f docker-compose.loki.yml -f docker-compose.victoriametrics.yml -f docker-compose.otel.yml up -d
 	@echo "Waiting for Doris health..."
 	@for i in {1..60}; do \
 	  curl -sf http://localhost:8030/api/health && echo "Doris is healthy" && break || sleep 2; \
@@ -47,13 +59,40 @@ up-otel:
 	@for i in {1..60}; do \
 	  curl -sf http://localhost:28223/ping && echo "ClickHouse is healthy" && break || sleep 2; \
 	done
+	@echo "Waiting for VictoriaMetrics (port 8428)..."
+	@for i in {1..30}; do \
+	  curl -sf http://localhost:8428/-/healthy && echo "VictoriaMetrics is ready" && break || sleep 2; \
+	done
+	@echo "Waiting for VictoriaLogs (port 9428)..."
+	@for i in {1..30}; do \
+	  curl -sf http://localhost:9428/-/healthy && echo "VictoriaLogs is ready" && break || sleep 2; \
+	done
+	@echo "Waiting for VictoriaTraces (port 10428)..."
+	@for i in {1..30}; do \
+	  curl -sf http://localhost:10428/-/healthy && echo "VictoriaTraces is ready" && break || sleep 2; \
+	done
 	@echo "Waiting for OTLP collector..."
 	@for i in {1..30}; do \
 	  nc -z 127.0.0.1 4317 && echo "OTLP collector ready" && break || sleep 2; \
 	done
 
+up-vm:
+	docker compose -f docker-compose.victoriametrics.yml up -d
+	@echo "Waiting for VictoriaMetrics (port 8428)..."
+	@for i in {1..30}; do \
+	  curl -sf http://localhost:8428/-/healthy && echo "VictoriaMetrics is ready" && break || sleep 2; \
+	done
+	@echo "Waiting for VictoriaLogs (port 9428)..."
+	@for i in {1..30}; do \
+	  curl -sf http://localhost:9428/-/healthy && echo "VictoriaLogs is ready" && break || sleep 2; \
+	done
+	@echo "Waiting for VictoriaTraces (port 10428)..."
+	@for i in {1..30}; do \
+	  curl -sf http://localhost:10428/-/healthy && echo "VictoriaTraces is ready" && break || sleep 2; \
+	done
+
 down:
-	docker compose -f docker-compose.yml -f docker-compose.druid.yml -f docker-compose.otel.yml -f docker-compose.oceanbase.yml -f docker-compose.loki.yml down -v
+	docker compose -f docker-compose.yml -f docker-compose.druid.yml -f docker-compose.otel.yml -f docker-compose.oceanbase.yml -f docker-compose.loki.yml -f docker-compose.victoriametrics.yml down -v
 
 schema:
 	@echo "Applying Doris schema..."
@@ -76,6 +115,7 @@ bench-compare:
 	@echo "Running Doris vs ClickHouse vs Druid comparison..."
 	DORIS_USER="root" DORIS_PASS="" CLICKHOUSE_HTTP="http://localhost:28223" CLICKHOUSE_PASSWORD="changeme" \
 	LOKI_HTTP="http://localhost:3100" \
+	VM_HTTP="http://localhost:8428" VL_HTTP="http://localhost:9428" VT_HTTP="http://localhost:10428" \
 	python3 runner/bench_compare.py --all --data-dir "$(DATA_DIR)" --out "$(OUT)" --batch $(BATCH) $(SCALE_ARGS) $(STREAMING_ARGS)
 
 # Scaling: BATCH=10000 make bench-compare
@@ -86,10 +126,18 @@ SCALE_ARGS := $(if $(SCALE_TO),--scale-to $(SCALE_TO),)
 STREAMING_BATCH ?=
 STREAMING_ARGS := $(if $(STREAMING_BATCH),--streaming-batch $(STREAMING_BATCH),)
 
+bench-vm:
+	@mkdir -p "$(OUT)"
+	@echo "Running VictoriaMetrics-only benchmark..."
+	VM_HTTP="http://localhost:8428" VL_HTTP="http://localhost:9428" VT_HTTP="http://localhost:10428" \
+	python3 runner/bench_compare.py --all --vm-only --data-dir "$(DATA_DIR)" --out "$(OUT)" --batch $(BATCH) $(SCALE_ARGS) $(STREAMING_ARGS)
+
 bench-otlp:
 	@mkdir -p "$(OUT)"
 	@echo "Running benchmark with OTLP ingestion ($(OTLP_COUNT) spans, $(OTLP_COUNT) logs, $(OTLP_COUNT) metrics)..."
-	DORIS_USER="root" DORIS_PASS="" CLICKHOUSE_HTTP="http://localhost:28223" CLICKHOUSE_PASSWORD="changeme" python3 runner/bench_compare.py \
+	DORIS_USER="root" DORIS_PASS="" CLICKHOUSE_HTTP="http://localhost:28223" CLICKHOUSE_PASSWORD="changeme" \
+	VM_HTTP="http://localhost:8428" VL_HTTP="http://localhost:9428" VT_HTTP="http://localhost:10428" \
+	python3 runner/bench_compare.py \
 	  --data-dir "$(DATA_DIR)" --out "$(OUT)" --all --otlp --otlp-count $(OTLP_COUNT)
 OTLP_COUNT ?= 1000
 

diff --git a/research/telemetry_storage_backend/README.md b/research/telemetry_storage_backend/README.md
@@ -33,12 +33,17 @@ Tech Stack: Python (Pandas/NumPy), k8s
 
 ## 📂 Repository Structure
 ```text
-├── loeaders/           # logic to load the data
+├── loaders/            # logic to load data into each backend
 ├── out/                # benchmark run test results
 ├── queries/            # Queries to produce the performance benchmark
+│   ├── doris/          # SQL queries for Doris
+│   ├── clickhouse/     # SQL queries for ClickHouse
+│   ├── druid/          # SQL queries for Druid
+│   ├── oceanbase/      # SQL queries for OceanBase
+│   └── victoriametrics/  # MetricsQL, LogQL, TraceQL queries
 ├── runner/             # benchmark run logic
 ├── docs/               # In-depth documentation and literature review
-├── schemas/            # backend storage chemas
+├── schemas/            # backend storage schemas
 ├── telemetry_data/     # static logs, metrics, traces and metadata
 └── README.md           # This file
 ```
@@ -53,16 +58,20 @@ This harness replays pre-collected OpenTelemetry-like ground-truth (`telemetry_d
 - `docker-compose.yml` — Doris + ClickHouse for comparison trials
 - `docker-compose.druid.yml` — Druid (extends main)
 - `docker-compose.oceanbase.yml` — OceanBase CE (extends main)
+- `docker-compose.victoriametrics.yml` — VictoriaMetrics stack (metrics + logs + traces)
 - `schemas/doris.sql`, `schemas/clickhouse.sql`, `schemas/oceanbase.sql` — database and tables (logs, spans, metrics)
 - `loaders/replay_doris.py` — Doris replayer using Stream Load HTTP API
 - `loaders/replay_clickhouse.py` — ClickHouse replayer via HTTP
 - `loaders/replay_druid.py` — Druid native batch ingestion
 - `loaders/replay_oceanbase.py` — OceanBase via MySQL protocol (pymysql)
+- `loaders/replay_victoriametrics.py` — VictoriaMetrics replayer (Prometheus remote write for metrics, Loki API for logs, OTLP HTTP for traces)
+- `loaders/remote_write.py` — Prometheus remote write encoding helper
 - `queries/{doris,clickhouse,druid,oceanbase}/*.sql` — canonical queries per backend
+- `queries/victoriametrics/*.{metricsql,logql,traceql}` — VictoriaMetrics queries (MetricsQL, LogQL, TraceQL)
 - `runner/bench.py` — Doris-only: schema → load → queries → report
-- `runner/bench_compare.py` — Doris vs ClickHouse vs Druid vs OceanBase: same flow on all, combined report
+- `runner/bench_compare.py` — Doris vs ClickHouse vs Druid vs OceanBase vs VictoriaMetrics: same flow on all, combined report
 - `out/` — run outputs (`storage_bench_doris_<ts>/`, `storage_bench_compare_<ts>/`, `rolling_index.html`)
-- `otel-collector-config.yaml` — OTLP receiver → Doris + ClickHouse exporters
+- `otel-collector-config.yaml` — OTLP receiver → Doris + ClickHouse + VictoriaMetrics exporters
 - `docker-compose.otel.yml` — OTLP collector service (extends main compose)
 - `runner/run_otlp_ingest.py` — sends 1000 spans/logs/metrics via telemetrygen → collector
 - `runner/map_otlp_to_telemetry.py` — maps `otel.*` → `telemetry.*` so queries use batch + OTLP data
@@ -77,29 +86,37 @@ make bench               # run benchmark (uses telemetry_data/)
 make down                # stop
 ```
 
-**Doris vs ClickHouse vs Druid vs OceanBase comparison:**
+**Doris vs ClickHouse vs Druid vs OceanBase vs VictoriaMetrics comparison:**
 ```bash
-make up-compare          # start Doris + ClickHouse + Druid + OceanBase
+make up-compare          # start Doris + ClickHouse + Druid + OceanBase + VictoriaMetrics
 make bench-compare       # run comparison benchmark (file load only, no telemetrygen)
 make bench-compare SCALE_TO=5000   # scale to 5k rows per type
 make down
 ```
 
+**VictoriaMetrics-only:**
+```bash
+make up-vm               # start VictoriaMetrics + VictoriaLogs + VictoriaTraces
+make bench-vm            # run VM-only benchmark
+make down
+```
+
 **OTLP ingestion (telemetrygen → collector):**
 ```bash
-make up-otel             # start stack + OTLP collector
-make bench-otlp          # file load + telemetrygen (1000 spans, 1000 logs, 1000 metrics) via OTLP → Doris + ClickHouse
+make up-otel             # start stack + OTLP collector (includes VictoriaMetrics)
+make bench-otlp          # file load + telemetrygen (1000 spans, 1000 logs, 1000 metrics) via OTLP → Doris + ClickHouse + VictoriaMetrics
 make down
 ```
 
 ## Data sources
 
 See `docs/DATA_SOURCES.md` for 50k correlated benchmark options.
 
-| Run | Data source | Notes |
-|-----|-------------|-------|
-| `bench-compare` | `telemetry_data/` | Pre-collected files (logs_*.txt, traces_*.json, metrics_*.json). No telemetrygen. |
-| `bench-otlp` | `telemetry_data/` + telemetrygen | Same file load; additionally sends 1000 spans/logs/metrics via telemetrygen → OTLP collector → Doris + ClickHouse. |
+| Run             | Data source                    | Notes                                                                                                                                             |
+|-----------------|--------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------|
+| `bench-compare` | `telemetry_data/`              | Pre-collected files (logs_*.txt, traces_*.json, metrics_*.json). No telemetrygen.                                                                 |
+| `bench-vm`      | `telemetry_data/`              | Same file load, VictoriaMetrics stack only.                                                                                                       |
+| `bench-otlp`    | `telemetry_data/` + telemetrygen | Same file load; additionally sends 1000 spans/logs/metrics via telemetrygen → OTLP collector → Doris + ClickHouse + VictoriaMetrics.             |
 
 Notes:
 - Requires Docker + docker compose.
@@ -117,33 +134,35 @@ Notes:
 
 ## Outputs
 - `out/storage_bench_doris_<ts>/` — Doris-only: `summary.html`, `ingest.json`, `queries.json`
-- `out/storage_bench_compare_<ts>/` — Doris vs ClickHouse vs Druid vs OceanBase: `compare.html`, `*_queries.json` per backend
+- `out/storage_bench_compare_<ts>/` — Doris vs ClickHouse vs Druid vs OceanBase vs VictoriaMetrics: `compare.html`, `*_queries.json` per backend
 - `out/rolling_index.html` — unified index of all runs (newest first)
 
 The ingestion comparison table shows: **Backend | Mechanism | Duration (s) | Rows | Rows/sec**. Mechanism describes the ingest method (e.g. `Batch file load (5000 rows)` or `OTLP (1000 spans, 1000 logs, 1000 metrics)`). With `--otlp`, OTLP rows are appended to the table.
 
 ## Ingestion benchmark (how it works)
 
-**Batch file load** — Same JSON/JSONL files from `telemetry_data/` are replayed into Doris, ClickHouse, Druid, and OceanBase via each loader:
+**Batch file load** — Same JSON/JSONL files from `telemetry_data/` are replayed into Doris, ClickHouse, Druid, OceanBase, and VictoriaMetrics via each loader:
 - Doris: Stream Load HTTP API (`loaders/replay_doris.py`)
 - ClickHouse: HTTP INSERT (`loaders/replay_clickhouse.py`)
 - Druid: Native batch ingestion via Overlord (`loaders/replay_druid.py`)
 - OceanBase: MySQL protocol via pymysql (`loaders/replay_oceanbase.py`)
+- VictoriaMetrics: Prometheus remote write for metrics (`loaders/remote_write.py`), Loki-compatible API for logs, OTLP HTTP for traces (`loaders/replay_victoriametrics.py`)
 
-All four backends get identical data. Ingest duration and rows/sec are measured per backend. **Message size** is capped at 200KB in `loaders/common.py` to avoid huge Druid/OceanBase files when scaling.
+All five backends get identical data. Ingest duration and rows/sec are measured per backend. **Message size** is capped at 200KB in `loaders/common.py` to avoid huge Druid/OceanBase files when scaling.
 
-**OTLP ingestion** — When `--otlp` is used, telemetrygen sends N spans, N logs, and N metrics (gRPC) to the OpenTelemetry Collector (`otel-collector-config.yaml`). The collector batches and exports to Doris and ClickHouse only. Rows are counted in `otel.*` tables after a short flush delay; duration is end-to-end (telemetrygen start → last batch exported). The OTLP data is then **mapped into `telemetry.*`** via `runner/map_otlp_to_telemetry.py` (INSERT … SELECT from `otel.*` with column mapping), so canonical queries run against batch + OTLP data combined.
+**OTLP ingestion** — When `--otlp` is used, telemetrygen sends N spans, N logs, and N metrics (gRPC) to the OpenTelemetry Collector (`otel-collector-config.yaml`). The collector batches and exports to Doris, ClickHouse, and VictoriaMetrics (VictoriaTraces accepts OTLP natively on port 14317/14318). Rows are counted in `otel.*` tables after a short flush delay; duration is end-to-end (telemetrygen start → last batch exported). The OTLP data is then **mapped into `telemetry.*`** via `runner/map_otlp_to_telemetry.py` (INSERT … SELECT from `otel.*` with column mapping), so canonical queries run against batch + OTLP data combined.
 
 **Why Druid is not in OTLP** — The OpenTelemetry Collector has no Druid exporter. Doris and ClickHouse both have official OTLP/contrib exporters; Druid typically ingests OTLP data via Kafka (collector → Kafka → Druid). Adding Druid to the OTLP path would require a Kafka-based pipeline, which this harness does not implement.
 
 ## Query benchmark (how it works)
 
-After ingestion (and OTLP mapping if `--otlp`), the runner executes the same set of SQL queries on each backend. Each query file in `queries/{doris,clickhouse,druid,oceanbase}/*.sql` is run once per backend via its native API:
+After ingestion (and OTLP mapping if `--otlp`), the runner executes the same set of queries on each backend. Each query file in `queries/{doris,clickhouse,druid,oceanbase}/*.sql` and `queries/victoriametrics/*.{metricsql,logql,traceql}` is run once per backend via its native API:
 
 - **Doris** — `mysql` client over Docker (`telemetry.logs`, `telemetry.spans`, `telemetry.metrics`)
 - **ClickHouse** — HTTP POST to `:8123` with `?query=...`
 - **Druid** — HTTP POST to `:8888/druid/v2/sql` with JSON body
 - **OceanBase** — `mysql` client over Docker (port 2881, MySQL-compatible)
+- **VictoriaMetrics** — MetricsQL via `:8428/api/v1/query_range`, LogQL via `:9428/select/logsql/query`, TraceQL via `:10428/api/traces`
 
 **What is measured** — For each query, the runner records:
 - **Latency (s)** — Wall-clock time from query start to completion (includes network, parsing, execution)
@@ -154,6 +173,18 @@ After ingestion (and OTLP mapping if `--otlp`), the runner executes the same set
 - **Query comparison** — Bar chart and table with latency, result row count, and error per backend. Includes a `data_volume` query that runs full COUNT on all three tables. The fastest backend and % difference vs. slowest are shown.
 - All backends query the same data: batch load + (when `--otlp`) mapped OTLP data in `telemetry.*`.
 
+### VictoriaMetrics stack
+
+VictoriaMetrics uses a split architecture — three separate components handle different telemetry signals:
+
+| Component        | Port  | Signal  | Ingest API                         | Query language |
+|------------------|-------|---------|------------------------------------|----------------|
+| VictoriaMetrics  | 8428  | Metrics | Prometheus remote write (`/api/v1/write`) | MetricsQL      |
+| VictoriaLogs     | 9428  | Logs    | Loki-compatible (`/insert/loki/api/v1/push`) | LogQL          |
+| VictoriaTraces   | 10428 | Traces  | OTLP HTTP/gRPC (ports 14317/14318) | TraceQL        |
+
+Environment variables: `VM_HTTP`, `VL_HTTP`, `VT_HTTP` configure endpoints (defaults: `http://localhost:8428`, `:9428`, `:10428`).
+
 ### Query result differences (row count)
 
 Some queries return different row counts across backends:

diff --git a/research/telemetry_storage_backend/docker-compose.otel.yml b/research/telemetry_storage_backend/docker-compose.otel.yml
@@ -13,8 +13,16 @@ services:
       - "4317:4317"   # OTLP gRPC
       - "4318:4318"   # OTLP HTTP
     depends_on:
-      - doris
-      - clickhouse
+      doris:
+        condition: service_healthy
+      clickhouse:
+        condition: service_healthy
+      victoriametrics:
+        condition: service_healthy
+      victorialogs:
+        condition: service_healthy
+      victoriatraces:
+        condition: service_healthy
     networks:
       - default
     restart: unless-stopped

diff --git a/research/telemetry_storage_backend/docker-compose.victoriametrics.yml b/research/telemetry_storage_backend/docker-compose.victoriametrics.yml
@@ -0,0 +1,78 @@
+# VictoriaMetrics stack: metrics + logs + traces
+# Extends main compose: docker compose -f docker-compose.yml -f docker-compose.victoriametrics.yml up -d
+# VictoriaMetrics API: http://localhost:8428
+# VictoriaLogs API: http://localhost:9428
+# VictoriaTraces API: http://localhost:10428, OTLP: 14317/14318
+
+services:
+  victoriametrics:
+    image: victoriametrics/victoria-metrics:v1.142.0
+    container_name: tsb-victoriametrics
+    ports:
+      - "8428:8428"
+    volumes:
+      - vmdata:/storage
+    command:
+      - "--storageDataPath=/storage"
+      - "--httpListenAddr=:8428"
+      - "--retentionPeriod=100y"
+    networks:
+      - default
+    healthcheck:
+      test: ["CMD", "wget", "-q", "-O", "-", "http://127.0.0.1:8428/-/healthy"]
+      interval: 10s
+      timeout: 5s
+      retries: 30
+    restart: unless-stopped
+
+  victorialogs:
+    image: victoriametrics/victoria-logs:v1.50.0
+    container_name: tsb-victorialogs
+    ports:
+      - "9428:9428"
+    volumes:
+      - vldata:/vlogs
+    command:
+      - "--storageDataPath=/vlogs"
+      - "--httpListenAddr=:9428"
+      - "--retentionPeriod=100y"
+    networks:
+      - default
+    healthcheck:
+      test: ["CMD", "wget", "-q", "-O", "-", "http://127.0.0.1:9428/-/healthy"]
+      interval: 10s
+      timeout: 5s
+      retries: 30
+    restart: unless-stopped
+
+  victoriatraces:
+    image: docker.io/victoriametrics/victoria-traces:latest
+    container_name: tsb-victoriatraces
+    ports:
+      - "10428:10428"
+      - "14317:4317"
+      - "14318:4318"
+    volumes:
+      - vtdata:/vtraces
+    command:
+      - "--storageDataPath=/vtraces"
+      - "--httpListenAddr=:10428"
+      - "--retentionPeriod=100y"
+    networks:
+      - default
+    healthcheck:
+      test: ["CMD", "wget", "-q", "-O", "-", "http://127.0.0.1:10428/-/healthy"]
+      interval: 10s
+      timeout: 5s
+      retries: 30
+    restart: unless-stopped
+
+networks:
+  default:
+    name: tsb-net
+    driver: bridge
+
+volumes:
+  vmdata:
+  vldata:
+  vtdata: