diff --git a/QUERIES.md b/QUERIES.md index 3a03ab2..dd97dfe 100644 --- a/QUERIES.md +++ b/QUERIES.md @@ -242,3 +242,368 @@ WITH largest_tables AS ( GROUP BY s.schema_name, s.table_name, s.node['name'] ORDER BY s.schema_name, s.table_name, s.node['name']; ``` + +# Shard Distribution + +```sql + +SELECT + CASE + WHEN size < 1*1024*1024*1024::bigint THEN '<1GB' + WHEN size < 5*1024*1024*1024::bigint THEN '1GB-5GB' + WHEN size < 10*1024*1024*1024::bigint THEN '5GB-10GB' + WHEN size < 50*1024*1024*1024::bigint THEN '10GB-50GB' + ELSE '>=50GB' + END AS size_bucket, + COUNT(*) AS shards_in_bucket, + ROUND(AVG(size)::numeric / 1024 / 1024 / 1024, 2) AS avg_bucket_size_gb + FROM sys.shards + WHERE state = 'STARTED' + GROUP BY size_bucket + ORDER BY + CASE size_bucket + WHEN '<1GB' THEN 1 + WHEN '1GB-5GB' THEN 2 + WHEN '5GB-10GB' THEN 3 + WHEN '10GB-50GB' THEN 4 + ELSE 5 + END; +``` + +## Shard Distribution by Node + +```sql + +SELECT + s.node['name'] as node_name, + CASE + WHEN size < 1*1024*1024*1024::bigint THEN '<1GB' + WHEN size < 5*1024*1024*1024::bigint THEN '1GB-5GB' + WHEN size < 10*1024*1024*1024::bigint THEN '5GB-10GB' + WHEN size < 50*1024*1024*1024::bigint THEN '10GB-50GB' + ELSE '>=50GB' + END AS size_bucket, + COUNT(*) AS shards_in_bucket, + ROUND(AVG(size)::numeric / 1024 / 1024 / 1024, 2) AS avg_bucket_size_gb + FROM sys.shards s + WHERE state = 'STARTED' + GROUP BY node_name, size_bucket + ORDER BY node_name, size_bucket; +``` + +## Active Shard detection +```sql + +SELECT + sh.schema_name, + sh.table_name, + sh.id AS shard_id, primary, node['name'], + sh.partition_ident, + sh.translog_stats['uncommitted_size'] / 1024^2 AS translog_uncommitted_mb, + sh.seq_no_stats['local_checkpoint'] - sh.seq_no_stats['global_checkpoint'] AS checkpoint_delta + FROM + sys.shards AS sh + WHERE + sh.state = 'STARTED' + AND (sh.translog_stats['uncommitted_size'] > 10 * 1024 ^2 -- threshold: e.g., 10MB + OR sh.seq_no_stats['local_checkpoint'] - 
sh.seq_no_stats['global_checkpoint'] > 1000) -- significant lag + ORDER BY + sh.translog_stats['uncommitted_size'] DESC, + checkpoint_delta DESC + limit 10; +``` + +```sql +-- partition-id / values from information_schema table by using a join +ALTER TABLE "TURVO"."shipmentFormFieldData" REROUTE CANCEL SHARD 11 on 'data-hot-8' WITH (allow_primary=False); +``` + +```sql + +SELECT + sh.schema_name, + sh.table_name, + translate(p.values::text, ':{}', '=()') as partition_values, + sh.id AS shard_id, + node['name'], + sh.translog_stats['uncommitted_size'] / 1024^2 AS translog_uncommitted_mb + FROM + sys.shards AS sh + LEFT JOIN information_schema.table_partitions p + ON sh.table_name = p.table_name + AND sh.schema_name = p.table_schema + AND sh.partition_ident = p.partition_ident + WHERE + sh.state = 'STARTED' + AND sh.translog_stats['uncommitted_size'] > 300 * 1024 ^2 -- threshold: e.g., 300MB + AND primary=FALSE + ORDER BY + 6 DESC LIMIT 10; ++-------------+------------------------------+----------------------------+----------+--------------+-------------------------+ +| schema_name | table_name | partition_values | shard_id | node['name'] | translog_uncommitted_mb | ++-------------+------------------------------+----------------------------+----------+--------------+-------------------------+ +| TURVO | shipmentFormFieldData | NULL | 14 | data-hot-6 | 7011.800104141235 | +| TURVO | shipmentFormFieldData | NULL | 27 | data-hot-7 | 5131.491161346436 | +| TURVO | shipmentFormFieldData | NULL | 0 | data-hot-9 | 2460.8706073760986 | +| TURVO | shipmentFormFieldData | NULL | 7 | data-hot-2 | 1501.8993682861328 | +| TURVO | shipmentFormFieldData | NULL | 10 | data-hot-5 | 504.0952272415161 | +| TURVO | shipmentFormFieldData | NULL | 29 | data-hot-3 | 501.0663766860962 | +| TURVO | shipmentFormFieldData | NULL | 16 | data-hot-8 | 497.5628480911255 | +| TURVO | shipmentFormFieldData_events | ("sync_day"=1757376000000) | 3 | data-hot-2 | 481.20221996307373 | +| TURVO | 
shipmentFormFieldData_events | ("sync_day"=1757376000000) | 4 | data-hot-4 | 473.12464427948 | +| TURVO | orderFormFieldData | NULL | 5 | data-hot-1 | 469.4924907684326 | ++-------------+------------------------------+----------------------------+----------+--------------+-------------------------+ + +``` + + +# Segments per Shard + +```sql +SELECT + shard_id, + table_schema, + table_name, + COUNT(*) AS segment_count + FROM sys.segments + GROUP BY shard_id, table_schema, table_name + ORDER BY segment_count DESC + LIMIT 10; +``` + +```sql + +SELECT + s.node['name'] AS node_name, + CASE + WHEN size < 512*1024*1024::bigint THEN '<512MB' + WHEN size < 2.5*1024*1024*1024::bigint THEN '512MB-2.5GB' + WHEN size < 5*1024*1024*1024::bigint THEN '2.5GB-5GB' + WHEN size < 25*1024*1024*1024::bigint THEN '5GB-25GB' + ELSE '>=25GB' + END AS size_bucket, + COUNT(*) AS segments_in_bucket, + ROUND(AVG(size)::numeric / 1024 / 1024 / 1024, 2) AS avg_segment_size_gb + FROM sys.segments s + GROUP BY node_name, size_bucket + ORDER BY node_name, size_bucket; +``` + +### Count retention_lease + +### for a partition + +```sql +cr> SELECT array_length(retention_leases['leases'], 1) as cnt_leases, id from sys.shards WHERE table_name = 'shipmentFormFieldData' AND partition_ident = '04732dpl6or3gd1 + o60o30c1g' order by array_length(retention_leases['leases'], 1); ++------------+----+ +| cnt_leases | id | ++------------+----+ +| 1 | 5 | +| 1 | 4 | +| 1 | 7 | +| 1 | 0 | +| 1 | 3 | +| 1 | 6 | +| 1 | 1 | +| 1 | 2 | ++------------+----+ +SELECT 8 rows in set (0.038 sec) +cr> + +``` + +### for a table + +```sql + +SELECT array_length(retention_leases['leases'], 1) as cnt_leases, id from sys.shards WHERE table_name = 'shipmentFormFieldData' AND array_length(retention_leases['leases'], 1) > 1 order by 1; +``` + + +#### list partition ids + +```sql +cr> SELECT partition_ident, values + FROM information_schema.table_partitions + WHERE table_schema = 'TURVO' + AND table_name = 'shipmentFormFieldData' 
limit 100; ++--------------------------+--------------------------------+ +| partition_ident | values | ++--------------------------+--------------------------------+ +| 04732dhi6srjedhg60o30c1g | {"id_ts_month": 1627776000000} | +| 04732d9o60qj2d9i60o30c1g | {"id_ts_month": 1580515200000} | +| 04732dhj6krj4d1o60o30c1g | {"id_ts_month": 1635724800000} | +| 04732dhg64qj2c1k60o30c1g | {"id_ts_month": 1601510400000} | +| 04732dhk60sjid9i60o30c1g | {"id_ts_month": 1640995200000} | +``` + +cr> SELECT partition_ident, values + FROM information_schema.table_partitions + WHERE table_schema = 'TURVO' + AND table_name = 'shipmentFormFieldData' limit 100; ++--------------------------+--------------------------------+ +| partition_ident | values | ++--------------------------+--------------------------------+ +| 04732dhi6srjedhg60o30c1g | {"id_ts_month": 1627776000000} | +| 04732d9o60qj2d9i60o30c1g | {"id_ts_month": 1580515200000} | +| 04732dhj6krj4d1o60o30c1g | {"id_ts_month": 1635724800000} | +| 04732dhg64qj2c1k60o30c1g | {"id_ts_month": 1601510400000} | +| 04732dhk60sjid9i60o30c1g | {"id_ts_month": 1640995200000} | +| 04732dpk60rjgdpi60o30c1g | {"id_ts_month": 1740787200000} | +| 04732dhp6ooj2e1k60o30c1g | {"id_ts_month": 1696118400000} | +| 04732dhl6or36cpm60o30c1g | {"id_ts_month": 1656633600000} | +| 04732d9p6op38c1g60o30c1g | {"id_ts_month": 1596240000000} | +| 04732dhl6go38c9m60o30c1g | {"id_ts_month": 1654041600000} | +| 04732dpg6orj8d9m60o30c1g | {"id_ts_month": 1706745600000} | +| 04732d9p60sjce9m60o30c1g | {"id_ts_month": 1590969600000} | +| 04732dhi6ko3idpm60o30c1g | {"id_ts_month": 1625097600000} | +| 04732dpj6kr3ge9m60o30c1g | {"id_ts_month": 1735689600000} | +| 04732dhm74s3acho60o30c1g | {"id_ts_month": 1669852800000} | +| 04732dpi6koj8e1o60o30c1g | {"id_ts_month": 1725148800000} | +| 04732dhg6orjgc1o60o30c1g | {"id_ts_month": 1606780800000} | +| 04732dhm6gqjgchk60o30c1g | {"id_ts_month": 1664582400000} | +| 04732d9p70sj2e1k60o30c1g | {"id_ts_month": 
1598918400000} | +| 04732dhk6cr3ecpm60o30c1g | {"id_ts_month": 1643673600000} | +| 04732d9o6kr3ie9i60o30c1g | {"id_ts_month": 1585699200000} | +| 04732dhp60s38e1g60o30c1g | {"id_ts_month": 1690848000000} | +| 04732dhn6kp30e9m60o30c1g | {"id_ts_month": 1675209600000} | +| 04732dpk6oo3adpm60o30c1g | {"id_ts_month": 1746057600000} | +| 04732dpg74p3ac9i60o30c1g | {"id_ts_month": 1709251200000} | +| 04732dph6gqj4c9m60o30c1g | {"id_ts_month": 1714521600000} | +| 04732dhn68qj6c9i60o30c1g | {"id_ts_month": 1672531200000} | +| 04732dhm6sp3cc1o60o30c1g | {"id_ts_month": 1667260800000} | +| 04732dhl64pjccpi60o30c1g | {"id_ts_month": 1651363200000} | +| 04732dph6sp30c1g60o30c1g | {"id_ts_month": 1717200000000} | +| 04732dph74rjichg60o30c1g | {"id_ts_month": 1719792000000} | +| 04732dpj6co32c9i60o30c1g | {"id_ts_month": 1733011200000} | +| 04732dpg64pjge1o60o30c1g | {"id_ts_month": 1701388800000} | +| 04732dpj70pjce1g60o30c1g | {"id_ts_month": 1738368000000} | +| 04732dpk6cq3cd9m60o30c1g | {"id_ts_month": 1743465600000} | +| 04732dhh6sp36d9i60o30c1g | {"id_ts_month": 1617235200000} | +| 04732dpi68q3ec1k60o30c1g | {"id_ts_month": 1722470400000} | +| 04732dho70ojce9m60o30c1g | {"id_ts_month": 1688169600000} | +| 04732dhg6gojge1o60o30c1g | {"id_ts_month": 1604188800000} | +| 04732dhk70rjec9i60o30c1g | {"id_ts_month": 1648771200000} | +| 04732dhj70pj2dho60o30c1g | {"id_ts_month": 1638316800000} | +| 04732dho60pj0dpi60o30c1g | {"id_ts_month": 1680307200000} | +| 04732d9o6co34c1o60o30c1g | {"id_ts_month": 1583020800000} | +| 04732dhj60q3ad1k60o30c1g | {"id_ts_month": 1630454400000} | +| 04732dhg74q3ae9i60o30c1g | {"id_ts_month": 1609459200000} | +| 04732dhl74pj2chg60o30c1g | {"id_ts_month": 1659312000000} | +| 04732dpi6srj8c1o60o30c1g | {"id_ts_month": 1727740800000} | +*| 04732dpl6go30dhk60o30c1g | {"id_ts_month": 1754006400000} | +| 04732dhp70rjidho60o30c1g | {"id_ts_month": 1698796800000} | +| 04732dhi68qj0d9m60o30c1g | {"id_ts_month": 1622505600000} | +| 04732d9p6cqjcc9m60o30c1g 
| {"id_ts_month": 1593561600000} | +| 04732dpg6go3cdpi60o30c1g | {"id_ts_month": 1704067200000} | +| 04732dho68s3ie9i60o30c1g | {"id_ts_month": 1682899200000} | +| 04732d9n6ss36dho60o30c1g | {"id_ts_month": 1577836800000} | +| 04732dpj60q32e9i60o30c1g | {"id_ts_month": 1730419200000} | +| 04732dhm64sjic1k60o30c1g | {"id_ts_month": 1661990400000} | +| 04732dhh6gqjadho60o30c1g | {"id_ts_month": 1614556800000} | +| 04732dho6kqjedpm60o30c1g | {"id_ts_month": 1685577600000} | +| 04732dhn6sr34e1o60o30c1g | {"id_ts_month": 1677628800000} | +| 04732dph64sj4e9m60o30c1g | {"id_ts_month": 1711929600000} | +| 04732dhp6cqj4dhk60o30c1g | {"id_ts_month": 1693526400000} | +| 04732dpk70rj6dhg60o30c1g | {"id_ts_month": 1748736000000} | +| 04732dpl64pj4e1g60o30c1g | {"id_ts_month": 1751328000000} | +*| 04732dpl6or3gd1o60o30c1g | {"id_ts_month": 1756684800000} | +| 04732dhh74s34dpi60o30c1g | {"id_ts_month": 1619827200000} | +| 04732dhj6co38dhk60o30c1g | {"id_ts_month": 1633046400000} | +| 04732dhk6oo3icho60o30c1g | {"id_ts_month": 1646092800000} | +| 04732dhh68oj6dpm60o30c1g | {"id_ts_month": 1612137600000} | ++--------------------------+--------------------------------+ +SELECT 68 rows in set (0.006 sec) + +## Disable Rebalancing + +SET GLOBAL PERSISTENT "cluster.routing.rebalance.enable"='xxx'; -- all / none +[data-hot-7] updating [cluster.routing.rebalance.enable] from [all] to [none]` + + +### Report on schema, tables, sizes, ... 
+ +```sql +WITH columns AS ( + SELECT table_schema, + table_name, + COUNT(*) AS num_columns + FROM information_schema.columns + GROUP BY ALL +), tables AS ( + SELECT table_schema, + table_name, + partitioned_by, + clustered_by + FROM information_schema.tables +), shards AS ( + SELECT schema_name AS table_schema, + table_name, + partition_ident, + SUM(size) FILTER (WHERE primary = TRUE) / POWER(1024, 3) AS total_primary_size_gb, + AVG(size) / POWER(1024, 3) AS avg_shard_size_gb, + MIN(size) / POWER(1024, 3) AS min_shard_size_gb, + MAX(size) / POWER(1024, 3) AS max_shard_size_gb, + COUNT(*) FILTER (WHERE primary = TRUE) AS num_shards_primary, + COUNT(*) FILTER (WHERE primary = FALSE) AS num_shards_replica, + COUNT(*) AS num_shards_total + FROM sys.shards + GROUP BY ALL +) +SELECT s.*, + num_columns, + partitioned_by[1] AS partitioned_by, + clustered_by +FROM shards s +JOIN columns c ON s.table_name = c.table_name AND s.table_schema = c.table_schema +JOIN tables t ON s.table_name = t.table_name AND s.table_schema = t.table_schema +ORDER BY table_schema, table_name, partition_ident +``` + +---- +partition_ident | values | ++--------------------------+--------------------------------+ +| 04732dhp6ooj2e1k60o30c1g | {"id_ts_month": 1696118400000} | +| 04732dpk60rjgdpi60o30c1g | {"id_ts_month": 1740787200000} | +| 04732dhl6or36cpm60o30c1g | {"id_ts_month": 1656633600000} | +| 04732dpi6srj8c1o60o30c1g | {"id_ts_month": 1727740800000} | +| 04732dhl74pj2chg60o30c1g | {"id_ts_month": 1659312000000} | +| 04732dhl6go38c9m60o30c1g | {"id_ts_month": 1654041600000} | +| 04732dpg6orj8d9m60o30c1g | {"id_ts_month": 1706745600000} | +| 04732dpl6go30dhk60o30c1g | {"id_ts_month": 1754006400000} | +| 04732dhp70rjidho60o30c1g | {"id_ts_month": 1698796800000} | +| 04732dpj6kr3ge9m60o30c1g | {"id_ts_month": 1735689600000} | +| 04732dhm74s3acho60o30c1g | {"id_ts_month": 1669852800000} | +| 04732dpi6koj8e1o60o30c1g | {"id_ts_month": 1725148800000} | +| 04732dhm6gqjgchk60o30c1g | 
{"id_ts_month": 1664582400000} | +| 04732dpg6go3cdpi60o30c1g | {"id_ts_month": 1704067200000} | +| 04732dho68s3ie9i60o30c1g | {"id_ts_month": 1682899200000} | +| 04732dhp60s38e1g60o30c1g | {"id_ts_month": 1690848000000} | +| 04732dhn6kp30e9m60o30c1g | {"id_ts_month": 1675209600000} | +| 04732dpk6oo3adpm60o30c1g | {"id_ts_month": 1746057600000} | +| 04732dpj60q32e9i60o30c1g | {"id_ts_month": 1730419200000} | +| 04732dpl74p3edho60o30c1g | {"id_ts_month": 1759276800000} | +| 04732dhm64sjic1k60o30c1g | {"id_ts_month": 1661990400000} | +| 04732dpg74p3ac9i60o30c1g | {"id_ts_month": 1709251200000} | +| 04732dph6gqj4c9m60o30c1g | {"id_ts_month": 1714521600000} | +| 04732dhn68qj6c9i60o30c1g | {"id_ts_month": 1672531200000} | +| 04732dhm6sp3cc1o60o30c1g | {"id_ts_month": 1667260800000} | +| 04732dhl64pjccpi60o30c1g | {"id_ts_month": 1651363200000} | +| 04732dho6kqjedpm60o30c1g | {"id_ts_month": 1685577600000} | +| 04732dhn6sr34e1o60o30c1g | {"id_ts_month": 1677628800000} | +| 04732dph74rjichg60o30c1g | {"id_ts_month": 1719792000000} | +| 04732dph6sp30c1g60o30c1g | {"id_ts_month": 1717200000000} | +| 04732dph64sj4e9m60o30c1g | {"id_ts_month": 1711929600000} | +| 04732dpj6co32c9i60o30c1g | {"id_ts_month": 1733011200000} | +| 04732dhp6cqj4dhk60o30c1g | {"id_ts_month": 1693526400000} | +| 04732dpg64pjge1o60o30c1g | {"id_ts_month": 1701388800000} | +| 04732dpk70rj6dhg60o30c1g | {"id_ts_month": 1748736000000} | +| 04732dpl64pj4e1g60o30c1g | {"id_ts_month": 1751328000000} | +| 04732dpj70pjce1g60o30c1g | {"id_ts_month": 1738368000000} | +| 04732dpl6or3gd1o60o30c1g | {"id_ts_month": 1756684800000} | +| 04732dpk6cq3cd9m60o30c1g | {"id_ts_month": 1743465600000} | +| 04732dpi68q3ec1k60o30c1g | {"id_ts_month": 1722470400000} | +| 04732dho70ojce9m60o30c1g | {"id_ts_month": 1688169600000} | +| 04732dho60pj0dpi60o30c1g | {"id_ts_month": 1680307200000} | ++--------------------------+--------------------------------+ diff --git a/README.md b/README.md index 3e1cc70..6d97ef3 100644 --- 
a/README.md +++ b/README.md @@ -31,6 +31,16 @@ pip install -e . ``` 3. Create a `.env` file with your CrateDB connection details: + +**For localhost CrateDB:** +```bash +CRATE_CONNECTION_STRING=https://localhost:4200 +CRATE_USERNAME=crate +# CRATE_PASSWORD= # Leave empty or unset for default crate user +CRATE_SSL_VERIFY=false +``` + +**For remote CrateDB:** ```bash CRATE_CONNECTION_STRING=https://your-cluster.cratedb.net:4200 CRATE_USERNAME=your-username @@ -41,6 +51,12 @@ CRATE_SSL_VERIFY=true ## Quick Start ### Test Connection +You can test your connection configuration with the included test script: +```bash +python test_connection.py +``` + +Or use the built-in test: ```bash xmover test-connection ``` @@ -112,10 +128,61 @@ Analyzes current shard distribution across nodes and zones. **Options:** - `--table, -t`: Analyze specific table only +- `--largest INTEGER`: Show N largest tables/partitions by size +- `--smallest INTEGER`: Show N smallest tables/partitions by size +- `--no-zero-size`: Exclude zero-sized tables from smallest results (default: include zeros) -**Example:** +**Examples:** ```bash +# Basic cluster analysis +xmover analyze + +# Analyze specific table only xmover analyze --table events + +# Show top 10 largest tables/partitions +xmover analyze --largest 10 + +# Show top 5 smallest tables/partitions (includes zero-sized) +xmover analyze --smallest 5 + +# Show top 5 smallest non-zero tables/partitions (exclude zero-sized) +xmover analyze --smallest 5 --no-zero-size + +# Combine options +xmover analyze --table events --largest 3 +``` + +**Sample Output (--largest 3):** +``` + Largest Tables/Partitions by Size (Top 3) +╭─────────────────────────────────┬─────────────────────────────┬────────┬───────┬──────────┬──────────┬──────────┬────────────╮ +│ Table │ Partition │ Shards │ P/R │ Min Size │ Avg Size │ Max Size │ Total Size │ 
+├─────────────────────────────────┼─────────────────────────────┼────────┼───────┼──────────┼──────────┼──────────┼────────────┤ +│ TURVO.shipmentFormFieldData │ ("id_ts_month"=162777600000 │ 4 │ 2P/2R │ 89.1GB │ 95.3GB │ 104.2GB │ 381.2GB │ +│ TURVO.orderFormFieldData │ N/A │ 6 │ 3P/3R │ 23.4GB │ 28.7GB │ 35.1GB │ 172.2GB │ +│ TURVO.documentUploadProgress │ ("sync_day"=1635724800000) │ 8 │ 4P/4R │ 15.2GB │ 18.4GB │ 22.1GB │ 147.2GB │ +╰─────────────────────────────────┴─────────────────────────────┴────────┴───────┴──────────┴──────────┴──────────┴────────────╯ + +📊 Summary: 18 total shards using 700.6GB across 3 largest table/partition(s) +``` + +**Sample Output (--smallest 5 --no-zero-size):** +``` +ℹ️ Found 12 table/partition(s) with 0.0GB size (excluded from results) + + Smallest Tables/Partitions by Size (Top 5) +╭─────────────────────────────────┬─────────────────────────────┬────────┬───────┬──────────┬──────────┬──────────┬────────────╮ +│ Table │ Partition │ Shards │ P/R │ Min Size │ Avg Size │ Max Size │ Total Size │ +├─────────────────────────────────┼─────────────────────────────┼────────┼───────┼──────────┼──────────┼──────────┼────────────┤ +│ TURVO.emailActivity_transformf… │ N/A │ 2 │ 1P/1R │ 0.001GB │ 0.001GB │ 0.002GB │ 0.002GB │ +│ TURVO.calendarFormFieldData_tr… │ ("sync_day"=1627776000000) │ 2 │ 1P/1R │ 0.005GB │ 0.005GB │ 0.005GB │ 0.010GB │ +│ TURVO.shipmentSummary_failures │ N/A │ 2 │ 1P/1R │ 0.100GB │ 0.100GB │ 0.100GB │ 0.200GB │ +│ TURVO.documentActivity_failures │ N/A │ 4 │ 2P/2R │ 0.250GB │ 0.325GB │ 0.400GB │ 1.300GB │ +│ TURVO.userActivity_logs │ ("date"=2024-01-01) │ 6 │ 3P/3R │ 0.800GB │ 0.950GB │ 1.100GB │ 5.700GB │ +╰─────────────────────────────────┴─────────────────────────────┴────────┴───────┴──────────┴──────────┴──────────┴────────────╯ + +📊 Summary: 16 total shards using 7.212GB across 5 smallest non-zero table/partition(s) ``` ### `find-candidates` @@ -262,6 +329,275 @@ xmover monitor-recovery --watch 
--include-transitioning - **PEER**: Copying shard data from another node (replication/relocation) - **DISK**: Rebuilding shard from local data (after restart/disk issues) +**Enhanced Translog Monitoring:** +The recovery monitor now displays detailed translog information in the format: +``` +📋 TURVO.shipmentFormFieldData_events S4 PEER TRANSLOG 0.0% 6.2GB (TL:109.8GB / 22.1GB / 20%) data-hot-0 → data-hot-7 +``` + +**Translog Display Format**: `TL:X.XGB / Y.YGB / ZZ%` +- `X.XGB`: Total translog file size (`translog_stats['size']`) +- `Y.YGB`: Uncommitted translog size (`translog_stats['uncommitted_size']`) +- `ZZ%`: Uncommitted as percentage of total translog size + +**Color Coding:** +- 🔴 **Red**: Uncommitted ≥ 5GB OR uncommitted ≥ 80% (critical) +- 🟡 **Yellow**: Uncommitted ≥ 1GB OR uncommitted ≥ 50% (warning) +- 🟢 **Green**: Below warning thresholds (normal) + +Translog information is only shown when significant (uncommitted ≥ 10MB or total ≥ 50MB). + +**Enhanced Replica Progress Tracking:** +For replica shard recoveries, the monitor now shows sequence number-based progress when available: +``` +📋 TURVO.LINEAGE_DIRECTLY_OPEN_TO_APPOINTMENT S2R PEER TRANSLOG 99.9% (seq) 15.2GB data-hot-0 → data-hot-1 +``` + +**Progress Display Formats:** +- `99.9% (seq)`: Replica progress based on sequence number comparison with primary +- `37.5% (seq) / 95.0% (rec)`: Shows both when sequence and traditional progress differ significantly (>5%) +- `98.5%`: Primary shards or when sequence data unavailable (traditional progress) + +**Sequence Progress Benefits:** +- More accurate progress indication for replica synchronization +- Based on comparing `max_seq_no` between replica and primary shards +- Reveals actual replication lag in terms of operations behind primary +- Particularly useful for detecting stuck replica recoveries where traditional recovery shows 100% but replica is still far behind + +**Enhanced Transitioning Recovery Display:** +The monitor now shows detailed 
information for transitioning recoveries instead of just "(transitioning)": +``` +16:08:20 | 5 done (transitioning) + | 🔄 TURVO.accountFormFieldData S7R PEER DONE 99.8% (seq) 3.8GB data-hot-5 → data-hot-7 + | 🔄 TURVO_MySQL.composite_mapping S11P PEER DONE 100.0% 3.0GB data-hot-5 → data-hot-6 + | 🔄 TURVO.shipmentFormFieldData ("id_ts_month"=1633046400000) S6R PEER DONE 99.8% (seq) 8.2GB (TL:233MB / 49MB / 21%) data-hot-4 → data-hot-7 +``` + +**Transitioning Display Features:** +- Shows up to 5 transitioning recoveries with full details +- Includes sequence progress, translog info, and node routing +- Throttled to every 30 seconds to reduce noise +- Uses 🔄 icon to indicate transitioning state +- Distinguishes primary (P) vs replica (R) shards + +### `problematic-translogs` +Find tables with problematic translog sizes and generate replica management commands. + +**Options:** +- `--sizeMB INTEGER`: Minimum translog uncommitted size in MB (default: 300) +- `--execute`: Execute the replica management commands after confirmation + +**Description:** +This command identifies tables with replica shards that have large uncommitted translog sizes indicating replication issues. It shows both individual problematic shards and a summary by table/partition. It generates two types of ALTER commands: individual REROUTE CANCEL SHARD commands for each problematic shard, and replica management commands that temporarily set replicas to 0 and restore them to force recreation of problematic replicas. 
+ +**Examples:** +```bash +# Show problematic tables with translog > 300MB (default) +xmover problematic-translogs + +# Show tables with translog > 500MB +xmover problematic-translogs --sizeMB 500 + +# Execute replica management commands for tables > 1GB after confirmation +xmover problematic-translogs --sizeMB 1000 --execute +``` + +**Sample Output:** +``` + Problematic Replica Shards (translog > 300MB) +╭────────┬───────────────────────────────┬────────────────────────────┬──────────┬────────────┬─────────────╮ +│ Schema │ Table │ Partition │ Shard ID │ Node │ Translog MB │ +├────────┼───────────────────────────────┼────────────────────────────┼──────────┼────────────┼─────────────┤ +│ TURVO │ shipmentFormFieldData │ none │ 14 │ data-hot-6 │ 7040.9 │ +│ TURVO │ shipmentFormFieldData_events │ ("sync_day"=1757376000000) │ 3 │ data-hot-2 │ 481.2 │ +│ TURVO │ orderFormFieldData │ none │ 5 │ data-hot-1 │ 469.5 │ +╰────────┴───────────────────────────────┴────────────────────────────┴──────────┴────────────┴─────────────╯ + +Found 2 table/partition(s) with problematic translogs: + + Tables with Problematic Replicas (translog > 300MB) +╭────────┬───────────┬───────────┬───────────┬──────────┬─────────────┬──────────────┬──────────╮ +│ Schema │ Table │ Partition │ Problema… │ Max │ Shards │ Size GB │ Current │ +│ │ │ │ Replicas │ Trans.MB │ (P/R) │ (P/R) │ Replicas │ +├────────┼───────────┼───────────┼───────────┼──────────┼─────────────┼──────────────┼──────────┤ +│ TURVO │ shipment… │ ("sync.. 
│ 2 │ 7011.8 │ 5P/5R │ 12.4/12.1 │ 1 │ +│ TURVO │ orderFor… │ none │ 1 │ 469.5 │ 3P/6R │ 8.2/16.3 │ 2 │ +╰────────┴───────────┴───────────┴───────────┴──────────┴─────────────┴──────────────┴──────────╯ + +Generated ALTER Commands: + +ALTER TABLE "TURVO"."shipmentFormFieldData" REROUTE CANCEL SHARD 14 on 'data-hot-6' WITH (allow_primary=False); +ALTER TABLE "TURVO"."shipmentFormFieldData_events" partition ("sync_day"=1757376000000) REROUTE CANCEL SHARD 3 on 'data-hot-2' WITH (allow_primary=False); +ALTER TABLE "TURVO"."orderFormFieldData" REROUTE CANCEL SHARD 5 on 'data-hot-1' WITH (allow_primary=False); + +-- Set replicas to 0: +ALTER TABLE "TURVO"."shipmentFormFieldData" PARTITION ("id_ts_month"=1756684800000) SET ("number_of_replicas" = 0); +-- Restore replicas to 1: +ALTER TABLE "TURVO"."shipmentFormFieldData" PARTITION ("id_ts_month"=1756684800000) SET ("number_of_replicas" = 1); + +-- Set replicas to 0: +ALTER TABLE "TURVO"."orderFormFieldData" SET ("number_of_replicas" = 0); +-- Restore replicas to 2: +ALTER TABLE "TURVO"."orderFormFieldData" SET ("number_of_replicas" = 2); + +Total: 3 REROUTE CANCEL commands + 4 replica management commands +``` + +When using `--execute`, each command is presented individually for confirmation, allowing you to selectively execute specific commands as needed. + +### `active-shards` +Monitors the most active shards by tracking checkpoint progression over time. 
+ +**Options:** +- `--count`: Number of most active shards to show (default: 10) +- `--interval`: Observation interval in seconds (default: 30) +- `--min-checkpoint-delta`: Minimum checkpoint progression between snapshots to show shard (default: 1000) +- `--table, -t`: Monitor specific table only +- `--node, -n`: Monitor specific node only +- `--watch, -w`: Continuously monitor (refresh every interval) +- `--exclude-system`: Exclude system tables (gc.*, information_schema.*, *_events, *_log) +- `--min-rate`: Minimum activity rate (changes/sec) to show +- `--show-replicas/--hide-replicas`: Show replica shards (default: True) + +**Examples:** +```bash +# Show top 10 most active shards over 30 seconds +xmover active-shards + +# Top 20 shards with 60-second observation period +xmover active-shards --count 20 --interval 60 + +# Continuous monitoring with 30-second intervals +xmover active-shards --watch --interval 30 + +# Monitor specific table activity +xmover active-shards --table my_table --watch + +# Monitor specific node with custom threshold +xmover active-shards --node data-hot-1 --min-checkpoint-delta 500 + +# Exclude system tables and event logs for business data focus +xmover active-shards --exclude-system --count 20 + +# Only show high-activity shards (≥50 changes/sec) +xmover active-shards --min-rate 50 --count 15 + +# Focus on primary shards only +xmover active-shards --hide-replicas --count 20 +``` + +This command helps identify which shards are receiving the most write activity by measuring local checkpoint progression between two snapshots. + +**How it works:** +1. **Takes snapshot of ALL started shards** (not just currently active ones) +2. **Waits for observation interval** (configurable, default: 30 seconds) +3. **Takes second snapshot** of all started shards +4. **Compares snapshots** to find shards with checkpoint progression ≥ threshold +5. 
**Shows ranked results** with activity trends and insights + +**Enhanced output features:** +- **Checkpoint visibility**: Shows actual `local_checkpoint` values (CP Start → CP End → Delta) +- **Partition awareness**: Separate tracking for partitioned tables (different partition_ident values) +- **Activity trends**: 🔥 HOT (≥100/s), 📈 HIGH (≥50/s), 📊 MED (≥10/s), 📉 LOW (<10/s) +- **Smart insights**: Identifies concentration patterns and load distribution (non-watch mode) +- **Flexible filtering**: Exclude system tables, set minimum rates, hide replicas +- **Context information**: Total activity, average rates, observation period +- **Clean watch mode**: Streamlined output without legend/insights for continuous monitoring + +This approach captures shards that become active during the observation period, providing a complete view of cluster write patterns and identifying hot spots. The enhanced filtering helps focus on business-critical activity patterns. + +**Sample output (single run):** +``` +🔥 Most Active Shards (3 shown, 30s observation period) + +Total checkpoint activity: 190,314 changes, Average rate: 2,109.0/sec + + Rank | Schema.Table | Shard | Partition | Node | Type | Checkpoint Δ | Rate/sec | Trend + ----------------------------------------------------------------------------------------------------------- + 1 | gc.scheduled_jobs_log | 0 | - | data-hot-8 | P | 113,744 | 3,791.5 | 🔥 HOT + 2 | TURVO.events | 0 | 04732dpl6osj8d | data-hot-0 | P | 45,837 | 1,527.9 | 🔥 HOT + 3 | doc.user_actions | 1 | 04732dpk70rj6d | data-hot-2 | P | 30,733 | 1,024.4 | 🔥 HOT + +Legend: + • Checkpoint Δ: Write operations during observation period + • Partition: partition_ident (truncated if >14 chars, '-' if none) + +Insights: + • 3 HOT shards (≥100 changes/sec) - consider load balancing + • All active shards are PRIMARY - normal write pattern +``` + +**Sample output (watch mode - cleaner):** +``` +30s interval | threshold: 1,000 | top 5 + +🔥 Most Active Shards (3 shown, 30s 
observation period) + +Total checkpoint activity: 190,314 changes, Average rate: 2,109.0/sec + + Rank | Schema.Table | Shard | Partition | Node | Type | Checkpoint Δ | Rate/sec | Trend + ----------------------------------------------------------------------------------------------------------- + 1 | gc.scheduled_jobs_log | 0 | - | data-hot-8 | P | 113,744 | 3,791.5 | 🔥 HOT + 2 | TURVO.events | 0 | 04732dpl6osj8d | data-hot-0 | P | 45,837 | 1,527.9 | 🔥 HOT + 3 | doc.user_actions | 1 | 04732dpk70rj6d | data-hot-2 | P | 30,733 | 1,024.4 | 🔥 HOT + +━━━ Next update in 30s ━━━ +``` + +### `large-translogs` +Monitors shards with large translog uncommitted sizes that do not flush properly, displaying both primary and replica shards. + +**Options:** +- `--translogsize`: Minimum translog uncommitted size threshold in MB (default: 500) +- `--interval`: Monitoring interval in seconds for watch mode (default: 60) +- `--watch, -w`: Continuously monitor (refresh every interval) +- `--table, -t`: Monitor specific table only +- `--node, -n`: Monitor specific node only +- `--count`: Maximum number of shards with large translogs to show (default: 50) + +**Examples:** +```bash +# Show shards with translog over default 500MB threshold +xmover large-translogs + +# Show shards with translog over 1GB threshold +xmover large-translogs --translogsize 1000 + +# Continuous monitoring every 30 seconds +xmover large-translogs --watch --interval 30 + +# Monitor specific table +xmover large-translogs --table my_table --watch + +# Monitor specific node, show top 20 +xmover large-translogs --node data-hot-1 --count 20 +``` + +This command helps identify shards that are not flushing properly by monitoring their translog uncommitted sizes, which can indicate replication or flush issues. 
+ +**Output includes:** +- **Schema.Table**: Combined schema and table name +- **Partition**: Partition values or "-" for non-partitioned tables +- **Shard**: Numeric shard identifier +- **Node**: Node where shard is located +- **TL MB**: Translog uncommitted size (color-coded: bright_red >1GB, red >500MB, yellow >100MB, green ≤100MB) +- **Type**: "P" for primary shards, "R" for replica shards +- **Timestamp**: Current time for each update +- **Summary**: Total shards, primary/replica breakdown, average translog size + +**Sample output:** +``` +Large Translogs (>400MB) - 09:45:51 +╭────────────────────────────┬──────────────────────┬───────┬────────────┬────────┬──────╮ +│ Schema.Table │ Partition │ Shard │ Node │ TL MB │ Type │ +├────────────────────────────┼──────────────────────┼───────┼────────────┼────────┼──────┤ +│ TURVO.orderFormFieldData_… │ ("sync_day"=175936.… │ 7 │ data-hot-7 │ 510 │ P │ +│ TURVO.orderFormFieldData │ - │ 8 │ data-hot-6 │ 509 │ R │ +│ TURVO.orderFormFieldData │ - │ 20 │ data-hot-3 │ 507 │ R │ +╰────────────────────────────┴──────────────────────┴───────┴────────────┴────────┴──────╯ +3 shards (1P/2R) - Avg translog: 509MB +``` + ### `test-connection` Tests the connection to CrateDB and displays basic cluster information. @@ -362,6 +698,40 @@ xmover monitor-recovery --watch --include-transitioning xmover monitor-recovery --node data-hot-3 --recovery-type DISK ``` +### Monitoring Active Shards and Write Patterns + +Identify which shards are receiving the most write activity: + +1. Quick snapshot of most active shards: +```bash +# Show top 10 most active shards over 30 seconds +xmover active-shards + +# Longer observation period for more accurate results +xmover active-shards --count 15 --interval 60 +``` + +2. 
Continuous monitoring for real-time insights: +```bash +# Continuous monitoring with 30-second intervals +xmover active-shards --watch --interval 30 + +# Monitor specific table for focused analysis +xmover active-shards --table critical_table --watch +``` + +3. Integration with rebalancing workflow: +```bash +# Identify hot shards first +xmover active-shards --count 20 --interval 60 + +# Move hot shards away from overloaded nodes +xmover recommend --table hot_table --prioritize-space --execute + +# Monitor the impact +xmover active-shards --table hot_table --watch +``` + ### Manual Shard Movement 1. Validate the move first: @@ -394,8 +764,20 @@ xmover recommend --prioritize-zones --execute - `CRATE_CONNECTION_STRING`: CrateDB HTTP endpoint (required) - `CRATE_USERNAME`: Username for authentication (optional) -- `CRATE_PASSWORD`: Password for authentication (optional) -- `CRATE_SSL_VERIFY`: Enable SSL certificate verification (default: true) +- `CRATE_PASSWORD`: Password for authentication (optional, only used if username is also provided) +- `CRATE_SSL_VERIFY`: SSL certificate verification (default: auto-detects based on connection string) + - `true`: Always verify SSL certificates + - `false`: Disable SSL certificate verification + - `auto`: Automatically disable for localhost/127.0.0.1, enable for remote connections + +#### Retry and Timeout Configuration + +For clusters under pressure, you can configure retry behavior: + +- `CRATE_MAX_RETRIES`: Maximum number of retries for failed queries (default: 3) +- `CRATE_TIMEOUT`: Base timeout in seconds for queries (default: 30) +- `CRATE_MAX_TIMEOUT`: Maximum timeout in seconds for retries (default: 120) +- `CRATE_RETRY_BACKOFF`: Exponential backoff factor between retries (default: 2.0) ### Connection String Format diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md index 83a7dd6..8126bb3 100644 --- a/TROUBLESHOOTING.md +++ b/TROUBLESHOOTING.md @@ -20,6 +20,81 @@ xmover validate-move SCHEMA.TABLE SHARD_ID FROM_NODE 
TO_NODE
 xmover explain-error "your error message here"
 ```
 
+## Cluster Under Pressure / Performance Issues
+
+### Symptoms
+- `500 Server Error: Internal Server Error`
+- `503 Service Unavailable`
+- `429 Too Many Requests`
+- Query timeouts
+- Slow response times
+
+### Solutions
+
+#### 1. Configure Retry and Timeout Settings
+Add these to your `.env` file for better resilience:
+
+```bash
+# Increase retries for unstable clusters
+CRATE_MAX_RETRIES=5
+
+# Increase base timeout for slow queries
+CRATE_TIMEOUT=60
+
+# Allow longer timeouts for retries
+CRATE_MAX_TIMEOUT=300
+
+# Adjust backoff between retries
+CRATE_RETRY_BACKOFF=1.5
+```
+
+#### 2. Monitor Cluster Health
+```sql
+-- Check cluster load
+SELECT node['name'], load, heap FROM sys.nodes;
+
+-- Check query queue
+SELECT * FROM sys.jobs WHERE stmt LIKE '%ALTER TABLE%';
+
+-- Check disk usage
+SELECT node['name'], fs['total'], fs['used'] FROM sys.nodes;
+```
+
+#### 3. Reduce Load During Operations
+- Run XMover during low-traffic periods
+- Move fewer shards at once with `--limit`
+- Use `--wait-time` between operations
+- Monitor with `xmover monitor` before proceeding
+
+#### 4. Temporary Cluster Adjustments
+```sql
+-- Increase query timeout temporarily
+SET SESSION "statement_timeout" = '300s';
+
+-- Reduce concurrent recoveries
+SET GLOBAL TRANSIENT cluster.routing.allocation.node_concurrent_recoveries = 1;
+
+-- Increase recovery throttling
+SET GLOBAL TRANSIENT indices.recovery.max_bytes_per_sec = '20mb';
+```
+
+#### 5. 
Error-Specific Solutions + +**500 Internal Server Error:** +- Usually indicates cluster overload +- Wait and retry with exponential backoff (built into XMover) +- Check cluster logs for specific errors + +**503 Service Unavailable:** +- Cluster rejecting new queries +- Reduce concurrent operations +- Wait for current operations to complete + +**429 Too Many Requests:** +- Rate limiting active +- Increase retry delays with higher `CRATE_RETRY_BACKOFF` +- Reduce operation frequency + ## Common Issues and Solutions ### 1. Zone Conflicts diff --git a/config/shard_size_rules.yaml b/config/shard_size_rules.yaml new file mode 100644 index 0000000..ffd8f77 --- /dev/null +++ b/config/shard_size_rules.yaml @@ -0,0 +1,194 @@ +# XMover Shard Size Monitoring Rules +# Configuration file for analyzing CrateDB shard sizes and generating optimization recommendations +# +# Rules are evaluated against each table/partition combination returned by the analysis query. +# Variables available in rule conditions: +# - table_schema, table_name, partition_ident +# - total_primary_size_gb, avg_shard_size_gb, min_shard_size_gb, max_shard_size_gb +# - num_shards_primary, num_shards_replica, num_shards_total +# - num_columns, partitioned_by, clustered_by +# - cluster_config dictionary with cluster-level metrics + +metadata: + version: "1.0" + description: "CrateDB shard size optimization rules" + author: "XMover" + last_updated: "2025-10-03" + +# Global thresholds referenced in rules +thresholds: + # Core shard size recommendations + optimal_shard_size_min_gb: 3 + optimal_shard_size_max_gb: 70 + performance_sweet_spot_min_gb: 10 + performance_sweet_spot_max_gb: 50 + + # Workload-specific ranges + search_optimized_max_gb: 30 + write_heavy_min_gb: 30 + write_heavy_max_gb: 50 + time_series_min_gb: 20 + time_series_max_gb: 40 + + # Critical thresholds + large_shard_threshold_gb: 50 + small_shard_threshold_gb: 1 + consolidation_threshold_gb: 3 + + # Column-related thresholds + 
wide_table_column_threshold: 500 + wide_table_shard_max_gb: 25 + max_columns_default: 1000 + + # Cluster density thresholds + shards_per_heap_gb_ratio: 20 + max_shards_per_node_safe: 1000 + cpu_per_shard_ratio: 1.5 + +# Table/Partition level rules +rules: + - name: "critical_oversized_shards" + category: "size_optimization" + severity: "critical" + condition: "max_shard_size_gb > thresholds['large_shard_threshold_gb']" + recommendation: "Shard size {max_shard_size_gb:.1f}GB exceeds {large_shard_threshold_gb}GB limit. Split shards to improve recovery times and query performance." + action_hint: "Consider reducing number_of_shards or using table partitioning" + + - name: "undersized_shards_with_excess_count" + category: "size_optimization" + severity: "warning" + condition: "max_shard_size_gb < thresholds['small_shard_threshold_gb'] and num_shards_primary > cluster_config['total_nodes']" + recommendation: "Shards too small ({max_shard_size_gb:.2f}GB < {small_shard_threshold_gb}GB) with {num_shards_primary} primary shards across {cluster_config[total_nodes]} nodes. Consolidate to reduce overhead." + action_hint: "Reduce number_of_shards for future partitions or use shard shrinking" + + - name: "wide_table_oversized_shards" + category: "performance" + severity: "critical" + condition: "num_columns > thresholds['wide_table_column_threshold'] and max_shard_size_gb > thresholds['wide_table_shard_max_gb']" + recommendation: "Wide table with {num_columns} columns (>{wide_table_column_threshold}) has {max_shard_size_gb:.1f}GB shards. Reduce to <{wide_table_shard_max_gb}GB to mitigate column overhead." 
+    action_hint: "Increase number_of_shards or disable indexing for unused columns"
+
+  - name: "suboptimal_small_shards"
+    category: "efficiency"
+    severity: "info"
+    condition: "max_shard_size_gb < thresholds['consolidation_threshold_gb'] and max_shard_size_gb >= thresholds['small_shard_threshold_gb']"
+    recommendation: "Small shards ({max_shard_size_gb:.1f}GB) could be consolidated. Target minimum {consolidation_threshold_gb}GB per shard."
+    action_hint: "Consider reducing number_of_shards for better efficiency"
+
+  - name: "outside_performance_sweet_spot"
+    category: "performance"
+    severity: "info"
+    condition: "(max_shard_size_gb < thresholds['performance_sweet_spot_min_gb'] or max_shard_size_gb > thresholds['performance_sweet_spot_max_gb']) and max_shard_size_gb >= thresholds['consolidation_threshold_gb'] and max_shard_size_gb <= thresholds['large_shard_threshold_gb']"
+    recommendation: "Shard size {max_shard_size_gb:.1f}GB outside performance sweet spot ({performance_sweet_spot_min_gb}-{performance_sweet_spot_max_gb}GB). Consider rebalancing."
+    action_hint: "Adjust number_of_shards to reach optimal range"
+
+  - name: "excessive_column_count"
+    category: "schema_design"
+    severity: "warning"
+    condition: "num_columns > thresholds['max_columns_default']"
+    recommendation: "Table has {num_columns} columns exceeding default limit of {max_columns_default}. May require mapping.total_fields.limit adjustment and impacts memory usage."
+    action_hint: "Review schema design and disable indexing for unused columns"
+
+  - name: "uneven_shard_distribution"
+    category: "balance"
+    severity: "warning"
+    condition: "num_shards_primary > 1 and min_shard_size_gb > 0 and (max_shard_size_gb / min_shard_size_gb) > 3"
+    recommendation: "Uneven shard size distribution: largest {max_shard_size_gb:.1f}GB vs smallest {min_shard_size_gb:.1f}GB (>3:1 ratio). Check data skew."
+ action_hint: "Review partitioning strategy or clustering keys" + + - name: "single_large_shard_table" + category: "scalability" + severity: "warning" + condition: "num_shards_primary == 1 and total_primary_size_gb > thresholds['performance_sweet_spot_max_gb']" + recommendation: "Large single shard ({total_primary_size_gb:.1f}GB) limits parallelization. Consider increasing number_of_shards." + action_hint: "Increase number_of_shards to enable parallel processing" + +# Cluster-level rules (evaluated once per analysis) +cluster_rules: + - name: "heap_to_shard_ratio_exceeded" + category: "stability" + severity: "warning" + condition: "cluster_config['total_shards'] > (cluster_config['total_heap_gb'] * thresholds['shards_per_heap_gb_ratio'])" + recommendation: "Total cluster shards ({cluster_config[total_shards]}) exceed recommended ratio of {shards_per_heap_gb_ratio} per GB heap ({cluster_config[total_heap_gb]:.1f}GB). Risk of memory pressure." + action_hint: "Consolidate small shards or increase heap size" + + - name: "node_shard_density_critical" + category: "stability" + severity: "critical" + condition: "cluster_config['max_shards_per_node'] > thresholds['max_shards_per_node_safe']" + recommendation: "At least one node has {cluster_config[max_shards_per_node]} shards, exceeding safe limit of {max_shards_per_node_safe}. Redistribute immediately." + action_hint: "Move shards to other nodes or add capacity" + + - name: "insufficient_cpu_per_shard" + category: "performance" + severity: "info" + condition: "cluster_config['total_shards'] > (cluster_config['total_cpu_cores'] * thresholds['cpu_per_shard_ratio'])" + recommendation: "Total shards ({cluster_config[total_shards]}) may exceed CPU capacity ({cluster_config[total_cpu_cores]} cores, recommended {cpu_per_shard_ratio} vCPU per shard)." 
+    action_hint: "Consider shard consolidation or adding CPU resources"
+
+# Validation rules for the configuration file itself
+validation:
+  required_fields:
+    - metadata
+    - thresholds
+    - rules
+    - cluster_rules
+
+  rule_required_fields:
+    - name
+    - category
+    - severity
+    - condition
+    - recommendation
+
+  valid_severities:
+    - critical
+    - warning
+    - info
+
+  valid_categories:
+    - size_optimization
+    - performance
+    - efficiency
+    - schema_design
+    - balance
+    - scalability
+    - stability
+
+# Documentation for rule writing
+rule_writing_guide:
+  available_variables:
+    table_level:
+      - "table_schema: Schema name"
+      - "table_name: Table name"
+      - "partition_ident: Partition identifier (may be null)"
+      - "total_primary_size_gb: Total size of primary shards in GB"
+      - "avg_shard_size_gb: Average shard size in GB"
+      - "min_shard_size_gb: Smallest shard size in GB"
+      - "max_shard_size_gb: Largest shard size in GB"
+      - "num_shards_primary: Number of primary shards"
+      - "num_shards_replica: Number of replica shards"
+      - "num_shards_total: Total number of shards"
+      - "num_columns: Number of columns in table"
+      - "partitioned_by: Partitioning column (may be null)"
+      - "clustered_by: Clustering configuration (may be null)"
+
+    cluster_level:
+      - "cluster_config['total_nodes']: Total number of nodes"
+      - "cluster_config['total_cpu_cores']: Total CPU cores"
+      - "cluster_config['total_memory_gb']: Total system memory in GB"
+      - "cluster_config['total_heap_gb']: Total JVM heap in GB"
+      - "cluster_config['max_shards_per_node']: Setting value -- NOTE(review): key is duplicated two entries below with a different meaning ('actual max shards on any node'); a dict key can only hold one value, confirm which one the engine provides and rename the other"
+      - "cluster_config['total_shards']: Total shards in cluster"
+      - "cluster_config['max_shards_per_node']: Actual max shards on any node"
+
+  condition_examples:
+    - "max_shard_size_gb > 50"
+    - "num_columns > 500 and avg_shard_size_gb > 25"
+    - "num_shards_primary == 1 and total_primary_size_gb > 30"
+    - "table_name.startswith('logs_') and max_shard_size_gb < 20"
+
+  recommendation_formatting:
+    - "Use {variable_name} to insert values"
+ - "Use {variable_name:.1f} for decimal formatting" + - "Reference thresholds with {threshold_name}" diff --git a/pyproject.toml b/pyproject.toml index 862d230..fb5afc2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,8 +10,12 @@ dependencies = [ "requests>=2.28.0", "python-dotenv>=1.0.0", "rich>=13.0.0", + "pyyaml>=6.0", ] +[project.optional-dependencies] +dev = ["pytest>=7.0.0"] + [project.scripts] xmover = "xmover.cli:main" diff --git a/src/xmover/analyzer.py b/src/xmover/analyzer.py index 75af909..c5c3085 100644 --- a/src/xmover/analyzer.py +++ b/src/xmover/analyzer.py @@ -631,6 +631,289 @@ def _check_zone_conflict(self, recommendation: MoveRecommendation) -> Optional[s # If we can't check, err on the side of caution return f"Cannot verify zone safety: {str(e)}" + def get_shard_size_overview(self) -> Dict[str, Any]: + """Get shard size distribution analysis""" + # Only analyze STARTED shards + started_shards = [s for s in self.shards if s.state == 'STARTED'] + + if not started_shards: + return { + 'total_shards': 0, + 'size_buckets': {}, + 'large_shards_count': 0, + 'small_shards_percentage': 0.0, + 'avg_shard_size_gb': 0.0 + } + + total_shards = len(started_shards) + total_size_gb = sum(s.size_gb for s in started_shards) + avg_size_gb = total_size_gb / total_shards if total_shards > 0 else 0.0 + + # Define size buckets (in GB) + size_buckets = { + '<1GB': {'count': 0, 'total_size': 0.0, 'max_size': 0.0}, + '1GB-5GB': {'count': 0, 'total_size': 0.0, 'max_size': 0.0}, + '5GB-10GB': {'count': 0, 'total_size': 0.0, 'max_size': 0.0}, + '10GB-50GB': {'count': 0, 'total_size': 0.0, 'max_size': 0.0}, + '>=50GB': {'count': 0, 'total_size': 0.0, 'max_size': 0.0} + } + + # Categorize shards by size + large_shards_count = 0 # >50GB shards + very_small_shards = 0 # <1GB shards (for percentage calculation) + + for shard in started_shards: + size_gb = shard.size_gb + + if size_gb >= 50: + size_buckets['>=50GB']['count'] += 1 + size_buckets['>=50GB']['total_size'] += 
size_gb + size_buckets['>=50GB']['max_size'] = max(size_buckets['>=50GB']['max_size'], size_gb) + large_shards_count += 1 + elif size_gb >= 10: + size_buckets['10GB-50GB']['count'] += 1 + size_buckets['10GB-50GB']['total_size'] += size_gb + size_buckets['10GB-50GB']['max_size'] = max(size_buckets['10GB-50GB']['max_size'], size_gb) + elif size_gb >= 5: + size_buckets['5GB-10GB']['count'] += 1 + size_buckets['5GB-10GB']['total_size'] += size_gb + size_buckets['5GB-10GB']['max_size'] = max(size_buckets['5GB-10GB']['max_size'], size_gb) + elif size_gb >= 1: + size_buckets['1GB-5GB']['count'] += 1 + size_buckets['1GB-5GB']['total_size'] += size_gb + size_buckets['1GB-5GB']['max_size'] = max(size_buckets['1GB-5GB']['max_size'], size_gb) + else: + size_buckets['<1GB']['count'] += 1 + size_buckets['<1GB']['total_size'] += size_gb + size_buckets['<1GB']['max_size'] = max(size_buckets['<1GB']['max_size'], size_gb) + very_small_shards += 1 + + # Calculate average size for each bucket + for bucket_name, bucket_data in size_buckets.items(): + if bucket_data['count'] > 0: + bucket_data['avg_size_gb'] = bucket_data['total_size'] / bucket_data['count'] + else: + bucket_data['avg_size_gb'] = 0.0 + + # Calculate percentage of very small shards (<1GB) + very_small_percentage = (very_small_shards / total_shards * 100) if total_shards > 0 else 0.0 + + return { + 'total_shards': total_shards, + 'total_size_gb': total_size_gb, + 'avg_shard_size_gb': avg_size_gb, + 'size_buckets': size_buckets, + 'large_shards_count': large_shards_count, + 'very_small_shards_percentage': very_small_percentage + } + + def get_large_shards_details(self) -> List[Dict[str, Any]]: + """Get detailed information about large shards (>=50GB) including partition values""" + # Optimized query to fetch only large shards directly from database + query = """ + SELECT + s.schema_name, + s.table_name, + translate(p.values::text, ':{}', '=()') as partition_values, + s.id as shard_id, + s.size / 1024^3 as size_gb, + 
s."primary" as is_primary, + s.node['name'] as node_name, + s.node['id'] as node_id + FROM sys.shards s + LEFT JOIN information_schema.table_partitions p + ON s.table_name = p.table_name + AND s.schema_name = p.table_schema + AND s.partition_ident = p.partition_ident + WHERE s.state = 'STARTED' + AND s.size >= 50 * 1024^3 -- 50GB in bytes + ORDER BY s.size DESC + """ + + result = self.client.execute_query(query) + + large_shards = [] + for row in result.get('rows', []): + # Get zone information from our nodes data + node_id = row[7] + zone = next((node.zone for node in self.nodes if node.id == node_id), 'unknown') + + large_shards.append({ + 'schema_name': row[0] or 'doc', + 'table_name': row[1], + 'partition_values': row[2], + 'shard_id': row[3], + 'size_gb': float(row[4]) if row[4] else 0.0, + 'is_primary': row[5] or False, + 'node_name': row[6], + 'zone': zone + }) + + return large_shards + + def get_small_shards_details(self, limit: int = 10) -> List[Dict[str, Any]]: + """Get detailed information about the smallest shards, grouped by table/partition""" + # Query to get all shards, ordered by size ascending to get the smallest + query = """ + SELECT + s.schema_name, + s.table_name, + translate(p.values::text, ':{}', '=()') as partition_values, + s.id as shard_id, + s.size / 1024^3 as size_gb, + s."primary" as is_primary, + s.node['name'] as node_name, + s.node['id'] as node_id + FROM sys.shards s + LEFT JOIN information_schema.table_partitions p + ON s.table_name = p.table_name + AND s.schema_name = p.table_schema + AND s.partition_ident = p.partition_ident + WHERE s.state = 'STARTED' + ORDER BY s.size ASC + """ + + result = self.client.execute_query(query) + + # Group by table/partition to get aggregated stats + table_partition_stats = {} + for row in result.get('rows', []): + # Get zone information from our nodes data + node_id = row[7] + zone = next((node.zone for node in self.nodes if node.id == node_id), 'unknown') + + # Create table key with schema + 
schema_name = row[0] or 'doc' + table_name = row[1] + table_display = table_name + if schema_name and schema_name != 'doc': + table_display = f"{schema_name}.{table_name}" + + # Create partition key + partition_key = row[2] or "N/A" + + # Create combined key + key = (table_display, partition_key) + + if key not in table_partition_stats: + table_partition_stats[key] = { + 'sizes': [], + 'primary_count': 0, + 'replica_count': 0, + 'total_size': 0.0 + } + + # Aggregate stats + stats = table_partition_stats[key] + size_gb = float(row[4]) if row[4] else 0.0 + stats['sizes'].append(size_gb) + stats['total_size'] += size_gb + if row[5]: # is_primary + stats['primary_count'] += 1 + else: + stats['replica_count'] += 1 + + # Sort by average size ascending (smallest first) and return top tables/partitions + sorted_stats = [] + for (table_name, partition_key), stats in table_partition_stats.items(): + avg_size = sum(stats['sizes']) / len(stats['sizes']) if stats['sizes'] else 0 + sorted_stats.append({ + 'table_name': table_name, + 'partition_key': partition_key, + 'stats': stats, + 'avg_size': avg_size + }) + + # Sort by average size and take the top 'limit' entries + sorted_stats.sort(key=lambda x: x['avg_size']) + return sorted_stats[:limit] + + def get_table_size_breakdown(self, limit: Optional[int] = 10, order: str = 'largest') -> List[Dict[str, Any]]: + """Get table/partition size breakdown, sorted by total size + + Args: + limit: Number of tables/partitions to return (None for all) + order: 'largest' for biggest first, 'smallest' for smallest first + + Returns: + List of table/partition stats with size information + """ + query = """ + SELECT + s.schema_name, + s.table_name, + translate(p.values::text, ':{}', '=()') as partition_values, + s.size / 1024^3 as size_gb, + s."primary" as is_primary + FROM sys.shards s + LEFT JOIN information_schema.table_partitions p + ON s.table_name = p.table_name + AND s.schema_name = p.table_schema + AND s.partition_ident = 
p.partition_ident + WHERE s.state = 'STARTED' + """ + + result = self.client.execute_query(query) + + # Group by table/partition to get aggregated stats + table_partition_stats = {} + for row in result.get('rows', []): + schema_name = row[0] or 'doc' + table_name = row[1] + table_display = table_name + if schema_name and schema_name != 'doc': + table_display = f"{schema_name}.{table_name}" + + # Create partition key + partition_key = row[2] or "N/A" + + # Create combined key + key = (table_display, partition_key) + + if key not in table_partition_stats: + table_partition_stats[key] = { + 'sizes': [], + 'primary_count': 0, + 'replica_count': 0, + 'total_size': 0.0 + } + + # Aggregate stats + stats = table_partition_stats[key] + size_gb = float(row[3]) if row[3] else 0.0 + stats['sizes'].append(size_gb) + stats['total_size'] += size_gb + if row[4]: # is_primary + stats['primary_count'] += 1 + else: + stats['replica_count'] += 1 + + # Convert to list and calculate derived stats + table_stats = [] + for (table_name, partition_key), stats in table_partition_stats.items(): + total_shards = stats['primary_count'] + stats['replica_count'] + min_size = min(stats['sizes']) if stats['sizes'] else 0.0 + max_size = max(stats['sizes']) if stats['sizes'] else 0.0 + avg_size = stats['total_size'] / total_shards if total_shards > 0 else 0.0 + + table_stats.append({ + 'table_name': table_name, + 'partition': partition_key, + 'total_shards': total_shards, + 'primary_count': stats['primary_count'], + 'replica_count': stats['replica_count'], + 'min_size': min_size, + 'avg_size': avg_size, + 'max_size': max_size, + 'total_size': stats['total_size'] + }) + + # Sort by total size + reverse = order == 'largest' + table_stats.sort(key=lambda x: x['total_size'], reverse=reverse) + + return table_stats if limit is None else table_stats[:limit] + def get_cluster_overview(self) -> Dict[str, Any]: """Get a comprehensive overview of the cluster""" # Get cluster watermark settings @@ -872,6 
+1155,12 @@ def get_cluster_recovery_status(self, return recoveries + def get_problematic_shards(self, + table_name: Optional[str] = None, + node_name: Optional[str] = None) -> List[Dict[str, Any]]: + """Get shards that need attention but aren't actively recovering""" + return self.client.get_problematic_shards(table_name, node_name) + def get_recovery_summary(self, recoveries: List[RecoveryInfo]) -> Dict[str, Any]: """Generate a summary of recovery operations""" @@ -966,18 +1255,23 @@ def _format_recovery_table(self, recoveries: List[RecoveryInfo]) -> str: return " No recoveries of this type" # Table headers - headers = ["Table", "Shard", "Node", "Type", "Stage", "Progress", "Size(GB)", "Time(s)"] + headers = ["Table", "Shard", "Node", "Recovery", "Stage", "Progress", "Size(GB)", "Time(s)"] # Calculate column widths col_widths = [len(h) for h in headers] rows = [] for recovery in recoveries: + # Format table name with partition values if available + table_display = f"{recovery.schema_name}.{recovery.table_name}" + if recovery.partition_values: + table_display = f"{table_display} {recovery.partition_values}" + row = [ - f"{recovery.schema_name}.{recovery.table_name}", + table_display, str(recovery.shard_id), recovery.node_name, - recovery.shard_type, + recovery.recovery_type, recovery.stage, f"{recovery.overall_progress:.1f}%", f"{recovery.size_gb:.1f}", @@ -1003,3 +1297,170 @@ def _format_recovery_table(self, recoveries: List[RecoveryInfo]) -> str: output.append(data_row) return "\n".join(output) + + +class ActiveShardMonitor: + """Monitor active shard checkpoint progression over time""" + + def __init__(self, client: CrateDBClient): + self.client = client + + def compare_snapshots(self, snapshot1: List['ActiveShardSnapshot'], + snapshot2: List['ActiveShardSnapshot'], + min_activity_threshold: int = 0) -> List['ActiveShardActivity']: + """Compare two snapshots and return activity data for shards present in both + + Args: + snapshot1: First snapshot (baseline) + 
snapshot2: Second snapshot (comparison) + min_activity_threshold: Minimum checkpoint delta to consider active (default: 0) + """ + from .database import ActiveShardActivity + + # Create lookup dict for snapshot1 + snapshot1_dict = {snap.shard_identifier: snap for snap in snapshot1} + + activities = [] + + for snap2 in snapshot2: + snap1 = snapshot1_dict.get(snap2.shard_identifier) + if snap1: + # Calculate local checkpoint delta + local_checkpoint_delta = snap2.local_checkpoint - snap1.local_checkpoint + time_diff = snap2.timestamp - snap1.timestamp + + # Filter based on actual activity between snapshots + if local_checkpoint_delta >= min_activity_threshold: + activity = ActiveShardActivity( + schema_name=snap2.schema_name, + table_name=snap2.table_name, + shard_id=snap2.shard_id, + node_name=snap2.node_name, + is_primary=snap2.is_primary, + partition_ident=snap2.partition_ident, + local_checkpoint_delta=local_checkpoint_delta, + snapshot1=snap1, + snapshot2=snap2, + time_diff_seconds=time_diff + ) + activities.append(activity) + + # Sort by activity (highest checkpoint delta first) + activities.sort(key=lambda x: x.local_checkpoint_delta, reverse=True) + + return activities + + def format_activity_display(self, activities: List['ActiveShardActivity'], + show_count: int = 10, watch_mode: bool = False) -> str: + """Format activity data for console display""" + if not activities: + return "✅ No active shards with significant checkpoint progression found" + + # Limit to requested count + activities = activities[:show_count] + + # Calculate observation period for context + if activities: + observation_period = activities[0].time_diff_seconds + output = [f"\n🔥 Most Active Shards ({len(activities)} shown, {observation_period:.0f}s observation period)"] + else: + output = [f"\n🔥 Most Active Shards ({len(activities)} shown, sorted by checkpoint activity)"] + + output.append("") + + # Add activity rate context + if activities: + total_activity = sum(a.local_checkpoint_delta 
for a in activities) + avg_rate = sum(a.activity_rate for a in activities) / len(activities) + output.append(f"[dim]Total checkpoint activity: {total_activity:,} changes, Average rate: {avg_rate:.1f}/sec[/dim]") + output.append("") + + # Create table headers + headers = ["Rank", "Schema.Table", "Shard", "Partition", "Node", "Type", "Checkpoint Δ", "Rate/sec", "Trend"] + + # Calculate column widths + col_widths = [len(h) for h in headers] + + # Prepare rows + rows = [] + for i, activity in enumerate(activities, 1): + # Format values + rank = str(i) + table_id = activity.table_identifier + shard_id = str(activity.shard_id) + partition = activity.partition_ident[:14] + "..." if len(activity.partition_ident) > 14 else activity.partition_ident or "-" + node = activity.node_name + shard_type = "P" if activity.is_primary else "R" + checkpoint_delta = f"{activity.local_checkpoint_delta:,}" + rate = f"{activity.activity_rate:.1f}" if activity.activity_rate >= 0.1 else "<0.1" + + # Calculate activity trend indicator + if activity.activity_rate >= 100: + trend = "🔥 HOT" + elif activity.activity_rate >= 50: + trend = "📈 HIGH" + elif activity.activity_rate >= 10: + trend = "📊 MED" + else: + trend = "📉 LOW" + + row = [rank, table_id, shard_id, partition, node, shard_type, checkpoint_delta, rate, trend] + rows.append(row) + + # Update column widths + for j, cell in enumerate(row): + col_widths[j] = max(col_widths[j], len(cell)) + + # Format table + header_row = " " + " | ".join(h.ljust(w) for h, w in zip(headers, col_widths)) + output.append(header_row) + output.append(" " + "-" * (len(header_row) - 3)) + + # Data rows + for row in rows: + data_row = " " + " | ".join(cell.ljust(w) for cell, w in zip(row, col_widths)) + output.append(data_row) + + # Only show legend and insights in non-watch mode + if not watch_mode: + output.append("") + output.append("Legend:") + output.append(" • Checkpoint Δ: Write operations during observation period") + output.append(" • Rate/sec: Checkpoint 
changes per second") + output.append(" • Partition: partition_ident (truncated if >14 chars, '-' if none)") + output.append(" • Type: P=Primary, R=Replica") + output.append(" • Trend: 🔥 HOT (≥100/s), 📈 HIGH (≥50/s), 📊 MED (≥10/s), 📉 LOW (<10/s)") + + # Add insights about activity patterns + if activities: + output.append("") + output.append("Insights:") + + # Count by trend + hot_count = len([a for a in activities if a.activity_rate >= 100]) + high_count = len([a for a in activities if 50 <= a.activity_rate < 100]) + med_count = len([a for a in activities if 10 <= a.activity_rate < 50]) + low_count = len([a for a in activities if a.activity_rate < 10]) + + if hot_count > 0: + output.append(f" • {hot_count} HOT shards (≥100 changes/sec) - consider load balancing") + if high_count > 0: + output.append(f" • {high_count} HIGH activity shards - monitor capacity") + if med_count > 0: + output.append(f" • {med_count} MEDIUM activity shards - normal operation") + if low_count > 0: + output.append(f" • {low_count} LOW activity shards - occasional writes") + + # Identify patterns + primary_activities = [a for a in activities if a.is_primary] + if len(primary_activities) == len(activities): + output.append(" • All active shards are PRIMARY - normal write pattern") + elif len(primary_activities) < len(activities) * 0.5: + output.append(" • Many REPLICA shards active - possible recovery/replication activity") + + # Node concentration + nodes = set(a.node_name for a in activities) + if len(nodes) <= 2: + output.append(f" • Activity concentrated on {len(nodes)} node(s) - consider redistribution") + + return "\n".join(output) diff --git a/src/xmover/cli.py b/src/xmover/cli.py index 15b6073..569d4cb 100644 --- a/src/xmover/cli.py +++ b/src/xmover/cli.py @@ -5,6 +5,7 @@ import sys import time import os +import json from typing import Optional try: import click @@ -19,8 +20,9 @@ sys.exit(1) from .database import CrateDBClient -from .analyzer import ShardAnalyzer, RecoveryMonitor 
+from .analyzer import ShardAnalyzer, RecoveryMonitor, ActiveShardMonitor from .distribution_analyzer import DistributionAnalyzer +from .shard_size_monitor import ShardSizeMonitor, validate_rules_file console = Console() @@ -46,31 +48,73 @@ def format_percentage(value: float) -> str: return f"[{color}]{value:.1f}%[/{color}]" +def format_table_display_with_partition(schema_name: str, table_name: str, partition_values: str = None) -> str: + """Format table display with partition values if available""" + # Create base table name + if schema_name and schema_name != 'doc': + base_display = f"{schema_name}.{table_name}" + else: + base_display = table_name + + # Add partition values if available + if partition_values: + return f"{base_display} {partition_values}" + else: + return base_display + + def format_translog_info(recovery_info) -> str: - """Format translog size information with color coding""" - tl_bytes = recovery_info.translog_size_bytes + """Format translog size information with color coding showing both total and uncommitted sizes""" + tl_total_bytes = recovery_info.translog_size_bytes + tl_uncommitted_bytes = recovery_info.translog_uncommitted_bytes - # Only show if significant (>10MB for production) - if tl_bytes < 10 * 1024 * 1024: # 10MB for production + # Only show if significant (>10MB for production) - check uncommitted size primarily + if tl_uncommitted_bytes < 10 * 1024 * 1024 and tl_total_bytes < 50 * 1024 * 1024: # 10MB uncommitted or 50MB total return "" - tl_gb = recovery_info.translog_size_gb + tl_total_gb = recovery_info.translog_size_gb + tl_uncommitted_gb = recovery_info.translog_uncommitted_gb + uncommitted_percentage = recovery_info.translog_uncommitted_percentage - # Color coding based on size - if tl_gb >= 5.0: + # Color coding based on uncommitted size and percentage + # Round percentage to handle floating-point precision issues + rounded_percentage = round(uncommitted_percentage, 1) + if tl_uncommitted_gb >= 5.0 or rounded_percentage >= 
80.0: color = "red" - elif tl_gb >= 1.0: + elif tl_uncommitted_gb >= 1.0 or rounded_percentage >= 50.0: color = "yellow" else: color = "green" - # Format size - if tl_gb >= 1.0: - size_str = f"{tl_gb:.1f}GB" + # Format sizes + if tl_total_gb >= 1.0: + total_str = f"{tl_total_gb:.1f}GB" + else: + total_str = f"{tl_total_gb*1000:.0f}MB" + + if tl_uncommitted_gb >= 1.0: + uncommitted_str = f"{tl_uncommitted_gb:.1f}GB" else: - size_str = f"{tl_gb*1000:.0f}MB" + uncommitted_str = f"{tl_uncommitted_gb*1000:.0f}MB" - return f" [dim]([{color}]TL:{size_str}[/{color}])[/dim]" + return f" [dim]([{color}]TL:{total_str} / {uncommitted_str} / {uncommitted_percentage:.0f}%[/{color}])[/dim]" + + +def format_recovery_progress(recovery_info) -> str: + """Format recovery progress, using sequence number progress for replicas when available""" + if not recovery_info.is_primary and recovery_info.seq_no_progress is not None: + # For replica shards, show sequence number progress if available + seq_progress = recovery_info.seq_no_progress + traditional_progress = recovery_info.overall_progress + + # If sequence progress is significantly different from traditional progress, show both + if abs(seq_progress - traditional_progress) > 5.0: + return f"{seq_progress:.1f}% (seq) / {traditional_progress:.1f}% (rec)" + else: + return f"{seq_progress:.1f}% (seq)" + else: + # For primary shards or when sequence progress unavailable, use traditional progress + return f"{recovery_info.overall_progress:.1f}%" @click.group() @@ -99,9 +143,18 @@ def main(ctx): @main.command() @click.option('--table', '-t', help='Analyze specific table only') +@click.option('--largest', type=int, help='Show N largest tables/partitions by size') +@click.option('--smallest', type=int, help='Show N smallest tables/partitions by size') +@click.option('--no-zero-size', is_flag=True, default=False, help='Exclude zero-sized tables from smallest results') @click.pass_context -def analyze(ctx, table: Optional[str]): - """Analyze 
current shard distribution across nodes and zones""" +def analyze(ctx, table: Optional[str], largest: Optional[int], smallest: Optional[int], no_zero_size: bool): + """Analyze current shard distribution across nodes and zones + + Use --largest N to show the N largest tables/partitions by total size. + Use --smallest N to show the N smallest tables/partitions by total size. + Use --no-zero-size with --smallest to exclude zero-sized tables from results. + Both options properly handle partitioned tables and show detailed size breakdowns. + """ client = ctx.obj['client'] analyzer = ShardAnalyzer(client) @@ -182,6 +235,280 @@ def analyze(ctx, table: Optional[str]): ) console.print(node_table) + console.print() + + # Shard Size Overview + size_overview = analyzer.get_shard_size_overview() + + size_table = Table(title="Shard Size Distribution", box=box.ROUNDED) + size_table.add_column("Size Range", style="cyan") + size_table.add_column("Count", justify="right", style="magenta") + size_table.add_column("Percentage", justify="right", style="green") + size_table.add_column("Avg Size", justify="right", style="blue") + size_table.add_column("Max Size", justify="right", style="red") + size_table.add_column("Total Size", justify="right", style="yellow") + + total_shards = size_overview['total_shards'] + + # Define color coding thresholds + large_shards_threshold = 0 # warn if ANY shards >=50GB (red flag) + small_shards_percentage_threshold = 40 # warn if >40% of shards are small (<1GB) + + for bucket_name, bucket_data in size_overview['size_buckets'].items(): + count = bucket_data['count'] + avg_size = bucket_data['avg_size_gb'] + total_size = bucket_data['total_size'] + percentage = (count / total_shards * 100) if total_shards > 0 else 0 + + # Apply color coding + count_str = str(count) + percentage_str = f"{percentage:.1f}%" + + # Color code large shards (>=50GB) - ANY large shard is a red flag + if bucket_name == '>=50GB' and count > large_shards_threshold: + count_str = 
f"[red]{count}[/red]" + percentage_str = f"[red]{percentage:.1f}%[/red]" + + # Color code if too many very small shards (<1GB) + if bucket_name == '<1GB' and percentage > small_shards_percentage_threshold: + count_str = f"[yellow]{count}[/yellow]" + percentage_str = f"[yellow]{percentage:.1f}%[/yellow]" + + size_table.add_row( + bucket_name, + count_str, + percentage_str, + f"{avg_size:.2f}GB" if avg_size > 0 else "0GB", + f"{bucket_data['max_size']:.2f}GB" if bucket_data['max_size'] > 0 else "0GB", + format_size(total_size) + ) + + console.print(size_table) + + # Add footer showing total number of tables/partitions + all_tables = analyzer.get_table_size_breakdown(limit=None) + total_tables_partitions = len(all_tables) + console.print(f"[dim]📊 Total: {total_tables_partitions} table/partition(s) in cluster[/dim]") + + # Add schema breakdown table + schema_stats = {} + for table_info in all_tables: + # Extract schema from table name (format: "schema.table" or just "table") + table_name = table_info['table_name'] + if '.' 
in table_name: + schema = table_name.split('.')[0] + else: + schema = 'doc' # Default schema + + partition = table_info['partition'] + has_partition = partition != 'N/A' + + if schema not in schema_stats: + schema_stats[schema] = { + 'tables': 0, + 'partitioned_tables': set(), + 'total_partitions': 0 + } + + if has_partition: + # This is a partitioned table + base_table_name = table_name + schema_stats[schema]['partitioned_tables'].add(base_table_name) + schema_stats[schema]['total_partitions'] += 1 + else: + # This is a regular table + schema_stats[schema]['tables'] += 1 + + # Create schema breakdown table + console.print() + schema_table = Table(title="Schema Breakdown", box=box.ROUNDED) + schema_table.add_column("Schema", style="cyan") + schema_table.add_column("Tables", justify="right", style="green") + schema_table.add_column("Partitioned Tables", justify="right", style="magenta") + schema_table.add_column("Total Partitions", justify="right", style="yellow") + + # Sort schemas alphabetically (case-insensitive) + for schema in sorted(schema_stats.keys(), key=str.lower): + stats = schema_stats[schema] + tables_count = stats['tables'] + partitioned_tables_count = len(stats['partitioned_tables']) + total_partitions = stats['total_partitions'] + + schema_table.add_row( + schema, + str(tables_count), + str(partitioned_tables_count), + str(total_partitions) + ) + + console.print(schema_table) + + # Add warnings if thresholds are exceeded + warnings = [] + if size_overview['large_shards_count'] > large_shards_threshold: + warnings.append(f"[red]🔥 CRITICAL: {size_overview['large_shards_count']} large shards (>=50GB) detected - IMMEDIATE ACTION REQUIRED![/red]") + warnings.append(f"[red] Large shards cause slow recovery, memory pressure, and performance issues[/red]") + + # Calculate percentage of very small shards (<1GB) + very_small_count = size_overview['size_buckets']['<1GB']['count'] + very_small_percentage = (very_small_count / total_shards * 100) if total_shards 
> 0 else 0 + + if very_small_percentage > small_shards_percentage_threshold: + warnings.append(f"[yellow]⚠️ {very_small_percentage:.1f}% of shards are very small (<1GB) - consider optimizing shard allocation[/yellow]") + warnings.append(f"[yellow] Too many small shards create metadata overhead and reduce efficiency[/yellow]") + + if warnings: + console.print() + for warning in warnings: + console.print(warning) + + # Show compact table/partition breakdown of large shards if any exist + if size_overview['large_shards_count'] > 0: + console.print() + large_shards_details = analyzer.get_large_shards_details() + + # Aggregate by table/partition + table_partition_stats = {} + for shard in large_shards_details: + # Create table key with schema + table_display = shard['table_name'] + if shard['schema_name'] and shard['schema_name'] != 'doc': + table_display = f"{shard['schema_name']}.{shard['table_name']}" + + # Create partition key + partition_key = shard['partition_values'] or "N/A" + + # Create combined key + key = (table_display, partition_key) + + if key not in table_partition_stats: + table_partition_stats[key] = { + 'sizes': [], + 'primary_count': 0, + 'replica_count': 0, + 'total_size': 0.0 + } + + # Aggregate stats + stats = table_partition_stats[key] + stats['sizes'].append(shard['size_gb']) + stats['total_size'] += shard['size_gb'] + if shard['is_primary']: + stats['primary_count'] += 1 + else: + stats['replica_count'] += 1 + + # Create compact table + large_shards_table = Table(title=f"Large Shards Breakdown by Table/Partition (>=50GB)", box=box.ROUNDED) + large_shards_table.add_column("Table", style="cyan") + large_shards_table.add_column("Partition", style="blue") + large_shards_table.add_column("Shards", justify="right", style="magenta") + large_shards_table.add_column("P/R", justify="center", style="yellow") + large_shards_table.add_column("Min Size", justify="right", style="green") + large_shards_table.add_column("Avg Size", justify="right", style="red") 
+ large_shards_table.add_column("Max Size", justify="right", style="red") + large_shards_table.add_column("Total Size", justify="right", style="red") + + # Sort by total size descending (most problematic first) + sorted_stats = sorted(table_partition_stats.items(), key=lambda x: x[1]['total_size'], reverse=True) + + for (table_name, partition_key), stats in sorted_stats: + # Format partition display + partition_display = partition_key + if partition_display != "N/A" and len(partition_display) > 25: + partition_display = partition_display[:22] + "..." + + # Calculate size stats + sizes = stats['sizes'] + min_size = min(sizes) + avg_size = sum(sizes) / len(sizes) + max_size = max(sizes) + total_size = stats['total_size'] + total_shards = len(sizes) + + # Format primary/replica ratio + p_r_display = f"{stats['primary_count']}P/{stats['replica_count']}R" + + large_shards_table.add_row( + table_name, + partition_display, + str(total_shards), + p_r_display, + f"{min_size:.1f}GB", + f"{avg_size:.1f}GB", + f"{max_size:.1f}GB", + f"{total_size:.1f}GB" + ) + + console.print(large_shards_table) + + # Add summary stats + total_primary = sum(stats['primary_count'] for stats in table_partition_stats.values()) + total_replica = sum(stats['replica_count'] for stats in table_partition_stats.values()) + affected_table_partitions = len(table_partition_stats) + + console.print() + console.print(f"[dim]📊 Summary: {total_primary} primary, {total_replica} replica shards across {affected_table_partitions} table/partition(s)[/dim]") + + # Show compact table/partition breakdown of smallest shards (top 10) + console.print() + small_shards_details = analyzer.get_small_shards_details(limit=10) + + if small_shards_details: + # Create compact table + small_shards_table = Table(title=f"Smallest Shards Breakdown by Table/Partition (Top 10)", box=box.ROUNDED) + small_shards_table.add_column("Table", style="cyan") + small_shards_table.add_column("Partition", style="blue") + 
small_shards_table.add_column("Shards", justify="right", style="magenta") + small_shards_table.add_column("P/R", justify="center", style="yellow") + small_shards_table.add_column("Min Size", justify="right", style="green") + small_shards_table.add_column("Avg Size", justify="right", style="red") + small_shards_table.add_column("Max Size", justify="right", style="red") + small_shards_table.add_column("Total Size", justify="right", style="red") + + for entry in small_shards_details: + table_name = entry['table_name'] + partition_key = entry['partition_key'] + stats = entry['stats'] + + # Format partition display + partition_display = partition_key + if partition_display != "N/A" and len(partition_display) > 25: + partition_display = partition_display[:22] + "..." + + # Calculate size stats + sizes = stats['sizes'] + min_size = min(sizes) + avg_size = sum(sizes) / len(sizes) + max_size = max(sizes) + total_size = stats['total_size'] + total_shards = len(sizes) + + # Format primary/replica ratio + p_r_display = f"{stats['primary_count']}P/{stats['replica_count']}R" + + small_shards_table.add_row( + table_name, + partition_display, + str(total_shards), + p_r_display, + f"{min_size:.1f}GB", + f"{avg_size:.1f}GB", + f"{max_size:.1f}GB", + f"{total_size:.1f}GB" + ) + + console.print(small_shards_table) + + # Add summary stats for smallest shards + total_small_primary = sum(entry['stats']['primary_count'] for entry in small_shards_details) + total_small_replica = sum(entry['stats']['replica_count'] for entry in small_shards_details) + small_table_partitions = len(small_shards_details) + + console.print() + console.print(f"[dim]📊 Summary: {total_small_primary} primary, {total_small_replica} replica shards across {small_table_partitions} table/partition(s) with smallest average sizes[/dim]") + + console.print() # Table-specific analysis if requested if table: @@ -201,6 +528,119 @@ def analyze(ctx, table: Optional[str]): console.print(table_summary) + # Show largest tables if 
requested + if largest: + console.print() + largest_tables = analyzer.get_table_size_breakdown(limit=largest, order='largest') + + largest_table = Table(title=f"Largest Tables/Partitions by Size (Top {largest})", box=box.ROUNDED) + largest_table.add_column("Table", style="cyan") + largest_table.add_column("Partition", style="magenta") + largest_table.add_column("Shards", justify="right", style="yellow") + largest_table.add_column("P/R", justify="right", style="blue") + largest_table.add_column("Min Size", justify="right", style="green") + largest_table.add_column("Avg Size", justify="right", style="bright_green") + largest_table.add_column("Max Size", justify="right", style="red") + largest_table.add_column("Total Size", justify="right", style="bright_red") + + for entry in largest_tables: + table_name = entry['table_name'] + partition = entry['partition'] + total_shards = entry['total_shards'] + primary_count = entry['primary_count'] + replica_count = entry['replica_count'] + min_size = entry['min_size'] + avg_size = entry['avg_size'] + max_size = entry['max_size'] + total_size = entry['total_size'] + + largest_table.add_row( + table_name, + partition, + str(total_shards), + f"{primary_count}P/{replica_count}R", + f"{min_size:.1f}GB", + f"{avg_size:.1f}GB", + f"{max_size:.1f}GB", + f"{total_size:.1f}GB" + ) + + console.print(largest_table) + + # Add summary stats + total_largest_size = sum(entry['total_size'] for entry in largest_tables) + total_largest_shards = sum(entry['total_shards'] for entry in largest_tables) + + console.print() + console.print(f"[dim]📊 Summary: {total_largest_shards} total shards using {total_largest_size:.1f}GB across {len(largest_tables)} largest table/partition(s)[/dim]") + + # Show smallest tables if requested + if smallest: + console.print() + all_smallest = analyzer.get_table_size_breakdown(limit=None, order='smallest') + + # Filter based on no_zero_size flag + if no_zero_size: + # Use tolerance for effectively zero-sized tables 
(handles display formatting) + # Since display uses {size:.1f}GB format, anything < 0.05GB displays as 0.0GB + zero_tolerance = 0.05 # Consider anything that displays as 0.0GB as effectively zero + + # Count effectively zero-sized tables + zero_sized_count = len([t for t in all_smallest if t['total_size'] < zero_tolerance]) + # Filter out effectively zero-sized tables and take the requested number + non_zero_tables = [t for t in all_smallest if t['total_size'] >= zero_tolerance] + smallest_tables = non_zero_tables[:smallest] + + if zero_sized_count > 0: + console.print(f"[dim]ℹ️ Found {zero_sized_count} table/partition(s) with 0.0GB size (excluded from results)[/dim]") + console.print() + else: + smallest_tables = all_smallest[:smallest] + + smallest_table = Table(title=f"Smallest Tables/Partitions by Size (Top {len(smallest_tables)})", box=box.ROUNDED) + smallest_table.add_column("Table", style="cyan") + smallest_table.add_column("Partition", style="magenta") + smallest_table.add_column("Shards", justify="right", style="yellow") + smallest_table.add_column("P/R", justify="right", style="blue") + smallest_table.add_column("Min Size", justify="right", style="green") + smallest_table.add_column("Avg Size", justify="right", style="bright_green") + smallest_table.add_column("Max Size", justify="right", style="red") + smallest_table.add_column("Total Size", justify="right", style="bright_red") + + for entry in smallest_tables: + table_name = entry['table_name'] + partition = entry['partition'] + total_shards = entry['total_shards'] + primary_count = entry['primary_count'] + replica_count = entry['replica_count'] + min_size = entry['min_size'] + avg_size = entry['avg_size'] + max_size = entry['max_size'] + total_size = entry['total_size'] + + smallest_table.add_row( + table_name, + partition, + str(total_shards), + f"{primary_count}P/{replica_count}R", + f"{min_size:.1f}GB", + f"{avg_size:.1f}GB", + f"{max_size:.1f}GB", + f"{total_size:.1f}GB" + ) + + 
console.print(smallest_table) + + # Add summary stats + total_smallest_size = sum(entry['total_size'] for entry in smallest_tables) + total_smallest_shards = sum(entry['total_shards'] for entry in smallest_tables) + + console.print() + if no_zero_size and len([t for t in all_smallest if t['total_size'] < 0.05]) > 0: + console.print(f"[dim]📊 Summary: {total_smallest_shards} total shards using {total_smallest_size:.3f}GB across {len(smallest_tables)} smallest non-zero table/partition(s)[/dim]") + else: + console.print(f"[dim]📊 Summary: {total_smallest_shards} total shards using {total_smallest_size:.3f}GB across {len(smallest_tables)} smallest table/partition(s)[/dim]") + @main.command() @click.option('--table', '-t', help='Find candidates for specific table only') @@ -1094,6 +1534,7 @@ def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: # Track previous state for change detection previous_recoveries = {} previous_timestamp = None + last_transitioning_display = None first_run = True while True: @@ -1118,10 +1559,9 @@ def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: recovery_key = f"{recovery.schema_name}.{recovery.table_name}.{recovery.shard_id}.{recovery.node_name}" # Create complete table name - if recovery.schema_name == "doc": - table_display = recovery.table_name - else: - table_display = f"{recovery.schema_name}.{recovery.table_name}" + table_display = format_table_display_with_partition( + recovery.schema_name, recovery.table_name, recovery.partition_values + ) # Count active vs completed if recovery.stage == "DONE" and recovery.overall_progress >= 100.0: @@ -1145,9 +1585,17 @@ def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: translog_info = format_translog_info(recovery) if diff > 0: - changes.append(f"[green]📈[/green] {table_display} S{recovery.shard_id} {recovery.overall_progress:.1f}% (+{diff:.1f}%) {recovery.size_gb:.1f}GB{translog_info}{node_route}") + table_display 
= format_table_display_with_partition( + recovery.schema_name, recovery.table_name, recovery.partition_values + ) + progress_info = format_recovery_progress(recovery) + changes.append(f"[green]📈[/green] {table_display} S{recovery.shard_id} {recovery.recovery_type} {progress_info} (+{diff:.1f}%) {recovery.size_gb:.1f}GB{translog_info}{node_route}") else: - changes.append(f"[yellow]📉[/yellow] {table_display} S{recovery.shard_id} {recovery.overall_progress:.1f}% ({diff:.1f}%) {recovery.size_gb:.1f}GB{translog_info}{node_route}") + table_display = format_table_display_with_partition( + recovery.schema_name, recovery.table_name, recovery.partition_values + ) + progress_info = format_recovery_progress(recovery) + changes.append(f"[yellow]📉[/yellow] {table_display} S{recovery.shard_id} {recovery.recovery_type} {progress_info} ({diff:.1f}%) {recovery.size_gb:.1f}GB{translog_info}{node_route}") elif prev['stage'] != recovery.stage: # Create node route display node_route = "" @@ -1159,7 +1607,11 @@ def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: # Add translog info translog_info = format_translog_info(recovery) - changes.append(f"[blue]🔄[/blue] {table_display} S{recovery.shard_id} {prev['stage']}→{recovery.stage} {recovery.size_gb:.1f}GB{translog_info}{node_route}") + table_display = format_table_display_with_partition( + recovery.schema_name, recovery.table_name, recovery.partition_values + ) + progress_info = format_recovery_progress(recovery) + changes.append(f"[blue]🔄[/blue] {table_display} S{recovery.shard_id} {recovery.recovery_type} {prev['stage']}→{recovery.stage} {progress_info} {recovery.size_gb:.1f}GB{translog_info}{node_route}") else: # New recovery - show based on include_transitioning flag or first run if first_run or include_transitioning or (recovery.overall_progress < 100.0 or recovery.stage != "DONE"): @@ -1174,7 +1626,11 @@ def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: # Add translog info 
translog_info = format_translog_info(recovery) - changes.append(f"{status_icon} {table_display} S{recovery.shard_id} {recovery.stage} {recovery.overall_progress:.1f}% {recovery.size_gb:.1f}GB{translog_info}{node_route}") + table_display = format_table_display_with_partition( + recovery.schema_name, recovery.table_name, recovery.partition_values + ) + progress_info = format_recovery_progress(recovery) + changes.append(f"{status_icon} {table_display} S{recovery.shard_id} {recovery.recovery_type} {recovery.stage} {progress_info} {recovery.size_gb:.1f}GB{translog_info}{node_route}") # Store current state for next comparison previous_recoveries[recovery_key] = { @@ -1182,29 +1638,108 @@ def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: 'stage': recovery.stage } - # Always show a status line - if not recoveries: - console.print(f"{current_time} | [green]No recoveries - cluster stable[/green]") + # Get problematic shards for comprehensive status + problematic_shards = recovery_monitor.get_problematic_shards(table, node) + + # Filter out shards that are already being recovered + non_recovering_shards = [] + if problematic_shards: + for shard in problematic_shards: + # Check if this shard is already in our recoveries list + is_recovering = any( + r.shard_id == shard['shard_id'] and + r.table_name == shard['table_name'] and + r.schema_name == shard['schema_name'] + for r in recoveries + ) + if not is_recovering: + non_recovering_shards.append(shard) + + # Always show a comprehensive status line + if not recoveries and not non_recovering_shards: + console.print(f"{current_time} | [green]No issues - cluster stable[/green]") + previous_recoveries.clear() + elif not recoveries and non_recovering_shards: + console.print(f"{current_time} | [yellow]{len(non_recovering_shards)} shards need attention (not recovering)[/yellow]") + # Show first few problematic shards + for shard in non_recovering_shards[:5]: + table_display = 
format_table_display_with_partition( + shard['schema_name'], shard['table_name'], shard.get('partition_values') + ) + primary_indicator = "P" if shard.get('primary') else "R" + console.print(f" | [red]⚠[/red] {table_display} S{shard['shard_id']}{primary_indicator} {shard['state']}") + if len(non_recovering_shards) > 5: + console.print(f" | [dim]... and {len(non_recovering_shards) - 5} more[/dim]") previous_recoveries.clear() else: - # Build status message - status = "" + # Build status message for active recoveries + status_parts = [] if active_count > 0: - status = f"{active_count} active" + status_parts.append(f"{active_count} recovering") if completed_count > 0: - status += f", {completed_count} done" if status else f"{completed_count} done" + status_parts.append(f"{completed_count} done") + if non_recovering_shards: + status_parts.append(f"[yellow]{len(non_recovering_shards)} awaiting recovery[/yellow]") + + status = " | ".join(status_parts) # Show status line with changes or periodic update if changes: console.print(f"{current_time} | {status}") for change in changes: console.print(f" | {change}") + # Show some problematic shards if there are any + if non_recovering_shards and len(changes) < 3: # Don't overwhelm the output + for shard in non_recovering_shards[:2]: + table_display = format_table_display_with_partition( + shard['schema_name'], shard['table_name'], shard.get('partition_values') + ) + primary_indicator = "P" if shard.get('primary') else "R" + console.print(f" | [red]⚠[/red] {table_display} S{shard['shard_id']}{primary_indicator} {shard['state']}") else: # Show periodic status even without changes if include_transitioning and completed_count > 0: - console.print(f"{current_time} | {status} (transitioning)") + from datetime import datetime, timedelta + current_dt = datetime.now() + + # Show transitioning details every 30 seconds or first time + should_show_details = ( + last_transitioning_display is None or + (current_dt - 
last_transitioning_display).total_seconds() >= 30 + ) + + if should_show_details: + console.print(f"{current_time} | {status} (transitioning)") + # Show details of transitioning recoveries + transitioning_recoveries = [r for r in recoveries if r.stage == "DONE" and r.overall_progress >= 100.0] + for recovery in transitioning_recoveries[:5]: # Limit to first 5 to avoid spam + # Create node route display + node_route = "" + if recovery.recovery_type == "PEER" and recovery.source_node_name: + node_route = f" {recovery.source_node_name} → {recovery.node_name}" + elif recovery.recovery_type == "DISK": + node_route = f" disk → {recovery.node_name}" + + # Add translog info + translog_info = format_translog_info(recovery) + + table_display = format_table_display_with_partition( + recovery.schema_name, recovery.table_name, recovery.partition_values + ) + progress_info = format_recovery_progress(recovery) + primary_indicator = "P" if recovery.is_primary else "R" + console.print(f" | [cyan]🔄[/cyan] {table_display} S{recovery.shard_id}{primary_indicator} {recovery.recovery_type} {recovery.stage} {progress_info} {recovery.size_gb:.1f}GB{translog_info}{node_route}") + + if len(transitioning_recoveries) > 5: + console.print(f" | [dim]... 
and {len(transitioning_recoveries) - 5} more transitioning[/dim]") + + last_transitioning_display = current_dt + else: + console.print(f"{current_time} | {status} (transitioning)") elif active_count > 0: console.print(f"{current_time} | {status} (no changes)") + elif non_recovering_shards: + console.print(f"{current_time} | {status} (issues persist)") previous_timestamp = current_time first_run = False @@ -1220,26 +1755,55 @@ def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: recovery_type_filter=recovery_type, include_transitioning=include_transitioning ) - - if final_recoveries: - console.print("\n📊 [bold]Final Recovery Summary:[/bold]") - summary = recovery_monitor.get_recovery_summary(final_recoveries) - - # Count active vs completed - active_count = len([r for r in final_recoveries if r.overall_progress < 100.0 or r.stage != "DONE"]) - completed_count = len(final_recoveries) - active_count - - console.print(f" Total recoveries: {summary['total_recoveries']}") - console.print(f" Active: {active_count}, Completed: {completed_count}") - console.print(f" Total size: {summary['total_size_gb']:.1f} GB") - console.print(f" Average progress: {summary['avg_progress']:.1f}%") - - if summary['by_type']: - console.print(f" By recovery type:") - for rec_type, stats in summary['by_type'].items(): - console.print(f" {rec_type}: {stats['count']} recoveries, {stats['avg_progress']:.1f}% avg progress") + + final_problematic_shards = recovery_monitor.get_problematic_shards(table, node) + + # Filter out shards that are already being recovered + final_non_recovering_shards = [] + if final_problematic_shards: + for shard in final_problematic_shards: + is_recovering = any( + r.shard_id == shard['shard_id'] and + r.table_name == shard['table_name'] and + r.schema_name == shard['schema_name'] + for r in final_recoveries + ) + if not is_recovering: + final_non_recovering_shards.append(shard) + + if final_recoveries or final_non_recovering_shards: + 
console.print("\n📊 [bold]Final Cluster Status Summary:[/bold]") + + if final_recoveries: + summary = recovery_monitor.get_recovery_summary(final_recoveries) + # Count active vs completed + active_count = len([r for r in final_recoveries if r.overall_progress < 100.0 or r.stage != "DONE"]) + completed_count = len(final_recoveries) - active_count + + console.print(f" Total recoveries: {summary['total_recoveries']}") + console.print(f" Active: {active_count}, Completed: {completed_count}") + console.print(f" Total size: {summary['total_size_gb']:.1f} GB") + console.print(f" Average progress: {summary['avg_progress']:.1f}%") + + if summary['by_type']: + console.print(f" By recovery type:") + for rec_type, stats in summary['by_type'].items(): + console.print(f" {rec_type}: {stats['count']} recoveries, {stats['avg_progress']:.1f}% avg progress") + + if final_non_recovering_shards: + console.print(f" [yellow]Problematic shards needing attention: {len(final_non_recovering_shards)}[/yellow]") + # Group by state for summary + by_state = {} + for shard in final_non_recovering_shards: + state = shard['state'] + if state not in by_state: + by_state[state] = 0 + by_state[state] += 1 + + for state, count in by_state.items(): + console.print(f" {state}: {count} shards") else: - console.print("\n[green]✅ No active recoveries at exit[/green]") + console.print("\n[green]✅ Cluster stable - no issues detected[/green]") return @@ -1255,18 +1819,58 @@ def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: display_output = recovery_monitor.format_recovery_display(recoveries) console.print(display_output) - if not recoveries: + # Get problematic shards for comprehensive status + problematic_shards = recovery_monitor.get_problematic_shards(table, node) + + # Filter out shards that are already being recovered + non_recovering_shards = [] + if problematic_shards: + for shard in problematic_shards: + is_recovering = any( + r.shard_id == shard['shard_id'] and + 
r.table_name == shard['table_name'] and + r.schema_name == shard['schema_name'] + for r in recoveries + ) + if not is_recovering: + non_recovering_shards.append(shard) + + if not recoveries and not non_recovering_shards: if include_transitioning: - console.print("\n[green]✅ No recoveries found (active or transitioning)[/green]") + console.print("\n[green]✅ No issues found - cluster stable[/green]") else: console.print("\n[green]✅ No active recoveries found[/green]") console.print("[dim]💡 Use --include-transitioning to see completed recoveries still transitioning[/dim]") + elif not recoveries and non_recovering_shards: + console.print(f"\n[yellow]⚠️ {len(non_recovering_shards)} shards need attention (not recovering)[/yellow]") + # Group by state for summary + by_state = {} + for shard in non_recovering_shards: + state = shard['state'] + if state not in by_state: + by_state[state] = 0 + by_state[state] += 1 + + for state, count in by_state.items(): + console.print(f" {state}: {count} shards") + + # Show first few examples + console.print(f"\nExamples:") + for shard in non_recovering_shards[:5]: + table_display = format_table_display_with_partition( + shard['schema_name'], shard['table_name'], shard.get('partition_values') + ) + primary_indicator = "P" if shard.get('primary') else "R" + console.print(f" [red]⚠[/red] {table_display} S{shard['shard_id']}{primary_indicator} {shard['state']}") + + if len(non_recovering_shards) > 5: + console.print(f" [dim]... 
and {len(non_recovering_shards) - 5} more[/dim]") else: - # Show summary + # Show recovery summary summary = recovery_monitor.get_recovery_summary(recoveries) - console.print(f"\n📊 [bold]Recovery Summary:[/bold]") - console.print(f" Total recoveries: {summary['total_recoveries']}") - console.print(f" Total size: {summary['total_size_gb']:.1f} GB") + console.print(f"\n📊 [bold]Cluster Status Summary:[/bold]") + console.print(f" Active recoveries: {summary['total_recoveries']}") + console.print(f" Total recovery size: {summary['total_size_gb']:.1f} GB") console.print(f" Average progress: {summary['avg_progress']:.1f}%") # Show breakdown by type @@ -1275,6 +1879,19 @@ def monitor_recovery(ctx, table: str, node: str, watch: bool, refresh_interval: for rec_type, stats in summary['by_type'].items(): console.print(f" {rec_type}: {stats['count']} recoveries, {stats['avg_progress']:.1f}% avg progress") + # Show problematic shards if any + if non_recovering_shards: + console.print(f"\n [yellow]Problematic shards needing attention: {len(non_recovering_shards)}[/yellow]") + by_state = {} + for shard in non_recovering_shards: + state = shard['state'] + if state not in by_state: + by_state[state] = 0 + by_state[state] += 1 + + for state, count in by_state.items(): + console.print(f" {state}: {count} shards") + console.print(f"\n[dim]💡 Use --watch flag for continuous monitoring[/dim]") except Exception as e: @@ -1490,5 +2107,914 @@ def shard_distribution(ctx, top_tables: int, table: Optional[str]): console.print(f"[dim]{traceback.format_exc()}[/dim]") +@main.command() +@click.option('--count', default=10, help='Number of most active shards to show (default: 10)') +@click.option('--interval', default=30, help='Observation interval in seconds (default: 30)') +@click.option('--min-checkpoint-delta', default=1000, help='Minimum checkpoint progression between snapshots to show shard (default: 1000)') +@click.option('--table', '-t', help='Monitor specific table only') 
+@click.option('--node', '-n', help='Monitor specific node only') +@click.option('--watch', '-w', is_flag=True, help='Continuously monitor (refresh every interval)') +@click.option('--exclude-system', is_flag=True, help='Exclude system tables (gc.*, information_schema.*)') +@click.option('--min-rate', type=float, help='Minimum activity rate (changes/sec) to show') +@click.option('--show-replicas/--hide-replicas', default=True, help='Show replica shards (default: True)') +@click.pass_context +def active_shards(ctx, count: int, interval: int, min_checkpoint_delta: int, + table: Optional[str], node: Optional[str], watch: bool, + exclude_system: bool, min_rate: Optional[float], show_replicas: bool): + """Monitor most active shards by checkpoint progression + + This command takes two snapshots of ALL started shards separated by the + observation interval, then shows the shards with the highest checkpoint + progression (activity) between the snapshots. + + Unlike other commands, this tracks ALL shards and filters based on actual + activity between snapshots, not current state. This captures shards that + become active during the observation period. + + Useful for identifying which shards are receiving the most write activity + in your cluster and understanding write patterns. 
+ + Examples: + xmover active-shards --count 20 --interval 60 # Top 20 over 60 seconds + xmover active-shards --watch --interval 30 # Continuous monitoring + xmover active-shards --table my_table --watch # Monitor specific table + xmover active-shards --node data-hot-1 --count 5 # Top 5 on specific node + xmover active-shards --min-checkpoint-delta 500 # Lower activity threshold + xmover active-shards --exclude-system --min-rate 50 # Skip system tables, min 50/sec + xmover active-shards --hide-replicas --count 20 # Only primary shards + """ + client = ctx.obj['client'] + monitor = ActiveShardMonitor(client) + + def get_filtered_snapshot(): + """Get snapshot with optional filtering""" + snapshots = client.get_active_shards_snapshot(min_checkpoint_delta=min_checkpoint_delta) + + # Apply table filter if specified + if table: + snapshots = [s for s in snapshots if s.table_name == table or + f"{s.schema_name}.{s.table_name}" == table] + + # Apply node filter if specified + if node: + snapshots = [s for s in snapshots if s.node_name == node] + + # Exclude system tables if requested + if exclude_system: + snapshots = [s for s in snapshots if not ( + s.schema_name.startswith('gc.') or + s.schema_name == 'information_schema' or + s.schema_name == 'sys' or + s.table_name.endswith('_events') or + s.table_name.endswith('_log') + )] + + return snapshots + + def run_single_analysis(): + """Run a single analysis cycle""" + if not watch: + console.print(Panel.fit("[bold blue]Active Shards Monitor[/bold blue]")) + + # Show configuration - simplified for watch mode + if watch: + config_parts = [f"{interval}s interval", f"threshold: {min_checkpoint_delta:,}", f"top {count}"] + if table: + config_parts.append(f"table: {table}") + if node: + config_parts.append(f"node: {node}") + console.print(f"[dim]{' | '.join(config_parts)}[/dim]") + else: + config_info = [ + f"Observation interval: {interval}s", + f"Min checkpoint delta: {min_checkpoint_delta:,}", + f"Show count: {count}" + ] + if 
table: + config_info.append(f"Table filter: {table}") + if node: + config_info.append(f"Node filter: {node}") + if exclude_system: + config_info.append("Excluding system tables") + if min_rate: + config_info.append(f"Min rate: {min_rate}/sec") + if not show_replicas: + config_info.append("Primary shards only") + + console.print("[dim]" + " | ".join(config_info) + "[/dim]") + console.print() + + # Take first snapshot + if not watch: + console.print("📷 Taking first snapshot...") + snapshot1 = get_filtered_snapshot() + + if not snapshot1: + console.print("[yellow]No started shards found matching criteria[/yellow]") + return + + if not watch: + console.print(f" Tracking {len(snapshot1)} started shards for activity") + console.print(f"⏱️ Waiting {interval} seconds for activity...") + + # Wait for observation interval + if watch: + # Simplified countdown for watch mode + for remaining in range(interval, 0, -1): + if remaining % 5 == 0 or remaining <= 3: # Show fewer updates + console.print(f"[dim]⏱️ {remaining}s...[/dim]", end="\r") + time.sleep(1) + console.print(" " * 15, end="\r") # Clear countdown + else: + time.sleep(interval) + + # Take second snapshot + if not watch: + console.print("📷 Taking second snapshot...") + snapshot2 = get_filtered_snapshot() + + if not snapshot2: + console.print("[yellow]No started shards found in second snapshot[/yellow]") + return + + if not watch: + console.print(f" Tracking {len(snapshot2)} started shards for activity") + + # Compare snapshots and show results + activities = monitor.compare_snapshots(snapshot1, snapshot2, min_activity_threshold=min_checkpoint_delta) + + # Apply additional filters + if not show_replicas: + activities = [a for a in activities if a.is_primary] + + if min_rate: + activities = [a for a in activities if a.activity_rate >= min_rate] + + if not activities: + console.print(f"[green]✅ No shards exceeded activity threshold ({min_checkpoint_delta:,} checkpoint changes)[/green]") + if min_rate: + 
console.print(f"[dim]Also filtered by minimum rate: {min_rate}/sec[/dim]") + else: + if not watch: + overlap_count = len(set(s.shard_identifier for s in snapshot1) & + set(s.shard_identifier for s in snapshot2)) + console.print(f"[dim]Analyzed {overlap_count} shards present in both snapshots[/dim]") + console.print(monitor.format_activity_display(activities, show_count=count, watch_mode=watch)) + + try: + if watch: + console.print("[dim]Press Ctrl+C to stop monitoring[/dim]") + console.print() + + while True: + run_single_analysis() + if watch: + console.print(f"\n[dim]━━━ Next update in {interval}s ━━━[/dim]\n") + time.sleep(interval) + else: + run_single_analysis() + + except KeyboardInterrupt: + console.print("\n[yellow]Monitoring stopped by user[/yellow]") + except Exception as e: + console.print(f"[red]Error during active shards monitoring: {e}[/red]") + import traceback + console.print(f"[dim]{traceback.format_exc()}[/dim]") + + +@main.command() +@click.option('--sizeMB', default=300, help='Minimum translog uncommitted size in MB (default: 300)') +@click.option('--execute', is_flag=True, help='Execute the replica commands after confirmation') +@click.pass_context +def problematic_translogs(ctx, sizemb: int, execute: bool): + """Find tables with problematic translog sizes and generate comprehensive shard management commands + + This command identifies tables with replica shards that have large uncommitted translog sizes + indicating replication issues. It generates a complete sequence including: + 1. Stop automatic shard rebalancing + 2. REROUTE CANCEL commands for problematic shards + 3. Set replicas to 0 commands + 4. Retention lease queries for monitoring + 5. Set replicas to 1 commands (restored from original values) + 6. Re-enable automatic shard rebalancing + With --execute, it runs them after confirmation. 
+ """ + client = ctx.obj['client'] + + console.print(Panel.fit("[bold blue]Problematic Translog Analysis[/bold blue]")) + console.print(f"[dim]Looking for tables with replica shards having translog uncommitted size > {sizemb}MB[/dim]") + console.print() + + # First query to get individual problematic shards for REROUTE CANCEL commands + individual_shards_query = """ + SELECT + sh.schema_name, + sh.table_name, + translate(p.values::text, ':{}', '=()') as partition_values, + sh.id AS shard_id, + node['name'] AS node_name, + sh.translog_stats['uncommitted_size'] / 1024^2 AS translog_uncommitted_mb + FROM + sys.shards AS sh + LEFT JOIN information_schema.table_partitions p + ON sh.table_name = p.table_name + AND sh.schema_name = p.table_schema + AND sh.partition_ident = p.partition_ident + WHERE + sh.state = 'STARTED' + AND sh.translog_stats['uncommitted_size'] > ? * 1024^2 + AND primary=FALSE + ORDER BY + translog_uncommitted_mb DESC + """ + + # Query to find tables with problematic replica shards, grouped by table/partition + summary_query = """ + SELECT + all_shards.schema_name, + all_shards.table_name, + translate(p.values::text, ':{}', '=()') as partition_values, + p.partition_ident, + COUNT(CASE WHEN all_shards.primary=FALSE AND all_shards.translog_stats['uncommitted_size'] > ? * 1024^2 THEN 1 END) as problematic_replica_shards, + MAX(CASE WHEN all_shards.primary=FALSE AND all_shards.translog_stats['uncommitted_size'] > ? 
* 1024^2 THEN all_shards.translog_stats['uncommitted_size'] / 1024^2 END) AS max_translog_uncommitted_mb, + COUNT(CASE WHEN all_shards.primary=TRUE THEN 1 END) as total_primary_shards, + COUNT(CASE WHEN all_shards.primary=FALSE THEN 1 END) as total_replica_shards, + SUM(CASE WHEN all_shards.primary=TRUE THEN all_shards.size / 1024^3 ELSE 0 END) as total_primary_size_gb, + SUM(CASE WHEN all_shards.primary=FALSE THEN all_shards.size / 1024^3 ELSE 0 END) as total_replica_size_gb + FROM + sys.shards AS all_shards + LEFT JOIN information_schema.table_partitions p + ON all_shards.table_name = p.table_name + AND all_shards.schema_name = p.table_schema + AND all_shards.partition_ident = p.partition_ident + WHERE + all_shards.state = 'STARTED' + AND all_shards.schema_name || '.' || all_shards.table_name || COALESCE(all_shards.partition_ident, '') IN ( + SELECT DISTINCT sh.schema_name || '.' || sh.table_name || COALESCE(sh.partition_ident, '') + FROM sys.shards AS sh + WHERE sh.state = 'STARTED' + AND sh.translog_stats['uncommitted_size'] > ? 
* 1024^2 + AND sh.primary=FALSE + ) + GROUP BY + all_shards.schema_name, all_shards.table_name, partition_values, p.partition_ident + ORDER BY + max_translog_uncommitted_mb DESC + """ + + try: + # Get individual shards first + individual_result = client.execute_query(individual_shards_query, [sizemb]) + individual_shards = individual_result.get('rows', []) + + # Get summary data + summary_result = client.execute_query(summary_query, [sizemb, sizemb, sizemb]) + summary_rows = summary_result.get('rows', []) + + if not individual_shards: + console.print(f"[green]✓ No tables found with replica shards having translog uncommitted size > {sizemb}MB[/green]") + return + + # Display individual problematic shards first + console.print(f"[bold]Problematic Replica Shards (translog > {sizemb}MB)[/bold]") + from rich.table import Table + individual_table = Table(box=box.ROUNDED) + individual_table.add_column("Schema", style="cyan") + individual_table.add_column("Table", style="blue") + individual_table.add_column("Partition", style="magenta") + individual_table.add_column("Shard ID", justify="right", style="yellow") + individual_table.add_column("Node", style="green") + individual_table.add_column("Translog MB", justify="right", style="red") + + for row in individual_shards: + schema_name, table_name, partition_values, shard_id, node_name, translog_mb = row + partition_display = partition_values if partition_values and partition_values != 'NULL' else "none" + + individual_table.add_row( + schema_name, + table_name, + partition_display, + str(shard_id), + node_name, + f"{translog_mb:.1f}" + ) + + console.print(individual_table) + console.print() + + console.print(f"Found {len(summary_rows)} table/partition(s) with problematic translogs:") + console.print() + + # Display summary table + results_table = Table(title=f"Tables with Problematic Replicas (translog > {sizemb}MB)", box=box.ROUNDED) + results_table.add_column("Schema", style="cyan") + results_table.add_column("Table", 
style="blue") + results_table.add_column("Partition", style="magenta") + results_table.add_column("Problematic Replicas", justify="right", style="yellow") + results_table.add_column("Max Translog MB", justify="right", style="red") + results_table.add_column("Shards (P/R)", justify="right", style="blue") + results_table.add_column("Size GB (P/R)", justify="right", style="bright_blue") + results_table.add_column("Current Replicas", justify="right", style="green") + + # Collect table/partition info and look up current replica counts + table_replica_info = [] + for row in summary_rows: + schema_name, table_name, partition_values, partition_ident, problematic_replica_shards, max_translog_mb, total_primary_shards, total_replica_shards, total_primary_size_gb, total_replica_size_gb = row + partition_display = partition_values if partition_values and partition_values != 'NULL' else "[dim]none[/dim]" + + # Look up current replica count + current_replicas = 0 + try: + if partition_values and partition_values != 'NULL': + # Partitioned table query + replica_query = """ + SELECT number_of_replicas + FROM information_schema.table_partitions + WHERE table_name = ? AND table_schema = ? AND partition_ident = ? + """ + replica_result = client.execute_query(replica_query, [table_name, schema_name, partition_ident]) + else: + # Non-partitioned table query + replica_query = """ + SELECT number_of_replicas + FROM information_schema.tables + WHERE table_name = ? AND table_schema = ? 
+ """ + replica_result = client.execute_query(replica_query, [table_name, schema_name]) + + replica_rows = replica_result.get('rows', []) + if replica_rows: + current_replicas = replica_rows[0][0] + except Exception as e: + console.print(f"[yellow]Warning: Could not retrieve replica count for {schema_name}.{table_name}: {e}[/yellow]") + current_replicas = "unknown" + + table_replica_info.append(( + schema_name, table_name, partition_values, partition_ident, + problematic_replica_shards, max_translog_mb, total_primary_shards, total_replica_shards, + total_primary_size_gb, total_replica_size_gb, current_replicas + )) + + results_table.add_row( + schema_name, + table_name, + partition_display, + str(problematic_replica_shards), + f"{max_translog_mb:.1f}", + f"{total_primary_shards}P/{total_replica_shards}R", + f"{total_primary_size_gb:.1f}/{total_replica_size_gb:.1f}", + str(current_replicas) + ) + + console.print(results_table) + console.print() + console.print("[bold]Generated Comprehensive Shard Management Commands:[/bold]") + console.print() + + # 1. Stop automatic shard rebalancing + console.print("[bold cyan]1. Stop Automatic Shard Rebalancing:[/bold cyan]") + rebalance_disable_cmd = 'SET GLOBAL PERSISTENT "cluster.routing.rebalance.enable"=\'none\';' + console.print(rebalance_disable_cmd) + console.print() + + # 2. Generate REROUTE CANCEL SHARD commands for individual shards (unchanged) + console.print("[bold cyan]2. REROUTE CANCEL Commands (unchanged from original):[/bold cyan]") + reroute_commands = [] + for row in individual_shards: + schema_name, table_name, partition_values, shard_id, node_name, translog_mb = row + cmd = f'ALTER TABLE "{schema_name}"."{table_name}" REROUTE CANCEL SHARD {shard_id} on \'{node_name}\' WITH (allow_primary=False);' + reroute_commands.append(cmd) + console.print(cmd) + + if reroute_commands: + console.print() + + # 3. Generate ALTER commands to set replicas to 0 + console.print("[bold cyan]3. 
Set Replicas to 0:[/bold cyan]") + set_zero_commands = [] + valid_table_info = [] + + for info in table_replica_info: + schema_name, table_name, partition_values, partition_ident, problematic_replica_shards, max_translog_mb, total_primary_shards, total_replica_shards, total_primary_size_gb, total_replica_size_gb, current_replicas = info + + if current_replicas == "unknown": + console.print(f"[yellow]-- Skipping {schema_name}.{table_name} (unknown replica count)[/yellow]") + continue + + if current_replicas == 0: + console.print(f"[yellow]-- Skipping {schema_name}.{table_name} (already has 0 replicas)[/yellow]") + continue + + valid_table_info.append(info) + + # Build the ALTER command to set replicas to 0 + if partition_values and partition_values != 'NULL': + # Partitioned table commands + cmd_set_zero = f'ALTER TABLE "{schema_name}"."{table_name}" PARTITION {partition_values} SET ("number_of_replicas" = 0);' + else: + # Non-partitioned table commands + cmd_set_zero = f'ALTER TABLE "{schema_name}"."{table_name}" SET ("number_of_replicas" = 0);' + + set_zero_commands.append(cmd_set_zero) + console.print(cmd_set_zero) + + console.print() + + # 4. Generate retention lease queries for monitoring + console.print("[bold cyan]4. 
Retention Lease Monitoring Queries:[/bold cyan]") + retention_queries = [] + + for info in valid_table_info: + schema_name, table_name, partition_values, partition_ident, problematic_replica_shards, max_translog_mb, total_primary_shards, total_replica_shards, total_primary_size_gb, total_replica_size_gb, current_replicas = info + + if partition_values and partition_values != 'NULL': + # For partitioned tables, we need to resolve the partition_ident + # First, get all partition_idents for this table + partition_query = f"""SELECT array_length(retention_leases['leases'], 1) as cnt_leases, id +FROM sys.shards +WHERE table_name = '{table_name}' + AND schema_name = '{schema_name}' + AND partition_ident = '{partition_ident}' +ORDER BY array_length(retention_leases['leases'], 1);""" + else: + # For non-partitioned tables + partition_query = f"""SELECT array_length(retention_leases['leases'], 1) as cnt_leases, id +FROM sys.shards +WHERE table_name = '{table_name}' + AND schema_name = '{schema_name}' +ORDER BY array_length(retention_leases['leases'], 1);""" + + retention_queries.append(partition_query) + console.print(f"-- For {schema_name}.{table_name}:") + console.print(partition_query) + console.print() + + # 5. Generate ALTER commands to set replicas to 1 (or original value) + console.print("[bold cyan]5. 
Restore Replicas to Original Values:[/bold cyan]") + restore_commands = [] + + for info in valid_table_info: + schema_name, table_name, partition_values, partition_ident, problematic_replica_shards, max_translog_mb, total_primary_shards, total_replica_shards, total_primary_size_gb, total_replica_size_gb, current_replicas = info + + # Build the ALTER command to restore replicas + if partition_values and partition_values != 'NULL': + # Partitioned table commands + cmd_restore = f'ALTER TABLE "{schema_name}"."{table_name}" PARTITION {partition_values} SET ("number_of_replicas" = {current_replicas});' + else: + # Non-partitioned table commands + cmd_restore = f'ALTER TABLE "{schema_name}"."{table_name}" SET ("number_of_replicas" = {current_replicas});' + + restore_commands.append(cmd_restore) + console.print(cmd_restore) + + console.print() + + # 6. Re-enable automatic shard rebalancing + console.print("[bold cyan]6. Re-enable Automatic Shard Rebalancing:[/bold cyan]") + rebalance_enable_cmd = 'SET GLOBAL PERSISTENT "cluster.routing.rebalance.enable"=\'all\';' + console.print(rebalance_enable_cmd) + console.print() + + # Collect all commands for execution + all_commands = [rebalance_disable_cmd] + reroute_commands + set_zero_commands + restore_commands + [rebalance_enable_cmd] + + if not all_commands: + console.print("[yellow]No ALTER commands generated[/yellow]") + return + + console.print(f"[bold]Total Commands:[/bold]") + console.print(f" • 1 rebalancing disable command") + console.print(f" • {len(reroute_commands)} REROUTE CANCEL commands") + console.print(f" • {len(set_zero_commands)} set replicas to 0 commands") + console.print(f" • {len(retention_queries)} retention lease queries (for monitoring)") + console.print(f" • {len(restore_commands)} restore replicas commands") + console.print(f" • 1 rebalancing enable command") + + if execute and all_commands: + console.print() + console.print("[yellow]⚠️ WARNING: This will execute the complete shard management 
sequence![/yellow]") + console.print("[yellow]This includes disabling rebalancing, canceling problematic shards,") + console.print("setting replicas to 0, restoring replicas, and re-enabling rebalancing.[/yellow]") + console.print("[yellow]Retention lease queries will be displayed but not executed.[/yellow]") + console.print() + + if click.confirm("Execute all commands with individual confirmation for each?"): + console.print() + console.print("[bold blue]Executing comprehensive shard management sequence...[/bold blue]") + + executed = 0 + failed = 0 + cmd_num = 0 + + # 1. Execute rebalancing disable command + cmd_num += 1 + console.print(f"[bold]Step 1: Disable Rebalancing[/bold]") + console.print(f"[dim]Command {cmd_num}: {rebalance_disable_cmd}[/dim]") + if click.confirm(f"Execute rebalancing disable command?"): + try: + client.execute_query(rebalance_disable_cmd) + console.print(f"[green]✓ Command {cmd_num} executed successfully[/green]") + executed += 1 + except Exception as e: + console.print(f"[red]✗ Command {cmd_num} failed: {e}[/red]") + failed += 1 + else: + console.print(f"[yellow]Command {cmd_num} skipped[/yellow]") + console.print() + + # 2. Execute REROUTE CANCEL commands + if reroute_commands: + console.print(f"[bold]Step 2: Execute REROUTE CANCEL Commands[/bold]") + for cmd in reroute_commands: + cmd_num += 1 + console.print(f"[dim]Command {cmd_num}: {cmd}[/dim]") + if click.confirm(f"Execute this REROUTE CANCEL command?"): + try: + client.execute_query(cmd) + console.print(f"[green]✓ Command {cmd_num} executed successfully[/green]") + executed += 1 + except Exception as e: + console.print(f"[red]✗ Command {cmd_num} failed: {e}[/red]") + failed += 1 + else: + console.print(f"[yellow]Command {cmd_num} skipped[/yellow]") + console.print() + + # 3. 
Execute set replicas to 0 commands + if set_zero_commands: + console.print(f"[bold]Step 3: Set Replicas to 0[/bold]") + for cmd in set_zero_commands: + cmd_num += 1 + console.print(f"[dim]Command {cmd_num}: {cmd}[/dim]") + if click.confirm(f"Execute this SET replicas to 0 command?"): + try: + client.execute_query(cmd) + console.print(f"[green]✓ Command {cmd_num} executed successfully[/green]") + executed += 1 + except Exception as e: + console.print(f"[red]✗ Command {cmd_num} failed: {e}[/red]") + failed += 1 + else: + console.print(f"[yellow]Command {cmd_num} skipped[/yellow]") + console.print() + + # 4. Display retention lease queries (not executed) + if retention_queries: + console.print(f"[bold]Step 4: Retention Lease Monitoring Queries (for reference)[/bold]") + console.print("[dim]These queries are for monitoring purposes and will not be executed:[/dim]") + for i, query in enumerate(retention_queries, 1): + console.print(f"[dim]Query {i}:[/dim]") + console.print(f"[dim]{query}[/dim]") + console.print() + + # 5. Execute restore replicas commands + if restore_commands: + console.print(f"[bold]Step 5: Restore Replicas to Original Values[/bold]") + for cmd in restore_commands: + cmd_num += 1 + console.print(f"[dim]Command {cmd_num}: {cmd}[/dim]") + if click.confirm(f"Execute this RESTORE replicas command?"): + try: + client.execute_query(cmd) + console.print(f"[green]✓ Command {cmd_num} executed successfully[/green]") + executed += 1 + except Exception as e: + console.print(f"[red]✗ Command {cmd_num} failed: {e}[/red]") + failed += 1 + else: + console.print(f"[yellow]Command {cmd_num} skipped[/yellow]") + console.print() + + # 6. 
Execute rebalancing enable command + cmd_num += 1 + console.print(f"[bold]Step 6: Re-enable Rebalancing[/bold]") + console.print(f"[dim]Command {cmd_num}: {rebalance_enable_cmd}[/dim]") + if click.confirm(f"Execute rebalancing enable command?"): + try: + client.execute_query(rebalance_enable_cmd) + console.print(f"[green]✓ Command {cmd_num} executed successfully[/green]") + executed += 1 + except Exception as e: + console.print(f"[red]✗ Command {cmd_num} failed: {e}[/red]") + failed += 1 + else: + console.print(f"[yellow]Command {cmd_num} skipped[/yellow]") + console.print() + + console.print(f"[bold]Execution Summary:[/bold]") + console.print(f"[green]✓ Successful: {executed}[/green]") + if failed > 0: + console.print(f"[red]✗ Failed: {failed}[/red]") + else: + console.print("[yellow]Operation cancelled by user[/yellow]") + + except Exception as e: + console.print(f"[red]Error analyzing problematic translogs: {e}[/red]") + import traceback + console.print(f"[dim]{traceback.format_exc()}[/dim]") + + +@main.command() +@click.option('--translogsize', default=500, help='Minimum translog uncommitted size threshold in MB (default: 500)') +@click.option('--interval', default=60, help='Monitoring interval in seconds for watch mode (default: 60)') +@click.option('--watch', '-w', is_flag=True, help='Continuously monitor (refresh every interval)') +@click.option('--table', '-t', help='Monitor specific table only') +@click.option('--node', '-n', help='Monitor specific node only') +@click.option('--count', default=50, help='Maximum number of shards with large translogs to show (default: 50)') +@click.pass_context +def large_translogs(ctx, translogsize: int, interval: int, watch: bool, table: Optional[str], node: Optional[str], count: int): + """Monitor shards with large translog uncommitted sizes that do not flush + + This command identifies shards (both primary and replica) that have large + translog uncommitted sizes, indicating they are not flushing properly. 
+ Useful for monitoring translog growth and identifying problematic shards. + + Examples: + xmover large-translogs --translogsize 1000 # Shards with >1GB translog + xmover large-translogs --watch --interval 30 # Continuous monitoring every 30s + xmover large-translogs --table my_table --watch # Monitor specific table + xmover large-translogs --node data-hot-1 --count 20 # Top 20 on specific node + """ + client = ctx.obj['client'] + + def get_large_translog_shards(): + """Get shards with large translog uncommitted sizes""" + query = """ + SELECT + sh.schema_name, + sh.table_name, + translate(p.values::text, ':{}', '=()') as partition_values, + sh.id AS shard_id, + node['name'] AS node_name, + COALESCE(sh.translog_stats['uncommitted_size'] / 1024^2, 0) AS translog_uncommitted_mb, + sh.primary, + sh.size / 1024^2 AS shard_size_mb + FROM + sys.shards AS sh + LEFT JOIN information_schema.table_partitions p + ON sh.table_name = p.table_name + AND sh.schema_name = p.table_schema + AND sh.partition_ident = p.partition_ident + WHERE + sh.state = 'STARTED' + AND COALESCE(sh.translog_stats['uncommitted_size'], 0) > ? * 1024^2 + """ + + params = [translogsize] + + # Add table filter if specified + if table: + if '.' in table: + schema_name, table_name = table.split('.', 1) + query += " AND sh.schema_name = ? AND sh.table_name = ?" + params.extend([schema_name, table_name]) + else: + query += " AND sh.table_name = ?" + params.append(table) + + # Add node filter if specified + if node: + query += " AND node['name'] = ?" + params.append(node) + + query += """ + ORDER BY + COALESCE(sh.translog_stats['uncommitted_size'], 0) DESC + LIMIT ? 
+ """ + params.append(count) + + try: + result = client.execute_query(query, params) + return result.get('rows', []) + except Exception as e: + console.print(f"[red]Error querying shards with large translogs: {e}[/red]") + return [] + + def display_large_translog_shards(shards_data, show_header=True): + """Display the shards with large translogs in a table""" + if not shards_data: + threshold_display = f"{translogsize}MB" if translogsize < 1000 else f"{translogsize/1000:.1f}GB" + console.print(f"[green]✅ No shards found with translog uncommitted size over {threshold_display}[/green]") + return + + # Get current timestamp + import datetime + timestamp = datetime.datetime.now().strftime("%H:%M:%S") + + # Create condensed table + from rich.table import Table + results_table = Table(show_header=show_header, box=box.SIMPLE if watch else box.ROUNDED) + if show_header: + results_table.add_column("Schema.Table", style="cyan", max_width=50) + results_table.add_column("Partition", style="magenta", max_width=30) + results_table.add_column("Shard", justify="right", style="yellow", width=5) + results_table.add_column("Node", style="green", max_width=12) + results_table.add_column("TL MB", justify="right", style="red", width=6) + results_table.add_column("Type", justify="center", style="bright_white", width=4) + else: + results_table.add_column("", style="cyan", max_width=50) + results_table.add_column("", style="magenta", max_width=30) + results_table.add_column("", justify="right", style="yellow", width=5) + results_table.add_column("", style="green", max_width=12) + results_table.add_column("", justify="right", style="red", width=6) + results_table.add_column("", justify="center", style="bright_white", width=4) + + for row in shards_data: + schema_name, table_name, partition_values, shard_id, node_name, translog_mb, is_primary, shard_size_mb = row + + # Format table name + if schema_name and schema_name != 'doc': + table_display = f"{schema_name}.{table_name}" + else: + 
table_display = table_name + + # Format partition + if partition_values and partition_values != 'NULL': + partition_display = partition_values[:27] + "..." if len(partition_values) > 30 else partition_values + else: + partition_display = "-" + + primary_display = "P" if is_primary else "R" + + # Color code translog based on size + if translog_mb > 1000: + translog_color = "bright_red" + elif translog_mb > 500: + translog_color = "red" + elif translog_mb > 100: + translog_color = "yellow" + else: + translog_color = "green" + + results_table.add_row( + table_display, + partition_display, + str(shard_id), + node_name, + f"[{translog_color}]{translog_mb:.0f}[/{translog_color}]", + primary_display + ) + + # Show timestamp and summary + total_shards = len(shards_data) + primary_count = sum(1 for row in shards_data if row[6]) # is_primary is at index 6 + replica_count = total_shards - primary_count + avg_translog = sum(row[5] for row in shards_data) / total_shards if total_shards > 0 else 0 # translog_mb is at index 5 + + if show_header: + threshold_display = f"{translogsize}MB" if translogsize < 1000 else f"{translogsize/1000:.1f}GB" + console.print(f"[bold blue]Large Translogs (>{threshold_display}) - {timestamp}[/bold blue]") + else: + console.print(f"[dim]{timestamp}[/dim]") + + console.print(results_table) + console.print(f"[dim]{total_shards} shards ({primary_count}P/{replica_count}R) - Avg translog: {avg_translog:.0f}MB[/dim]") + + def run_single_analysis(): + """Run a single analysis cycle""" + if not watch: + console.print(Panel.fit("[bold blue]Large Translog Monitor[/bold blue]")) + + # Show configuration + threshold_display = f"{translogsize}MB" if translogsize < 1000 else f"{translogsize/1000:.1f}GB" + if watch: + config_parts = [f"{interval}s", f">{threshold_display}", f"top {count}"] + if table: + config_parts.append(f"table: {table}") + if node: + config_parts.append(f"node: {node}") + console.print(f"[dim]{' | '.join(config_parts)}[/dim]") + else: + 
config_info = [f"Threshold: >{threshold_display}"] + if count != 50: + config_info.append(f"Limit: {count}") + if table: + config_info.append(f"Table: {table}") + if node: + config_info.append(f"Node: {node}") + + console.print("[dim]" + " | ".join(config_info) + "[/dim]") + if not watch: + console.print() + + # Get shards with large translogs + shards_data = get_large_translog_shards() + + # Display results + display_large_translog_shards(shards_data, show_header=not watch) + + try: + if watch: + console.print("[dim]Press Ctrl+C to stop monitoring[/dim]") + console.print() + + while True: + run_single_analysis() + if watch: + console.print(f"\n[dim]━━━ Next update in {interval}s ━━━[/dim]\n") + time.sleep(interval) + else: + run_single_analysis() + + except KeyboardInterrupt: + console.print("\n[yellow]Monitoring stopped by user[/yellow]") + except Exception as e: + console.print(f"[red]Error during large translog monitoring: {e}[/red]") + import traceback + console.print(f"[dim]{traceback.format_exc()}[/dim]") + + +@main.command("deep-analyze") +@click.option('--rules-file', '-r', type=click.Path(exists=True), + help='Path to custom rules YAML file') +@click.option('--schema', '-s', help='Analyze specific schema only') +@click.option('--severity', type=click.Choice(['critical', 'warning', 'info']), + help='Show only violations of specified severity') +@click.option('--export-csv', type=click.Path(), + help='Export results to CSV file') +@click.option('--validate-rules', type=click.Path(exists=True), + help='Validate rules file and exit') +@click.pass_context +def deep_analyze(ctx, rules_file: Optional[str], schema: Optional[str], + severity: Optional[str], export_csv: Optional[str], + validate_rules: Optional[str]): + """Deep analysis of shard sizes with configurable optimization rules + + This command analyzes your CrateDB cluster's shard sizes, column counts, + and distribution patterns, then applies a comprehensive set of rules to + identify optimization 
opportunities and performance issues. + + Features: + - Cluster configuration analysis (nodes, CPU, memory, heap) + - Table and partition shard size analysis + - Configurable rule-based recommendations + - CSV export for spreadsheet analysis + - Custom rules file support + + Examples: + + # Run full analysis with default rules + xmover deep-analyze + + # Analyze specific schema only + xmover deep-analyze --schema myschema + + # Show only critical issues + xmover deep-analyze --severity critical + + # Export to spreadsheet + xmover deep-analyze --export-csv shard_analysis.csv + + # Use custom rules + xmover deep-analyze --rules-file custom_rules.yaml + + # Validate rules file + xmover deep-analyze --validate-rules custom_rules.yaml + """ + if validate_rules: + if validate_rules_file(validate_rules): + console.print(f"[green]✅ Rules file {validate_rules} is valid[/green]") + sys.exit(0) + else: + sys.exit(1) + + try: + client = ctx.obj['client'] + + # Initialize monitor with optional custom rules + monitor = ShardSizeMonitor(client, rules_file) + + console.print("[bold blue]🔍 XMover Deep Shard Size Analysis[/bold blue]") + console.print("Analyzing cluster configuration and shard distributions...\n") + + # Run analysis + report = monitor.analyze_cluster_shard_sizes(schema_filter=schema) + + # Display results + monitor.display_report(report, severity_filter=severity) + + # Export CSV if requested + if export_csv: + monitor.export_csv(report, export_csv) + console.print(f"\n[green]📊 Results exported to {export_csv}[/green]") + + # Summary footer + violation_counts = report.total_violations_by_severity + total_violations = sum(violation_counts.values()) + + if total_violations > 0: + console.print(f"\n[bold]Analysis completed:[/bold] {total_violations} optimization opportunities identified") + if violation_counts['critical'] > 0: + console.print("[red]⚠️ Critical issues require immediate attention[/red]") + else: + console.print("\n[bold green]🎉 Excellent! 
No optimization issues detected[/bold green]") + + except Exception as e: + console.print(f"[red]Error during deep shard size analysis: {e}[/red]") + import traceback + console.print(f"[dim]{traceback.format_exc()}[/dim]") + + if __name__ == '__main__': main() diff --git a/src/xmover/database.py b/src/xmover/database.py index 2a561be..2c78e46 100644 --- a/src/xmover/database.py +++ b/src/xmover/database.py @@ -62,6 +62,7 @@ class RecoveryInfo: """Information about an active shard recovery""" schema_name: str table_name: str + partition_values: Optional[str] # Partition values for partitioned tables shard_id: int node_name: str node_id: str @@ -76,6 +77,9 @@ class RecoveryInfo: size_bytes: int source_node_name: Optional[str] = None # Source node for PEER recoveries translog_size_bytes: int = 0 # Translog size in bytes + translog_uncommitted_bytes: int = 0 # Translog uncommitted size in bytes + max_seq_no: Optional[int] = None # Sequence number for this shard + primary_max_seq_no: Optional[int] = None # Primary shard's sequence number for replica progress @property def overall_progress(self) -> float: @@ -101,10 +105,91 @@ def translog_size_gb(self) -> float: """Translog size in GB""" return self.translog_size_bytes / (1024**3) + @property + def translog_uncommitted_gb(self) -> float: + """Translog uncommitted size in GB""" + return self.translog_uncommitted_bytes / (1024**3) + @property def translog_percentage(self) -> float: """Translog size as percentage of shard size""" return (self.translog_size_bytes / self.size_bytes * 100) if self.size_bytes > 0 else 0 + + @property + def translog_uncommitted_percentage(self) -> float: + """Translog uncommitted size as percentage of total translog size""" + return (self.translog_uncommitted_bytes / self.translog_size_bytes * 100) if self.translog_size_bytes > 0 else 0 + + @property + def seq_no_progress(self) -> Optional[float]: + """Calculate replica progress based on sequence numbers (for replica shards only)""" + if not 
self.is_primary and self.max_seq_no is not None and self.primary_max_seq_no is not None: + if self.primary_max_seq_no == 0: + return 100.0 # No operations on primary yet + return min((self.max_seq_no / self.primary_max_seq_no * 100.0), 100.0) + return None + + +@dataclass +class ActiveShardSnapshot: + """Snapshot of active shard checkpoint data for tracking activity""" + schema_name: str + table_name: str + shard_id: int + node_name: str + is_primary: bool + partition_ident: str + local_checkpoint: int + global_checkpoint: int + translog_uncommitted_bytes: int + timestamp: float # Unix timestamp when snapshot was taken + + @property + def checkpoint_delta(self) -> int: + """Current checkpoint delta (local - global)""" + return self.local_checkpoint - self.global_checkpoint + + @property + def translog_uncommitted_mb(self) -> float: + """Translog uncommitted size in MB""" + return self.translog_uncommitted_bytes / (1024 * 1024) + + @property + def shard_identifier(self) -> str: + """Unique identifier for this shard including partition""" + shard_type = "P" if self.is_primary else "R" + partition = f":{self.partition_ident}" if self.partition_ident else "" + return f"{self.schema_name}.{self.table_name}:{self.shard_id}:{self.node_name}:{shard_type}{partition}" + + +@dataclass +class ActiveShardActivity: + """Activity comparison between two snapshots of the same shard""" + schema_name: str + table_name: str + shard_id: int + node_name: str + is_primary: bool + partition_ident: str + local_checkpoint_delta: int # Change in local checkpoint between snapshots + snapshot1: ActiveShardSnapshot + snapshot2: ActiveShardSnapshot + time_diff_seconds: float + + @property + def activity_rate(self) -> float: + """Activity rate as checkpoint changes per second""" + if self.time_diff_seconds > 0: + return self.local_checkpoint_delta / self.time_diff_seconds + return 0.0 + + @property + def shard_type(self) -> str: + return "PRIMARY" if self.is_primary else "REPLICA" + + @property + 
def table_identifier(self) -> str: + return f"{self.schema_name}.{self.table_name}" class CrateDBClient: @@ -119,7 +204,20 @@ def __init__(self, connection_string: Optional[str] = None): self.username = os.getenv('CRATE_USERNAME') self.password = os.getenv('CRATE_PASSWORD') - self.ssl_verify = os.getenv('CRATE_SSL_VERIFY', 'true').lower() == 'true' + + # Auto-disable SSL verification for localhost connections + is_localhost = 'localhost' in self.connection_string or '127.0.0.1' in self.connection_string + ssl_verify_env = os.getenv('CRATE_SSL_VERIFY', 'true').lower() + + # Default to false for localhost, true for remote connections + if ssl_verify_env == 'auto': + self.ssl_verify = not is_localhost + else: + self.ssl_verify = ssl_verify_env == 'true' + + # For localhost, disable SSL verification by default unless explicitly enabled + if is_localhost and ssl_verify_env == 'true' and os.getenv('CRATE_SSL_VERIFY') is None: + self.ssl_verify = False # Suppress SSL warnings when SSL verification is disabled if not self.ssl_verify: @@ -139,9 +237,14 @@ def execute_query(self, query: str, parameters: Optional[List] = None) -> Dict[s if parameters: payload['args'] = parameters + # Handle authentication - only use auth if both username and password are provided + # For CrateDB, username without password should not use auth auth = None if self.username and self.password: auth = (self.username, self.password) + elif self.username and not self.password: + # For CrateDB 'crate' user without password, don't use auth + auth = None try: response = requests.post( @@ -153,6 +256,15 @@ def execute_query(self, query: str, parameters: Optional[List] = None) -> Dict[s ) response.raise_for_status() return response.json() + except requests.exceptions.SSLError as e: + # Provide helpful SSL error message for localhost connections + if 'localhost' in self.connection_string or '127.0.0.1' in self.connection_string: + raise Exception(f"SSL certificate error for localhost connection. 
" + f"Try setting CRATE_SSL_VERIFY=false in your .env file. Error: {e}") + else: + raise Exception(f"SSL error: {e}") + except requests.exceptions.ConnectionError as e: + raise Exception(f"Connection error - check if CrateDB is running and accessible: {e}") except requests.exceptions.RequestException as e: raise Exception(f"Failed to execute query: {e}") @@ -344,7 +456,9 @@ def test_connection(self) -> bool: try: result = self.execute_query("SELECT 1") return result.get('rowcount', 0) >= 0 - except Exception: + except Exception as e: + # Log the actual error for debugging + print(f"Connection test failed: {e}") return False def get_cluster_watermarks(self) -> Dict[str, Any]: @@ -420,6 +534,7 @@ def get_recovery_details(self, schema_name: str, table_name: str, shard_id: int) SELECT s.table_name, s.schema_name, + translate(p.values::text, ':{}', '=()') as partition_values, s.id as shard_id, s.node['name'] as node_name, s.node['id'] as node_id, @@ -428,8 +543,14 @@ def get_recovery_details(self, schema_name: str, table_name: str, shard_id: int) s.recovery, s.size, s."primary", - s.translog_stats['size'] as translog_size + s.translog_stats['size'] as translog_size, + s.translog_stats['uncommitted_size'] as translog_uncommitted_size, + s.seq_no_stats['max_seq_no'] as max_seq_no FROM sys.shards s + LEFT JOIN information_schema.table_partitions p + ON s.table_name = p.table_name + AND s.schema_name = p.table_schema + AND s.partition_ident = p.partition_ident WHERE s.table_name = ? AND s.id = ? 
AND (s.state = 'RECOVERING' OR s.routing_state IN ('INITIALIZING', 'RELOCATING')) ORDER BY s.schema_name @@ -445,17 +566,42 @@ def get_recovery_details(self, schema_name: str, table_name: str, shard_id: int) return { 'table_name': row[0], 'schema_name': row[1], - 'shard_id': row[2], - 'node_name': row[3], - 'node_id': row[4], - 'routing_state': row[5], - 'state': row[6], - 'recovery': row[7], - 'size': row[8], - 'primary': row[9], - 'translog_size': row[10] or 0 + 'partition_values': row[2], + 'shard_id': row[3], + 'node_name': row[4], + 'node_id': row[5], + 'routing_state': row[6], + 'state': row[7], + 'recovery': row[8], + 'size': row[9], + 'primary': row[10], + 'translog_size': row[11] or 0, + 'translog_uncommitted_size': row[12] or 0, + 'max_seq_no': row[13] } + def _get_primary_max_seq_no(self, schema_name: str, table_name: str, shard_id: int) -> Optional[int]: + """Get the max_seq_no of the primary shard for replica progress comparison""" + try: + query = """ + SELECT s.seq_no_stats['max_seq_no'] as primary_max_seq_no + FROM sys.shards s + WHERE s.schema_name = ? AND s.table_name = ? AND s.id = ? 
+ AND s."primary" = true + AND s.state = 'STARTED' + LIMIT 1 + """ + + result = self.execute_query(query, [schema_name, table_name, shard_id]) + + if result.get('rows'): + return result['rows'][0][0] + return None + + except Exception: + # If query fails, return None + return None + def get_all_recovering_shards(self, table_name: Optional[str] = None, node_name: Optional[str] = None, include_transitioning: bool = False) -> List[RecoveryInfo]: @@ -482,6 +628,37 @@ def get_all_recovering_shards(self, table_name: Optional[str] = None, allocation['schema_name'] = recovery_detail['schema_name'] recovery_info = self._parse_recovery_info(allocation, recovery_detail) + # For replica recoveries, get primary sequence number for progress tracking + if not recovery_info.is_primary and recovery_info.recovery_type == 'PEER': + primary_seq_no = self._get_primary_max_seq_no( + recovery_detail['schema_name'], + recovery_detail['table_name'], + recovery_detail['shard_id'] + ) + # Create updated recovery info with primary sequence number + recovery_info = RecoveryInfo( + schema_name=recovery_info.schema_name, + table_name=recovery_info.table_name, + partition_values=recovery_info.partition_values, + shard_id=recovery_info.shard_id, + node_name=recovery_info.node_name, + node_id=recovery_info.node_id, + recovery_type=recovery_info.recovery_type, + stage=recovery_info.stage, + files_percent=recovery_info.files_percent, + bytes_percent=recovery_info.bytes_percent, + total_time_ms=recovery_info.total_time_ms, + routing_state=recovery_info.routing_state, + current_state=recovery_info.current_state, + is_primary=recovery_info.is_primary, + size_bytes=recovery_info.size_bytes, + source_node_name=recovery_info.source_node_name, + translog_size_bytes=recovery_info.translog_size_bytes, + translog_uncommitted_bytes=recovery_info.translog_uncommitted_bytes, + max_seq_no=recovery_info.max_seq_no, + primary_max_seq_no=primary_seq_no + ) + # Filter out completed recoveries unless 
include_transitioning is True if include_transitioning or not self._is_recovery_completed(recovery_info): recoveries.append(recovery_info) @@ -529,6 +706,7 @@ def _parse_recovery_info(self, allocation: Dict[str, Any], return RecoveryInfo( schema_name=shard_detail['schema_name'], table_name=shard_detail['table_name'], + partition_values=shard_detail.get('partition_values'), shard_id=shard_detail['shard_id'], node_name=shard_detail['node_name'], node_id=shard_detail['node_id'], @@ -542,7 +720,10 @@ def _parse_recovery_info(self, allocation: Dict[str, Any], is_primary=shard_detail['primary'], size_bytes=shard_detail.get('size', 0), source_node_name=source_node, - translog_size_bytes=shard_detail.get('translog_size', 0) + translog_size_bytes=shard_detail.get('translog_size', 0), + translog_uncommitted_bytes=shard_detail.get('translog_uncommitted_size', 0), + max_seq_no=shard_detail.get('max_seq_no'), + primary_max_seq_no=None # Will be populated later for replicas ) def _find_source_node_for_recovery(self, schema_name: str, table_name: str, shard_id: int, target_node_id: str) -> Optional[str]: @@ -587,4 +768,120 @@ def _is_recovery_completed(self, recovery_info: RecoveryInfo) -> bool: """Check if a recovery is completed but still transitioning""" return (recovery_info.stage == 'DONE' and recovery_info.files_percent >= 100.0 and - recovery_info.bytes_percent >= 100.0) \ No newline at end of file + recovery_info.bytes_percent >= 100.0) + + def get_problematic_shards(self, table_name: Optional[str] = None, + node_name: Optional[str] = None) -> List[Dict[str, Any]]: + """Get shards that need attention but aren't actively recovering""" + + where_conditions = ["s.state != 'STARTED'"] + parameters = [] + + if table_name: + where_conditions.append("s.table_name = ?") + parameters.append(table_name) + + if node_name: + where_conditions.append("s.node['name'] = ?") + parameters.append(node_name) + + where_clause = f"WHERE {' AND '.join(where_conditions)}" + + query = f""" + 
SELECT + s.schema_name, + s.table_name, + translate(p.values::text, ':{{}}', '=()') as partition_values, + s.id as shard_id, + s.state, + s.routing_state, + s.node['name'] as node_name, + s.node['id'] as node_id, + s."primary" + FROM sys.shards s + LEFT JOIN information_schema.table_partitions p + ON s.table_name = p.table_name + AND s.schema_name = p.table_schema + AND s.partition_ident = p.partition_ident + {where_clause} + ORDER BY s.state, s.table_name, s.id + """ + + result = self.execute_query(query, parameters) + + problematic_shards = [] + for row in result.get('rows', []): + problematic_shards.append({ + 'schema_name': row[0] or 'doc', + 'table_name': row[1], + 'partition_values': row[2], + 'shard_id': row[3], + 'state': row[4], + 'routing_state': row[5], + 'node_name': row[6], + 'node_id': row[7], + 'primary': row[8] + }) + + return problematic_shards + + def get_active_shards_snapshot(self, min_checkpoint_delta: int = 1000) -> List[ActiveShardSnapshot]: + """Get a snapshot of all started shards for activity monitoring + + Note: This captures ALL started shards regardless of current activity level. + The min_checkpoint_delta parameter is kept for backwards compatibility but + filtering is now done during snapshot comparison to catch shards that + become active between observations. 
+ + Args: + min_checkpoint_delta: Kept for compatibility - filtering now done in comparison + + Returns: + List of ActiveShardSnapshot objects for all started shards + """ + import time + + query = """ + SELECT + sh.schema_name, + sh.table_name, + sh.id AS shard_id, + sh."primary", + node['name'] as node_name, + sh.partition_ident, + sh.translog_stats['uncommitted_size'] AS translog_uncommitted_bytes, + sh.seq_no_stats['local_checkpoint'] AS local_checkpoint, + sh.seq_no_stats['global_checkpoint'] AS global_checkpoint + FROM + sys.shards AS sh + WHERE + sh.state = 'STARTED' + ORDER BY + sh.schema_name, sh.table_name, sh.id, sh.node['name'] + """ + + try: + result = self.execute_query(query) + snapshots = [] + current_time = time.time() + + for row in result.get('rows', []): + snapshot = ActiveShardSnapshot( + schema_name=row[0], + table_name=row[1], + shard_id=row[2], + is_primary=row[3], + node_name=row[4], + partition_ident=row[5] or '', + translog_uncommitted_bytes=row[6] or 0, + local_checkpoint=row[7] or 0, + global_checkpoint=row[8] or 0, + timestamp=current_time + ) + snapshots.append(snapshot) + + return snapshots + + except Exception as e: + print(f"Error getting active shards snapshot: {e}") + return [] \ No newline at end of file diff --git a/src/xmover/shard_size_monitor.py b/src/xmover/shard_size_monitor.py new file mode 100644 index 0000000..fa00ebc --- /dev/null +++ b/src/xmover/shard_size_monitor.py @@ -0,0 +1,689 @@ +""" +XMover Shard Size Monitor + +A comprehensive tool for analyzing CrateDB shard sizes and generating optimization recommendations +based on configurable rules. This module can be used standalone or integrated with other tools. 
+""" + +import csv +import os +import sys +import yaml +from dataclasses import dataclass, field +from datetime import datetime +from pathlib import Path +from typing import Dict, List, Optional, Any, Union +from rich.console import Console +from rich.table import Table +from rich.panel import Panel +from rich.text import Text + + +@dataclass +class ShardSizeRule: + """Represents a single shard size analysis rule.""" + name: str + category: str + severity: str # 'critical', 'warning', 'info' + condition: str # Python expression to evaluate + recommendation: str # Template string with variables + action_hint: Optional[str] = None + + +@dataclass +class RuleViolation: + """Represents a violated rule with context.""" + rule_name: str + category: str + severity: str + recommendation: str + action_hint: Optional[str] + table_identifier: str # schema.table[partition] + + +@dataclass +class ShardAnalysisResult: + """Analysis results for a single table/partition.""" + # Raw data from query + table_schema: str + table_name: str + partition_ident: Optional[str] + total_primary_size_gb: float + avg_shard_size_gb: float + min_shard_size_gb: float + max_shard_size_gb: float + num_shards_primary: int + num_shards_replica: int + num_shards_total: int + num_columns: int + partitioned_by: Optional[str] + clustered_by: Optional[str] + total_documents: int + + # Analysis results + violations: List[RuleViolation] = field(default_factory=list) + + @property + def table_identifier(self) -> str: + """Get human-readable table identifier.""" + base = f"{self.table_schema}.{self.table_name}" + if self.partition_ident and self.partition_ident != '': + return f"{base}[{self.partition_ident}]" + return base + + @property + def has_critical_violations(self) -> bool: + """Check if there are any critical violations.""" + return any(v.severity == 'critical' for v in self.violations) + + @property + def has_warnings(self) -> bool: + """Check if there are any warning violations.""" + return 
any(v.severity == 'warning' for v in self.violations) + + +@dataclass +class ClusterConfiguration: + """Cluster-level configuration and metrics.""" + total_nodes: int + total_cpu_cores: int + total_memory_gb: float + total_heap_gb: float + max_shards_per_node_setting: int + actual_max_shards_per_node: int + total_shards: int + disk_watermark_low: Optional[float] = None + disk_watermark_high: Optional[float] = None + disk_watermark_flood_stage: Optional[float] = None + + +@dataclass +class MonitoringReport: + """Complete analysis report.""" + timestamp: datetime + cluster_config: ClusterConfiguration + table_results: List[ShardAnalysisResult] + cluster_violations: List[RuleViolation] + + @property + def total_violations_by_severity(self) -> Dict[str, int]: + """Count violations by severity level.""" + counts = {'critical': 0, 'warning': 0, 'info': 0} + + # Count table-level violations + for result in self.table_results: + for violation in result.violations: + counts[violation.severity] += 1 + + # Count cluster-level violations + for violation in self.cluster_violations: + counts[violation.severity] += 1 + + return counts + + +class RulesConfigValidator: + """Validates rules configuration files.""" + + @staticmethod + def validate_config(config: Dict[str, Any]) -> List[str]: + """Validate rules configuration and return list of errors.""" + errors = [] + + # Check required top-level fields + required_fields = ['metadata', 'thresholds', 'rules'] + for field in required_fields: + if field not in config: + errors.append(f"Missing required field: {field}") + + if 'validation' in config and 'rule_required_fields' in config['validation']: + rule_required_fields = config['validation']['rule_required_fields'] + else: + rule_required_fields = ['name', 'category', 'severity', 'condition', 'recommendation'] + + # Validate individual rules + if 'rules' in config: + for i, rule in enumerate(config['rules']): + for field in rule_required_fields: + if field not in rule: + 
errors.append(f"Rule {i}: Missing required field '{field}'") + + # Validate severity + if 'severity' in rule: + valid_severities = config.get('validation', {}).get('valid_severities', + ['critical', 'warning', 'info']) + if rule['severity'] not in valid_severities: + errors.append(f"Rule {i} ({rule.get('name', 'unnamed')}): " + f"Invalid severity '{rule['severity']}'") + + # Try to compile condition as Python expression + if 'condition' in rule: + try: + compile(rule['condition'], '', 'eval') + except SyntaxError as e: + errors.append(f"Rule {i} ({rule.get('name', 'unnamed')}): " + f"Invalid condition syntax: {e}") + + # Validate cluster rules if present + if 'cluster_rules' in config: + for i, rule in enumerate(config['cluster_rules']): + for field in rule_required_fields: + if field not in rule: + errors.append(f"Cluster rule {i}: Missing required field '{field}'") + + return errors + + +class ShardSizeMonitor: + """Main shard size monitoring and analysis class.""" + + SHARD_ANALYSIS_QUERY = """ + WITH columns AS ( + SELECT table_schema, + table_name, + COUNT(*) AS num_columns + FROM information_schema.columns + GROUP BY ALL + ), tables AS ( + SELECT table_schema, + table_name, + partitioned_by, + clustered_by + FROM information_schema.tables + ), shards AS ( + SELECT schema_name AS table_schema, + table_name, + partition_ident, + SUM(size) FILTER (WHERE primary = TRUE) / POWER(1024, 3) AS total_primary_size_gb, + AVG(size) / POWER(1024, 3) AS avg_shard_size_gb, + MIN(size) / POWER(1024, 3) AS min_shard_size_gb, + MAX(size) / POWER(1024, 3) AS max_shard_size_gb, + COUNT(*) FILTER (WHERE primary = TRUE) AS num_shards_primary, + COUNT(*) FILTER (WHERE primary = FALSE) AS num_shards_replica, + COUNT(*) AS num_shards_total, + SUM(num_docs) AS total_documents + FROM sys.shards + GROUP BY ALL + ) + SELECT s.*, + num_columns, + partitioned_by[1] AS partitioned_by, + clustered_by + FROM shards s + JOIN columns c ON s.table_name = c.table_name AND s.table_schema = 
c.table_schema + JOIN tables t ON s.table_name = t.table_name AND s.table_schema = t.table_schema + ORDER BY table_schema, table_name, partition_ident + """ + + def __init__(self, db_client, rules_config_path: Optional[str] = None): + """Initialize monitor with database client and rules configuration.""" + self.db_client = db_client + self.console = Console() + + # Load rules configuration + if rules_config_path is None: + # Use default rules file + current_dir = Path(__file__).parent.parent.parent + rules_config_path = current_dir / "config" / "shard_size_rules.yaml" + + self.rules_config = self._load_rules_config(rules_config_path) + self.thresholds = self.rules_config.get('thresholds', {}) + self.table_rules = [ShardSizeRule(**rule) for rule in self.rules_config.get('rules', [])] + self.cluster_rules = [ShardSizeRule(**rule) for rule in self.rules_config.get('cluster_rules', [])] + + def _load_rules_config(self, config_path: Union[str, Path]) -> Dict[str, Any]: + """Load and validate rules configuration from YAML file.""" + try: + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + + # Validate configuration + validator = RulesConfigValidator() + errors = validator.validate_config(config) + + if errors: + self.console.print("[red]Configuration validation errors:[/red]") + for error in errors: + self.console.print(f" • {error}") + sys.exit(1) + + return config + + except FileNotFoundError: + self.console.print(f"[red]Rules configuration file not found: {config_path}[/red]") + sys.exit(1) + except yaml.YAMLError as e: + self.console.print(f"[red]Error parsing YAML configuration: {e}[/red]") + sys.exit(1) + + def analyze_cluster_shard_sizes(self, schema_filter: Optional[str] = None) -> MonitoringReport: + """Run complete shard size analysis.""" + self.console.print("🔍 Gathering cluster configuration...") + cluster_config = self._gather_cluster_config() + + self.console.print("📊 Analyzing shard sizes and table schemas...") + table_results = 
self._analyze_table_shards(cluster_config, schema_filter) + + self.console.print("✅ Applying analysis rules...") + cluster_violations = self._evaluate_cluster_rules(cluster_config, table_results) + + return MonitoringReport( + timestamp=datetime.now(), + cluster_config=cluster_config, + table_results=table_results, + cluster_violations=cluster_violations + ) + + def _gather_cluster_config(self) -> ClusterConfiguration: + """Gather cluster-level configuration and metrics.""" + # Get cluster nodes info + nodes_query = """ + SELECT + COUNT(*) as total_nodes, + SUM(os_info['available_processors']) as total_cpu_cores, + SUM(mem['used'] + mem['free']) / POWER(1024, 3) as total_memory_gb, + SUM(heap['max']) / POWER(1024, 3) as total_heap_gb + FROM sys.nodes + WHERE name IS NOT NULL + """ + nodes_result = self.db_client.execute_query(nodes_query) + nodes_data = nodes_result.get('rows', [])[0] + + # Get cluster settings - use default if sys.cluster is not accessible + max_shards_setting = 1000 # CrateDB default + + try: + settings_query = """ + SELECT settings['cluster']['max_shards_per_node'] as max_shards_per_node + FROM sys.cluster + """ + settings_result = self.db_client.execute_query(settings_query) + + rows = settings_result.get('rows', []) + if rows and rows[0][0] is not None: + max_shards_setting = int(rows[0][0]) + except Exception as e: + # sys.cluster might not be accessible in CrateDB Cloud + self.console.print(f"[yellow]Warning: Could not access cluster settings, using default max_shards_per_node=1000[/yellow]") + + # Get total shard count and max shards per node + shards_query = """ + SELECT + COUNT(*) as total_shards + FROM sys.shards + """ + shards_result = self.db_client.execute_query(shards_query) + shards_data = shards_result.get('rows', [])[0] + + # Get actual max shards per node (current distribution) + try: + max_shards_query = """ + SELECT node['name'], COUNT(*) as shard_count + FROM sys.shards + GROUP BY node['name'] + ORDER BY shard_count DESC + 
LIMIT 1 + """ + max_shards_result = self.db_client.execute_query(max_shards_query) + max_shards_rows = max_shards_result.get('rows', []) + actual_max_shards_per_node = max_shards_rows[0][1] if max_shards_rows else 0 + except Exception as e: + # Calculate approximate value: total_shards / total_nodes + actual_max_shards_per_node = int(shards_data[0] / max(nodes_data[0], 1)) + self.console.print(f"[dim]Using approximate max shards per node: {actual_max_shards_per_node}[/dim]") + + return ClusterConfiguration( + total_nodes=nodes_data[0], + total_cpu_cores=nodes_data[1] or 0, + total_memory_gb=nodes_data[2] or 0.0, + total_heap_gb=nodes_data[3] or 0.0, + max_shards_per_node_setting=max_shards_setting, + actual_max_shards_per_node=actual_max_shards_per_node, + total_shards=shards_data[0] + ) + + def _analyze_table_shards(self, cluster_config: ClusterConfiguration, + schema_filter: Optional[str] = None) -> List[ShardAnalysisResult]: + """Analyze individual table shard configurations.""" + query = self.SHARD_ANALYSIS_QUERY + + if schema_filter: + # Add WHERE clause for schema filtering + query = query.replace( + "ORDER BY table_schema", + f"WHERE s.table_schema = '{schema_filter}' ORDER BY table_schema" + ) + + results = self.db_client.execute_query(query) + + table_results = [] + for row in results.get('rows', []): + # Parse query results + analysis_result = ShardAnalysisResult( + table_schema=row[0], + table_name=row[1], + partition_ident=row[2], + total_primary_size_gb=float(row[3] or 0), + avg_shard_size_gb=float(row[4] or 0), + min_shard_size_gb=float(row[5] or 0), + max_shard_size_gb=float(row[6] or 0), + num_shards_primary=int(row[7] or 0), + num_shards_replica=int(row[8] or 0), + num_shards_total=int(row[9] or 0), + total_documents=int(row[10] or 0), + num_columns=int(row[11] or 0), + partitioned_by=row[12], + clustered_by=row[13] + ) + + # Evaluate rules for this table + analysis_result.violations = self._evaluate_table_rules(analysis_result, cluster_config) + 
table_results.append(analysis_result) + + return table_results + + def _evaluate_table_rules(self, result: ShardAnalysisResult, + cluster_config: ClusterConfiguration) -> List[RuleViolation]: + """Evaluate table-level rules against a single table/partition.""" + violations = [] + + # Prepare evaluation context + context = { + # Table data + 'table_schema': result.table_schema, + 'table_name': result.table_name, + 'partition_ident': result.partition_ident, + 'total_primary_size_gb': result.total_primary_size_gb, + 'avg_shard_size_gb': result.avg_shard_size_gb, + 'min_shard_size_gb': result.min_shard_size_gb, + 'max_shard_size_gb': result.max_shard_size_gb, + 'num_shards_primary': result.num_shards_primary, + 'num_shards_replica': result.num_shards_replica, + 'num_shards_total': result.num_shards_total, + 'num_columns': result.num_columns, + 'partitioned_by': result.partitioned_by, + 'clustered_by': result.clustered_by, + + # Cluster context + 'cluster_config': { + 'total_nodes': cluster_config.total_nodes, + 'total_cpu_cores': cluster_config.total_cpu_cores, + 'total_memory_gb': cluster_config.total_memory_gb, + 'total_heap_gb': cluster_config.total_heap_gb, + 'max_shards_per_node': cluster_config.max_shards_per_node_setting, + 'total_shards': cluster_config.total_shards + }, + + # Thresholds + 'thresholds': self.thresholds + } + + # Evaluate each rule + for rule in self.table_rules: + try: + if eval(rule.condition, {"__builtins__": {}}, context): + # Create formatting context with flattened values + format_context = { + **context, + **self.thresholds, + 'ratio': context['max_shard_size_gb'] / context['min_shard_size_gb'] if context['min_shard_size_gb'] > 0 else 0 + } + # Add flattened cluster_config values for easier formatting + for key, value in context['cluster_config'].items(): + format_context[f'cluster_config[{key}]'] = value + + recommendation = rule.recommendation.format(**format_context) + + violations.append(RuleViolation( + rule_name=rule.name, + 
category=rule.category, + severity=rule.severity, + recommendation=recommendation, + action_hint=rule.action_hint, + table_identifier=result.table_identifier + )) + except Exception as e: + self.console.print(f"[yellow]Warning: Error evaluating rule '{rule.name}': {e}[/yellow]") + + return violations + + def _evaluate_cluster_rules(self, cluster_config: ClusterConfiguration, + table_results: List[ShardAnalysisResult]) -> List[RuleViolation]: + """Evaluate cluster-level rules.""" + violations = [] + + # Prepare cluster-level context + context = { + 'cluster_config': { + 'total_nodes': cluster_config.total_nodes, + 'total_cpu_cores': cluster_config.total_cpu_cores, + 'total_memory_gb': cluster_config.total_memory_gb, + 'total_heap_gb': cluster_config.total_heap_gb, + 'max_shards_per_node': cluster_config.actual_max_shards_per_node, + 'total_shards': cluster_config.total_shards + }, + 'thresholds': self.thresholds, + 'total_shards': cluster_config.total_shards, + 'total_heap_gb': cluster_config.total_heap_gb, + 'max_shards_per_node': cluster_config.actual_max_shards_per_node, + 'total_cpu_cores': cluster_config.total_cpu_cores + } + + # Evaluate each cluster rule + for rule in self.cluster_rules: + try: + if eval(rule.condition, {"__builtins__": {}}, context): + # Create formatting context with flattened values + format_context = { + **context, + **self.thresholds + } + # Add flattened cluster_config values for easier formatting + for key, value in context['cluster_config'].items(): + format_context[f'cluster_config[{key}]'] = value + + recommendation = rule.recommendation.format(**format_context) + + violations.append(RuleViolation( + rule_name=rule.name, + category=rule.category, + severity=rule.severity, + recommendation=recommendation, + action_hint=rule.action_hint, + table_identifier="[CLUSTER]" + )) + except Exception as e: + self.console.print(f"[yellow]Warning: Error evaluating cluster rule '{rule.name}': {e}[/yellow]") + + return violations + + def 
display_report(self, report: MonitoringReport, severity_filter: Optional[str] = None): + """Display analysis report to console.""" + # Header with cluster info + self.console.print(Panel( + f"[bold blue]CrateDB Shard Size Analysis Report[/bold blue]\n" + f"[dim]Generated: {report.timestamp.strftime('%Y-%m-%d %H:%M:%S')}[/dim]\n\n" + f"[bold]Cluster Overview:[/bold]\n" + f"• Nodes: {report.cluster_config.total_nodes}\n" + f"• Total Shards: {report.cluster_config.total_shards}\n" + f"• CPU Cores: {report.cluster_config.total_cpu_cores}\n" + f"• Heap Memory: {report.cluster_config.total_heap_gb:.1f}GB\n" + f"• Max Shards/Node: {report.cluster_config.actual_max_shards_per_node} " + f"(limit: {report.cluster_config.max_shards_per_node_setting})", + expand=False + )) + + # Summary of violations + violation_counts = report.total_violations_by_severity + if any(violation_counts.values()): + summary_text = Text() + if violation_counts['critical'] > 0: + summary_text.append(f"🔴 {violation_counts['critical']} Critical ", style="bold red") + if violation_counts['warning'] > 0: + summary_text.append(f"🟡 {violation_counts['warning']} Warning ", style="bold yellow") + if violation_counts['info'] > 0: + summary_text.append(f"🔵 {violation_counts['info']} Info", style="bold blue") + + self.console.print(Panel(summary_text, title="Issue Summary")) + else: + self.console.print(Panel("✅ No issues found", style="green")) + return + + # Cluster-level violations + cluster_violations = [v for v in report.cluster_violations + if not severity_filter or v.severity == severity_filter] + if cluster_violations: + self.console.print("\n[bold]🏢 Cluster-Level Issues:[/bold]") + for violation in cluster_violations: + severity_color = {'critical': 'red', 'warning': 'yellow', 'info': 'blue'}[violation.severity] + self.console.print(f"[{severity_color}]• [{violation.severity.upper()}] {violation.recommendation}[/{severity_color}]") + if violation.action_hint: + self.console.print(f" 💡 
{violation.action_hint}") + + # Table-level violations + tables_with_violations = [r for r in report.table_results if r.violations] + if severity_filter: + tables_with_violations = [r for r in tables_with_violations + if any(v.severity == severity_filter for v in r.violations)] + + if tables_with_violations: + self.console.print(f"\n[bold]📊 Table/Partition Issues ({len(tables_with_violations)} affected):[/bold]") + + for result in tables_with_violations: + violations_to_show = [v for v in result.violations + if not severity_filter or v.severity == severity_filter] + + if not violations_to_show: + continue + + # Table header with key metrics + table_info = (f"{result.table_identifier} " + f"({result.num_shards_primary}s/{result.num_shards_replica}r, " + f"{result.max_shard_size_gb:.1f}GB max, " + f"avg {result.avg_shard_size_gb:.1f}GB, " + f"{result.total_documents:,} docs, " + f"{result.num_columns} cols)") + + self.console.print(f"\n[bold cyan]{table_info}[/bold cyan]") + + for violation in violations_to_show: + severity_color = {'critical': 'red', 'warning': 'yellow', 'info': 'blue'}[violation.severity] + self.console.print(f" [{severity_color}]• [{violation.severity.upper()}] {violation.recommendation}[/{severity_color}]") + if violation.action_hint: + self.console.print(f" 💡 {violation.action_hint}") + + def export_csv(self, report: MonitoringReport, filename: str): + """Export analysis results to CSV file.""" + with open(filename, 'w', newline='') as csvfile: + fieldnames = [ + 'timestamp', 'violation_level', 'table_schema', 'table_name', 'partition_ident', + 'severity', 'category', 'rule_name', 'recommendation', 'action_hint', + 'total_primary_size_gb', 'avg_shard_size_gb', 'min_shard_size_gb', 'max_shard_size_gb', + 'num_shards_primary', 'num_shards_replica', 'num_shards_total', 'num_columns', 'total_documents' + ] + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + + # Write cluster-level violations + for violation in 
report.cluster_violations: + writer.writerow({ + 'timestamp': report.timestamp.isoformat(), + 'violation_level': 'cluster', + 'table_schema': None, + 'table_name': None, + 'partition_ident': None, + 'severity': violation.severity, + 'category': violation.category, + 'rule_name': violation.rule_name, + 'recommendation': violation.recommendation, + 'action_hint': violation.action_hint, + 'total_primary_size_gb': None, + 'avg_shard_size_gb': None, + 'min_shard_size_gb': None, + 'max_shard_size_gb': None, + 'num_shards_primary': None, + 'num_shards_replica': None, + 'num_shards_total': None, + 'num_columns': None, + 'total_documents': None + }) + + # Write table-level violations + for result in report.table_results: + if result.violations: + for violation in result.violations: + writer.writerow({ + 'timestamp': report.timestamp.isoformat(), + 'violation_level': 'table', + 'table_schema': result.table_schema, + 'table_name': result.table_name, + 'partition_ident': result.partition_ident, + 'severity': violation.severity, + 'category': violation.category, + 'rule_name': violation.rule_name, + 'recommendation': violation.recommendation, + 'action_hint': violation.action_hint, + 'total_primary_size_gb': result.total_primary_size_gb, + 'avg_shard_size_gb': result.avg_shard_size_gb, + 'min_shard_size_gb': result.min_shard_size_gb, + 'max_shard_size_gb': result.max_shard_size_gb, + 'num_shards_primary': result.num_shards_primary, + 'num_shards_replica': result.num_shards_replica, + 'num_shards_total': result.num_shards_total, + 'num_columns': result.num_columns, + 'total_documents': result.total_documents + }) + else: + # Include tables without violations for complete dataset + writer.writerow({ + 'timestamp': report.timestamp.isoformat(), + 'violation_level': 'table', + 'table_schema': result.table_schema, + 'table_name': result.table_name, + 'partition_ident': result.partition_ident, + 'severity': None, + 'category': None, + 'rule_name': None, + 'recommendation': None, + 
'action_hint': None, + 'total_primary_size_gb': result.total_primary_size_gb, + 'avg_shard_size_gb': result.avg_shard_size_gb, + 'min_shard_size_gb': result.min_shard_size_gb, + 'max_shard_size_gb': result.max_shard_size_gb, + 'num_shards_primary': result.num_shards_primary, + 'num_shards_replica': result.num_shards_replica, + 'num_shards_total': result.num_shards_total, + 'num_columns': result.num_columns, + 'total_documents': result.total_documents + }) + + +def validate_rules_file(config_path: str) -> bool: + """Standalone function to validate a rules configuration file.""" + console = Console() + + try: + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + + validator = RulesConfigValidator() + errors = validator.validate_config(config) + + if errors: + console.print(f"[red]❌ Validation failed for {config_path}:[/red]") + for error in errors: + console.print(f" • {error}") + return False + else: + console.print(f"[green]✅ Configuration file {config_path} is valid[/green]") + return True + + except FileNotFoundError: + console.print(f"[red]❌ File not found: {config_path}[/red]") + return False + except yaml.YAMLError as e: + console.print(f"[red]❌ YAML parsing error: {e}[/red]") + return False + except Exception as e: + console.print(f"[red]❌ Unexpected error: {e}[/red]") + return False diff --git a/tests/test_active_shard_monitor.py b/tests/test_active_shard_monitor.py new file mode 100644 index 0000000..3fb6613 --- /dev/null +++ b/tests/test_active_shard_monitor.py @@ -0,0 +1,461 @@ +""" +Tests for ActiveShardMonitor functionality +""" + +import pytest +import time +from unittest.mock import Mock, patch +from xmover.database import CrateDBClient, ActiveShardSnapshot, ActiveShardActivity +from xmover.analyzer import ActiveShardMonitor + + +class TestActiveShardSnapshot: + """Test ActiveShardSnapshot dataclass""" + + def test_checkpoint_delta(self): + """Test checkpoint delta calculation""" + snapshot = ActiveShardSnapshot( + 
schema_name="test_schema", + table_name="test_table", + shard_id=1, + node_name="node1", + is_primary=True, + partition_ident="", + local_checkpoint=1500, + global_checkpoint=500, + translog_uncommitted_bytes=10485760, # 10MB + timestamp=time.time() + ) + + assert snapshot.checkpoint_delta == 1000 + assert snapshot.translog_uncommitted_mb == 10.0 + assert snapshot.shard_identifier == "test_schema.test_table:1:node1:P" + + +class TestActiveShardActivity: + """Test ActiveShardActivity dataclass""" + + def test_activity_calculations(self): + """Test activity rate and property calculations""" + snapshot1 = ActiveShardSnapshot( + schema_name="test_schema", + table_name="test_table", + shard_id=1, + node_name="node1", + is_primary=True, + partition_ident="", + local_checkpoint=1000, + global_checkpoint=500, + translog_uncommitted_bytes=5242880, # 5MB + timestamp=100.0 + ) + + snapshot2 = ActiveShardSnapshot( + schema_name="test_schema", + table_name="test_table", + shard_id=1, + node_name="node1", + is_primary=True, + partition_ident="", + local_checkpoint=1500, + global_checkpoint=500, + translog_uncommitted_bytes=10485760, # 10MB + timestamp=130.0 # 30 seconds later + ) + + activity = ActiveShardActivity( + schema_name="test_schema", + table_name="test_table", + shard_id=1, + node_name="node1", + is_primary=True, + partition_ident="", + local_checkpoint_delta=500, + snapshot1=snapshot1, + snapshot2=snapshot2, + time_diff_seconds=30.0 + ) + + assert activity.activity_rate == 500 / 30.0 # ~16.67 changes/sec + assert activity.shard_type == "PRIMARY" + assert activity.table_identifier == "test_schema.test_table" + + +class TestCrateDBClientActiveShards: + """Test CrateDB client active shards functionality""" + + @patch.object(CrateDBClient, 'execute_query') + def test_get_active_shards_snapshot_success(self, mock_execute): + """Test successful snapshot retrieval""" + mock_execute.return_value = { + 'rows': [ + ['schema1', 'table1', 1, True, 'node1', '', 10485760, 1500, 
500], + ['schema1', 'table2', 2, False, 'node2', 'part1', 20971520, 2000, 800] + ] + } + + client = CrateDBClient("http://test") + snapshots = client.get_active_shards_snapshot(min_checkpoint_delta=1000) + + assert len(snapshots) == 2 + + # Check first snapshot + snap1 = snapshots[0] + assert snap1.schema_name == 'schema1' + assert snap1.table_name == 'table1' + assert snap1.shard_id == 1 + assert snap1.is_primary is True + assert snap1.node_name == 'node1' + assert snap1.local_checkpoint == 1500 + assert snap1.global_checkpoint == 500 + assert snap1.checkpoint_delta == 1000 + assert snap1.translog_uncommitted_mb == 10.0 + + # Check second snapshot + snap2 = snapshots[1] + assert snap2.schema_name == 'schema1' + assert snap2.table_name == 'table2' + assert snap2.shard_id == 2 + assert snap2.is_primary is False + assert snap2.node_name == 'node2' + assert snap2.partition_ident == 'part1' + assert snap2.checkpoint_delta == 1200 + assert snap2.translog_uncommitted_mb == 20.0 + + # Verify query was called without checkpoint delta filter (new behavior) + mock_execute.assert_called_once() + args = mock_execute.call_args[0] + # No longer passes min_checkpoint_delta parameter + assert len(args) == 1 # Only the query, no parameters + + @patch.object(CrateDBClient, 'execute_query') + def test_get_active_shards_snapshot_empty(self, mock_execute): + """Test snapshot retrieval with no results""" + mock_execute.return_value = {'rows': []} + + client = CrateDBClient("http://test") + snapshots = client.get_active_shards_snapshot(min_checkpoint_delta=1000) + + assert snapshots == [] + + @patch.object(CrateDBClient, 'execute_query') + def test_get_active_shards_snapshot_error(self, mock_execute): + """Test snapshot retrieval with database error""" + mock_execute.side_effect = Exception("Database connection failed") + + client = CrateDBClient("http://test") + snapshots = client.get_active_shards_snapshot(min_checkpoint_delta=1000) + + assert snapshots == [] + + +class 
TestActiveShardMonitor: + """Test ActiveShardMonitor class""" + + def setup_method(self): + """Set up test fixtures""" + self.mock_client = Mock(spec=CrateDBClient) + self.monitor = ActiveShardMonitor(self.mock_client) + + def create_test_snapshot(self, schema: str, table: str, shard_id: int, node: str, + is_primary: bool, local_checkpoint: int, timestamp: float): + """Helper to create test snapshots""" + return ActiveShardSnapshot( + schema_name=schema, + table_name=table, + shard_id=shard_id, + node_name=node, + is_primary=is_primary, + partition_ident="", + local_checkpoint=local_checkpoint, + global_checkpoint=500, # Fixed for simplicity + translog_uncommitted_bytes=10485760, # 10MB + timestamp=timestamp + ) + + def test_compare_snapshots_with_activity(self): + """Test comparing snapshots with active shards""" + # Create first snapshot + snapshot1 = [ + self.create_test_snapshot("schema1", "table1", 1, "node1", True, 1000, 100.0), + self.create_test_snapshot("schema1", "table2", 1, "node2", False, 2000, 100.0), + self.create_test_snapshot("schema1", "table3", 1, "node1", True, 3000, 100.0), + ] + + # Create second snapshot (30 seconds later with activity) + snapshot2 = [ + self.create_test_snapshot("schema1", "table1", 1, "node1", True, 1500, 130.0), # +500 + self.create_test_snapshot("schema1", "table2", 1, "node2", False, 2200, 130.0), # +200 + self.create_test_snapshot("schema1", "table3", 1, "node1", True, 3000, 130.0), # No change + self.create_test_snapshot("schema1", "table4", 1, "node3", True, 1000, 130.0), # New shard + ] + + activities = self.monitor.compare_snapshots(snapshot1, snapshot2, min_activity_threshold=1) + + # Should have 2 activities (table3 had no change, table4 is new) + assert len(activities) == 2 + + # Check activities are sorted by checkpoint delta (highest first) + assert activities[0].local_checkpoint_delta == 500 # table1 + assert activities[0].schema_name == "schema1" + assert activities[0].table_name == "table1" + + assert 
activities[1].local_checkpoint_delta == 200 # table2 + assert activities[1].schema_name == "schema1" + assert activities[1].table_name == "table2" + + # Check activity rate calculation + assert activities[0].activity_rate == 500 / 30.0 # ~16.67/sec + assert activities[1].activity_rate == 200 / 30.0 # ~6.67/sec + + def test_compare_snapshots_no_activity(self): + """Test comparing snapshots with no activity""" + # Create identical snapshots + snapshot1 = [ + self.create_test_snapshot("schema1", "table1", 1, "node1", True, 1000, 100.0), + ] + + snapshot2 = [ + self.create_test_snapshot("schema1", "table1", 1, "node1", True, 1000, 130.0), # No change + ] + + activities = self.monitor.compare_snapshots(snapshot1, snapshot2, min_activity_threshold=1) + + assert activities == [] + + def test_compare_snapshots_no_overlap(self): + """Test comparing snapshots with no overlapping shards""" + snapshot1 = [ + self.create_test_snapshot("schema1", "table1", 1, "node1", True, 1000, 100.0), + ] + + snapshot2 = [ + self.create_test_snapshot("schema1", "table2", 1, "node2", True, 1500, 130.0), # Different shard + ] + + activities = self.monitor.compare_snapshots(snapshot1, snapshot2, min_activity_threshold=1) + + assert activities == [] + + def test_format_activity_display_with_activities(self): + """Test formatting activity display with data""" + # Create test activities + snapshot1 = self.create_test_snapshot("schema1", "table1", 1, "node1", True, 1000, 100.0) + snapshot2 = self.create_test_snapshot("schema1", "table1", 1, "node1", True, 1500, 130.0) + + activity = ActiveShardActivity( + schema_name="schema1", + table_name="table1", + shard_id=1, + node_name="node1", + is_primary=True, + partition_ident="", + local_checkpoint_delta=500, + snapshot1=snapshot1, + snapshot2=snapshot2, + time_diff_seconds=30.0 + ) + + display = self.monitor.format_activity_display([activity], show_count=10, watch_mode=False) + + # Check that output contains expected elements + assert "Most Active 
Shards" in display + assert "schema1.table1" in display + assert "500" in display # checkpoint delta + assert "16.7" in display # activity rate + assert "P" in display # primary indicator + assert "Legend:" in display + assert "Trend:" in display # new trend column explanation + assert "Partition:" in display # new partition column explanation + + def test_format_activity_display_empty(self): + """Test formatting activity display with no data""" + display = self.monitor.format_activity_display([], show_count=10, watch_mode=False) + + assert "No active shards with significant checkpoint progression found" in display + + def test_format_activity_display_count_limit(self): + """Test that display respects show_count limit""" + # Create multiple activities + activities = [] + for i in range(15): + snapshot1 = self.create_test_snapshot("schema1", f"table{i}", 1, "node1", True, 1000, 100.0) + snapshot2 = self.create_test_snapshot("schema1", f"table{i}", 1, "node1", True, 1000 + (i+1)*100, 130.0) + + activity = ActiveShardActivity( + schema_name="schema1", + table_name=f"table{i}", + shard_id=1, + node_name="node1", + is_primary=True, + partition_ident="", + local_checkpoint_delta=(i+1)*100, + snapshot1=snapshot1, + snapshot2=snapshot2, + time_diff_seconds=30.0 + ) + activities.append(activity) + + # Sort activities by checkpoint delta (highest first) - same as compare_snapshots does + activities.sort(key=lambda x: x.local_checkpoint_delta, reverse=True) + + # Should only show top 5 + display = self.monitor.format_activity_display(activities, show_count=5, watch_mode=False) + + # Count number of table entries in display + table_count = display.count("schema1.table") + assert table_count == 5 # Should only show 5 entries + + # Should show highest activity first (table14 has highest checkpoint delta) + assert "schema1.table14" in display + + def test_compare_snapshots_with_activity_threshold(self): + """Test filtering activities by minimum threshold""" + # Create snapshots 
with various activity levels + snapshot1 = [ + self.create_test_snapshot("schema1", "table1", 1, "node1", True, 1000, 100.0), # Will have +2000 delta + self.create_test_snapshot("schema1", "table2", 1, "node2", False, 2000, 100.0), # Will have +500 delta + self.create_test_snapshot("schema1", "table3", 1, "node1", True, 3000, 100.0), # Will have +100 delta + ] + + snapshot2 = [ + self.create_test_snapshot("schema1", "table1", 1, "node1", True, 3000, 130.0), # +2000 delta + self.create_test_snapshot("schema1", "table2", 1, "node2", False, 2500, 130.0), # +500 delta + self.create_test_snapshot("schema1", "table3", 1, "node1", True, 3100, 130.0), # +100 delta + ] + + # Test with threshold of 1000 - should only show table1 (2000 delta) + activities_high_threshold = self.monitor.compare_snapshots(snapshot1, snapshot2, min_activity_threshold=1000) + assert len(activities_high_threshold) == 1 + assert activities_high_threshold[0].table_name == "table1" + assert activities_high_threshold[0].local_checkpoint_delta == 2000 + + # Test with threshold of 200 - should show table1 and table2 + activities_medium_threshold = self.monitor.compare_snapshots(snapshot1, snapshot2, min_activity_threshold=200) + assert len(activities_medium_threshold) == 2 + assert activities_medium_threshold[0].local_checkpoint_delta == 2000 # table1 first (highest) + assert activities_medium_threshold[1].local_checkpoint_delta == 500 # table2 second + + # Test with threshold of 0 - should show all three + activities_low_threshold = self.monitor.compare_snapshots(snapshot1, snapshot2, min_activity_threshold=0) + assert len(activities_low_threshold) == 3 + assert activities_low_threshold[0].local_checkpoint_delta == 2000 # Sorted by activity + assert activities_low_threshold[1].local_checkpoint_delta == 500 + assert activities_low_threshold[2].local_checkpoint_delta == 100 + + def test_primary_replica_separation(self): + """Test that primary and replica shards are tracked separately""" + # Create 
snapshots with same table/shard but different primary/replica + snapshot1 = [ + # Primary shard + self.create_test_snapshot("gc", "scheduled_jobs_log", 0, "data-hot-8", True, 15876, 100.0), + # Replica shard (same table/shard/node but different type) + self.create_test_snapshot("gc", "scheduled_jobs_log", 0, "data-hot-8", False, 129434, 100.0), + ] + + snapshot2 = [ + # Primary shard progresses normally + self.create_test_snapshot("gc", "scheduled_jobs_log", 0, "data-hot-8", True, 16000, 130.0), # +124 delta + # Replica shard progresses normally + self.create_test_snapshot("gc", "scheduled_jobs_log", 0, "data-hot-8", False, 129500, 130.0), # +66 delta + ] + + activities = self.monitor.compare_snapshots(snapshot1, snapshot2, min_activity_threshold=1) + + # Should have 2 separate activities (primary and replica tracked separately) + assert len(activities) == 2 + + # Find primary and replica activities + primary_activity = next(a for a in activities if a.is_primary) + replica_activity = next(a for a in activities if not a.is_primary) + + # Verify deltas are calculated correctly for each type + assert primary_activity.local_checkpoint_delta == 124 # 16000 - 15876 + assert replica_activity.local_checkpoint_delta == 66 # 129500 - 129434 + + # Verify they have different shard identifiers + assert primary_activity.snapshot1.shard_identifier != replica_activity.snapshot1.shard_identifier + assert "data-hot-8:P" in primary_activity.snapshot1.shard_identifier + assert "data-hot-8:R" in replica_activity.snapshot1.shard_identifier + + # This test prevents the bug where we mixed primary CP End with replica CP Start + # which created fake deltas like 129434 - 15876 = 113558 + + def test_partition_separation(self): + """Test that partitions within the same table/shard are tracked separately""" + # Create snapshots with same table/shard but different partitions + snapshot1 = [ + # Partition 1 + self.create_test_snapshot("TURVO", "appointmentFormFieldData_events", 0, "data-hot-8", 
True, 32684, 100.0), + # Partition 2 (same table/shard/node/type but different partition) + self.create_test_snapshot("TURVO", "appointmentFormFieldData_events", 0, "data-hot-8", True, 54289, 100.0), + ] + + # Modify partition_ident for the snapshots to simulate different partitions + snapshot1[0].partition_ident = "04732dpl6osj8d1g60o30c1g" + snapshot1[1].partition_ident = "04732dpl6os3adpm60o30c1g" + + snapshot2 = [ + # Partition 1 progresses + self.create_test_snapshot("TURVO", "appointmentFormFieldData_events", 0, "data-hot-8", True, 32800, 130.0), # +116 delta + # Partition 2 progresses + self.create_test_snapshot("TURVO", "appointmentFormFieldData_events", 0, "data-hot-8", True, 54400, 130.0), # +111 delta + ] + + # Set partition_ident for second snapshot + snapshot2[0].partition_ident = "04732dpl6osj8d1g60o30c1g" + snapshot2[1].partition_ident = "04732dpl6os3adpm60o30c1g" + + activities = self.monitor.compare_snapshots(snapshot1, snapshot2, min_activity_threshold=1) + + # Should have 2 separate activities (partitions tracked separately) + assert len(activities) == 2 + + # Verify deltas are calculated correctly for each partition + partition1_activity = next(a for a in activities if "04732dpl6osj8d1g60o30c1g" in a.snapshot1.shard_identifier) + partition2_activity = next(a for a in activities if "04732dpl6os3adpm60o30c1g" in a.snapshot1.shard_identifier) + + assert partition1_activity.local_checkpoint_delta == 116 # 32800 - 32684 + assert partition2_activity.local_checkpoint_delta == 111 # 54400 - 54289 + + # Verify they have different shard identifiers due to partition + assert partition1_activity.snapshot1.shard_identifier != partition2_activity.snapshot1.shard_identifier + assert ":04732dpl6osj8d1g60o30c1g" in partition1_activity.snapshot1.shard_identifier + assert ":04732dpl6os3adpm60o30c1g" in partition2_activity.snapshot1.shard_identifier + + # This test prevents mixing partitions which would create fake activity measurements + + def 
test_format_activity_display_watch_mode(self): + """Test that watch mode excludes legend and insights""" + snapshot1 = self.create_test_snapshot("schema1", "table1", 1, "node1", True, 1000, 100.0) + snapshot2 = self.create_test_snapshot("schema1", "table1", 1, "node1", True, 1500, 130.0) + + activity = ActiveShardActivity( + schema_name="schema1", + table_name="table1", + shard_id=1, + node_name="node1", + is_primary=True, + partition_ident="", + local_checkpoint_delta=500, + snapshot1=snapshot1, + snapshot2=snapshot2, + time_diff_seconds=30.0 + ) + + # Test non-watch mode (should include legend and insights) + normal_display = self.monitor.format_activity_display([activity], show_count=10, watch_mode=False) + assert "Legend:" in normal_display + assert "Insights:" in normal_display + assert "Checkpoint Δ:" in normal_display + + # Test watch mode (should exclude legend and insights) + watch_display = self.monitor.format_activity_display([activity], show_count=10, watch_mode=True) + assert "Legend:" not in watch_display + assert "Insights:" not in watch_display + assert "Checkpoint Δ" in watch_display # Core data should still be present + + # But should still contain the core data + assert "Most Active Shards" in watch_display + assert "schema1.table1" in watch_display + assert "500" in watch_display # checkpoint delta \ No newline at end of file diff --git a/tests/test_distribution_analyzer.py b/tests/test_distribution_analyzer.py new file mode 100644 index 0000000..42e92a0 --- /dev/null +++ b/tests/test_distribution_analyzer.py @@ -0,0 +1,294 @@ +""" +Tests for distribution analyzer functionality +""" + +import pytest +from unittest.mock import Mock, patch +from xmover.distribution_analyzer import DistributionAnalyzer, TableDistribution, DistributionAnomaly +from xmover.database import CrateDBClient, NodeInfo + + +class TestDistributionAnalyzer: + + def setup_method(self): + """Set up test fixtures""" + self.mock_client = Mock(spec=CrateDBClient) + self.analyzer = 
DistributionAnalyzer(self.mock_client) + + def test_coefficient_of_variation_calculation(self): + """Test CV calculation with different scenarios""" + + # Normal case + values = [10, 12, 8, 14, 6] + cv = self.analyzer.calculate_coefficient_of_variation(values) + assert cv > 0 + + # All equal values (should return 0) + equal_values = [10, 10, 10, 10] + cv_equal = self.analyzer.calculate_coefficient_of_variation(equal_values) + assert cv_equal == 0.0 + + # Empty list + empty_values = [] + cv_empty = self.analyzer.calculate_coefficient_of_variation(empty_values) + assert cv_empty == 0.0 + + # Single value + single_value = [10] + cv_single = self.analyzer.calculate_coefficient_of_variation(single_value) + assert cv_single == 0.0 + + def test_get_largest_tables_distribution(self): + """Test fetching table distribution data""" + + # Mock query results + mock_results = [ + # schema, table, node, primary_shards, replica_shards, total_shards, total_size, primary_size, replica_size, docs + ['doc', 'large_table', 'node1', 5, 2, 7, 100.5, 80.2, 20.3, 1000000], + ['doc', 'large_table', 'node2', 4, 3, 7, 95.1, 75.8, 19.3, 950000], + ['doc', 'large_table', 'node3', 6, 1, 7, 110.2, 85.9, 24.3, 1100000], + ['custom', 'another_table', 'node1', 3, 2, 5, 50.1, 40.2, 9.9, 500000], + ['custom', 'another_table', 'node2', 2, 3, 5, 45.8, 35.1, 10.7, 480000], + ] + + self.mock_client.execute_query.return_value = {'rows': mock_results} + + distributions = self.analyzer.get_largest_tables_distribution(top_n=10) + + # Verify query was called with correct parameters + self.mock_client.execute_query.assert_called_once() + call_args = self.mock_client.execute_query.call_args + assert call_args[0][1] == [10] # top_n parameter + + # Verify we got the expected number of tables + assert len(distributions) == 2 + + # Verify table data structure + large_table = next(d for d in distributions if d.table_name == 'large_table') + assert large_table.schema_name == 'doc' + assert large_table.full_table_name 
== 'large_table' # Should omit 'doc' schema + assert len(large_table.node_distributions) == 3 + + another_table = next(d for d in distributions if d.table_name == 'another_table') + assert another_table.schema_name == 'custom' + assert another_table.full_table_name == 'custom.another_table' + assert len(another_table.node_distributions) == 2 + + # Verify sorting by primary size (descending) + assert distributions[0].total_primary_size_gb >= distributions[1].total_primary_size_gb + + def test_detect_shard_count_imbalance(self): + """Test shard count imbalance detection""" + + # Create test table with imbalanced shard distribution + imbalanced_table = TableDistribution( + schema_name='doc', + table_name='imbalanced_table', + total_primary_size_gb=500.0, + node_distributions={ + 'node1': {'total_shards': 10, 'primary_shards': 5, 'replica_shards': 5}, + 'node2': {'total_shards': 15, 'primary_shards': 8, 'replica_shards': 7}, + 'node3': {'total_shards': 5, 'primary_shards': 2, 'replica_shards': 3}, + } + ) + + anomaly = self.analyzer.detect_shard_count_imbalance(imbalanced_table) + + assert anomaly is not None + assert anomaly.anomaly_type == "Shard Count Imbalance" + assert anomaly.combined_score > 0 + assert len(anomaly.recommendations) > 0 + + # Create balanced table (should not detect anomaly) + balanced_table = TableDistribution( + schema_name='doc', + table_name='balanced_table', + total_primary_size_gb=100.0, + node_distributions={ + 'node1': {'total_shards': 8, 'primary_shards': 4, 'replica_shards': 4}, + 'node2': {'total_shards': 8, 'primary_shards': 4, 'replica_shards': 4}, + 'node3': {'total_shards': 8, 'primary_shards': 4, 'replica_shards': 4}, + } + ) + + no_anomaly = self.analyzer.detect_shard_count_imbalance(balanced_table) + assert no_anomaly is None + + def test_detect_storage_imbalance(self): + """Test storage imbalance detection""" + + # Create test table with storage imbalance + storage_imbalanced_table = TableDistribution( + schema_name='doc', + 
table_name='storage_imbalanced', + total_primary_size_gb=300.0, + node_distributions={ + 'node1': {'total_size_gb': 150.0, 'primary_size_gb': 100.0, 'replica_size_gb': 50.0}, + 'node2': {'total_size_gb': 50.0, 'primary_size_gb': 30.0, 'replica_size_gb': 20.0}, + 'node3': {'total_size_gb': 100.0, 'primary_size_gb': 70.0, 'replica_size_gb': 30.0}, + } + ) + + anomaly = self.analyzer.detect_storage_imbalance(storage_imbalanced_table) + + assert anomaly is not None + assert anomaly.anomaly_type == "Storage Imbalance" + assert anomaly.combined_score > 0 + + # Small table (should be ignored) + small_table = TableDistribution( + schema_name='doc', + table_name='small_table', + total_primary_size_gb=0.1, + node_distributions={ + 'node1': {'total_size_gb': 0.5, 'primary_size_gb': 0.05, 'replica_size_gb': 0.05}, + 'node2': {'total_size_gb': 0.1, 'primary_size_gb': 0.03, 'replica_size_gb': 0.02}, + } + ) + + no_anomaly = self.analyzer.detect_storage_imbalance(small_table) + assert no_anomaly is None + + def test_detect_node_coverage_issues(self): + """Test node coverage issue detection""" + + # Mock nodes_info to simulate cluster with 4 nodes + class MockNode: + def __init__(self, name): + self.name = name + + mock_nodes = [ + MockNode('node1'), MockNode('node2'), + MockNode('node3'), MockNode('node4') + ] + self.mock_client.get_nodes_info.return_value = mock_nodes + + # Table with limited coverage (only on 2 out of 4 nodes) + limited_coverage_table = TableDistribution( + schema_name='doc', + table_name='limited_coverage', + total_primary_size_gb=100.0, # Significant size + node_distributions={ + 'node1': {'total_shards': 10, 'primary_shards': 5, 'replica_shards': 5}, + 'node2': {'total_shards': 10, 'primary_shards': 5, 'replica_shards': 5}, + # node3 and node4 missing + } + ) + + anomaly = self.analyzer.detect_node_coverage_issues(limited_coverage_table) + + assert anomaly is not None + assert anomaly.anomaly_type == "Node Coverage Issue" + assert 'node3' in 
anomaly.details['nodes_without_shards'] + assert 'node4' in anomaly.details['nodes_without_shards'] + assert len(anomaly.recommendations) > 0 + + def test_detect_document_imbalance(self): + """Test document imbalance detection""" + + # Table with document imbalance + doc_imbalanced_table = TableDistribution( + schema_name='doc', + table_name='doc_imbalanced', + total_primary_size_gb=200.0, + node_distributions={ + 'node1': {'total_documents': 1000000}, # 1M docs + 'node2': {'total_documents': 500000}, # 500K docs + 'node3': {'total_documents': 100000}, # 100K docs (5x imbalance) + } + ) + + anomaly = self.analyzer.detect_document_imbalance(doc_imbalanced_table) + + assert anomaly is not None + assert anomaly.anomaly_type == "Document Imbalance" + assert "data skew" in anomaly.recommendations[0].lower() + + # Table with very few documents (should be ignored) + low_doc_table = TableDistribution( + schema_name='doc', + table_name='low_docs', + total_primary_size_gb=100.0, + node_distributions={ + 'node1': {'total_documents': 1000}, + 'node2': {'total_documents': 500}, + } + ) + + no_anomaly = self.analyzer.detect_document_imbalance(low_doc_table) + assert no_anomaly is None + + def test_analyze_distribution_integration(self): + """Test the full analysis workflow""" + + # Mock the get_largest_tables_distribution method + mock_table = TableDistribution( + schema_name='doc', + table_name='test_table', + total_primary_size_gb=500.0, + node_distributions={ + 'node1': { + 'total_shards': 15, 'primary_shards': 8, 'replica_shards': 7, + 'total_size_gb': 200.0, 'primary_size_gb': 120.0, 'replica_size_gb': 80.0, + 'total_documents': 2000000 + }, + 'node2': { + 'total_shards': 8, 'primary_shards': 4, 'replica_shards': 4, + 'total_size_gb': 100.0, 'primary_size_gb': 60.0, 'replica_size_gb': 40.0, + 'total_documents': 1000000 + }, + 'node3': { + 'total_shards': 5, 'primary_shards': 3, 'replica_shards': 2, + 'total_size_gb': 50.0, 'primary_size_gb': 30.0, 'replica_size_gb': 20.0, + 
'total_documents': 500000 + }, + } + ) + + with patch.object(self.analyzer, 'get_largest_tables_distribution', return_value=[mock_table]): + anomalies, tables_analyzed = self.analyzer.analyze_distribution(top_tables=10) + + # Should detect multiple types of anomalies + assert len(anomalies) > 0 + assert tables_analyzed == 1 # We provided 1 mock table + + # Anomalies should be sorted by combined score (descending) + if len(anomalies) > 1: + for i in range(len(anomalies) - 1): + assert anomalies[i].combined_score >= anomalies[i + 1].combined_score + + # Each anomaly should have required fields + for anomaly in anomalies: + assert anomaly.table is not None + assert anomaly.anomaly_type is not None + assert anomaly.combined_score >= 0 + assert isinstance(anomaly.recommendations, list) + + def test_format_distribution_report_no_anomalies(self): + """Test report formatting when no anomalies found""" + + # This should not raise an exception + with patch('builtins.print'): # Mock print to avoid console output during tests + self.analyzer.format_distribution_report([], 5) + + def test_format_distribution_report_with_anomalies(self): + """Test report formatting with anomalies""" + + mock_anomaly = DistributionAnomaly( + table=TableDistribution('doc', 'test_table', 100.0, {}), + anomaly_type='Test Anomaly', + severity_score=7.5, + impact_score=8.0, + combined_score=60.0, + description='Test description', + details={}, + recommendations=['Test recommendation'] + ) + + # This should not raise an exception + with patch('builtins.print'): # Mock print to avoid console output during tests + self.analyzer.format_distribution_report([mock_anomaly], 3) + + +if __name__ == '__main__': + pytest.main([__file__]) \ No newline at end of file diff --git a/tests/test_problematic_translogs.py b/tests/test_problematic_translogs.py new file mode 100644 index 0000000..9446e59 --- /dev/null +++ b/tests/test_problematic_translogs.py @@ -0,0 +1,402 @@ +""" +Tests for problematic translogs 
functionality with replica management +""" + +import pytest +from unittest.mock import Mock, patch +from click.testing import CliRunner +from xmover.cli import main, problematic_translogs +from xmover.database import CrateDBClient + + +class TestProblematicTranslogs: + + def setup_method(self): + """Set up test fixtures""" + self.runner = CliRunner() + self.mock_client = Mock(spec=CrateDBClient) + + def test_no_problematic_tables(self): + """Test when no tables meet the criteria""" + self.mock_client.execute_query.return_value = {'rows': []} + self.mock_client.test_connection.return_value = True + + with patch('xmover.cli.CrateDBClient', return_value=self.mock_client): + result = self.runner.invoke(main, ['problematic-translogs', '--sizeMB', '300']) + + assert result.exit_code == 0 + assert 'No tables found with replica shards having translog uncommitted size > 300MB' in result.output + + def test_non_partitioned_table_command_generation(self): + """Test ALTER command generation for non-partitioned tables""" + # Individual shards data (6 columns) + individual_shards_data = [ + ['TURVO', 'shipmentFormFieldData', None, 14, 'data-hot-6', 7011.8], + ['TURVO', 'orderFormFieldData', 'NULL', 5, 'data-hot-1', 469.5] + ] + # Summary data (10 columns from query, displayed as 8 by combining P/R columns) + summary_data = [ + ['TURVO', 'shipmentFormFieldData', None, None, 3, 7011.8, 5, 5, 12.4, 12.1], + ['TURVO', 'orderFormFieldData', 'NULL', None, 1, 469.5, 3, 6, 8.2, 16.3] + ] + self.mock_client.execute_query.side_effect = [ + {'rows': individual_shards_data}, # Individual shards query + {'rows': summary_data}, # Summary query + {'rows': [[1]]}, # Replica count for shipmentFormFieldData + {'rows': [[2]]}, # Replica count for orderFormFieldData + ] + self.mock_client.test_connection.return_value = True + + with patch('xmover.cli.CrateDBClient', return_value=self.mock_client): + result = self.runner.invoke(main, ['problematic-translogs', '--sizeMB', '300']) + + assert 
result.exit_code == 0 + assert 'Found 2 table/partition(s) with problematic translogs' in result.output + assert 'Tables with Problematic Replicas' in result.output + assert 'Generated ALTER Commands:' in result.output + + # Check that replica management commands are present + assert 'SET ("number_of_replicas" = 0)' in result.output + assert 'SET ("number_of_replicas" = 1)' in result.output + assert 'SET ("number_of_replicas" = 2)' in result.output + assert 'ALTER TABLE "TURVO"."shipmentFormFieldData"' in result.output + assert 'ALTER TABLE "TURVO"."orderFormFieldData"' in result.output + + def test_partitioned_table_command_generation(self): + """Test ALTER command generation for partitioned tables""" + # Individual shards data (6 columns) + individual_shards_data = [ + ['TURVO', 'shipmentFormFieldData_events', '("sync_day"=1757376000000)', 3, 'data-hot-2', 481.2], + ] + # Summary data (10 columns from query, displayed as 8 by combining P/R columns) + summary_data = [ + ['TURVO', 'shipmentFormFieldData_events', '("sync_day"=1757376000000)', 'partition123', 2, 481.2, 2, 2, 1.1, 1.0], + ] + self.mock_client.execute_query.side_effect = [ + {'rows': individual_shards_data}, # Individual shards query + {'rows': summary_data}, # Summary query + {'rows': [[1]]}, # Replica count for partitioned table + ] + self.mock_client.test_connection.return_value = True + + with patch('xmover.cli.CrateDBClient', return_value=self.mock_client): + result = self.runner.invoke(main, ['problematic-translogs', '--sizeMB', '400']) + + assert result.exit_code == 0 + assert 'Found 1 table/partition(s) with problematic translogs' in result.output + assert 'Generated ALTER Commands:' in result.output + + # Check that partitioned table commands are present (handle Rich line wrapping) + assert 'ALTER TABLE "TURVO"."shipmentFormFieldData_events"' in result.output + assert 'PARTITION' in result.output + assert '("sync_day"=1757376000000)' in result.output + assert 'SET ("number_of_replicas" = 0)' 
in result.output + assert 'SET ("number_of_replicas" = 1)' in result.output + + def test_mixed_partitioned_non_partitioned(self): + """Test handling of both partitioned and non-partitioned tables""" + # Individual shards data (6 columns) + individual_shards_data = [ + ['TURVO', 'shipmentFormFieldData', None, 14, 'data-hot-6', 7011.8], + ['TURVO', 'shipmentFormFieldData_events', '("sync_day"=1757376000000)', 3, 'data-hot-2', 481.2], + ['TURVO', 'orderFormFieldData', 'NULL', 5, 'data-hot-1', 469.5] + ] + # Summary data (10 columns from query, displayed as 8 by combining P/R columns) + summary_data = [ + ['TURVO', 'shipmentFormFieldData', None, None, 2, 7011.8, 5, 5, 12.4, 12.1], + ['TURVO', 'shipmentFormFieldData_events', '("sync_day"=1757376000000)', 'partition123', 1, 481.2, 2, 2, 1.1, 1.0], + ['TURVO', 'orderFormFieldData', 'NULL', None, 1, 469.5, 3, 6, 8.2, 16.3] + ] + self.mock_client.execute_query.side_effect = [ + {'rows': individual_shards_data}, # Individual shards query + {'rows': summary_data}, # Summary query + {'rows': [[2]]}, # Replica count for shipmentFormFieldData + {'rows': [[1]]}, # Replica count for partitioned table + {'rows': [[3]]}, # Replica count for orderFormFieldData + ] + self.mock_client.test_connection.return_value = True + + with patch('xmover.cli.CrateDBClient', return_value=self.mock_client): + result = self.runner.invoke(main, ['problematic-translogs', '--sizeMB', '200']) + + assert result.exit_code == 0 + assert 'Found 3 table/partition(s) with problematic translogs' in result.output + + # Check non-partitioned commands + assert 'ALTER TABLE "TURVO"."shipmentFormFieldData" SET ("number_of_replicas" = 0)' in result.output + assert 'ALTER TABLE "TURVO"."shipmentFormFieldData" SET ("number_of_replicas" = 2)' in result.output + + # Check partitioned commands (handle Rich line wrapping) + assert 'ALTER TABLE "TURVO"."shipmentFormFieldData_events"' in result.output + assert 'PARTITION' in result.output + assert 
'("sync_day"=1757376000000)' in result.output + # Check that both 0 and 1 replica settings are present for partitioned table + assert 'SET ("number_of_replicas" = 0)' in result.output + assert 'SET ("number_of_replicas" = 1)' in result.output + + # Check NULL partition handled as non-partitioned + assert 'ALTER TABLE "TURVO"."orderFormFieldData" SET ("number_of_replicas" = 0)' in result.output + assert 'ALTER TABLE "TURVO"."orderFormFieldData" SET ("number_of_replicas" = 3)' in result.output + + def test_query_parameters(self): + """Test that the query is called with correct parameters""" + self.mock_client.execute_query.return_value = {'rows': []} + self.mock_client.test_connection.return_value = True + + with patch('xmover.cli.CrateDBClient', return_value=self.mock_client): + result = self.runner.invoke(main, ['problematic-translogs', '--sizeMB', '500']) + + # Verify the query was called twice (individual shards + summary) + assert self.mock_client.execute_query.call_count == 2 + call_args = self.mock_client.execute_query.call_args + query = call_args[0][0] + parameters = call_args[0][1] + + assert 'sh.translog_stats[\'uncommitted_size\'] > ? 
* 1024^2' in query + assert 'primary=FALSE' in query + assert 'GROUP BY' in query + assert 'max_translog_uncommitted_mb DESC' in query + assert parameters == [500, 500, 500] + + def test_execute_flag_user_confirmation_no(self): + """Test --execute flag with user declining confirmation""" + # Individual shards data (6 columns) + individual_shards_data = [ + ['TURVO', 'shipmentFormFieldData', None, 14, 'data-hot-6', 7011.8] + ] + # Summary data (10 columns from query, displayed as 8 by combining P/R columns) + summary_data = [ + ['TURVO', 'shipmentFormFieldData', None, None, 1, 7011.8, 5, 5, 12.4, 12.1] + ] + self.mock_client.execute_query.side_effect = [ + {'rows': individual_shards_data}, # Individual shards query + {'rows': summary_data}, # Summary query + {'rows': [[1]]}, # Replica count query + ] + self.mock_client.test_connection.return_value = True + + with patch('xmover.cli.CrateDBClient', return_value=self.mock_client), \ + patch('click.confirm', return_value=False): + result = self.runner.invoke(main, ['problematic-translogs', '--execute']) + + assert result.exit_code == 0 + assert 'Operation cancelled by user' in result.output + # Should be called 3 times: individual shards query, summary query, replica count query + assert self.mock_client.execute_query.call_count == 3 + + def test_execute_flag_user_confirmation_set_zero_only(self): + """Test --execute flag with user confirming reroute but skipping replica commands""" + # Individual shards data (6 columns) + individual_shards_data = [ + ['TURVO', 'shipmentFormFieldData', None, 14, 'data-hot-6', 7011.8] + ] + # Summary data (10 columns from query, displayed as 8 by combining P/R columns) + summary_data = [ + ['TURVO', 'shipmentFormFieldData', None, None, 1, 7011.8, 5, 5, 12.4, 12.1] + ] + self.mock_client.execute_query.side_effect = [ + {'rows': individual_shards_data}, # Individual shards query + {'rows': summary_data}, # Summary query + {'rows': [[1]]}, # Replica count query + None, # REROUTE CANCEL 
execution + ] + self.mock_client.test_connection.return_value = True + + # Confirm overall execution, confirm REROUTE CANCEL, skip SET to 0 + with patch('xmover.cli.CrateDBClient', return_value=self.mock_client), \ + patch('click.confirm', side_effect=[True, True, False]): + result = self.runner.invoke(main, ['problematic-translogs', '--execute']) + + assert result.exit_code == 0 + assert 'Executing commands individually' in result.output + assert 'executed successfully' in result.output + assert 'skipped' in result.output + + # Should be called 4 times: individual query, summary query, replica count, reroute execution + assert self.mock_client.execute_query.call_count == 4 + + def test_execute_flag_user_confirmation_both_steps(self): + """Test --execute flag with user confirming all commands""" + # Individual shards data (6 columns) + individual_shards_data = [ + ['TURVO', 'shipmentFormFieldData', None, 14, 'data-hot-6', 7011.8] + ] + # Summary data (10 columns from query, displayed as 8 by combining P/R columns) + summary_data = [ + ['TURVO', 'shipmentFormFieldData', None, None, 1, 7011.8, 5, 5, 12.4, 12.1] + ] + self.mock_client.execute_query.side_effect = [ + {'rows': individual_shards_data}, # Individual shards query + {'rows': summary_data}, # Summary query + {'rows': [[1]]}, # Replica count query + None, # REROUTE CANCEL execution + None, # SET to 0 execution + None, # RESTORE execution + ] + self.mock_client.test_connection.return_value = True + + # Confirm overall execution, confirm REROUTE CANCEL, confirm SET to 0, confirm RESTORE + with patch('xmover.cli.CrateDBClient', return_value=self.mock_client), \ + patch('click.confirm', side_effect=[True, True, True, True]): + result = self.runner.invoke(main, ['problematic-translogs', '--execute']) + + assert result.exit_code == 0 + assert 'Executing commands individually' in result.output + assert 'executed successfully' in result.output + assert 'Execution Summary:' in result.output + assert 'Successful: 3' in 
result.output + + # Should be called 6 times: individual query, summary query, replica count, reroute, set to 0, restore + assert self.mock_client.execute_query.call_count == 6 + + def test_execution_failure_handling(self): + """Test handling of command execution failures""" + # Individual shards data (6 columns) + individual_shards_data = [ + ['TURVO', 'shipmentFormFieldData', None, 14, 'data-hot-6', 7011.8] + ] + # Summary data (10 columns from query, displayed as 8 by combining P/R columns) + summary_data = [ + ['TURVO', 'shipmentFormFieldData', None, None, 1, 7011.8, 5, 5, 12.4, 12.1] + ] + self.mock_client.execute_query.side_effect = [ + {'rows': individual_shards_data}, # Individual shards query + {'rows': summary_data}, # Summary query + {'rows': [[1]]}, # Replica count query + Exception("REROUTE failed"), # REROUTE CANCEL execution fails + ] + self.mock_client.test_connection.return_value = True + + # Confirm overall execution, confirm REROUTE CANCEL (which fails), then decline next command + with patch('xmover.cli.CrateDBClient', return_value=self.mock_client), \ + patch('click.confirm', side_effect=[True, True, False]): + result = self.runner.invoke(main, ['problematic-translogs', '--execute']) + + assert result.exit_code == 0 + assert 'failed' in result.output + assert 'REROUTE failed' in result.output + assert 'Failed: 1' in result.output + + def test_skip_tables_with_unknown_replicas(self): + """Test skipping tables with unknown replica counts""" + # Individual shards data (6 columns) + individual_shards_data = [ + ['TURVO', 'shipmentFormFieldData', None, 14, 'data-hot-6', 7011.8] + ] + # Summary data (10 columns from query, displayed as 8 by combining P/R columns) + summary_data = [ + ['TURVO', 'shipmentFormFieldData', None, None, 1, 7011.8, 5, 5, 12.4, 12.1] + ] + self.mock_client.execute_query.side_effect = [ + {'rows': individual_shards_data}, # Individual shards query + {'rows': summary_data}, # Summary query + Exception("Cannot get replica 
count"), # Replica count query fails + ] + self.mock_client.test_connection.return_value = True + + with patch('xmover.cli.CrateDBClient', return_value=self.mock_client): + result = self.runner.invoke(main, ['problematic-translogs']) + + assert result.exit_code == 0 + assert 'Warning: Could not retrieve replica count' in result.output + assert 'Skipping' in result.output + assert 'unknown replica count' in result.output + assert 'REROUTE CANCEL commands' in result.output + assert '1 REROUTE CANCEL commands + 0 replica management commands' in result.output + + def test_skip_tables_with_zero_replicas(self): + """Test skipping tables that already have 0 replicas""" + # Individual shards data (6 columns) + individual_shards_data = [ + ['TURVO', 'shipmentFormFieldData', None, 14, 'data-hot-6', 7011.8] + ] + # Summary data (10 columns from query, displayed as 8 by combining P/R columns) + summary_data = [ + ['TURVO', 'shipmentFormFieldData', None, None, 1, 7011.8, 5, 5, 12.4, 12.1] + ] + self.mock_client.execute_query.side_effect = [ + {'rows': individual_shards_data}, # Individual shards query + {'rows': summary_data}, # Summary query + {'rows': [[0]]}, # Replica count query returns 0 + ] + self.mock_client.test_connection.return_value = True + + with patch('xmover.cli.CrateDBClient', return_value=self.mock_client): + result = self.runner.invoke(main, ['problematic-translogs']) + + assert result.exit_code == 0 + assert 'Skipping' in result.output + assert 'already has 0 replicas' in result.output + assert 'REROUTE CANCEL commands' in result.output + assert '1 REROUTE CANCEL commands + 0 replica management commands' in result.output + + def test_database_error_handling(self): + """Test handling of database connection errors""" + self.mock_client.execute_query.side_effect = Exception("Connection failed") + self.mock_client.test_connection.return_value = True + + with patch('xmover.cli.CrateDBClient', return_value=self.mock_client): + result = self.runner.invoke(main, 
['problematic-translogs']) + + assert result.exit_code == 0 + assert 'Error analyzing problematic translogs' in result.output + assert 'Connection failed' in result.output + + def test_default_size_mb(self): + """Test that default sizeMB is 300""" + self.mock_client.execute_query.return_value = {'rows': []} + self.mock_client.test_connection.return_value = True + + with patch('xmover.cli.CrateDBClient', return_value=self.mock_client): + result = self.runner.invoke(main, ['problematic-translogs']) + + assert result.exit_code == 0 + assert '300MB' in result.output + + # Verify query was called with default value + call_args = self.mock_client.execute_query.call_args + parameters = call_args[0][1] + assert parameters == [300, 300, 300] + + def test_partitioned_and_non_partitioned_replica_queries(self): + """Test that correct replica queries are used for partitioned vs non-partitioned tables""" + # Individual shards data (6 columns) + individual_shards_data = [ + ['TURVO', 'partitioned_table', '("id"=123)', 14, 'data-hot-6', 500.0], + ['TURVO', 'regular_table', None, 5, 'data-hot-1', 400.0] + ] + # Summary data (10 columns from query, displayed as 8 by combining P/R columns) + summary_data = [ + ['TURVO', 'partitioned_table', '("id"=123)', 'part123', 1, 500.0, 3, 3, 5.5, 5.2], + ['TURVO', 'regular_table', None, None, 1, 400.0, 2, 4, 3.1, 6.2] + ] + self.mock_client.execute_query.side_effect = [ + {'rows': individual_shards_data}, # Individual shards query + {'rows': summary_data}, # Summary query + {'rows': [[1]]}, # Partitioned table replica count + {'rows': [[2]]}, # Regular table replica count + ] + self.mock_client.test_connection.return_value = True + + with patch('xmover.cli.CrateDBClient', return_value=self.mock_client): + result = self.runner.invoke(main, ['problematic-translogs']) + + assert result.exit_code == 0 + + # Verify the replica queries were called correctly + calls = self.mock_client.execute_query.call_args_list + + # First two calls are the 
individual shards and summary queries + assert len(calls) == 4 + + # Third call should be partitioned table replica query + partitioned_query = calls[2][0][0] + assert 'information_schema.table_partitions' in partitioned_query + assert 'partition_ident' in partitioned_query + assert calls[2][0][1] == ['partitioned_table', 'TURVO', 'part123'] + + # Fourth call should be regular table replica query + regular_query = calls[3][0][0] + assert 'information_schema.tables' in regular_query + assert 'partition_ident' not in regular_query + assert calls[3][0][1] == ['regular_table', 'TURVO'] diff --git a/tests/test_recovery_monitor.py b/tests/test_recovery_monitor.py new file mode 100644 index 0000000..1821e5f --- /dev/null +++ b/tests/test_recovery_monitor.py @@ -0,0 +1,302 @@ +#!/usr/bin/env python3 +""" +Test script for XMover recovery monitoring functionality + +This script tests the recovery monitoring features by creating mock recovery scenarios +and verifying the output formatting and data parsing. 
+""" + +import sys +import os +from unittest.mock import Mock, patch +from typing import Dict, List, Any + +# Add the src directory to the path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) + +from xmover.database import CrateDBClient, RecoveryInfo +from xmover.analyzer import RecoveryMonitor + + +def create_mock_allocation(schema_name: str, table_name: str, shard_id: int, + current_state: str, node_id: str) -> Dict[str, Any]: + """Create a mock allocation response""" + return { + 'schema_name': schema_name, + 'table_name': table_name, + 'shard_id': shard_id, + 'current_state': current_state, + 'node_id': node_id, + 'explanation': None + } + + +def create_mock_shard_detail(schema_name: str, table_name: str, shard_id: int, + node_name: str, node_id: str, recovery_type: str, + stage: str, files_percent: float, bytes_percent: float, + total_time: int, size: int, is_primary: bool, + translog_size: int = 0, translog_uncommitted_size: int = 0) -> Dict[str, Any]: + """Create a mock shard detail response""" + return { + 'schema_name': schema_name, + 'table_name': table_name, + 'shard_id': shard_id, + 'node_name': node_name, + 'node_id': node_id, + 'routing_state': 'RELOCATING', + 'state': 'RECOVERING', + 'recovery': { + 'type': recovery_type, + 'stage': stage, + 'files': { + 'percent': files_percent, + 'recovered': int(files_percent * 100), + 'used': 100 + }, + 'size': { + 'percent': bytes_percent, + 'recovered': int(bytes_percent * size), + 'used': size + }, + 'total_time': total_time + }, + 'size': size, + 'primary': is_primary, + 'translog_size': translog_size, + 'translog_uncommitted_size': translog_uncommitted_size, + 'max_seq_no': None + } + + +def test_recovery_info_parsing(): + """Test RecoveryInfo dataclass and its properties""" + print("Testing RecoveryInfo parsing...") + + recovery = RecoveryInfo( + schema_name='CURVO', + table_name='PartioffD', + partition_values=None, + shard_id=19, + node_name='data-hot-1', + 
node_id='ZH6fBanGSjanGqeSh-sw0A', + recovery_type='PEER', + stage='DONE', + files_percent=100.0, + bytes_percent=100.0, + total_time_ms=1555907, + routing_state='RELOCATING', + current_state='RELOCATING', + is_primary=False, + size_bytes=56565284209, + translog_size_bytes=0, + translog_uncommitted_bytes=0, + max_seq_no=None, + primary_max_seq_no=None + ) + + # Test properties + assert recovery.overall_progress == 100.0, f"Expected 100.0, got {recovery.overall_progress}" + assert abs(recovery.size_gb - 52.681) < 0.01, f"Expected ~52.681, got {recovery.size_gb:.3f}" + assert recovery.shard_type == "REPLICA", f"Expected REPLICA, got {recovery.shard_type}" + assert recovery.total_time_seconds == 1555.907, f"Expected 1555.907, got {recovery.total_time_seconds}" + + print("✅ RecoveryInfo parsing tests passed") + + +def test_database_client_parsing(): + """Test database client recovery parsing logic""" + print("Testing database client recovery parsing...") + + # Create a real client instance to test the parsing method + client = CrateDBClient.__new__(CrateDBClient) # Create without calling __init__ + + # Create test data + allocation = create_mock_allocation('CURVO', 'PartioffD', 19, 'RELOCATING', 'node1') + shard_detail = create_mock_shard_detail( + 'CURVO', 'PartioffD', 19, 'data-hot-1', 'node1', + 'PEER', 'DONE', 100.0, 100.0, 1555907, 56565284209, False + ) + + # Test the parsing method directly + recovery_info = client._parse_recovery_info(allocation, shard_detail) + + assert recovery_info.recovery_type == 'PEER' + assert recovery_info.stage == 'DONE' + assert recovery_info.overall_progress == 100.0 + + print("✅ Database client parsing tests passed") + + +def test_recovery_monitor_formatting(): + """Test recovery monitor display formatting""" + print("Testing recovery monitor formatting...") + + # Create mock client + mock_client = Mock(spec=CrateDBClient) + monitor = RecoveryMonitor(mock_client) + + # Create test recovery data + recoveries = [ + RecoveryInfo( + 
schema_name='CURVO', + table_name='PartioffD', + partition_values=None, + shard_id=19, + node_name='data-hot-1', + node_id='node1', + recovery_type='PEER', + stage='DONE', + files_percent=100.0, + bytes_percent=100.0, + total_time_ms=1555907, + routing_state='RELOCATING', + current_state='RELOCATING', + is_primary=False, + size_bytes=56565284209, + translog_size_bytes=0, + translog_uncommitted_bytes=0, + max_seq_no=None, + primary_max_seq_no=None + ), + RecoveryInfo( + schema_name='CURVO', + table_name='orderTracking', + partition_values=None, + shard_id=7, + node_name='data-hot-2', + node_id='node2', + recovery_type='DISK', + stage='INDEX', + files_percent=75.5, + bytes_percent=67.8, + total_time_ms=890234, + routing_state='INITIALIZING', + current_state='INITIALIZING', + is_primary=True, + size_bytes=25120456789, + translog_size_bytes=0, + translog_uncommitted_bytes=0, + max_seq_no=None, + primary_max_seq_no=None + ) + ] + + # Test summary generation + summary = monitor.get_recovery_summary(recoveries) + + assert summary['total_recoveries'] == 2 + assert 'PEER' in summary['by_type'] + assert 'DISK' in summary['by_type'] + assert summary['by_type']['PEER']['count'] == 1 + assert summary['by_type']['DISK']['count'] == 1 + + # Test display formatting + display_output = monitor.format_recovery_display(recoveries) + + assert "Active Shard Recoveries (2 total)" in display_output + assert "PEER Recoveries (1)" in display_output + assert "DISK Recoveries (1)" in display_output + assert "PartioffD" in display_output + assert "orderTracking" in display_output + + print("✅ Recovery monitor formatting tests passed") + + +def test_empty_recovery_handling(): + """Test handling of no active recoveries""" + print("Testing empty recovery handling...") + + mock_client = Mock(spec=CrateDBClient) + monitor = RecoveryMonitor(mock_client) + + # Test empty list + empty_recoveries = [] + + summary = monitor.get_recovery_summary(empty_recoveries) + assert summary['total_recoveries'] == 0 
+ assert summary['by_type'] == {} + + display_output = monitor.format_recovery_display(empty_recoveries) + assert "No active shard recoveries found" in display_output + + print("✅ Empty recovery handling tests passed") + + +def test_recovery_type_filtering(): + """Test filtering by recovery type""" + print("Testing recovery type filtering...") + + mock_client = Mock(spec=CrateDBClient) + + # Mock the get_all_recovering_shards method + mock_recoveries = [ + RecoveryInfo( + schema_name='test', table_name='table1', partition_values=None, shard_id=1, + node_name='node1', node_id='n1', recovery_type='PEER', + stage='DONE', files_percent=100.0, bytes_percent=100.0, + total_time_ms=1000, routing_state='RELOCATING', + current_state='RELOCATING', is_primary=True, size_bytes=1000000, + translog_size_bytes=0, translog_uncommitted_bytes=0, + max_seq_no=None, primary_max_seq_no=None + ), + RecoveryInfo( + schema_name='test', table_name='table2', partition_values=None, shard_id=2, + node_name='node2', node_id='n2', recovery_type='DISK', + stage='INDEX', files_percent=50.0, bytes_percent=75.0, + total_time_ms=2000, routing_state='INITIALIZING', + current_state='INITIALIZING', is_primary=False, size_bytes=2000000, + translog_size_bytes=0, translog_uncommitted_bytes=0, + max_seq_no=None, primary_max_seq_no=None + ) + ] + + mock_client.get_all_recovering_shards.return_value = mock_recoveries + + monitor = RecoveryMonitor(mock_client) + + # Test filtering + peer_only = monitor.get_cluster_recovery_status(recovery_type_filter='PEER') + assert len(peer_only) == 1 + assert peer_only[0].recovery_type == 'PEER' + + disk_only = monitor.get_cluster_recovery_status(recovery_type_filter='DISK') + assert len(disk_only) == 1 + assert disk_only[0].recovery_type == 'DISK' + + all_recoveries = monitor.get_cluster_recovery_status(recovery_type_filter='all') + assert len(all_recoveries) == 2 + + print("✅ Recovery type filtering tests passed") + + +def main(): + """Run all tests""" + print("🧪 Running 
XMover Recovery Monitor Tests") + print("=" * 50) + + try: + test_recovery_info_parsing() + test_database_client_parsing() + test_recovery_monitor_formatting() + test_empty_recovery_handling() + test_recovery_type_filtering() + + print("\n🎉 All tests passed successfully!") + print("\n📋 Test Summary:") + print(" ✅ RecoveryInfo data class and properties") + print(" ✅ Database client parsing logic") + print(" ✅ Recovery monitor display formatting") + print(" ✅ Empty recovery state handling") + print(" ✅ Recovery type filtering") + + print("\n🚀 Recovery monitoring feature is ready for use!") + + except Exception as e: + print(f"\n❌ Test failed: {e}") + import traceback + traceback.print_exc() + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/validate_rules.py b/validate_rules.py new file mode 100644 index 0000000..f1cedfb --- /dev/null +++ b/validate_rules.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +""" +Standalone rules validation script for XMover shard size monitoring rules. + +This script validates the YAML configuration file used by the shard size monitor +to ensure proper syntax, required fields, and rule structure. 
+ +Usage: + python validate_rules.py [config_file] + python validate_rules.py config/shard_size_rules.yaml +""" + +import sys +import argparse +from pathlib import Path + +# Add src to path for imports +sys.path.insert(0, str(Path(__file__).parent / "src")) + +try: + from xmover.shard_size_monitor import validate_rules_file +except ImportError as e: + print(f"Error importing validation module: {e}") + print("Make sure you're running from the xmover project root directory") + sys.exit(1) + + +def main(): + parser = argparse.ArgumentParser( + description="Validate XMover shard size monitoring rules configuration", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python validate_rules.py # Validate default rules + python validate_rules.py config/shard_size_rules.yaml # Validate specific file + python validate_rules.py my_custom_rules.yaml # Validate custom rules + """ + ) + + parser.add_argument( + 'config_file', + nargs='?', + default='config/shard_size_rules.yaml', + help='Path to rules configuration file (default: config/shard_size_rules.yaml)' + ) + + args = parser.parse_args() + + # Resolve path relative to script location + config_path = Path(args.config_file) + if not config_path.is_absolute(): + config_path = Path(__file__).parent / config_path + + print(f"Validating rules configuration: {config_path}") + print("-" * 60) + + if validate_rules_file(str(config_path)): + print("\n✅ Validation completed successfully!") + sys.exit(0) + else: + print("\n❌ Validation failed!") + sys.exit(1) + + +if __name__ == '__main__': + main() \ No newline at end of file