Add ta tests (#43)

* Add TA tests * Fix client behaviour * Add grafana dashboard & alerts * Get datanode thread pool worker count from env * Increase datanode sync timeout * Increase resources * Add thread pool size test env
kysre · Feb 13, 2024 · 17559ab · 17559ab
1 parent e6aff50
commit 17559ab
Show file tree

Hide file tree

Showing 15 changed files with 556 additions and 144 deletions.
diff --git a/client_go/client.go b/client_go/client.go
@@ -84,16 +84,12 @@ func (c *queueClient) acknowledgePull(ctx context.Context, key string) {
 }
 
 func (c *queueClient) runSubscribe(function SubscribeFunction) {
-	tickerPeriod := time.Duration(1) * time.Second
-	ticker := time.NewTicker(tickerPeriod)
 	for {
-		select {
-		case <-ticker.C:
-			key, value := c.Pull()
-			if key == "" {
-				continue
-			}
-			function(key, value)
+		key, value := c.Pull()
+		if key == "" {
+			time.Sleep(time.Duration(10) * time.Millisecond)
+			continue
 		}
+		function(key, value)
 	}
 }
diff --git a/client_py/client.py b/client_py/client.py
@@ -15,8 +15,8 @@ class QueueClient:
     replica_stub = None
     HOST = "64.226.122.208"
     PORT, REPLICA_PORT = "8000", "8001"
-    SUBSCRIBE_WORKERS = 3
-    SUBSCRIBE_SLEEP_TIMEOUT = 2
+    SUBSCRIBE_WORKERS = 1
+    SUBSCRIBE_SLEEP_TIMEOUT = 0.01
 
     @classmethod
     def get_stub(cls, host: str, port: str):
@@ -85,7 +85,8 @@ def run_subscribe(self, f):
                         pull_response = self.pull()
                         if pull_response is not None and pull_response is not False and pull_response[0] != '':
                             futures.append(executer.submit(f, pull_response[0], pull_response[1]))
-                        time.sleep(QueueClient.SUBSCRIBE_SLEEP_TIMEOUT)
+                        else:
+                            time.sleep(QueueClient.SUBSCRIBE_SLEEP_TIMEOUT)
 
                     _ = [future.result() for future in futures]
 

diff --git a/datanode/src/configs/configs.py b/datanode/src/configs/configs.py
@@ -20,8 +20,9 @@ class ConfigManager:
         leader_host=env_config('LEADER_HOST'),
         leader_port=env_config('LEADER_PORT'),
         datanode_name=env_config('DATANODE_NAME'),
+        server_thread_pool_size=env_config('SERVER_THREAD_POOL_SIZE'),
     )
 
     @staticmethod
     def get_prop(key):
-        return ConfigManager.configs[key]
+        return ConfigManager.configs[key]
diff --git a/datanode/src/configs/test.env b/datanode/src/configs/test.env
@@ -7,4 +7,5 @@ PENDING_TIMEOUT=15
 CLEANER_PERIOD=5
 LEADER_HOST=leader_0
 LEADER_PORT=8888
-DATANODE_NAME=datanode_0
+DATANODE_NAME=datanode_0
+SERVER_THREAD_POOL_SIZE=100
diff --git a/datanode/src/datanode_server.py b/datanode/src/datanode_server.py
@@ -204,7 +204,11 @@ def serve():
 
     datanode_name = ConfigManager.get_prop('datanode_name')
 
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=50))
+    server = grpc.server(
+        futures.ThreadPoolExecutor(
+            max_workers=ConfigManager.get_prop('server_thread_pool_size')
+        )
+    )
     datanode = DataNode(partitions_count, home_path, datanode_name)
     datanode_pb2_grpc.add_DataNodeServicer_to_server(datanode, server)
 

diff --git a/docker-compose.yaml b/docker-compose.yaml
@@ -12,7 +12,7 @@ services:
     deploy:
       resources:
         limits:
-          cpus: '0.2'
+          cpus: '0.3'
           memory: 300M
 
   node_exporter:
@@ -44,8 +44,8 @@ services:
     deploy:
       resources:
         limits:
-          cpus: '0.4'
-          memory: 400M
+          cpus: '0.5'
+          memory: 600M
 
   leader_0:
     image: kysre/turtlemq:leader-${LEADER_IMAGE_TAG}
@@ -54,16 +54,17 @@ services:
       - prometheus
     environment:
       - LEADER_LOGGING_LEVEL=info
-      - LEADER_LEADER_DATANODEREMAININGCHECKPERIOD=2
+      - LEADER_LEADER_DATANODEREMAININGCHECKPERIOD=1
       - LEADER_LEADER_DATANODEPARTITIONCOUNT=100
+      - LEADER_LEADER_DATANODESYNCTIMEOUT=1000
       - LEADER_LEADER_REPLICAHOST=leader_1
     ports:
       - '8000:8888'
     deploy:
       resources:
         limits:
-          cpus: '0.2'
-          memory: 200M
+          cpus: '0.3'
+          memory: 300M
 
   leader_1:
     image: kysre/turtlemq:leader-${LEADER_IMAGE_TAG}
@@ -73,16 +74,17 @@ services:
       - leader_0
     environment:
       - LEADER_LOGGING_LEVEL=info
-      - LEADER_LEADER_DATANODEREMAININGCHECKPERIOD=2
+      - LEADER_LEADER_DATANODEREMAININGCHECKPERIOD=1
       - LEADER_LEADER_DATANODEPARTITIONCOUNT=100
+      - LEADER_LEADER_DATANODESYNCTIMEOUT=1000
       - LEADER_LEADER_REPLICAHOST=leader_0
     ports:
       - '8001:8888'
     deploy:
       resources:
         limits:
-          cpus: '0.2'
-          memory: 200M
+          cpus: '0.3'
+          memory: 300M
 
   datanode_0:
     image: kysre/turtlemq:datanode-${DATANODE_IMAGE_TAG}
@@ -100,13 +102,14 @@ services:
       - PENDING_TIMEOUT=15
       - CLEANER_PERIOD=3
       - PARTITIONS_COUNT=100
+      - SERVER_THREAD_POOL_SIZE=100
     volumes:
       - datanode_0_vol:/var/lib/turtlemq/data/
     deploy:
       resources:
         limits:
-          cpus: '0.15'
-          memory: 300M
+          cpus: '0.3'
+          memory: 400M
 
   datanode_1:
     image: kysre/turtlemq:datanode-${DATANODE_IMAGE_TAG}
@@ -124,13 +127,14 @@ services:
       - PENDING_TIMEOUT=15
       - CLEANER_PERIOD=3
       - PARTITIONS_COUNT=100
+      - SERVER_THREAD_POOL_SIZE=100
     volumes:
       - datanode_1_vol:/var/lib/turtlemq/data/
     deploy:
       resources:
         limits:
-          cpus: '0.15'
-          memory: 300M
+          cpus: '0.3'
+          memory: 400M
 
   datanode_2:
     image: kysre/turtlemq:datanode-${DATANODE_IMAGE_TAG}
@@ -148,13 +152,14 @@ services:
       - PENDING_TIMEOUT=15
       - CLEANER_PERIOD=3
       - PARTITIONS_COUNT=100
+      - SERVER_THREAD_POOL_SIZE=100
     volumes:
       - datanode_2_vol:/var/lib/turtlemq/data/
     deploy:
       resources:
         limits:
-          cpus: '0.15'
-          memory: 300M
+          cpus: '0.3'
+          memory: 400M
 
   datanode_3:
     image: kysre/turtlemq:datanode-${DATANODE_IMAGE_TAG}
@@ -172,13 +177,14 @@ services:
       - PENDING_TIMEOUT=15
       - CLEANER_PERIOD=3
       - PARTITIONS_COUNT=100
+      - SERVER_THREAD_POOL_SIZE=100
     volumes:
       - datanode_3_vol:/var/lib/turtlemq/data/
     deploy:
       resources:
         limits:
-          cpus: '0.15'
-          memory: 300M
+          cpus: '0.3'
+          memory: 400M
 
 volumes:
   prom_data:

diff --git a/grafana/alert_config.json b/grafana/alert_config.json
@@ -0,0 +1,48 @@
+{
+  "template_files": {},
+  "template_file_provenances": {},
+  "alertmanager_config": {
+    "route": {
+      "receiver": "turtlemq-contact",
+      "group_by": [
+        "grafana_folder",
+        "alertname"
+      ]
+    },
+    "templates": null,
+    "receivers": [
+      {
+        "name": "grafana-default-email",
+        "grafana_managed_receiver_configs": [
+          {
+            "uid": "e729eec1-03e8-4d08-8d55-633ed1b6cb04",
+            "name": "email receiver",
+            "type": "email",
+            "disableResolveMessage": false,
+            "settings": {
+              "addresses": "<[email protected]>"
+            },
+            "secureFields": {}
+          }
+        ]
+      },
+      {
+        "name": "turtlemq-contact",
+        "grafana_managed_receiver_configs": [
+          {
+            "uid": "e54db3b1-ec77-44a7-9caf-abef76cbdcb1",
+            "name": "turtlemq-contact",
+            "type": "email",
+            "disableResolveMessage": false,
+            "settings": {
+              "addresses": "[email protected]\n[email protected]\n",
+              "singleEmail": true,
+              "message": "TurtleMQ alerts are FIRING!"
+            },
+            "secureFields": {}
+          }
+        ]
+      }
+    ]
+  }
+}