Skip to content

Commit 35ea489

Browse files
committed
redirector: more fixes for publish errors
- close connection on error (still throw to caller) - rpc ping pong keep track of sent pings - reduce scrubber/nodes_monitor intensity
1 parent 8ad250a commit 35ea489

File tree

7 files changed

+70
-52
lines changed

7 files changed

+70
-52
lines changed

config.js

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,14 +75,16 @@ config.IO_HTTP_TRUNCATE_PART_SIZE = false;
7575
// REBUILD CONFIG //
7676
////////////////////
7777

78-
config.REBUILD_BATCH_SIZE = 20;
79-
config.REBUILD_BATCH_DELAY = 75;
80-
config.REBUILD_BATCH_ERROR_DELAY = 3000;
81-
78+
config.REBUILD_NODE_ENABLED = true;
8279
config.REBUILD_NODE_CONCURRENCY = 3;
8380
config.REBUILD_NODE_OFFLINE_GRACE = 5 * 60000;
81+
config.REBUILD_NODE_BATCH_SIZE = 10;
82+
config.REBUILD_NODE_BATCH_DELAY = 50;
8483

8584
config.SCRUBBER_ENABLED = true;
85+
config.SCRUBBER_BATCH_SIZE = 10;
86+
config.SCRUBBER_BATCH_DELAY = 50;
87+
config.SCRUBBER_ERROR_DELAY = 3000;
8688
config.SCRUBBER_RESTART_DELAY = 30000;
8789

8890
//////////////////////

src/rpc/rpc.js

Lines changed: 22 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -558,13 +558,13 @@ RPC.prototype._accept_new_connection = function(conn) {
558558
conn._sent_requests = new Map();
559559
conn._received_requests = new Map();
560560
if (self._disconnected_state) {
561-
conn.close();
562-
throw new Error('RPC IN DISCONNECTED STATE - rejecting connection ' + conn.connid);
561+
const err = new Error('RPC IN DISCONNECTED STATE - rejecting connection ' + conn.connid);
562+
conn.emit('error', err);
563+
throw err;
563564
}
564565
conn.on('message', msg => self._on_message(conn, msg));
565566
conn.on('close', err => self._connection_closed(conn, err));
566-
// we prefer to let the connection handle it's own errors and decide if to close or not
567-
// conn.on('error', self._connection_error.bind(self, conn));
567+
// we let the connection handle it's own errors and decide if to close or not
568568

569569
if (!conn.transient) {
570570
// always replace previous connection in the address map,
@@ -575,11 +575,21 @@ RPC.prototype._accept_new_connection = function(conn) {
575575
// send pings to keepalive
576576
conn._ping_interval = setInterval(function() {
577577
dbg.log4('RPC PING', conn.connid);
578-
conn._ping_last_reqid = conn._alloc_reqid();
578+
const reqid = conn._alloc_reqid();
579+
conn._ping_reqid_set = conn._ping_reqid_set || new Set();
580+
conn._ping_reqid_set.add(reqid);
581+
if (conn._ping_reqid_set.size > 3) {
582+
const err = new Error(`RPC PINGPONG EXHAUSTED pings ${
583+
Array.from(conn._ping_reqid_set).join(',')
584+
} connid ${conn.connid}`);
585+
dbg.warn(err);
586+
conn.emit('error', err);
587+
return null;
588+
}
579589
P.resolve()
580590
.then(() => conn.send(RpcRequest.encode_message({
581591
op: 'ping',
582-
reqid: conn._ping_last_reqid
592+
reqid: reqid
583593
})))
584594
.catch(_.noop); // already means the conn is closed
585595
return null;
@@ -652,15 +662,6 @@ RPC.prototype.disconnect_all = function() {
652662
};
653663

654664

655-
/**
656-
*
657-
*/
658-
RPC.prototype._connection_error = function(conn, err) {
659-
dbg.error('RPC CONNECTION ERROR:', conn.connid, conn.url.href, err.stack || err);
660-
conn.close();
661-
};
662-
663-
664665
/**
665666
*
666667
*/
@@ -746,16 +747,14 @@ RPC.prototype._on_message = function(conn, msg_buffer) {
746747
.catch(_.noop); // already means the conn is closed
747748
break;
748749
case 'pong':
749-
if (conn._ping_last_reqid === msg.header.reqid) {
750+
if (conn._ping_reqid_set && conn._ping_reqid_set.has(msg.header.reqid)) {
750751
dbg.log4('RPC PINGPONG', conn.connid);
751-
conn._ping_mismatch_count = 0;
752+
conn._ping_reqid_set.delete(msg.header.reqid);
752753
} else {
753-
conn._ping_mismatch_count = (conn._ping_mismatch_count || 0) + 1;
754-
if (conn._ping_mismatch_count < 3) {
755-
dbg.warn('RPC PINGPONG MISMATCH #' + conn._ping_mismatch_count, conn.connid);
756-
} else {
757-
conn.close(new Error('RPC PINGPONG MISMATCH #' + conn._ping_mismatch_count + ' ' + conn.connid));
758-
}
754+
dbg.warn(`RPC PINGPONG MISMATCH pings ${
755+
Array.from(conn._ping_reqid_set).join(',')
756+
} pong ${msg.header.reqid
757+
} connid ${conn.connid}`);
759758
}
760759
break;
761760
default:

src/rpc/rpc_conn_set.js

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,22 +14,22 @@ class RpcConnSet {
1414

1515
add(conn) {
1616
if (conn.is_closed()) {
17-
dbg.warn(this.name, 'ignore closed connection', conn.url.href);
17+
dbg.warn(this.name, 'ignore closed connection', conn.connid);
1818
return;
1919
}
2020
if (this.set.has(conn)) {
21-
dbg.log0(this.name, 'already registered', conn.url.href);
21+
dbg.log0(this.name, 'already registered', conn.connid);
2222
return;
2323
}
24-
dbg.log0(this.name, 'adding connection', conn.url.href);
24+
dbg.log0(this.name, 'adding connection', conn.connid);
2525
this.set.add(conn);
2626
const close_listener = () => this.remove(conn);
2727
conn[CLOSE_LISTENER_SYMBOL] = close_listener;
2828
conn.once('close', close_listener);
2929
}
3030

3131
remove(conn) {
32-
dbg.warn(this.name, 'removing connection', conn.url.href);
32+
dbg.warn(this.name, 'removing connection', conn.connid);
3333
const close_listener = conn[CLOSE_LISTENER_SYMBOL];
3434
delete conn[CLOSE_LISTENER_SYMBOL];
3535
conn.removeListener('close', close_listener);

src/server/bg_services/scrubber.js

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,28 +31,28 @@ function background_worker() {
3131
dbg.log0('SCRUBBER:', 'BEGIN');
3232
}
3333

34-
return MDStore.instance().iterate_all_chunks(this.marker, config.REBUILD_BATCH_SIZE)
34+
return MDStore.instance().iterate_all_chunks(this.marker, config.SCRUBBER_BATCH_SIZE)
3535
.then(res => {
3636
// update the marker for next time
3737
this.marker = res.marker;
3838
if (!res.chunk_ids.length) return;
3939
dbg.log0('SCRUBBER:', 'WORKING ON', res.chunk_ids.length, 'CHUNKS');
4040
const builder = new map_builder.MapBuilder(res.chunk_ids);
41-
return builder.run()
42-
.catch(err => dbg.error('SCRUBBER:', 'BUILD ERROR', err, err.stack));
41+
return builder.run();
4342
})
4443
.then(() => {
4544
// return the delay before next batch
4645
if (this.marker) {
4746
dbg.log0('SCRUBBER:', 'CONTINUE', this.marker);
48-
return config.REBUILD_BATCH_DELAY;
47+
return config.SCRUBBER_BATCH_DELAY;
4948
}
5049
dbg.log0('SCRUBBER:', 'END');
5150
return config.SCRUBBER_RESTART_DELAY;
52-
}, err => {
51+
})
52+
.catch(err => {
5353
// return the delay before next batch
5454
dbg.error('SCRUBBER:', 'ERROR', err, err.stack);
55-
return config.REBUILD_BATCH_ERROR_DELAY;
55+
return config.SCRUBBER_ERROR_DELAY;
5656
});
5757
}
5858

src/server/node_services/nodes_monitor.js

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1496,7 +1496,7 @@ class NodesMonitor extends EventEmitter {
14961496
setTimeout(() => {
14971497
this._set_need_rebuild.add(item);
14981498
this._wakeup_rebuild();
1499-
}, config.REBUILD_BATCH_DELAY).unref();
1499+
}, config.REBUILD_NODE_BATCH_DELAY).unref();
15001500
}
15011501
}
15021502

@@ -1508,12 +1508,14 @@ class NodesMonitor extends EventEmitter {
15081508
setTimeout(() => {
15091509
this._set_need_rebuild.add(item);
15101510
this._wakeup_rebuild();
1511-
}, config.REBUILD_BATCH_DELAY).unref();
1511+
}, config.REBUILD_NODE_BATCH_DELAY).unref();
15121512
}
15131513
}
15141514
}
15151515

15161516
_wakeup_rebuild() {
1517+
if (!this._started) return;
1518+
if (!config.REBUILD_NODE_ENABLED) return;
15171519
const count = Math.min(
15181520
config.REBUILD_NODE_CONCURRENCY,
15191521
this._set_need_rebuild.size - this._num_running_rebuilds);
@@ -1556,7 +1558,7 @@ class NodesMonitor extends EventEmitter {
15561558
.then(() => MDStore.instance().iterate_node_chunks({
15571559
node_id: item.node._id,
15581560
marker: start_marker,
1559-
limit: config.REBUILD_BATCH_SIZE,
1561+
limit: config.REBUILD_NODE_BATCH_SIZE,
15601562
}))
15611563
.then(res => {
15621564
// we update the stage marker even if failed to advance the scan

src/server/system_services/redirector.js

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/* Copyright (C) 2016 NooBaa */
22
'use strict';
33

4+
const _ = require('lodash');
45
const P = require('../../util/promise');
56
const dbg = require('../../util/debug_module')(__filename);
67
const server_rpc = require('../server_rpc');
@@ -17,11 +18,19 @@ function publish_to_cluster(req) {
1718
const api_name = req.rpc_params.method_api.slice(0, -4); // remove _api suffix
1819
const method = req.rpc_params.method_name;
1920
const connections = cluster_conn_set.list();
20-
dbg.log0('publish_to_cluster:', connections.length);
21+
dbg.log0('publish_to_cluster:',
22+
api_name, method, req.rpc_params.request_params,
23+
_.map(connections, 'connid'));
2124
return P.map(connections,
22-
conn => server_rpc.client[api_name][method](req.rpc_params.request_params, {
25+
conn => P.resolve(server_rpc.client[api_name][method](req.rpc_params.request_params, {
2326
connection: conn,
2427
auth_token: req.auth_token,
28+
}))
29+
.catch(err => {
30+
// close this connection, assuming this can help to recover
31+
conn.emit('error', new Error(`publish_to_cluster: disconnect on error ${err.message} ${conn.connid}`));
32+
// throw the original error so that callers will receive the root cause reason
33+
throw err;
2534
})
2635
)
2736
.then(res => ({
@@ -41,7 +50,9 @@ function unregister_from_alerts(req) {
4150

4251
function publish_alerts(req) {
4352
const connections = alerts_conn_set.list();
44-
dbg.log3('publish_alerts:', req.rpc_params.request_params, connections.length);
53+
dbg.log3('publish_alerts:',
54+
req.rpc_params.request_params,
55+
_.map(connections, 'connid'));
4556
return P.map(connections, conn =>
4657
server_rpc.client.frontend_notifications.alert(req.rpc_params.request_params, {
4758
connection: conn,

src/server/utils/clustering_utils.js

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -255,15 +255,19 @@ function get_member_upgrade_status(ip) {
255255
function send_master_update(is_master, master_address) {
256256
let system = system_store.data.systems[0];
257257
if (!system) return P.resolve();
258-
let hosted_agents_promise = is_master ? server_rpc.client.hosted_agents.start() : server_rpc.client.hosted_agents.stop();
259-
let update_master_promise = _.isUndefined(master_address) ? P.resolve() : server_rpc.client.redirector.publish_to_cluster({
260-
method_api: 'server_inter_process_api',
261-
method_name: 'update_master_change',
262-
target: '', // required but irrelevant
263-
request_params: {
264-
master_address: master_address
265-
}
266-
});
258+
let hosted_agents_promise = is_master ?
259+
server_rpc.client.hosted_agents.start() :
260+
server_rpc.client.hosted_agents.stop();
261+
let update_master_promise = _.isUndefined(master_address) ?
262+
P.resolve() :
263+
server_rpc.client.redirector.publish_to_cluster({
264+
method_api: 'server_inter_process_api',
265+
method_name: 'update_master_change',
266+
target: '', // required but irrelevant
267+
request_params: {
268+
master_address: master_address
269+
}
270+
});
267271
return P.join(
268272
server_rpc.client.system.set_webserver_master_state({
269273
is_master: is_master

0 commit comments

Comments
 (0)