From b7686ccdff5fd8b38def7ef5b0b386c1d16b20b5 Mon Sep 17 00:00:00 2001 From: Guy Margalit Date: Sun, 12 Jan 2025 11:30:50 +0200 Subject: [PATCH 1/4] GUY TEMP: remove node-rdkafka Signed-off-by: Guy Margalit --- package-lock.json | 15 --------------- package.json | 1 - src/util/notifications_util.js | 2 +- 3 files changed, 1 insertion(+), 17 deletions(-) diff --git a/package-lock.json b/package-lock.json index 40e5b241ce..aa23008a77 100644 --- a/package-lock.json +++ b/package-lock.json @@ -45,7 +45,6 @@ "morgan": "1.10.0", "nan": "2.22.2", "node-addon-api": "8.3.1", - "node-rdkafka": "3.3.1", "performance-now": "2.1.0", "pg": "8.14.1", "ping": "0.4.4", @@ -9818,20 +9817,6 @@ "dev": true, "license": "MIT" }, - "node_modules/node-rdkafka": { - "version": "3.3.1", - "resolved": "https://registry.npmjs.org/node-rdkafka/-/node-rdkafka-3.3.1.tgz", - "integrity": "sha512-dx4vHvt1RkoGsOVqGQBpu72WRx/7+1FyeZMB+f3frQODRlAFnUgFhK+h+dWuL9Majx0FFdgVx8VehOgJT6vGrw==", - "hasInstallScript": true, - "license": "MIT", - "dependencies": { - "bindings": "^1.3.1", - "nan": "^2.22.0" - }, - "engines": { - "node": ">=16" - } - }, "node_modules/node-releases": { "version": "2.0.19", "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.19.tgz", diff --git a/package.json b/package.json index c3674929e1..d722aa67e7 100644 --- a/package.json +++ b/package.json @@ -106,7 +106,6 @@ "morgan": "1.10.0", "nan": "2.22.2", "node-addon-api": "8.3.1", - "node-rdkafka": "3.3.1", "performance-now": "2.1.0", "pg": "8.14.1", "ping": "0.4.4", diff --git a/src/util/notifications_util.js b/src/util/notifications_util.js index fa7d63d360..eb4b527afd 100644 --- a/src/util/notifications_util.js +++ b/src/util/notifications_util.js @@ -4,7 +4,7 @@ const dbg = require('../util/debug_module')(__filename); const config = require('../../config'); const { PersistentLogger, LogFile } = require('../util/persistent_logger'); -const Kafka = require('node-rdkafka'); +const Kafka = null; //require('node-rdkafka'); const os = require('os'); const fs = require('fs'); const http = require('http'); From ac805a60f55e6129d81004af8bf9a30f94feb0b9 Mon Sep 17 00:00:00 2001 From: Guy Margalit Date: Tue, 14 Jan 2025 01:11:22 +0200 Subject: [PATCH 2/4] tools and improves Signed-off-by: Guy Margalit --- .eslintrc.js | 2 +- config.js | 6 +-- package.json | 2 +- src/agent/block_store_speed.js | 10 +--- src/sdk/object_sdk.js | 28 +++++----- src/tools/coding_speed.js | 20 ++------ src/tools/cpu_speed.js | 10 +--- src/tools/fs_speed.js | 53 +++++++++++-------- src/tools/rand_speed.js | 16 +++--- src/tools/tcp_speed.js | 37 +++++-------- src/util/http_recorder.js | 22 ++++---- src/util/http_utils.js | 94 ++++++++++++++++++++++++++-------- src/util/rand_stream.js | 12 ++--- src/util/speedometer.js | 83 +++++++++++++++++++++--------- 14 files changed, 229 insertions(+), 166 deletions(-) diff --git a/.eslintrc.js b/.eslintrc.js index 621df5916f..890f460ac1 100644 --- a/.eslintrc.js +++ b/.eslintrc.js @@ -138,7 +138,7 @@ module.exports = { 'one-var': ['error', 'never'], '@stylistic/js/space-before-function-paren': ['error', { - 'anonymous': 'never', + 'anonymous': 'ignore', 'named': 'never', 'asyncArrow': 'always' }], diff --git a/config.js b/config.js index d841415afe..753e3a468a 100644 --- a/config.js +++ b/config.js @@ -1173,7 +1173,7 @@ function _get_data_from_file(file_name) { try { data = fs.readFileSync(file_name).toString(); } catch (e) { - console.warn(`Error accrued while getting the data from ${file_name}: ${e}`); + // console.log(`Error 
accrued while getting the data from ${file_name}: ${e}`); return; } return data; @@ -1189,7 +1189,7 @@ function _get_config_root() { const data = _get_data_from_file(redirect_path); config_root = data.toString().trim(); } catch (err) { - console.warn('config.get_config_root - could not find custom config_root, will use the default config_root ', config_root); + // console.log('config.get_config_root - could not find custom config_root, will use the default config_root ', config_root); } return config_root; } @@ -1244,7 +1244,7 @@ function load_nsfs_nc_config() { try { if (!config.NSFS_NC_CONF_DIR) { config.NSFS_NC_CONF_DIR = _get_config_root(); - console.warn('load_nsfs_nc_config.setting config.NSFS_NC_CONF_DIR', config.NSFS_NC_CONF_DIR); + // console.warn('load_nsfs_nc_config.setting config.NSFS_NC_CONF_DIR', config.NSFS_NC_CONF_DIR); } const config_path = path.join(config.NSFS_NC_CONF_DIR, 'config.json'); const config_data = require(config_path); diff --git a/package.json b/package.json index d722aa67e7..76c3ecc810 100644 --- a/package.json +++ b/package.json @@ -16,7 +16,7 @@ "scripts": { "install": "echo install hook override to avoid npm default hook calling node-gyp", "build": "npm run build:native", - "build:native": "node-gyp configure && node-gyp build", + "build:native": "node-gyp configure build", "rebuild": "npm run clean:install && npm run clean:build && npm install && npm run build", "pkg": "pkg . --public --options unhandled-rejections=warn --compress Brotli", "---": "------------------------------------------------------------------", diff --git a/src/agent/block_store_speed.js b/src/agent/block_store_speed.js index b3580732ee..d85b34b858 100644 --- a/src/agent/block_store_speed.js +++ b/src/agent/block_store_speed.js @@ -3,7 +3,6 @@ // const _ = require('lodash'); const argv = require('minimist')(process.argv); -const cluster = require('cluster'); const mongodb = require('mongodb'); const api = require('../api'); @@ -26,17 +25,10 @@ argv.timeout = argv.timeout || 60000; let block_index = 0; -const master_speedometer = new Speedometer('Total Speed'); const speedometer = new Speedometer('Block Store Speed'); - -if (argv.forks > 1 && cluster.isMaster) { - master_speedometer.fork(argv.forks); -} else { - main(); -} +speedometer.run_workers(argv.forks, main, argv); async function main() { - console.log('ARGS', argv); const rpc = api.new_rpc(); const client = rpc.new_client(); const signal_client = rpc.new_client(); diff --git a/src/sdk/object_sdk.js b/src/sdk/object_sdk.js index ae2bd7430b..1329199dd8 100644 --- a/src/sdk/object_sdk.js +++ b/src/sdk/object_sdk.js @@ -142,25 +142,25 @@ class ObjectSDK { this.abort_controller.abort(err); }); - // TODO: aborted event is being deprecated since nodejs 16 - // https://nodejs.org/dist/latest-v16.x/docs/api/http.html#event-aborted recommends on listening to close event - // req.once('close', () => { - // dbg.log0('request aborted1', req.url); - - // if (req.destroyed) { - // dbg.log0('request aborted', req.url); - // this.abort_controller.abort(new Error('request aborted ' + req.url)); - // } + // Note: aborted event is deprecated in favor of the close event + // https://nodejs.org/dist/latest-v16.x/docs/api/http.html#event-aborted + // req.once('aborted', () => { + // dbg.log0('request aborted', req.url); + // this.abort_controller.abort(new Error('request aborted ' + req.url)); // }); - - req.once('aborted', () => { - dbg.log0('request aborted', req.url); - this.abort_controller.abort(new Error('request aborted ' + req.url)); + 
req.once('close', () => { + // dbg.log1('request closed', req.url); + if (req.errored) { + dbg.log0('request aborted', req.url); + this.abort_controller.abort(new Error('request aborted ' + req.url)); + } }); + } throw_if_aborted() { - if (this.abort_controller.signal.aborted) throw new Error('request aborted signal'); + this.abort_controller.signal.throwIfAborted(); + // if (this.abort_controller.signal.aborted) throw new Error('request aborted signal'); } add_abort_handler(handler) { diff --git a/src/tools/coding_speed.js b/src/tools/coding_speed.js index 42ae6b2693..b8803f452c 100644 --- a/src/tools/coding_speed.js +++ b/src/tools/coding_speed.js @@ -7,7 +7,6 @@ const _ = require('lodash'); const argv = require('minimist')(process.argv); const stream = require('stream'); const assert = require('assert'); -const cluster = require('cluster'); const crypto = require('crypto'); const config = require('../../config'); @@ -15,7 +14,6 @@ const ChunkCoder = require('../util/chunk_coder'); const RandStream = require('../util/rand_stream'); const Speedometer = require('../util/speedometer'); const ChunkEraser = require('../util/chunk_eraser'); -const stream_utils = require('../util/stream_utils'); const ChunkSplitter = require('../util/chunk_splitter'); const FlattenStream = require('../util/flatten_stream'); // const CoalesceStream = require('../util/coalesce_stream'); @@ -35,17 +33,10 @@ argv.verbose = Boolean(argv.verbose); // default is false argv.sse_c = Boolean(argv.sse_c); // default is false delete argv._; -const master_speedometer = new Speedometer('Total Speed'); const speedometer = new Speedometer('Chunk Coder Speed'); - -if (argv.forks > 1 && cluster.isMaster) { - master_speedometer.fork(argv.forks); -} else { - main(); -} +speedometer.run_workers(argv.forks, main, argv); function main() { - console.log('Arguments:', JSON.stringify(argv, null, 2)); const chunk_split_config = { avg_chunk: config.CHUNK_SPLIT_AVG_CHUNK, @@ -106,11 +97,10 @@ function main() { let total_size = 0; let num_parts = 0; - const reporter = new stream.Transform({ + const reporter = new stream.Writable({ objectMode: true, - allowHalfOpen: false, highWaterMark: 50, - transform(chunk, encoding, callback) { + write(chunk, encoding, callback) { if (argv.verbose) console.log({ ...chunk, data: 'ommitted' }); if (argv.compare && chunk.original_data) { assert(Buffer.concat(chunk.original_data).equals(chunk.data)); @@ -122,7 +112,7 @@ function main() { } }); - /** @type {stream.Stream[]} */ + /** @type {(stream.Readable | stream.Transform | stream.Writable)[]} */ const transforms = [ input, splitter, @@ -137,7 +127,7 @@ function main() { transforms.push(new FlattenStream()); } transforms.push(reporter); - return stream_utils.pipeline(transforms) + return stream.promises.pipeline(transforms) .then(() => { console.log('AVERAGE CHUNK SIZE', (total_size / num_parts).toFixed(0)); if (splitter.md5) { diff --git a/src/tools/cpu_speed.js b/src/tools/cpu_speed.js index 7d322bf6de..527e56bac3 100644 --- a/src/tools/cpu_speed.js +++ b/src/tools/cpu_speed.js @@ -3,7 +3,6 @@ require('../util/fips'); const crypto = require('crypto'); -const cluster = require('cluster'); const argv = require('minimist')(process.argv); const Speedometer = require('../util/speedometer'); @@ -13,17 +12,12 @@ argv.forks = argv.forks || 1; argv.size = argv.size || (10 * 1024); argv.hash = argv.hash || 'sha256'; -if (argv.forks > 1 && cluster.isMaster) { - const master_speedometer = new Speedometer('Total Speed'); - master_speedometer.fork(argv.forks); -} else 
{ - main(); -} +const speedometer = new Speedometer(`CPU(${argv.hash})`); +speedometer.run_workers(argv.forks, main, argv); function main() { const hasher = crypto.createHash(argv.hash); const buf = crypto.randomBytes(1024 * 1024); - const speedometer = new Speedometer('CPU Speed'); let size = argv.size * 1024 * 1024; console.log(`Crunching ${argv.size} MB with ${argv.hash}...`); run(); diff --git a/src/tools/fs_speed.js b/src/tools/fs_speed.js index 1a67c42976..52d2246841 100644 --- a/src/tools/fs_speed.js +++ b/src/tools/fs_speed.js @@ -1,11 +1,13 @@ /* Copyright (C) 2016 NooBaa */ 'use strict'; +require('../util/panic'); + const fs = require('fs'); const util = require('util'); const path = require('path'); const argv = require('minimist')(process.argv); -const cluster = require('cluster'); +const { cluster } = require('../util/fork_utils'); const execAsync = util.promisify(require('child_process').exec); const Speedometer = require('../util/speedometer'); const RandStream = require('../util/rand_stream'); @@ -55,6 +57,7 @@ argv.file_size_units = argv.file_size_units || 'MB'; argv.block_size_units = argv.block_size_units || 'MB'; argv.fsync = Boolean(argv.fsync); argv.mode = argv.mode || 'nsfs'; +argv.backend = argv.backend || 'GPFS'; if (argv.mode === 'dd') { argv.device = argv.device || '/dev/zero'; } else { @@ -89,18 +92,15 @@ const file_size = argv.file_size * size_units_table[argv.file_size_units]; const block_count = Math.ceil(file_size / block_size); const file_size_aligned = block_count * block_size; const nb_native = argv.mode === 'nsfs' && require('../util/nb_native'); -const is_master = cluster.isMaster; -const speedometer = new Speedometer(is_master ? 'Total Speed' : 'FS Speed'); +const is_master = cluster.isPrimary; const start_time = Date.now(); const end_time = start_time + (argv.time * 1000); -if (argv.forks > 1 && is_master) { - speedometer.fork(argv.forks); -} else { - main(); -} +const speedometer = new Speedometer('FS Speed'); +speedometer.run_workers(argv.forks, main, argv); async function main() { + // nb_native().fs.set_debug_level(5); const promises = []; fs.mkdirSync(argv.dir, { recursive: true }); for (let i = 0; i < argv.concur; ++i) promises.push(worker(i)); @@ -127,15 +127,23 @@ async function worker(id) { if (file_start_time >= end_time) break; const file_path = path.join(dir, `file-${file_id}`); file_id += 1; - if (argv.mode === 'nsfs') { - await work_with_nsfs(file_path); - } else if (argv.mode === 'nodejs') { - await work_with_nodejs(file_path); - } else if (argv.mode === 'dd') { - await work_with_dd(file_path); + try { + if (argv.mode === 'nsfs') { + await work_with_nsfs(file_path); + } else if (argv.mode === 'nodejs') { + await work_with_nodejs(file_path); + } else if (argv.mode === 'dd') { + await work_with_dd(file_path); + } + const took_ms = Date.now() - file_start_time; + speedometer.add_op(took_ms); + } catch (err) { + if (argv.read && err.code === 'ENOENT') { + file_id = 0; + } else { + throw err; + } } - const took_ms = Date.now() - file_start_time; - speedometer.add_op(took_ms); } } @@ -157,24 +165,25 @@ async function work_with_nsfs(file_path) { const fs_context = { // uid: 666, // gid: 666, - backend: 'GPFS', - warn_threshold_ms: 1000, + backend: argv.backend, + warn_threshold_ms: 10000, }; - const file = await nb_native().fs.open(fs_context, file_path, argv.read ? 'r' : 'w', 0x660); + const file = await nb_native().fs.open(fs_context, file_path, argv.read ? 
'r' : 'w', 0o660); for (let pos = 0; pos < file_size_aligned; pos += block_size) { const buf_start_time = Date.now(); if (buf_start_time >= end_time) break; const buf = rand_stream.generator(block_size); if (argv.nvec > 1) { if (argv.read) { - await file.readv(fs_context, split_to_nvec(buf, argv.nvec)); + // await file.readv(fs_context, split_to_nvec(buf, argv.nvec)); + throw new Error('TODO: readv is not yet available in NativeFile'); } else { await file.writev(fs_context, split_to_nvec(buf, argv.nvec)); } } else if (argv.read) { await file.read(fs_context, buf, 0, buf.length, pos); } else { - await file.write(fs_context, buf); + await file.write(fs_context, buf, buf.length, pos); } speedometer.update(block_size); } @@ -187,7 +196,7 @@ async function work_with_nodejs(file_path) { highWaterMark: 2 * block_size, generator: argv.read ? 'noinit' : argv.generator, }); - const file = await fs.promises.open(file_path, argv.read ? 'r' : 'w', 0x660); + const file = await fs.promises.open(file_path, argv.read ? 'r' : 'w', 0o660); for (let pos = 0; pos < file_size_aligned; pos += block_size) { const buf_start_time = Date.now(); if (buf_start_time >= end_time) break; diff --git a/src/tools/rand_speed.js b/src/tools/rand_speed.js index 31737fbb5c..134795a4e1 100644 --- a/src/tools/rand_speed.js +++ b/src/tools/rand_speed.js @@ -2,25 +2,21 @@ 'use strict'; const zlib = require('zlib'); -const cluster = require('cluster'); const RandStream = require('../util/rand_stream'); const Speedometer = require('../util/speedometer'); const argv = require('minimist')(process.argv); -argv.forks = argv.forks || 1; +argv.forks ||= 1; +argv.buf ||= 1024 * 1024; +argv.generator ||= 'crypto'; // see RandStream for options -if (argv.forks > 1 && cluster.isMaster) { - const master_speedometer = new Speedometer('Total Speed'); - master_speedometer.fork(argv.forks); -} else { - main(); -} +const speedometer = new Speedometer('RandStream'); +speedometer.run_workers(argv.forks, main, argv); function main() { - const speedometer = new Speedometer('Rand Speed'); const len = (argv.len * 1024 * 1024) || Infinity; const input = new RandStream(len, { - highWaterMark: 1024 * 1024, + highWaterMark: argv.buf, generator: argv.generator, }); input.on('data', data => speedometer.update(data.length)); diff --git a/src/tools/tcp_speed.js b/src/tools/tcp_speed.js index db09b75a4d..29f249103d 100644 --- a/src/tools/tcp_speed.js +++ b/src/tools/tcp_speed.js @@ -5,7 +5,6 @@ const net = require('net'); const tls = require('tls'); const argv = require('minimist')(process.argv); const crypto = require('crypto'); -const cluster = require('cluster'); const ssl_utils = require('../util/ssl_utils'); const semaphore = require('../util/semaphore'); const Speedometer = require('../util/speedometer'); @@ -35,20 +34,9 @@ const buffers_pool = new buffer_utils.BuffersPool({ warning_timeout: 2 * 60 * 1000, }); -const send_speedometer = new Speedometer('Send Speed'); -const recv_speedometer = new Speedometer('Receive Speed'); -const master_speedometer = new Speedometer('Total Speed'); - -if (cluster.isMaster) { - delete argv._; - console.log('ARGV', JSON.stringify(argv)); -} - -if (argv.forks > 1 && cluster.isMaster) { - master_speedometer.fork(argv.forks); -} else { - main(); -} +const recv_speedometer = new Speedometer('TCP Recv'); +const send_speedometer = new Speedometer('TCP Send'); +send_speedometer.run_workers(argv.forks, main, argv); function main() { if (argv.help) { @@ -58,7 +46,7 @@ function main() { return run_server(); } if (argv.client) { - 
argv.client = (typeof(argv.client) === 'string' && argv.client) || 'localhost'; + argv.client = (typeof argv.client === 'string' && argv.client) || 'localhost'; return run_client(); } return usage(); @@ -78,9 +66,9 @@ function run_server() { net.createServer(); server.on('error', err => { - console.error('TCP server error', err.message); - process.exit(); - }) + console.error('TCP server error', err.message); + process.exit(); + }) .on('close', () => { console.error('TCP server closed'); process.exit(); @@ -114,12 +102,13 @@ function run_client() { function run_client_conn() { /** @type {net.Socket} */ + // @ts-ignore const conn = (argv.ssl ? tls : net).connect({ - port: argv.port, - host: argv.client, - // we allow self generated certificates to avoid public CA signing: - rejectUnauthorized: false, - }) + port: argv.port, + host: argv.client, + // we allow self generated certificates to avoid public CA signing: + rejectUnauthorized: false, + }) .once('error', err => { console.error('TCP client connection error', err.message); process.exit(); diff --git a/src/util/http_recorder.js b/src/util/http_recorder.js index f5b439a89c..0245c9777b 100644 --- a/src/util/http_recorder.js +++ b/src/util/http_recorder.js @@ -1,10 +1,12 @@ /* Copyright (C) 2016 NooBaa */ +/* eslint-disable no-bitwise, max-params */ 'use strict'; const fs = require('fs'); const stream = require('stream'); +// @ts-ignore const http_parser = process.binding('http_parser'); -const HTTPParser = http_parser.HTTPParser; +const { HTTPParser, ConnectionsList } = http_parser; const _cached_array_push = Array.prototype.push; @@ -14,7 +16,8 @@ class HTTPRecorder extends stream.Writable { super(); this.file_namer = file_namer; this.max_headers = 2000; - this._parser = new HTTPParser(HTTPParser.REQUEST); + this._parser = new HTTPParser(); + this._connections = new ConnectionsList(); this._start_message(); } @@ -22,7 +25,14 @@ class HTTPRecorder extends stream.Writable { if (this._out_file) this._out_file.end(); this._out_file = null; this._pending = []; - this._parser.reinitialize(HTTPParser.REQUEST, true); + + this._parser.initialize( + HTTPParser.REQUEST, + {}, // new HTTPServerAsyncResource('HTTPINCOMINGMESSAGE', socket), + 0, // server.maxHeaderSize || 0, + HTTPParser.kLenientAll | 0, // lenient ? kLenientAll : kLenientNone, + this._connections, // server[kConnections], + ); let slow_url = ''; const slow_headers = []; @@ -31,11 +41,8 @@ class HTTPRecorder extends stream.Writable { // this request. // `url` is not set for response parsers but that's not applicable here since // all our parsers are request parsers. - // eslint-disable-next-line max-params - // eslint-disable-next-line no-bitwise this._parser[HTTPParser.kOnHeadersComplete | 0] = ( versionMajor, versionMinor, headers, method, url, - //eslint-disable-next-line max-params statusCode, statusMessage, upgrade, shouldKeepAlive) => { // console.log('kOnHeadersComplete', // method, url, versionMajor, versionMinor, @@ -71,7 +78,6 @@ class HTTPRecorder extends stream.Writable { // processed in a single run. This method is also // called to process trailing HTTP headers. 
// Once we exceeded headers limit - stop collecting them - // eslint-disable-next-line no-bitwise this._parser[HTTPParser.kOnHeaders | 0] = (headers, url) => { console.log('kOnHeaders', headers, url); slow_url += url; @@ -79,12 +85,10 @@ class HTTPRecorder extends stream.Writable { _cached_array_push.apply(slow_headers, add); }; - // eslint-disable-next-line no-bitwise this._parser[HTTPParser.kOnBody | 0] = (buf, start, len) => { // console.log('kOnBody', buf.length, start, len); }; - // eslint-disable-next-line no-bitwise this._parser[HTTPParser.kOnMessageComplete | 0] = () => { // console.log('kOnMessageComplete'); this._start_message(); diff --git a/src/util/http_utils.js b/src/util/http_utils.js index 387db25a5a..22b59f6a24 100644 --- a/src/util/http_utils.js +++ b/src/util/http_utils.js @@ -64,6 +64,44 @@ const no_proxy_list = (NO_PROXY ? NO_PROXY.split(',') : []).map(addr => { const parse_xml_to_js = xml2js.parseStringPromise; const non_printable_regexp = /[\x00-\x1F]/; +/** + * Since header values can be either string or array of strings we need to handle both cases. + * While most callers might prefer to always handle a single string value, which is why we + * have this helper, some callers might prefer to always convert to array of strings, + * which is why we have hdr_as_arr(). + * + * @param {import('http').IncomingHttpHeaders} headers + * @param {string} key the header name + * @param {string} [join_sep] optional separator to join multiple values, if not provided only the first value is returned + * @returns {string|undefined} the header string value or undefined if not found + */ +function hdr_as_str(headers, key, join_sep) { + const v = headers[key]; + if (v === undefined) return undefined; + if (typeof v === 'string') return v; + if (!Array.isArray(v)) return String(v); // should not happen but would not fail a request for it + if (join_sep === undefined) return String(v[0]); // if not joining - return just the first + return v.join(join_sep); // join all values with the separator +} + +/** + * Since header values can be either string or array of strings we need to handle both cases. + * While most callers might prefer to always handle a single string value, which is why we + * have hdr_as_str(), some callers might prefer to always convert to array of strings, + * which is why we have this helper. + * + * @param {import('http').IncomingHttpHeaders} headers + * @param {string} key the header name + * @returns {string[]|undefined} the header string value or undefined if not found + */ +function hdr_as_arr(headers, key) { + const v = headers[key]; + if (v === undefined) return undefined; + if (typeof v === 'string') return [v]; + if (!Array.isArray(v)) return [String(v)]; // should not happen but would not fail a request for it + return v; +} + function parse_url_query(req) { req.originalUrl = req.url; const query_pos = req.url.indexOf('?'); @@ -84,7 +122,6 @@ function parse_client_ip(req) { return fwd.includes(',') ? 
fwd.split(',', 1)[0] : fwd; } - /** * @typedef {{ * if_modified_since?: number, @@ -534,6 +571,12 @@ function make_https_request(options, body, body_encoding) { }); } +/** + * + * @param {http.RequestOptions} options + * @param {*} body + * @returns {Promise} + */ async function make_http_request(options, body) { return new Promise((resolve, reject) => { http.request(options, resolve) @@ -770,10 +813,10 @@ function authorize_session_token(req, options) { } function validate_server_ip_whitelist(req) { + if (config.S3_SERVER_IP_WHITELIST.length === 0) return; // remove prefix for V4 IPs for whitelist validation // TODO: replace the equality check with net.BlockList() usage const server_ip = req.connection.localAddress.replace(/^::ffff:/, ''); - if (config.S3_SERVER_IP_WHITELIST.length === 0) return; for (const whitelist_ip of config.S3_SERVER_IP_WHITELIST) { if (server_ip === whitelist_ip) { return; @@ -841,7 +884,8 @@ function listen_port(port, server, server_type) { if (server_type !== 'METRICS') { setup_endpoint_server(server); } - server.listen(port, err => { + const local_ip = process.env.LOCAL_IP || '0.0.0.0'; + server.listen(port, local_ip, err => { if (err) { dbg.error('ENDPOINT FAILED to listen', err); reject(err); @@ -871,24 +915,32 @@ function setup_endpoint_server(server) { }); // See https://nodejs.org/api/http.html#http_event_clienterror - server.on('clientError', function on_s3_client_error(err, socket) { - - // On parsing errors we reply 400 Bad Request to conform with AWS - // These errors come from the nodejs native http parser. - if (typeof err.code === 'string' && - err.code.startsWith('HPE_INVALID_') && - err.bytesParsed > 0) { - console.error('ENDPOINT CLIENT ERROR - REPLY WITH BAD REQUEST', err); - socket.write('HTTP/1.1 400 Bad Request\r\n'); - socket.write(`Date: ${new Date().toUTCString()}\r\n`); - socket.write('Connection: close\r\n'); - socket.write('Content-Length: 0\r\n'); - socket.end('\r\n'); - } + server.on('clientError', + /** + * @param {Error & { code?: string, bytesParsed?: number }} err + * @param {net.Socket} socket + */ + (err, socket) => { + + if (err.code === 'ECONNRESET' || !socket.writable) { + return; + } + // On parsing errors we reply 400 Bad Request to conform with AWS + // These errors come from the nodejs native http parser. 
+ if (typeof err.code === 'string' && + err.code.startsWith('HPE_INVALID_') && + err.bytesParsed > 0) { + console.error('ENDPOINT CLIENT ERROR - REPLY WITH BAD REQUEST', err); + socket.write('HTTP/1.1 400 Bad Request\r\n'); + socket.write(`Date: ${new Date().toUTCString()}\r\n`); + socket.write('Connection: close\r\n'); + socket.write('Content-Length: 0\r\n'); + socket.end('\r\n'); + } - // in any case we destroy the socket - socket.destroy(); - }); + // in any case we destroy the socket + socket.destroy(); + }); server.keepAliveTimeout = config.ENDPOINT_HTTP_SERVER_KEEPALIVE_TIMEOUT; server.requestTimeout = config.ENDPOINT_HTTP_SERVER_REQUEST_TIMEOUT; @@ -924,6 +976,8 @@ function set_response_headers_from_request(req, res) { if (req.query['response-expires']) res.setHeader('Expires', req.query['response-expires']); } +exports.hdr_as_str = hdr_as_str; +exports.hdr_as_arr = hdr_as_arr; exports.parse_url_query = parse_url_query; exports.parse_client_ip = parse_client_ip; exports.get_md_conditions = get_md_conditions; diff --git a/src/util/rand_stream.js b/src/util/rand_stream.js index 7895e8df30..10c01707fa 100644 --- a/src/util/rand_stream.js +++ b/src/util/rand_stream.js @@ -33,7 +33,7 @@ class RandStream extends stream.Readable { super(options); this.max_length = max_length; this.chunk_size = (options && options.highWaterMark) || 1024 * 1024; - this.generator = this[`generate_${(options && options.generator) || 'cipher'}`]; + this.generator = this[`generate_${(options && options.generator) || 'crypto'}`]; this.cipher_seed = options && options.cipher_seed; this.pos = 0; this.ticks = 0; @@ -44,7 +44,7 @@ class RandStream extends stream.Readable { * * crypto.randomBytes() used to be slow ~50 MB/sec - BUT it is no longer so... * - * The speed of this mode is ~2000 MB/sec. + * The speed of this mode is ~4000 MB/sec. */ generate_crypto(size) { return crypto.randomBytes(size); @@ -106,7 +106,7 @@ class RandStream extends stream.Readable { * The overall expected speed can be calculated by: * speed = fake_factor * speed(crypto.randomBytes) * - * The speed of this mode is ~4500 MB/sec (with fake_factor=64) + * The speed of this mode is ~16,000 MB/sec (with fake_factor=64) * */ generate_fake(size) { @@ -135,7 +135,7 @@ class RandStream extends stream.Readable { /** * generate_zeros: * - * The speed of this mode is ~7000 MB/sec. + * The speed of this mode is ~30,000 MB/sec. */ generate_zeros(size) { return Buffer.alloc(size); @@ -144,7 +144,7 @@ class RandStream extends stream.Readable { /** * generate_fill: * - * The speed of this mode is ~7000 MB/sec. + * The speed of this mode is ~30,000 MB/sec. */ generate_fill(size) { return Buffer.alloc(size, crypto.randomInt(0, 256)); @@ -156,7 +156,7 @@ class RandStream extends stream.Readable { * Just allocates memory, no initialization. * Do not use if your process memory might contain sensitive data. * - * The speed of this mode is ~100,000 MB/sec. + * The speed of this mode is ~250,000 MB/sec. 
*/ generate_noinit(size) { return Buffer.allocUnsafe(size); diff --git a/src/util/speedometer.js b/src/util/speedometer.js index 10cb232b13..b104ab961f 100644 --- a/src/util/speedometer.js +++ b/src/util/speedometer.js @@ -1,7 +1,9 @@ /* Copyright (C) 2016 NooBaa */ 'use strict'; -const cluster = require('cluster'); +const cluster = /** @type {import('node:cluster').Cluster} */ ( + /** @type {unknown} */ (require('node:cluster')) +); class Speedometer { @@ -19,39 +21,65 @@ class Speedometer { this.sum_latency = 0; this.last_latency = 0; - this.min_latency = Infinity; - this.max_latency = -Infinity; + this.min_latency = -1; + this.max_latency = -1; + } - this.worker_mode = cluster.isWorker; + run_workers(count, worker_func, args) { + if (cluster.isPrimary) { + console.log('ARGS:', JSON.stringify(args, null, 2)); + } + if (count > 1 && cluster.isPrimary) { + this.fork(count); + } else { + // primary will run the worker_func as well (if count <= 1 or undefined) + worker_func(); + } } fork(count) { - if (cluster.isMaster) { - cluster.on('message', (worker, bytes) => this.update(bytes)); - cluster.on('exit', worker => { - if (!Object.keys(cluster.workers).length) { - this.clear_interval(); - this.report(); - // process.exit(); - } - }); - } + if (cluster.isWorker) throw new Error('fork should be called only from the primary process'); + cluster.on('message', (worker, { bytes, ops, sum_latency, min_latency, max_latency }) => { + this.num_bytes += bytes; + this.num_ops += ops; + this.sum_latency += sum_latency; + if (min_latency >= 0 && (this.min_latency < 0 || min_latency < this.min_latency)) this.min_latency = min_latency; + if (max_latency >= 0 && (this.max_latency < 0 || max_latency > this.max_latency)) this.max_latency = max_latency; + if (!this.interval) this.set_interval(); + }); + cluster.on('exit', worker => { + if (!Object.keys(cluster.workers).length) { + this.clear_interval(); + this.report(); + // process.exit(); + } + }); for (let i = 0; i < count; ++i) { const worker = cluster.fork(); console.warn('Worker start', worker.process.pid); } } + is_primary() { + return cluster.isPrimary; + } + + is_worker() { + return cluster.isWorker; + } + update(bytes) { this.num_bytes += bytes; if (!this.interval) this.set_interval(); } add_op(took_ms) { + if (took_ms < 0) throw new Error('Speedometer: negative took_ms ' + took_ms); this.num_ops += 1; this.sum_latency += took_ms; - if (took_ms > this.max_latency) this.max_latency = took_ms; - if (took_ms < this.min_latency) this.min_latency = took_ms; + if (this.min_latency < 0 || took_ms < this.min_latency) this.min_latency = took_ms; + if (this.max_latency < 0 || took_ms > this.max_latency) this.max_latency = took_ms; + if (!this.interval) this.set_interval(); } set_interval(delay_ms) { @@ -72,15 +100,22 @@ class Speedometer { if (min_delay_ms && now - this.last_time < min_delay_ms) { return; } - if (this.worker_mode) { - process.send(this.num_bytes - this.last_bytes); + const bytes = this.num_bytes - this.last_bytes; + const ops = this.num_ops - this.last_ops; + const sum_latency = this.sum_latency - this.last_latency; + if (cluster.isWorker) { + process.send({ + bytes, + ops, + sum_latency, + min_latency: this.min_latency, // Infinity will send as null + max_latency: this.max_latency, // Infinity will send as null + }); } else { - const speed = (this.num_bytes - this.last_bytes) / + const speed = bytes / Math.max(0.001, now - this.last_time) * 1000 / 1024 / 1024; const avg_speed = this.num_bytes / Math.max(0.001, now - this.start_time) * 1000 / 
1024 / 1024; - const ops = this.num_ops - this.last_ops; - const avg_latency = this.sum_latency - this.last_latency; console.log( this.name + ': ' + speed.toFixed(1) + ' MB/sec' + @@ -89,7 +124,7 @@ class Speedometer { ' | OPS: ' + ops + ' min:' + this.min_latency.toFixed(1) + 'ms' + ' max:' + this.max_latency.toFixed(1) + 'ms' + - ' avg:' + (avg_latency / ops).toFixed(1) + 'ms' + ' avg:' + (sum_latency / ops).toFixed(1) + 'ms' ) : '') ); } @@ -97,8 +132,8 @@ class Speedometer { this.last_bytes = this.num_bytes; this.last_ops = this.num_ops; this.last_latency = this.sum_latency; - this.min_latency = Infinity; - this.max_latency = -Infinity; + this.min_latency = -1; + this.max_latency = -1; } } From 5eeeb78e9e6ee2ac9df6464bea894a6ffd607e0f Mon Sep 17 00:00:00 2001 From: Guy Margalit Date: Tue, 14 Jan 2025 01:11:22 +0200 Subject: [PATCH 3/4] rdma scripts Signed-off-by: Guy Margalit --- cudaguy.c | 59 +++++++++++++++++++++++++++++++++ rdma-build1.sh | 2 ++ rdma-build2.sh | 2 ++ rdma-push1.sh | 3 ++ rdma-push2.sh | 3 ++ rdma-run.sh | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 158 insertions(+) create mode 100644 cudaguy.c create mode 100755 rdma-build1.sh create mode 100755 rdma-build2.sh create mode 100755 rdma-push1.sh create mode 100755 rdma-push2.sh create mode 100644 rdma-run.sh diff --git a/cudaguy.c b/cudaguy.c new file mode 100644 index 0000000000..8bcca440ac --- /dev/null +++ b/cudaguy.c @@ -0,0 +1,59 @@ +// Build: gcc -o cudaguy cudaguy.c -I/usr/local/cuda/include/ -L/usr/local/cuda/lib64 -lcuda +// Run: ./cudaguy + +#include +#include +#include +#include + +#define CU_TRY(fn) \ + do { \ + CUresult r = fn; \ + if (r != CUDA_SUCCESS) { \ + const char* cuda_err = ""; \ + cuGetErrorName(r, &cuda_err); \ + fprintf(stderr, "CUDA error: %s %s\n", cuda_err, #fn); \ + exit(1); \ + } \ + } while (0) + +int +main() +{ + size_t size = 8 * 1024 * 1024; + CUdevice cuda_device = 0; + CUcontext cuda_ctx = 0; + CUdeviceptr cuda_ptr = 0; + CUmemorytype mem_type = CU_MEMORYTYPE_HOST; + char* host_ptr = (char*)malloc(size); + + CU_TRY(cuInit(0)); + CU_TRY(cuDeviceGet(&cuda_device, 0);); + CU_TRY(cuCtxCreate(&cuda_ctx, 0, cuda_device)); + fprintf(stderr, "CUDA initialized: device %d context %p\n", cuda_device, (void*)cuda_ctx); + + CU_TRY(cuMemAlloc(&cuda_ptr, size)); + CU_TRY(cuMemsetD8(cuda_ptr, 'A', size)); + CU_TRY(cuCtxSynchronize()); + fprintf(stderr, "CUDA allocated %p size %zu\n", (void*)cuda_ptr, size); + + CU_TRY(cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, cuda_ptr)); + fprintf(stderr, "CUDA buffer mem type: %d\n", mem_type); + + memset(host_ptr, 'B', size); + CU_TRY(cuMemcpyDtoH(host_ptr, cuda_ptr, size)); + + // skip repeating 'A' at the end, while keeping the first 10 chars, + // and terminate the string for printing + int i = size - 1; + while (i > 10 && host_ptr[i] == 'A') --i; + host_ptr[i] = '\0'; + fprintf(stderr, "CUDA copied to host: %s\n", host_ptr); + + free(host_ptr); + CU_TRY(cuMemFree(cuda_ptr)); + CU_TRY(cuCtxDestroy(cuda_ctx)); + fprintf(stderr, "CUDA freed\n"); + + return 0; +} \ No newline at end of file diff --git a/rdma-build1.sh b/rdma-build1.sh new file mode 100755 index 0000000000..e6bcf395fd --- /dev/null +++ b/rdma-build1.sh @@ -0,0 +1,2 @@ +ssh $GDS4N1 'cd /root/guym/noobaa-core && source /root/.nvm/nvm.sh && make RDMA=1' && + printf "\n\n IT'S DONE. 
\n\n" diff --git a/rdma-build2.sh b/rdma-build2.sh new file mode 100755 index 0000000000..6cb57b7932 --- /dev/null +++ b/rdma-build2.sh @@ -0,0 +1,2 @@ +ssh $GDS4N2 'cd /root/guym/noobaa-core && source /root/.nvm/nvm.sh && make RDMA=1 CUDA=1' && + printf "\n\n IT'S DONE. \n\n" diff --git a/rdma-push1.sh b/rdma-push1.sh new file mode 100755 index 0000000000..bf67a690ea --- /dev/null +++ b/rdma-push1.sh @@ -0,0 +1,3 @@ +git push -f ssh://$GDS4N1:/root/guym/noobaa-core guy-rdma && + ssh $GDS4N1 'cd /root/guym/noobaa-core && git rebase guy-rdma' && + printf "\n\n IT'S DONE. \n\n" diff --git a/rdma-push2.sh b/rdma-push2.sh new file mode 100755 index 0000000000..8c21a4511b --- /dev/null +++ b/rdma-push2.sh @@ -0,0 +1,3 @@ +git push -f ssh://$GDS4N2:/root/guym/noobaa-core guy-rdma && + ssh $GDS4N2 'cd /root/guym/noobaa-core && git rebase guy-rdma' && + printf "\n\n IT'S DONE. \n\n" diff --git a/rdma-run.sh b/rdma-run.sh new file mode 100644 index 0000000000..e1ed6d943e --- /dev/null +++ b/rdma-run.sh @@ -0,0 +1,89 @@ +## variables +CUDA_PATH="$(realpath /usr/local/cuda)" +CUOBJ_PATH="$(realpath ../cuObject-0.7.2-Linux_x86_64/src)" +CUFILE_ENV_PATH_JSON="$(realpath ../cuobj.json)" +RDMA_LIBS="$CUOBJ_PATH/lib/libcuobjserver.so $CUOBJ_PATH/lib/libcuobjclient.so $CUOBJ_PATH/lib/libcufile.so.1.13.0 $CUOBJ_PATH/lib/libcufile_rdma.so.1.13.0" + +## git push to hosts +./rdma-push.sh + +## build script +./rdma-build.sh + +## build commands +make RDMA=1 +make RDMA=1 CUDA=1 + +# quick rebuild: +rm -rf build/Release/obj.target/{rdma,cuda}_napi/ && + rm -f build/Release/{rdma,cuda}_napi.a && + rm -f build/Release/nb_native.node + + +## rdma_speed +UV_THREADPOOL_SIZE=4 LD_PRELOAD="$RDMA_LIBS" node src/tools/rdma_speed.js --server +UV_THREADPOOL_SIZE=4 LD_PRELOAD="$RDMA_LIBS" CUFILE_ENV_PATH_JSON="$CUFILE_ENV_PATH_JSON" node src/tools/rdma_speed.js --client --op GET +UV_THREADPOOL_SIZE=4 LD_PRELOAD="$RDMA_LIBS" CUFILE_ENV_PATH_JSON="$CUFILE_ENV_PATH_JSON" node src/tools/rdma_speed.js --client --op PUT + # --op PUT --forks 1 --concur 16 + # --pool_size $((4*32)) --size 32 + # --perf-basic-prof + +## http_speed +node src/tools/http_speed.js --server --buf $((8*1024*1024)) --size 8 --forks 8 +node src/tools/http_speed.js --client 172.16.0.61 --buf $((8*1024*1024)) --size 8 --forks 8 --concur 8 --method GET +node src/tools/http_speed.js --client 172.16.0.61 --buf $((8*1024*1024)) --size 8 --forks 8 --concur 8 --method PUT + +## noobaa server (local ips 172.16.0.61 and 172.16.0.71) +LD_PRELOAD="$RDMA_LIBS" LOCAL_IP=172.16.0.61 node src/cmd/nsfs.js +LD_PRELOAD="$RDMA_LIBS" LOCAL_IP=172.16.0.71 node src/cmd/nsfs.js + +#################################################### +## client (local ips 172.16.0.62 and 172.16.0.72) ## +#################################################### + +## s3cat +DISABLE_INIT_RANDOM_SEED=true \ + node src/tools/s3cat.js \ + --endpoint http://172.16.0.61:6001 \ + --access_key $AWS_ACCESS_KEY_ID \ + --secret_key $AWS_SECRET_ACCESS_KEY \ + --bucket bucket1 \ + --ls + + # --upload + # --get upload-m6s4i12b + + +UV_THREADPOOL_SIZE=4 \ + DISABLE_INIT_RANDOM_SEED=true \ + LD_PRELOAD="$RDMA_LIBS" \ + CUFILE_ENV_PATH_JSON="$CUFILE_ENV_PATH_JSON" \ + node src/tools/s3perf.js \ + --local_ip 172.16.0.62 \ + --endpoint http://172.16.0.61:6001 \ + --selfsigned \ + --access_key $AWS_ACCESS_KEY_ID \ + --secret_key $AWS_SECRET_ACCESS_KEY \ + --bucket bucket1 \ + --time 120 \ + --get fs_speed \ + --concur 1 \ + --forks 1 \ + --rdma \ + --cuda + + +## warp +../warp get \ + --host 172.16.0.61:6001 \ + 
--access-key $AWS_ACCESS_KEY_ID \ + --secret-key $AWS_SECRET_ACCESS_KEY \ + --duration 20s \ + --obj.size 100MiB \ + --objects 100 \ + --concurrent 20 \ + --disable-multipart \ + --disable-sha256-payload \ + --noclear \ + --list-existing + From 15630a36d7b0c1e67e5659549f8d41178a68300d Mon Sep 17 00:00:00 2001 From: Guy Margalit Date: Tue, 14 Jan 2025 01:11:22 +0200 Subject: [PATCH 4/4] s3 over rdma Signed-off-by: Guy Margalit --- .gitignore | 1 + Makefile | 9 +- config.js | 9 + cudaguy.c | 59 -- cudaguy.cpp | 113 ++++ docs/design/S3-over-RDMA.md | 109 +++ go/cmd/http_speed/http_speed.go | 144 ++++ go/cmd/tcp_speed/tcp_speed.go | 139 ++++ go/go.mod | 3 + go/internal/goutils/goutils.go | 120 ++++ package-lock.json | 183 +++-- package.json | 4 +- rdma-run.sh | 28 +- src/agent/block_store_speed.js | 18 +- src/endpoint/s3/ops/s3_get_object.js | 6 + src/endpoint/s3/ops/s3_head_object.js | 10 +- src/endpoint/s3/ops/s3_put_object.js | 15 +- src/endpoint/s3/ops/s3_put_object_uploadId.js | 6 + src/native/common.gypi | 9 - src/native/cuda/cuda_napi.cpp | 280 ++++++++ src/native/cuda/cuda_napi.gyp | 51 ++ src/native/nb_native.cpp | 27 +- src/native/nb_native.gyp | 35 +- src/native/rdma/rdma_client_napi.cpp | 362 ++++++++++ src/native/rdma/rdma_napi.gyp | 44 ++ src/native/rdma/rdma_server_napi.cpp | 588 +++++++++++++++++ src/native/s3select/s3select.gyp | 17 +- src/native/tools/crypto_napi.cpp | 78 +-- src/native/util/backtrace.h | 27 +- src/native/util/common.h | 2 + src/native/util/worker.h | 95 +++ src/native/warnings.gypi | 28 + src/sdk/namespace_fs.js | 300 ++++----- src/sdk/nb.d.ts | 85 ++- src/test/qa/capacity.js | 57 +- .../jest_tests/test_file_reader.test.js | 133 ++++ src/test/unit_tests/test_chunk_coder.js | 23 +- src/tools/coding_speed.js | 47 +- src/tools/cpu_speed.js | 29 +- src/tools/file_writer_hashing.js | 9 +- src/tools/fs_speed.js | 251 +++---- src/tools/http_speed.go | 200 ------ src/tools/http_speed.js | 350 +++++----- src/tools/rdma_speed.js | 267 ++++++++ src/tools/s3cat.js | 320 +++++---- src/tools/s3perf.js | 623 ++++++++++-------- src/tools/tcp_speed.go | 202 ------ src/util/buffer_utils.js | 72 +- src/util/file_reader.js | 230 +++++++ src/util/file_writer.js | 77 ++- src/util/fips.js | 4 +- src/util/http_utils.js | 25 + src/util/native_fs_utils.js | 119 +++- src/util/nb_native.js | 13 +- src/util/rdma_utils.js | 333 ++++++++++ src/util/speedometer.js | 522 ++++++++++++--- src/util/stream_utils.js | 4 +- 57 files changed, 5167 insertions(+), 1747 deletions(-) delete mode 100644 cudaguy.c create mode 100644 cudaguy.cpp create mode 100644 docs/design/S3-over-RDMA.md create mode 100644 go/cmd/http_speed/http_speed.go create mode 100644 go/cmd/tcp_speed/tcp_speed.go create mode 100644 go/go.mod create mode 100644 go/internal/goutils/goutils.go create mode 100644 src/native/cuda/cuda_napi.cpp create mode 100644 src/native/cuda/cuda_napi.gyp create mode 100644 src/native/rdma/rdma_client_napi.cpp create mode 100644 src/native/rdma/rdma_napi.gyp create mode 100644 src/native/rdma/rdma_server_napi.cpp create mode 100644 src/native/util/worker.h create mode 100644 src/native/warnings.gypi create mode 100644 src/test/unit_tests/jest_tests/test_file_reader.test.js delete mode 100644 src/tools/http_speed.go create mode 100644 src/tools/rdma_speed.js delete mode 100644 src/tools/tcp_speed.go create mode 100644 src/util/rdma_utils.js diff --git a/.gitignore b/.gitignore index 1c1c29bffe..e04c6186a6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # BUILD node_modules /build +/target 
/noobaa.rpm # TEST diff --git a/Makefile b/Makefile index 6419fc2602..ba18b907dc 100644 --- a/Makefile +++ b/Makefile @@ -84,6 +84,13 @@ endif BUILD_S3SELECT?=1 BUILD_S3SELECT_PARQUET?=0 +GYP_DEFINES?= +GYP_DEFINES+=$(if $(RDMA),"BUILD_RDMA_NAPI=1",) +GYP_DEFINES+=$(if $(CUDA),"BUILD_CUDA_NAPI=1",) +GYP_DEFINES+=$(if $(CUDA_PATH),"CUDA_PATH=$(CUDA_PATH)",) +GYP_DEFINES+=$(if $(CUOBJ_PATH),"CUOBJ_PATH=$(CUOBJ_PATH)",) + + ## RPM VARIABLES DATE := $(shell date +'%Y%m%d') NOOBAA_PKG_VERSION := $(shell jq -r '.version' < ./package.json) @@ -106,7 +113,7 @@ default: build # this target builds incrementally build: npm install - npm run build + GYP_DEFINES='$(GYP_DEFINES)' npm run build --verbose .PHONY: build clean_build: diff --git a/config.js b/config.js index 753e3a468a..598a210c16 100644 --- a/config.js +++ b/config.js @@ -958,6 +958,7 @@ config.NSFS_GLACIER_MIGRATE_LOG_THRESHOLD = 50 * 1024; config.ANONYMOUS_ACCOUNT_NAME = 'anonymous'; config.NFSF_UPLOAD_STREAM_MEM_THRESHOLD = 8 * 1024 * 1024; +config.NFSF_DOWNLOAD_STREAM_MEM_THRESHOLD = 8 * 1024 * 1024; // we want to change our handling related to EACCESS error config.NSFS_LIST_IGNORE_ENTRY_ON_EACCES = true; @@ -1091,6 +1092,14 @@ config.DEFAULT_REGION = 'us-east-1'; config.VACCUM_ANALYZER_INTERVAL = 86400000; + +////////////// +/// RDMA /// +////////////// + +config.RDMA_ENABLED = true; // TODO STILL EXPERIMENTAL - should be false by default + + ///////////////////// // // // OVERRIDES // diff --git a/cudaguy.c b/cudaguy.c deleted file mode 100644 index 8bcca440ac..0000000000 --- a/cudaguy.c +++ /dev/null @@ -1,59 +0,0 @@ -// Build: gcc -o cudaguy cudaguy.c -I/usr/local/cuda/include/ -L/usr/local/cuda/lib64 -lcuda -// Run: ./cudaguy - -#include -#include -#include -#include - -#define CU_TRY(fn) \ - do { \ - CUresult r = fn; \ - if (r != CUDA_SUCCESS) { \ - const char* cuda_err = ""; \ - cuGetErrorName(r, &cuda_err); \ - fprintf(stderr, "CUDA error: %s %s\n", cuda_err, #fn); \ - exit(1); \ - } \ - } while (0) - -int -main() -{ - size_t size = 8 * 1024 * 1024; - CUdevice cuda_device = 0; - CUcontext cuda_ctx = 0; - CUdeviceptr cuda_ptr = 0; - CUmemorytype mem_type = CU_MEMORYTYPE_HOST; - char* host_ptr = (char*)malloc(size); - - CU_TRY(cuInit(0)); - CU_TRY(cuDeviceGet(&cuda_device, 0);); - CU_TRY(cuCtxCreate(&cuda_ctx, 0, cuda_device)); - fprintf(stderr, "CUDA initialized: device %d context %p\n", cuda_device, (void*)cuda_ctx); - - CU_TRY(cuMemAlloc(&cuda_ptr, size)); - CU_TRY(cuMemsetD8(cuda_ptr, 'A', size)); - CU_TRY(cuCtxSynchronize()); - fprintf(stderr, "CUDA allocated %p size %zu\n", (void*)cuda_ptr, size); - - CU_TRY(cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, cuda_ptr)); - fprintf(stderr, "CUDA buffer mem type: %d\n", mem_type); - - memset(host_ptr, 'B', size); - CU_TRY(cuMemcpyDtoH(host_ptr, cuda_ptr, size)); - - // skip repeating 'A' at the end, while keeping the first 10 chars, - // and terminate the string for printing - int i = size - 1; - while (i > 10 && host_ptr[i] == 'A') --i; - host_ptr[i] = '\0'; - fprintf(stderr, "CUDA copied to host: %s\n", host_ptr); - - free(host_ptr); - CU_TRY(cuMemFree(cuda_ptr)); - CU_TRY(cuCtxDestroy(cuda_ctx)); - fprintf(stderr, "CUDA freed\n"); - - return 0; -} \ No newline at end of file diff --git a/cudaguy.cpp b/cudaguy.cpp new file mode 100644 index 0000000000..731059cca3 --- /dev/null +++ b/cudaguy.cpp @@ -0,0 +1,113 @@ +/* +Usage: +----- +CUDA_PATH="/usr/local/cuda" +CUOBJ_PATH="../cuObject-0.8.1-Linux_x86_64/src" +CUOBJ_LIBS="$CUOBJ_PATH/lib/libcuobjserver.so 
$CUOBJ_PATH/lib/libcuobjclient.so $CUOBJ_PATH/lib/libcufile.so.1.13.0 $CUOBJ_PATH/lib/libcufile_rdma.so.1.13.0" +g++ -o cudaguy cudaguy.cpp -I$CUDA_PATH/include/ -L$CUDA_PATH/lib64 -lcuda -I$CUOBJ_PATH/include/ $CUOBJ_LIBS +LD_PRELOAD=$CUOBJ_LIBS ./cudaguy +----- +*/ + +#include +#include +#include +#include + +#include "cuobjclient.h" +#include "protocol.h" +#include + +#define CU_TRY(fn) \ + do { \ + CUresult r = fn; \ + if (r != CUDA_SUCCESS) { \ + const char* cuda_err = ""; \ + cuGetErrorName(r, &cuda_err); \ + fprintf(stderr, "CUDA error: %s %s\n", cuda_err, #fn); \ + exit(1); \ + } \ + } while (0) + +ssize_t +cuobj_get( + const void* handle, + char* ptr, + size_t size, + loff_t offset, + const cufileRDMAInfo_t* rdma_info) +{ + fprintf(stderr, "cuobj_get: handle %p ptr %p size %zu offset %ld\n", handle, ptr, size, offset); + return size; +} + +ssize_t +cuobj_put( + const void* handle, + const char* ptr, + size_t size, + loff_t offset, + const cufileRDMAInfo_t* rdma_info) +{ + fprintf(stderr, "cuobj_put: handle %p ptr %p size %zu offset %ld\n", handle, ptr, size, offset); + return size; +} + +int +main() +{ + size_t size = 8 * 1024 * 1024; + CUdevice cuda_device = 0; + CUdevice cuda_device2 = 0; + CUcontext cuda_ctx = 0; + CUdeviceptr cuda_ptr = 0; + CUmemorytype mem_type = CU_MEMORYTYPE_HOST; + char* host_ptr = (char*)malloc(size); + + CU_TRY(cuInit(0)); + CU_TRY(cuDeviceGet(&cuda_device, 0);); + // CU_TRY(cuCtxCreate(&cuda_ctx, 0, cuda_device)); + CU_TRY(cuDevicePrimaryCtxRetain(&cuda_ctx, cuda_device)); + CU_TRY(cuCtxSetCurrent(cuda_ctx)); + fprintf(stderr, "CUDA initialized: device %d context %p\n", cuda_device, (void*)cuda_ctx); + + CU_TRY(cuCtxGetDevice(&cuda_device2)); + fprintf(stderr, "CUDA get device %d\n", cuda_device2); + + CU_TRY(cuMemAlloc(&cuda_ptr, size)); + CU_TRY(cuMemsetD8(cuda_ptr, 'A', size)); + CU_TRY(cuCtxSynchronize()); + fprintf(stderr, "CUDA allocated %p size %zu\n", (void*)cuda_ptr, size); + + CU_TRY(cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, cuda_ptr)); + fprintf(stderr, "CUDA buffer mem type: %d\n", mem_type); + + CUObjIOOps cuobj_ops = { .get = cuobj_get, .put = cuobj_put }; + cuObjClient cuobj_client(cuobj_ops); + cuObjErr_t cuobj_err = cuobj_client.cuMemObjGetDescriptor((void*)cuda_ptr, size); + fprintf(stderr, "cuObjClient::cuMemObjGetDescriptor: %d\n", cuobj_err); + + cuObjMemoryType_t cuobj_mem_type = cuObjClient::getMemoryType((void*)cuda_ptr); + fprintf(stderr, "cuObjClient::getMemoryType: %d\n", cuobj_mem_type); + + ssize_t ret_size = cuobj_client.cuObjGet(NULL, (void*)cuda_ptr, size); + fprintf(stderr, "cuObjClient::cuObjGet: %zd\n", ret_size); + + memset(host_ptr, 'B', size); + CU_TRY(cuMemcpyDtoH(host_ptr, cuda_ptr, size)); + + // skip repeating 'A' at the end, while keeping the first 10 chars, + // and terminate the string for printing + int i = size - 1; + while (i > 10 && host_ptr[i] == 'A') --i; + host_ptr[i] = '\0'; + fprintf(stderr, "CUDA copied to host: %s\n", host_ptr); + + free(host_ptr); + CU_TRY(cuMemFree(cuda_ptr)); + CU_TRY(cuDevicePrimaryCtxRelease(cuda_device)); + // CU_TRY(cuCtxDestroy(cuda_ctx)); + fprintf(stderr, "CUDA freed\n"); + + return 0; +} \ No newline at end of file diff --git a/docs/design/S3-over-RDMA.md b/docs/design/S3-over-RDMA.md new file mode 100644 index 0000000000..1ff4c78d89 --- /dev/null +++ b/docs/design/S3-over-RDMA.md @@ -0,0 +1,109 @@ +# S3 over RDMA (EXPERIMENTAL) + +## Overview + +S3 over RDMA is a new technology that enhances I/O performance directly to the applications memory, 
or directly to GPU memory! RDMA is extremely efficient: it bypasses the operating system, the TCP stack, and much of the networking CPU overhead. Layering S3 on top of RDMA fits like a glove for modern applications. And the same endpoints can serve both RDMA and non-RDMA clients with a simple HTTP header.
+
+This feature is still EXPERIMENTAL and is not yet available for production use. This document outlines the usage and design of this feature.
+
+## What is needed to use S3 over RDMA?
+
+Hardware:
+- High performance RDMA network 100G/.../800G
+- InfiniBand or RoCE (must support DC transport)
+- Compute Nodes with optional GPU devices and NVIDIA CUDA toolkit
+- Storage Nodes with NVMe drives, can be the same as the compute nodes
+
+Software:
+- RHEL / Ubuntu
+- High performance file system (e.g. GPFS)
+- NooBaa RPM / build from source with RDMA support.
+- NVIDIA's cuObject (beta) and cuFile RDMA libraries.
+
+
+## Which applications can benefit from S3 over RDMA?
+
+- boto3 - S3 SDK for Python applications
+- s3-connector-for-pytorch - library for AI/ML applications (data loaders, checkpoints, etc.)
+- rclone - a standalone CLI that can copy data between files/dirs and S3
+- nodejs - using aws-sdk-js-v3 to store data collected from web services
+- (share with us your use case and we will add it to the list...)
+
+## Let's dig right in
+
+- Clone the noobaa-core repository
+- Install the required dependencies (nodejs, nasm, etc. - see the noobaa-core README)
+- Standard build - a simple `make` should succeed.
+
+Build the project with RDMA support:
+
+```bash
+$ make RDMA=1
+```
+
+or with RDMA and CUDA support:
+
+```bash
+$ make RDMA=1 CUDA=1
+```
+
+Define the following runtime variables:
+
+```bash
+CUDA_PATH="$(realpath /usr/local/cuda)"
+CUOBJ_PATH="$(realpath ../cuObject-0.8.1-Linux_x86_64/src)"
+CUFILE_ENV_PATH_JSON="$(realpath ../cuobj.json)"
+CUOBJ_LIBS="$CUOBJ_PATH/lib/libcuobjserver.so $CUOBJ_PATH/lib/libcuobjclient.so $CUOBJ_PATH/lib/libcufile.so.1.13.0 $CUOBJ_PATH/lib/libcufile_rdma.so.1.13.0"
+```
+
+**NOTE**: If compilation fails to find cuda_runtime.h use: `touch $CUOBJ_PATH/include/cuda_runtime.h`
+
+Create the configuration directory as described in [this doc](https://github.com/noobaa/noobaa-core/blob/master/docs/NooBaaNonContainerized/GettingStarted.md#configuration) (no need to build and install the RPM because we build from source), and finally start the noobaa server with RDMA support:
+
+```bash
+$ LD_PRELOAD=$CUOBJ_LIBS node src/cmd/nsfs
+```
+
+## Getting Started
+
+First we use the s3perf tool in the noobaa repo to test the RDMA performance.
Here is a basic example that reads the same 8MB file 10 continuously and reports the speed: + +```bash +$ LD_PRELOAD="$CUOBJ_LIBS" \ + CUFILE_ENV_PATH_JSON="$CUFILE_ENV_PATH_JSON" \ + UV_THREADPOOL_SIZE=16 \ + DISABLE_INIT_RANDOM_SEED=true \ + node src/tools/s3perf.js \ + --endpoint http://172.16.0.61:6001 \ + --access_key "AK" --secret_key "SK" \ + --bucket bucket1 --get file8M --samekey \ + --time 120 --size_units MB --size 8 --concur 8 --forks 6 --rdma +``` + +Will output something like: + +```sh +Feb-20 5:50:05.386 [/3039076] [LOG] CONSOLE:: S3: 11240.0 MB/sec (average 9650.2) | OPS: 1405 min:20.7ms max:50.8ms avg:34.2ms +Feb-20 5:50:06.386 [/3039076] [LOG] CONSOLE:: S3: 11216.0 MB/sec (average 9685.5) | OPS: 1402 min:20.3ms max:54.2ms avg:34.3ms +Feb-20 5:50:07.386 [/3039076] [LOG] CONSOLE:: S3: 11040.0 MB/sec (average 9715.4) | OPS: 1380 min:17.1ms max:55.8ms avg:34.7ms +Feb-20 5:50:08.387 [/3039076] [LOG] CONSOLE:: S3: 11024.0 MB/sec (average 9743.7) | OPS: 1378 min:17.4ms max:58.3ms avg:34.9ms +``` + +Remove the --rdma flag to compare the performance with and without RDMA. + +```bash +Feb-20 5:53:16.867 [/3040865] [LOG] CONSOLE:: S3: 3931.9 MB/sec (average 3785.4) | OPS: 495 min:53.1ms max:169.3ms avg:98.0ms +Feb-20 5:53:17.869 [/3040865] [LOG] CONSOLE:: S3: 3918.4 MB/sec (average 3788.3) | OPS: 490 min:58.0ms max:161.3ms avg:98.0ms +Feb-20 5:53:18.869 [/3040865] [LOG] CONSOLE:: S3: 3978.2 MB/sec (average 3792.3) | OPS: 497 min:50.9ms max:157.1ms avg:97.2ms +Feb-20 5:53:19.871 [/3040865] [LOG] CONSOLE:: S3: 3949.0 MB/sec (average 3795.5) | OPS: 489 min:52.5ms max:159.1ms avg:96.6ms +``` + +The --cuda flag tests the performance using the GPU memory. It can be used with or without the --rdma flag. Currently this is failing. Stay tuned. + +```bash + +## Next steps + +- Integrate S3 over RDMA to python applications +- Support multiple Server IP's +- Optimization for GPFS diff --git a/go/cmd/http_speed/http_speed.go b/go/cmd/http_speed/http_speed.go new file mode 100644 index 0000000000..db95bc7e90 --- /dev/null +++ b/go/cmd/http_speed/http_speed.go @@ -0,0 +1,144 @@ +package main + +import ( + "crypto/tls" + "flag" + "fmt" + "io" + "log" + "net/http" + "os" + "runtime/pprof" + "strconv" + "sync" + + "github.com/noobaa/noobaa-core/go/internal/goutils" +) + +var flagClient = flag.String("client", "", "run client") +var flagConcur = flag.Int("concur", 1, "concurrent client requests") +var flagSize = flag.Int("size", 1, "request size in MB") +var flagBuf = flag.Int("buf", 128*1024, "memory buffer size in bytes") +var flagPort = flag.Int("port", 50505, "tcp port to use") +var flagSSL = flag.Bool("ssl", false, "use ssl") +var flagProf = flag.String("prof", "", "write cpu profile to file") + +type Speedometer = goutils.Speedometer + +func main() { + flag.Parse() + if *flagProf != "" { + f, err := os.Create(*flagProf) + if err != nil { + log.Fatal(err) + } + pprof.StartCPUProfile(f) + defer pprof.StopCPUProfile() + } + if *flagClient != "" { + runClient() + } else { + runServer() + } +} + +func runClient() { + var addr string + var client *http.Client + var speedometer Speedometer + + host := *flagClient + ":" + strconv.Itoa(*flagPort) + if *flagSSL { + addr = "https://" + host + client = &http.Client{ + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{ + InsecureSkipVerify: true, + }, + }, + } + } else { + addr = "http://" + host + client = &http.Client{} + } + + speedometer.Init() + + wait := sync.WaitGroup{} + wait.Add(*flagConcur) + for i := 0; i < *flagConcur; i++ { + go 
runClientWorker(addr, client, &speedometer, &wait) + } + wait.Wait() +} + +func runClientWorker(addr string, client *http.Client, speedometer *Speedometer, wait *sync.WaitGroup) { + defer wait.Done() + for { + req, err := http.NewRequest("GET", addr, nil) + fatal(err) + res, err := client.Do(req) + fatal(err) + err = readBody(res.Body, speedometer) + fatal(err) + } +} + +func runServer() { + var speedometer Speedometer + speedometer.Init() + server := &http.Server{ + Addr: ":" + strconv.Itoa(*flagPort), + Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + var err error + err = r.Body.Close() + fatal(err) + err = writeBody(w, *flagSize*1024*1024, &speedometer) + fatal(err) + }), + } + if *flagSSL { + err, cert := goutils.GenerateCert() + fatal(err) + server.TLSConfig = &tls.Config{Certificates: []tls.Certificate{*cert}} + server.ListenAndServeTLS("", "") + } else { + fmt.Println("Listening on port", *flagPort) + server.ListenAndServe() + } +} + +func writeBody(body http.ResponseWriter, size int, speedometer *Speedometer) error { + buf := make([]byte, *flagBuf) + n := 0 + for n < size { + nwrite, err := body.Write(buf) + if err != nil { + return err + } + n += nwrite + speedometer.Update(uint64(nwrite)) + } + return nil +} + +func readBody(body io.ReadCloser, speedometer *Speedometer) error { + buf := make([]byte, *flagBuf) + for { + nread, err := body.Read(buf) + if err == io.EOF { + break + } + if err != nil { + return err + } + speedometer.Update(uint64(nread)) + } + return nil +} + +func fatal(err error) { + if err != nil { + log.Panic(err) + } +} diff --git a/go/cmd/tcp_speed/tcp_speed.go b/go/cmd/tcp_speed/tcp_speed.go new file mode 100644 index 0000000000..95d9c2582e --- /dev/null +++ b/go/cmd/tcp_speed/tcp_speed.go @@ -0,0 +1,139 @@ +package main + +import ( + "bufio" + "crypto/tls" + "encoding/binary" + "errors" + "flag" + "fmt" + "io" + "log" + "net" + "os" + "runtime/pprof" + "strconv" + + "github.com/noobaa/noobaa-core/go/internal/goutils" +) + +type Speedometer = goutils.Speedometer + +var flagClient = flag.String("client", "", "run client") +var flagSSL = flag.Bool("ssl", false, "use ssl") +var flagPort = flag.Int("port", 50505, "tcp port to use") +var flagProf = flag.String("prof", "", "write cpu profile to file") +var flagBuf = flag.Int("buf", 128*1024, "memory buffer size in bytes") +var flagFrame = flag.Bool("frame", false, "send/receive framed messages") + +func main() { + flag.Parse() + if *flagProf != "" { + f, err := os.Create(*flagProf) + if err != nil { + log.Fatal(err) + } + pprof.StartCPUProfile(f) + defer pprof.StopCPUProfile() + } + if *flagClient != "" { + runClient() + } else { + runServer() + } +} + +func runClient() { + var conn net.Conn + var err error + host := *flagClient + ":" + strconv.Itoa(*flagPort) + if *flagSSL { + config := &tls.Config{InsecureSkipVerify: true} + conn, err = tls.Dial("tcp", host, config) + } else { + conn, err = net.Dial("tcp", host) + } + fatal(err) + buf := make([]byte, *flagBuf) + var speedometer Speedometer + speedometer.Init() + for { + if *flagFrame { + n := uint32(len(buf)) // uint32(float64(len(buf))*(1+rand.Float64())/8) * 4 + err := binary.Write(conn, binary.BigEndian, n) + fatal(err) + // nwrite, err := conn.Write(buf[0:n]) + nwrite, err := conn.Write(buf) + if err == io.EOF { + break + } + fatal(err) + speedometer.Update(uint64(nwrite)) + + } else { + nwrite, err := conn.Write(buf) + if err == io.EOF { + break + } + fatal(err) + speedometer.Update(uint64(nwrite)) + } + } + conn.Close() +} + +func 
runServer() { + var listener net.Listener + var err error + address := ":" + strconv.Itoa(*flagPort) + if *flagSSL { + err, cert := goutils.GenerateCert() + fatal(err) + config := &tls.Config{Certificates: []tls.Certificate{*cert}} + listener, err = tls.Listen("tcp", address, config) + } else { + listener, err = net.Listen("tcp", address) + } + fatal(err) + fmt.Println("Listening on port", *flagPort) + conn, err := listener.Accept() + fatal(err) + listener.Close() + // reader := conn + reader := bufio.NewReaderSize(conn, *flagBuf) + buf := make([]byte, *flagBuf) + var speedometer Speedometer + speedometer.Init() + for { + if *flagFrame { + var n uint32 + err := binary.Read(reader, binary.BigEndian, &n) + if err == io.EOF { + break + } + fatal(err) + if int(n) > len(buf) { + fatal(errors.New("Frame too big")) + } + nread, err := io.ReadAtLeast(reader, buf, int(n)) + if err == io.EOF { + break + } + fatal(err) + speedometer.Update(uint64(nread)) + } else { + nread, err := reader.Read(buf) + if err == io.EOF { + break + } + fatal(err) + speedometer.Update(uint64(nread)) + } + } +} + +func fatal(err error) { + if err != nil { + log.Panic(err) + } +} diff --git a/go/go.mod b/go/go.mod new file mode 100644 index 0000000000..87a57c7884 --- /dev/null +++ b/go/go.mod @@ -0,0 +1,3 @@ +module github.com/noobaa/noobaa-core/go + +go 1.24.2 diff --git a/go/internal/goutils/goutils.go b/go/internal/goutils/goutils.go new file mode 100644 index 0000000000..895026cc91 --- /dev/null +++ b/go/internal/goutils/goutils.go @@ -0,0 +1,120 @@ +package goutils + +import ( + "bytes" + "crypto/rand" + "crypto/rsa" + "crypto/tls" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" + "fmt" + "io" + "math/big" + "net" + "time" +) + +type SpeedBodyReader struct { + io.ReadCloser + n uint64 + reqsize uint64 + speedometer *Speedometer +} + +func (r *SpeedBodyReader) Read(p []byte) (n int, err error) { + if r.n > r.reqsize { + return 0, io.EOF + } + l := uint64(len(p)) + r.n += l + r.speedometer.Update(l) + return len(p), nil +} + +func (r *SpeedBodyReader) Close() error { + return nil +} + +// Speedometer is a speed measurement util +type Speedometer struct { + bytes uint64 + lastBytes uint64 + lastTime time.Time + inputChan chan uint64 +} + +// Init a speedometer +func (s *Speedometer) Init() { + s.lastTime = time.Now() + s.inputChan = make(chan uint64, 64*1024) + go func() { + for { + bytes := <-s.inputChan + s.bytes += bytes + took := time.Since(s.lastTime).Seconds() + if took >= 1 { + fmt.Printf("%7.1f MB/sec \n", float64(s.bytes-s.lastBytes)/1024/1024/took) + s.lastTime = time.Now() + s.lastBytes = s.bytes + } + } + }() +} + +// Update a speedometer +func (s *Speedometer) Update(bytes uint64) { + s.inputChan <- bytes +} + +// GenerateCert generates a self signed TLS certificate +func GenerateCert() (error, *tls.Certificate) { + + cert := &x509.Certificate{ + SerialNumber: big.NewInt(1), + Subject: pkix.Name{Organization: []string{"Acme Co"}}, + NotBefore: time.Now(), + NotAfter: time.Now().AddDate(1, 0, 0), + IsCA: true, + BasicConstraintsValid: true, + KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature | x509.KeyUsageCertSign, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth, x509.ExtKeyUsageClientAuth}, + IPAddresses: []net.IP{net.ParseIP("127.0.0.1")}, + DNSNames: nil, + } + + privateKey, err := rsa.GenerateKey(rand.Reader, 4096) + if err != nil { + return err, nil + } + + certBytes, err := x509.CreateCertificate(rand.Reader, cert, cert, &privateKey.PublicKey, privateKey) + if err 
!= nil { + return err, nil + } + + certPEM := new(bytes.Buffer) + err = pem.Encode(certPEM, &pem.Block{ + Type: "CERTIFICATE", + Bytes: certBytes, + }) + if err != nil { + return err, nil + } + + privateKeyPEM := new(bytes.Buffer) + err = pem.Encode(privateKeyPEM, &pem.Block{ + Type: "RSA PRIVATE KEY", + Bytes: x509.MarshalPKCS1PrivateKey(privateKey), + }) + if err != nil { + return err, nil + } + + tlsCert, err := tls.X509KeyPair(certPEM.Bytes(), privateKeyPEM.Bytes()) + if err != nil { + return err, nil + } + + return nil, &tlsCert +} diff --git a/package-lock.json b/package-lock.json index aa23008a77..c2cc1f27d7 100644 --- a/package-lock.json +++ b/package-lock.json @@ -16,7 +16,7 @@ "@azure/monitor-query": "1.3.1", "@azure/storage-blob": "12.27.0", "@google-cloud/storage": "7.15.2", - "@smithy/node-http-handler": "3.3.3", + "@smithy/node-http-handler": "4.0.3", "ajv": "8.17.1", "aws-sdk": "2.1692.0", "bcrypt": "5.1.1", @@ -49,6 +49,7 @@ "pg": "8.14.1", "ping": "0.4.4", "prom-client": "15.1.3", + "protobufjs": "7.4.0", "ps-node": "0.1.6", "seedrandom": "3.0.5", "setimmediate": "1.0.5", @@ -2030,6 +2031,12 @@ "dev": true, "license": "MIT" }, + "node_modules/@datadog/sketches-js": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/@datadog/sketches-js/-/sketches-js-2.1.1.tgz", + "integrity": "sha512-d5RjycE+MObE/hU+8OM5Zp4VjTwiPLRa8299fj7muOmR16fb942z8byoMbCErnGh0lBevvgkGrLclQDvINbIyg==", + "license": "Apache-2.0" + }, "node_modules/@eslint-community/eslint-utils": { "version": "4.5.1", "resolved": "https://registry.npmjs.org/@eslint-community/eslint-utils/-/eslint-utils-4.5.1.tgz", @@ -3010,6 +3017,70 @@ "node": ">=14" } }, + "node_modules/@protobufjs/aspromise": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz", + "integrity": "sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/base64": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz", + "integrity": "sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/codegen": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.4.tgz", + "integrity": "sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/eventemitter": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz", + "integrity": "sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/fetch": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.0.tgz", + "integrity": "sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==", + "license": "BSD-3-Clause", + "dependencies": { + "@protobufjs/aspromise": "^1.1.1", + "@protobufjs/inquire": "^1.1.0" + } + }, + "node_modules/@protobufjs/float": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz", + "integrity": "sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==", + "license": "BSD-3-Clause" + }, + 
"node_modules/@protobufjs/inquire": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.0.tgz", + "integrity": "sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/path": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz", + "integrity": "sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/pool": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz", + "integrity": "sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/utf8": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz", + "integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==", + "license": "BSD-3-Clause" + }, "node_modules/@sinclair/typebox": { "version": "0.27.8", "resolved": "https://registry.npmjs.org/@sinclair/typebox/-/typebox-0.27.8.tgz", @@ -3432,83 +3503,19 @@ } }, "node_modules/@smithy/node-http-handler": { - "version": "3.3.3", - "resolved": "https://registry.npmjs.org/@smithy/node-http-handler/-/node-http-handler-3.3.3.tgz", - "integrity": "sha512-BrpZOaZ4RCbcJ2igiSNG16S+kgAc65l/2hmxWdmhyoGWHTLlzQzr06PXavJp9OBlPEG/sHlqdxjWmjzV66+BSQ==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/abort-controller": "^3.1.9", - "@smithy/protocol-http": "^4.1.8", - "@smithy/querystring-builder": "^3.0.11", - "@smithy/types": "^3.7.2", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=16.0.0" - } - }, - "node_modules/@smithy/node-http-handler/node_modules/@smithy/abort-controller": { - "version": "3.1.9", - "resolved": "https://registry.npmjs.org/@smithy/abort-controller/-/abort-controller-3.1.9.tgz", - "integrity": "sha512-yiW0WI30zj8ZKoSYNx90no7ugVn3khlyH/z5W8qtKBtVE6awRALbhSG+2SAHA1r6bO/6M9utxYKVZ3PCJ1rWxw==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/types": "^3.7.2", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=16.0.0" - } - }, - "node_modules/@smithy/node-http-handler/node_modules/@smithy/protocol-http": { - "version": "4.1.8", - "resolved": "https://registry.npmjs.org/@smithy/protocol-http/-/protocol-http-4.1.8.tgz", - "integrity": "sha512-hmgIAVyxw1LySOwkgMIUN0kjN8TG9Nc85LJeEmEE/cNEe2rkHDUWhnJf2gxcSRFLWsyqWsrZGw40ROjUogg+Iw==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/types": "^3.7.2", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=16.0.0" - } - }, - "node_modules/@smithy/node-http-handler/node_modules/@smithy/querystring-builder": { - "version": "3.0.11", - "resolved": "https://registry.npmjs.org/@smithy/querystring-builder/-/querystring-builder-3.0.11.tgz", - "integrity": "sha512-u+5HV/9uJaeLj5XTb6+IEF/dokWWkEqJ0XiaRRogyREmKGUgZnNecLucADLdauWFKUNbQfulHFEZEdjwEBjXRg==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/types": "^3.7.2", - "@smithy/util-uri-escape": "^3.0.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=16.0.0" - } - }, - "node_modules/@smithy/node-http-handler/node_modules/@smithy/types": { - "version": "3.7.2", - "resolved": "https://registry.npmjs.org/@smithy/types/-/types-3.7.2.tgz", - "integrity": 
"sha512-bNwBYYmN8Eh9RyjS1p2gW6MIhSO2rl7X9QeLM8iTdcGRP+eDiIWDt66c9IysCc22gefKszZv+ubV9qZc7hdESg==", - "license": "Apache-2.0", - "dependencies": { - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=16.0.0" - } - }, - "node_modules/@smithy/node-http-handler/node_modules/@smithy/util-uri-escape": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/@smithy/util-uri-escape/-/util-uri-escape-3.0.0.tgz", - "integrity": "sha512-LqR7qYLgZTD7nWLBecUi4aqolw8Mhza9ArpNEQ881MJJIU2sE5iHCK6TdyqqzcDLy0OPe10IY4T8ctVdtynubg==", + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/@smithy/node-http-handler/-/node-http-handler-4.0.3.tgz", + "integrity": "sha512-dYCLeINNbYdvmMLtW0VdhW1biXt+PPCGazzT5ZjKw46mOtdgToQEwjqZSS9/EN8+tNs/RO0cEWG044+YZs97aA==", "license": "Apache-2.0", "dependencies": { + "@smithy/abort-controller": "^4.0.1", + "@smithy/protocol-http": "^5.0.1", + "@smithy/querystring-builder": "^4.0.1", + "@smithy/types": "^4.1.0", "tslib": "^2.6.2" }, "engines": { - "node": ">=16.0.0" + "node": ">=18.0.0" } }, "node_modules/@smithy/property-provider": { @@ -8912,6 +8919,12 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/long": { + "version": "5.3.1", + "resolved": "https://registry.npmjs.org/long/-/long-5.3.1.tgz", + "integrity": "sha512-ka87Jz3gcx/I7Hal94xaN2tZEOPoUOEVftkQqZx2EeQRN7LGdfLlI3FvZ+7WDplm+vK2Urx9ULrvSowtdCieng==", + "license": "Apache-2.0" + }, "node_modules/lru-cache": { "version": "5.1.1", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", @@ -10979,6 +10992,30 @@ "node": ">= 6" } }, + "node_modules/protobufjs": { + "version": "7.4.0", + "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.4.0.tgz", + "integrity": "sha512-mRUWCc3KUU4w1jU8sGxICXH/gNS94DvI1gxqDvBzhj1JpcsimQkYiOJfwsPUykUI5ZaspFbSgmBLER8IrQ3tqw==", + "hasInstallScript": true, + "license": "BSD-3-Clause", + "dependencies": { + "@protobufjs/aspromise": "^1.1.2", + "@protobufjs/base64": "^1.1.2", + "@protobufjs/codegen": "^2.0.4", + "@protobufjs/eventemitter": "^1.1.0", + "@protobufjs/fetch": "^1.1.0", + "@protobufjs/float": "^1.0.2", + "@protobufjs/inquire": "^1.1.0", + "@protobufjs/path": "^1.1.2", + "@protobufjs/pool": "^1.1.0", + "@protobufjs/utf8": "^1.1.0", + "@types/node": ">=13.7.0", + "long": "^5.0.0" + }, + "engines": { + "node": ">=12.0.0" + } + }, "node_modules/proxy-addr": { "version": "2.0.7", "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.7.tgz", diff --git a/package.json b/package.json index 76c3ecc810..c8a3652488 100644 --- a/package.json +++ b/package.json @@ -76,8 +76,9 @@ "@azure/identity": "4.8.0", "@azure/monitor-query": "1.3.1", "@azure/storage-blob": "12.27.0", + "@datadog/sketches-js": "2.1.1", "@google-cloud/storage": "7.15.2", - "@smithy/node-http-handler": "3.3.3", + "@smithy/node-http-handler": "4.0.3", "ajv": "8.17.1", "aws-sdk": "2.1692.0", "bcrypt": "5.1.1", @@ -110,6 +111,7 @@ "pg": "8.14.1", "ping": "0.4.4", "prom-client": "15.1.3", + "protobufjs": "7.4.0", "ps-node": "0.1.6", "seedrandom": "3.0.5", "setimmediate": "1.0.5", diff --git a/rdma-run.sh b/rdma-run.sh index e1ed6d943e..d2a67ee1cf 100644 --- a/rdma-run.sh +++ b/rdma-run.sh @@ -1,8 +1,8 @@ ## variables CUDA_PATH="$(realpath /usr/local/cuda)" -CUOBJ_PATH="$(realpath ../cuObject-0.7.2-Linux_x86_64/src)" +CUOBJ_PATH="$(realpath ../cuObject-0.8.1-Linux_x86_64/src)" CUFILE_ENV_PATH_JSON="$(realpath ../cuobj.json)" -RDMA_LIBS="$CUOBJ_PATH/lib/libcuobjserver.so $CUOBJ_PATH/lib/libcuobjclient.so $CUOBJ_PATH/lib/libcufile.so.1.13.0 
$CUOBJ_PATH/lib/libcufile_rdma.so.1.13.0" +CUOBJ_LIBS="$CUOBJ_PATH/lib/libcuobjserver.so $CUOBJ_PATH/lib/libcuobjclient.so $CUOBJ_PATH/lib/libcufile.so.1.13.0 $CUOBJ_PATH/lib/libcufile_rdma.so.1.13.0" ## git push to hosts ./rdma-push.sh @@ -21,9 +21,9 @@ rm -rf build/Release/obj.target/{rdma,cuda}_napi/ && ## rdma_speed -UV_THREADPOOL_SIZE=4 LD_PRELOAD="$RDMA_LIBS" node src/tools/rdma_speed.js --server -UV_THREADPOOL_SIZE=4 LD_PRELOAD="$RDMA_LIBS" CUFILE_ENV_PATH_JSON="$CUFILE_ENV_PATH_JSON" node src/tools/rdma_speed.js --client --op GET -UV_THREADPOOL_SIZE=4 LD_PRELOAD="$RDMA_LIBS" CUFILE_ENV_PATH_JSON="$CUFILE_ENV_PATH_JSON" node src/tools/rdma_speed.js --client --op PUT +UV_THREADPOOL_SIZE=4 LD_PRELOAD="$CUOBJ_LIBS" node src/tools/rdma_speed.js --server +UV_THREADPOOL_SIZE=4 LD_PRELOAD="$CUOBJ_LIBS" CUFILE_ENV_PATH_JSON="$CUFILE_ENV_PATH_JSON" node src/tools/rdma_speed.js --client --op GET +UV_THREADPOOL_SIZE=4 LD_PRELOAD="$CUOBJ_LIBS" CUFILE_ENV_PATH_JSON="$CUFILE_ENV_PATH_JSON" node src/tools/rdma_speed.js --client --op PUT # --op PUT --forks 1 --concur 16 # --pool_size $((4*32)) --size 32 # --perf-basic-prof @@ -34,8 +34,15 @@ node src/tools/http_speed.js --client 172.16.0.61 --buf $((8*1024*1024)) --size node src/tools/http_speed.js --client 172.16.0.61 --buf $((8*1024*1024)) --size 8 --forks 8 --concur 8 --method PUT ## noobaa server (local ips 172.16.0.61 and 172.16.0.71) -LD_PRELOAD="$RDMA_LIBS" LOCAL_IP=172.16.0.61 node src/cmd/nsfs.js -LD_PRELOAD="$RDMA_LIBS" LOCAL_IP=172.16.0.71 node src/cmd/nsfs.js +LD_PRELOAD="$CUOBJ_LIBS" LOCAL_IP=172.16.0.61 node src/cmd/nsfs.js +LD_PRELOAD="$CUOBJ_LIBS" LOCAL_IP=172.16.0.71 node src/cmd/nsfs.js + +## cuobj benchmark +LD_PRELOAD="$CUOBJ_LIBS" benchmark/cuobjio_server -a 172.16.0.61 -P /root/guym/cuobjio_server_objects +LD_PRELOAD="$CUOBJ_LIBS" CUFILE_ENV_PATH_JSON="$CUFILE_ENV_PATH_JSON" benchmark/cuobjio_client -a 172.16.0.61 -T 10 -s $((8*1024*1024)) -t 16 -i 1 -m 0 // 16x PUT (CUDA_MALLOC) +LD_PRELOAD="$CUOBJ_LIBS" CUFILE_ENV_PATH_JSON="$CUFILE_ENV_PATH_JSON" benchmark/cuobjio_client -a 172.16.0.61 -T 10 -s $((8*1024*1024)) -t 16 -i 1 -m 1 // 16x PUT (HOST_MEM) +LD_PRELOAD="$CUOBJ_LIBS" CUFILE_ENV_PATH_JSON="$CUFILE_ENV_PATH_JSON" benchmark/cuobjio_client -a 172.16.0.61 -T 10 -s $((8*1024*1024)) -t 16 -i 0 -m 0 // 16x GET (CUDA_MALLOC) +LD_PRELOAD="$CUOBJ_LIBS" CUFILE_ENV_PATH_JSON="$CUFILE_ENV_PATH_JSON" benchmark/cuobjio_client -a 172.16.0.61 -T 10 -s $((8*1024*1024)) -t 16 -i 0 -m 1 // 16x GET (HOST_MEM) #################################################### ## client (local ips 172.16.0.62 and 172.16.0.72) ## @@ -56,7 +63,7 @@ DISABLE_INIT_RANDOM_SEED=true \ UV_THREADPOOL_SIZE=4 \ DISABLE_INIT_RANDOM_SEED=true \ - LD_PRELOAD="$RDMA_LIBS" \ + LD_PRELOAD="$CUOBJ_LIBS" \ CUFILE_ENV_PATH_JSON="$CUFILE_ENV_PATH_JSON" \ node src/tools/s3perf.js \ --local_ip 172.16.0.62 \ @@ -87,3 +94,8 @@ UV_THREADPOOL_SIZE=4 \ --noclear \ --list-existing +################################# +# server cpu +while true; do top -b -c -w 500 -d 0.5 -n 10 | grep noobaa | awk '{s+=$9} END {print strftime("%T"),"CPU% =",s/10}'; done +# client cpu +while true; do top -b -c -w 500 -d 0.5 -n 10 | grep node | awk '{s+=$9} END {print strftime("%T"),"CPU% =",s/10}'; done diff --git a/src/agent/block_store_speed.js b/src/agent/block_store_speed.js index d85b34b858..0119601a87 100644 --- a/src/agent/block_store_speed.js +++ b/src/agent/block_store_speed.js @@ -25,10 +25,15 @@ argv.timeout = argv.timeout || 60000; let block_index = 0; -const speedometer = new 
Speedometer('Block Store Speed'); -speedometer.run_workers(argv.forks, main, argv); +const speedometer = new Speedometer({ + name: 'Block Store Speed', + argv, + num_workers: argv.forks, + workers_func, +}); +speedometer.start(); -async function main() { +async function workers_func() { const rpc = api.new_rpc(); const client = rpc.new_client(); const signal_client = rpc.new_client(); @@ -40,20 +45,18 @@ async function main() { system: argv.system, }); await Promise.all(Array(argv.concur).fill(0).map(() => worker(client))); - process.exit(); } async function worker(client) { while (block_index < argv.count) { block_index += 1; - await write_block(client); - speedometer.update(argv.size); + await speedometer.measure(async () => write_block(client)); } } async function write_block(client) { const block_id = new mongodb.ObjectId(); - return client.block_store.write_block({ + await client.block_store.write_block({ [RPC_BUFFERS]: { data: Buffer.allocUnsafe(argv.size) }, block_md: { id: block_id, @@ -64,4 +67,5 @@ async function write_block(client) { address: argv.address, timeout: argv.timeout, }); + return argv.size; } diff --git a/src/endpoint/s3/ops/s3_get_object.js b/src/endpoint/s3/ops/s3_get_object.js index 6a183fa336..4d5493a0d4 100644 --- a/src/endpoint/s3/ops/s3_get_object.js +++ b/src/endpoint/s3/ops/s3_get_object.js @@ -5,9 +5,12 @@ const dbg = require('../../../util/debug_module')(__filename); const S3Error = require('../s3_errors').S3Error; const s3_utils = require('../s3_utils'); const http_utils = require('../../../util/http_utils'); +const rdma_utils = require('../../../util/rdma_utils'); /** * http://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectGET.html + * @param {nb.S3Request} req + * @param {nb.S3Response} res */ async function get_object(req, res) { @@ -16,6 +19,7 @@ async function get_object(req, res) { const noobaa_trigger_agent = agent_header && agent_header.includes('exec-env/NOOBAA_FUNCTION'); const encryption = s3_utils.parse_encryption(req); const version_id = s3_utils.parse_version_id(req.query.versionId); + const rdma_info = rdma_utils.parse_rdma_info(req); let part_number; // If set, part_number should be positive integer from 1 to 10000 if (req.query.partNumber) { @@ -29,6 +33,7 @@ async function get_object(req, res) { version_id, md_conditions, encryption, + rdma_info, }; if (req.query.get_from_cache !== undefined) { md_params.get_from_cache = true; @@ -60,6 +65,7 @@ async function get_object(req, res) { noobaa_trigger_agent, md_conditions, encryption, + rdma_info, }; if (md_params.get_from_cache) { diff --git a/src/endpoint/s3/ops/s3_head_object.js b/src/endpoint/s3/ops/s3_head_object.js index 1cd3543c3a..e64bf646d8 100644 --- a/src/endpoint/s3/ops/s3_head_object.js +++ b/src/endpoint/s3/ops/s3_head_object.js @@ -3,20 +3,25 @@ // const S3Error = require('../s3_errors').S3Error; const s3_utils = require('../s3_utils'); -const http_utils = require('../../../util/http_utils'); const S3Error = require('../s3_errors').S3Error; +const http_utils = require('../../../util/http_utils'); +const rdma_utils = require('../../../util/rdma_utils'); /** * http://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html + * @param {nb.S3Request} req + * @param {nb.S3Response} res */ async function head_object(req, res) { const encryption = s3_utils.parse_encryption(req); + const rdma_info = rdma_utils.parse_rdma_info(req); const params = { bucket: req.params.bucket, key: req.params.key, version_id: s3_utils.parse_version_id(req.query.versionId), md_conditions: 
http_utils.get_md_conditions(req), - encryption + encryption, + rdma_info, }; if (req.query.partNumber) { params.part_number = s3_utils.parse_part_number(req.query.partNumber, S3Error.InvalidArgument); @@ -29,6 +34,7 @@ async function head_object(req, res) { s3_utils.set_response_object_md(res, object_md); s3_utils.set_encryption_response_headers(req, res, object_md.encryption); http_utils.set_response_headers_from_request(req, res); + rdma_utils.set_rdma_response_header(req, res, rdma_info, object_md.rdma_reply); } module.exports = { diff --git a/src/endpoint/s3/ops/s3_put_object.js b/src/endpoint/s3/ops/s3_put_object.js index d8bbdddd1e..74b7c1bcb7 100644 --- a/src/endpoint/s3/ops/s3_put_object.js +++ b/src/endpoint/s3/ops/s3_put_object.js @@ -1,26 +1,31 @@ /* Copyright (C) 2016 NooBaa */ 'use strict'; +const mime = require('mime-types'); const dbg = require('../../../util/debug_module')(__filename); const s3_utils = require('../s3_utils'); const S3Error = require('../s3_errors').S3Error; const http_utils = require('../../../util/http_utils'); -const mime = require('mime-types'); +const rdma_utils = require('../../../util/rdma_utils'); const config = require('../../../../config'); const s3_error_options = { ErrorClass: S3Error, error_missing_content_length: S3Error.MissingContentLength }; + /** * http://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectPUT.html * http://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectCOPY.html + * @param {nb.S3Request} req + * @param {nb.S3Response} res */ async function put_object(req, res) { const encryption = s3_utils.parse_encryption(req); const copy_source = s3_utils.parse_copy_source(req); const tagging = s3_utils.parse_tagging_header(req); const storage_class = s3_utils.parse_storage_class_header(req); + const rdma_info = rdma_utils.parse_rdma_info(req); if (config.DENY_UPLOAD_TO_STORAGE_CLASS_STANDARD && storage_class === s3_utils.STORAGE_CLASS_STANDARD) { throw new S3Error(S3Error.InvalidStorageClass); } @@ -45,6 +50,7 @@ async function put_object(req, res) { content_encoding: req.headers['content-encoding'], copy_source, source_stream, + rdma_info, size, md5_b64, sha256_b64, @@ -57,20 +63,23 @@ async function put_object(req, res) { encryption, lock_settings, storage_class, - azure_invalid_md_header: req.headers['azure-metadata-handling'] || undefined + azure_invalid_md_header: req.headers['azure-metadata-handling'] || undefined, }); if (reply.version_id && reply.version_id !== 'null') { res.setHeader('x-amz-version-id', reply.version_id); } s3_utils.set_encryption_response_headers(req, res, reply.encryption); + rdma_utils.set_rdma_response_header(req, res, rdma_info, reply.rdma_reply); res.size_for_notif = size || reply.size; if (copy_source) { // TODO: This needs to be checked regarding copy between diff namespaces // In that case we do not have the copy_source property and just read and upload the stream - if (reply.copy_source && reply.copy_source.version_id) res.setHeader('x-amz-copy-source-version-id', reply.copy_source.version_id); + if (reply.copy_source && reply.copy_source.version_id) { + res.setHeader('x-amz-copy-source-version-id', reply.copy_source.version_id); + } return { CopyObjectResult: { // TODO S3 last modified and etag should be for the new part diff --git a/src/endpoint/s3/ops/s3_put_object_uploadId.js b/src/endpoint/s3/ops/s3_put_object_uploadId.js index f165f4d0e6..a2a34e35dd 100644 --- a/src/endpoint/s3/ops/s3_put_object_uploadId.js +++ b/src/endpoint/s3/ops/s3_put_object_uploadId.js @@ -5,6 +5,7 @@ const dbg = 
require('../../../util/debug_module')(__filename); const S3Error = require('../s3_errors').S3Error; const s3_utils = require('../s3_utils'); const http_utils = require('../../../util/http_utils'); +const rdma_utils = require('../../../util/rdma_utils'); const s3_error_options = { ErrorClass: S3Error, @@ -13,12 +14,15 @@ const s3_error_options = { /** * http://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadUploadPart.html * http://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadUploadPartCopy.html + * @param {nb.S3Request} req + * @param {nb.S3Response} res */ async function put_object_uploadId(req, res) { const encryption = s3_utils.parse_encryption(req); const num = s3_utils.parse_part_number(req.query.partNumber, S3Error.InvalidArgument); const copy_source = s3_utils.parse_copy_source(req); + const rdma_info = rdma_utils.parse_rdma_info(req); // Copy request sends empty content and not relevant to the object data const { size, md5_b64, sha256_b64 } = copy_source ? {} : { @@ -39,6 +43,7 @@ async function put_object_uploadId(req, res) { num, copy_source, source_stream, + rdma_info, size, md5_b64, sha256_b64, @@ -53,6 +58,7 @@ async function put_object_uploadId(req, res) { throw e; } s3_utils.set_encryption_response_headers(req, res, reply.encryption); + rdma_utils.set_rdma_response_header(req, res, rdma_info, reply.rdma_reply); // TODO: We do not return the VersionId of the object that was copied res.setHeader('ETag', `"${reply.etag}"`); diff --git a/src/native/common.gypi b/src/native/common.gypi index bafc205f86..edacda9f71 100644 --- a/src/native/common.gypi +++ b/src/native/common.gypi @@ -1,15 +1,6 @@ # Copyright (C) 2016 NooBaa { 'variables': { # NOTE: variables in the same scope cannot expand each other! - 'cflags_warnings': [ - '-W', - '-Wall', - '-Wextra', - '-Werror', - '-Wpedantic', -#Can be removed when https://github.com/nodejs/nan/issues/953 is resolved. - '-Wno-error=deprecated-declarations' - ], # see https://nodejs.org/docs/latest-v12.x/api/process.html#process_process_arch # Possible values are: # 'arm', 'arm64', 'ia32', 'mips','mipsel', 'ppc', 'ppc64', 's390', 's390x', 'x32', and 'x64'. 
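The four S3 op handlers touched above (get object, head object, put object, and upload part) all follow the same pattern for threading the RDMA information through the request. A minimal sketch of that pattern follows; only the rdma_utils calls are taken from this patch, while the handler name and the object_sdk method are placeholders:

```js
// Hypothetical S3 op handler illustrating the rdma_info flow.
// Only rdma_utils.parse_rdma_info() and rdma_utils.set_rdma_response_header()
// appear in the patch above; 'some_op' stands for the relevant object_sdk call.
async function some_s3_op(req, res) {
    // extract the client's RDMA descriptor from the request (if present)
    const rdma_info = rdma_utils.parse_rdma_info(req);

    // pass it down with the rest of the op params so the data path can use RDMA
    const reply = await req.object_sdk.some_op({
        bucket: req.params.bucket,
        key: req.params.key,
        rdma_info,
    });

    // report the RDMA result back to the client on the response headers
    rdma_utils.set_rdma_response_header(req, res, rdma_info, reply.rdma_reply);
}
```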
diff --git a/src/native/cuda/cuda_napi.cpp b/src/native/cuda/cuda_napi.cpp new file mode 100644 index 0000000000..1b95c82d25 --- /dev/null +++ b/src/native/cuda/cuda_napi.cpp @@ -0,0 +1,280 @@ +/* Copyright (C) 2016 NooBaa */ +#include "../util/common.h" +#include "../util/napi.h" +#include "../util/worker.h" +#include "cuda.h" + +#define CU_TRY(fn) \ + do { \ + CUresult r = fn; \ + if (r != CUDA_SUCCESS) { \ + const char* cuda_err = ""; \ + cuGetErrorName(r, &cuda_err); \ + throw Napi::Error::New(env, \ + XSTR() << __func__ << " " #fn " " \ + << DVAL(r) << DVAL(cuda_err)); \ + } \ + } while (0) + +#define CU_WARN(fn) \ + do { \ + CUresult r = fn; \ + if (r != CUDA_SUCCESS) { \ + const char* cuda_err = ""; \ + cuGetErrorName(r, &cuda_err); \ + LOG("WARNING: " \ + << __func__ << " " #fn " " \ + << DVAL(r) << DVAL(cuda_err)); \ + } \ + } while (0) + +#define CUDA_TRY(fn) \ + do { \ + cudaError_t r = fn; \ + if (r != cudaSuccess) { \ + const char* cuda_err = cudaGetErrorName(r); \ + throw Napi::Error::New(env, \ + XSTR() << __func__ << " " #fn " " \ + << DVAL(r) << DVAL(cuda_err)); \ + } \ + } while (0) + +namespace noobaa +{ + +DBG_INIT(1); + +CUdevice cuda_napi_dev_num = -1; +CUdevice cuda_napi_dev = -1; +CUcontext cuda_napi_ctx = 0; + +// About context management: +// read this most helpful answer - https://forums.developer.nvidia.com/t/cuda-context-and-threading/26625/6 +// Main points: +// 1) a context belongs to a single device. +// 2) a thread has a single context bound at a time (ignoring context stack stuff) +// 3) a context can be bound to multiple threads simultaneously +// so we use the primary context on each device and bind it to our worker threads. +// in order to bind it we +static void +cuda_napi_ctx_init(Napi::Env env, int dev_num = 0) +{ + if (cuda_napi_dev_num == dev_num && cuda_napi_ctx) return; + + CUdevice dev = -1; + CUcontext ctx = 0; + + CU_TRY(cuInit(0)); + CU_TRY(cuDeviceGet(&dev, dev_num)); + + CU_TRY(cuDevicePrimaryCtxRetain(&ctx, dev)); + CU_TRY(cuCtxSetCurrent(ctx)); + + cuda_napi_dev_num = dev_num; + cuda_napi_dev = dev; + cuda_napi_ctx = ctx; + + // rdma_napi needs worker threads to set the cuda context + // and since we depend on rdma_napi we set the context here. 
+ extern CUcontext rdma_napi_cuda_ctx; + rdma_napi_cuda_ctx = ctx; + + LOG("cuda_napi_ctx_init " << DVAL(dev_num) << DVAL(dev) << DVAL(ctx)); +} + +struct CudaSlice +{ + CUdeviceptr ptr; + size_t size; + + CudaSlice slice(size_t start, size_t end) + { + if (start > size) start = size; + if (end > size) end = size; + if (end < start) end = start; + return { ptr + start, end - start }; + } + + friend std::ostream& + operator<<(std::ostream& os, CudaSlice& x) + { + return os << "[" << ((void*)x.ptr) << "+" << ((void*)x.size) << "]"; + } +}; + +struct CudaMemory : public Napi::ObjectWrap +{ + static Napi::FunctionReference constructor; + CudaSlice mem; + + static Napi::Function Init(Napi::Env env); + CudaMemory(const Napi::CallbackInfo& info); + ~CudaMemory(); + Napi::Value free(const Napi::CallbackInfo& info); + Napi::Value fill(const Napi::CallbackInfo& info); + Napi::Value as_buffer(const Napi::CallbackInfo& info); + Napi::Value copy_to_host_new(const Napi::CallbackInfo& info); + Napi::Value copy_to_host(const Napi::CallbackInfo& info); + Napi::Value copy_from_host(const Napi::CallbackInfo& info); +}; + +Napi::FunctionReference CudaMemory::constructor; + +Napi::Function +CudaMemory::Init(Napi::Env env) +{ + constructor = Napi::Persistent(DefineClass(env, + "CudaMemory", + { + InstanceMethod<&CudaMemory::free>("free"), + InstanceMethod<&CudaMemory::fill>("fill"), + InstanceMethod<&CudaMemory::as_buffer>("as_buffer"), + InstanceMethod<&CudaMemory::copy_to_host_new>("copy_to_host_new"), + InstanceMethod<&CudaMemory::copy_to_host>("copy_to_host"), + InstanceMethod<&CudaMemory::copy_from_host>("copy_from_host"), + })); + constructor.SuppressDestruct(); + return constructor.Value(); +} + +CudaMemory::CudaMemory(const Napi::CallbackInfo& info) + : Napi::ObjectWrap(info) +{ + auto env = info.Env(); + size_t size = info[0].As().Int64Value(); + CUdeviceptr ptr = 0; + cuda_napi_ctx_init(env); + CU_TRY(cuMemAlloc(&ptr, size)); + mem = { ptr, size }; + DBG1("CudaMemory::ctor " << DVAL(mem)); +} + +/** + * On dtor free the memory and reset the pointer and size to 0. + */ +CudaMemory::~CudaMemory() +{ + if (mem.ptr) { + auto free_mem = mem; + mem = { 0, 0 }; + CU_WARN(cuMemFree(free_mem.ptr)); + DBG1("CudaMemory::dtor " << DVAL(free_mem)); + } +} + +/** + * Free the memory and reset the pointer and size to 0. + */ +Napi::Value +CudaMemory::free(const Napi::CallbackInfo& info) +{ + auto env = info.Env(); + if (mem.ptr) { + auto free_mem = mem; + mem = { 0, 0 }; + CU_TRY(cuMemFree(free_mem.ptr)); + DBG1("CudaMemory::free " << DVAL(free_mem)); + } + return env.Undefined(); +} + +Napi::Value +CudaMemory::fill(const Napi::CallbackInfo& info) +{ + auto env = info.Env(); + uint32_t value = info[0].As().Uint32Value(); + size_t start = info.Length() > 1 ? info[1].As().Int64Value() : 0; + size_t end = info.Length() > 2 ? info[2].As().Int64Value() : mem.size; + auto slice = mem.slice(start, end); + uint8_t byte = value & 0xff; + CU_TRY(cuMemsetD8(slice.ptr, byte, slice.size)); + DBG1("CudaMemory::fill " << DVAL(mem) << DVAL(slice) << DVAL(byte)); + return Napi::Number::New(info.Env(), slice.size); +} + +Napi::Value +CudaMemory::as_buffer(const Napi::CallbackInfo& info) +{ + size_t start = info.Length() > 0 ? info[0].As().Int64Value() : 0; + size_t end = info.Length() > 1 ? 
info[1].As().Int64Value() : mem.size; + auto slice = mem.slice(start, end); + auto buffer = Napi::Buffer::New(info.Env(), (uint8_t*)slice.ptr, slice.size); + DBG1("CudaMemory::as_buffer " << DVAL(mem) << DVAL(slice) << DBUF(buffer)); + return buffer; +} + +Napi::Value +CudaMemory::copy_to_host_new(const Napi::CallbackInfo& info) +{ + auto env = info.Env(); + size_t start = info.Length() > 0 ? info[0].As().Int64Value() : 0; + size_t end = info.Length() > 1 ? info[1].As().Int64Value() : mem.size; + auto slice = mem.slice(start, end); + auto buffer = Napi::Buffer::New(info.Env(), slice.size); + CU_TRY(cuMemcpyDtoH(buffer.Data(), slice.ptr, slice.size)); + DBG1("CudaMemory::copy_to_host_new " << DVAL(mem) << DVAL(slice) << DVAL(buffer.Data())); + return buffer; +} + +Napi::Value +CudaMemory::copy_to_host(const Napi::CallbackInfo& info) +{ + auto env = info.Env(); + auto buffer = info[0].As>(); + size_t start = info.Length() > 1 ? info[1].As().Int64Value() : 0; + size_t end = info.Length() > 2 ? info[2].As().Int64Value() : mem.size; + auto slice = mem.slice(start, end); + size_t len = std::min(slice.size, buffer.Length()); + CU_TRY(cuMemcpyDtoH(buffer.Data(), slice.ptr, len)); + DBG1("CudaMemory::copy_to_host " << DVAL(mem) << DVAL(slice) << DBUF(buffer) << DVAL(len)); + return Napi::Number::New(info.Env(), len); +} + +Napi::Value +CudaMemory::copy_from_host(const Napi::CallbackInfo& info) +{ + auto env = info.Env(); + auto buffer = info[0].As>(); + size_t start = info.Length() > 1 ? info[1].As().Int64Value() : 0; + size_t end = info.Length() > 2 ? info[2].As().Int64Value() : mem.size; + auto slice = mem.slice(start, end); + size_t len = std::min(slice.size, buffer.Length()); + CU_TRY(cuMemcpyHtoD(slice.ptr, buffer.Data(), len)); + DBG1("CudaMemory::copy_from_host " << DVAL(mem) << DVAL(slice) << DBUF(buffer) << DVAL(len)); + return Napi::Number::New(info.Env(), len); +} + +/** + * + */ +Napi::Value +cuda_malloc(const Napi::CallbackInfo& info) +{ +#if USE_CUDA + size_t size = info[0].As().Int64Value(); + // CUDA_TRY(cudaMemcpy(host_ptr, cuda_ptr, size, cudaMemcpyDeviceToHost)); + // CUDA_TRY(cudaMemcpy(cuda_ptr, host_ptr, size, cudaMemcpyHostToDevice)); + size_t size = info[0].As().Int64Value(); + void* cuda_ptr = 0; + CUDA_TRY(cudaMalloc(&cuda_ptr, size)); + CUDA_TRY(cudaMemset(cuda_ptr, 'A', size)); + CUDA_TRY(cudaStreamSynchronize(0)); + cuObjMemoryType_t mem_type = cuObjClient::getMemoryType(cuda_ptr); + LOG("cuda_malloc: " << DVAL(cuda_ptr) << DVAL(size) << DVAL(mem_type)); + + auto finalizer = [](Napi::Env, uint8_t* ptr) { cudaFree(ptr); }; + auto buf = Napi::Buffer::New(info.Env(), (uint8_t*)cuda_ptr, size, finalizer); + return buf; +#else + return info.Env().Undefined(); +#endif +} + +void +cuda_napi(Napi::Env env, Napi::Object exports) +{ + exports["CudaMemory"] = CudaMemory::Init(env); + DBG0("CUDA: library loaded."); +} + +} // namespace noobaa \ No newline at end of file diff --git a/src/native/cuda/cuda_napi.gyp b/src/native/cuda/cuda_napi.gyp new file mode 100644 index 0000000000..7c063cfd8f --- /dev/null +++ b/src/native/cuda/cuda_napi.gyp @@ -0,0 +1,51 @@ +# Copyright (C) 2016 NooBaa +# Copyright (C) 2016 NooBaa +{ + 'includes': ['../common.gypi', '../warnings.gypi'], + + 'targets': [{ + 'target_name': 'cuda_napi', + 'type': 'static_library', + 'sources': [ + 'cuda_napi.cpp', + ], + 'variables': { + 'CUDA_PATH%': '/usr/local/cuda', + 'CUOBJ_PATH%': ''' + +typedef off_t loff_t; +#include "cuobjclient.h" +#include "protocol.h" + +namespace noobaa +{ + +DBG_INIT(0); + +typedef enum 
cuObjOpType_enum +{ + CUOBJ_GET = 0, /**< GET operation */ + CUOBJ_PUT = 1, /**< PUT operation */ + CUOBJ_INVALID = 9999 +} cuObjOpType_t; + +typedef Napi::External ExternalIovec; + +/** + * RdmaClientNapi is a napi object wrapper for cuObjClient. + */ +struct RdmaClientNapi : public Napi::ObjectWrap +{ + static Napi::FunctionReference constructor; + std::shared_ptr _client; + Napi::ThreadSafeFunction _thread_callback; + + static Napi::Function Init(Napi::Env env); + RdmaClientNapi(const Napi::CallbackInfo& info); + ~RdmaClientNapi(); + Napi::Value close(const Napi::CallbackInfo& info); + Napi::Value rdma(const Napi::CallbackInfo& info); +}; + +/** + * RdmaClientWorker is a napi worker for RdmaClientNapi::rdma() + */ +struct RdmaClientWorker : public ObjectWrapWorker +{ + cuObjOpType_t _op_type; + void* _ptr; + size_t _size; + std::string _rdma_desc; + std::string _rdma_addr; + size_t _rdma_size; + loff_t _rdma_offset; + ssize_t _ret_size; + std::mutex _mutex; + std::condition_variable _cond; + Napi::FunctionReference _func; + + RdmaClientWorker(const Napi::CallbackInfo& info); + virtual void Execute() override; + virtual void OnOK() override; + + ssize_t start_op( + cuObjOpType_t op_type, + const void* handle, + const void* ptr, + size_t size, + loff_t offset, + const cufileRDMAInfo_t* rdma_info); + void send_op(Napi::Env env); +}; + +Napi::FunctionReference RdmaClientNapi::constructor; + +Napi::Function +RdmaClientNapi::Init(Napi::Env env) +{ + constructor = Napi::Persistent(DefineClass(env, + "RdmaClientNapi", + { + InstanceMethod<&RdmaClientNapi::close>("close"), + InstanceMethod<&RdmaClientNapi::rdma>("rdma"), + })); + constructor.SuppressDestruct(); + return constructor.Value(); +} + +static ssize_t +get_op_fn(const void* handle, char* ptr, size_t size, loff_t offset, const cufileRDMAInfo_t* rdma_info) +{ + RdmaClientWorker* w = reinterpret_cast(cuObjClient::getCtx(handle)); + return w->start_op(CUOBJ_GET, handle, ptr, size, offset, rdma_info); +} + +static ssize_t +put_op_fn(const void* handle, const char* ptr, size_t size, loff_t offset, const cufileRDMAInfo_t* rdma_info) +{ + RdmaClientWorker* w = reinterpret_cast(cuObjClient::getCtx(handle)); + return w->start_op(CUOBJ_PUT, handle, ptr, size, offset, rdma_info); +} + +/** + * Create a new RdmaClientNapi object wrapper. + * There is not much to configure programmatically, but the client will load cufile.json + * which is located by env var: CUFILE_ENV_PATH_JSON=/etc/cufile.json. + * @see {@link https://docs.nvidia.com/gpudirect-storage/configuration-guide/index.html#gds-parameters} + * @see {@link https://docs.nvidia.com/gpudirect-storage/api-reference-guide/index.html} + * + * Currently the client is synchronous and requires a callback to the main thread to send the http request. + * This means that calling rdma() on the same client will add contention, and instead we should create + * a separate client per each concurrent request. This is not a problem because the client is lightweight enough + * but it is something to be aware of, and in the future we will prefer the library to be async. 
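+ *
+ * A rough usage sketch from the JS side (illustrative only - the surrounding
+ * variables are assumptions, but rdma(op_type, buf, func) below is this class's API):
+ *
+ *     const client = new RdmaClientNapi();
+ *     const ret_size = await client.rdma('GET', buf, (rdma_info, callback) => {
+ *         // send rdma_info ({desc, addr, size, offset}) to the server,
+ *         // e.g. on the S3 request headers, then call callback(err, size)
+ *         // once the server replies
+ *     });
+ *     client.close();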
+ */ +RdmaClientNapi::RdmaClientNapi(const Napi::CallbackInfo& info) + : Napi::ObjectWrap(info) +{ + DBG0("RdmaClientNapi::ctor"); + + uint32_t log_flags = + // CUOBJ_LOG_PATH_DEBUG | + // CUOBJ_LOG_PATH_INFO | + CUOBJ_LOG_PATH_ERROR; + + cuObjClient::setupTelemetry(true, &std::cout); + cuObjClient::setTelemFlags(log_flags); + + CUObjOps_t ops = { + .get = &get_op_fn, + .put = &put_op_fn, + }; + std::shared_ptr client(new cuObjClient(ops, CUOBJ_PROTO_RDMA_DC_V1)); + + if (!client->isConnected()) { + throw Napi::Error::New(info.Env(), + XSTR() << "RdmaClientNapi::ctor connect failed (check rdma_dev_addr_list in cufile.json)"); + } + + // initialize a thread safe callback to the main thread + // actual callback will be set in the worker + auto noop = Napi::Function::New( + info.Env(), [](const Napi::CallbackInfo& info) {}); + _thread_callback = Napi::ThreadSafeFunction::New( + info.Env(), noop, "RdmaClientNapiThreadCallback", 0, 1, [](Napi::Env) {}); + + _client = client; +} + +RdmaClientNapi::~RdmaClientNapi() +{ + DBG0("RdmaClientNapi::dtor"); + _client.reset(); +} + +Napi::Value +RdmaClientNapi::close(const Napi::CallbackInfo& info) +{ + DBG0("RdmaClientNapi::close"); + _client.reset(); + return info.Env().Undefined(); +} + +/** + * async function to start and await a RdmaClientWorker threadpool worker + * + * @param {'GET'|'PUT'} op_type = info[0] + * @param {Buffer} buf = info[1] + * @param {(rdma_info, callback) => void} func = info[2] + * @returns {Promise} + */ +Napi::Value +RdmaClientNapi::rdma(const Napi::CallbackInfo& info) +{ + return await_worker(info); +} + +RdmaClientWorker::RdmaClientWorker(const Napi::CallbackInfo& info) + : ObjectWrapWorker(info) + , _op_type(CUOBJ_INVALID) + , _ptr(0) + , _size(0) + , _rdma_size(0) + , _rdma_offset(0) + , _ret_size(-1) +{ + auto op_type = info[0].As().Utf8Value(); + auto buf = info[1].As>(); + auto func = info[2].As(); + + if (op_type == "GET") { + _op_type = CUOBJ_GET; + } else if (op_type == "PUT") { + _op_type = CUOBJ_PUT; + } else { + throw Napi::Error::New(info.Env(), + XSTR() << "RdmaClientWorker: bad op type " << DVAL(op_type)); + } + + _ptr = buf.Data(); + _size = buf.Length(); + _func = Napi::Persistent(func); +} + +// will be set by cuda_napi when loaded +CUcontext rdma_napi_cuda_ctx = 0; + +void +RdmaClientWorker::Execute() +{ + DBG1("RdmaClientWorker: Execute " + << DVAL(_op_type) + << DVAL(_ptr) + << DVAL(_size)); + std::shared_ptr client(_wrap->_client); + + cuObjMemoryType_t mem_type = cuObjClient::getMemoryType(_ptr); + DBG1("RdmaClientWorker: buffer " << DVAL(_ptr) << DVAL(_size) << DVAL(mem_type)); + + // mem_type doesn't seem to identify the memory type correctly + // so we need to set the context manually instead of this condition + // mem_type == CUOBJ_MEMORY_CUDA_DEVICE || mem_type == CUOBJ_MEMORY_CUDA_MANAGED + + if (rdma_napi_cuda_ctx) { + CUresult res = cuCtxSetCurrent(rdma_napi_cuda_ctx); + if (res != CUDA_SUCCESS) { + SetError(XSTR() << "RdmaClientWorker: Failed to set current context " << DVAL(res)); + return; + } + } + + // register rdma buffer + cuObjErr_t ret_get_mem = client->cuMemObjGetDescriptor(_ptr, _size); + if (ret_get_mem != CU_OBJ_SUCCESS) { + SetError(XSTR() << "RdmaClientWorker: Failed to register rdma buffer " << DVAL(ret_get_mem)); + return; + } + StackCleaner cleaner([&] { + // release rdma buffer + cuObjErr_t ret_put_mem = client->cuMemObjPutDescriptor(_ptr); + if (ret_put_mem != CU_OBJ_SUCCESS) { + SetError(XSTR() << "RdmaClientWorker: Failed to release rdma buffer " << DVAL(ret_put_mem)); + } 
+ }); + + if (_op_type == CUOBJ_GET) { + _ret_size = client->cuObjGet(this, _ptr, _size); + } else if (_op_type == CUOBJ_PUT) { + _ret_size = client->cuObjPut(this, _ptr, _size); + } else { + PANIC("bad op type " << DVAL(_op_type)); + } + + if (_ret_size < 0 || _ret_size != ssize_t(_size)) { + SetError(XSTR() << "RdmaClientWorker: failed " + << DVAL(_op_type) << DVAL(_ret_size)); + } +} + +void +RdmaClientWorker::OnOK() +{ + _promise.Resolve(Napi::Number::New(Env(), _ret_size)); +} + +/** + * Start an operation on the worker thread. + */ +ssize_t +RdmaClientWorker::start_op( + cuObjOpType_t op_type, + const void* handle, + const void* ptr, + size_t size, + loff_t offset, + const cufileRDMAInfo_t* rdma_info) +{ + std::string rdma_desc(rdma_info->desc_str, rdma_info->desc_len - 1); + DBG1("RdmaClientWorker::start_op " << DVAL(op_type) << DVAL(ptr) << DVAL(size) << DVAL(offset) << DVAL(rdma_desc)); + + // this lock and condition variable are used to synchronize the worker thread + // with the main thread, as the main threas is sending the http request to the server. + std::unique_lock lock(_mutex); + + // check that the parameters are as expected + ASSERT(op_type == _op_type, DVAL(op_type) << DVAL(_op_type)); + ASSERT(ptr == _ptr, DVAL(ptr) << DVAL(_ptr)); + ASSERT(size == _size, DVAL(size) << DVAL(_size)); + ASSERT(offset == 0, DVAL(offset)); + + // save info for the server request + _rdma_desc = rdma_desc; + _rdma_addr = XSTR() << std::hex << uintptr_t(ptr); + _rdma_size = size; + _rdma_offset = offset; + + // send the op on the main thread by calling a Napi::ThreadSafeFunction. + // this model is cumbwersome and would be replaced by an async worker in the future. + // but for now the library requires us to make the http request sychronously from the worker thread, + // so we need to send the op on the main thread and then wait for the worker to be woken up. + _wrap->_thread_callback.Acquire(); + _wrap->_thread_callback.BlockingCall( + [this](Napi::Env env, Napi::Function noop) { + send_op(env); + }); + _wrap->_thread_callback.Release(); + + // after sending the op on main thread, the worker now waits for wakeup + _cond.wait(lock); + lock.unlock(); + + // _ret_size was set by the server response in the callback + DBG1("RdmaClientWorker::start_op done " << DVAL(_ret_size)); + return _ret_size; +} + +/** + * Send the rdma_info to the server on the main thread. + * When the server responds and the callback is called, the worker will be woken up. + */ +void +RdmaClientWorker::send_op(Napi::Env env) +{ + DBG1("RdmaClientWorker::send_op"); + Napi::HandleScope scope(env); + + auto rdma_info = Napi::Object::New(env); + rdma_info["desc"] = Napi::String::New(env, _rdma_desc); + rdma_info["addr"] = Napi::String::New(env, _rdma_addr); + rdma_info["size"] = Napi::Number::New(env, _rdma_size); + rdma_info["offset"] = Napi::Number::New(env, _rdma_offset); + + // prepare a node-style callback function(err, result) + auto callback = Napi::Function::New(env, [this](const Napi::CallbackInfo& info) { + // this lock can be problematic because it is on the main thread + // but it works well if we a separate clients per each concurrent request + // and then locking is immediate because at this point the worker is already waiting + // on the condition and the mutex is free. 
+ std::unique_lock lock(_mutex); + + // setting _ret_size according to the server response + // and waking up the worker to continue + if (info[0].ToBoolean() || !info[1].IsNumber()) { + _ret_size = -1; + } else { + _ret_size = info[1].As().Int64Value(); + } + + _cond.notify_one(); + lock.unlock(); + }); + + // call the user provided function with the rdma_info and the callback + // notice that we do not await here so the function must call the callback + _func.Call({ rdma_info, callback }); +} + +void +rdma_client_napi(Napi::Env env, Napi::Object exports) +{ + exports["RdmaClientNapi"] = RdmaClientNapi::Init(env); + DBG0("RDMA: client library loaded."); +} + +} // namespace noobaa diff --git a/src/native/rdma/rdma_napi.gyp b/src/native/rdma/rdma_napi.gyp new file mode 100644 index 0000000000..8d9d6b8f3e --- /dev/null +++ b/src/native/rdma/rdma_napi.gyp @@ -0,0 +1,44 @@ +# Copyright (C) 2016 NooBaa +{ + 'includes': ['../common.gypi', '../warnings.gypi'], + + 'targets': [{ + 'target_name': 'rdma_napi', + 'type': 'static_library', + 'sources': [ + 'rdma_server_napi.cpp', + 'rdma_client_napi.cpp', + ], + 'variables': { + 'CUOBJ_PATH%': ''' +#include + +typedef off_t loff_t; +#include "cuobjserver.h" +#include "protocol.h" + +struct rdma_buffer; +typedef struct rdma_buffer RdmaBuf; + +namespace noobaa +{ + +DBG_INIT(0); + +struct AsyncEvent +{ + std::shared_ptr deferred; + ssize_t size; + uint16_t channel_id; +}; + +/** + * RdmaServerNapi is a napi object wrapper for cuObjServer. + */ +struct RdmaServerNapi : public Napi::ObjectWrap +{ + static Napi::FunctionReference constructor; + std::shared_ptr _server; + Napi::Reference _buffer_symbol; + std::set _async_channels; + uv_prepare_t _uv_async_handler; + bool _use_async_events = false; + + static Napi::Function Init(Napi::Env env); + RdmaServerNapi(const Napi::CallbackInfo& info); + ~RdmaServerNapi(); + Napi::Value close(const Napi::CallbackInfo& info); + Napi::Value register_buffer(const Napi::CallbackInfo& info); + Napi::Value deregister_buffer(const Napi::CallbackInfo& info); + Napi::Value is_registered_buffer(const Napi::CallbackInfo& info); + Napi::Value rdma(const Napi::CallbackInfo& info); + Napi::Value rdma_async_event(const Napi::CallbackInfo& info); + void _handle_async_events(); +}; + +/** + * RdmaServerWorker is a napi worker for RdmaServerNapi::rdma() + */ +struct RdmaServerWorker : public ObjectWrapWorker +{ + std::shared_ptr _server; + cuObjOpType_t _op_type; + std::string _op_key; + void* _ptr; + size_t _size; + RdmaBuf* _rdma_buf; + std::string _rdma_desc; + uint64_t _rdma_addr; + size_t _rdma_size; + loff_t _rdma_offset; + ssize_t _ret_size; + thread_local static uint16_t _thread_channel_id; + + RdmaServerWorker(const Napi::CallbackInfo& info); + virtual void Execute() override; + virtual void OnOK() override; +}; + +Napi::FunctionReference RdmaServerNapi::constructor; +thread_local uint16_t RdmaServerWorker::_thread_channel_id = INVALID_CHANNEL_ID; +typedef Napi::External ExternalRdmaBuf; + +static inline int32_t asi32(Napi::Value v); +static inline uint32_t asu32(Napi::Value v); +static inline uint32_t asi64(Napi::Value v); +static inline std::string asstr(Napi::Value v); + +static void +_uv_handle_async_events(uv_prepare_t* handle) +{ + static_cast(handle->data)->_handle_async_events(); +} + +Napi::Function +RdmaServerNapi::Init(Napi::Env env) +{ + constructor = Napi::Persistent(DefineClass(env, + "RdmaServerNapi", + { + InstanceMethod<&RdmaServerNapi::close>("close"), + 
InstanceMethod<&RdmaServerNapi::register_buffer>("register_buffer"), + InstanceMethod<&RdmaServerNapi::deregister_buffer>("deregister_buffer"), + InstanceMethod<&RdmaServerNapi::is_registered_buffer>("is_registered_buffer"), + InstanceMethod<&RdmaServerNapi::rdma>("rdma"), + })); + constructor.SuppressDestruct(); + return constructor.Value(); +} + +/** + * @param {{ + * ip: string, + * port: number, + * log_level?: 'ERROR'|'INFO'|'DEBUG', + * num_dcis?: number, + * cq_depth?: number, + * dc_key?: number, + * ibv_poll_max_comp_event?: number, + * service_level?: number, + * min_rnr_timer?: number, + * hop_limit?: number, + * pkey_index?: number, + * max_wr?: number, + * max_sge?: number, + * delay_mode?: number, + * delay_interval?: number, + * }} params = info[0] + */ +RdmaServerNapi::RdmaServerNapi(const Napi::CallbackInfo& info) + : Napi::ObjectWrap(info) +{ + auto env = info.Env(); + const Napi::Object params = info[0].As(); + std::string ip = params["ip"].As().Utf8Value(); + unsigned short port = params["port"].As().Uint32Value(); + + uint32_t log_flags = 0; + if (params["log_level"].IsString()) { + std::string log_level = asstr(params["log_level"]); + if (log_level == "DEBUG") { + log_flags |= CUOBJ_LOG_PATH_ERROR; + log_flags |= CUOBJ_LOG_PATH_INFO; + log_flags |= CUOBJ_LOG_PATH_DEBUG; + } else if (log_level == "INFO") { + log_flags |= CUOBJ_LOG_PATH_ERROR; + log_flags |= CUOBJ_LOG_PATH_INFO; + } else if (log_level == "ERROR") { + log_flags |= CUOBJ_LOG_PATH_ERROR; + } else { + throw Napi::Error::New(env, + XSTR() << "RdmaServerNapi::ctor bad " << DVAL(log_level)); + } + } + + cuObjRDMATunable rdma_params; + if (params["num_dcis"].IsNumber()) { + rdma_params.setNumDcis(asi32(params["num_dcis"])); + } + if (params["cq_depth"].IsNumber()) { + rdma_params.setCqDepth(asu32(params["cq_depth"])); + } + if (params["dc_key"].IsNumber()) { + rdma_params.setDcKey(asi64(params["dc_key"])); + } + if (params["ibv_poll_max_comp_event"].IsNumber()) { + rdma_params.setIbvPollMaxCompEv(asi32(params["ibv_poll_max_comp_event"])); + } + if (params["service_level"].IsNumber()) { + rdma_params.setServiceLevel(asi32(params["service_level"])); + } + if (params["min_rnr_timer"].IsNumber()) { + rdma_params.setMinRnrTimer(asi32(params["min_rnr_timer"])); + } + if (params["hop_limit"].IsNumber()) { + rdma_params.setHopLimit(asu32(params["hop_limit"])); + } + if (params["pkey_index"].IsNumber()) { + rdma_params.setPkeyIndex(asi32(params["pkey_index"])); + } + if (params["max_wr"].IsNumber()) { + rdma_params.setMaxWr(asi32(params["max_wr"])); + } + if (params["max_sge"].IsNumber()) { + rdma_params.setMaxSge(asi32(params["max_sge"])); + } + if (params["delay_mode"].IsNumber()) { + rdma_params.setDelayMode(cuObjDelayMode_t(asi32(params["delay_mode"]))); + } else { + // rdma_params.setDelayMode(CUOBJ_DELAY_NONE); + } + if (params["delay_interval"].IsNumber()) { + rdma_params.setDelayInterval(asu32(params["delay_interval"])); + } else { + // rdma_params.setDelayInterval(0); + } + + DBG0("RdmaServerNapi::ctor " + << DVAL(ip) << DVAL(port) << DVAL(log_flags) + << "num_dcis=" << rdma_params.getNumDcis() << " " + << "cq_depth=" << rdma_params.getCqDepth() << " " + << "dc_key=" << rdma_params.getDcKey() << " " + << "ibv_poll_max_comp_event=" << rdma_params.getIbvPollMaxCompEv() << " " + << "service_level=" << rdma_params.getServiceLevel() << " " + << "min_rnr_timer=" << rdma_params.getMinRnrTimer() << " " + << "hop_limit=" << rdma_params.getHopLimit() << " " + << "pkey_index=" << rdma_params.getPkeyIndex() << " " + << 
"max_wr=" << rdma_params.getMaxWr() << " " + << "max_sge=" << rdma_params.getMaxSge() << " " + << "delay_mode=" << rdma_params.getDelayMode() << " " + << "delay_interval=" << rdma_params.getDelayInterval() << " "); + + cuObjServer::setupTelemetry(true, &std::cout); + cuObjServer::setTelemFlags(log_flags); + + std::shared_ptr server(new cuObjServer( + ip.c_str(), port, CUOBJ_PROTO_RDMA_DC_V1, rdma_params)); + + if (!server->isConnected()) { + throw Napi::Error::New(env, + XSTR() << "RdmaServerNapi::ctor connect failed " << DVAL(ip) << DVAL(port)); + } + + _server = server; + _buffer_symbol = Napi::Persistent(Napi::Symbol::New(env, "RdmaServerNapiBufferSymbol")); + + _uv_async_handler.data = this; + uv_prepare_init(uv_default_loop(), &_uv_async_handler); + _use_async_events = params["use_async_events"].ToBoolean(); +} + +RdmaServerNapi::~RdmaServerNapi() +{ + DBG0("RdmaServerNapi::dtor"); + uv_prepare_stop(&_uv_async_handler); + _server.reset(); +} + +Napi::Value +RdmaServerNapi::close(const Napi::CallbackInfo& info) +{ + DBG0("RdmaServerNapi::close"); + uv_prepare_stop(&_uv_async_handler); + _server.reset(); + return info.Env().Undefined(); +} + +/** + * Register a buffer for RDMA and get an rdma_buf handle. + * The handle is stored in the buffer object as an external reference. + * This allows any buffer to be registered lazily and get the handle from the buffer when needed. + * @param {Buffer} buf = info[0] + */ +Napi::Value +RdmaServerNapi::register_buffer(const Napi::CallbackInfo& info) +{ + auto env = info.Env(); + auto buf = info[0].As>(); + void* ptr = buf.Data(); + size_t size = buf.Length(); + auto sym = _buffer_symbol.Value(); + + // check if already registered and return so callers can easily lazy register any buffer + if (buf.Get(sym).IsExternal()) { + return env.Undefined(); + } + + RdmaBuf* rdma_buf = _server->registerBuffer(ptr, size); + if (!rdma_buf) { + throw Napi::Error::New(env, + XSTR() << "RdmaServerNapi::register_buffer Failed to register rdma buffer " + << DVAL(ptr) << DVAL(size)); + } + + // TODO add a finalizer to de-register on GC of the external, currently we need to manuall call de-register or we leak the RDMA handle + buf.Set(sym, ExternalRdmaBuf::New(env, rdma_buf)); + return env.Undefined(); +} + +/** + * @param {Buffer} buf = info[0] + */ +Napi::Value +RdmaServerNapi::deregister_buffer(const Napi::CallbackInfo& info) +{ + auto env = info.Env(); + auto buf = info[0].As>(); + void* ptr = buf.Data(); + size_t size = buf.Length(); + auto sym = _buffer_symbol.Value(); + + if (!buf.Get(sym).IsExternal()) { + throw Napi::Error::New(env, + XSTR() << "RdmaServerNapi::deregister_buffer No registered rdma buffer " + << DVAL(ptr) << DVAL(size)); + } + + auto rdma_buf = buf.Get(sym).As().Data(); + _server->deRegisterBuffer(rdma_buf); + + buf.Delete(sym); + return env.Undefined(); +} + +/** + * @param {Buffer} buf = info[0] + * @returns {boolean} + */ +Napi::Value +RdmaServerNapi::is_registered_buffer(const Napi::CallbackInfo& info) +{ + auto env = info.Env(); + auto buf = info[0].As>(); + auto sym = _buffer_symbol.Value(); + bool is_registered = buf.Get(sym).IsExternal(); + return Napi::Boolean::New(env, is_registered); +} + +/** + * async function to start and await a RdmaServerWorker threadpool worker + * + * @param {'GET'|'PUT'} op_type = info[0] + * @param {string} op_key = info[1] + * @param {Buffer} buf = info[2] + * @param {{ + * desc: string, + * addr: string, + * size: number, + * offset: number, + * }} rdma_info = info[3] + */ +Napi::Value 
+RdmaServerNapi::rdma(const Napi::CallbackInfo& info) +{ + // NOTE: at the moment the async events mode works slower than the threadpool mode. + if (_use_async_events) { + return rdma_async_event(info); + } else { + return await_worker(info); + } +} + +RdmaServerWorker::RdmaServerWorker(const Napi::CallbackInfo& info) + : ObjectWrapWorker(info) + , _server(_wrap->_server) + , _op_type(CUOBJ_INVALID) + , _ptr(0) + , _size(0) + , _rdma_buf(0) + , _rdma_addr(0) + , _rdma_size(0) + , _rdma_offset(0) + , _ret_size(-1) +{ + auto env = info.Env(); + auto op_type = info[0].As().Utf8Value(); + _op_key = info[1].As().Utf8Value(); + auto buf = info[2].As>(); + auto rdma_info = info[3].As(); + + _rdma_desc = rdma_info.Get("desc").As().Utf8Value(); + auto rdma_addr = rdma_info.Get("addr").As().Utf8Value(); + auto rdma_size = rdma_info.Get("size").As().Int64Value(); + _rdma_offset = rdma_info.Get("offset").As().Int64Value(); + + if (op_type == "GET") { + _op_type = CUOBJ_GET; + } else if (op_type == "PUT") { + _op_type = CUOBJ_PUT; + } else { + throw Napi::Error::New(env, + XSTR() << "RdmaServerWorker: bad op type " << DVAL(op_type)); + } + + _ptr = buf.Data(); + _size = buf.Length(); + _rdma_addr = strtoull(rdma_addr.c_str(), 0, 16); + _rdma_size = size_t(rdma_size); + auto sym = _wrap->_buffer_symbol.Value(); + + if (_rdma_desc.size() + 1 != sizeof RDMA_DESC_STR) { + throw Napi::Error::New(env, + XSTR() << "RdmaServerWorker: bad rdma desc " << DVAL(_rdma_desc)); + } + if (_rdma_addr == 0) { + throw Napi::Error::New(env, + XSTR() << "RdmaServerWorker: bad rdma addr " << DVAL(rdma_addr) << DVAL(_rdma_addr)); + } + if (rdma_size <= 0) { + throw Napi::Error::New(env, + XSTR() << "RdmaServerWorker: bad rdma size " << DVAL(rdma_size)); + } + if (_rdma_offset < 0) { + throw Napi::Error::New(env, + XSTR() << "RdmaServerWorker: bad rdma offset " << DVAL(_rdma_offset)); + } + if (!buf.Get(sym).IsExternal()) { + throw Napi::Error::New(env, + XSTR() << "RdmaServerWorker: No registered rdma buffer " << DVAL(_ptr) << DVAL(_size)); + } + + _rdma_buf = buf.Get(sym).As().Data(); +} + +void +RdmaServerWorker::Execute() +{ + DBG1("RdmaServerWorker: Execute " + << DVAL(_op_type) + << DVAL(_op_key) + << DVAL(_ptr) + << DVAL(_size) + << DVAL(_rdma_buf) + << DVAL(_rdma_desc) + << DVAL(_rdma_addr) + << DVAL(_rdma_size) + << DVAL(_rdma_offset)); + + size_t real_size = std::min(_size, _rdma_size); + + // lazy allocate channel id and keep it in thread local storage + // we currently do not free those channel ids + if (_thread_channel_id == INVALID_CHANNEL_ID) { + _thread_channel_id = _server->allocateChannelId(); + if (_thread_channel_id == INVALID_CHANNEL_ID) { + SetError(XSTR() << "RdmaServerWorker: Failed to allocate channel id"); + return; + } + } + + if (_op_type == CUOBJ_GET) { + _ret_size = _server->handleGetObject( + _op_key, _rdma_buf, _rdma_addr, real_size, _rdma_desc, _thread_channel_id); + } else if (_op_type == CUOBJ_PUT) { + _ret_size = _server->handlePutObject( + _op_key, _rdma_buf, _rdma_addr, real_size, _rdma_desc, _thread_channel_id); + } else { + PANIC("bad op type " << DVAL(_op_type)); + } + + if (_ret_size < 0) { + SetError(XSTR() << "RdmaServerWorker: op failed " + << DVAL(_op_type) << DVAL(_ret_size)); + } +} + +void +RdmaServerWorker::OnOK() +{ + _promise.Resolve(Napi::Number::New(Env(), _ret_size)); +} + +Napi::Value +RdmaServerNapi::rdma_async_event(const Napi::CallbackInfo& info) +{ + auto env = info.Env(); + auto op_type = info[0].As().Utf8Value(); + auto op_key = info[1].As().Utf8Value(); + auto buf 
= info[2].As>(); + auto sym = _buffer_symbol.Value(); + auto rdma_info = info[3].As(); + auto rdma_desc = rdma_info.Get("desc").As().Utf8Value(); + auto rdma_addr = rdma_info.Get("addr").As().Utf8Value(); + auto rdma_size = rdma_info.Get("size").As().Int64Value(); + auto rdma_offset = rdma_info.Get("offset").As().Int64Value(); + + void* ptr = buf.Data(); + size_t size = buf.Length(); + size_t real_size = std::min(size, size_t(rdma_size)); + uint64_t remote_addr = strtoull(rdma_addr.c_str(), 0, 16); + + if (rdma_desc.size() + 1 != sizeof RDMA_DESC_STR) { + throw Napi::Error::New(env, + XSTR() << "RdmaServerNapi: bad rdma desc " << DVAL(rdma_desc)); + } + if (remote_addr == 0) { + throw Napi::Error::New(env, + XSTR() << "RdmaServerNapi: bad rdma addr " << DVAL(remote_addr) << DVAL(rdma_addr)); + } + if (rdma_size <= 0) { + throw Napi::Error::New(env, + XSTR() << "RdmaServerNapi: bad rdma size " << DVAL(rdma_size)); + } + if (rdma_offset < 0) { + throw Napi::Error::New(env, + XSTR() << "RdmaServerNapi: bad rdma offset " << DVAL(rdma_offset)); + } + if (!buf.Get(sym).IsExternal()) { + throw Napi::Error::New(env, + XSTR() << "RdmaServerNapi: No registered rdma buffer " << DVAL(ptr) << DVAL(size)); + } + auto rdma_buf = buf.Get(sym).As().Data(); + + uint16_t channel_id = _server->allocateChannelId(); + auto deferred = std::make_shared(env); + auto async_event = std::make_unique(deferred, real_size, channel_id); + + // LOG("RdmaServerNapi: queue async event " << DVAL(deferred.get()) << DVAL(_num_pending) << DVAL(size)); + int r = 0; + ibv_wc_status status = IBV_WC_SUCCESS; + if (op_type == "GET") { + r = _server->handleGetObject( + op_key, rdma_buf, remote_addr, real_size, rdma_desc, channel_id, 0, &status, async_event.get()); + } else if (op_type == "PUT") { + r = _server->handlePutObject( + op_key, rdma_buf, remote_addr, real_size, rdma_desc, channel_id, 0, &status, async_event.get()); + } else { + throw Napi::Error::New(env, + XSTR() << "RdmaServerNapi: bad op type " << DVAL(op_type)); + } + if (r < 0) { + _server->freeChannelId(channel_id); + throw Napi::Error::New(env, + XSTR() << "RdmaServerNapi: handle call error " << DVAL(r)); + } else { + async_event.release(); + if (_async_channels.empty()) { + uv_prepare_start(&_uv_async_handler, _uv_handle_async_events); + } + _async_channels.insert(channel_id); + return deferred->Promise(); + } +} + +void +RdmaServerNapi::_handle_async_events() +{ + // LOG("RdmaServerNapi::_handle_async_events " << DVAL(_async_channels.size())); + if (_async_channels.empty()) { + // uv_prepare_stop(&_uv_async_handler); + return; + } + for (auto it = _async_channels.begin(); it != _async_channels.end();) { + uint16_t channel_id = *it; + cuObjAsyncEvent_t poll_event = { nullptr, IBV_WC_SUCCESS }; + int num_events = _server->poll(&poll_event, 1, channel_id); + if (num_events == 0) { + ++it; + continue; + } + assert(num_events == 1); + auto async_event = std::unique_ptr(static_cast(poll_event.async_handle)); + assert(async_event.channel_id == channel_id); + auto deferred = async_event->deferred; + auto env = deferred->Env(); + Napi::HandleScope scope(env); + // LOG("RdmaServerNapi::_handle_async_events complete " << DVAL(deferred.get()) << DVAL(event.status)); + if (poll_event.status != IBV_WC_SUCCESS) { + auto err = Napi::Error::New(env, + XSTR() << "RdmaServerNapi: op failed " << DVAL(poll_event.status)); + deferred->Reject(err.Value()); + } else { + deferred->Resolve(Napi::Number::New(env, async_event->size)); + } + _server->freeChannelId(channel_id); + it = 
_async_channels.erase(it); // erase and advance
+    }
+    // if (_async_channels.empty()) {
+    //     uv_prepare_stop(&_uv_async_handler);
+    // }
+}
+
+static inline int32_t
+asi32(Napi::Value v)
+{
+    return v.As<Napi::Number>().Int32Value();
+}
+
+static inline uint32_t
+asu32(Napi::Value v)
+{
+    return v.As<Napi::Number>().Uint32Value();
+}
+
+static inline int64_t
+asi64(Napi::Value v)
+{
+    return v.As<Napi::Number>().Int64Value();
+}
+
+static inline std::string
+asstr(Napi::Value v)
+{
+    return v.As<Napi::String>().Utf8Value();
+}
+
+void
+rdma_server_napi(Napi::Env env, Napi::Object exports)
+{
+    exports["RdmaServerNapi"] = RdmaServerNapi::Init(env);
+    DBG0("RDMA: server library loaded.");
+}
+
+} // namespace noobaa
diff --git a/src/native/s3select/s3select.gyp b/src/native/s3select/s3select.gyp
index c459d23725..ab3969ce5b 100644
--- a/src/native/s3select/s3select.gyp
+++ b/src/native/s3select/s3select.gyp
@@ -13,20 +13,19 @@
         'sources': [
             's3select_napi.cpp'
         ],
-        'link_settings': {
-            'libraries': ['/lib64/libboost_thread.so.1.75.0']
+    'link_settings': {
+        'libraries': ['/lib64/libboost_thread.so.1.75.0']
         },
-        'variables': {
+    'variables': {
             'BUILD_S3SELECT_PARQUET%':0
         },
-        'conditions': [
+    'conditions': [
             ['BUILD_S3SELECT_PARQUET==1', {
                 'link_settings': {
-                    'libraries': ['/lib64/libarrow.so', '/lib64/libparquet.so']
-                },
-                'cflags' : ['-D_ARROW_EXIST']
-            }]
+                'libraries': ['/lib64/libarrow.so', '/lib64/libparquet.so']
+            },
+            'defines': ['_ARROW_EXIST']
+        }]
         ],
-    }]
+    }
 }
diff --git a/src/native/tools/crypto_napi.cpp b/src/native/tools/crypto_napi.cpp
index 4cfb1ff234..9c564da0af 100644
--- a/src/native/tools/crypto_napi.cpp
+++ b/src/native/tools/crypto_napi.cpp
@@ -5,73 +5,11 @@
 #include "../util/common.h"
 #include "../util/endian.h"
 #include "../util/napi.h"
+#include "../util/worker.h"
 
 namespace noobaa
 {
 
-template <typename T>
-static Napi::Value
-api(const Napi::CallbackInfo& info)
-{
-    auto w = new T(info);
-    Napi::Promise promise = w->_deferred.Promise();
-    w->Queue();
-    return promise;
-}
-
-/**
- * CryptoWorker is a general async worker for our crypto operations
- */
-struct CryptoWorker : public Napi::AsyncWorker
-{
-    Napi::Promise::Deferred _deferred;
-    // _args_ref is used to keep refs to all the args for the worker lifetime,
-    // which is needed for workers that receive buffers,
-    // because in their ctor they copy the pointers to the buffer's memory,
-    // and if the JS caller scope does not keep a ref to the buffers until after the call,
-    // then the worker may access invalid memory...
-    Napi::ObjectReference _args_ref;
-
-    CryptoWorker(const Napi::CallbackInfo& info)
-        : AsyncWorker(info.Env())
-        , _deferred(Napi::Promise::Deferred::New(info.Env()))
-        , _args_ref(Napi::Persistent(Napi::Object::New(info.Env())))
-    {
-        for (int i = 0; i < (int)info.Length(); ++i) _args_ref.Set(i, info[i]);
-    }
-    virtual void OnOK() override
-    {
-        // LOG("CryptoWorker::OnOK: undefined");
-        _deferred.Resolve(Env().Undefined());
-    }
-    virtual void OnError(Napi::Error const& error) override
-    {
-        LOG("CryptoWorker::OnError: " << DVAL(error.Message()));
-        auto obj = error.Value();
-        _deferred.Reject(obj);
-    }
-};
-
-/**
- * CryptoWrapWorker is meant to simplify adding async CryptoWorker instance methods to ObjectWrap types
- * like MD5Wrap, while keeping the object referenced during that action.
- */ -template -struct CryptoWrapWorker : public CryptoWorker -{ - Wrapper* _wrap; - CryptoWrapWorker(const Napi::CallbackInfo& info) - : CryptoWorker(info) - { - _wrap = Wrapper::Unwrap(info.This().As()); - _wrap->Ref(); - } - ~CryptoWrapWorker() - { - _wrap->Unref(); - } -}; - struct MD5Wrap : public Napi::ObjectWrap { size_t _NWORDS = MD5_DIGEST_NWORDS; @@ -117,12 +55,12 @@ struct MD5Wrap : public Napi::ObjectWrap Napi::FunctionReference MD5Wrap::constructor; -struct MD5Update : public CryptoWrapWorker +struct MD5Update : public ObjectWrapWorker { uint8_t* _buf; size_t _len; MD5Update(const Napi::CallbackInfo& info) - : CryptoWrapWorker(info) + : ObjectWrapWorker(info) , _buf(0) , _len(0) { @@ -136,11 +74,11 @@ struct MD5Update : public CryptoWrapWorker } }; -struct MD5Digest : public CryptoWrapWorker +struct MD5Digest : public ObjectWrapWorker { std::vector _digest; MD5Digest(const Napi::CallbackInfo& info) - : CryptoWrapWorker(info) + : ObjectWrapWorker(info) { } virtual void Execute() @@ -154,20 +92,20 @@ struct MD5Digest : public CryptoWrapWorker virtual void OnOK() { Napi::Env env = Env(); - _deferred.Resolve(Napi::Buffer::Copy(env, _digest.data(), _wrap->_NWORDS)); + _promise.Resolve(Napi::Buffer::Copy(env, _digest.data(), _wrap->_NWORDS)); } }; Napi::Value MD5Wrap::update(const Napi::CallbackInfo& info) { - return api(info); + return await_worker(info); } Napi::Value MD5Wrap::digest(const Napi::CallbackInfo& info) { - return api(info); + return await_worker(info); } void diff --git a/src/native/util/backtrace.h b/src/native/util/backtrace.h index 8944bc0e71..29bcfc0027 100644 --- a/src/native/util/backtrace.h +++ b/src/native/util/backtrace.h @@ -51,15 +51,26 @@ class Backtrace if (!dladdr(trace[i], &info)) { break; } - int status; - std::string file(info.dli_fname); - std::string func(info.dli_sname); - char* demangled = abi::__cxa_demangle(info.dli_sname, NULL, 0, &status); - if (status == 0 && demangled) { - func = demangled; + std::string func; + if (info.dli_sname) { + int status = -1; + char* demangled = abi::__cxa_demangle(info.dli_sname, NULL, 0, &status); + if (status == 0 && demangled) { + func = demangled; + } else { + func = info.dli_sname; + } + if (demangled) { + free(demangled); + } + } else { + std::stringstream s; + s << "0x" << std::hex << uintptr_t(info.dli_saddr); + func = s.str(); } - if (demangled) { - free(demangled); + std::string file; + if (info.dli_fname) { + file = info.dli_fname; } if (file.empty()) { break; // entries after main diff --git a/src/native/util/common.h b/src/native/util/common.h index e8149b1b05..da11bc26df 100644 --- a/src/native/util/common.h +++ b/src/native/util/common.h @@ -32,6 +32,8 @@ namespace noobaa #endif #define DVAL(x) #x "=" << x << " " +#define DMEM(x,ptr,len) #x "=[" << ((void*)ptr) << "+" << ((void*)len) << "] " +#define DBUF(x) DMEM(x,(x).Data(),(x).Length()) extern bool LOG_TO_STDERR_ENABLED; extern bool LOG_TO_SYSLOG_ENABLED; diff --git a/src/native/util/worker.h b/src/native/util/worker.h new file mode 100644 index 0000000000..b98e166c60 --- /dev/null +++ b/src/native/util/worker.h @@ -0,0 +1,95 @@ +/* Copyright (C) 2016 NooBaa */ +#pragma once + +#include "napi.h" + +namespace noobaa +{ + +/** + * PromiseWorker is a base async worker that runs in a separate thread and + * returns a promise. It makes sure to hold reference to keep the JS arguments + * alive until the worker is done. + * Inherit from this class and override Execute() to do the async work. + * Override OnOK() to resolve the promise with the result. 
+ */ +struct PromiseWorker : public Napi::AsyncWorker +{ + Napi::Promise::Deferred _promise; + + // keep refs to all the args/this for the worker lifetime. + // this is needed mainly for workers that receive buffers, + // and uses them to access the buffer memory. + // these refs are released when the worker is deleted. + Napi::ObjectReference _args_ref; + Napi::Reference _this_ref; + + PromiseWorker(const Napi::CallbackInfo& info) + : AsyncWorker(info.Env()) + , _promise(Napi::Promise::Deferred::New(info.Env())) + , _args_ref(Napi::Persistent(Napi::Object::New(info.Env()))) + , _this_ref(Napi::Persistent(info.This())) + { + for (int i = 0; i < (int)info.Length(); ++i) _args_ref.Set(i, info[i]); + } + + /** + * This is a simple OnOK() that just resolves the promise with undefined. + * However, most workers will needs to return a value that they compute + * during Execute(), but Execute() runs in another thread and cannot access + * JS objects. Instead, Execute() should keep native values/structures in + * member variables, and override OnOK() to build the resulting JS value + * and resolve the promise with it. + */ + virtual void OnOK() override + { + // DBG1("PromiseWorker::OnOK: resolved (empty)"); + _promise.Resolve(Env().Undefined()); + } + + /** + * Handle worker error by rejecting the promise with the error message. + */ + virtual void OnError(Napi::Error const& error) override + { + LOG("PromiseWorker::OnError: " << DVAL(error.Message())); + auto obj = error.Value(); + _promise.Reject(obj); + } +}; + +/** + * ObjectWrapWorker is a base class that simplifies adding async instance methods + * to ObjectWrap types while keeping the object referenced during that action. + */ +template +struct ObjectWrapWorker : public PromiseWorker +{ + ObjectWrapType* _wrap; + ObjectWrapWorker(const Napi::CallbackInfo& info) + : PromiseWorker(info) + { + _wrap = ObjectWrapType::Unwrap(info.This().As()); + _wrap->Ref(); + } + ~ObjectWrapWorker() + { + _wrap->Unref(); + } +}; + +/** + * await_worker is a helper function to submit a PromiseWorker or ObjectWrapWorker + * WorkerType should anyway be a subclass of PromiseWorker. + */ +template +Napi::Value +await_worker(const Napi::CallbackInfo& info) +{ + PromiseWorker* worker = new WorkerType(info); + Napi::Promise promise = worker->_promise.Promise(); + worker->Queue(); // this will delete the worker when done + return promise; +} + +} // namespace noobaa diff --git a/src/native/warnings.gypi b/src/native/warnings.gypi new file mode 100644 index 0000000000..d216ca77d4 --- /dev/null +++ b/src/native/warnings.gypi @@ -0,0 +1,28 @@ +# Copyright (C) 2016 NooBaa +{ + 'variables': { + 'cflags_warnings': [ + '-W', + '-Wall', + '-Wextra', + '-Werror', + '-Wpedantic', + '-Wno-unused-parameter', + # Can be removed when https://github.com/nodejs/nan/issues/953 is resolved. 
+ '-Wno-error=deprecated-declarations', + ], + }, + + 'target_defaults': { + + 'cflags': ['<@(cflags_warnings)'], + + 'conditions' : [ + [ 'OS=="mac"', { + 'xcode_settings': { + 'WARNING_CFLAGS': ['<@(cflags_warnings)'], + }, + }], + ], + }, +} diff --git a/src/sdk/namespace_fs.js b/src/sdk/namespace_fs.js index 70b4b4f3cf..25c97af55e 100644 --- a/src/sdk/namespace_fs.js +++ b/src/sdk/namespace_fs.js @@ -8,13 +8,14 @@ const fs = require('fs'); const path = require('path'); const util = require('util'); const mime = require('mime-types'); +const stream = require('stream'); const P = require('../util/promise'); const dbg = require('../util/debug_module')(__filename); const config = require('../../config'); const crypto = require('crypto'); const s3_utils = require('../endpoint/s3/s3_utils'); +const rdma_utils = require('../util/rdma_utils'); const error_utils = require('../util/error_utils'); -const stream_utils = require('../util/stream_utils'); const buffer_utils = require('../util/buffer_utils'); const size_utils = require('../util/size_utils'); const http_utils = require('../util/http_utils'); @@ -28,6 +29,11 @@ const lifecycle_utils = require('../util/lifecycle_utils'); const NoobaaEvent = require('../manage_nsfs/manage_nsfs_events_utils').NoobaaEvent; const { PersistentLogger } = require('../util/persistent_logger'); const { Glacier } = require('./glacier'); +const { FileReader } = require('../util/file_reader'); +const Speedometer = require('../util/speedometer'); + +const speedometer = new Speedometer({ name: 'NSFS READ' }); +speedometer.start_lite(); const multi_buffer_pool = new buffer_utils.MultiSizeBuffersPool({ sorted_buf_sizes: [ @@ -43,6 +49,11 @@ const multi_buffer_pool = new buffer_utils.MultiSizeBuffersPool({ }, { size: config.NSFS_BUF_SIZE_L, sem_size: config.NSFS_BUF_POOL_MEM_LIMIT_L, + is_default: true, // use as default when size is not specified in the request + }, { + // TODO - this is a temporary solution to use larger buffers for rdma + size: 8 * config.NSFS_BUF_SIZE_L, + sem_size: config.NSFS_BUF_POOL_MEM_LIMIT_L / 2, }, ], warning_timeout: config.NSFS_BUF_POOL_WARNING_TIMEOUT, @@ -102,7 +113,7 @@ function sort_entries_by_name(a, b) { return 0; } -function _get_version_id_by_stat({ino, mtimeNsBigint}) { +function _get_version_id_by_stat({ ino, mtimeNsBigint }) { // TODO: GPFS might require generation number to be added to version_id return 'mtime-' + mtimeNsBigint.toString(36) + '-ino-' + ino.toString(36); } @@ -236,21 +247,6 @@ function is_symbolic_link(stat) { } } -/** - * NOTICE that even files that were written sequentially, can still be identified as sparse: - * 1. After writing, but before all the data is synced, the size is higher than blocks size. - * 2. For files that were moved to an archive tier. - * 3. For files that fetch and cache data from remote storage, which are still not in the cache. - * It's not good enough for avoiding recall storms as needed by _fail_if_archived_or_sparse_file. - * However, using this check is useful for guessing that a reads is going to take more time - * and avoid holding off large buffers from the buffers_pool. 
- * @param {nb.NativeFSStats} stat - * @returns {boolean} - */ -function is_sparse_file(stat) { - return (stat.blocks * 512 < stat.size); -} - /** * @param {fs.Dirent} e * @returns {string} @@ -512,7 +508,6 @@ class NamespaceFS { this.versioning = (config.NSFS_VERSIONING_ENABLED && versioning) || VERSIONING_STATUS_ENUM.VER_DISABLED; this.stats = stats; this.force_md5_etag = force_md5_etag; - this.warmup_buffer = nb_native().fs.dio_buffer_alloc(4096); } /** @@ -834,7 +829,7 @@ class NamespaceFS { if (version_id_marker) start_marker = version_id_marker; marker_index = _.findIndex( sorted_entries, - {name: start_marker} + { name: start_marker } ) + 1; } else { marker_index = _.sortedLastIndexBy( @@ -879,7 +874,7 @@ class NamespaceFS { try { dbg.warn('NamespaceFS: open dir streaming', dir_path, 'size', cached_dir.stat.size); dir_handle = await nb_native().fs.opendir(fs_context, dir_path); //, { bufferSize: 128 }); - for (;;) { + for (; ;) { const dir_entry = await dir_handle.read(fs_context); if (!dir_entry) break; await process_entry(dir_entry); @@ -975,8 +970,9 @@ class NamespaceFS { let isDir; let retries = (this._is_versioning_enabled() || this._is_versioning_suspended()) ? config.NSFS_RENAME_RETRIES : 0; try { - for (;;) { + for (; ;) { try { + object_sdk.throw_if_aborted(); file_path = await this._find_version_path(fs_context, params, true); await this._check_path_in_bucket_boundaries(fs_context, file_path); await this._load_bucket(params, fs_context); @@ -1002,6 +998,7 @@ class NamespaceFS { dbg.warn(`NamespaceFS.read_object_md: retrying retries=${retries} file_path=${file_path}`, err); retries -= 1; if (retries <= 0 || !native_fs_utils.should_retry_link_unlink(err)) throw err; + object_sdk.throw_if_aborted(); await P.delay(get_random_delay(config.NSFS_RANDOM_DELAY_BASE, 0, 50)); } } @@ -1029,25 +1026,33 @@ class NamespaceFS { } catch (err) { //failed to get object new NoobaaEvent(NoobaaEvent.OBJECT_GET_FAILED).create_event(params.key, - {bucket_path: this.bucket_path, object_name: params.key}, err); + { bucket_path: this.bucket_path, object_name: params.key }, err); dbg.log0('NamespaceFS: read_object_stream couldnt find dir content xattr', err); } } return false; } - // eslint-disable-next-line max-statements + /** + * + * @param {*} params + * @param {nb.ObjectSDK} object_sdk + * @param {nb.S3Response|stream.Writable} res + * @returns + */ async read_object_stream(params, object_sdk, res) { - let buffer_pool_cleanup = null; const fs_context = this.prepare_fs_context(object_sdk); + const signal = object_sdk.abort_controller.signal; let file_path; let file; + try { await this._load_bucket(params, fs_context); let retries = (this._is_versioning_enabled() || this._is_versioning_suspended()) ? 
config.NSFS_RENAME_RETRIES : 0; let stat; - for (;;) { + for (; ;) { try { + object_sdk.throw_if_aborted(); file_path = await this._find_version_path(fs_context, params); await this._check_path_in_bucket_boundaries(fs_context, file_path); @@ -1082,9 +1087,10 @@ class NamespaceFS { retries -= 1; if (retries <= 0 || !native_fs_utils.should_retry_link_unlink(err)) { new NoobaaEvent(NoobaaEvent.OBJECT_GET_FAILED).create_event(params.key, - {bucket_path: this.bucket_path, object_name: params.key}, err); + { bucket_path: this.bucket_path, object_name: params.key }, err); throw err; } + object_sdk.throw_if_aborted(); await P.delay(get_random_delay(config.NSFS_RANDOM_DELAY_BASE, 0, 50)); } } @@ -1098,121 +1104,72 @@ class NamespaceFS { const start = Number(params.start) || 0; const end = isNaN(Number(params.end)) ? Infinity : Number(params.end); - let num_bytes = 0; - let num_buffers = 0; - const log2_size_histogram = {}; - let drain_promise = null; + object_sdk.throw_if_aborted(); - dbg.log0('NamespaceFS: read_object_stream', { + dbg.log1('NamespaceFS: read_object_stream', { file_path, start, end, size: stat.size, }); - let count = 1; - for (let pos = start; pos < end;) { - object_sdk.throw_if_aborted(); - - // Our buffer pool keeps large buffers and we want to avoid spending - // all our large buffers and then have them waiting for high latency calls - // such as reading from archive/on-demand cache files. - // Instead, we detect the case where a file is "sparse", - // and then use just a small buffer to wait for a tiny read, - // which will recall the file from archive or load from remote into cache, - // and once it returns we can continue to the full fledged read. - if (config.NSFS_BUF_WARMUP_SPARSE_FILE_READS && is_sparse_file(stat)) { - dbg.log0('NamespaceFS: read_object_stream - warmup sparse file', { - file_path, pos, size: stat.size, blocks: stat.blocks, - }); - await file.read(fs_context, this.warmup_buffer, 0, 1, pos); - } - - const remain_size = Math.min(Math.max(0, end - pos), stat.size); - - // allocate or reuse buffer - // TODO buffers_pool and the underlying semaphore should support abort signal - // to avoid sleeping inside the semaphore until the timeout while the request is already aborted. 
- const { buffer, callback } = await multi_buffer_pool.get_buffers_pool(remain_size).get_buffer(); - buffer_pool_cleanup = callback; // must be called ***IMMEDIATELY*** after get_buffer - object_sdk.throw_if_aborted(); - - // read from file - const read_size = Math.min(buffer.length, remain_size); - const bytesRead = await file.read(fs_context, buffer, 0, read_size, pos); - if (!bytesRead) { - buffer_pool_cleanup = null; - callback(); - break; - } - object_sdk.throw_if_aborted(); - const data = buffer.slice(0, bytesRead); - - // update stats - pos += bytesRead; - num_bytes += bytesRead; - num_buffers += 1; - const log2_size = Math.ceil(Math.log2(bytesRead)); - log2_size_histogram[log2_size] = (log2_size_histogram[log2_size] || 0) + 1; - - // collect read stats - this.stats?.update_nsfs_read_stats({ - namespace_resource_id: this.namespace_resource_id, - bucket_name: params.bucket, - size: bytesRead, - count - }); - // clear count for next updates - count = 0; - - // wait for response buffer to drain before adding more data if needed - - // this occurs when the output network is slower than the input file - if (drain_promise) { - await drain_promise; - drain_promise = null; - object_sdk.throw_if_aborted(); - } + const file_reader = new FileReader({ + fs_context, + file, + file_path, + start, + end, + stat, + multi_buffer_pool, + signal, + stats: this.stats, + bucket: params.bucket, + namespace_resource_id: this.namespace_resource_id, + }); - // write the data out to response - buffer_pool_cleanup = null; // cleanup is now in the socket responsibility - const write_ok = res.write(data, null, callback); - if (!write_ok) { - drain_promise = stream_utils.wait_drain(res, { signal: object_sdk.abort_controller.signal }); - drain_promise.catch(() => undefined); // this avoids UnhandledPromiseRejection - } + const start_time = process.hrtime.bigint(); + if (params.rdma_info) { + const size = await rdma_utils.read_file_to_rdma( + params.rdma_info, + file_reader, + multi_buffer_pool, + signal, + ); + const http_res = /** @type {nb.S3Response} */ (res); + if (!http_res.setHeader) throw new Error('read_object_stream: cannot rdma to non http response'); + http_res.setHeader('Content-Length', 0); + rdma_utils.set_rdma_response_header(null, http_res, params.rdma_info, { size }); + } else { + await file_reader.read_into_stream(res); } + const took_ms = Number(process.hrtime.bigint() - start_time) / 1e6; + speedometer.update(stat.size, took_ms); - await this._glacier_force_expire_on_get(fs_context, file_path, file, stat); + // end the response stream to complete the response + res.end(); + + dbg.log1('NamespaceFS: read_object_stream completed', { + file_path, start, end, size: stat.size, + num_bytes: file_reader.num_bytes, + num_buffers: file_reader.num_buffers, + avg_buffer: file_reader.num_bytes / file_reader.num_buffers, + log2_size_histogram: file_reader.log2_size_histogram, + }); await file.close(fs_context); file = null; - object_sdk.throw_if_aborted(); - - // wait for the last drain if pending. 
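The params.rdma_info branch added above hands the GET off to rdma_utils instead of streaming the body over HTTP. nb.d.ts (further down in this patch) declares the native RdmaServerNapi binding that this path presumably drives, so the JS side of such a transfer could look roughly like the following sketch. Only the constructor, register_buffer/is_registered_buffer and rdma() signatures come from the declarations in this patch; the address/port values and the wiring inside rdma_utils are illustrative assumptions.

    // Hedged sketch of the RdmaServerNapi surface declared in nb.d.ts,
    // not the actual rdma_utils implementation (which is not shown in this excerpt).
    const nb_native = require('../util/nb_native');

    const server = new (nb_native().RdmaServerNapi)({
        ip: '192.168.0.1',   // assumed placeholder address
        port: 18515,         // assumed placeholder port
        log_level: 'ERROR',
    });

    async function rdma_get(op_key, buf, rdma_info) {
        // buffers are expected to be registered with the server before rdma() is called (assumption)
        if (!server.is_registered_buffer(buf)) server.register_buffer(buf);
        // 'GET' transfers the contents of buf to the client memory described by rdma_info,
        // and resolves with the number of bytes transferred (per the native worker above)
        return server.rdma('GET', op_key, buf, rdma_info);
    }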
- if (drain_promise) { - await drain_promise; - drain_promise = null; - object_sdk.throw_if_aborted(); - } - // end the stream - res.end(); - - await stream_utils.wait_finished(res, { signal: object_sdk.abort_controller.signal }); + // wait for the response to finish to make sure we handled the error if any object_sdk.throw_if_aborted(); + await stream.promises.finished(res, { signal }); - dbg.log0('NamespaceFS: read_object_stream completed file', file_path, { - num_bytes, - num_buffers, - avg_buffer: num_bytes / num_buffers, - log2_size_histogram, - }); + await this._glacier_force_expire_on_get(fs_context, file_path, file, stat); - // return null to signal the caller that we already handled the response + // return null to let the caller know that we already handled the response return null; } catch (err) { dbg.log0('NamespaceFS: read_object_stream error file', file_path, err); //failed to get object new NoobaaEvent(NoobaaEvent.OBJECT_STREAM_GET_FAILED).create_event(params.key, - {bucket_path: this.bucket_path, object_name: params.key}, err); + { bucket_path: this.bucket_path, object_name: params.key }, err); throw native_fs_utils.translate_error_codes(err, native_fs_utils.entity_enum.OBJECT); } finally { @@ -1224,18 +1181,6 @@ class NamespaceFS { } catch (err) { dbg.warn('NamespaceFS: read_object_stream file close error', err); } - try { - // release buffer back to pool if needed - if (buffer_pool_cleanup) { - dbg.log0('NamespaceFS: read_object_stream finally buffer_pool_cleanup', file_path); - buffer_pool_cleanup(); - } - } catch (err) { - //failed to get object - new NoobaaEvent(NoobaaEvent.OBJECT_CLEANUP_FAILED).create_event(params.key, - { bucket_path: this.bucket_path, object_name: params.key }, err); - dbg.warn('NamespaceFS: read_object_stream buffer pool cleanup error', err); - } } } @@ -1262,21 +1207,22 @@ class NamespaceFS { await this._throw_if_storage_class_not_supported(params.storage_class); upload_params = await this._start_upload(fs_context, object_sdk, file_path, params, open_mode); + let upload_res; if (!params.copy_source || upload_params.copy_res === COPY_STATUS_ENUM.FALLBACK) { // We are taking the buffer size closest to the sized upload const bp = multi_buffer_pool.get_buffers_pool(params.size); - const upload_res = await bp.sem.surround_count( + upload_res = await bp.sem.surround_count( bp.buf_size, async () => this._upload_stream(upload_params)); upload_params.digest = upload_res.digest; } const upload_info = await this._finish_upload(upload_params); - return upload_info; + return { ...upload_info, rdma_reply: upload_res.rdma_reply }; } catch (err) { this.run_update_issues_report(object_sdk, err); //filed to put object new NoobaaEvent(NoobaaEvent.OBJECT_UPLOAD_FAILED).create_event(params.key, - {bucket_path: this.bucket_path, object_name: params.key}, err); + { bucket_path: this.bucket_path, object_name: params.key }, err); dbg.warn('NamespaceFS: upload_object buffer pool cleanup error', err); throw native_fs_utils.translate_error_codes(err, native_fs_utils.entity_enum.OBJECT); } finally { @@ -1388,7 +1334,7 @@ class NamespaceFS { // xattr_copy = false implies on non server side copy fallback copy (copy status = FALLBACK) // target file can be undefined when it's a folder created and size is 0 async _finish_upload({ fs_context, params, open_mode, target_file, upload_path, file_path, digest = undefined, - copy_res = undefined, offset }) { + copy_res = undefined, offset }) { const part_upload = file_path === upload_path; const same_inode = params.copy_source && 
copy_res === COPY_STATUS_ENUM.SAME_INODE; const should_replace_xattr = params.copy_source ? copy_res === COPY_STATUS_ENUM.FALLBACK : true; @@ -1480,7 +1426,7 @@ class NamespaceFS { dbg.log2('_move_to_dest', fs_context, source_path, dest_path, target_file, open_mode, key); let retries = config.NSFS_RENAME_RETRIES; // will retry renaming a file in case of parallel deleting of the destination path - for (;;) { + for (; ;) { try { if (this._is_versioning_disabled()) { await native_fs_utils._make_path_dirs(dest_path, fs_context); @@ -1528,7 +1474,7 @@ class NamespaceFS { const is_gpfs = native_fs_utils._is_gpfs(fs_context); const is_dir_content = this._is_directory_content(latest_ver_path, key); let retries = config.NSFS_RENAME_RETRIES; - for (;;) { + for (; ;) { try { let new_ver_info; let latest_ver_info; @@ -1663,7 +1609,7 @@ class NamespaceFS { async _upload_stream({ fs_context, params, target_file, object_sdk, offset }) { const { copy_source } = params; try { - // Not using async iterators with ReadableStreams due to unsettled promises issues on abort/destroy + let rdma_reply; const md5_enabled = this._is_force_md5_enabled(object_sdk); const file_writer = new FileWriter({ target_file, @@ -1672,7 +1618,6 @@ class NamespaceFS { md5_enabled, stats: this.stats, bucket: params.bucket, - large_buf_size: multi_buffer_pool.get_buffers_pool(undefined).buf_size, namespace_resource_id: this.namespace_resource_id, }); file_writer.on('error', err => dbg.error('namespace_fs._upload_stream: error occured on FileWriter: ', err)); @@ -1683,11 +1628,17 @@ class NamespaceFS { await this.read_object_stream(copy_source, object_sdk, file_writer); } else if (params.source_params) { await params.source_ns.read_object_stream(params.source_params, object_sdk, file_writer); + } else if (params.rdma_info) { + rdma_reply = await rdma_utils.write_file_from_rdma( + params.rdma_info, + file_writer, + multi_buffer_pool, + object_sdk.abort_controller.signal, + ); } else { - await stream_utils.pipeline([params.source_stream, file_writer]); - await stream_utils.wait_finished(file_writer); + await file_writer.write_entire_stream(params.source_stream, { signal: object_sdk.abort_controller.signal }); } - return { digest: file_writer.digest, total_bytes: file_writer.total_bytes }; + return { digest: file_writer.digest, total_bytes: file_writer.total_bytes, rdma_reply }; } catch (error) { dbg.error('_upload_stream had error: ', error); throw error; @@ -1759,8 +1710,8 @@ class NamespaceFS { fs_context, path.join(params.mpu_path, 'create_object_upload'), Buffer.from(create_params), { - mode: native_fs_utils.get_umasked_mode(config.BASE_MODE_FILE), - }, + mode: native_fs_utils.get_umasked_mode(config.BASE_MODE_FILE), + }, ); return { obj_id: params.obj_id }; } catch (err) { @@ -1833,7 +1784,7 @@ class NamespaceFS { md_upload_params = { ...md_upload_params, offset, digest: upload_res.digest }; const upload_info = await this._finish_upload(md_upload_params); - return upload_info; + return { ...upload_info, rdma_reply: upload_res.rdma_reply }; } catch (err) { this.run_update_issues_report(object_sdk, err); throw native_fs_utils.translate_error_codes(err, native_fs_utils.entity_enum.OBJECT); @@ -1856,18 +1807,18 @@ class NamespaceFS { const entries = await nb_native().fs.readdir(fs_context, params.mpu_path); const multiparts = await Promise.all( entries - .filter(e => e.name.startsWith('part-')) - .map(async e => { - const num = Number(e.name.slice('part-'.length)); - const part_path = path.join(params.mpu_path, e.name); - const stat = 
await nb_native().fs.stat(fs_context, part_path); - return { - num, - size: stat.size, - etag: this._get_etag(stat), - last_modified: new Date(stat.mtime), - }; - }) + .filter(e => e.name.startsWith('part-')) + .map(async e => { + const num = Number(e.name.slice('part-'.length)); + const part_path = path.join(params.mpu_path, e.name); + const stat = await nb_native().fs.stat(fs_context, part_path); + return { + num, + size: stat.size, + etag: this._get_etag(stat), + last_modified: new Date(stat.mtime), + }; + }) ); return { is_truncated: false, @@ -2179,14 +2130,14 @@ class NamespaceFS { dbg.error(`NamespaceFS.delete_object_tagging: failed in dir ${file_path} with error: `, err); throw native_fs_utils.translate_error_codes(err, native_fs_utils.entity_enum.OBJECT); } - return {version_id: params.version_id}; + return { version_id: params.version_id }; } async put_object_tagging(params, object_sdk) { const fs_xattr = {}; const tagging = params.tagging && Object.fromEntries(params.tagging.map(tag => ([tag.key, tag.value]))); for (const [xattr_key, xattr_value] of Object.entries(tagging)) { - fs_xattr[XATTR_TAG + xattr_key] = xattr_value; + fs_xattr[XATTR_TAG + xattr_key] = xattr_value; } const fs_context = this.prepare_fs_context(object_sdk); const file_path = await this._find_version_path(fs_context, params, true); @@ -2373,7 +2324,7 @@ class NamespaceFS { // INTERNALS // /////////////// - _get_file_path({key}) { + _get_file_path({ key }) { // not allowing keys with dots follow by slash which can be treated as relative paths and "leave" the bucket_path // We are not using `path.isAbsolute` as path like '/../..' will return true and we can still "leave" the bucket_path if (key.includes('./')) throw new Error('Bad relative path key ' + key); @@ -2580,7 +2531,8 @@ class NamespaceFS { * @param {fs.Dirent} ent * @returns {string} */ - _get_version_entry_key(dir_key, ent) { + _get_version_entry_key(dir_key, ent) { + if (ent.name === config.NSFS_FOLDER_OBJECT_NAME) return dir_key; return dir_key + HIDDEN_VERSIONS_PATH + '/' + ent.name; } @@ -2628,6 +2580,7 @@ class NamespaceFS { const storage_class = Glacier.storage_class_from_xattr(stat.xattr); const size = Number(stat.xattr?.[XATTR_DIR_CONTENT] || stat.size); const tag_count = stat.xattr ? 
this._number_of_tags_fs_xttr(stat.xattr) : 0; + const restore_status = Glacier.get_restore_status(stat.xattr, new Date(), this._get_file_path({ key })); const nc_noncurrent_time = (stat.xattr?.[XATTR_NON_CURRENT_TIMESTASMP] && Number(stat.xattr[XATTR_NON_CURRENT_TIMESTASMP])) || stat.ctime.getTime(); @@ -2644,7 +2597,7 @@ class NamespaceFS { is_latest, delete_marker, storage_class, - restore_status: Glacier.get_restore_status(stat.xattr, new Date(), this._get_file_path({key})), + restore_status, xattr: to_xattr(stat.xattr), tag_count, tagging: get_tags_from_xattr(stat.xattr), @@ -2701,7 +2654,8 @@ class NamespaceFS { async _load_bucket(params, fs_context) { try { - await nb_native().fs.stat(fs_context, this.bucket_path); + // GGG HACK TODO: UNCOMMENT + // await nb_native().fs.stat(fs_context, this.bucket_path); } catch (err) { dbg.warn('_load_bucket failed, on bucket_path', this.bucket_path, 'got error', err); throw native_fs_utils.translate_error_codes(err, native_fs_utils.entity_enum.BUCKET); @@ -2972,7 +2926,7 @@ class NamespaceFS { } _get_version_id_by_xattr(stat) { - return (stat && stat.xattr[XATTR_VERSION_ID]) || 'null'; + return (stat && stat.xattr[XATTR_VERSION_ID]) || 'null'; } _get_versions_dir_path(key, is_dir_content) { @@ -3278,7 +3232,7 @@ class NamespaceFS { dbg.log1('Namespace_fs._promote_version_to_latest', params, deleted_version_info, latest_ver_path); let retries = config.NSFS_RENAME_RETRIES; - for (;;) { + for (; ;) { try { const latest_version_info = await this._get_version_info(fs_context, latest_ver_path); if (latest_version_info) return; @@ -3334,7 +3288,7 @@ class NamespaceFS { let retries = config.NSFS_RENAME_RETRIES; let latest_ver_info; let versioned_path; - for (;;) { + for (; ;) { try { latest_ver_info = await this._get_version_info(fs_context, latest_ver_path); dbg.log1('Namespace_fs._delete_latest_version:', latest_ver_info); @@ -3402,7 +3356,7 @@ class NamespaceFS { const is_gpfs = native_fs_utils._is_gpfs(fs_context); let retries = config.NSFS_RENAME_RETRIES; - for (;;) { + for (; ;) { try { const null_versioned_path_info = await this._get_version_info(fs_context, null_versioned_path); dbg.log1('Namespace_fs._delete_null_version_from_versions_directory:', null_versioned_path, null_versioned_path_info); @@ -3430,7 +3384,7 @@ class NamespaceFS { let retries = config.NSFS_RENAME_RETRIES; let upload_params; let delete_marker_version_id; - for (;;) { + for (; ;) { try { upload_params = await this._start_upload(fs_context, undefined, undefined, params, 'w'); @@ -3483,12 +3437,12 @@ class NamespaceFS { // find max past version by comparing the mtimeNsBigint val const max_entry_info = arr.reduce((acc, cur) => (cur && cur.mtimeNsBigint > acc.mtimeNsBigint ? 
cur : acc), - { mtimeNsBigint: BigInt(0), name: undefined }); + { mtimeNsBigint: BigInt(0), name: undefined }); return max_entry_info.mtimeNsBigint > BigInt(0) && this._get_version_info(fs_context, path.join(versions_dir, max_entry_info.name)); } catch (err) { dbg.warn('namespace_fs.find_max_version_past: .versions/ folder could not be found', err); - } + } } _is_hidden_version_path(dir_key) { @@ -3536,7 +3490,7 @@ class NamespaceFS { } return { move_to_versions: { src_file: dst_file, dir_file }, - move_to_dst: { src_file, dst_file, dir_file} + move_to_dst: { src_file, dst_file, dir_file } }; } catch (err) { dbg.warn('NamespaceFS._open_files couldn\'t open files', err); diff --git a/src/sdk/nb.d.ts b/src/sdk/nb.d.ts index 8a443268af..0718870c30 100644 --- a/src/sdk/nb.d.ts +++ b/src/sdk/nb.d.ts @@ -5,7 +5,7 @@ import * as mongodb from 'mongodb'; import { EventEmitter } from 'events'; import { Readable, Writable } from 'stream'; import { IncomingMessage, ServerResponse } from 'http'; -import { ObjectPart, Checksum} from '@aws-sdk/client-s3'; +import { ObjectPart, Checksum } from '@aws-sdk/client-s3'; type Semaphore = import('../util/semaphore').Semaphore; type KeysSemaphore = import('../util/keys_semaphore'); @@ -29,7 +29,19 @@ type NodeType = type S3Response = ServerResponse; type S3Request = IncomingMessage & { + query: any; + body?: any; + params: { + bucket: string; + key: string; + }, + op_name: string; object_sdk: ObjectSDK; + virtual_hosted_bucket?: string; + content_md5?: Buffer; + content_sha256_buf?: Buffer; + content_sha256_sig?: string; + chunked_content?: boolean; }; type ReplicationLogAction = 'copy' | 'delete' | 'conflict'; @@ -826,15 +838,15 @@ interface Namespace { get_blob_block_lists(params: object, object_sdk: ObjectSDK): Promise; restore_object(params: object, object_sdk: ObjectSDK): Promise; - get_object_attributes(params: object, object_sdk: ObjectSDK): Promise; + get_object_attribute?(params: object, object_sdk: ObjectSDK): Promise; } interface BucketSpace { read_account_by_access_key({ access_key: string }): Promise; read_bucket_sdk_info({ name: string }): Promise; - check_same_stat_bucket(bucket_name: string, bucket_stat: nb.NativeFSStats); // only implemented in bucketspace_fs - check_same_stat_account(account_name: string|Symbol, account_stat: nb.NativeFSStats); // only implemented in bucketspace_fs + check_same_stat_bucket(bucket_name: string, bucket_stat: nb.NativeFSStats); // only implemented in bucketspace_fs + check_same_stat_account(account_name: string | Symbol, account_stat: nb.NativeFSStats); // only implemented in bucketspace_fs list_buckets(params: object, object_sdk: ObjectSDK): Promise; read_bucket(params: object): Promise; @@ -946,6 +958,10 @@ interface Native { S3Select: { new(options: S3SelectOptions): S3Select }; select_parquet: boolean; + + RdmaServerNapi: { new(params: RdmaServerNapiParams): RdmaServerNapi }; + RdmaClientNapi: { new(): RdmaClientNapi }; + CudaMemory: { new(size: number): CudaMemory }; } interface NativeFS { @@ -1145,6 +1161,65 @@ interface S3Select { select_parquet(): Promise; } +interface RdmaInfo { + desc: string; + addr: string; + size: number; + offset: number; +} + +interface RdmaReply { + size: number; +} + +interface RdmaServerNapiParams { + ip: string; + port: number; + log_level?: 'ERROR' | 'INFO' | 'DEBUG'; + use_async_events?: boolean; + num_dcis?: number; + cq_depth?: number; + dc_key?: number; + ibv_poll_max_comp_event?: number; + service_level?: number; + min_rnr_timer?: number; + hop_limit?: number; + 
pkey_index?: number; + max_wr?: number; + max_sge?: number; + delay_mode?: number; + delay_interval?: number; +} + +interface RdmaServerNapi { + register_buffer(buf: Buffer): void; + deregister_buffer(buf: Buffer): void; + is_registered_buffer(buf: Buffer): boolean; + rdma( + op_type: 'GET' | 'PUT', + op_key: string, + buf: Buffer, + rdma_info: RdmaInfo, + ): Promise; +} + +interface RdmaClientNapi { + rdma( + op_type: 'GET' | 'PUT', + buf: Buffer, + func: (rdma_info: RdmaInfo, callback: NodeCallback) => void, + ): Promise; +} + +interface CudaMemory { + free(): void; + fill(value: number, start?: number, end?: number): number; + as_buffer(start?: number, end?: number): Buffer; + copy_to_host_new(start?: number, end?: number): Buffer; + copy_to_host(buffer: Buffer, start?: number, end?: number): number; + copy_from_host(buffer: Buffer, start?: number, end?: number): number; +} + type NodeCallback = (err: Error | null, res?: T) => void; type RestoreState = 'CAN_RESTORE' | 'ONGOING' | 'RESTORED'; @@ -1178,4 +1253,4 @@ interface GetObjectAttributesParts { MaxParts?: number; IsTruncated?: boolean; Parts?: ObjectPart[]; - } +} diff --git a/src/test/qa/capacity.js b/src/test/qa/capacity.js index 43c6b81ce5..e1cf87bbe1 100644 --- a/src/test/qa/capacity.js +++ b/src/test/qa/capacity.js @@ -1,17 +1,17 @@ /* Copyright (C) 2016 NooBaa */ 'use strict'; -const _ = require('lodash'); const AWS = require('aws-sdk'); const argv = require('minimist')(process.argv); const http = require('http'); const https = require('https'); -const P = require('../../util/promise'); const Speedometer = require('../../util/speedometer'); const RandStream = require('../../util/rand_stream'); +// @ts-ignore http.globalAgent.keepAlive = true; +// @ts-ignore https.globalAgent.keepAlive = true; if (argv.endpoint) { @@ -27,7 +27,6 @@ argv.concur = argv.concur || 16; argv.count = argv.count || 1; argv.workers = argv.workers || 1; -const speedometer = new Speedometer('Capacity Upload Speed'); const s3 = new AWS.S3({ endpoint: argv.endpoint, accessKeyId: argv.access_key, @@ -37,9 +36,6 @@ const s3 = new AWS.S3({ computeChecksums: argv.checksum || false, // disabled by default for performance s3DisableBodySigning: !argv.signing || true, // disabled by default for performance region: argv.region || 'us-east-1', - params: { - Bucket: argv.bucket - }, }); // AWS config does not use https.globalAgent @@ -54,6 +50,7 @@ if (s3.endpoint.protocol === 'https:') { } }); if (!argv.selfsigned) { + // @ts-ignore AWS.events.on('error', err => { if (err.message === 'self signed certificate') { setTimeout(() => console.log( @@ -64,17 +61,34 @@ if (s3.endpoint.protocol === 'https:') { } } -function upload_file() { +const speedometer = new Speedometer({ + name: 'Capacity Upload Speed', + argv, + workers_func, +}); +speedometer.start(); + +async function workers_func() { + await Promise.all(Array(argv.workers).fill(0).map(() => worker())); +} + +async function worker() { + for (let i = 0; i < argv.count; ++i) { + await speedometer.measure(upload_file); + } +} + +async function upload_file() { const key = `${argv.dir}capacity-${Date.now()}-${Math.floor(Math.random() * 1000000)}`; console.log(ts(), 'upload start:', key, '...'); - let last_progress = 0; const upload_params = { + Bucket: argv.bucket, Key: key, Body: new RandStream(argv.file_size, { highWaterMark: argv.part_size, }), ContentType: 'application/octet-stream', - ContentLength: argv.file_size + ContentLength: argv.file_size, }; const upload = argv.multipart ? 
s3.upload(upload_params, { @@ -82,31 +96,20 @@ function upload_file() { queueSize: argv.concur }) : s3.putObject(upload_params); + + let last_progress = 0; upload.on('httpUploadProgress', progress => { speedometer.update(progress.loaded - last_progress); last_progress = progress.loaded; }); - return P.fromCallback(callback => upload.send(callback)) - .then(() => console.log(ts(), 'upload done.', key)) - .catch(err => { - console.error(ts(), 'UPLOAD ERROR', err); - return P.delay(1000); - }); -} -function main() { - P.all(_.times(argv.workers, function worker() { - if (argv.count <= 0) return; - argv.count -= 1; - return upload_file().then(worker); - })) - .then(() => speedometer.report()); + await upload.send(); + console.log(ts(), 'upload done.', key); + + // size already counted by httpUploadProgress event + return 0; } function ts() { return new Date().toISOString(); } - -if (require.main === module) { - main(); -} diff --git a/src/test/unit_tests/jest_tests/test_file_reader.test.js b/src/test/unit_tests/jest_tests/test_file_reader.test.js new file mode 100644 index 0000000000..46642d9cca --- /dev/null +++ b/src/test/unit_tests/jest_tests/test_file_reader.test.js @@ -0,0 +1,133 @@ +/* Copyright (C) 2020 NooBaa */ +'use strict'; + +const fs = require('fs'); +const path = require('path'); +const assert = require('assert'); +const buffer_utils = require('../../../util/buffer_utils'); +const native_fs_utils = require('../../../util/native_fs_utils'); +const { FileReader } = require('../../../util/file_reader'); +const { multi_buffer_pool } = require('../../../sdk/namespace_fs'); + +const fs_context = {}; + +describe('FileReader', () => { + + const test_files = fs.readdirSync(__dirname).map(file => path.join(__dirname, file)); + + /** + * @param {(file_path: string, start?: number, end?: number) => void} tester + */ + function describe_read_cases(tester) { + describe('list files and read entire', () => { + for (const file_path of test_files) { + tester(file_path); + } + }); + describe('skip start cases', () => { + tester(__filename, 1, Infinity); + tester(__filename, 3, Infinity); + tester(__filename, 11, Infinity); + tester(__filename, 1023, Infinity); + tester(__filename, 1024, Infinity); + tester(__filename, 1025, Infinity); + }); + describe('edge cases', () => { + tester(__filename, 0, 1); + tester(__filename, 0, 2); + tester(__filename, 0, 3); + tester(__filename, 1, 2); + tester(__filename, 1, 3); + tester(__filename, 2, 3); + tester(__filename, 0, 1023); + tester(__filename, 0, 1024); + tester(__filename, 0, 1025); + tester(__filename, 1, 1023); + tester(__filename, 1, 1024); + tester(__filename, 1, 1025); + tester(__filename, 1023, 1024); + tester(__filename, 1023, 1025); + tester(__filename, 1024, 1025); + tester(__filename, 123, 345); + tester(__filename, 1000000000, Infinity); + }); + } + + describe('as stream.Readable', () => { + + describe_read_cases(tester); + + function tester(file_path, start = 0, end = Infinity) { + const basename = path.basename(file_path); + it(`test read ${start}-${end} ${basename}`, async () => { + await native_fs_utils.use_file({ + fs_context, + bucket_path: file_path, + open_path: file_path, + scope: async file => { + const stat = await file.stat(fs_context); + const aborter = new AbortController(); + const signal = aborter.signal; + const file_reader = new FileReader({ + fs_context, + file, + file_path, + stat, + start, + end, + signal, + multi_buffer_pool, + highWaterMark: 1024, // bytes + }); + const data = await 
buffer_utils.read_stream_join(file_reader); + const node_fs_stream = fs.createReadStream(file_path, { start, end: end > 0 ? end - 1 : 0 }); + const node_fs_data = await buffer_utils.read_stream_join(node_fs_stream); + assert.strictEqual(data.length, node_fs_data.length); + assert.strictEqual(data.toString(), node_fs_data.toString()); + } + }); + }); + } + }); + + describe('read_into_stream with buffer pooling', () => { + + describe_read_cases(tester); + + function tester(file_path, start = 0, end = Infinity) { + const basename = path.basename(file_path); + it(`test read ${start}-${end} ${basename}`, async () => { + await native_fs_utils.use_file({ + fs_context, + bucket_path: file_path, + open_path: file_path, + scope: async file => { + const stat = await file.stat(fs_context); + const aborter = new AbortController(); + const signal = aborter.signal; + const file_reader = new FileReader({ + fs_context, + file, + file_path, + stat, + start, + end, + signal, + multi_buffer_pool, + highWaterMark: 1024, // bytes + }); + const writable = buffer_utils.write_stream(); + await file_reader.read_into_stream(writable); + const data = writable.join(); + const node_fs_stream = fs.createReadStream(file_path, { start, end: end > 0 ? end - 1 : 0 }); + const node_fs_data = await buffer_utils.read_stream_join(node_fs_stream); + assert.strictEqual(data.length, node_fs_data.length); + assert.strictEqual(data.toString(), node_fs_data.toString()); + } + }); + }); + } + + }); + +}); diff --git a/src/test/unit_tests/test_chunk_coder.js b/src/test/unit_tests/test_chunk_coder.js index 3162b5d8b9..8be3dd389c 100644 --- a/src/test/unit_tests/test_chunk_coder.js +++ b/src/test/unit_tests/test_chunk_coder.js @@ -254,7 +254,7 @@ mocha.describe('nb_native chunk_coder', function() { async function test_stream({ erase, decode, generator, input_size, chunk_split_config, chunk_coder_config }) { try { - const speedometer = new Speedometer('Chunk Coder Speed'); + const speedometer = new Speedometer({ name: 'Chunk Coder Speed' }); const input = new RandStream(input_size, { highWaterMark: 16 * 1024, @@ -285,27 +285,28 @@ async function test_stream({ erase, decode, generator, input_size, chunk_split_c coder: 'dec', }); - const reporter = new stream.Transform({ + let count = 0; + let pos = 0; + const reporter = new stream.Writable({ objectMode: true, - allowHalfOpen: false, highWaterMark: 50, - transform(chunk, encoding, callback) { - this.count = (this.count || 0) + 1; - this.pos = this.pos || 0; + write(chunk, encoding, callback) { + count += 1; // checking the position is continuous - assert.strictEqual(this.pos, chunk.pos); - this.pos += chunk.size; + assert.strictEqual(pos, chunk.pos); + pos += chunk.size; speedometer.update(chunk.size); callback(); }, - flush(callback) { + final(callback) { speedometer.clear_interval(); - // speedometer.report(); - // console.log('AVERAGE CHUNK SIZE', (this.pos / this.count).toFixed(0)); + speedometer.summary(); + console.log('AVERAGE CHUNK SIZE', (pos / count).toFixed(0)); callback(); } }); + /** @type {(stream.Readable | stream.Transform | stream.Writable)[]} */ const transforms = [input, splitter, coder, diff --git a/src/tools/coding_speed.js b/src/tools/coding_speed.js index b8803f452c..922015e4e3 100644 --- a/src/tools/coding_speed.js +++ b/src/tools/coding_speed.js @@ -33,10 +33,15 @@ argv.verbose = Boolean(argv.verbose); // default is false argv.sse_c = Boolean(argv.sse_c); // default is false delete argv._; -const speedometer = new Speedometer('Chunk Coder Speed'); 
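The Speedometer rework applied across capacity, coding_speed, cpu_speed and fs_speed in this patch replaces the positional name argument and run_workers() with an options object plus start(), and wraps each operation in measure(). A condensed sketch of that usage pattern, inferred from these diffs (the option names are taken verbatim from the diffs; the Speedometer internals are not part of this excerpt, so the semantics noted in the comments are inferred, not confirmed):

    // Inferred usage pattern for the reworked Speedometer API, as seen in the tool diffs.
    const argv = require('minimist')(process.argv);
    const Speedometer = require('../util/speedometer');

    const speedometer = new Speedometer({
        name: 'Example Speed',
        argv,
        num_workers: argv.forks,   // primary appears to fork this many workers and aggregate their reports
        workers_func,              // each worker runs this async function
    });
    speedometer.start();

    async function workers_func() {
        for (let i = 0; i < (argv.count || 1); ++i) {
            // measure() appears to time the async op and count the returned byte size
            await speedometer.measure(async () => {
                const buf = Buffer.alloc(1024 * 1024);
                return buf.length;
            });
        }
    }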
-speedometer.run_workers(argv.forks, main, argv); +const speedometer = new Speedometer({ + name: 'Chunk Coder Speed', + argv, + num_workers: argv.forks, + workers_func, +}); +speedometer.start(); -function main() { +async function workers_func() { const chunk_split_config = { avg_chunk: config.CHUNK_SPLIT_AVG_CHUNK, @@ -127,22 +132,22 @@ function main() { transforms.push(new FlattenStream()); } transforms.push(reporter); - return stream.promises.pipeline(transforms) - .then(() => { - console.log('AVERAGE CHUNK SIZE', (total_size / num_parts).toFixed(0)); - if (splitter.md5) { - console.log('MD5 =', splitter.md5.toString('base64')); - } - if (splitter.sha256) { - console.log('SHA256 =', splitter.sha256.toString('base64')); - } - }) - .catch(err => { - if (!err.chunks) throw err; - let message = ''; - for (const chunk of err.chunks) { - message += 'CHUNK ERRORS: ' + chunk.errors.join(',') + '\n'; - } - throw new Error(err.message + '\n' + message); - }); + + try { + await stream.promises.pipeline(transforms); + console.log('AVERAGE CHUNK SIZE', (total_size / num_parts).toFixed(0)); + if (splitter.md5) { + console.log('MD5 =', splitter.md5.toString('base64')); + } + if (splitter.sha256) { + console.log('SHA256 =', splitter.sha256.toString('base64')); + } + } catch (err) { + if (!err.chunks) throw err; + let message = ''; + for (const chunk of err.chunks) { + message += 'CHUNK ERRORS: ' + chunk.errors.join(',') + '\n'; + } + throw new Error(err.message + '\n' + message); + } } diff --git a/src/tools/cpu_speed.js b/src/tools/cpu_speed.js index 527e56bac3..119651844d 100644 --- a/src/tools/cpu_speed.js +++ b/src/tools/cpu_speed.js @@ -4,29 +4,34 @@ require('../util/fips'); const crypto = require('crypto'); const argv = require('minimist')(process.argv); +const setImmediateAsync = require('timers/promises').setImmediate; const Speedometer = require('../util/speedometer'); require('../util/console_wrapper').original_console(); -argv.forks = argv.forks || 1; -argv.size = argv.size || (10 * 1024); +argv.forks = Number(argv.forks ?? 1); +argv.size = Number(argv.size ?? 
(10 * 1024)); argv.hash = argv.hash || 'sha256'; -const speedometer = new Speedometer(`CPU(${argv.hash})`); -speedometer.run_workers(argv.forks, main, argv); +const speedometer = new Speedometer({ + name: `CPU(${argv.hash})`, + argv, + num_workers: argv.forks, + workers_func, +}); +speedometer.start(); -function main() { +async function workers_func() { const hasher = crypto.createHash(argv.hash); const buf = crypto.randomBytes(1024 * 1024); let size = argv.size * 1024 * 1024; console.log(`Crunching ${argv.size} MB with ${argv.hash}...`); - run(); - - function run() { - if (size <= 0) process.exit(); - hasher.update(buf); - speedometer.update(buf.length); + while (size > 0) { + await speedometer.measure(async () => { + hasher.update(buf); + return buf.length; + }); size -= buf.length; - setImmediate(run); + await setImmediateAsync(); // release CPU } } diff --git a/src/tools/file_writer_hashing.js b/src/tools/file_writer_hashing.js index e3f2c980dc..6542a9d3ed 100644 --- a/src/tools/file_writer_hashing.js +++ b/src/tools/file_writer_hashing.js @@ -6,7 +6,6 @@ const assert = require('assert'); const FileWriter = require('../util/file_writer'); const config = require('../../config'); const nb_native = require('../util/nb_native'); -const stream_utils = require('../util/stream_utils'); const P = require('../util/promise'); const stream = require('stream'); const fs = require('fs'); @@ -72,12 +71,11 @@ async function hash_target(chunk_size = CHUNK, parts = PARTS, iov_max = IOV_MAX) }()); const target = new TargetHash(); const file_writer = new FileWriter({ - target_file: target, + target_file: /**@type {any}*/ (target), fs_context: DEFAULT_FS_CONFIG, namespace_resource_id: 'MajesticSloth' }); - await stream_utils.pipeline([source_stream, file_writer]); - await stream_utils.wait_finished(file_writer); + await file_writer.write_entire_stream(source_stream); const write_hash = target.digest(); console.log( 'Hash target', @@ -114,8 +112,7 @@ async function file_target(chunk_size = CHUNK, parts = PARTS, iov_max = IOV_MAX) fs_context: DEFAULT_FS_CONFIG, namespace_resource_id: 'MajesticSloth' }); - await stream_utils.pipeline([source_stream, file_writer]); - await stream_utils.wait_finished(file_writer); + await file_writer.write_entire_stream(source_stream); if (XATTR) { await target_file.replacexattr( DEFAULT_FS_CONFIG, diff --git a/src/tools/fs_speed.js b/src/tools/fs_speed.js index 52d2246841..832d542696 100644 --- a/src/tools/fs_speed.js +++ b/src/tools/fs_speed.js @@ -6,39 +6,37 @@ require('../util/panic'); const fs = require('fs'); const util = require('util'); const path = require('path'); +const crypto = require('crypto'); const argv = require('minimist')(process.argv); -const { cluster } = require('../util/fork_utils'); const execAsync = util.promisify(require('child_process').exec); const Speedometer = require('../util/speedometer'); -const RandStream = require('../util/rand_stream'); function print_usage() { console.log(` Usage: --help show this usage - --dir (default "./fs_speed_output") where to write the files + --path (default "./fs_speed_output") where to write the files --time (default 10) limit time to run --concur (default 1) number of concurrent writers --forks (default 1) number of forks to create (total writers is concur * forks). 
-Sizes: - --file_size (default 1024 MB) file size to write - --block_size (default 8 MB) block size to write - --file_size_units (default is "MB") options are "GB", "MB", "KB", "B" - --block_size_units (default is "MB") options are "GB", "MB", "KB", "B" Modes: - --read invoke reads instead of writes. - --fsync trigger fsync at the end of each file + --write do write test (default is false) + --fsync trigger fsync at the end of each file (default is true) --mode (default is "nsfs") options are "nsfs" use the native fs_napi module used in nsfs "nodejs" use nodejs fs module "dd" execute dd commands +Sizes: + --file_size (default 64 MB) file size to write + --block_size (default 8 MB) block size to write + --file_size_units (default is "MB") options are "GB", "MB", "KB", "B" + --block_size_units (default is "MB") options are "GB", "MB", "KB", "B" Advanced: --device (default is "/dev/zero") input device to use for dd mode - --generator (default is "zeros") options are from rand stream (not for dd mode) --nvec (default is 1) split blocks to use writev if > 1 (not for dd mode) Example: - node src/tools/fs_speed --dir /mnt/fs/fs_speed_output --time 30 --concur 4 --file_size 256 --block_size 4 --fsync --mode dd + node src/tools/fs_speed --path /mnt/fs/fs_speed_output --time 30 --forks 16 `); } @@ -47,15 +45,16 @@ if (argv.help) { process.exit(0); } -argv.dir = argv.dir || 'fs_speed_output'; -argv.time = argv.time || 10; // stop after X seconds -argv.concur = argv.concur || 1; -argv.forks = argv.forks || 1; -argv.file_size = argv.file_size || 1024; -argv.block_size = argv.block_size || 8; +argv.path = argv.path || 'fs_speed_output'; +argv.write = Boolean(argv.write || false); +argv.time = Number(argv.time ?? 10); // stop after X seconds +argv.concur = Number(argv.concur ?? 1); +argv.forks = Number(argv.forks ?? 1); +argv.file_size = Number(argv.file_size ?? 64); +argv.block_size = Number(argv.block_size ?? 8); argv.file_size_units = argv.file_size_units || 'MB'; argv.block_size_units = argv.block_size_units || 'MB'; -argv.fsync = Boolean(argv.fsync); +argv.fsync = Boolean(argv.fsync ?? 
true); // true unless otherwise specified argv.mode = argv.mode || 'nsfs'; argv.backend = argv.backend || 'GPFS'; if (argv.mode === 'dd') { @@ -64,12 +63,8 @@ if (argv.mode === 'dd') { // flags that are ignored on dd mode // nvec larger than 1 will use writev instead of write argv.nvec = argv.nvec || 1; - // generator value should be one that RandStream supports - 'crypto' | 'cipher' | 'fake' | 'zeros' | 'fill' | 'noinit' - argv.generator = argv.generator || 'zeros'; } - Object.freeze(argv); -console.log(argv); if (!['nsfs', 'nodejs', 'dd'].includes(argv.mode)) { throw new Error('Invalid mode ' + argv.mode); @@ -89,57 +84,105 @@ if (!size_units_table[argv.block_size_units]) { const block_size = argv.block_size * size_units_table[argv.block_size_units]; const file_size = argv.file_size * size_units_table[argv.file_size_units]; +const size_name = String(argv.file_size) + String(argv.file_size_units); const block_count = Math.ceil(file_size / block_size); const file_size_aligned = block_count * block_size; const nb_native = argv.mode === 'nsfs' && require('../util/nb_native'); -const is_master = cluster.isPrimary; -const start_time = Date.now(); -const end_time = start_time + (argv.time * 1000); -const speedometer = new Speedometer('FS Speed'); -speedometer.run_workers(argv.forks, main, argv); +const fs_context = { + backend: argv.backend, + warn_threshold_ms: 10000, +}; + +const speedometer = new Speedometer({ + name: 'FS Speed', + argv, + num_workers: argv.forks, + primary_init, + workers_init, + workers_func, +}); +speedometer.start(); + +let _read_files = []; + +async function primary_init() { + if (!argv.write) { + const dir = path.join(argv.path, size_name); + console.log('Reading dir', dir, '(recursive) ...'); + const entries = await fs.promises.readdir(dir, { recursive: true, withFileTypes: true }); + for (const entry of entries) { + if (entry.isFile()) { + const file_path = path.join(entry.parentPath, entry.name); + _read_files.push(file_path); + } + } + console.log('Found', _read_files.length, 'files to read in dir', dir); + return _read_files; + } +} -async function main() { +async function workers_init(worker_id, worker_info) { + if (!argv.write) { + _read_files = worker_info; + console.log('Workers got', _read_files.length, 'files to read'); + } +} + +async function workers_func(worker_id, worker_info) { // nb_native().fs.set_debug_level(5); const promises = []; - fs.mkdirSync(argv.dir, { recursive: true }); - for (let i = 0; i < argv.concur; ++i) promises.push(worker(i)); + fs.mkdirSync(argv.path, { recursive: true }); + for (let i = 0; i < argv.concur; ++i) promises.push(io_worker(worker_id, i)); await Promise.all(promises); - speedometer.clear_interval(); - if (is_master) speedometer.report(); - process.exit(0); } /** - * @param {number} id + * @param {number} worker_id + * @param {number} io_worker_id */ -async function worker(id) { +async function io_worker(worker_id, io_worker_id) { const dir = path.join( - argv.dir, - `${id}`, // first level is id so that repeating runs will be collected together - // `pid-${process.pid}`, + argv.path, + size_name, + `${worker_id}`, + `${io_worker_id}`, ); await fs.promises.mkdir(dir, { recursive: true }); + const worker_buf = Buffer.allocUnsafeSlow(block_size); + + const start_time = Date.now(); + const end_time = start_time + (argv.time * 1000); + + for (; ;) { + const now = Date.now(); + if (now >= end_time) break; + + let file_path; + const hash_dir = path.join(dir, String(now % 256)); + if (argv.write) { + file_path = 
path.join(hash_dir, `file${size_name}-${now.toString(36)}`); + } else { + file_path = _read_files[crypto.randomInt(0, _read_files.length)]; + } - let file_id = 0; - for (;;) { - const file_start_time = Date.now(); - if (file_start_time >= end_time) break; - const file_path = path.join(dir, `file-${file_id}`); - file_id += 1; try { - if (argv.mode === 'nsfs') { - await work_with_nsfs(file_path); - } else if (argv.mode === 'nodejs') { - await work_with_nodejs(file_path); - } else if (argv.mode === 'dd') { - await work_with_dd(file_path); - } - const took_ms = Date.now() - file_start_time; - speedometer.add_op(took_ms); + await speedometer.measure(async () => { + if (argv.mode === 'nsfs') { + return work_with_nsfs(file_path, worker_buf); + } else if (argv.mode === 'nodejs') { + return work_with_nodejs(file_path, worker_buf); + } else if (argv.mode === 'dd') { + return work_with_dd(file_path); + } + }); } catch (err) { - if (argv.read && err.code === 'ENOENT') { - file_id = 0; + if (err.code === 'ENOENT') { + if (argv.write) { + await fs.promises.mkdir(hash_dir, { recursive: true }); + } else { + console.warn('file not found', file_path); + } } else { throw err; } @@ -147,77 +190,59 @@ async function worker(id) { } } -async function work_with_dd(file_path) { - const cmd = argv.read ? - `dd if=${file_path} of=/dev/null bs=${block_size} count=${block_count}` : - `dd if=${argv.device} of=${file_path} bs=${block_size} count=${block_count}`; - // console.log(cmd); - await execAsync(cmd); - if (argv.fsync) await execAsync(`sync ${file_path}`); - speedometer.update(file_size_aligned); -} - -async function work_with_nsfs(file_path) { - const rand_stream = new RandStream(file_size_aligned, { - highWaterMark: 2 * block_size, - generator: argv.read ? 'noinit' : argv.generator, - }); - const fs_context = { - // uid: 666, - // gid: 666, - backend: argv.backend, - warn_threshold_ms: 10000, - }; - const file = await nb_native().fs.open(fs_context, file_path, argv.read ? 'r' : 'w', 0o660); - for (let pos = 0; pos < file_size_aligned; pos += block_size) { - const buf_start_time = Date.now(); - if (buf_start_time >= end_time) break; - const buf = rand_stream.generator(block_size); - if (argv.nvec > 1) { - if (argv.read) { - // await file.readv(fs_context, split_to_nvec(buf, argv.nvec)); +async function work_with_nsfs(file_path, buf) { + const file = await nb_native().fs.open(fs_context, file_path, argv.write ? 'w' : 'r', 0o660); + if (!argv.write) { + const stat = await file.stat(fs_context); + if (stat.size !== file_size_aligned) { + throw new Error(`File size mismatch: expected ${file_size_aligned}, got ${stat.size}`); + } + } + for (let pos = 0; pos < file_size_aligned; pos += buf.length) { + if (argv.write) { + await (argv.nvec > 1 ? 
+ file.writev(fs_context, split_to_nvec(buf, argv.nvec)) : + file.write(fs_context, buf, buf.length, pos)); + } else { + if (argv.nvec > 1) { throw new Error('TODO: readv is not yet available in NativeFile'); - } else { - await file.writev(fs_context, split_to_nvec(buf, argv.nvec)); } - } else if (argv.read) { await file.read(fs_context, buf, 0, buf.length, pos); - } else { - await file.write(fs_context, buf, buf.length, pos); } - speedometer.update(block_size); + speedometer.update(buf.length); } - if (argv.fsync) await file.fsync(fs_context); + if (argv.write && argv.fsync) await file.fsync(fs_context); await file.close(fs_context); } -async function work_with_nodejs(file_path) { - const rand_stream = new RandStream(file_size_aligned, { - highWaterMark: 2 * block_size, - generator: argv.read ? 'noinit' : argv.generator, - }); - const file = await fs.promises.open(file_path, argv.read ? 'r' : 'w', 0o660); - for (let pos = 0; pos < file_size_aligned; pos += block_size) { - const buf_start_time = Date.now(); - if (buf_start_time >= end_time) break; - const buf = rand_stream.generator(block_size); - if (argv.nvec > 1) { - if (argv.read) { - await file.readv(split_to_nvec(buf, argv.nvec)); - } else { - await file.writev(split_to_nvec(buf, argv.nvec)); - } - } else if (argv.read) { - await file.read(buf); +async function work_with_nodejs(file_path, buf) { + const file = await fs.promises.open(file_path, argv.write ? 'w' : 'r', 0o660); + for (let pos = 0; pos < file_size_aligned; pos += buf.length) { + if (argv.write) { + await (argv.nvec > 1 ? + file.writev(split_to_nvec(buf, argv.nvec)) : + file.write(buf)); } else { - await file.write(buf); + await (argv.nvec > 1 ? + file.readv(split_to_nvec(buf, argv.nvec)) : + file.read(buf)); } - speedometer.update(block_size); + speedometer.update(buf.length); } - if (argv.fsync) await file.sync(); + if (argv.write && argv.fsync) await file.sync(); await file.close(); } +async function work_with_dd(file_path) { + const cmd = argv.write ? 
+ `dd if=${argv.device} of=${file_path} bs=${block_size} count=${block_count}` : + `dd if=${file_path} of=/dev/null bs=${block_size} count=${block_count}`; + // console.log(cmd); + await execAsync(cmd); + if (argv.write && argv.fsync) await execAsync(`sync ${file_path}`); + speedometer.update(file_size_aligned); +} + function split_to_nvec(buf, nvec) { const len = Math.ceil(buf.length / nvec); const bufs = []; diff --git a/src/tools/http_speed.go b/src/tools/http_speed.go deleted file mode 100644 index 2dc2abe924..0000000000 --- a/src/tools/http_speed.go +++ /dev/null @@ -1,200 +0,0 @@ -package main - -import ( - "bytes" - "crypto/rand" - "crypto/rsa" - "crypto/tls" - "crypto/x509" - "crypto/x509/pkix" - "encoding/pem" - "flag" - "fmt" - "io" - "log" - "math/big" - "net" - "net/http" - "os" - "runtime/pprof" - "strconv" - "time" -) - -var client = flag.Bool("client", false, "run client") -var ssl = flag.Bool("ssl", false, "use ssl") -var port = flag.Int("port", 50505, "tcp port to use") -var prof = flag.String("prof", "", "write cpu profile to file") -var reqsizeMB = flag.Int("size", 1024, "request size in MB") -var bufsize = flag.Int("buf", 128*1024, "memory buffer size") - -func main() { - flag.Parse() - if *prof != "" { - f, err := os.Create(*prof) - if err != nil { - log.Fatal(err) - } - pprof.StartCPUProfile(f) - defer pprof.StopCPUProfile() - } - if *client { - runSender() - } else { - runReceiver() - } -} - -type reqBodyReader struct { - io.ReadCloser - n uint64 - reqsize uint64 - speedometer *Speedometer -} - -func (r *reqBodyReader) Read(p []byte) (n int, err error) { - if r.n > r.reqsize { - return 0, io.EOF - } - l := uint64(len(p)) - r.n += l - r.speedometer.Update(l) - return len(p), nil -} - -func (r *reqBodyReader) Close() error { - return nil -} - -func runSender() { - var addr string - var client *http.Client - var speedometer Speedometer - - if *ssl { - addr = "https://localhost:" + strconv.Itoa(*port) - client = &http.Client{ - Transport: &http.Transport{ - TLSClientConfig: &tls.Config{ - InsecureSkipVerify: true, - }, - }, - } - } else { - addr = "http://localhost:" + strconv.Itoa(*port) - client = &http.Client{} - } - - speedometer.Init() - - for { - req, err := http.NewRequest("PUT", addr, &reqBodyReader{ - reqsize: uint64(*reqsizeMB * 1024 * 1024), - speedometer: &speedometer, - }) - fatal(err) - res, err := client.Do(req) - fatal(err) - err = res.Body.Close() - fatal(err) - } -} - -func runReceiver() { - var speedometer Speedometer - - speedometer.Init() - - server := &http.Server{ - Addr: ":" + strconv.Itoa(*port), - Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - for { - buf := make([]byte, *bufsize) - nread, err := r.Body.Read(buf) - if err == io.EOF { - break - } - fatal(err) - speedometer.Update(uint64(nread)) - } - w.WriteHeader(200) - }), - } - - if *ssl { - server.TLSConfig = &tls.Config{Certificates: []tls.Certificate{GenerateCert()}} - server.ListenAndServeTLS("", "") - } else { - fmt.Println("Listening on port", *port) - server.ListenAndServe() - } -} - -// Speedometer is a speed measurement util -type Speedometer struct { - bytes uint64 - lastBytes uint64 - lastTime time.Time -} - -// Init a speedometer -func (s *Speedometer) Init() { - s.lastTime = time.Now() -} - -// Update a speedometer -func (s *Speedometer) Update(bytes uint64) { - s.bytes += bytes - took := time.Since(s.lastTime).Seconds() - if took >= 1 { - fmt.Printf("%7.1f MB/sec \n", float64(s.bytes-s.lastBytes)/1024/1024/took) - s.lastTime = time.Now() - s.lastBytes 
= s.bytes - } -} - -func fatal(err error) { - if err != nil { - log.Panic(err) - } -} - -// GenerateCert generates a self signed TLS certificate -func GenerateCert() tls.Certificate { - - cert := &x509.Certificate{ - SerialNumber: big.NewInt(1), - Subject: pkix.Name{Organization: []string{"Acme Co"}}, - NotBefore: time.Now(), - NotAfter: time.Now().AddDate(1, 0, 0), - IsCA: true, - BasicConstraintsValid: true, - KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature | x509.KeyUsageCertSign, - ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth, x509.ExtKeyUsageClientAuth}, - IPAddresses: []net.IP{net.ParseIP("127.0.0.1")}, - DNSNames: nil, - } - - privateKey, err := rsa.GenerateKey(rand.Reader, 4096) - fatal(err) - - certBytes, err := x509.CreateCertificate(rand.Reader, cert, cert, &privateKey.PublicKey, privateKey) - fatal(err) - - certPEM := new(bytes.Buffer) - fatal(pem.Encode(certPEM, &pem.Block{ - Type: "CERTIFICATE", - Bytes: certBytes, - })) - - privateKeyPEM := new(bytes.Buffer) - fatal(pem.Encode(privateKeyPEM, &pem.Block{ - Type: "RSA PRIVATE KEY", - Bytes: x509.MarshalPKCS1PrivateKey(privateKey), - })) - - tlsCert, err := tls.X509KeyPair(certPEM.Bytes(), privateKeyPEM.Bytes()) - fatal(err) - - return tlsCert -} diff --git a/src/tools/http_speed.js b/src/tools/http_speed.js index 8c9228fc36..ae70651651 100644 --- a/src/tools/http_speed.js +++ b/src/tools/http_speed.js @@ -4,45 +4,58 @@ const argv = require('minimist')(process.argv); const http = require('http'); const https = require('https'); -const stream = require('stream'); const crypto = require('crypto'); -const cluster = require('cluster'); const ssl_utils = require('../util/ssl_utils'); const semaphore = require('../util/semaphore'); +const http_utils = require('../util/http_utils'); const Speedometer = require('../util/speedometer'); const buffer_utils = require('../util/buffer_utils'); +const stream_utils = require('../util/stream_utils'); require('../util/console_wrapper').original_console(); -const size_units_mult = { +const size_units_mult = Object.freeze({ KB: 1024, MB: 1024 * 1024, GB: 1024 * 1024 * 1024, TB: 1024 * 1024 * 1024 * 1024, -}; +}); -// common +argv.server = Boolean(argv.server); +argv.client = argv.client === true ? 'localhost' : argv.client; argv.port = Number(argv.port) || 50505; argv.ssl = Boolean(argv.ssl); -argv.forks = argv.forks || 1; -// client -argv.client = argv.client === true ? 'localhost' : argv.client; -argv.size = argv.size || 100; +argv.forks = Number(argv.forks ?? 1); +argv.time = Number(argv.time ?? 60); +argv.hash = argv.hash ? String(argv.hash) : ''; +argv.size = Number(argv.size ?? 1); argv.size_units = argv.size_units || 'MB'; const size_bytes = argv.size * (size_units_mult[argv.size_units] || 1); -argv.buf = Math.min(argv.buf || 128 * 1024, size_bytes); // in Bytes -argv.concur = argv.concur || 1; -// server -argv.server = Boolean(argv.server); -argv.hash = argv.hash ? String(argv.hash) : ''; +argv.buf = Number(argv.buf || size_bytes); // in Bytes +argv.method ||= 'GET'; +argv.concur = Number(argv.concur ?? 1); +argv.nodelay = Boolean(argv.nodelay); + +if (argv.help) usage(); +if (!argv.server && !argv.client) { + console.error('Missing --client or --server'); + usage(); +} +if (argv.method !== 'GET' && argv.method !== 'PUT') { + console.error('Invalid method:', argv.method); + usage(); +} -// set keep alive to make sure we don't reconnect between requests -http.globalAgent.keepAlive = true; -const http_agent = argv.ssl ? 
- new https.Agent({ keepAlive: true }) : - new http.Agent({ keepAlive: true }); +function usage() { + console.log(` + Client Usage: --client [--port X] [--ssl] [--forks X] [--hash sha256] [--buf X (Bytes)] [--size X (MB)] [--concur X] [--method GET|PUT] + + Server Usage: --server [--port X] [--ssl] [--forks X] [--hash sha256] [--buf X (Bytes)] [--size X (MB)] + `); + process.exit(1); +} -const buffers_pool_sem = new semaphore.Semaphore(1024 * 1024 * 1024, { +const buffers_pool_sem = new semaphore.Semaphore(8 * argv.concur * size_bytes, { timeout: 2 * 60 * 1000, timeout_error_code: 'HTTP_SPEED_BUFFER_POOL_TIMEOUT', warning_timeout: 10 * 60 * 1000, @@ -53,210 +66,159 @@ const buffers_pool = new buffer_utils.BuffersPool({ warning_timeout: 2 * 60 * 1000, }); -const send_speedometer = new Speedometer('Send Speed'); -const recv_speedometer = new Speedometer('Receive Speed'); -const master_speedometer = new Speedometer('Total Speed'); - -if (cluster.isMaster) { - delete argv._; - console.log('ARGV', JSON.stringify(argv)); -} - -if (argv.exit) setTimeout(() => process.exit(), Number(argv.exit) * 1000); - -if (argv.forks > 1 && cluster.isMaster) { - master_speedometer.fork(argv.forks); -} else { - main(); -} - -function main() { - if (argv.help) { - return usage(); - } - if (argv.server) { - return run_server(); - } - if (argv.client) { - return run_client(); - } - return usage(); -} +// set keep alive to make sure we don't reconnect between requests +// @ts-ignore +http.globalAgent.keepAlive = true; +// @ts-ignore +https.globalAgent.keepAlive = true; +const http_agent = argv.ssl ? + new https.Agent({ keepAlive: true }) : + new http.Agent({ keepAlive: true }); -function usage() { - console.log(` - Client Usage: --client [--port X] [--ssl] [--forks X] [--size X (MB)] [--buf X (Bytes)] [--concur X] +const speedometer = new Speedometer({ + name: 'HTTP', + argv, + num_workers: argv.forks, + workers_func, +}); +speedometer.start(); - Server Usage: --server [--port X] [--ssl] [--forks X] [--hash sha256] - `); +async function workers_func() { + return argv.server ? run_server() : run_client(); } -function run_server() { - const server = argv.ssl ? - https.createServer(ssl_utils.generate_ssl_certificate()) : - http.createServer(); - - server.on('error', err => { - console.error('HTTP server error', err.message); - process.exit(); - }) - .on('close', () => { - console.error('HTTP server closed'); - process.exit(); - }) +async function run_server() { + /** @type {http.ServerOptions} */ + const http_options = { + ...ssl_utils.generate_ssl_certificate(), + keepAlive: true, + highWaterMark: 8 * argv.buf, + noDelay: argv.nodelay, + }; + const http_server = argv.ssl ? 
+ https.createServer(http_options) : + http.createServer(http_options); + await new Promise((resolve, reject) => http_server + .on('close', resolve) + .on('error', reject) .on('listening', () => { console.log('HTTP server listening on port', argv.port, '...'); }) - .on('connection', conn => { - const fd = conn._handle.fd; - console.log(`HTTP connection accepted (fd ${fd})`); - conn.once('close', () => { - console.log(`HTTP connection closed (fd ${fd})`); - }); - }) - .on('request', run_server_request) - .listen(argv.port); + .on('connection', conn => http_utils.http_server_connections_logger) + .on('request', handle_request) + .listen(argv.port) + ); } /** - * * @param {http.IncomingMessage} req * @param {http.ServerResponse} res */ -function run_server_request(req, res) { - const start_time = process.hrtime.bigint(); - req.on('error', err => { - console.error('HTTP server request error', err.message); - process.exit(); - }); - res.on('error', err => { - console.error('HTTP server response error', err.message); - process.exit(); - }); - req.once('end', () => { - res.end(); - const took_ns = Number(process.hrtime.bigint() - start_time); - recv_speedometer.add_op(took_ns / 1e6); - }); - run_receiver(req); +async function handle_request(req, res) { + try { + await speedometer.measure(async () => { + req.setEncoding('binary'); + req.on('error', err => { + console.error('HTTP server request error', err.message); + }); + res.on('error', err => { + console.error('HTTP server response error', err.message); + }); + if (req.method === 'GET') { + res.setHeader('Content-Length', size_bytes); + await write(res, size_bytes); + } else if (req.method === 'PUT') { + await read(req); + } else { + throw new Error('HTTP server request method not supported'); + } + res.end(); + await stream_utils.wait_finished(res); + }); + } catch (err) { + if (err.code !== 'ECONNRESET') { + console.error('HTTP server error', err.message); + } + req.destroy(); + res.destroy(); + } +} + +async function run_client() { + await Promise.all(Array(argv.concur).fill(0).map(run_client_loop)); } -function run_client() { - for (let i = 0; i < argv.concur; ++i) { - setImmediate(run_client_request); +async function run_client_loop() { + const end_time = Date.now() + (argv.time * 1000); + while (Date.now() < end_time) { + await speedometer.measure(make_request); } } -function run_client_request() { - const start_time = process.hrtime.bigint(); - const req = (argv.ssl ? https : http) - .request({ - agent: http_agent, - port: argv.port, - hostname: argv.client, - path: '/upload', - method: 'PUT', - headers: { - 'content-type': 'application/octet-stream', - }, - // we allow self generated certificates to avoid public CA signing: - rejectUnauthorized: false, - }) - .once('error', err => { - console.error('HTTP client request error', err.message); - process.exit(); - }) - .once('response', res => { - if (res.statusCode !== 200) { - console.error('HTTP client response status', res.statusCode); - process.exit(); - } - res.once('error', err => { - console.error('HTTP client response error', err.message); - process.exit(); - }) - .once('end', () => { - const took_ns = Number(process.hrtime.bigint() - start_time); - send_speedometer.add_op(took_ns / 1e6); - }) - .once('end', run_client_request) - .on('data', data => { /* noop */ }); - // setImmediate(run_client_request); - }); +async function make_request() { + const req = (argv.ssl ? 
https : http).request({ + agent: http_agent, + port: argv.port, + hostname: argv.client, + method: argv.method, + // we allow self generated certificates to avoid public CA signing: + rejectUnauthorized: false, + }); + + // create the promise before sending the request to already attach the event handlers + const res_promise = new Promise((resolve, reject) => req + .on('error', reject) + .on('response', resolve) + ); + + req.chunkedEncoding = false; + if (argv.nodelay) req.setNoDelay(true); + if (argv.method === 'PUT') { + req.setHeader('Content-Length', size_bytes); + await write(req, size_bytes); + } + req.end(); + + // wait for response + /** @type {http.IncomingMessage} */ + const res = await res_promise; + if (res.statusCode !== 200) { + throw new Error(`HTTP client response status ${res.statusCode}`); + } - run_sender(req); + await read(res); } /** - * @param {http.ClientRequest} writable + * @param {import('stream').Writable} writable */ -function run_sender(writable) { - const req_size = size_bytes; +async function write(writable, size) { + const hasher = argv.hash && crypto.createHash(argv.hash); let n = 0; - - writable.on('drain', send); - send(); - - async function send() { - let ok = true; - while (ok && n < req_size) { - // const buffer = Buffer.allocUnsafe(Math.min(buf_size, req_size - n)); - let { buffer, callback } = await buffers_pool.get_buffer(); - if (buffer.length > req_size - n) buffer = buffer.subarray(0, req_size - n); - ok = writable.write(buffer, callback); + await buffers_pool.use_buffer(async buffer => { + while (n < size) { + if (buffer.length > size - n) { + buffer = buffer.subarray(0, size - n); + } + if (hasher) hasher.update(buffer); + const ok = writable.write(buffer); + if (!ok) { + await stream_utils.wait_drain(writable); + } n += buffer.length; - send_speedometer.update(buffer.length); + speedometer.update(buffer.length); } - if (n >= req_size) writable.end(); - } + }); } /** - * @param {http.IncomingMessage} readable + * @param {import('stream').Readable} readable */ -function run_receiver(readable) { +async function read(readable) { + readable.setEncoding('binary'); const hasher = argv.hash && crypto.createHash(argv.hash); - if (argv.client) { - const req = (argv.ssl ? 
https : http) - .request({ - agent: http_agent, - port: argv.port + 1, - hostname: argv.client, - path: '/upload', - method: 'PUT', - headers: { - 'content-type': 'application/octet-stream', - }, - // we allow self generated certificates to avoid public CA signing: - rejectUnauthorized: false, - }) - .once('error', err => { - console.error('HTTP client request error', err.message); - process.exit(); - }) - .once('response', res => { - if (res.statusCode !== 200) { - console.error('HTTP client response status', res.statusCode); - process.exit(); - } - res.once('error', err => { - console.error('HTTP client response error', err.message); - process.exit(); - }) - .on('data', data => { /* noop */ }); - }); - - readable.pipe(new stream.Transform({ - transform(data, encoding, callback) { - if (hasher) hasher.update(data); - recv_speedometer.update(data.length); - callback(null, data); - } - })).pipe(req); - } else { - readable.on('data', data => { - if (hasher) hasher.update(data); - recv_speedometer.update(data.length); - }); + for await (const data of readable) { + if (hasher) hasher.update(data); + speedometer.update(data.length); } } diff --git a/src/tools/rdma_speed.js b/src/tools/rdma_speed.js new file mode 100644 index 0000000000..a628f97fdd --- /dev/null +++ b/src/tools/rdma_speed.js @@ -0,0 +1,267 @@ +/* Copyright (C) 2025 NooBaa */ +'use strict'; + +const argv = require('minimist')(process.argv); +const http = require('http'); +const https = require('https'); +const assert = require('assert'); +const setImmediateAsync = require('timers/promises').setImmediate; +const querystring = require('querystring'); +const nb_native = require('../util/nb_native'); +const ssl_utils = require('../util/ssl_utils'); +const { Semaphore } = require('../util/semaphore'); +const Speedometer = require('../util/speedometer'); +const buffer_utils = require('../util/buffer_utils'); + +require('../util/console_wrapper').original_console(); + +const size_units_mult = { + B: 1, + KB: 1024, + MB: 1024 * 1024, + GB: 1024 * 1024 * 1024, + TB: 1024 * 1024 * 1024 * 1024, +}; + +// set keep alive to make sure we don't reconnect between requests +// @ts-ignore +http.globalAgent.keepAlive = true; +// @ts-ignore +https.globalAgent.keepAlive = true; +// const http_agent = argv.ssl ? 
+// new https.Agent({ keepAlive: true }) : +// new http.Agent({ keepAlive: true }); + +argv.ip ||= '172.16.0.61'; +argv.port ||= 18515; +argv.op ||= "GET"; +argv.key ||= "/rdma_speed"; +argv.forks ||= 1; +argv.concur ||= 1; + +argv.size ||= 16; +argv.pool_size ||= (4 * argv.size); +argv.size_units ||= 'MB'; +const size_mult = size_units_mult[argv.size_units] || 1; +const size_bytes = argv.size * size_mult; +const pool_bytes = argv.pool_size * size_mult; + +if (argv.help) usage(); + +/** @type {nb.RdmaServerNapi} */ +let rdma_server; + +const FILL_GET = 'G'.charCodeAt(0); +const FILL_PUT = 'P'.charCodeAt(0); +const FILL_CLIENT = 'C'.charCodeAt(0); +const FILL_SERVER = 'S'.charCodeAt(0); + +const buffers_pool_sem = new Semaphore(pool_bytes, { + timeout: 2 * 60 * 1000, + timeout_error_code: 'RDMA_BUFFER_POOL_TIMEOUT', + warning_timeout: 10 * 60 * 1000, +}); +const buffers_pool = new buffer_utils.BuffersPool({ + buf_size: size_bytes, + sem: buffers_pool_sem, + warning_timeout: 2 * 60 * 1000, + buffer_alloc: size => { + const buf = nb_native().fs.dio_buffer_alloc(size); + if (rdma_server) rdma_server.register_buffer(buf); + return buf; + }, +}); + +const speedometer = new Speedometer({ + name: 'RDMA', + argv, + num_workers: argv.forks, + workers_func, +}); +speedometer.start(); + +async function workers_func() { + if (argv.server) { + return run_server(); + } + if (argv.client) { + return run_client(); + } + return usage(); +} + +function usage() { + console.log(` + Client Usage: --client --ip 1.1.1.1 --port 12345 [--ssl] [--forks X] [--concur X] [--op GET|PUT] [--size X] [--size_units MB] + + Server Usage: --server --ip 1.1.1.1 --port 12345 [--ssl] [--forks X] + `); + process.exit(1); +} + +async function run_server() { + const { RdmaServerNapi } = nb_native(); + rdma_server = new RdmaServerNapi({ + ip: argv.ip, + port: 0, // every fork will get a different port + log_level: 'ERROR', + }); + console.log('RDMA server:', argv.ip, argv.port); + + const http_options = { ...ssl_utils.generate_ssl_certificate(), keepAlive: true }; + const http_server = argv.ssl ? https.createServer(http_options) : http.createServer(http_options); + + http_server.on('error', err => { + console.error('HTTP server error', err.message); + process.exit(); + }) + .on('close', () => { + console.error('HTTP server closed'); + process.exit(); + }) + .on('listening', () => { + console.log('HTTP server listening on port', argv.port, '...'); + }) + .on('connection', conn => { + // @ts-ignore + const fd = conn._handle.fd; + console.log(`HTTP connection accepted (fd ${fd})`); + conn.once('close', () => { + console.log(`HTTP connection closed (fd ${fd})`); + }); + }) + .on('request', run_server_request) + .listen(argv.port); +} + + +async function run_server_request(req, res) { + const start_time = process.hrtime.bigint(); + + const op_type = req.method === 'GET' ? 
'GET' : 'PUT'; + const op_key = String(req.url); + const parsed_info = querystring.parse(String(req.headers['x-rdma-info'])); + const rdma_info = { + desc: String(parsed_info.desc), + addr: String(parsed_info.addr), + size: Number(parsed_info.size), + offset: Number(parsed_info.offset), + }; + + let ret_size = 0; + let buffer_pool_cleanup; + try { + const pooled_buffer = await buffers_pool.get_buffer(); + const buffer = pooled_buffer.buffer; + buffer_pool_cleanup = pooled_buffer.callback; + + if (argv.fill && op_type === 'GET') { + buffer.fill(FILL_GET); + } + + ret_size = await rdma_server.rdma(op_type, op_key, buffer, rdma_info); + + if (argv.fill && op_type === 'PUT') { + assert.strictEqual(buffer[0], FILL_PUT); + assert.strictEqual(buffer[buffer.length - 1], FILL_PUT); + buffer.fill(FILL_SERVER); + } + + res.statusCode = 200; + res.setHeader('x-rdma-reply', String(ret_size)); + res.end(); + + } finally { + if (buffer_pool_cleanup) buffer_pool_cleanup(); + } + + const took_ms = Number(process.hrtime.bigint() - start_time) / 1e6; + speedometer.update(ret_size, took_ms); +} + +function run_client() { + for (let i = 0; i < argv.concur; ++i) { + setImmediate(run_client_loop); + } +} + +async function run_client_loop() { + const { RdmaClientNapi } = nb_native(); + const rdma_client = new RdmaClientNapi(); + console.log('RDMA client'); + + for (; ;) { + await run_client_request(rdma_client); + await setImmediateAsync(); + } +} + +async function run_client_request(rdma_client) { + const start_time = process.hrtime.bigint(); + + let ret_size = 0; + let buffer_pool_cleanup; + try { + const pooled_buffer = await buffers_pool.get_buffer(); + const buffer = pooled_buffer.buffer; + buffer_pool_cleanup = pooled_buffer.callback; + + if (argv.fill && argv.op === 'PUT') { + buffer.fill(FILL_PUT); + } + + ret_size = await rdma_client.rdma(argv.op, buffer, async (rdma_info, callback) => { + const res_size = await send_http_request(rdma_info); + assert.strictEqual(res_size, buffer.length); + callback(null, res_size); + }); + + assert.strictEqual(ret_size, buffer.length); + + if (argv.fill && argv.op === 'GET') { + assert.strictEqual(buffer[0], FILL_GET); + assert.strictEqual(buffer[buffer.length - 1], FILL_GET); + buffer.fill(FILL_CLIENT); + } + + } finally { + if (buffer_pool_cleanup) buffer_pool_cleanup(); + } + + const took_ms = Number(process.hrtime.bigint() - start_time) / 1e6; + speedometer.update(ret_size, took_ms); +} + +async function send_http_request(rdma_info) { + const rmda_header = querystring.stringify({ ...rdma_info }); + const http_options = { + // agent: http_agent, + hostname: argv.ip, + port: argv.port, + method: argv.op, + path: argv.key, + headers: { + 'x-rdma-info': rmda_header, + 'Content-Type': 'application/octet-stream', + }, + // we allow self generated certificates to avoid public CA signing: + rejectUnauthorized: false, + }; + + const res = await new Promise((resolve, reject) => { + const req = (argv.ssl ? 
https : http).request(http_options); + req.on('response', resolve); + req.on('error', reject); + req.end(); + }); + + // have to read the empty response to get the connection reused + await new Promise((resolve, reject) => res + .on('end', resolve) + .on('error', reject) + .on('data', data => { /* noop */ }) + ); + + const res_size = Number(res.headers['x-rdma-reply']); + return res_size; +} diff --git a/src/tools/s3cat.js b/src/tools/s3cat.js index 6b50348ca8..26e9591b38 100644 --- a/src/tools/s3cat.js +++ b/src/tools/s3cat.js @@ -16,8 +16,11 @@ const moment = require('moment'); const size_utils = require('../util/size_utils'); const RandStream = require('../util/rand_stream'); const Speedometer = require('../util/speedometer'); +const assert = require('assert'); let argv; + +/** @type {AWS.S3} */ let s3; async function main() { @@ -228,7 +231,7 @@ function head_object() { } function delete_objects() { - if (typeof(argv.rm) !== 'string') { + if (typeof argv.rm !== 'string') { console.error('missing keys to delete, for example: --rm "key1,/path/to/key2"'); return; } @@ -242,7 +245,7 @@ function delete_objects() { }); } -function upload_object() { +async function upload_object() { const file_path = argv.file || ''; let upload_key = (_.isString(argv.upload) && argv.upload) || @@ -253,9 +256,11 @@ function upload_object() { argv.part_size = argv.part_size || 32; let data_source; let data_size; + let content_type; const part_size = argv.part_size * 1024 * 1024; if (file_path) { upload_key = upload_key || file_path + '-' + Date.now().toString(36); + content_type = mime.getType(file_path) || ''; data_source = fs.createReadStream(file_path, { highWaterMark: part_size }); @@ -264,6 +269,7 @@ function upload_object() { 'of size', size_utils.human_size(data_size)); } else { upload_key = upload_key || 'upload-' + Date.now().toString(36); + content_type = mime.getType(upload_key) || ''; data_size = Math.round(argv.size * 1024 * 1024); data_source = argv.buf ? 
crypto.randomBytes(data_size) : @@ -294,15 +300,21 @@ function upload_object() { console.log('upload done.', speed_str, 'MB/sec'); } + if (argv.rdma) { + return put_object_rdma(upload_key, data_size, content_type, on_progress, on_finish); + } + if (argv.copy) { const params = { Bucket: argv.bucket, Key: upload_key, CopySource: argv.bucket + '/' + argv.copy, - ContentType: mime.lookup(upload_key) || '', + ContentType: content_type, }; if (argv.presign) return make_simple_request('copyObject', params); - return s3.copyObject(params, on_finish); + await s3.copyObject(params).promise(); + on_finish(); + return; } if (argv.put) { @@ -310,155 +322,219 @@ function upload_object() { Bucket: argv.bucket, Key: upload_key, Body: data_source, - ContentType: mime.lookup(file_path) || '', + ContentType: content_type, ContentLength: data_size }; if (argv.presign) return make_simple_request('putObject', params); - return s3.putObject(params, on_finish).on('httpUploadProgress', on_progress); + const upload = s3.putObject(params); + upload.on('httpUploadProgress', on_progress); + await upload.promise(); + on_finish(); + return; } if (!argv.perf) { - s3.upload({ - Bucket: argv.bucket, - Key: upload_key, - Body: data_source, - ContentType: mime.lookup(file_path), - ContentLength: data_size - }, { - partSize: part_size, - queueSize: argv.concur - }, on_finish) - .on('httpUploadProgress', on_progress); + const upload = s3.upload({ + Bucket: argv.bucket, + Key: upload_key, + Body: data_source, + ContentType: content_type, + ContentLength: data_size + }, { + partSize: part_size, + queueSize: argv.concur + }); + upload.on('httpUploadProgress', on_progress); + await upload.promise(); + on_finish(); return; } if (argv.perf) { - const progress = { - loaded: 0 - }; - s3.createMultipartUpload({ + return put_object_multipart_perf(upload_key, data_source, content_type, on_progress, on_finish); + } +} + +async function put_object_multipart_perf(upload_key, data_source, content_type, on_progress, on_finish) { + const progress = { + loaded: 0 + }; + const create_res = await s3.createMultipartUpload({ + Bucket: argv.bucket, + Key: upload_key, + ContentType: content_type, + }).promise(); + + let next_part_num = 0; + let concur = 0; + let finished = false; + let latency_avg = 0; + + function complete() { + s3.completeMultipartUpload({ Bucket: argv.bucket, Key: upload_key, - ContentType: mime.lookup(file_path), - }, (err, create_res) => { - if (err) { - console.error('s3.createMultipartUpload ERROR', err); + UploadId: create_res.UploadId, + // MultipartUpload: { + // Parts: [{ + // ETag: etag, + // PartNumber: part_num + // }] + // } + }, function(err2, complete_res) { + if (err2) { + console.error('s3.completeMultipartUpload ERROR', err2); return; } - let next_part_num = 0; - let concur = 0; - let finished = false; - let latency_avg = 0; - - function complete() { - s3.completeMultipartUpload({ - Bucket: argv.bucket, - Key: upload_key, - UploadId: create_res.UploadId, - // MultipartUpload: { - // Parts: [{ - // ETag: etag, - // PartNumber: part_num - // }] - // } - }, function(err2, complete_res) { - if (err2) { - console.error('s3.completeMultipartUpload ERROR', err2); - return; - } - console.log('uploadPart average latency', - (latency_avg / next_part_num).toFixed(0), 'ms'); - on_finish(); - }); - } - - data_source.on('data', data => { - next_part_num += 1; - concur += 1; - if (concur >= argv.concur) { - //console.log('=== pause source stream ==='); - data_source.pause(); - } - //console.log('uploadPart'); - const 
data_start_time = Date.now(); - const part_num = next_part_num; - s3.uploadPart({ - Bucket: argv.bucket, - Key: upload_key, - PartNumber: part_num, - UploadId: create_res.UploadId, - Body: data, - }, (err2, res) => { - concur -= 1; - if (err2) { - data_source.close(); - console.error('s3.uploadPart ERROR', err2); - return; - } - const took = Date.now() - data_start_time; - // console.log('Part', part_num, 'Took', took, 'ms'); - latency_avg += took; - data_source.resume(); - progress.loaded += data.length; - if (finished && !concur) { - complete(); - } else { - on_progress(progress); - } - }); - }); - data_source.on('end', () => { - finished = true; - if (!concur) { - complete(); - } - }); + console.log('uploadPart average latency', + (latency_avg / next_part_num).toFixed(0), 'ms'); + on_finish(); }); } + + data_source.on('data', data => { + next_part_num += 1; + concur += 1; + if (concur >= argv.concur) { + //console.log('=== pause source stream ==='); + data_source.pause(); + } + //console.log('uploadPart'); + const data_start_time = Date.now(); + const part_num = next_part_num; + s3.uploadPart({ + Bucket: argv.bucket, + Key: upload_key, + PartNumber: part_num, + UploadId: create_res.UploadId, + Body: data, + }, (err2, res) => { + concur -= 1; + if (err2) { + data_source.close(); + console.error('s3.uploadPart ERROR', err2); + return; + } + const took = Date.now() - data_start_time; + // console.log('Part', part_num, 'Took', took, 'ms'); + latency_avg += took; + data_source.resume(); + progress.loaded += data.length; + if (finished && !concur) { + complete(); + } else { + on_progress(progress); + } + }); + }); + data_source.on('end', () => { + finished = true; + if (!concur) { + complete(); + } + }); } -function get_object() { +async function get_object() { const params = { Bucket: argv.bucket, Key: argv.get, }; if (argv.presign) return make_simple_request('getObject', params); - s3.headObject(params, function(err, data) { - if (err) { - console.error('HEAD ERROR:', err); + + const res_head = await s3.headObject(params).promise(); + const data_size = Number(res_head.ContentLength); + const start_time = Date.now(); + const speedometer = new Speedometer('Download Speed'); + console.log('object size', size_utils.human_size(data_size)); + + function on_finish(err2) { + if (err2) { + console.error('GET ERROR:', err2); return; } + const end_time = Date.now(); + const total_seconds = (end_time - start_time) / 1000; + const speed_str = (data_size / total_seconds / 1024 / 1024).toFixed(0); + console.log('get done.', speed_str, 'MB/sec'); + } - const data_size = Number(data.ContentLength); - const start_time = Date.now(); - const speedometer = new Speedometer('Download Speed'); - - console.log('object size', size_utils.human_size(data_size)); + if (argv.rdma) { + return get_object_rdma(params, data_size, speedometer, on_finish); + } - function on_finish(err2) { - if (err2) { - console.error('GET ERROR:', err2); - return; + s3.getObject(params) + .createReadStream() + .on('error', on_finish) + .pipe(new stream.Transform({ + transform: function(buf, encoding, callback) { + speedometer.update(buf.length); + callback(); } - const end_time = Date.now(); - const total_seconds = (end_time - start_time) / 1000; - const speed_str = (data_size / total_seconds / 1024 / 1024).toFixed(0); - console.log('get done.', speed_str, 'MB/sec'); - } + })) + .on('finish', on_finish); +} - s3.getObject(params) - .createReadStream() - .on('error', on_finish) - .pipe(new stream.Transform({ - transform: function(buf, 
encoding, callback) { - speedometer.update(buf.length); - callback(); - } - })) - .on('finish', on_finish); +async function get_object_rdma(params, data_size, speedometer, on_finish) { + const nb_native = require('../util/nb_native'); + const rdma_utils = require('../util/rdma_utils'); + const rdma_client = new (nb_native().RdmaClientNapi)(); + const buffer = nb_native().fs.dio_buffer_alloc(data_size); + const cuda_mem = argv.cuda ? new (nb_native().CudaMemory)(data_size) : undefined; + const rdma_buf = argv.cuda ? cuda_mem.as_rdma_buffer() : buffer; + const ret_size = await rdma_client.rdma('GET', rdma_buf, async (rdma_info, callback) => { + try { + const req = s3.getObject(params); + req.on('build', () => { + req.httpRequest.headers[rdma_utils.X_NOOBAA_RDMA] = + rdma_utils.encode_rdma_header({ ...rdma_info }); + }); + const res = await req.promise(); + const rdma_hdr = res.$response.httpResponse.headers[rdma_utils.X_NOOBAA_RDMA]; + const rdma_reply = rdma_utils.decode_rdma_header(rdma_hdr); + callback(null, Number(rdma_reply.size)); + } catch (err) { + callback(err); + } }); + assert.strictEqual(ret_size, data_size); + speedometer.update(ret_size); + on_finish(); } +async function put_object_rdma(upload_key, data_size, content_type, on_progress, on_finish) { + const nb_native = require('../util/nb_native'); + const rdma_utils = require('../util/rdma_utils'); + const rdma_client = new (nb_native().RdmaClientNapi)(); + const buffer = nb_native().fs.dio_buffer_alloc(data_size); + const cuda_mem = argv.cuda ? new (nb_native().CudaMemory)(data_size) : undefined; + const rdma_buf = argv.cuda ? cuda_mem.as_rdma_buffer() : buffer; + // TODO fill buffer with data + const ret_size = await rdma_client.rdma('PUT', rdma_buf, async (rdma_info, callback) => { + try { + const params = { + Bucket: argv.bucket, + Key: upload_key, + ContentType: content_type, + }; + const req = s3.putObject(params); + req.on('build', () => { + req.httpRequest.headers[rdma_utils.X_NOOBAA_RDMA] = + rdma_utils.encode_rdma_header({ ...rdma_info }); + }); + const res = await req.promise(); + const rdma_hdr = res.$response.httpResponse.headers[rdma_utils.X_NOOBAA_RDMA]; + const rdma_reply = rdma_utils.decode_rdma_header(rdma_hdr); + const reply_size = Number(rdma_reply.size); + on_progress(reply_size); + callback(null, reply_size); + } catch (err) { + callback(err); + } + }); + assert.strictEqual(ret_size, data_size); + on_finish(); +} function print_usage() { @@ -497,6 +573,8 @@ General S3 Flags: --aws (default is false) Use AWS endpoint and subdomain-style buckets --checksum (default is false) Calculate checksums on data. slower. --presign print a presigned url instead of sending the request, value is expiry in seconds (default 3600 if not set). 
+ --rdma (default is false) Use RDMA for data transfer + --cuda (default is false) Use CUDA memory over RDMA `); } diff --git a/src/tools/s3perf.js b/src/tools/s3perf.js index 12740e8294..ba51ffe8d7 100644 --- a/src/tools/s3perf.js +++ b/src/tools/s3perf.js @@ -1,15 +1,16 @@ /* Copyright (C) 2016 NooBaa */ 'use strict'; -require('aws-sdk/lib/maintenance_mode_message').suppress = true; - -const AWS = require('aws-sdk'); const minimist = require('minimist'); const http = require('http'); const https = require('https'); -const size_utils = require('../util/size_utils'); +const crypto = require('crypto'); +const nb_native = require('../util/nb_native'); +const rdma_utils = require('../util/rdma_utils'); const RandStream = require('../util/rand_stream'); -const { cluster } = require('../util/fork_utils'); +const Speedometer = require('../util/speedometer'); +const { S3 } = require('@aws-sdk/client-s3'); +const { Upload } = require('@aws-sdk/lib-storage'); const size_units_mult = { KB: 1024, @@ -23,25 +24,24 @@ const argv = minimist(process.argv.slice(2), { 'access_key', 'secret_key', 'bucket', - 'head', - 'get', - 'put', - 'upload', - 'delete', - 'mb', ], }); -argv.sig = argv.sig || 's3'; -argv.time = argv.time || 0; -argv.concur = argv.concur || 1; -argv.forks = argv.forks || 1; -argv.size = argv.size || 1; +argv.time = Number(argv.time ?? 0); +argv.concur = Number(argv.concur || 1); +argv.forks = Number(argv.forks ?? 1); +argv.size = Number(argv.size ?? 1); argv.size_units = argv.size_units || 'MB'; -argv.part_concur = argv.part_concur || 1; -argv.part_size = argv.part_size || 5; +argv.part_concur = Number(argv.part_concur || 1); +argv.part_size = Number(argv.part_size || 5); +argv.verbose = Boolean(argv.verbose || argv.v); +argv.max_objects = Number(argv.max_objects ?? 
1_000_000); +argv.select_objects ||= 'random'; const data_size = argv.size * size_units_mult[argv.size_units]; +const size_name = String(argv.size) + String(argv.size_units); +argv.prefix ||= `s3perf/${size_name}/`; +argv.inventory ||= `s3perf/list-${size_name}`; if (!size_units_mult[argv.size_units]) { throw new Error('Unrecognized size_units ' + argv.size_units); @@ -50,34 +50,51 @@ if (argv.upload && data_size < argv.part_size * 1024 * 1024) { throw new Error('data_size lower than part_size ' + data_size); } -const start_time = Date.now(); +/** + * @typedef {{ + * worker_id: number; + * io_worker_id: number; + * buffer?: Buffer; + * rdma_buf?: Buffer; + * cuda_mem?: nb.CudaMemory; + * rdma_client?: nb.RdmaClientNapi; + * s3_client?: S3; + * }} IOWorker + */ + +/** @typedef {import('@aws-sdk/client-s3')._Object} S3Object */ + +/** @type {string[]} */ +let _object_keys; -let op_count = 0; -let total_size = 0; -let op_lat_sum = 0; -let last_reported = start_time; -let last_op_count = 0; -let last_total_size = 0; -let last_op_lat_sum = 0; +/** @type {IOWorker[]} */ +let _io_workers; /** - * @type {() => Promise} + * @type {(io_worker: IOWorker) => Promise} */ let op_func; +let need_object_keys = false; -if (argv.help) { +if (argv.help || argv.h) { print_usage(); -} else if (typeof argv.head === 'string') { - op_func = head_object; -} else if (typeof argv.get === 'string') { +} else if (argv.get) { op_func = get_object; -} else if (typeof argv.put === 'string') { + need_object_keys = true; +} else if (argv.put) { op_func = put_object; -} else if (typeof argv.upload === 'string') { +} else if (argv.upload) { op_func = upload_object; -} else if (typeof argv.delete === 'string') { +} else if (argv.head) { + op_func = head_object; + need_object_keys = true; +} else if (argv.delete) { op_func = delete_object; -} else if (typeof argv.mb === 'string') { + need_object_keys = true; +} else if (argv.gpu) { + op_func = gpu_func; + need_object_keys = true; +} else if (argv.mb) { op_func = create_bucket; } else { print_usage(); @@ -88,147 +105,166 @@ http.globalAgent.keepAlive = true; // @ts-ignore https.globalAgent.keepAlive = true; -const s3 = new AWS.S3({ +/** @type {import('@aws-sdk/client-s3').S3ClientConfig} */ +const s3_config = { endpoint: argv.endpoint, - accessKeyId: argv.access_key && String(argv.access_key), - secretAccessKey: argv.secret_key && String(argv.secret_key), - s3ForcePathStyle: true, - signatureVersion: argv.sig, // s3 or v4 - computeChecksums: argv.checksum || false, // disabled by default for performance - s3DisableBodySigning: !argv.signing || true, // disabled by default for performance region: argv.region || 'us-east-1', -}); + forcePathStyle: true, + credentials: { + accessKeyId: argv.access_key && String(argv.access_key), + secretAccessKey: argv.secret_key && String(argv.secret_key), + }, + // disable checksums by default for performance + requestChecksumCalculation: argv.checksum ? 'WHEN_SUPPORTED' : 'WHEN_REQUIRED', + responseChecksumValidation: argv.checksum ? 
'WHEN_SUPPORTED' : 'WHEN_REQUIRED', + userAgentAppId: 's3perf', + requestHandler: { + httpAgent: { keepAlive: true, rejectUnauthorized: !argv.selfsigned, localAddress: argv.local_ip, scheduling: 'fifo' }, + httpsAgent: { keepAlive: true, rejectUnauthorized: !argv.selfsigned, localAddress: argv.local_ip, scheduling: 'fifo' }, + } +}; + +const s3 = new S3(s3_config); // AWS config does not use https.globalAgent // so for https we need to set the agent manually -if (s3.endpoint.protocol === 'https:') { - s3.config.update({ - httpOptions: { - agent: new https.Agent({ - keepAlive: true, - rejectUnauthorized: !argv.selfsigned, - }) - } - }); - if (!argv.selfsigned) { - // @ts-ignore - AWS.events.on('error', err => { - if (err.message === 'self signed certificate') { - setTimeout(() => console.log( - '\n*** You can accept self signed certificates with: --selfsigned\n' - ), 10); +// if (is_https && !argv.selfsigned) { +// // @ts-ignore +// s3.middlewareStack.add().events.on('error', err => { +// if (err.message === 'self signed certificate') { +// setTimeout(() => console.log( +// '\n*** You can accept self signed certificates with: --selfsigned\n' +// ), 10); +// } +// }); +// } + +const speedometer = new Speedometer({ + name: 'S3', + argv, + num_workers: argv.forks, + primary_init, + workers_init, + workers_func, +}); +speedometer.start(); + +async function primary_init() { + if (need_object_keys) { + let write_inventory = false; + if (!_object_keys && argv.inventory) { + try { + _object_keys = await read_keys_from_inventory(); + } catch (err) { + if (err.Code !== 'NoSuchKey') throw err; + console.log('Inventory object not found, creating new one'); + write_inventory = true; } - }); + } + if (!_object_keys) { + _object_keys = []; + console.log('Listing objects in bucket', argv.bucket, 'prefix', argv.prefix); + await list_bucket_concurrent(argv.prefix, _object_keys, argv.max_objects); + } + if (!_object_keys?.length) { + throw new Error(`No objects found for ${argv.prefix}`); + } + if (write_inventory) { + console.log('Writing inventory', argv.inventory, 'with', _object_keys.length, 'objects'); + await s3.putObject({ Bucket: argv.bucket, Key: argv.inventory, Body: _object_keys.join('\n') }); + } + console.log('Primary object keys', _object_keys.length); } } -if (cluster.isPrimary) { - run_master(); -} else { - run_worker(); -} - -async function run_master() { - console.log(argv); - if (argv.forks > 1) { - for (let i = 0; i < argv.forks; i++) { - const worker = cluster.fork(); - console.warn('WORKER', worker.process.pid, 'STARTED'); - worker.on('message', handle_message); +async function workers_init(worker_id) { + if (need_object_keys) { + if (!_object_keys && argv.inventory) { + _object_keys = await read_keys_from_inventory(); } - cluster.on('exit', (worker, code, signal) => { - console.warn('WORKER', worker.process.pid, 'EXITED', code, signal); - exit_all(); - }); - } else { - run_worker(); + if (!_object_keys?.length) { + throw new Error(`No objects found for ${argv.prefix}`); + } + console.log('Worker object keys: ', _object_keys.length); } - - setInterval(run_reporter, 1000).unref(); + _io_workers = new Array(argv.concur).fill(0).map((v, i) => + init_io_worker({ worker_id, io_worker_id: i })); } -function run_reporter() { +async function workers_func() { + await Promise.all(_io_workers.map(run_worker)); +} - const now = Date.now(); - const time = now - last_reported; - const time_total = now - start_time; - const ops = op_count - last_op_count; - const size = total_size - last_total_size; - 
const lat = op_lat_sum - last_op_lat_sum; - const tx = size / time * 1000; - const tx_total = total_size / time_total * 1000; - - console.log(`TOTAL: Throughput ${ - size_utils.human_size(tx_total) - }/sec Latency ${ - op_count ? (op_lat_sum / op_count).toFixed(3) : 0 - }ms IOPS ${ - (op_count / time_total * 1000).toFixed(3) - }/sec OPS ${op_count} | CURRENT: Throughput ${ - size_utils.human_size(tx) - }/sec Latency ${ - ops ? (lat / ops).toFixed(3) : 0 - }ms IOPS ${ - (ops / time * 1000).toFixed(3) - }/sec OPS ${ops}`); - - last_reported = now; - last_op_count = op_count; - last_total_size = total_size; - last_op_lat_sum = op_lat_sum; - - if (now - start_time > argv.time * 1000) { - console.warn('TEST DONE'); - exit_all(); - } +async function read_keys_from_inventory() { + const text = await read_text_object(argv.inventory); + const keys = text.split('\n').map(l => l.trim()).filter(l => l.length); + return keys; } -function exit_all() { - Object.keys(cluster.workers).forEach(w => cluster.workers[w].send('exit')); - process.exit(); +async function read_text_object(key) { + const res = await s3.getObject({ Bucket: argv.bucket, Key: key }); + const body = await res.Body.transformToString(); + return body; } -/** - * @typedef {{ - * ops: number; - * size: number; - * took_ms: number; - * }} Msg - * @param {Msg|'exit'} msg - */ -function handle_message(msg) { - if (msg === 'exit') { - process.exit(); - } else if (msg.took_ms >= 0) { - op_count += msg.ops; - total_size += msg.size; - op_lat_sum += msg.took_ms; +async function list_bucket_concurrent(prefix, list, max) { + let is_truncated = true; + let continuation_token; + while (is_truncated) { + if (list.length >= max) return; + const res = await s3.listObjectsV2({ + Bucket: argv.bucket, + Prefix: prefix, + Delimiter: '/', + MaxKeys: 1000, + ContinuationToken: continuation_token, + }); + if (list.length >= max) return; + if (res.Contents) { + for (const o of res.Contents) { + list.push(o.Key); + if (list.length >= max) return; + } + } + await Promise.all(res.CommonPrefixes?.map(p => list_bucket_concurrent(p.Prefix, list, max)) || []); + is_truncated = res.IsTruncated; + continuation_token = res.NextContinuationToken; } } -function send_message(msg) { - if (process.send) { - process.send(msg); - } else { - handle_message(msg); - } +/** + * @returns {string} + */ +function select_next_object_key() { + if (argv.exact_key) return argv.exact_key; + if (!need_object_keys || !_object_keys.length) throw new Error('No existing objects found'); + const i = crypto.randomInt(0, _object_keys.length); + return _object_keys[i]; } -async function run_worker() { - if (process.send) process.on('message', handle_message); - for (let i = 0; i < argv.concur; ++i) { - setImmediate(run_worker_loop); - } +/** + * @param {IOWorker} io_worker + * @returns {IOWorker} + */ +function init_io_worker(io_worker) { + const buf_size = argv.rdma ? (data_size || 4096) : data_size; + io_worker.buffer ||= nb_native().fs.dio_buffer_alloc(buf_size); + io_worker.cuda_mem ||= argv.cuda ? new (nb_native().CudaMemory)(buf_size) : undefined; + io_worker.rdma_buf ||= argv.cuda ? io_worker.cuda_mem.as_buffer() : io_worker.buffer; + io_worker.rdma_client ||= argv.rdma ? rdma_utils.new_rdma_client() : undefined; + io_worker.s3_client ||= argv.rdma ? 
rdma_utils.s3_rdma_client(s3_config, io_worker.rdma_buf, io_worker.rdma_client) : s3; + return io_worker; } -async function run_worker_loop() { +async function run_worker(io_worker) { try { - for (;;) { - const hrtime = process.hrtime(); - const size = await op_func(); - const hrtook = process.hrtime(hrtime); - const took_ms = (hrtook[0] * 1e-3) + (hrtook[1] * 1e-6); - send_message({ ops: 1, size, took_ms }); + const base_time = Date.now(); + for (; ;) { + if (argv.time && Date.now() - base_time > argv.time * 1000) break; + const start = process.hrtime.bigint(); + const size = await op_func(io_worker); + const took_ms = Number(process.hrtime.bigint() - start) / 1e6; + speedometer.update(size, took_ms); } } catch (err) { console.error('WORKER', process.pid, 'ERROR', err.stack || err); @@ -236,148 +272,219 @@ async function run_worker_loop() { } } -/** @type {AWS.S3.ListObjectsOutput} */ -let _list_objects = { Contents: [], IsTruncated: true }; -let _list_objects_next = 0; -let _list_objects_promise = null; /** - * This function returns the next object to be used for head/get/delete. - * It will list objects and keep the list in memory, returning the objects in list order, - * while fetching the next list pages on demand. - * If prefix is provided it will be used to filter objects keys. - * - * @param {string} [prefix] - * @returns {Promise} + * @param {IOWorker} io_worker + * @returns {Promise} */ -async function get_next_object(prefix) { - while (_list_objects_next >= _list_objects.Contents.length) { - if (_list_objects_promise) { - // console.log('get_next_object: wait for promise'); - await _list_objects_promise; - } else { - const marker = _list_objects.IsTruncated ? - (_list_objects.NextMarker || _list_objects.Contents[_list_objects.Contents.length - 1]?.Key) : - undefined; - _list_objects_promise = s3.listObjects({ - Bucket: argv.bucket, - Prefix: prefix, - Marker: marker, - }).promise(); - _list_objects = await _list_objects_promise; - _list_objects_promise = null; - _list_objects_next = 0; - console.log('get_next_object: got', _list_objects.Contents.length, 'objects from marker', marker); - } +async function get_object(io_worker) { + const key = select_next_object_key(); + const res = await io_worker.s3_client.getObject({ + Bucket: argv.bucket, + Key: key, + }); + + if (argv.verbose) console.log('GET', key, { ...res, Body: 'redacted' }); + + // even on rdma we must consume the empty stream to release the connection + + /** @type {any} */ + const body = res.Body; + for await (const chunk of body) { + speedometer.update(chunk.length); } - const obj = _list_objects.Contents[_list_objects_next]; - _list_objects_next += 1; - return obj; -} + // const buf = await res.Body.transformToByteArray(); + // for await (const chunk of res.Body.transformToWebStream()) { + // speedometer.update(chunk.length); + // } -async function head_object() { - const obj = await get_next_object(argv.head); - await s3.headObject({ Bucket: argv.bucket, Key: obj.Key }).promise(); + if (argv.rdma) { + // @ts-ignore + return res.rdma_reply.size; + } return 0; } -async function get_object() { - const obj = await get_next_object(argv.get); - await new Promise((resolve, reject) => { - s3.getObject({ - Bucket: argv.bucket, - Key: obj.Key, - }) - .createReadStream() - .on('finish', resolve) - .on('error', reject) - .on('data', data => { - send_message({ ops: 0, size: data.length, took_ms: 0 }); - }); - }); - return 0; -} +/** + * @param {IOWorker} io_worker + * @returns {Promise} + */ +async function 
put_object(io_worker) { + const now = Date.now(); + const key = argv.exact_key || + `${argv.prefix}${io_worker.worker_id}/${io_worker.io_worker_id}/${now % 256}/file${size_name}-${now.toString(36)}`; -async function delete_object() { - const obj = await get_next_object(argv.delete); - await s3.deleteObject({ + const res = await io_worker.s3_client.putObject({ Bucket: argv.bucket, - Key: obj.Key - }).promise(); - return 0; + Key: key, + Body: argv.rdma ? null : io_worker.buffer, + ContentLength: data_size, + // Body: new RandStream(data_size, { highWaterMark: 1024 * 1024 }), + }); + + if (argv.verbose) console.log('PUT', key, res); + + if (argv.rdma) { + // @ts-ignore + return res.rdma_reply.size; + } + return data_size; } -async function put_object() { - const upload_key = argv.put + '-' + Date.now().toString(36); - await s3.putObject({ +/** + * @param {IOWorker} io_worker + * @returns {Promise} + */ +async function upload_object(io_worker) { + const now = Date.now(); + const key = argv.exact_key || + `${argv.prefix}${io_worker.worker_id}/${io_worker.io_worker_id}/${now % 256}/file${size_name}-${now.toString(36)}`; + + const upload = new Upload({ + client: io_worker.s3_client, + partSize: argv.part_size * 1024 * 1024, + queueSize: argv.part_concur, + params: { Bucket: argv.bucket, - Key: upload_key, + Key: key, ContentLength: data_size, Body: new RandStream(data_size, { highWaterMark: 1024 * 1024, }) - }) - .on('httpUploadProgress', progress => { - send_message({ ops: 0, size: progress.loaded, took_ms: 0 }); - }) - .promise(); + } + }); + + upload.on('httpUploadProgress', progress => { + speedometer.update(progress.loaded); + }); + + const res = await upload.done(); + if (argv.verbose) console.log('UPLOAD', key, res); + return 0; } -async function upload_object() { - const upload_key = argv.upload + '-' + Date.now().toString(36); - await s3.upload({ - Bucket: argv.bucket, - Key: upload_key, - ContentLength: data_size, - Body: new RandStream(data_size, { - highWaterMark: 1024 * 1024, - }) - }, { - partSize: argv.part_size * 1024 * 1024, - queueSize: argv.part_concur - }) - .on('httpUploadProgress', progress => { - send_message({ ops: 0, size: progress.loaded, took_ms: 0 }); - }) - .promise(); +/** + * gpu workflow + * @param {IOWorker} io_worker + * @returns {Promise} + */ +async function gpu_func(io_worker) { + const key = select_next_object_key(); + const get_res = await io_worker.s3_client.getObject({ + Bucket: argv.bucket, + Key: key, + }); + + if (argv.verbose) console.log('GET', key, { ...get_res, Body: 'redacted' }); + + let get_size = 0; + + if (argv.rdma) { + // no need to make any copies! + // but must consume the stream to release the http connection + await get_res.Body.transformToString(); + // @ts-ignore + get_size = get_res.rdma_reply.size; + + } else if (argv.cuda) { + // copy the data to the cuda memory + for await (const chunk of get_res.Body.transformToWebStream()) { + get_size += io_worker.cuda_mem.copy_from_host(chunk, get_size); + } + + } else { + // copy the data to the buffer + for await (const chunk of get_res.Body.transformToWebStream()) { + get_size += chunk.copy(io_worker.buffer, get_size); + } + } + + // modify + if (argv.cuda) { + io_worker.cuda_mem.fill(0xba); + } else { + io_worker.buffer.fill(0xba); + } + + // copy the data back to the buffer + if (argv.cuda && !argv.rdma) { + io_worker.cuda_mem.copy_to_host(io_worker.buffer); + } + + const put_key = argv.gpu + (argv.samekey ? 
'' : '-' + Date.now().toString(36)); + const put_res = await io_worker.s3_client.putObject({ + Bucket: argv.bucket, + Key: put_key, + Body: argv.rdma ? null : io_worker.buffer, + }); + + if (argv.verbose) console.log('PUT', put_key, put_res); + + // rdma transfered the object data directly from our rdma_buf[0..size] + if (argv.rdma) { + // @ts-ignore + return put_res.rdma_reply.size; + } + + return 0; +} + +async function head_object() { + const key = select_next_object_key(); + await s3.headObject({ Bucket: argv.bucket, Key: key }); + return 0; +} + +async function delete_object() { + // require an approval flag to prevent unintended deletes + if (!argv.yes_really_delete) { + console.error('Allow deleting objects with --yes_really_delete'); + process.exit(1); + } + const key = select_next_object_key(); + await s3.deleteObject({ Bucket: argv.bucket, Key: key }); return 0; } async function create_bucket() { const new_bucket = argv.mb + '-' + Date.now().toString(36); - await s3.createBucket({ Bucket: new_bucket }).promise(); + await s3.createBucket({ Bucket: new_bucket }); return 0; } + function print_usage() { console.log(` Usage: - --help show this usage - --time running time in seconds (0 seconds by default) - --head head objects (prefix can be omitted) - --get get objects (prefix can be omitted) - --delete delete objects (prefix can be omitted) - --put put (single) to key (key can be omitted) - --upload upload (multipart) to key (key can be omitted) - --mb creates a new bucket (bucket can be omitted) + --help show this usage + --time running time in seconds (0 seconds by default) + --get get objects (prefix can be omitted) + --put put (single part) + --upload upload (multipart) + --gpu runs a gpu workflow + --head head objects + --delete delete objects + --mb creates a new bucket Upload Flags: - --concur concurrent operations to run from each process (default is 1) - --forks number of forked processes to run (default is 1) - --size generate random data of size (default 1) - --size_units KB|MB|GB generate random data of size_units (default MB) - --part_size multipart size - --part_concur multipart concurrency + --concur concurrent operations to run from each process (default is 1) + --forks number of forked processes to run (default is 1) + --size generate random data of size (default 1) + --size_units KB|MB|GB generate random data of size_units (default MB) + --part_size multipart size + --part_concur multipart concurrency + --exact_key use this key for all operations General S3 Flags: - --endpoint (default is localhost) - --access_key (default is env.AWS_ACCESS_KEY_ID || 123) - --secret_key (default is env.AWS_SECRET_ACCESS_KEY || abc) - --bucket (default is "first.bucket") - --sig v4|s3 (default is s3) - --ssl (default is false) Force SSL connection - --aws (default is false) Use AWS endpoint and subdomain-style buckets - --checksum (default is false) Calculate checksums on data. slower. + --endpoint (default is localhost) + --access_key (default is env.AWS_ACCESS_KEY_ID || 123) + --secret_key (default is env.AWS_SECRET_ACCESS_KEY || abc) + --bucket (default is "first.bucket") + --prefix (default is s3perf/) + --checksum (default is false) Calculate checksums on data. slower. + --verbose (default is false) Print more info. 
+ --rdma (default is false) Use RDMA for data transfer + --cuda (default is false) Use CUDA memory over RDMA `); process.exit(); } diff --git a/src/tools/tcp_speed.go b/src/tools/tcp_speed.go deleted file mode 100644 index e19b1e4591..0000000000 --- a/src/tools/tcp_speed.go +++ /dev/null @@ -1,202 +0,0 @@ -package main - -import ( - "bufio" - "bytes" - "crypto/rand" - "crypto/rsa" - "crypto/tls" - "crypto/x509" - "crypto/x509/pkix" - "encoding/binary" - "encoding/pem" - "errors" - "flag" - "fmt" - "io" - "log" - "math/big" - "net" - "os" - "runtime/pprof" - "strconv" - "time" -) - -var client = flag.Bool("client", false, "run client") -var ssl = flag.Bool("ssl", false, "use ssl") -var port = flag.Int("port", 50505, "tcp port to use") -var prof = flag.String("prof", "", "write cpu profile to file") -var bufsize = flag.Int("size", 128*1024, "memory buffer size") -var frame = flag.Bool("frame", false, "send/receive framed messages") - -func main() { - flag.Parse() - if *prof != "" { - f, err := os.Create(*prof) - if err != nil { - log.Fatal(err) - } - pprof.StartCPUProfile(f) - defer pprof.StopCPUProfile() - } - if *client { - runSender() - } else { - runReceiver() - } -} - -func runSender() { - var conn net.Conn - var err error - if *ssl { - config := &tls.Config{InsecureSkipVerify: true} - conn, err = tls.Dial("tcp", "localhost:"+strconv.Itoa(*port), config) - } else { - conn, err = net.Dial("tcp", "localhost:"+strconv.Itoa(*port)) - } - fatal(err) - buf := make([]byte, *bufsize) - var speedometer Speedometer - speedometer.Init() - for { - if *frame { - n := uint32(len(buf)) // uint32(float64(len(buf))*(1+rand.Float64())/8) * 4 - err := binary.Write(conn, binary.BigEndian, n) - fatal(err) - // nwrite, err := conn.Write(buf[0:n]) - nwrite, err := conn.Write(buf) - if err == io.EOF { - break - } - fatal(err) - speedometer.Update(uint64(nwrite)) - - } else { - nwrite, err := conn.Write(buf) - if err == io.EOF { - break - } - fatal(err) - speedometer.Update(uint64(nwrite)) - } - } - conn.Close() -} - -func runReceiver() { - var listener net.Listener - var err error - if *ssl { - config := &tls.Config{Certificates: []tls.Certificate{GenerateCert()}} - listener, err = tls.Listen("tcp", ":"+strconv.Itoa(*port), config) - } else { - listener, err = net.Listen("tcp", ":"+strconv.Itoa(*port)) - } - fatal(err) - fmt.Println("Listening on port", *port) - conn, err := listener.Accept() - fatal(err) - listener.Close() - // reader := conn - reader := bufio.NewReaderSize(conn, *bufsize) - buf := make([]byte, *bufsize) - var speedometer Speedometer - speedometer.Init() - for { - if *frame { - var n uint32 - err := binary.Read(reader, binary.BigEndian, &n) - if err == io.EOF { - break - } - fatal(err) - if int(n) > len(buf) { - fatal(errors.New("Frame too big")) - } - nread, err := io.ReadAtLeast(reader, buf, int(n)) - if err == io.EOF { - break - } - fatal(err) - speedometer.Update(uint64(nread)) - } else { - nread, err := reader.Read(buf) - if err == io.EOF { - break - } - fatal(err) - speedometer.Update(uint64(nread)) - } - } -} - -// Speedometer is a speed measurement util -type Speedometer struct { - bytes uint64 - lastBytes uint64 - lastTime time.Time -} - -// Init a speedometer -func (s *Speedometer) Init() { - s.lastTime = time.Now() -} - -// Update a speedometer -func (s *Speedometer) Update(bytes uint64) { - s.bytes += bytes - took := time.Since(s.lastTime).Seconds() - if took >= 1 { - fmt.Printf("%7.1f MB/sec \n", float64(s.bytes-s.lastBytes)/1024/1024/took) - s.lastTime = time.Now() - s.lastBytes 
= s.bytes - } -} - -func fatal(err error) { - if err != nil { - log.Panic(err) - } -} - -// GenerateCert generates a self signed TLS certificate -func GenerateCert() tls.Certificate { - - cert := &x509.Certificate{ - SerialNumber: big.NewInt(1), - Subject: pkix.Name{Organization: []string{"Acme Co"}}, - NotBefore: time.Now(), - NotAfter: time.Now().AddDate(1, 0, 0), - IsCA: true, - BasicConstraintsValid: true, - KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature | x509.KeyUsageCertSign, - ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth, x509.ExtKeyUsageClientAuth}, - IPAddresses: []net.IP{net.ParseIP("127.0.0.1")}, - DNSNames: nil, - } - - privateKey, err := rsa.GenerateKey(rand.Reader, 4096) - fatal(err) - - certBytes, err := x509.CreateCertificate(rand.Reader, cert, cert, &privateKey.PublicKey, privateKey) - fatal(err) - - certPEM := new(bytes.Buffer) - fatal(pem.Encode(certPEM, &pem.Block{ - Type: "CERTIFICATE", - Bytes: certBytes, - })) - - privateKeyPEM := new(bytes.Buffer) - fatal(pem.Encode(privateKeyPEM, &pem.Block{ - Type: "RSA PRIVATE KEY", - Bytes: x509.MarshalPKCS1PrivateKey(privateKey), - })) - - tlsCert, err := tls.X509KeyPair(certPEM.Bytes(), privateKeyPEM.Bytes()) - fatal(err) - - return tlsCert -} diff --git a/src/util/buffer_utils.js b/src/util/buffer_utils.js index 9d03a80dac..bc0005bd2d 100644 --- a/src/util/buffer_utils.js +++ b/src/util/buffer_utils.js @@ -87,12 +87,12 @@ async function read_stream(readable) { }; await new Promise((resolve, reject) => readable - .on('data', data => { - res.buffers.push(data); - res.total_length += data.length; - }) - .once('error', reject) - .once('end', resolve) + .on('data', data => { + res.buffers.push(data); + res.total_length += data.length; + }) + .once('error', reject) + .once('end', resolve) ); return res; } @@ -229,6 +229,24 @@ class BuffersPool { return { buffer, callback }; } + /** + * Invoke an async callback with a buffer from the pool, + * and release the buffer back to the pool when its promise is fulfilled. 
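+     *
+     * A minimal usage sketch (the file read call is illustrative, not part of this change):
+     *
+     *   const nread = await buffers_pool.use_buffer(async buffer => {
+     *       // do not keep references to `buffer` after this scope resolves,
+     *       // since it is released back to the pool right away
+     *       return file.read(fs_context, buffer, 0, buffer.length, pos);
+     *   });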
+ * + * @template T + * @param {(buffer: Buffer) => Promise} func + * @returns {Promise} + */ + async use_buffer(func) { + const { buffer, callback } = await this.get_buffer(); + try { + const ret = await func(buffer); + return ret; + } finally { + callback(); + } + } + [util.inspect.custom]() { return 'BufferPool.get_buffer: sem value: ' + this.sem._value + ' waiting_value: ' + this.sem._waiting_value + @@ -242,6 +260,7 @@ class MultiSizeBuffersPool { * sorted_buf_sizes: Array<{ * size: number; * sem_size: number; + * is_default?: boolean; * }>; * warning_timeout?: number; * sem_timeout?: number, @@ -251,8 +270,10 @@ class MultiSizeBuffersPool { * }} params */ constructor({ sorted_buf_sizes, warning_timeout, sem_timeout, sem_timeout_error_code, sem_warning_timeout, buffer_alloc }) { - this.pools = sorted_buf_sizes.map(({ size, sem_size }) => - new BuffersPool({ + /** @type {BuffersPool} */ + this.default_pool = null; + this.pools = sorted_buf_sizes.map(({ size, sem_size, is_default }) => { + const pool = new BuffersPool({ buf_size: size, sem: new semaphore.Semaphore(sem_size, { timeout: sem_timeout, @@ -261,23 +282,50 @@ class MultiSizeBuffersPool { }), warning_timeout: warning_timeout, buffer_alloc - })); + }); + if (is_default) { + this.default_pool ||= pool; + } + return pool; + }); + // if no pool is marked as default, use the largest pool as default + if (!this.default_pool) { + this.default_pool = this.pools[this.pools.length - 1]; + } } /** + * Returns the buffers pool that fits the given size. + * It returns the largest pool if no size is provided. + * It returns the smallest pool that covers the given size. + * If no pool , the largest pool is returned. + * The caller should be prepared to use buffers larger than the requested size, + * or smaller than the requested size if there is no pool for that size. + * @param {number} [size] * @returns {BuffersPool} */ get_buffers_pool(size) { - const largest = this.pools[this.pools.length - 1]; if (typeof size !== 'number' || size < 0) { - return largest; + return this.default_pool; } for (const bp of this.pools) { if (size <= bp.buf_size) { return bp; } } - return largest; + return this.default_pool; + } + + /** + * Convenience method to use a buffer from a pool + * + * @template T + * @param {number} size + * @param {(buffer: Buffer) => Promise} func + * @returns {Promise} + */ + async use_buffer(size, func) { + return this.get_buffers_pool(size).use_buffer(func); } } diff --git a/src/util/file_reader.js b/src/util/file_reader.js index a8be5e9184..6a8993c637 100644 --- a/src/util/file_reader.js +++ b/src/util/file_reader.js @@ -2,7 +2,236 @@ 'use strict'; +const stream = require('stream'); +const config = require('../../config'); const nb_native = require('./nb_native'); +const stream_utils = require('./stream_utils'); +const native_fs_utils = require('./native_fs_utils'); + +/** @typedef {import('./buffer_utils').MultiSizeBuffersPool} MultiSizeBuffersPool */ + +/** + * FileReader is a Readable stream that reads data from a filesystem file. + * + * The Readable interface is easy to use, however, for us, it is not efficient enough + * because it has to allocate a new buffer for each chunk of data read from the file. + * This allocation and delayed garbage collection becomes expensive in high throughputs + * (which is something to improve in nodejs itself). + * + * To solve this, we added the optimized method read_into_stream(target_stream) which uses + * a buffer pool to recycle the buffers and avoid the allocation overhead. 
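+ *
+ * A rough sketch of the plain Readable usage (values and stream names are illustrative):
+ *
+ *   const reader = new FileReader({ fs_context, file, file_path, stat,
+ *       start: 0, end: stat.size, multi_buffer_pool, signal: abort_controller.signal });
+ *   await stream.promises.pipeline(reader, output_stream);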
+ * + * The target_stream should be a Writable stream that will not use the buffer after the + * write callback, since we will release the buffer back to the pool in the callback. + */ +class FileReader extends stream.Readable { + + /** + * @param {{ + * fs_context: nb.NativeFSContext, + * file: nb.NativeFile, + * file_path: string, + * stat: nb.NativeFSStats, + * start: number, + * end: number, + * multi_buffer_pool: MultiSizeBuffersPool, + * signal: AbortSignal, + * stats?: import('../sdk/endpoint_stats_collector').EndpointStatsCollector, + * bucket?: string, + * namespace_resource_id?: string, + * highWaterMark?: number, + * }} params + */ + constructor({ fs_context, + file, + file_path, + start, + end, + stat, + multi_buffer_pool, + signal, + stats, + bucket, + namespace_resource_id, + highWaterMark = config.NFSF_DOWNLOAD_STREAM_MEM_THRESHOLD, + }) { + super({ highWaterMark }); + this.fs_context = fs_context; + this.file = file; + this.file_path = file_path; + this.stat = stat; + this.start = start || 0; + this.end = Math.min(stat.size, end ?? Infinity); + this.pos = this.start; + this.multi_buffer_pool = multi_buffer_pool; + this.signal = signal; + this.stats = stats; + this.stats_count_once = 1; + this.bucket = bucket; + this.namespace_resource_id = namespace_resource_id; + this.num_bytes = 0; + this.num_buffers = 0; + this.log2_size_histogram = {}; + } + + /** + * Readable stream implementation + * @param {number} [size] + */ + async _read(size) { + try { + size ||= this.readableHighWaterMark; + const remain_size = this.end - this.pos; + if (remain_size <= 0) { + this.push(null); + return; + } + const read_size = Math.min(size, remain_size); + const buffer = Buffer.allocUnsafe(read_size); + const nread = await this.read_into_buffer(buffer, 0, read_size); + if (nread === read_size) { + this.push(buffer); + } else if (nread > 0) { + this.push(buffer.subarray(0, nread)); + } else { + this.push(null); + } + } catch (err) { + this.emit('error', err); + } + } + + /** + * @param {Buffer} buf + * @param {number} offset + * @param {number} length + * @returns {Promise} + */ + async read_into_buffer(buf, offset, length) { + await this._warmup_sparse_file(this.pos); + this.signal.throwIfAborted(); + let nread = 0; + if (process.env.GGG_SKIP_IO === 'true' || process.env.GGG_SKIP_IO_READ === 'true') { + const nnn = (await this.file.read(this.fs_context, buf, offset + length - 1, 1, this.pos + length - 1)); + nread = nnn ? nnn + length - 1 : 0; + } else { + nread = await this.file.read(this.fs_context, buf, offset, length, this.pos); + } + if (nread) { + this.pos += nread; + this._update_stats(nread); + } + return nread; + } + + + /** + * Alternative implementation without using Readable stream API + * This allows to use a buffer pool to avoid creating new buffers. + * + * The target_stream should be a Writable stream that will not use the buffer after the + * write callback, since we will release the buffer back to the pool in the callback. + * This means Transforms should not be used as target_stream. 
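+     *
+     * Rough call sketch (the response object is illustrative; any plain Writable works):
+     *
+     *   await reader.read_into_stream(res); // res is an http.ServerResponse - a Writable, not a Transform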
+ * + * @param {stream.Writable} target_stream + */ + async read_into_stream(target_stream) { + if (target_stream instanceof stream.Transform) { + throw new Error('FileReader read_into_stream must be called with a Writable stream, not a Transform stream'); + } + + let buffer_pool_cleanup = null; + let drain_promise = null; + + try { + while (this.pos < this.end) { + // prefer to warmup sparse file before allocating a buffer + await this._warmup_sparse_file(this.pos); + + // allocate or reuse buffer + // TODO buffers_pool and the underlying semaphore should support abort signal + // to avoid sleeping inside the semaphore until the timeout while the request is already aborted. + this.signal.throwIfAborted(); + const remain_size = this.end - this.pos; + const { buffer, callback } = await this.multi_buffer_pool.get_buffers_pool(remain_size).get_buffer(); + buffer_pool_cleanup = callback; // must be called ***IMMEDIATELY*** after get_buffer + this.signal.throwIfAborted(); + + // read from file + const read_size = Math.min(buffer.length, remain_size); + const nread = await this.read_into_buffer(buffer, 0, read_size); + if (!nread) { + buffer_pool_cleanup = null; + callback(); + break; + } + + // wait for response buffer to drain before adding more data if needed - + // this occurs when the output network is slower than the input file + if (drain_promise) { + this.signal.throwIfAborted(); + await drain_promise; + drain_promise = null; + this.signal.throwIfAborted(); + } + + // write the data out to response + const data = buffer.subarray(0, nread); + buffer_pool_cleanup = null; // cleanup is now in the socket responsibility + const write_ok = target_stream.write(data, null, callback); + if (!write_ok) { + drain_promise = stream_utils.wait_drain(target_stream, { signal: this.signal }); + drain_promise.catch(() => undefined); // this avoids UnhandledPromiseRejection + } + } + + // wait for the last drain if pending. 
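+            // without this wait the method could resolve while the target stream is
+            // still above its high-water mark with our last chunk queued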
+ if (drain_promise) { + this.signal.throwIfAborted(); + await drain_promise; + drain_promise = null; + this.signal.throwIfAborted(); + } + + } finally { + if (buffer_pool_cleanup) buffer_pool_cleanup(); + } + } + + /** + * @param {number} size + */ + _update_stats(size) { + this.num_bytes += size; + this.num_buffers += 1; + const log2_size = Math.ceil(Math.log2(size)); + this.log2_size_histogram[log2_size] = (this.log2_size_histogram[log2_size] || 0) + 1; + + // update stats collector but count the entire read operation just once + const count = this.stats_count_once; + this.stats_count_once = 0; // counting the entire operation just once + this.stats?.update_nsfs_write_stats({ + namespace_resource_id: this.namespace_resource_id, + size, + count, + bucket_name: this.bucket, + }); + } + + /** + * @param {number} pos + */ + async _warmup_sparse_file(pos) { + if (!config.NSFS_BUF_WARMUP_SPARSE_FILE_READS) return; + if (!native_fs_utils.is_sparse_file(this.stat)) return; + this.signal.throwIfAborted(); + await native_fs_utils.warmup_sparse_file(this.fs_context, this.file, this.file_path, this.stat, pos); + } + + +} + + class NewlineReaderFilePathEntry { constructor(fs_context, filepath) { @@ -204,5 +433,6 @@ class NewlineReader { } } +exports.FileReader = FileReader; exports.NewlineReader = NewlineReader; exports.NewlineReaderEntry = NewlineReaderFilePathEntry; diff --git a/src/util/file_writer.js b/src/util/file_writer.js index c8df126719..3da2bc4e9d 100644 --- a/src/util/file_writer.js +++ b/src/util/file_writer.js @@ -4,7 +4,7 @@ const stream = require('stream'); const config = require('../../config'); const nb_native = require('./nb_native'); -const dbg = require('../util/debug_module')(__filename); +const dbg = require('./debug_module')(__filename); /** * FileWriter is a Writable stream that write data to a filesystem file, @@ -14,17 +14,16 @@ class FileWriter extends stream.Writable { /** * @param {{ - * target_file: object, - * fs_context: object, - * namespace_resource_id: string, - * md5_enabled: boolean, - * stats: import('../sdk/endpoint_stats_collector').EndpointStatsCollector, + * target_file: nb.NativeFile, + * fs_context: nb.NativeFSContext, + * md5_enabled?: boolean, * offset?: number, + * stats?: import('../sdk/endpoint_stats_collector').EndpointStatsCollector, * bucket?: string, - * large_buf_size?: number, + * namespace_resource_id?: string, * }} params */ - constructor({ target_file, fs_context, namespace_resource_id, md5_enabled, stats, offset, bucket, large_buf_size }) { + constructor({ target_file, fs_context, md5_enabled, offset, stats, bucket, namespace_resource_id }) { super({ highWaterMark: config.NFSF_UPLOAD_STREAM_MEM_THRESHOLD }); this.target_file = target_file; this.fs_context = fs_context; @@ -34,12 +33,48 @@ class FileWriter extends stream.Writable { this.stats = stats; this.bucket = bucket; this.namespace_resource_id = namespace_resource_id; - this.large_buf_size = large_buf_size || config.NSFS_BUF_SIZE_L; this.MD5Async = md5_enabled ? new (nb_native().crypto.MD5Async)() : undefined; const platform_iov_max = nb_native().fs.PLATFORM_IOV_MAX; this.iov_max = platform_iov_max ? 
Math.min(platform_iov_max, config.NSFS_DEFAULT_IOV_MAX) : config.NSFS_DEFAULT_IOV_MAX; } + /** + * @param {stream.Readable} source_stream + * @param {import('events').Abortable} [options] + */ + async write_entire_stream(source_stream, options) { + await stream.promises.pipeline(source_stream, this, options); + await stream.promises.finished(this, options); + } + + /** + * Ingests an array of buffers and writes them to the target file, + * while handling MD5 calculation and stats update. + * @param {Buffer[]} buffers + * @param {number} size + */ + async write_buffers(buffers, size) { + await Promise.all([ + this.MD5Async && this._update_md5(buffers, size), + this._write_all_buffers(buffers, size), + ]); + this._update_stats(size); + } + + /** + * Finalizes the MD5 calculation and sets the digest. + */ + async finalize() { + if (this.MD5Async) { + const digest = await this.MD5Async.digest(); + this.digest = digest.toString('hex'); + } + } + + /////////////// + // INTERNALS // + /////////////// + /** * @param {number} size */ @@ -66,6 +101,8 @@ class FileWriter extends stream.Writable { } /** + * Writes an array of buffers to the target file, + * splitting them into batches if it exceeds the platform's IOV_MAX. * @param {Buffer[]} buffers * @param {number} size */ @@ -85,17 +122,24 @@ class FileWriter extends stream.Writable { } /** + * Writes an array of buffers to the target file, + * updating the offset and total bytes * @param {Buffer[]} buffers * @param {number} size */ async _write_to_file(buffers, size) { dbg.log1(`FileWriter._write_to_file: buffers ${buffers.length} size ${size} offset ${this.offset}`); - await this.target_file.writev(this.fs_context, buffers, this.offset); + if (process.env.GGG_SKIP_IO === 'true' || process.env.GGG_SKIP_IO_WRITE === 'true') { + // no-op + } else { + await this.target_file.writev(this.fs_context, buffers, this.offset); + } if (this.offset >= 0) this.offset += size; // when offset<0 we just append this.total_bytes += size; } /** + * Implements the write method of Writable stream. * @param {Array<{ chunk: Buffer; encoding: BufferEncoding; }>} chunks * @param {(error?: Error | null) => void} callback */ @@ -106,11 +150,7 @@ class FileWriter extends stream.Writable { size += it.chunk.length; return it.chunk; }); - await Promise.all([ - this.MD5Async && this._update_md5(buffers, size), - this._write_all_buffers(buffers, size), - ]); - this._update_stats(size); + await this.write_buffers(buffers, size); return callback(); } catch (err) { console.error('FileWriter._writev: failed', err); @@ -119,15 +159,12 @@ class FileWriter extends stream.Writable { } /** + * Implements the final method of Writable stream. 
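+     *
+     * For reference, a typical end-to-end flow looks roughly like this
+     * (names are illustrative and not part of this change):
+     *
+     *   const writer = new FileWriter({ target_file, fs_context, md5_enabled: true, offset: 0 });
+     *   await writer.write_entire_stream(source_stream);
+     *   console.log('total', writer.total_bytes, 'md5', writer.digest);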
* @param {(error?: Error | null) => void} callback */ async _final(callback) { try { - if (this.MD5Async) { - const digest = await this.MD5Async.digest(); - this.digest = digest.toString('hex'); - } - + await this.finalize(); return callback(); } catch (err) { console.error('FileWriter._final: failed', err); diff --git a/src/util/fips.js b/src/util/fips.js index 123013aff7..853b531cc8 100644 --- a/src/util/fips.js +++ b/src/util/fips.js @@ -106,7 +106,9 @@ function detect_fips_mode() { const fips_proc_file = process.env.FIPS_PROC_FILE || '/proc/sys/crypto/fips_enabled'; try { const value = fs.readFileSync(fips_proc_file, 'utf8').trim(); - console.log(`detect_fips_mode: found ${fips_proc_file} with value ${value}`); + if (value !== '0') { + console.log(`detect_fips_mode: found ${fips_proc_file} with value ${value}`); + } return value === '1'; } catch (err) { if (err.code !== 'ENOENT' && err.code !== 'ENOTDIR') { diff --git a/src/util/http_utils.js b/src/util/http_utils.js index 22b59f6a24..c6f286cd10 100644 --- a/src/util/http_utils.js +++ b/src/util/http_utils.js @@ -102,6 +102,10 @@ function hdr_as_arr(headers, key) { return v; } +/** + * @param {http.IncomingMessage & NodeJS.Dict} req + * @returns {querystring.ParsedUrlQuery} + */ function parse_url_query(req) { req.originalUrl = req.url; const query_pos = req.url.indexOf('?'); @@ -111,6 +115,7 @@ function parse_url_query(req) { req.query = querystring.parse(req.url.slice(query_pos + 1)); req.url = req.url.slice(0, query_pos); } + return req.query; } function parse_client_ip(req) { @@ -839,6 +844,21 @@ function http_get(uri, options) { client.get(uri, options, resolve).on('error', reject); }); } + +/** + * + * @param {net.Socket} conn + */ +function http_server_connections_logger(conn) { + // @ts-ignore + const fd = conn._handle.fd; + const info = { port: conn.localPort, fd, remote: conn.remoteAddress }; + dbg.log0('HTTP connection accepted', info); + conn.once('close', () => { + dbg.log0('HTTP connection closed', info); + }); +} + /** * start_https_server starts the secure https server by type and options and creates a certificate if required * @param {number} https_port @@ -848,6 +868,7 @@ function http_get(uri, options) { async function start_https_server(https_port, server_type, request_handler, nsfs_config_root) { const ssl_cert_info = await ssl_utils.get_ssl_cert_info(server_type, nsfs_config_root); const https_server = await ssl_utils.create_https_server(ssl_cert_info, true, request_handler); + https_server.on('connection', http_server_connections_logger); ssl_cert_info.on('update', updated_ssl_cert_info => { dbg.log0(`Setting updated ${server_type} ssl certs for endpoint.`); const updated_ssl_options = { ...updated_ssl_cert_info.cert, honorCipherOrder: true }; @@ -856,6 +877,7 @@ async function start_https_server(https_port, server_type, request_handler, nsfs dbg.log0(`Starting ${server_type} server on HTTPS port ${https_port}`); await listen_port(https_port, https_server, server_type); dbg.log0(`Started ${server_type} HTTPS server successfully`); + return https_server; } /** @@ -866,11 +888,13 @@ async function start_https_server(https_port, server_type, request_handler, nsfs */ async function start_http_server(http_port, server_type, request_handler) { const http_server = http.createServer(request_handler); + http_server.on('connection', http_server_connections_logger); if (http_port > 0) { dbg.log0(`Starting ${server_type} server on HTTP port ${http_port}`); await listen_port(http_port, http_server, server_type); 
dbg.log0(`Started ${server_type} HTTP server successfully`); } + return http_server; } /** @@ -1009,6 +1033,7 @@ exports.validate_server_ip_whitelist = validate_server_ip_whitelist; exports.http_get = http_get; exports.start_http_server = start_http_server; exports.start_https_server = start_https_server; +exports.http_server_connections_logger = http_server_connections_logger; exports.CONTENT_TYPE_TEXT_PLAIN = CONTENT_TYPE_TEXT_PLAIN; exports.CONTENT_TYPE_APP_OCTET_STREAM = CONTENT_TYPE_APP_OCTET_STREAM; exports.CONTENT_TYPE_APP_JSON = CONTENT_TYPE_APP_JSON; diff --git a/src/util/native_fs_utils.js b/src/util/native_fs_utils.js index 4a12a80b66..63af4e3dfa 100644 --- a/src/util/native_fs_utils.js +++ b/src/util/native_fs_utils.js @@ -66,25 +66,30 @@ async function _generate_unique_path(fs_context, tmp_dir_path) { * @param {string} open_mode */ // opens open_path on POSIX, and on GPFS it will open open_path parent folder -async function open_file(fs_context, bucket_path, open_path, open_mode = config.NSFS_OPEN_READ_MODE, - file_permissions = config.BASE_MODE_FILE) { +async function open_file( + fs_context, + bucket_path, + open_path, + open_mode = config.NSFS_OPEN_READ_MODE, + file_permissions = config.BASE_MODE_FILE, +) { let retries = config.NSFS_MKDIR_PATH_RETRIES; const dir_path = path.dirname(open_path); const actual_open_path = open_mode === 'wt' ? dir_path : open_path; const should_create_path_dirs = (open_mode === 'wt' || open_mode === 'w') && dir_path !== bucket_path; - for (;;) { + for (; ;) { try { if (should_create_path_dirs) { - dbg.log1(`NamespaceFS._open_file: mode=${open_mode} creating dirs`, open_path, bucket_path); + dbg.log1(`native_fs_utils.open_file: mode=${open_mode} creating dirs`, open_path, bucket_path); await _make_path_dirs(open_path, fs_context); } - dbg.log1(`NamespaceFS._open_file: mode=${open_mode}`, open_path); + dbg.log1(`native_fs_utils.open_file: mode=${open_mode}`, open_path); // for 'wt' open the tmpfile with the parent dir path const fd = await nb_native().fs.open(fs_context, actual_open_path, open_mode, get_umasked_mode(file_permissions)); return fd; } catch (err) { - dbg.warn(`native_fs_utils.open_file Retrying retries=${retries} mode=${open_mode} open_path=${open_path} dir_path=${dir_path} actual_open_path=${actual_open_path}`, err); + dbg.warn(`native_fs_utils.open_file: Retrying retries=${retries} mode=${open_mode} open_path=${open_path} dir_path=${dir_path} actual_open_path=${actual_open_path}`, err); if (err.code !== 'ENOENT') throw err; // case of concurrennt deletion of the dir_path if (retries <= 0 || !should_create_path_dirs) throw err; @@ -93,6 +98,56 @@ async function open_file(fs_context, bucket_path, open_path, open_mode = config. } } +/** + * Open a file and close it after the async scope function is done. 
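+ *
+ * A minimal usage sketch (paths and the scope body are illustrative):
+ *
+ *   const ret = await use_file({
+ *       fs_context,
+ *       bucket_path,
+ *       open_path: path.join(bucket_path, 'obj'),
+ *       scope: async file => {
+ *           // read/write through `file` here - it is closed when the scope returns
+ *       },
+ *   });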
+ * + * @template T + * @param {{ + * fs_context: nb.NativeFSContext, + * bucket_path: string, + * open_path: string, + * open_mode?: string, + * file_permissions?: number, + * scope: (file: nb.NativeFile, file_path: string) => Promise, + * }} params + * @returns {Promise} + */ +async function use_file({ + fs_context, + bucket_path, + open_path, + open_mode, + file_permissions, + scope, +}) { + let file; + let ret; + + try { + file = await open_file(fs_context, bucket_path, open_path, open_mode, file_permissions); + } catch (err) { + dbg.error('native_fs_utils.use_file: open failed', open_path, err); + throw err; + } + + try { + ret = await scope(file, open_path); + } catch (err) { + dbg.error('native_fs_utils.use_file: scope failed', open_path, err); + throw err; + } finally { + if (file) { + try { + await file.close(fs_context); + } catch (err) { + dbg.warn('native_fs_utils.use_file: close failed', open_path, err); + } + } + } + + return ret; +} + /** * @param {MultiSizeBuffersPool} multi_buffers_pool * @param {nb.NativeFSContext} fs_context @@ -110,7 +165,7 @@ async function copy_bytes(multi_buffers_pool, fs_context, src_file, dst_file, si let bytes_written = 0; const total_bytes_to_write = Number(size); let write_pos = write_offset >= 0 ? write_offset : 0; - for (;;) { + for (; ;) { const total_bytes_left = total_bytes_to_write - bytes_written; if (total_bytes_left <= 0) break; const { buffer, callback } = await multi_buffers_pool.get_buffers_pool(total_bytes_left).get_buffer(); @@ -350,9 +405,7 @@ async function create_config_file(fs_context, schema_dir, config_path, config_da // validate config file doesn't exist try { await nb_native().fs.stat(fs_context, config_path); - const err = new Error('configuration file already exists'); - err.code = 'EEXIST'; - throw err; + throw Object.assign(new Error('configuration file already exists'), { code: 'EEXIST' }); } catch (err) { if (err.code !== 'ENOENT') throw err; } @@ -470,7 +523,7 @@ async function update_config_file(fs_context, schema_dir, config_path, config_da // moving tmp file to config path atomically dbg.log1('native_fs_utils: update_config_file moving from:', open_path, 'to:', config_path, 'is_gpfs=', is_gpfs); let retries = config.NSFS_RENAME_RETRIES; - for (;;) { + for (; ;) { try { const src_stat = is_gpfs ? undefined : await nb_native().fs.stat(fs_context, open_path); await safe_move(fs_context, open_path, config_path, src_stat, gpfs_options, tmp_dir_path); @@ -732,11 +785,52 @@ async function lock_and_run(fs_context, lock_path, cb) { } } +/** + * NOTICE that even files that were written sequentially, can still be identified as sparse: + * 1. After writing, but before all the data is synced, the size is higher than blocks size. + * 2. For files that were moved to an archive tier. + * 3. For files that fetch and cache data from remote storage, which are still not in the cache. + * It's not good enough for avoiding recall storms as needed by _fail_if_archived_or_sparse_file. + * However, using this check is useful for guessing that a reads is going to take more time + * and avoid holding off large buffers from the buffers_pool. + * @param {nb.NativeFSStats} stat + * @returns {boolean} + */ +function is_sparse_file(stat) { + return (stat.blocks * 512 < stat.size); +} + +let warmup_buffer; + +/** + * Our buffer pool keeps large buffers and we want to avoid spending + * all our large buffers and then have them waiting for high latency calls + * such as reading from archive/on-demand cache files. 
+ * Instead, we detect the case where a file is "sparse", + * and then use just a small buffer to wait for a tiny read, + * which will recall the file from archive or load from remote into cache, + * and once it returns we can continue to the full fledged read. + * @param {nb.NativeFSContext} fs_context + * @param {nb.NativeFile} file + * @param {nb.NativeFSStats} stat + * @param {number} pos + */ +async function warmup_sparse_file(fs_context, file, file_path, stat, pos) { + dbg.log0('warmup_sparse_file', { + file_path, pos, size: stat.size, blocks: stat.blocks, + }); + if (!warmup_buffer) { + warmup_buffer = nb_native().fs.dio_buffer_alloc(4096); + } + await file.read(fs_context, warmup_buffer, 0, 1, pos); +} + exports.get_umasked_mode = get_umasked_mode; exports._make_path_dirs = _make_path_dirs; exports._create_path = _create_path; exports._generate_unique_path = _generate_unique_path; exports.open_file = open_file; +exports.use_file = use_file; exports.copy_bytes = copy_bytes; exports.finally_close_files = finally_close_files; exports.get_user_by_distinguished_name = get_user_by_distinguished_name; @@ -774,5 +868,6 @@ exports.get_bucket_tmpdir_full_path = get_bucket_tmpdir_full_path; exports.get_bucket_tmpdir_name = get_bucket_tmpdir_name; exports.entity_enum = entity_enum; exports.translate_error_codes = translate_error_codes; - exports.lock_and_run = lock_and_run; +exports.is_sparse_file = is_sparse_file; +exports.warmup_sparse_file = warmup_sparse_file; diff --git a/src/util/nb_native.js b/src/util/nb_native.js index 0a685cb75e..f75fb084ec 100644 --- a/src/util/nb_native.js +++ b/src/util/nb_native.js @@ -28,6 +28,15 @@ function nb_native() { inherits(nb_native_nan.Ntcp, events.EventEmitter); _.defaults(nb_native_napi, nb_native_nan); + // GGG HACK TRACING STAT CALLS - TODO: REMOVE + if (process.env.GGG_TRACE_STAT === 'true') { + const original_stat = nb_native_napi.fs.stat; + nb_native_napi.fs.stat = function(...args) { + console.trace('fs.stat', ...args, new Error('TRACE').stack); + return original_stat(...args); + }; + } + if (process.env.DISABLE_INIT_RANDOM_SEED !== 'true') { init_rand_seed(); } @@ -53,13 +62,13 @@ async function init_rand_seed() { const seed = await read_rand_seed(32); if (seed) { - console.log(`init_rand_seed: seeding with ${seed.length} bytes`); + // console.log(`init_rand_seed: seeding with ${seed.length} bytes`); nb_native_napi.rand_seed(seed); } still_reading = false; await promise; - console.log('init_rand_seed: done'); + // console.log('init_rand_seed: done'); } async function read_rand_seed(seed_bytes) { diff --git a/src/util/rdma_utils.js b/src/util/rdma_utils.js new file mode 100644 index 0000000000..655bf10b09 --- /dev/null +++ b/src/util/rdma_utils.js @@ -0,0 +1,333 @@ +/* Copyright (C) 2025 NooBaa */ +'use strict'; + +const querystring = require('querystring'); + +const dbg = require('./debug_module')(__filename); +const config = require('../../config'); +const http_utils = require('./http_utils'); +const nb_native = require('./nb_native'); +const { S3Error } = require('../endpoint/s3/s3_errors'); +const { S3 } = require('@aws-sdk/client-s3'); + +const X_NOOBAA_RDMA = 'x-noobaa-rdma'; // both a request header and a response header + +/** + * @param {querystring.ParsedUrlQueryInput} info + * @returns {string} + */ +function encode_rdma_header(info) { + return querystring.stringify({ + v: 1, + ...info, + }); +} + +/** + * @param {string} header + * @returns {querystring.ParsedUrlQueryInput} +*/ +function decode_rdma_header(header) { + const info = 
querystring.parse(header); + if (info.v !== '1') { + dbg.error('decode_rdma_header: mismatching rdma version', info.v, 'expected 1'); + throw new S3Error(S3Error.InvalidArgument); + } + return info; +} + +/** + * @param {import('http').OutgoingHttpHeaders} req_headers + * @param {nb.RdmaInfo|undefined} rdma_info + */ +function set_rdma_request_header(req_headers, rdma_info) { + if (!rdma_info) return; + const h = encode_rdma_header({ ...rdma_info }); + req_headers[X_NOOBAA_RDMA] = h; +} + +/** + * @param {nb.S3Request|undefined} req + * @param {nb.S3Response} res + * @param {nb.RdmaInfo|undefined} rdma_info + * @param {nb.RdmaReply|undefined} rdma_reply + */ +function set_rdma_response_header(req, res, rdma_info, rdma_reply) { + if (!rdma_info || !rdma_reply) return; + const h = encode_rdma_header({ ...rdma_reply }); + res.setHeader(X_NOOBAA_RDMA, h); +} + +/** + * @param {nb.S3Request} req + * @returns {nb.RdmaInfo|undefined} + */ +function parse_rdma_info(req) { + const header = http_utils.hdr_as_str(req.headers, X_NOOBAA_RDMA); + if (!header) return; + try { + const info = decode_rdma_header(header); + const rdma_info = { + desc: String(info.desc), + addr: String(info.addr), + size: Number(String(info.size)), + offset: Number(String(info.offset || '0')), + }; + return rdma_info; + } catch (err) { + dbg.warn('parse_rdma_info: failed to parse header', header, err); + throw new S3Error(S3Error.InvalidArgument); + } +} + +/** + * @param {import('http').IncomingHttpHeaders} res_headers + * @returns {nb.RdmaReply|undefined} + */ +function parse_rdma_reply(res_headers) { + const header = http_utils.hdr_as_str(res_headers, X_NOOBAA_RDMA); + if (!header) return; + try { + const info = decode_rdma_header(header); + const rdma_reply = { + size: Number(String(info.size)), + }; + return rdma_reply; + } catch (err) { + dbg.warn('parse_rdma_reply: failed to parse header', header, err); + throw new S3Error(S3Error.InvalidArgument); + } +} + +///////////////// +// RDMA SERVER // +///////////////// + +let _rdma_server = null; + +/** + * @returns {nb.RdmaServerNapi} + */ +function s3_rdma_server() { + if (!config.RDMA_ENABLED) { + throw new Error('RDMA is not enabled'); + } + if (_rdma_server) return _rdma_server; + const { RdmaServerNapi } = nb_native(); + const ip = process.env.S3_RDMA_SERVER_IP || '172.16.0.61'; + _rdma_server = new RdmaServerNapi({ + ip, + port: 0, // every fork will get a different port + log_level: 'ERROR', + use_async_events: process.env.S3_RDMA_USE_ASYNC_EVENTS === 'true', + }); + console.log('RDMA server:', ip); + return _rdma_server; +} + +/** + * Server side RDMA operation to write a buffer from remote server to local file + * Use buffer pool to get buffer of the required size. 
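+ *
+ * Rough call sketch from an object upload path (surrounding names are illustrative):
+ *
+ *   const rdma_info = parse_rdma_info(req);
+ *   if (rdma_info) {
+ *       const rdma_reply = await write_file_from_rdma(rdma_info, file_writer, multi_buffer_pool, signal);
+ *       set_rdma_response_header(req, res, rdma_info, rdma_reply);
+ *   }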
+ * + * @param {nb.RdmaInfo} rdma_info + * @param {import ('./file_writer')} writer + * @param {import ('./buffer_utils').MultiSizeBuffersPool} multi_buffer_pool + * @param {AbortSignal} [abort_signal] + * @returns {Promise} + */ +async function write_file_from_rdma(rdma_info, writer, multi_buffer_pool, abort_signal) { + const rdma_server = await s3_rdma_server(); + return await multi_buffer_pool.use_buffer(rdma_info.size, async buffer => { + rdma_server.register_buffer(buffer); + let offset = 0; + while (offset < rdma_info.size) { + abort_signal?.throwIfAborted(); + const rdma_slice = slice_rdma_info(rdma_info, offset, buffer.length); + const ret_size = await rdma_server.rdma('PUT', 'FileWriter', buffer, rdma_slice); + // console.log('GGG ret_size', ret_size); + if (ret_size < 0) throw new Error('RDMA PUT failed'); + if (ret_size > buffer.length) throw new Error('RDMA PUT error: returned size is larger than buffer'); + if (ret_size === 0) break; + abort_signal?.throwIfAborted(); + if (ret_size === buffer.length) { + await writer.write_buffers([buffer], ret_size); + } else { + await writer.write_buffers([buffer.subarray(0, ret_size)], ret_size); + } + offset += ret_size; + } + abort_signal?.throwIfAborted(); + await writer.finalize(); + // console.log('GGG writer.total_bytes', writer.total_bytes); + return { size: offset }; + }); +} + +/** + * @param {nb.RdmaInfo} rdma_info + * @param {number} offset + * @param {number} size + * @returns {nb.RdmaInfo} + */ +function slice_rdma_info(rdma_info, offset, size) { + const slice = { ...rdma_info }; + slice.offset += offset; + slice.size -= offset; + if (slice.size > size) slice.size = size; + return slice; +} + +/** + * @param {nb.RdmaInfo} rdma_info + * @param {import ('./file_reader').FileReader} reader + * @param {import ('./buffer_utils').MultiSizeBuffersPool} multi_buffer_pool + * @param {AbortSignal} [abort_signal] + * @returns {Promise} + */ +async function read_file_to_rdma(rdma_info, reader, multi_buffer_pool, abort_signal) { + const rdma_server = await s3_rdma_server(); + return await multi_buffer_pool.use_buffer(rdma_info.size, async buffer => { + rdma_server.register_buffer(buffer); + let offset = 0; + while (offset < rdma_info.size) { + abort_signal?.throwIfAborted(); + const rdma_slice_pre_read = slice_rdma_info(rdma_info, offset, buffer.length); + const nread = await reader.read_into_buffer(buffer, 0, rdma_slice_pre_read.size); + // console.log('GGG nread', nread); + abort_signal?.throwIfAborted(); + const rdma_slice = slice_rdma_info(rdma_info, offset, nread); + const ret_size = await rdma_server.rdma('GET', reader.file_path, buffer, rdma_slice); + // console.log('GGG ret_size', ret_size); + if (ret_size !== nread) throw new Error('RDMA GET failed'); + offset += ret_size; + } + return offset; + }); +} + + +///////////////// +// RDMA CLIENT // +///////////////// + +/** + * @returns {nb.RdmaClientNapi} + */ +function new_rdma_client() { + if (!config.RDMA_ENABLED) { + throw new Error('RDMA is not enabled'); + } + return new (nb_native().RdmaClientNapi)(); +} + +/** + * @param {import('@aws-sdk/client-s3').S3ClientConfig} s3_config + * @param {Buffer} client_buf + * @param {nb.RdmaClientNapi} rdma_client + * @returns {S3} + */ +function s3_rdma_client(s3_config, client_buf, rdma_client) { + const s3 = new S3(s3_config); + s3.middlewareStack.use(s3_rdma_client_plugin(client_buf, rdma_client)); + return s3; +} + +/** + * @param {Buffer} client_buf + * @param {nb.RdmaClientNapi} rdma_client + * @returns 
{import('@smithy/types').Pluggable} + */ +function s3_rdma_client_plugin(client_buf, rdma_client) { + return { + applyToStack: stack => { + stack.add(s3_rdma_client_middleware(client_buf, rdma_client), { + name: 'rdma', + step: 'build', + }); + } + }; +} + +/** + * @param {Buffer} client_buf + * @param {nb.RdmaClientNapi} rdma_client + * @returns {import('@smithy/types').BuildMiddleware} + */ +function s3_rdma_client_middleware(client_buf, rdma_client) { + return (next, context) => async args => { + /** @type {any} */ + const input = args.input; + /** @type {any} */ + const request = args.request; + /** @type {any} */ + let result; + + // console.log('S3 RDMA: build', request, input); + + /** @type {'GET'|'PUT'} */ + let req_type = 'GET'; + /** @type {Buffer} */ + let rdma_buf; + + if (context.commandName === 'GetObjectCommand') { + req_type = 'GET'; + rdma_buf = client_buf; + } else if (context.commandName === 'PutObjectCommand') { + req_type = 'PUT'; + rdma_buf = client_buf; + // rdma_buf = input.Body; // TODO handle other body types? + input.Body = undefined; + request.headers['content-length'] = '0'; + } else if (context.commandName === 'UploadPartCommand') { + req_type = 'PUT'; + rdma_buf = client_buf; + // rdma_buf = input.Body; // TODO handle other body types? + input.Body = undefined; + request.headers['content-length'] = '0'; + } else { + return next(args); + } + + const ret_size = await rdma_client.rdma( + req_type, rdma_buf, async (rdma_info, callback) => { + try { + set_rdma_request_header(request.headers, rdma_info); + // console.log('S3 RDMA: request', request.headers); + result = await next(args); + // console.log('S3 RDMA: response', result.response.headers); + const rdma_reply = parse_rdma_reply(result.response.headers); + result.output.rdma_reply = rdma_reply; + callback(null, Number(rdma_reply.size)); + } catch (err) { + console.warn('S3 RDMA: Received error from server', err); + callback(err); + } + } + ); + + if (ret_size < 0) { + console.log('S3 RDMA: Return', ret_size, req_type, rdma_buf.length); + } + + return result; + }; +} + + +// EXPORTS +exports.X_NOOBAA_RDMA = X_NOOBAA_RDMA; +exports.encode_rdma_header = encode_rdma_header; +exports.decode_rdma_header = decode_rdma_header; +exports.set_rdma_request_header = set_rdma_request_header; +exports.set_rdma_response_header = set_rdma_response_header; +exports.parse_rdma_info = parse_rdma_info; +exports.parse_rdma_reply = parse_rdma_reply; +// SERVER +exports.s3_rdma_server = s3_rdma_server; +exports.write_file_from_rdma = write_file_from_rdma; +exports.read_file_to_rdma = read_file_to_rdma; +// CLIENT +exports.new_rdma_client = new_rdma_client; +exports.s3_rdma_client = s3_rdma_client; +exports.s3_rdma_client_plugin = s3_rdma_client_plugin; +exports.s3_rdma_client_middleware = s3_rdma_client_middleware; diff --git a/src/util/speedometer.js b/src/util/speedometer.js index b104ab961f..ac799461f0 100644 --- a/src/util/speedometer.js +++ b/src/util/speedometer.js @@ -1,62 +1,110 @@ /* Copyright (C) 2016 NooBaa */ 'use strict'; -const cluster = /** @type {import('node:cluster').Cluster} */ ( - /** @type {unknown} */ (require('node:cluster')) -); +/** @typedef {import('node:cluster').Cluster} Cluster */ +/** @typedef {import('node:cluster').Worker} Worker */ +const cluster = /** @type {Cluster} */ (/** @type {unknown} */ (require('node:cluster'))); +const WaitQueue = require('./wait_queue'); +const sketches = require('@datadog/sketches-js'); +// const { setTimeout: delay } = require('node:timers/promises'); + +/** + * 
@typedef {{ + * bytes: number, + * ops: number, + * sum_latency: number, + * min_latency: number, + * max_latency: number, + * latency_sketch: number[], + * }} Bulk + * + * @typedef {'ready' | 'init' | 'run' | 'update' | 'done'} OpType + * + * @typedef {{ + * speedometer?: { + * op: OpType, + * id?: number, + * bulk?: Bulk, + * info?: any, + * } + * }} Message + */ + +const STATE = Symbol('speedometer-worker-state'); + +const OP_READY = 'ready'; +const OP_INIT = 'init'; +const OP_RUN = 'run'; +const OP_DONE = 'done'; +const OP_UPDATE = 'update'; + +/** + * + * @param {OpType} op + * @returns {Message} + */ +function msg_from_op(op) { + return Object.freeze({ speedometer: Object.freeze({ op }) }); +} +const MSG_READY = msg_from_op(OP_READY); +const MSG_INIT = msg_from_op(OP_INIT); +const MSG_RUN = msg_from_op(OP_RUN); +const MSG_DONE = msg_from_op(OP_DONE); class Speedometer { - constructor(name) { + /** + * @param {{ + * name: string, + * argv?: any, + * num_workers?: number, + * primary_init?: () => Promise, + * workers_init?: (id: number, info: any) => Promise, + * workers_func?: (id: number, info: any) => Promise, + * }} params + */ + constructor({ name, argv, num_workers, primary_init, workers_init, workers_func }) { this.name = name || 'Speed'; + this.argv = argv; + this.num_workers = num_workers || 0; + this.primary_init = primary_init; + this.workers_init = workers_init; + this.workers_func = workers_func; + this._waitqueue = new WaitQueue(); + /** @type {NodeJS.Dict} */ + this.workers = undefined; + this.primary_info = undefined; + this.worker_info = undefined; + this.worker_run = false; + this.num_reports = 0; + this.reset(); + } + reset() { this.start_time = Date.now(); - this.last_time = this.start_time; - this.num_bytes = 0; - this.last_bytes = 0; - this.num_ops = 0; - this.last_ops = 0; - this.sum_latency = 0; - this.last_latency = 0; this.min_latency = -1; this.max_latency = -1; + this.latency_sketch = new sketches.DDSketch({ relativeAccuracy: 0.01 }); + this.last_time = this.start_time; + this.last_bytes = 0; + this.last_ops = 0; + this.last_latency = 0; } - run_workers(count, worker_func, args) { - if (cluster.isPrimary) { - console.log('ARGS:', JSON.stringify(args, null, 2)); - } - if (count > 1 && cluster.isPrimary) { - this.fork(count); - } else { - // primary will run the worker_func as well (if count <= 1 or undefined) - worker_func(); - } - } - - fork(count) { - if (cluster.isWorker) throw new Error('fork should be called only from the primary process'); - cluster.on('message', (worker, { bytes, ops, sum_latency, min_latency, max_latency }) => { - this.num_bytes += bytes; - this.num_ops += ops; - this.sum_latency += sum_latency; - if (min_latency >= 0 && (this.min_latency < 0 || min_latency < this.min_latency)) this.min_latency = min_latency; - if (max_latency >= 0 && (this.max_latency < 0 || max_latency > this.max_latency)) this.max_latency = max_latency; - if (!this.interval) this.set_interval(); - }); - cluster.on('exit', worker => { - if (!Object.keys(cluster.workers).length) { - this.clear_interval(); - this.report(); - // process.exit(); - } - }); - for (let i = 0; i < count; ++i) { - const worker = cluster.fork(); - console.warn('Worker start', worker.process.pid); + set_interval(delay_ms) { + delay_ms ||= this.is_primary() ? 
1000 : 480; + this.clear_interval(); + this.interval = setInterval(() => this._on_interval(), delay_ms); + this.interval.unref(); + } + + clear_interval() { + if (this.interval) { + clearInterval(this.interval); + this.interval = null; } } @@ -68,73 +116,369 @@ class Speedometer { return cluster.isWorker; } - update(bytes) { - this.num_bytes += bytes; + /** + * @param {() => Promise} func + */ + async measure(func) { + const start = process.hrtime.bigint(); + const size = await func(); + const took_ms = Number(process.hrtime.bigint() - start) / 1e6; + this.update(size || 0, took_ms); + } + + update(bytes, latency_ms) { + if (bytes > 0) this.num_bytes += bytes; + if (latency_ms > 0) { + this.num_ops += 1; + this.sum_latency += latency_ms; + this.latency_sketch.accept(latency_ms); + if (this.min_latency < 0 || latency_ms < this.min_latency) this.min_latency = latency_ms; + if (this.max_latency < 0 || latency_ms > this.max_latency) this.max_latency = latency_ms; + } if (!this.interval) this.set_interval(); } - add_op(took_ms) { - if (took_ms < 0) throw new Error('Speedometer: negative took_ms ' + took_ms); - this.num_ops += 1; - this.sum_latency += took_ms; - if (this.min_latency < 0 || took_ms < this.min_latency) this.min_latency = took_ms; - if (this.max_latency < 0 || took_ms > this.max_latency) this.max_latency = took_ms; + /** + * @param {Bulk} bulk + */ + _update_bulk({ bytes, ops, sum_latency, min_latency, max_latency, latency_sketch }) { + this.num_bytes += bytes; + this.num_ops += ops; + this.sum_latency += sum_latency; + if (min_latency >= 0 && (this.min_latency < 0 || min_latency < this.min_latency)) this.min_latency = min_latency; + if (max_latency >= 0 && (this.max_latency < 0 || max_latency > this.max_latency)) this.max_latency = max_latency; + this.latency_sketch.merge(sketches.DDSketch.fromProto(Uint8Array.from(latency_sketch))); if (!this.interval) this.set_interval(); } - set_interval(delay_ms) { + /** + * @returns {Bulk} + */ + _get_bulk() { + const bytes = this.num_bytes; + const ops = this.num_ops; + const sum_latency = this.sum_latency; + const min_latency = this.min_latency; + const max_latency = this.max_latency; + const latency_sketch = Array.from(this.latency_sketch.toProto()); + return { bytes, ops, sum_latency, min_latency, max_latency, latency_sketch }; + } + + _on_interval(min_delay_ms) { + if (cluster.isWorker) { + /** type {Message} */ + const msg = { speedometer: { op: OP_UPDATE, bulk: this._get_bulk() } }; + process.send(msg); + this.reset(); + } else { + this.report(); + } + } + + /** + * Start the speedometer, optionally with workers. + * Use message passing to synchronize all workers on init and run phases. + */ + async start() { + let rc = 0; + try { + process.on('SIGINT', signal => this._on_signal(signal)); + await this._init_primary(); + await this._start_workers(); + await this._workers_ready(); + await this._init_workers(); + await this._run_workers(); + if (cluster.isPrimary && this.argv) { + console.log('SPEEDOMETER: Arguments', JSON.stringify(this.argv)); + } + await this._done_workers(); + } catch (err) { + console.error('SPEEDOMETER: Error', err); + rc = 1; + } + // cleanup and exit + this._kill_workers(); this.clear_interval(); - this.interval = setInterval(() => this.report(), delay_ms || 1000); - this.interval.unref(); + this.summary(); + process.exit(rc); } - clear_interval() { - if (this.interval) { - clearInterval(this.interval); - this.interval = null; + /** + * Only listen to messages from workers. 
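+     *
+     * Rough sketch (illustrative): create an instance per process, call start_lite() once,
+     * and call update() per operation in the workers:
+     *
+     *   const speedometer = new Speedometer({ name: 'S3' });
+     *   speedometer.start_lite();
+     *   speedometer.update(bytes, took_ms);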
+ * Used when running inside the endpoint process that already has workers. + */ + start_lite() { + if (cluster.isPrimary) { + cluster.on('message', (worker, msg) => this._on_message_from_worker(worker, msg)); + } + } + + async _start_workers() { + if (cluster.isWorker) { + process.on('message', msg => this._on_message_to_worker(/** @type {Message} */(msg))); + } else if (cluster.isPrimary && this.num_workers > 1) { + cluster.on('message', (worker, msg) => this._on_message_from_worker(worker, msg)); + cluster.on('exit', (worker, code, signal) => this._on_worker_exit(worker, code, signal)); + this.workers = {}; + for (let i = 0; i < this.num_workers; ++i) { + const worker = cluster.fork(); + worker[STATE] = { id: worker.id, worker, ready: false }; + this.workers[worker.id] = worker; + console.log('SPEEDOMETER: Worker start', worker.id, 'pid', worker.process.pid); + } + } + } + + async _workers_ready() { + if (cluster.isWorker) { + console.log('SPEEDOMETER: Worker ready', process.pid); + process.send(MSG_READY); + } else if (this.workers) { + const is_ready = () => Object.values(this.workers).every(w => w[STATE].ready); + while (!is_ready()) { + console.log('SPEEDOMETER: Waiting for workers to be ready ...'); + await this._waitqueue.wait(); + } + console.log('SPEEDOMETER: All workers ready'); + } + } + + async _init_primary() { + if (cluster.isPrimary) { + console.log('SPEEDOMETER: Initializing primary ...'); + this.primary_info = await this.primary_init?.(); + } + } + + async _init_workers() { + if (cluster.isWorker) { + while (!this.worker_init) { + console.log('SPEEDOMETER: Waiting for primary to send init message ...'); + await this._waitqueue.wait(); + } + await this.workers_init?.(this.worker_id, this.worker_info); + console.log('SPEEDOMETER: Ackowledging init message ...'); + process.send(MSG_INIT); + } else if (this.workers) { + console.log('SPEEDOMETER: Sending init message to workers ...'); + for (const w of Object.values(this.workers)) { + /** @type {Message} */ + const msg = { speedometer: { op: OP_INIT, id: w.id, info: this.primary_info } }; + w.send(msg); + } + const is_inited = () => Object.values(this.workers).every(w => w[STATE].inited); + while (!is_inited()) { + console.log('SPEEDOMETER: Waiting for workers to be inited ...'); + await this._waitqueue.wait(); + } + console.log('SPEEDOMETER: All workers inited ...'); + } else { + // init as primary + await this.workers_init?.(this.worker_id, this.primary_info); + } + } + + async _run_workers() { + if (cluster.isWorker) { + while (!this.worker_run) { + console.log('SPEEDOMETER: Waiting for primary to send run message ...'); + await this._waitqueue.wait(); + } + this.reset(); + await this.workers_func(this.worker_id, this.worker_info); + } else if (this.workers) { + console.log('SPEEDOMETER: Sending run message to workers ...'); + this.reset(); + for (const w of Object.values(this.workers)) w.send(MSG_RUN); + } else { + // run as primary + this.reset(); + await this.workers_func(this.worker_id, this.primary_info); + } + } + + async _done_workers() { + if (cluster.isWorker) { + console.log('SPEEDOMETER: Worker done ...'); + process.send(MSG_DONE); + } else if (this.workers) { + const is_done = () => Object.values(cluster.workers).every(w => w[STATE].done); + while (!is_done()) { + console.log('SPEEDOMETER: Waiting for workers to be done ...'); + await this._waitqueue.wait(); + } + } else { + // done as primary + } + } + + /** + * @param {Worker} worker + * @param {Message} message + */ + _on_message_from_worker(worker, message) 
{ + const msg = message?.speedometer; + if (!msg) return; + // console.log(`SPEEDOMETER: on_message_from_worker ${worker.id} pid ${worker.process.pid} msg`, msg); + if (msg.op === OP_READY) { + worker[STATE].ready = true; + this._waitqueue.wakeup(); + } else if (msg.op === OP_INIT) { + worker[STATE].inited = true; + this._waitqueue.wakeup(); + } else if (msg.op === OP_UPDATE) { + this._update_bulk(msg.bulk); + } else if (msg.op === OP_DONE) { + worker[STATE].done = true; + this._waitqueue.wakeup(); + } else { + throw new Error(`SPEEDOMETER: Unknown message op ${msg.op} from worker ${worker.id} pid ${worker.process.pid}`); + } + } + + /** + * @param {Message} message + */ + _on_message_to_worker(message) { + const msg = message?.speedometer; + if (!msg) return; + // console.log('SPEEDOMETER: on_message_to_worker', msg); + if (msg.op === OP_INIT) { + this.worker_id = msg.id; + this.worker_info = msg.info; + this.worker_init = true; + this._waitqueue.wakeup(); + } else if (msg.op === OP_RUN) { + this.worker_run = true; + this._waitqueue.wakeup(); + } else { + throw new Error(`SPEEDOMETER: Unknown message op ${msg.op} received by worker`); } } + _on_signal(signal) { + if (signal === 'SIGINT') { // Ctrl-C + this.clear_interval(); + if (cluster.isPrimary) { + this._kill_workers(); + this.summary(); + } + process.exit(1); + } + } + + _kill_workers(signal = 'SIGKILL') { + if (cluster.workers) { + for (const w of Object.values(cluster.workers)) { + w.kill(signal); + } + } + } + + /** + * @param {Worker} worker + * @param {number} code + * @param {string} signal + */ + _on_worker_exit(worker, code, signal) { + worker[STATE].exit = true; + this._waitqueue.wakeup(); + if (code) { + console.error('SPEEDOMETER: Worker failed', worker.id, 'pid', worker.process.pid, 'code', code); + process.exit(1); + } + // if (!Object.keys(cluster.workers).length) { + // this.clear_interval(); + // this.summary(); + // process.exit(0); + // } + } + report(min_delay_ms) { + if (!cluster.isPrimary) return; const now = Date.now(); if (min_delay_ms && now - this.last_time < min_delay_ms) { return; } - const bytes = this.num_bytes - this.last_bytes; - const ops = this.num_ops - this.last_ops; - const sum_latency = this.sum_latency - this.last_latency; - if (cluster.isWorker) { - process.send({ - bytes, - ops, - sum_latency, - min_latency: this.min_latency, // Infinity will send as null - max_latency: this.max_latency, // Infinity will send as null - }); - } else { - const speed = bytes / - Math.max(0.001, now - this.last_time) * 1000 / 1024 / 1024; - const avg_speed = this.num_bytes / - Math.max(0.001, now - this.start_time) * 1000 / 1024 / 1024; - console.log( - this.name + ': ' + - speed.toFixed(1) + ' MB/sec' + - ' (average ' + avg_speed.toFixed(1) + ')' + - (ops ? 
(
-                    ' | OPS: ' + ops +
-                    ' min:' + this.min_latency.toFixed(1) + 'ms' +
-                    ' max:' + this.max_latency.toFixed(1) + 'ms' +
-                    ' avg:' + (sum_latency / ops).toFixed(1) + 'ms'
-                ) : '')
-            );
-        }
-        this.last_time = now;
+
+        this.num_reports += 1;
+        const report_num = this.num_reports;
+
+        const mb = this.num_bytes / 1024 / 1024;
+        const sec = (now - this.start_time) / 1000;
+        const avg_speed = mb / sec;
+        const avg_ops = this.num_ops / sec;
+        const avg_latency = this.sum_latency / this.num_ops;
+
+        const curr_mb = (this.num_bytes - this.last_bytes) / 1024 / 1024;
+        const curr_sec = (now - this.last_time) / 1000;
+        const curr_speed = curr_mb / curr_sec;
+        const curr_ops = (this.num_ops - this.last_ops) / curr_sec;
+        const curr_latency = (this.sum_latency - this.last_latency) / (this.num_ops - this.last_ops);
+
+        console.log(
+            `[${report_num}] ${this.name}: ${curr_speed.toFixed(1)} MiB/sec` +
+            (this.num_ops ? (
+                ' ' + curr_ops.toFixed(1) + ' OP/sec' +
+                ' ' + curr_latency.toFixed(1) + 'ms avg latency' +
+                ' | Total (MiB/s,OP/s,avg,min,p90,p99,max):' +
+                ' ' + avg_speed.toFixed(1) +
+                ' ' + avg_ops.toFixed(1) +
+                ' ' + avg_latency.toFixed(1) +
+                ' ' + this.min_latency.toFixed(1) +
+                ' ' + this.latency_sketch.getValueAtQuantile(0.90).toFixed(1) +
+                ' ' + this.latency_sketch.getValueAtQuantile(0.99).toFixed(1) +
+                ' ' + this.max_latency.toFixed(1)
+            ) : (
+                ' | Total (MiB/s):' + avg_speed.toFixed(1)
+            ))
+        );
+
+        this.last_time = Date.now();
         this.last_bytes = this.num_bytes;
         this.last_ops = this.num_ops;
         this.last_latency = this.sum_latency;
-        this.min_latency = -1;
-        this.max_latency = -1;
     }
+
+    summary() {
+        if (!cluster.isPrimary) return;
+        const mb = this.num_bytes / 1024 / 1024;
+        const gb = this.num_bytes / 1024 / 1024 / 1024;
+        const sec = (Date.now() - this.start_time) / 1000;
+        const avg_speed = mb / sec;
+        const avg_ops = this.num_ops / sec;
+        const avg_latency = this.sum_latency / this.num_ops;
+        console.log('| SPEED SUMMARY | -------------------');
+        console.log('| SPEED SUMMARY | Name              :', this.name);
+        console.log('| SPEED SUMMARY | Arguments         :', JSON.stringify(this.argv));
+        console.log('| SPEED SUMMARY | -------------------');
+        console.log('| SPEED SUMMARY | Total time        :', sec.toFixed(1), 'seconds');
+        console.log('| SPEED SUMMARY | Total bytes       :', gb.toFixed(1), 'GiB');
+        console.log('| SPEED SUMMARY | Total ops         :', this.num_ops);
+        console.log('| SPEED SUMMARY | -------------------');
+        console.log('| SPEED SUMMARY | Average speed     :', avg_speed.toFixed(1), 'MiB/sec');
+        console.log('| SPEED SUMMARY | Average ops       :', avg_ops.toFixed(1), 'ops/sec');
+        console.log('| SPEED SUMMARY | Average latency   :', avg_latency.toFixed(1), 'ms');
+        console.log('| SPEED SUMMARY | -------------------');
+        console.log('| SPEED SUMMARY | Min latency       :', this.min_latency.toFixed(1), 'ms');
+        console.log('| SPEED SUMMARY | Percentile 50%    :', this.latency_sketch.getValueAtQuantile(0.50).toFixed(1), 'ms');
+        console.log('| SPEED SUMMARY | Percentile 90%    :', this.latency_sketch.getValueAtQuantile(0.90).toFixed(1), 'ms');
+        console.log('| SPEED SUMMARY | Percentile 95%    :', this.latency_sketch.getValueAtQuantile(0.95).toFixed(1), 'ms');
+        console.log('| SPEED SUMMARY | Percentile 99%    :', this.latency_sketch.getValueAtQuantile(0.99).toFixed(1), 'ms');
+        console.log('| SPEED SUMMARY | Percentile 99.9%  :', this.latency_sketch.getValueAtQuantile(0.999).toFixed(1), 'ms');
+        console.log('| SPEED SUMMARY | Max latency       :', this.max_latency.toFixed(1), 'ms');
+        console.log('| SPEED SUMMARY | -------------------');
+        
console.log('| SPEED SUMMARY | Final (MiB/s,OP/s,avg,min,p90,p99,max):' + + ' ' + avg_speed.toFixed(1) + + ' ' + avg_ops.toFixed(1) + + ' ' + avg_latency.toFixed(1) + + ' ' + this.min_latency.toFixed(1) + + ' ' + this.latency_sketch.getValueAtQuantile(0.90).toFixed(1) + + ' ' + this.latency_sketch.getValueAtQuantile(0.99).toFixed(1) + + ' ' + this.max_latency.toFixed(1)); + console.log('| SPEED SUMMARY | -------------------'); + } + } module.exports = Speedometer; diff --git a/src/util/stream_utils.js b/src/util/stream_utils.js index 7cc442eae1..ed7defa372 100644 --- a/src/util/stream_utils.js +++ b/src/util/stream_utils.js @@ -33,10 +33,10 @@ const async_pipeline = util.promisify(stream.pipeline); /** * * @param {(stream.Readable | stream.Writable | stream.Duplex)[]} streams - * @param {boolean} reuse_last_stream + * @param {boolean} [reuse_last_stream] * @returns {Promise} */ -async function pipeline(streams, reuse_last_stream) { +async function pipeline(streams, reuse_last_stream = false) { if (!streams || !streams.length) throw new Error('Pipeline called without streams'); if (streams.find(strm => strm.destroyed)) { const err = new Error('Pipeline called on destroyed stream');