Skip to content

Commit 16d0401

Browse files
macvincent authored and facebook-github-bot committed
Support Per Stream Chunking to Relieve Memory Pressure (facebookincubator#243)
Summary: This is an implementation of a detail in the new chunking policy described in this [doc](https://fburl.com/gdoc/gkdwwju1). Rather than chunking all eligible streams, we chunk individual streams in order of their raw size until memory pressure is relieved. For our unit tests, the maximum number of chunks produced is identical to that of the previous implementation, but there may be differences for large file sizes. More experimentation and tuning are required to determine the right threshold value that takes full advantage of this.

Reviewed By: helfman

Differential Revision: D81715655
1 parent 684bcc1 commit 16d0401

File tree

4 files changed

+151
-10
lines changed

4 files changed

+151
-10
lines changed

dwio/nimble/velox/VeloxWriter.cpp

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -806,7 +806,9 @@ void VeloxWriter::writeChunk(bool lastChunk) {
806806
<< ", chunk bytes: " << chunkSize;
807807
}
808808

809-
bool VeloxWriter::writeChunks(bool lastChunk) {
809+
bool VeloxWriter::writeChunks(
810+
std::span<const uint32_t> streamIndices,
811+
bool lastChunk) {
810812
uint64_t previousFlushWallTime = context_->stripeFlushTiming.wallNanos;
811813
std::atomic<uint64_t> chunkSize = 0;
812814
std::atomic<uint64_t> logicalSizeBeforeEncoding = 0;
@@ -873,15 +875,18 @@ bool VeloxWriter::writeChunks(bool lastChunk) {
873875
}
874876
};
875877

878+
const auto& streams = context_->streams();
876879
if (context_->options.encodingExecutor) {
877880
velox::dwio::common::ExecutorBarrier barrier{
878881
context_->options.encodingExecutor};
879-
for (auto& streamData : context_->streams()) {
882+
for (auto streamIndex : streamIndices) {
883+
auto& streamData = streams[streamIndex];
880884
barrier.add([&] { processStream(*streamData); });
881885
}
882886
barrier.waitAll();
883887
} else {
884-
for (auto& streamData : context_->streams()) {
888+
for (auto streamIndex : streamIndices) {
889+
auto& streamData = streams[streamIndex];
885890
processStream(*streamData);
886891
}
887892
}
@@ -910,8 +915,10 @@ bool VeloxWriter::writeStripe() {
910915
}
911916

912917
if (context_->options.enableChunking) {
913-
writeChunks(true);
914-
918+
// Chunk all streams.
919+
std::vector<uint32_t> streamIndices(context_->streams().size());
920+
std::iota(streamIndices.begin(), streamIndices.end(), 0);
921+
writeChunks(streamIndices, true);
915922
} else {
916923
writeChunk(true);
917924
}
@@ -989,8 +996,32 @@ bool VeloxWriter::evalauateFlushPolicy() {
989996
});
990997
};
991998

992-
if (context_->options.enableChunking) {
993-
while (shouldChunk() && writeChunks(false)) {
999+
if (context_->options.enableChunking && shouldChunk()) {
1000+
const auto& streams = context_->streams();
1001+
const size_t streamCount = streams.size();
1002+
// Sort streams for chunking based on raw memory usage.
1003+
// TODO(T240072104): Improve performance by bucketing the streams by size
1004+
// (most significant bit) instead of sorting.
1005+
std::vector<uint32_t> streamIndices(streamCount);
1006+
std::iota(streamIndices.begin(), streamIndices.end(), 0);
1007+
std::sort(
1008+
streamIndices.begin(),
1009+
streamIndices.end(),
1010+
[&](const uint32_t& a, const uint32_t& b) {
1011+
return streams[a]->memoryUsed() > streams[b]->memoryUsed();
1012+
});
1013+
1014+
// Chunk streams in batches.
1015+
const auto batchSize = context_->options.chunkedStreamBatchSize;
1016+
for (size_t index = 0; index < streamCount; index += batchSize) {
1017+
const size_t currentBatchSize = std::min(batchSize, streamCount - index);
1018+
std::span<const uint32_t> batchIndices(
1019+
streamIndices.begin() + index, currentBatchSize);
1020+
// Stop attempting chunking once streams are too small to chunk or
1021+
// memory pressure is relieved.
1022+
if (!(writeChunks(batchIndices, false) && shouldChunk())) {
1023+
break;
1024+
}
9941025
}
9951026
}
9961027

dwio/nimble/velox/VeloxWriter.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,9 @@ class VeloxWriter {
9090
bool writeStripe();
9191
void writeChunk(bool lastChunk = true);
9292
// Returns 'true' if chunks were written.
93-
bool writeChunks(bool lastChunk = true);
93+
bool writeChunks(
94+
std::span<const uint32_t> streamIndices,
95+
bool lastChunk = true);
9496
};
9597

9698
} // namespace facebook::nimble

dwio/nimble/velox/VeloxWriterOptions.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,10 @@ struct VeloxWriterOptions {
9696
// Note: this threshold is ignored when it is time to flush a stripe.
9797
uint64_t minStreamChunkRawSize = 1024;
9898

99+
// Number of streams to try chunking between memory pressure evaluations.
100+
// Note: this is ignored when it is time to flush a stripe.
101+
size_t chunkedStreamBatchSize = 1024;
102+
99103
// The factory function that produces the root encoding selection policy.
100104
// Encoding selection policy is the way to balance the tradeoffs of
101105
// different performance factors (at both read and write times). Heuristics

dwio/nimble/velox/tests/VeloxWriterTests.cpp

Lines changed: 106 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2020,6 +2020,7 @@ struct ChunkFlushPolicyTestCase {
20202020
const uint32_t expectedStripeCount{0};
20212021
const uint32_t expectedMaxChunkCount{0};
20222022
const uint32_t expectedMinChunkCount{0};
2023+
const uint32_t chunkedStreamBatchSize{2};
20232024
};
20242025

20252026
class ChunkFlushPolicyTest
@@ -2031,6 +2032,7 @@ TEST_P(ChunkFlushPolicyTest, ChunkFlushPolicyIntegration) {
20312032
{{"BIGINT", velox::BIGINT()}, {"SMALLINT", velox::SMALLINT()}});
20322033
nimble::VeloxWriterOptions writerOptions{
20332034
.minStreamChunkRawSize = GetParam().minStreamChunkRawSize,
2035+
.chunkedStreamBatchSize = GetParam().chunkedStreamBatchSize,
20342036
.flushPolicyFactory = GetParam().enableChunking
20352037
? []() -> std::unique_ptr<nimble::FlushPolicy> {
20362038
return std::make_unique<nimble::ChunkFlushPolicy>(
@@ -2168,6 +2170,87 @@ TEST_F(VeloxWriterTests, FuzzComplex) {
21682170
}
21692171
}
21702172

2173+
TEST_F(VeloxWriterTests, BatchedChunkingRelievesMemoryPressure) {
2174+
// Verify we stop chunking early when chunking relieves memory pressure.
2175+
const uint32_t seed = FLAGS_writer_tests_seed > 0 ? FLAGS_writer_tests_seed
2176+
: folly::Random::rand32();
2177+
LOG(INFO) << "seed: " << seed;
2178+
std::mt19937 rng{seed};
2179+
const uint32_t rowCount =
2180+
std::uniform_int_distribution<uint32_t>(1, 4096)(rng);
2181+
2182+
velox::VectorFuzzer fuzzer({.vectorSize = rowCount}, leafPool_.get(), seed);
2183+
const auto stringColumn = fuzzer.fuzzFlat(velox::VARCHAR());
2184+
const auto intColumn = fuzzer.fuzzFlat(velox::INTEGER());
2185+
2186+
nimble::RawSizeContext context;
2187+
nimble::OrderedRanges ranges;
2188+
ranges.add(0, rowCount);
2189+
const uint64_t stringColumnRawSize =
2190+
nimble::getRawSizeFromVector(stringColumn, ranges, context) +
2191+
sizeof(std::string_view) * rowCount;
2192+
const uint64_t intColumnRawSize =
2193+
nimble::getRawSizeFromVector(intColumn, ranges, context);
2194+
2195+
constexpr size_t kColumnCount = 20;
2196+
constexpr size_t kBatchSize = 4;
2197+
std::vector<velox::VectorPtr> children(kColumnCount);
2198+
std::vector<std::string> columnNames(kColumnCount);
2199+
uint64_t totalRawSize = 0;
2200+
for (size_t i = 0; i < kColumnCount; i += 2) {
2201+
columnNames[i] = fmt::format("string_column_{}", i);
2202+
columnNames[i + 1] = fmt::format("int_column_{}", i);
2203+
children[i] = stringColumn;
2204+
children[i + 1] = intColumn;
2205+
totalRawSize += intColumnRawSize + stringColumnRawSize;
2206+
}
2207+
2208+
velox::test::VectorMaker vectorMaker{leafPool_.get()};
2209+
const auto rowVector = vectorMaker.rowVector(columnNames, children);
2210+
2211+
// We will return true twice and false once
2212+
const std::vector<bool> expectedChunkingDecisions{true, true, false};
2213+
std::vector<bool> actualChunkingDecisions;
2214+
2215+
// We will be chunking the large streams in the first two batches. 8 string
2216+
// streams in total. We set the expected rawSize after chunking these two
2217+
// batches as our memory threshold.
2218+
const uint64_t memoryPressureThreshold =
2219+
totalRawSize - (2 * kBatchSize * stringColumnRawSize);
2220+
2221+
nimble::VeloxWriterOptions writerOptions;
2222+
writerOptions.chunkedStreamBatchSize = kBatchSize;
2223+
writerOptions.enableChunking = true;
2224+
writerOptions.minStreamChunkRawSize = intColumnRawSize / 2;
2225+
writerOptions.flushPolicyFactory =
2226+
[&]() -> std::unique_ptr<nimble::FlushPolicy> {
2227+
return std::make_unique<nimble::LambdaFlushPolicy>(
2228+
/* shouldFlush */ [](const auto&) { return true; },
2229+
/* shouldChunk */
2230+
[&](const nimble::StripeProgress& stripeProgress) {
2231+
const bool shouldChunk =
2232+
stripeProgress.stripeRawSize > memoryPressureThreshold;
2233+
actualChunkingDecisions.push_back(shouldChunk);
2234+
return shouldChunk;
2235+
});
2236+
};
2237+
2238+
std::string file;
2239+
auto writeFile = std::make_unique<velox::InMemoryWriteFile>(&file);
2240+
nimble::VeloxWriter writer(
2241+
*rootPool_, rowVector->type(), std::move(writeFile), writerOptions);
2242+
writer.write(rowVector);
2243+
writer.close();
2244+
2245+
EXPECT_THAT(
2246+
actualChunkingDecisions,
2247+
::testing::ElementsAreArray(expectedChunkingDecisions));
2248+
2249+
velox::InMemoryReadFile readFile(file);
2250+
nimble::VeloxReader reader(*leafPool_, &readFile);
2251+
validateChunkSize(reader, writerOptions.minStreamChunkRawSize);
2252+
}
2253+
21712254
INSTANTIATE_TEST_CASE_P(
21722255
StripeRawSizeFlushPolicyTestSuite,
21732256
StripeRawSizeFlushPolicyTest,
@@ -2209,6 +2292,7 @@ INSTANTIATE_TEST_CASE_P(
22092292
.expectedStripeCount = 4,
22102293
.expectedMaxChunkCount = 1,
22112294
.expectedMinChunkCount = 1,
2295+
.chunkedStreamBatchSize = 2,
22122296
},
22132297
// Baseline with default settings (has chunking)
22142298
ChunkFlushPolicyTestCase{
@@ -2222,6 +2306,7 @@ INSTANTIATE_TEST_CASE_P(
22222306
.expectedStripeCount = 7,
22232307
.expectedMaxChunkCount = 2,
22242308
.expectedMinChunkCount = 1,
2309+
.chunkedStreamBatchSize = 2,
22252310
},
22262311
// High memory regression threshold and no compression
22272312
// Produces file identical to RawStripeSizeFlushPolicy
@@ -2238,6 +2323,7 @@ INSTANTIATE_TEST_CASE_P(
22382323
.expectedStripeCount = 4,
22392324
.expectedMaxChunkCount = 1,
22402325
.expectedMinChunkCount = 1,
2326+
.chunkedStreamBatchSize = 2,
22412327
},
22422328
// Low memory regression threshold
22432329
// Produces file with more min chunks per stripe
@@ -2253,7 +2339,8 @@ INSTANTIATE_TEST_CASE_P(
22532339
.minStreamChunkRawSize = 100,
22542340
.expectedStripeCount = 10,
22552341
.expectedMaxChunkCount = 2,
2256-
.expectedMinChunkCount = 2,
2342+
.expectedMinChunkCount = 2, // +1 chunk
2343+
.chunkedStreamBatchSize = 2,
22572344
},
22582345
// High target stripe size bytes (with disabled memory pressure
22592346
// optimization) produces fewer stripes. Single chunks.
@@ -2271,6 +2358,8 @@ INSTANTIATE_TEST_CASE_P(
22712358
.expectedStripeCount = 1,
22722359
.expectedMaxChunkCount = 1,
22732360
.expectedMinChunkCount = 1,
2361+
.chunkedStreamBatchSize = 2,
2362+
22742363
},
22752364
// Low target stripe size bytes (with disabled memory pressure
22762365
// optimization) produces more stripes. Single chunks.
@@ -2288,5 +2377,20 @@ INSTANTIATE_TEST_CASE_P(
22882377
.expectedStripeCount = 7,
22892378
.expectedMaxChunkCount = 1,
22902379
.expectedMinChunkCount = 1,
2291-
}));
2380+
.chunkedStreamBatchSize = 2,
2381+
2382+
},
2383+
// Higher chunked stream batch size (no change in policy)
2384+
ChunkFlushPolicyTestCase{
2385+
.batchCount = 20,
2386+
.enableChunking = true,
2387+
.targetStripeSizeBytes = 250 << 10, // 250KB
2388+
.writerMemoryHighThresholdBytes = 80 << 10,
2389+
.writerMemoryLowThresholdBytes = 75 << 10,
2390+
.estimatedCompressionFactor = 1.0,
2391+
.minStreamChunkRawSize = 100,
2392+
.expectedStripeCount = 7,
2393+
.expectedMaxChunkCount = 2,
2394+
.expectedMinChunkCount = 1,
2395+
.chunkedStreamBatchSize = 10}));
22922396
} // namespace facebook

0 commit comments

Comments
 (0)