Commit 91dbed7

macvincent authored and facebook-github-bot committed
Support Per Stream Chunking to Relieve Memory Pressure (facebookincubator#243)
Summary: This implements a detail of the new chunking policy described in this [doc](https://fburl.com/gdoc/gkdwwju1). Rather than chunking all eligible streams at once, we chunk individual streams in descending order of their raw size until memory pressure is relieved. In our unit tests, the maximum number of chunks produced is identical to the previous implementation, but results may differ for large files; more experimentation and tuning are needed to find a threshold value that takes full advantage of this. Differential Revision: D81715655
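In outline, the new policy sorts streams by buffered raw size and chunks them greedily, a batch at a time, until the flush policy reports that memory pressure is relieved. The sketch below is an illustration only: Stream, memoryPressureRelieved, and chunkStreams are hypothetical stand-ins for Nimble's actual stream data, flush-policy check, and chunk writer, not APIs from this commit.

#include <algorithm>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

struct Stream {
  uint64_t rawSize; // bytes currently buffered for this stream (illustrative)
};

// Chunk the largest streams first, one batch at a time, stopping as soon as
// memory pressure is relieved or the remaining streams are too small to chunk.
void chunkUntilRelieved(
    const std::vector<Stream>& streams,
    uint32_t batchSize,
    const std::function<bool()>& memoryPressureRelieved,
    const std::function<bool(const std::vector<uint32_t>&)>& chunkStreams) {
  // Order stream indices by descending raw size.
  std::vector<uint32_t> order(streams.size());
  std::iota(order.begin(), order.end(), 0);
  std::sort(order.begin(), order.end(), [&](uint32_t a, uint32_t b) {
    return streams[a].rawSize > streams[b].rawSize;
  });
  for (size_t i = 0; i < order.size() && !memoryPressureRelieved();
       i += batchSize) {
    size_t end = std::min(order.size(), i + batchSize);
    std::vector<uint32_t> batch(order.begin() + i, order.begin() + end);
    if (!chunkStreams(batch)) {
      break; // nothing left that is large enough to chunk
    }
  }
}

The actual implementation in the diff below works the same way, using shouldChunk() as the pressure check and writeChunk(false, streamIndicesToChunk) as the chunk writer.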
1 parent 660170e commit 91dbed7

File tree

4 files changed (+78, -11 lines)

dwio/nimble/velox/VeloxWriter.cpp

Lines changed: 46 additions & 9 deletions
@@ -641,7 +641,9 @@ void VeloxWriter::flush() {
   }
 }
 
-bool VeloxWriter::writeChunk(bool lastChunk) {
+bool VeloxWriter::writeChunk(
+    bool lastChunk,
+    const std::unordered_set<uint32_t>& streamIndices) {
   uint64_t previousFlushWallTime = context_->stripeFlushTiming.wallNanos;
   std::atomic<uint64_t> chunkSize = 0;
   std::atomic<uint64_t> logicalSizeBeforeEncoding = 0;
@@ -746,9 +748,11 @@ bool VeloxWriter::writeChunk(bool lastChunk) {
     velox::dwio::common::ExecutorBarrier barrier{
         context_->options.encodingExecutor};
     for (auto& streamData : context_->streams()) {
-      auto& streamSize =
-          context_->columnStats[streamData->descriptor().offset()]
-              .physicalSize;
+      auto offset = streamData->descriptor().offset();
+      auto& streamSize = context_->columnStats[offset].physicalSize;
+      if (!streamIndices.empty() && !streamIndices.contains(offset)) {
+        continue;
+      }
       processStream(
           *streamData, [&](StreamData& innerStreamData, bool isNullStream) {
             barrier.add(
@@ -765,9 +769,11 @@ bool VeloxWriter::writeChunk(bool lastChunk) {
     barrier.waitAll();
   } else {
     for (auto& streamData : context_->streams()) {
-      auto& streamSize =
-          context_->columnStats[streamData->descriptor().offset()]
-              .physicalSize;
+      auto offset = streamData->descriptor().offset();
+      auto& streamSize = context_->columnStats[offset].physicalSize;
+      if (!streamIndices.empty() && !streamIndices.contains(offset)) {
+        continue;
+      }
       processStream(
           *streamData,
           [&encode, &streamSize](
@@ -871,8 +877,39 @@ bool VeloxWriter::tryWriteStripe(bool force) {
 
   try {
     // TODO: we can improve merge the last chunk write with stripe
-    if (context_->options.enableChunking) {
-      while (shouldChunk() == ChunkDecision::Chunk && writeChunk(false)) {
+    if (context_->options.enableChunking &&
+        shouldChunk() == ChunkDecision::Chunk) {
+      const auto& streams = context_->streams();
+      // Sort streams for chunking based on raw memory usage.
+      std::vector<uint32_t> streamIndices(streams.size());
+      std::iota(streamIndices.begin(), streamIndices.end(), 0);
+      std::sort(
+          streamIndices.begin(),
+          streamIndices.end(),
+          [&](const uint32_t& a, const uint32_t& b) {
+            return streams[a]->memoryUsed() > streams[b]->memoryUsed();
+          });
+
+      // Chunk streams in batches.
+      uint32_t currentIndex = 0;
+      ChunkDecision decision = ChunkDecision::Chunk;
+      NIMBLE_DASSERT(
+          context_->options.chunkedStreamBatchSize > 0,
+          "chunkedStreamBatchSize must be greater than 0");
+      while (currentIndex < streams.size() &&
+             decision == ChunkDecision::Chunk) {
+        uint32_t endStreamIndex = std::min(
+            static_cast<uint32_t>(streams.size()),
+            currentIndex + context_->options.chunkedStreamBatchSize);
+        std::unordered_set<uint32_t> streamIndicesToChunk(
+            streamIndices.begin() + currentIndex,
+            streamIndices.begin() + endStreamIndex);
+        currentIndex = endStreamIndex;
+        // Stop attempting chunking once streams are too small to chunk.
+        if (!writeChunk(false, streamIndicesToChunk)) {
+          break;
+        }
+        decision = shouldChunk();
       }
     }
 

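For a concrete feel of the batch windows the loop above produces, here is a small standalone trace. The stream count of 10 and batch size of 4 are arbitrary example values, not values from this commit.

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  const uint32_t streamCount = 10; // hypothetical number of streams
  const uint32_t batchSize = 4;    // stands in for chunkedStreamBatchSize
  uint32_t currentIndex = 0;
  while (currentIndex < streamCount) {
    uint32_t endStreamIndex = std::min(streamCount, currentIndex + batchSize);
    std::cout << "chunk streams [" << currentIndex << ", " << endStreamIndex
              << ")\n";
    currentIndex = endStreamIndex;
  }
  // Prints [0, 4), [4, 8), [8, 10): the last window is clipped, exactly like
  // the std::min(streams.size(), currentIndex + batchSize) bound above.
  return 0;
}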
dwio/nimble/velox/VeloxWriter.h

Lines changed: 3 additions & 1 deletion
@@ -87,7 +87,9 @@ class VeloxWriter {
   // Returning 'true' if stripe was written.
   bool tryWriteStripe(bool force = false);
   // Returns 'true' if chunk was written.
-  bool writeChunk(bool lastChunk = true);
+  bool writeChunk(
+      bool lastChunk = true,
+      const std::unordered_set<uint32_t>& streamIndices = {});
   uint32_t writeStripe();
 };
 

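The defaulted streamIndices parameter keeps the old call sites working: an empty set means "chunk every stream", per the guard added in writeChunk. A minimal standalone check of that predicate follows; the selected helper is illustrative only, and the snippet needs C++20 for unordered_set::contains.

#include <cstdint>
#include <iostream>
#include <unordered_set>

// Mirrors the guard added in writeChunk: an empty set selects all streams.
bool selected(
    const std::unordered_set<uint32_t>& streamIndices, uint32_t offset) {
  return streamIndices.empty() || streamIndices.contains(offset);
}

int main() {
  std::unordered_set<uint32_t> none; // default-argument case
  std::unordered_set<uint32_t> some{3, 7};
  std::cout << selected(none, 5) << '\n'; // 1: all streams pass
  std::cout << selected(some, 5) << '\n'; // 0: stream 5 is skipped
  std::cout << selected(some, 7) << '\n'; // 1: stream 7 is chunked
  return 0;
}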
dwio/nimble/velox/VeloxWriterOptions.h

Lines changed: 4 additions & 0 deletions
@@ -96,6 +96,10 @@ struct VeloxWriterOptions {
   // Note: this threshold is ignored when it is time to flush a stripe.
   uint64_t minStreamChunkRawSize = 1024;
 
+  // Number of streams to process in parallel during chunked encoding.
+  // Note: this is ignored when it is time to flush a stripe.
+  uint32_t chunkedStreamBatchSize = 1024;
+
   // The factory function that produces the root encoding selection policy.
   // Encoding selection policy is the way to balance the tradeoffs of
   // different performance factors (at both read and write times). Heuristics

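As a usage sketch, the new option slots in next to minStreamChunkRawSize. The values below are arbitrary examples, and the include path and namespace are assumptions based on the file paths in this diff rather than part of the change itself.

#include "dwio/nimble/velox/VeloxWriterOptions.h" // assumed include path

using facebook::nimble::VeloxWriterOptions;

VeloxWriterOptions makeChunkingOptions() {
  return VeloxWriterOptions{
      .minStreamChunkRawSize = 1024, // skip streams below 1KB of raw data
      .chunkedStreamBatchSize = 512, // attempt chunking 512 streams at a time
  };
}

Because designated initializers must follow declaration order, chunkedStreamBatchSize is set after minStreamChunkRawSize, matching the header above.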
dwio/nimble/velox/tests/VeloxWriterTests.cpp

Lines changed: 25 additions & 1 deletion
@@ -1965,6 +1965,7 @@ struct ChunkFlushPolicyTestCase {
   const uint32_t expectedStripeCount{0};
   const uint32_t expectedMaxChunkCount{0};
   const uint32_t expectedMinChunkCount{0};
+  const uint32_t chunkedStreamBatchSize{2};
 };
 
 class ChunkFlushPolicyTest
@@ -1976,6 +1977,7 @@ TEST_P(ChunkFlushPolicyTest, ChunkFlushPolicyIntegration) {
       {{"BIGINT", velox::BIGINT()}, {"SMALLINT", velox::SMALLINT()}});
   nimble::VeloxWriterOptions writerOptions{
       .minStreamChunkRawSize = GetParam().minStreamChunkRawSize,
+      .chunkedStreamBatchSize = GetParam().chunkedStreamBatchSize,
       .flushPolicyFactory = GetParam().enableChunking
           ? []() -> std::unique_ptr<nimble::FlushPolicy> {
               return std::make_unique<nimble::ChunkFlushPolicy>(
@@ -2098,6 +2100,7 @@ INSTANTIATE_TEST_CASE_P(
         .expectedStripeCount = 4,
         .expectedMaxChunkCount = 1,
         .expectedMinChunkCount = 1,
+        .chunkedStreamBatchSize = 10,
     },
     // Base case with default settings (has chunking)
     ChunkFlushPolicyTestCase{
@@ -2111,6 +2114,7 @@ INSTANTIATE_TEST_CASE_P(
         .expectedStripeCount = 3,
         .expectedMaxChunkCount = 7,
         .expectedMinChunkCount = 3,
+        .chunkedStreamBatchSize = 10,
     },
     // High memory regression threshold
     // Produces file identical to RawStripeSizeFlushPolicy
@@ -2125,6 +2129,7 @@ INSTANTIATE_TEST_CASE_P(
         .expectedStripeCount = 4,
         .expectedMaxChunkCount = 1,
         .expectedMinChunkCount = 1,
+        .chunkedStreamBatchSize = 10,
     },
     // Low memory regression threshold
     // Produces file with more chunks per stripe
@@ -2139,6 +2144,7 @@ INSTANTIATE_TEST_CASE_P(
         .expectedStripeCount = 3,
         .expectedMaxChunkCount = 8,
         .expectedMinChunkCount = 4,
+        .chunkedStreamBatchSize = 10,
     },
     // High target stripe size bytes (with disabled memory pressure
     // optimization) produces fewer stripes. Single chunks.
@@ -2153,6 +2159,8 @@ INSTANTIATE_TEST_CASE_P(
         .expectedStripeCount = 1, // -2 stripes
         .expectedMaxChunkCount = 1,
         .expectedMinChunkCount = 1,
+        .chunkedStreamBatchSize = 10,
+
     },
     // Low target stripe size bytes (with disabled memory pressure
     // optimization) produces more stripes. Single chunks.
@@ -2167,5 +2175,21 @@ INSTANTIATE_TEST_CASE_P(
         .expectedStripeCount = 7, // +6 stripes
         .expectedMaxChunkCount = 1,
         .expectedMinChunkCount = 1,
-    }));
+        .chunkedStreamBatchSize = 10,
+
+    },
+    // Higher chunked stream batch size (no change in policy)
+    ChunkFlushPolicyTestCase{
+        .batchCount = 20,
+        .enableChunking = true,
+        .targetStripeSizeBytes = 250 << 10, // 250KB
+        .writerMemoryHighThreshold = 80 << 10,
+        .writerMemoryLowThreshold = 75 << 10,
+        .compressionRatioFactor = 1.0,
+        .minStreamChunkRawSize = 100,
+        .expectedStripeCount = 3,
+        .expectedMaxChunkCount = 7,
+        .expectedMinChunkCount = 3,
+        .chunkedStreamBatchSize = 3} // +1
+    ));
 } // namespace facebook
