Skip to content

Commit ca44263

Browse files
macvincent
authored and meta-codesync[bot] committed
Support Per Stream Chunking to Relieve Memory Pressure (facebookincubator#243)
Summary: Pull Request resolved: facebookincubator#243 This is an implementation of a detail in the new chunking policy described in this [doc](https://fburl.com/gdoc/gkdwwju1). Rather than chunking all eligible streams, we chunk individual streams in the order of their raw size until memory pressure is relieved. For our unit tests, the maximum number of chunks produced is identical to the previous implementation. But there may be differences for large file sizes. This requires more experimentation and tuning to determine the right threshold value that takes advantage of this. Differential Revision: D81715655
1 parent ee8c934 commit ca44263

File tree

4 files changed

+69
-9
lines changed

4 files changed

+69
-9
lines changed

dwio/nimble/velox/VeloxWriter.cpp

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -799,7 +799,9 @@ void VeloxWriter::writeChunk(bool lastChunk) {
799799
<< ", chunk bytes: " << chunkSize;
800800
}
801801

802-
bool VeloxWriter::writeChunks(bool lastChunk) {
802+
bool VeloxWriter::writeChunks(
803+
std::span<const uint32_t> streamIndices,
804+
bool lastChunk) {
803805
uint64_t previousFlushWallTime = context_->stripeFlushTiming.wallNanos;
804806
std::atomic<uint64_t> chunkSize = 0;
805807
std::atomic<uint64_t> logicalSizeBeforeEncoding = 0;
@@ -866,15 +868,18 @@ bool VeloxWriter::writeChunks(bool lastChunk) {
866868
}
867869
};
868870

871+
const auto& streams = context_->streams();
869872
if (context_->options.encodingExecutor) {
870873
velox::dwio::common::ExecutorBarrier barrier{
871874
context_->options.encodingExecutor};
872-
for (auto& streamData : context_->streams()) {
875+
for (auto streamIndex : streamIndices) {
876+
auto& streamData = streams[streamIndex];
873877
barrier.add([&] { processStream(*streamData); });
874878
}
875879
barrier.waitAll();
876880
} else {
877-
for (auto& streamData : context_->streams()) {
881+
for (auto streamIndex : streamIndices) {
882+
auto& streamData = streams[streamIndex];
878883
processStream(*streamData);
879884
}
880885
}
@@ -911,8 +916,10 @@ bool VeloxWriter::writeStripe() {
911916
}
912917

913918
if (context_->options.enableChunking) {
914-
writeChunks(true);
915-
919+
// Chunk all streams.
920+
std::vector<uint32_t> streamIndices(context_->streams().size());
921+
std::iota(streamIndices.begin(), streamIndices.end(), 0);
922+
writeChunks(streamIndices, true);
916923
} else {
917924
writeChunk(true);
918925
}
@@ -998,8 +1005,32 @@ bool VeloxWriter::evalauateFlushPolicy() {
9981005
});
9991006
};
10001007

1001-
if (context_->options.enableChunking) {
1002-
while (shouldChunk() && writeChunks(false)) {
1008+
if (context_->options.enableChunking && shouldChunk()) {
1009+
const auto& streams = context_->streams();
1010+
const size_t streamCount = streams.size();
1011+
// Sort streams for chunking based on raw memory usage.
1012+
// TODO(T240072104): Improve performance by bucketing the streams by size
1013+
// (most significant bit) instead of sorting.
1014+
std::vector<uint32_t> streamIndices(streamCount);
1015+
std::iota(streamIndices.begin(), streamIndices.end(), 0);
1016+
std::sort(
1017+
streamIndices.begin(),
1018+
streamIndices.end(),
1019+
[&](const uint32_t& a, const uint32_t& b) {
1020+
return streams[a]->memoryUsed() > streams[b]->memoryUsed();
1021+
});
1022+
1023+
// Chunk streams in batches.
1024+
const auto batchSize = context_->options.chunkedStreamBatchSize;
1025+
for (size_t index = 0; index < streamCount; index += batchSize) {
1026+
const size_t currentBatchSize = std::min(batchSize, streamCount - index);
1027+
std::span<const uint32_t> batchIndices(
1028+
streamIndices.begin() + index, currentBatchSize);
1029+
// Stop attempting chunking once streams are too small to chunk or
1030+
// memory pressure is relieved.
1031+
if (!(writeChunks(batchIndices, false) && shouldChunk())) {
1032+
break;
1033+
}
10031034
}
10041035
}
10051036

dwio/nimble/velox/VeloxWriter.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,9 @@ class VeloxWriter {
9090
bool writeStripe();
9191
void writeChunk(bool lastChunk = true);
9292
// Returns 'true' if chunks were written.
93-
bool writeChunks(bool lastChunk = true);
93+
bool writeChunks(
94+
std::span<const uint32_t> streamIndices,
95+
bool lastChunk = true);
9496
};
9597

9698
} // namespace facebook::nimble

dwio/nimble/velox/VeloxWriterOptions.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,10 @@ struct VeloxWriterOptions {
9696
// Note: this threshold is ignored when it is time to flush a stripe.
9797
uint64_t minStreamChunkRawSize = 1024;
9898

99+
// Number of streams to try chunking between memory pressure evaluations.
100+
// Note: this is ignored when it is time to flush a stripe.
101+
size_t chunkedStreamBatchSize = 1024;
102+
99103
// The factory function that produces the root encoding selection policy.
100104
// Encoding selection policy is the way to balance the tradeoffs of
101105
// different performance factors (at both read and write times). Heuristics

dwio/nimble/velox/tests/VeloxWriterTests.cpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1947,6 +1947,7 @@ struct ChunkFlushPolicyTestCase {
19471947
const uint32_t expectedStripeCount{0};
19481948
const uint32_t expectedMaxChunkCount{0};
19491949
const uint32_t expectedMinChunkCount{0};
1950+
const uint32_t chunkedStreamBatchSize{2};
19501951
};
19511952

19521953
class ChunkFlushPolicyTest
@@ -1958,6 +1959,7 @@ TEST_P(ChunkFlushPolicyTest, ChunkFlushPolicyIntegration) {
19581959
{{"BIGINT", velox::BIGINT()}, {"SMALLINT", velox::SMALLINT()}});
19591960
nimble::VeloxWriterOptions writerOptions{
19601961
.minStreamChunkRawSize = GetParam().minStreamChunkRawSize,
1962+
.chunkedStreamBatchSize = GetParam().chunkedStreamBatchSize,
19611963
.flushPolicyFactory = GetParam().enableChunking
19621964
? []() -> std::unique_ptr<nimble::FlushPolicy> {
19631965
return std::make_unique<nimble::ChunkFlushPolicy>(
@@ -2075,6 +2077,7 @@ INSTANTIATE_TEST_CASE_P(
20752077
.expectedStripeCount = 4,
20762078
.expectedMaxChunkCount = 1,
20772079
.expectedMinChunkCount = 1,
2080+
.chunkedStreamBatchSize = 2,
20782081
},
20792082
// Baseline with default settings (has chunking)
20802083
ChunkFlushPolicyTestCase{
@@ -2088,6 +2091,7 @@ INSTANTIATE_TEST_CASE_P(
20882091
.expectedStripeCount = 7,
20892092
.expectedMaxChunkCount = 2,
20902093
.expectedMinChunkCount = 1,
2094+
.chunkedStreamBatchSize = 2,
20912095
},
20922096
// High memory regression threshold and no compression
20932097
// Produces file identical to RawStripeSizeFlushPolicy
@@ -2102,6 +2106,7 @@ INSTANTIATE_TEST_CASE_P(
21022106
.expectedStripeCount = 4,
21032107
.expectedMaxChunkCount = 1,
21042108
.expectedMinChunkCount = 1,
2109+
.chunkedStreamBatchSize = 2,
21052110
},
21062111
// Low memory regression threshold
21072112
// Produces file with more min chunks per stripe
@@ -2116,6 +2121,7 @@ INSTANTIATE_TEST_CASE_P(
21162121
.expectedStripeCount = 10,
21172122
.expectedMaxChunkCount = 2,
21182123
.expectedMinChunkCount = 2, // +1 chunk
2124+
.chunkedStreamBatchSize = 2,
21192125
},
21202126
// High target stripe size bytes (with disabled memory pressure
21212127
// optimization) produces fewer stripes. Single chunks.
@@ -2130,6 +2136,8 @@ INSTANTIATE_TEST_CASE_P(
21302136
.expectedStripeCount = 1, // -3 stripes
21312137
.expectedMaxChunkCount = 1,
21322138
.expectedMinChunkCount = 1,
2139+
.chunkedStreamBatchSize = 2,
2140+
21332141
},
21342142
// Low target stripe size bytes (with disabled memory pressure
21352143
// optimization) produces more stripes. Single chunks.
@@ -2144,5 +2152,20 @@ INSTANTIATE_TEST_CASE_P(
21442152
.expectedStripeCount = 7, // +6 stripes
21452153
.expectedMaxChunkCount = 1,
21462154
.expectedMinChunkCount = 1,
2147-
}));
2155+
.chunkedStreamBatchSize = 2,
2156+
2157+
},
2158+
// Higher chunked stream batch size (no change in policy)
2159+
ChunkFlushPolicyTestCase{
2160+
.batchCount = 20,
2161+
.enableChunking = true,
2162+
.targetStripeSizeBytes = 250 << 10, // 250KB
2163+
.writerMemoryHighThresholdBytes = 80 << 10,
2164+
.writerMemoryLowThresholdBytes = 75 << 10,
2165+
.estimatedCompressionFactor = 1.0,
2166+
.minStreamChunkRawSize = 100,
2167+
.expectedStripeCount = 7,
2168+
.expectedMaxChunkCount = 2,
2169+
.expectedMinChunkCount = 1,
2170+
.chunkedStreamBatchSize = 10}));
21482171
} // namespace facebook

0 commit comments

Comments
 (0)