
Commit 6926070

macvincent authored and meta-codesync[bot] committed
Integrate Max Stream Size Chunking in Velox Writer (#249)
Summary: Pull Request resolved: #249

This is the last feature of the new chunking policy described in this [doc](https://fburl.com/gdoc/gkdwwju1). Here, we break down large streams into multiple chunks of size up to `maxStreamChunkRawSize`. This protects the reader from attempting to materialize huge chunks. We added StreamData support for this in the previous diff; in this diff, we integrate it with the VeloxWriter.

With this change, when memory pressure is detected, we:

1. Chunk large streams above `maxStreamChunkRawSize`, retaining stream data below the limit.
2. If there is still memory pressure after the first step, chunk streams with size above `minStreamChunkRawSize`.

During stripe flush, we chunk all remaining data, breaking down streams above `maxStreamChunkRawSize` into smaller chunks.

---

The general chunking policy has two phases:

## **Phase 1 - Memory Pressure Management (shouldChunk)**

The policy monitors total in-memory data size:

- When memory usage exceeds the maximum threshold, it initiates chunking to reduce the memory footprint while continuing data ingestion.
- When previous chunking attempts succeeded and memory remains above the minimum threshold, it continues chunking to further reduce memory usage.
- When chunking fails to reduce memory usage effectively and memory stays above the minimum threshold, it forces a full stripe flush to guarantee memory relief.

## **Phase 2 - Storage Size Optimization (shouldFlush)**

Implements compression-aware stripe size prediction:

- Calculates the anticipated final compressed stripe size by applying the estimated compression ratio to the unencoded data.
- Triggers a stripe flush when the predicted compressed size reaches the target stripe size threshold.

Differential Revision: D82175496
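Illustrative note (not part of the commit): to make the two phases concrete, here is a minimal standalone sketch. All names (`Decision`, `PolicyConfig`, `WriterState`, and the free functions `shouldChunk`/`shouldFlush`) are hypothetical stand-ins rather than the Nimble API; only the threshold semantics follow the description above.

```cpp
// Minimal sketch of the two-phase chunk/flush policy. Illustrative only.
#include <cstdint>

enum class Decision { kNone, kChunk, kFlushStripe };

struct PolicyConfig {
  uint64_t memoryHighThresholdBytes; // start chunking above this
  uint64_t memoryLowThresholdBytes;  // keep chunking while above this
  uint64_t targetStripeSizeBytes;    // flush when predicted size reaches this
  double estimatedCompressionFactor; // e.g. 1.3 means ~23% size reduction
};

struct WriterState {
  uint64_t rawMemoryBytes;   // total in-memory stream data
  uint64_t rawStripeBytes;   // unencoded bytes accumulated for the stripe
  bool lastChunkFreedMemory; // did the previous chunking pass help?
};

// Phase 1 - Memory Pressure Management (shouldChunk).
Decision shouldChunk(const WriterState& s, const PolicyConfig& c) {
  if (s.rawMemoryBytes > c.memoryHighThresholdBytes) {
    return Decision::kChunk; // reduce footprint, keep ingesting
  }
  if (s.rawMemoryBytes > c.memoryLowThresholdBytes) {
    // Keep chunking while it helps; otherwise force a full stripe flush
    // to guarantee memory relief.
    return s.lastChunkFreedMemory ? Decision::kChunk : Decision::kFlushStripe;
  }
  return Decision::kNone;
}

// Phase 2 - Storage Size Optimization (shouldFlush).
bool shouldFlush(const WriterState& s, const PolicyConfig& c) {
  // Predict the final compressed stripe size from unencoded data.
  const double predicted = s.rawStripeBytes / c.estimatedCompressionFactor;
  return predicted >= c.targetStripeSizeBytes;
}

int main() {
  const PolicyConfig config{80 << 10, 75 << 10, 250 << 10, 1.3};
  const WriterState state{90 << 10, 200 << 10, true};
  // 90KB > 80KB high threshold, so chunk; 200KB / 1.3 is about 154KB,
  // below the 250KB target, so no stripe flush yet.
  return (shouldChunk(state, config) == Decision::kChunk &&
          !shouldFlush(state, config))
      ? 0
      : 1;
}
```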
1 parent df2c18e commit 6926070

File tree

5 files changed: +106 -68 lines changed

- dwio/nimble/velox/CMakeLists.txt
- dwio/nimble/velox/VeloxWriter.cpp
- dwio/nimble/velox/VeloxWriter.h
- dwio/nimble/velox/VeloxWriterOptions.h
- dwio/nimble/velox/tests/VeloxWriterTests.cpp

dwio/nimble/velox/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -153,6 +153,7 @@ add_library(
   VeloxWriter.cpp
   ChunkedStreamWriter.cpp
   VeloxWriterDefaultMetadataOSS.cpp
+  StreamChunker.cpp
 )
 target_link_libraries(
   nimble_velox_writer

dwio/nimble/velox/VeloxWriter.cpp

Lines changed: 69 additions & 62 deletions
@@ -33,6 +33,7 @@
 #include "dwio/nimble/velox/SchemaSerialization.h"
 #include "dwio/nimble/velox/SchemaTypes.h"
 #include "dwio/nimble/velox/StatsGenerated.h"
+#include "dwio/nimble/velox/StreamChunker.h"
 #include "velox/common/time/CpuWallTimer.h"
 #include "velox/dwio/common/ExecutorBarrier.h"
 #include "velox/type/Type.h"
@@ -806,6 +807,7 @@ void VeloxWriter::writeChunk(bool lastChunk) {
 
 bool VeloxWriter::writeChunks(
     std::span<const uint32_t> streamIndices,
+    bool ensureFullChunks,
     bool lastChunk) {
   uint64_t previousFlushWallTime = context_->stripeFlushTiming.wallNanos;
   std::atomic<uint64_t> chunkSize = 0;
@@ -821,56 +823,43 @@
   streams_.resize(context_->schemaBuilder.nodeCount());
 
   auto processStream = [&](StreamData& streamData) {
-    // TODO: Breakdown large streams above a threshold into smaller chunks.
-    const auto minStreamSize =
-        lastChunk ? 0 : context_->options.minStreamChunkRawSize;
     const auto* context =
         streamData.descriptor().context<WriterStreamContext>();
-    bool isNullStream = context && context->isNullStream;
-    bool shouldChunkStream = false;
-    if (isNullStream) {
-      // We apply the same null logic, where if all values
-      // are non-nulls, we omit the entire stream.
-      shouldChunkStream = streamData.hasNulls() &&
-          streamData.nonNulls().size() > minStreamSize;
-    } else {
-      shouldChunkStream = streamData.data().size() > minStreamSize;
-    }
-
-    // If we have previous written chunks for this stream, during final
-    // chunk, always write any remaining data.
-    const auto offset = streamData.descriptor().offset();
-    NIMBLE_DASSERT(offset < streams_.size(), "Stream offset out of range.");
-    auto& stream = streams_[offset];
-    if (lastChunk && !shouldChunkStream && !stream.content.empty()) {
-      shouldChunkStream =
-          !streamData.empty() || !streamData.nonNulls().empty();
-    }
-
-    if (shouldChunkStream) {
-      std::string_view encoded;
-      if (isNullStream) {
-        // For null streams we promote the null values to be written as
-        // boolean data.
-        encoded = encodeStream(
-            *context_, *encodingBuffer_, NullsAsDataStreamData(streamData));
-      } else {
-        encoded = encodeStream(*context_, *encodingBuffer_, streamData);
-      }
+    const bool isNullStream = context && context->isNullStream;
+    const auto& offset = streamData.descriptor().offset();
+    auto& streamSize = context_->columnStats[offset].physicalSize;
+    logicalSizeBeforeEncoding += streamData.memoryUsed();
+    auto& streamContent = streams_[offset].content;
+    auto chunker = getStreamChunker(
+        streamData,
+        context_->options.maxStreamChunkRawSize,
+        context_->options.minStreamChunkRawSize,
+        ensureFullChunks,
+        streamContent.empty(),
+        isNullStream,
+        lastChunk);
+    while (auto streamDataView = chunker->next()) {
+      // Null stream values are converted to boolean data for encoding.
+      std::string_view encoded = isNullStream
+          ? encodeStream(
+                *context_,
+                *encodingBuffer_,
+                NullsAsDataStreamData(*streamDataView))
+          : encodeStream(*context_, *encodingBuffer_, *streamDataView);
 
       if (!encoded.empty()) {
-        auto& streamSize = context_->columnStats[offset].physicalSize;
         ChunkedStreamWriter chunkWriter{*encodingBuffer_};
         for (auto& buffer : chunkWriter.encode(encoded)) {
           streamSize += buffer.size();
           chunkSize += buffer.size();
-          stream.content.push_back(std::move(buffer));
+          streamContent.push_back(std::move(buffer));
         }
       }
       wroteChunk = true;
-      logicalSizeBeforeEncoding += streamData.memoryUsed();
-      streamData.reset();
     }
+    // Reset erases processed stream data to reclaim memory.
+    chunker.reset();
+    logicalSizeBeforeEncoding -= streamData.memoryUsed();
   };
 
   const auto& streams = context_->streams();
@@ -924,7 +913,7 @@ bool VeloxWriter::writeStripe() {
     // Chunk all streams.
     std::vector<uint32_t> streamIndices(context_->streams().size());
     std::iota(streamIndices.begin(), streamIndices.end(), 0);
-    writeChunks(streamIndices, true);
+    writeChunks(streamIndices, /*ensureFullChunks=*/false, /*lastChunk=*/true);
   } else {
     writeChunk(true);
   }
@@ -1011,32 +1000,50 @@ bool VeloxWriter::evalauateFlushPolicy() {
   };
 
   if (context_->options.enableChunking && shouldChunk()) {
+    auto batchChunkStreams = [&](const std::vector<uint32_t>& indices,
+                                 bool ensureFullChunks) {
+      const size_t indicesCount = indices.size();
+      const auto batchSize = context_->options.chunkedStreamBatchSize;
+      for (size_t index = 0; index < indicesCount; index += batchSize) {
+        size_t currentBatchSize = std::min(batchSize, indicesCount - index);
+        std::span<const uint32_t> batchIndices(
+            indices.begin() + index, currentBatchSize);
+        // Stop attempting chunking once streams are too small to chunk or
+        // memory pressure is relieved.
+        if (!(writeChunks(batchIndices, ensureFullChunks) && shouldChunk())) {
+          return false;
+        }
+      }
+      return true;
+    };
+
+    // Relieve memory pressure by chunking streams above max size.
     const auto& streams = context_->streams();
-    const size_t streamCount = streams.size();
-    // Sort streams for chunking based on raw memory usage.
-    // TODO(T240072104): Improve performance by bucketing the streams by size
-    // (most significant bit) instead of sorting.
-    std::vector<uint32_t> streamIndices(streamCount);
-    std::iota(streamIndices.begin(), streamIndices.end(), 0);
-    std::sort(
-        streamIndices.begin(),
-        streamIndices.end(),
-        [&](const uint32_t& a, const uint32_t& b) {
-          return streams[a]->memoryUsed() > streams[b]->memoryUsed();
-        });
-
-    // Chunk streams in batches.
-    const auto batchSize = context_->options.chunkedStreamBatchSize;
-    for (size_t index = 0; index < streamCount; index += batchSize) {
-      const size_t currentBatchSize = std::min(batchSize, streamCount - index);
-      std::span<const uint32_t> batchIndices(
-          streamIndices.begin() + index, currentBatchSize);
-      // Stop attempting chunking once streams are too small to chunk or
-      // memory pressure is relieved.
-      if (!(writeChunks(batchIndices, false) && shouldChunk())) {
-        break;
+    std::vector<uint32_t> streamIndices;
+    streamIndices.reserve(streams.size());
+    for (auto streamIndex = 0; streamIndex < streams.size(); ++streamIndex) {
+      if (streams[streamIndex]->memoryUsed() >=
+          context_->options.maxStreamChunkRawSize) {
+        streamIndices.push_back(streamIndex);
       }
     }
+    const bool continueChunking =
+        batchChunkStreams(streamIndices, /*ensureFullChunks=*/true);
+    if (continueChunking) {
+      // Relieve memory pressure by chunking small streams.
+      // Sort streams for chunking based on raw memory usage.
+      // TODO(T240072104): Improve performance by bucketing the streams
+      // by size (by most significant bit) instead of sorting them.
+      streamIndices.resize(streams.size());
+      std::iota(streamIndices.begin(), streamIndices.end(), 0);
+      std::sort(
          streamIndices.begin(),
          streamIndices.end(),
          [&](const uint32_t& a, const uint32_t& b) {
            return streams[a]->memoryUsed() > streams[b]->memoryUsed();
          });
+      batchChunkStreams(streamIndices, /*ensureFullChunks=*/false);
+    }
   }
 
   if (shouldFlush()) {
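Illustrative note (not part of the commit): the sketch below isolates the batch-and-early-exit pattern that `batchChunkStreams` applies in both passes above. `writeChunks` and `shouldChunk` are hypothetical free-function stand-ins for the writer's members, and the batch size is passed explicitly instead of being read from options.

```cpp
// Standalone sketch of batched chunking with early exit. Illustrative only.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <span>
#include <vector>

// Hypothetical stand-ins for VeloxWriter members.
bool writeChunks(std::span<const uint32_t> batch, bool ensureFullChunks) {
  std::printf(
      "chunked %zu streams (ensureFullChunks=%d)\n",
      batch.size(),
      static_cast<int>(ensureFullChunks));
  return !batch.empty(); // pretend chunks were written
}
bool shouldChunk() { return true; } // pretend memory pressure persists

// Walk the index list in fixed-size spans; stop once a batch writes no
// chunks or memory pressure is relieved.
bool batchChunkStreams(
    const std::vector<uint32_t>& indices,
    size_t batchSize,
    bool ensureFullChunks) {
  for (size_t i = 0; i < indices.size(); i += batchSize) {
    const size_t n = std::min(batchSize, indices.size() - i);
    std::span<const uint32_t> batch(indices.data() + i, n);
    if (!(writeChunks(batch, ensureFullChunks) && shouldChunk())) {
      return false;
    }
  }
  return true;
}

int main() {
  std::vector<uint32_t> indices{0, 1, 2, 3, 4};
  // Processes spans of size 2, 2, and 1.
  batchChunkStreams(indices, /*batchSize=*/2, /*ensureFullChunks=*/true);
  return 0;
}
```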

dwio/nimble/velox/VeloxWriter.h

Lines changed: 2 additions & 1 deletion
@@ -92,7 +92,8 @@ class VeloxWriter {
   // Returns 'true' if chunks were written.
   bool writeChunks(
       std::span<const uint32_t> streamIndices,
-      bool lastChunk = true);
+      bool ensureFullChunks = false,
+      bool lastChunk = false);
 };
 
 } // namespace facebook::nimble

dwio/nimble/velox/VeloxWriterOptions.h

Lines changed: 5 additions & 0 deletions
@@ -100,6 +100,11 @@ struct VeloxWriterOptions {
   // Note: this is ignored when it is time to flush a stripe.
   size_t chunkedStreamBatchSize = 1024;
 
+  // When flushing data streams into chunks, streams with raw data size larger
+  // than this threshold will be broken down into multiple smaller chunks. Each
+  // chunk will be at most this size.
+  uint64_t maxStreamChunkRawSize = 4 << 20;
+
   // The factory function that produces the root encoding selection policy.
   // Encoding selection policy is the way to balance the tradeoffs of
   // different performance factors (at both read and write times). Heuristics
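Illustrative note (not part of the commit): assuming a hypothetical 10 MiB stream, the chunk count implied by this cap is a simple ceiling division, as sketched below.

```cpp
// Back-of-the-envelope chunk count for the 4 MiB default. Illustrative only.
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t maxStreamChunkRawSize = 4ull << 20; // 4 MiB default
  const uint64_t streamRawSize = 10ull << 20;        // assume a 10 MiB stream
  // Ceiling division: number of chunks needed to cover the stream.
  const uint64_t chunkCount =
      (streamRawSize + maxStreamChunkRawSize - 1) / maxStreamChunkRawSize;
  std::printf("%llu chunks\n", static_cast<unsigned long long>(chunkCount));
  // Prints "3 chunks": two 4 MiB chunks and one 2 MiB chunk.
  return 0;
}
```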

dwio/nimble/velox/tests/VeloxWriterTests.cpp

Lines changed: 29 additions & 5 deletions
@@ -1944,6 +1944,7 @@ struct ChunkFlushPolicyTestCase {
   const uint64_t writerMemoryLowThresholdBytes{75 << 10};
   const double estimatedCompressionFactor{1.3};
   const uint32_t minStreamChunkRawSize{100};
+  const uint32_t maxStreamChunkRawSize{128 << 10};
   const uint32_t expectedStripeCount{0};
   const uint32_t expectedMaxChunkCount{0};
   const uint32_t expectedMinChunkCount{0};
@@ -1959,6 +1960,7 @@ TEST_P(ChunkFlushPolicyTest, ChunkFlushPolicyIntegration) {
       {{"BIGINT", velox::BIGINT()}, {"SMALLINT", velox::SMALLINT()}});
   nimble::VeloxWriterOptions writerOptions{
       .minStreamChunkRawSize = GetParam().minStreamChunkRawSize,
+      .maxStreamChunkRawSize = GetParam().maxStreamChunkRawSize,
       .chunkedStreamBatchSize = GetParam().chunkedStreamBatchSize,
       .flushPolicyFactory = GetParam().enableChunking
           ? []() -> std::unique_ptr<nimble::FlushPolicy> {
@@ -2074,6 +2076,7 @@ INSTANTIATE_TEST_CASE_P(
         .writerMemoryLowThresholdBytes = 75 << 10,
         .estimatedCompressionFactor = 1.3,
         .minStreamChunkRawSize = 100,
+        .maxStreamChunkRawSize = 128 << 10,
         .expectedStripeCount = 4,
         .expectedMaxChunkCount = 1,
         .expectedMinChunkCount = 1,
@@ -2088,13 +2091,29 @@
         .writerMemoryLowThresholdBytes = 75 << 10,
         .estimatedCompressionFactor = 1.3,
         .minStreamChunkRawSize = 100,
+        .maxStreamChunkRawSize = 128 << 10,
         .expectedStripeCount = 7,
         .expectedMaxChunkCount = 2,
         .expectedMinChunkCount = 1,
         .chunkedStreamBatchSize = 2,
     },
+    // Reducing maxStreamChunkRawSize produces more chunks
+    ChunkFlushPolicyTestCase{
+        .batchCount = 20,
+        .enableChunking = true,
+        .targetStripeSizeBytes = 250 << 10, // 250KB
+        .writerMemoryHighThresholdBytes = 80 << 10,
+        .writerMemoryLowThresholdBytes = 75 << 10,
+        .estimatedCompressionFactor = 1.0,
+        .minStreamChunkRawSize = 100,
+        .maxStreamChunkRawSize = 12 << 10, // -126KB
+        .expectedStripeCount = 8,
+        .expectedMaxChunkCount = 9, // +7
+        .expectedMinChunkCount = 2, // +1
+        .chunkedStreamBatchSize = 10,
+    },
     // High memory regression threshold and no compression
-    // Produces file identical to RawStripeSizeFlushPolicy
+    // Stripe count identical to RawStripeSizeFlushPolicy
     ChunkFlushPolicyTestCase{
         .batchCount = 20,
         .enableChunking = true,
@@ -2103,8 +2122,9 @@
         .writerMemoryLowThresholdBytes = 75 << 10,
         .estimatedCompressionFactor = 1.0,
         .minStreamChunkRawSize = 100,
+        .maxStreamChunkRawSize = 128 << 10,
         .expectedStripeCount = 4,
-        .expectedMaxChunkCount = 1,
+        .expectedMaxChunkCount = 2,
         .expectedMinChunkCount = 1,
         .chunkedStreamBatchSize = 2,
     },
@@ -2118,13 +2138,14 @@
         .writerMemoryLowThresholdBytes = 35 << 10, // -40KB
         .estimatedCompressionFactor = 1.3,
         .minStreamChunkRawSize = 100,
+        .maxStreamChunkRawSize = 128 << 10,
         .expectedStripeCount = 10,
         .expectedMaxChunkCount = 2,
         .expectedMinChunkCount = 2, // +1 chunk
         .chunkedStreamBatchSize = 2,
     },
     // High target stripe size bytes (with disabled memory pressure
-    // optimization) produces fewer stripes. Single chunks.
+    // optimization) produces fewer stripes.
     ChunkFlushPolicyTestCase{
         .batchCount = 20,
         .enableChunking = true,
@@ -2133,9 +2154,10 @@
         .writerMemoryLowThresholdBytes = 1 << 20, // +1MB
         .estimatedCompressionFactor = 1.3,
         .minStreamChunkRawSize = 100,
+        .maxStreamChunkRawSize = 128 << 10,
         .expectedStripeCount = 1, // -3 stripes
-        .expectedMaxChunkCount = 1,
-        .expectedMinChunkCount = 1,
+        .expectedMaxChunkCount = 5,
+        .expectedMinChunkCount = 2,
         .chunkedStreamBatchSize = 2,
 
     },
@@ -2149,6 +2171,7 @@
         .writerMemoryLowThresholdBytes = 1 << 20, // +1MB
         .estimatedCompressionFactor = 1.3,
         .minStreamChunkRawSize = 100,
+        .maxStreamChunkRawSize = 128 << 10,
         .expectedStripeCount = 7, // +6 stripes
         .expectedMaxChunkCount = 1,
         .expectedMinChunkCount = 1,
@@ -2164,6 +2187,7 @@
         .writerMemoryLowThresholdBytes = 75 << 10,
         .estimatedCompressionFactor = 1.0,
         .minStreamChunkRawSize = 100,
+        .maxStreamChunkRawSize = 128 << 10,
         .expectedStripeCount = 7,
         .expectedMaxChunkCount = 2,
         .expectedMinChunkCount = 1,
