
Commit ff234c4

macvincent authored and facebook-github-bot committed
Support Per Stream Chunking to Relieve Memory Pressure (#243)
Summary: This implements one detail of the new chunking policy described in this [doc](https://fburl.com/gdoc/gkdwwju1). Rather than chunking all eligible streams at once, we chunk individual streams in descending order of their raw size until memory pressure is relieved. In our unit tests, the maximum number of chunks produced is identical to the previous implementation; results may differ for large file sizes, which will require further experimentation and tuning.

Differential Revision: D81715655
1 parent c731111 · commit ff234c4
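As a rough illustration of the policy in the summary, here is a minimal, self-contained sketch of greedy per-stream chunking: visit streams in descending order of raw memory usage and chunk one stream at a time until a memory budget is met. The `Stream` struct, `writeChunkForStream`, `totalMemory`, and `memoryBudget` below are hypothetical stand-ins for illustration only; the actual change (in the diff below) consults `shouldFlush` after each chunk instead of an explicit budget.

```cpp
// Illustrative sketch of per-stream chunking; not the actual Nimble API.
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

struct Stream {
  std::size_t rawBytes = 0;
  std::size_t memoryUsed() const { return rawBytes; }
};

// Pretend that encoding a chunk releases the stream's raw buffer.
void writeChunkForStream(std::vector<Stream>& streams, std::size_t index) {
  streams[index].rawBytes = 0;
}

std::size_t totalMemory(const std::vector<Stream>& streams) {
  std::size_t total = 0;
  for (const auto& s : streams) {
    total += s.memoryUsed();
  }
  return total;
}

// Chunk the largest streams first, and stop as soon as memory pressure
// is relieved rather than chunking every eligible stream.
void chunkUntilRelieved(std::vector<Stream>& streams, std::size_t memoryBudget) {
  std::vector<std::size_t> indices(streams.size());
  std::iota(indices.begin(), indices.end(), 0);
  std::sort(indices.begin(), indices.end(), [&](std::size_t a, std::size_t b) {
    return streams[a].memoryUsed() > streams[b].memoryUsed();
  });
  for (std::size_t index : indices) {
    if (totalMemory(streams) <= memoryBudget) {
      break; // Pressure relieved; remaining streams stay unchunked.
    }
    writeChunkForStream(streams, index);
  }
}

int main() {
  std::vector<Stream> streams{{100}, {400}, {50}, {250}};
  chunkUntilRelieved(streams, /*memoryBudget=*/300);
  // Only the 400- and 250-byte streams are chunked: 100 + 50 <= 300.
  return 0;
}
```

Sorting by raw size means the first few chunks release the most memory, so fewer chunks are usually needed than when chunking every eligible stream.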

File tree: 2 files changed, +50 -17 lines


dwio/nimble/velox/VeloxWriter.cpp

Lines changed: 47 additions & 16 deletions
```diff
@@ -641,7 +641,9 @@ void VeloxWriter::flush() {
   }
 }
 
-bool VeloxWriter::writeChunk(bool lastChunk) {
+bool VeloxWriter::writeChunk(
+    bool lastChunk,
+    std::optional<size_t> streamIndex) {
   uint64_t previousFlushWallTime = context_->stripeFlushTiming.wallNanos;
   std::atomic<uint64_t> chunkSize = 0;
   std::atomic<uint64_t> sizeBeforeEncoding = 0;
@@ -742,7 +744,17 @@ bool VeloxWriter::writeChunk(bool lastChunk) {
     }
   };
 
-  if (context_->options.encodingExecutor) {
+#define ENCODE_STREAM_DATA(innerStreamData, isNullStream, streamSize) \
+  do {                                                                \
+    if (isNullStream) {                                               \
+      NullsAsDataStreamData nullsStreamData{innerStreamData};         \
+      encode(nullsStreamData, streamSize);                            \
+    } else {                                                          \
+      encode(innerStreamData, streamSize);                            \
+    }                                                                 \
+  } while (0)
+
+  if (!streamIndex.has_value() && context_->options.encodingExecutor) {
     velox::dwio::common::ExecutorBarrier barrier{
         context_->options.encodingExecutor};
     for (auto& streamData : context_->streams()) {
@@ -753,16 +765,25 @@ bool VeloxWriter::writeChunk(bool lastChunk) {
           *streamData, [&](StreamData& innerStreamData, bool isNullStream) {
             barrier.add(
                 [&innerStreamData, isNullStream, &encode, &streamSize]() {
-                  if (isNullStream) {
-                    NullsAsDataStreamData nullsStreamData{innerStreamData};
-                    encode(nullsStreamData, streamSize);
-                  } else {
-                    encode(innerStreamData, streamSize);
-                  }
+                  ENCODE_STREAM_DATA(
+                      innerStreamData, isNullStream, streamSize);
                 });
           });
     }
     barrier.waitAll();
+  } else if (streamIndex.has_value()) {
+    const auto& streams = context_->streams();
+    NIMBLE_DASSERT(
+        streams.size() >= streamIndex.value(), "Invalid stream index");
+    const auto& streamData = streams[streamIndex.value()];
+    auto& streamSize =
+        context_->columnStats[streamData->descriptor().offset()].physicalSize;
+    processStream(
+        *streamData,
+        [&encode, &streamSize](
+            StreamData& innerStreamData, bool isNullStream) {
+          ENCODE_STREAM_DATA(innerStreamData, isNullStream, streamSize);
+        });
   } else {
     for (auto& streamData : context_->streams()) {
       auto& streamSize =
@@ -772,12 +793,7 @@ bool VeloxWriter::writeChunk(bool lastChunk) {
           *streamData,
           [&encode, &streamSize](
               StreamData& innerStreamData, bool isNullStream) {
-            if (isNullStream) {
-              NullsAsDataStreamData nullsStreamData{innerStreamData};
-              encode(nullsStreamData, streamSize);
-            } else {
-              encode(innerStreamData, streamSize);
-            }
+            ENCODE_STREAM_DATA(innerStreamData, isNullStream, streamSize);
           });
     }
   }
@@ -802,6 +818,8 @@ bool VeloxWriter::writeChunk(bool lastChunk) {
   VLOG(1) << "writeChunk milliseconds: " << flushWallTimeMs
           << ", chunk bytes: " << chunkSize;
   return wroteChunk.load();
+
+#undef ENCODE_STREAM_DATA
 }
 
 uint32_t VeloxWriter::writeStripe() {
@@ -876,10 +894,23 @@ bool VeloxWriter::tryWriteStripe(bool force) {
   try {
     // TODO: we can improve merge the last chunk write with stripe
     if (decision == FlushDecision::Chunk && context_->options.enableChunking) {
+      const auto& streams = context_->streams();
+      // Sort streams for chunking based on raw memory usage.
+      std::vector<size_t> streamIndices(streams.size());
+      std::iota(streamIndices.begin(), streamIndices.end(), 0);
+      std::sort(
+          streamIndices.begin(),
+          streamIndices.end(),
+          [&](const size_t& a, const size_t& b) {
+            return streams.at(a)->memoryUsed() > streams.at(b)->memoryUsed();
+          });
+      size_t currentIndex = 0;
       bool successfullyChunked = true;
-      while (decision == FlushDecision::Chunk && successfullyChunked) {
-        successfullyChunked = writeChunk(false);
+      while (decision == FlushDecision::Chunk && successfullyChunked &&
+             currentIndex < streams.size()) {
+        successfullyChunked = writeChunk(false, streamIndices.at(currentIndex));
         decision = shouldFlush(successfullyChunked);
+        ++currentIndex;
       }
     }
 
```

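One note on the new `ENCODE_STREAM_DATA` macro above: wrapping the body in `do { ... } while (0)` is the standard C/C++ idiom for making a multi-statement macro expand to a single statement, so it composes safely with unbraced `if`/`else` at the call sites. A minimal sketch of the idiom, using a made-up `LOG_TWICE` macro:

```cpp
#include <iostream>

// Without the do/while (0) wrapper, a multi-statement macro would
// break under an unbraced if/else; with it, the expansion is one
// statement that requires a trailing semicolon, like a function call.
#define LOG_TWICE(msg)            \
  do {                            \
    std::cout << (msg) << '\n';   \
    std::cout << (msg) << '\n';   \
  } while (0)

int main() {
  bool verbose = true;
  if (verbose)
    LOG_TWICE("hello"); // Safe: expands to a single statement.
  else
    std::cout << "quiet\n";
  return 0;
}
```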
dwio/nimble/velox/VeloxWriter.h

Lines changed: 3 additions & 1 deletion
```diff
@@ -87,7 +87,9 @@ class VeloxWriter {
   // Returning 'true' if stripe was written.
   bool tryWriteStripe(bool force = false);
   // Returns 'true' if chunk was written.
-  bool writeChunk(bool lastChunk = true);
+  bool writeChunk(
+      bool lastChunk = true,
+      std::optional<size_t> streamIndex = std::nullopt);
   uint32_t writeStripe();
 };
 
```

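Since `streamIndex` defaults to `std::nullopt`, existing `writeChunk` callers compile unchanged; only the new chunking loop passes an explicit index. A small sketch of the call compatibility, using an illustrative `Writer` stub rather than the real class:

```cpp
#include <cstddef>
#include <optional>

// Illustrative stub mirroring the new signature; not the real class.
struct Writer {
  bool writeChunk(
      bool lastChunk = true,
      std::optional<std::size_t> streamIndex = std::nullopt) {
    // With an index, only that stream is encoded; otherwise all
    // streams are processed (as in the diff above).
    return streamIndex.has_value() || lastChunk;
  }
};

int main() {
  Writer w;
  w.writeChunk();         // Existing call site: all streams, last chunk.
  w.writeChunk(false);    // Existing call site: all streams, mid-stripe.
  w.writeChunk(false, 3); // New: encode only the stream at index 3.
  return 0;
}
```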