Refactor Velox Writer to Use New Flush Policy Contract (#242)

macvincent · facebook-github-bot · commit 2fbe79ae9aae · 2025-09-10T12:19:13.000-07:00
Summary:

This should be a no-op, since no chunking flush policy is currently used in production, but we make three changes in this diff:
1. `writeChunk` now returns a boolean to indicate whether any stream was successfully chunked
2. The raw (pre-encoding) size of the stripe data that has already been encoded is now tracked in the writer context as `stripeEncodedRawSize`
3. We update and pass down the memory stats needed by the new flush policy contract

TODO: We will be introducing two more VeloxWriter changes in the next diffs in this stack to:
1. Support per stream chunking instead of always chunking all eligible streams
2. Support breaking down large streams into multiple smaller chunks

Differential Revision: D81545433
diff --git a/dwio/nimble/velox/VeloxWriter.cpp b/dwio/nimble/velox/VeloxWriter.cpp
@@ -44,7 +44,7 @@ namespace detail {
 class WriterContext : public FieldWriterContext {
  public:
   const VeloxWriterOptions options;
-  std::unique_ptr<FlushPolicy> flushPolicy;
+  std::function<std::unique_ptr<FlushPolicy>()> flushPolicyFactory;
   velox::CpuWallTiming totalFlushTiming;
   velox::CpuWallTiming stripeFlushTiming;
   velox::CpuWallTiming encodingSelectionTiming;
@@ -57,6 +57,8 @@ class WriterContext : public FieldWriterContext {
   uint64_t rowsInFile{0};
   uint64_t rowsInStripe{0};
   uint64_t stripeSize{0};
+  // Raw size, before encoding, of the stripe data encoded so far.
+  uint64_t stripeEncodedRawSize{0};
   uint64_t rawSize{0};
   std::vector<uint64_t> rowsPerStripe;
 
@@ -65,8 +67,8 @@ class WriterContext : public FieldWriterContext {
       VeloxWriterOptions options)
       : FieldWriterContext{memoryPool, options.reclaimerFactory(), options.vectorDecoderVisitor},
         options{std::move(options)},
+        flushPolicyFactory{this->options.flushPolicyFactory},
         logger{this->options.metricsLogger} {
-    flushPolicy = this->options.flushPolicyFactory();
     inputBufferGrowthPolicy = this->options.lowMemoryMode
         ? std::make_unique<ExactGrowthPolicy>()
         : this->options.inputGrowthPolicyFactory();
@@ -82,6 +84,7 @@ class WriterContext : public FieldWriterContext {
     memoryUsed = 0;
     rowsInStripe = 0;
     stripeSize = 0;
+    stripeEncodedRawSize = 0;
     ++stripeIndex_;
   }
 
@@ -638,9 +641,11 @@ void VeloxWriter::flush() {
   }
 }
 
-void VeloxWriter::writeChunk(bool lastChunk) {
+bool VeloxWriter::writeChunk(bool lastChunk) {
   uint64_t previousFlushWallTime = context_->stripeFlushTiming.wallNanos;
   std::atomic<uint64_t> chunkSize = 0;
+  std::atomic<uint64_t> sizeBeforeEncoding = 0;
+  std::atomic<bool> wroteChunk = false;
   {
     LoggingScope scope{*context_->logger};
     velox::CpuWallTimer veloxTimer{context_->stripeFlushTiming};
@@ -702,6 +707,8 @@ void VeloxWriter::writeChunk(bool lastChunk) {
           stream.content.push_back(std::move(buffer));
         }
       }
+      wroteChunk = true;
+      sizeBeforeEncoding += streamData.memoryUsed();
       streamData.reset();
     };
 
@@ -711,6 +718,7 @@ void VeloxWriter::writeChunk(bool lastChunk) {
       const auto* context =
           streamData.descriptor().context<WriterStreamContext>();
 
+      // TODO: Break down large streams above a threshold into smaller chunks.
       const auto minStreamSize =
           lastChunk ? 0 : context_->options.minStreamChunkRawSize;
 
@@ -779,6 +787,12 @@ void VeloxWriter::writeChunk(bool lastChunk) {
     }
 
     context_->stripeSize += chunkSize;
+    context_->stripeEncodedRawSize += sizeBeforeEncoding;
+    uint64_t memoryUsed = 0;
+    for (const auto& stream : context_->streams()) {
+      memoryUsed += stream->memoryUsed();
+    }
+    context_->memoryUsed = memoryUsed;
   }
 
   // Consider getting this from flush timing.
@@ -787,6 +801,7 @@ void VeloxWriter::writeChunk(bool lastChunk) {
       1'000'000;
   VLOG(1) << "writeChunk milliseconds: " << flushWallTimeMs
           << ", chunk bytes: " << chunkSize;
+  return wroteChunk.load();
 }
 
 uint32_t VeloxWriter::writeStripe() {
@@ -840,35 +855,53 @@ bool VeloxWriter::tryWriteStripe(bool force) {
     return false;
   }
 
+  auto flushPolicy = context_->flushPolicyFactory();
+  NIMBLE_DASSERT(flushPolicy != nullptr, "Flush policy must not be null");
+
   auto shouldFlush = [&]() {
-    return context_->flushPolicy->shouldFlush(StripeProgress{
+    return flushPolicy->shouldFlush(StripeProgress{
         .stripeRawSize = context_->memoryUsed,
-        .stripeEncodedSize = context_->stripeSize});
+        .stripeEncodedSize = context_->stripeSize,
+        .stripeEncodedRawSize = context_->stripeEncodedRawSize});
+  };
+
+  auto shouldChunk = [&]() {
+    return flushPolicy->shouldChunk(StripeProgress{
+        .stripeRawSize = context_->memoryUsed,
+        .stripeEncodedSize = context_->stripeSize,
+        .stripeEncodedRawSize = context_->stripeEncodedRawSize,
+    });
   };
 
   auto decision = force ? FlushDecision::Stripe : shouldFlush();
+
+  if (context_->options.enableChunking && decision == FlushDecision::None) {
+    decision = shouldChunk();
+  }
+
   if (decision == FlushDecision::None) {
     return false;
   }
 
   try {
     // TODO: we can improve merge the last chunk write with stripe
     if (decision == FlushDecision::Chunk && context_->options.enableChunking) {
-      writeChunk(false);
-      decision = shouldFlush();
+      while (decision == FlushDecision::Chunk && writeChunk(false)) {
+        decision = shouldChunk();
+      }
     }
-
+    decision = (decision != FlushDecision::Stripe) ? shouldFlush() : decision;
     if (decision != FlushDecision::Stripe) {
       return false;
     }
 
+    uint32_t stripeSize = writeStripe();
     StripeFlushMetrics metrics{
         .inputSize = context_->stripeSize,
         .rowCount = context_->rowsInStripe,
+        .stripeSize = stripeSize,
         .trackedMemory = context_->memoryUsed,
     };
-
-    metrics.stripeSize = writeStripe();
     context_->logger->logStripeFlush(metrics);
 
     context_->nextStripe();
diff --git a/dwio/nimble/velox/VeloxWriter.h b/dwio/nimble/velox/VeloxWriter.h
@@ -86,7 +86,8 @@ class VeloxWriter {
 
   // Returning 'true' if stripe was written.
   bool tryWriteStripe(bool force = false);
-  void writeChunk(bool lastChunk = true);
+  // Returns 'true' if chunk was written.
+  bool writeChunk(bool lastChunk = true);
   uint32_t writeStripe();
 };
 
diff --git a/dwio/nimble/velox/tests/VeloxWriterTests.cpp b/dwio/nimble/velox/tests/VeloxWriterTests.cpp
@@ -22,6 +22,7 @@
 #include "dwio/nimble/tablet/Constants.h"
 #include "dwio/nimble/velox/ChunkedStream.h"
 #include "dwio/nimble/velox/EncodingLayoutTree.h"
+#include "dwio/nimble/velox/FlushPolicy.h"
 #include "dwio/nimble/velox/SchemaSerialization.h"
 #include "dwio/nimble/velox/StatsGenerated.h"
 #include "dwio/nimble/velox/VeloxReader.h"
@@ -289,7 +290,7 @@ std::vector<velox::RowVectorPtr> generateBatches(
   velox::VectorFuzzer fuzzer(
       {.vectorSize = size, .nullRatio = 0.1}, &pool, seed);
   std::vector<velox::RowVectorPtr> batches;
-
+  batches.reserve(batchCount);
   for (size_t i = 0; i < batchCount; ++i) {
     batches.push_back(fuzzer.fuzzInputFlatRow(type));
   }
@@ -1951,6 +1952,108 @@ TEST_F(VeloxWriterTests, RawSizeWritten) {
   ASSERT_EQ(expectedRawSize, rawSize);
 }
 
+struct ChunkFlushPolicyTestCase {
+  const size_t batchCount{20};
+  const bool enableChunking{true};
+  const uint64_t targetStripeSizeBytes{256 << 10};
+  const uint64_t writerMaxMemoryBytes{80 << 10};
+  const uint64_t writerMinMemoryBytes{75 << 10};
+  const double compressionRatio{1.0};
+  const uint32_t minStreamChunkRawSize{100};
+  const uint32_t expectedStripeCount{0};
+  const uint32_t expectedMaxChunkCount{0};
+  const uint32_t expectedMinChunkCount{0};
+};
+
+class ChunkFlushPolicyTest
+    : public VeloxWriterTests,
+      public ::testing::WithParamInterface<ChunkFlushPolicyTestCase> {};
+
+TEST_P(ChunkFlushPolicyTest, ChunkFlushPolicyIntegration) {
+  auto type = velox::ROW(
+      {{"BIGINT", velox::BIGINT()}, {"SMALLINT", velox::SMALLINT()}});
+  nimble::VeloxWriterOptions writerOptions{
+      .minStreamChunkRawSize = GetParam().minStreamChunkRawSize,
+      .flushPolicyFactory = GetParam().enableChunking
+          ? []() -> std::unique_ptr<nimble::FlushPolicy> {
+              return std::make_unique<nimble::ChunkFlushPolicy>(
+                  nimble::ChunkFlushPolicyConfig{
+                      .writerMaxMemoryBytes = GetParam().writerMaxMemoryBytes,
+                      .writerMinMemoryBytes = GetParam().writerMinMemoryBytes,
+                      .targetStripeSizeBytes = GetParam().targetStripeSizeBytes,
+                      .compressionRatio =
+                          GetParam().compressionRatio,
+                  });
+            }
+          : []() -> std::unique_ptr<nimble::FlushPolicy> {
+              return std::make_unique<nimble::StripeRawSizeFlushPolicy>(
+                  GetParam().targetStripeSizeBytes);
+            },
+      .enableChunking = GetParam().enableChunking,
+  };
+
+  std::string file;
+  auto writeFile = std::make_unique<velox::InMemoryWriteFile>(&file);
+
+  nimble::VeloxWriter writer(
+      *rootPool_, type, std::move(writeFile), std::move(writerOptions));
+  auto batches = generateBatches(
+      type,
+      GetParam().batchCount,
+      /*size=*/4000,
+      /*seed=*/20221110,
+      *leafPool_);
+
+  for (const auto& batch : batches) {
+    writer.write(batch);
+  }
+  writer.close();
+
+  velox::InMemoryReadFile readFile(file);
+  auto selector = std::make_shared<velox::dwio::common::ColumnSelector>(type);
+  nimble::VeloxReader reader(*leafPool_, &readFile, std::move(selector));
+
+  // Verify stripe count
+  auto expectedStripeCount = GetParam().expectedStripeCount;
+  auto actualStripeCount = reader.tabletReader().stripeCount();
+  EXPECT_EQ(expectedStripeCount, actualStripeCount);
+
+  // Verify chunk count
+  auto chunkCountPair = [&]() {
+    nimble::TabletReader tablet{*leafPool_, &readFile};
+    uint32_t maxChunkCount = 0;
+    uint32_t minChunkCount = std::numeric_limits<uint32_t>::max();
+
+    for (uint32_t index = 0; index < actualStripeCount; ++index) {
+      auto stripeIdentifier = tablet.getStripeIdentifier(index);
+      auto streamCount = tablet.streamCount(stripeIdentifier);
+
+      std::vector<uint32_t> streamIds(streamCount);
+      std::iota(streamIds.begin(), streamIds.end(), 0);
+      auto streamLoaders = tablet.load(stripeIdentifier, streamIds);
+
+      for (auto& streamLoader : streamLoaders) {
+        if (!streamLoader) {
+          continue;
+        }
+        nimble::InMemoryChunkedStream chunked{
+            *leafPool_, std::move(streamLoader)};
+        uint32_t chunkCount = 0;
+        while (chunked.hasNext()) {
+          chunked.nextChunk();
+          chunkCount++;
+        }
+        maxChunkCount = std::max(maxChunkCount, chunkCount);
+        minChunkCount = std::min(minChunkCount, chunkCount);
+      }
+    }
+    return std::make_pair(maxChunkCount, minChunkCount);
+  };
+  auto [maxChunkCount, minChunkCount] = chunkCountPair();
+  EXPECT_EQ(GetParam().expectedMaxChunkCount, maxChunkCount);
+  EXPECT_EQ(GetParam().expectedMinChunkCount, minChunkCount);
+}
+
 INSTANTIATE_TEST_CASE_P(
     StripeRawSizeFlushPolicyTestSuite,
     StripeRawSizeFlushPolicyTest,
@@ -1975,4 +2078,91 @@ INSTANTIATE_TEST_CASE_P(
             .batchCount = 100,
             .rawStripeSize = 256 << 20,
             .stripeCount = 1}));
+
+INSTANTIATE_TEST_CASE_P(
+    ChunkFlushPolicyTestSuite,
+    ChunkFlushPolicyTest,
+    ::testing::Values(
+        // Base case (no chunking, StripeRawSizeFlushPolicy)
+        ChunkFlushPolicyTestCase{
+            .batchCount = 20,
+            .enableChunking = false,
+            .targetStripeSizeBytes = 250 << 10, // 250KB
+            .writerMaxMemoryBytes = 80 << 10,
+            .writerMinMemoryBytes = 75 << 10,
+            .compressionRatio = 1.0,
+            .minStreamChunkRawSize = 100,
+            .expectedStripeCount = 4,
+            .expectedMaxChunkCount = 1,
+            .expectedMinChunkCount = 1,
+        },
+        // Base case with default settings (has chunking)
+        ChunkFlushPolicyTestCase{
+            .batchCount = 20,
+            .enableChunking = true,
+            .targetStripeSizeBytes = 250 << 10, // 250KB
+            .writerMaxMemoryBytes = 80 << 10,
+            .writerMinMemoryBytes = 75 << 10,
+            .compressionRatio = 1.0,
+            .minStreamChunkRawSize = 100,
+            .expectedStripeCount = 3,
+            .expectedMaxChunkCount = 7,
+            .expectedMinChunkCount = 3,
+        },
+        // High memory regression threshold
+        // Produces file identical to StripeRawSizeFlushPolicy
+        ChunkFlushPolicyTestCase{
+            .batchCount = 20,
+            .enableChunking = true,
+            .targetStripeSizeBytes = 256 << 10,
+            .writerMaxMemoryBytes = 500 << 10, // +420KB
+            .writerMinMemoryBytes = 495 << 10, // +420KB
+            .compressionRatio = 1.0,
+            .minStreamChunkRawSize = 100,
+            .expectedStripeCount = 4,
+            .expectedMaxChunkCount = 1,
+            .expectedMinChunkCount = 1,
+        },
+        // Low memory regression threshold
+        // Produces file with more chunks per stripe
+        ChunkFlushPolicyTestCase{
+            .batchCount = 20,
+            .enableChunking = true,
+            .targetStripeSizeBytes = 256 << 10,
+            .writerMaxMemoryBytes = 40 << 10, // -40KB
+            .writerMinMemoryBytes = 35 << 10, // -40KB
+            .compressionRatio = 1.0,
+            .minStreamChunkRawSize = 100,
+            .expectedStripeCount = 3,
+            .expectedMaxChunkCount = 8,
+            .expectedMinChunkCount = 4,
+        },
+        // High target stripe size bytes (with disabled memory pressure
+        // optimization) produces fewer stripes. Single chunks.
+        ChunkFlushPolicyTestCase{
+            .batchCount = 20,
+            .enableChunking = true,
+            .targetStripeSizeBytes = 900 << 10, // +650KB
+            .writerMaxMemoryBytes = 2 << 20, // +2MB
+            .writerMinMemoryBytes = 1 << 20, // +1MB
+            .compressionRatio = 1.0,
+            .minStreamChunkRawSize = 100,
+            .expectedStripeCount = 1, // -2 stripes
+            .expectedMaxChunkCount = 1,
+            .expectedMinChunkCount = 1,
+        },
+        // Low target stripe size bytes (with disabled memory pressure
+        // optimization) produces more stripes. Single chunks.
+        ChunkFlushPolicyTestCase{
+            .batchCount = 20,
+            .enableChunking = true,
+            .targetStripeSizeBytes = 90 << 10, // -160KB
+            .writerMaxMemoryBytes = 2 << 20, // +2MB
+            .writerMinMemoryBytes = 1 << 20, // +1MB
+            .compressionRatio = 1.0,
+            .minStreamChunkRawSize = 100,
+            .expectedStripeCount = 7, // +4 stripes
+            .expectedMaxChunkCount = 1,
+            .expectedMinChunkCount = 1,
+        }));
 } // namespace facebook