Refactor Velox Writer to Use New Flush Policy Contract (facebookincubator#242)

macvincent · facebook-github-bot · commit 660170ec617a · 2025-09-29T16:17:51.000-07:00
Summary:

This should be a no-op since no chunking flush policy is currently being used in prod, but we make three changes in this diff:
1. `writeChunk` now returns a boolean to indicate whether any stream was successfully chunked
2. The logical (pre-encoding) raw size of the stripe data that has already been encoded is now tracked in the writer context (`stripeEncodedLogicalSize`)
3. We update and pass down the memory stats needed by the new flush policy contract

TODO: We will be introducing two more VeloxWriter changes in the next diffs in this stack to:
1. Support per stream chunking instead of always chunking all eligible streams
2. Support breaking down large streams into multiple smaller chunks

Differential Revision: D81545433
diff --git a/dwio/nimble/velox/VeloxWriter.cpp b/dwio/nimble/velox/VeloxWriter.cpp
@@ -44,7 +44,7 @@ namespace detail {
 class WriterContext : public FieldWriterContext {
  public:
   const VeloxWriterOptions options;
-  std::unique_ptr<FlushPolicy> flushPolicy;
+  std::function<std::unique_ptr<FlushPolicy>()> flushPolicyFactory;
   velox::CpuWallTiming totalFlushTiming;
   velox::CpuWallTiming stripeFlushTiming;
   velox::CpuWallTiming encodingSelectionTiming;
@@ -57,6 +57,8 @@ class WriterContext : public FieldWriterContext {
   uint64_t rowsInFile{0};
   uint64_t rowsInStripe{0};
   uint64_t stripeSize{0};
+  // Logical raw size of the encoded stripe data
+  uint64_t stripeEncodedLogicalSize{0};
   uint64_t rawSize{0};
   std::vector<uint64_t> rowsPerStripe;
 
@@ -65,8 +67,8 @@ class WriterContext : public FieldWriterContext {
       VeloxWriterOptions options)
       : FieldWriterContext{memoryPool, options.reclaimerFactory(), options.vectorDecoderVisitor},
         options{std::move(options)},
+        flushPolicyFactory{this->options.flushPolicyFactory},
         logger{this->options.metricsLogger} {
-    flushPolicy = this->options.flushPolicyFactory();
     inputBufferGrowthPolicy = this->options.lowMemoryMode
         ? std::make_unique<ExactGrowthPolicy>()
         : this->options.inputGrowthPolicyFactory();
@@ -82,6 +84,7 @@ class WriterContext : public FieldWriterContext {
     memoryUsed = 0;
     rowsInStripe = 0;
     stripeSize = 0;
+    stripeEncodedLogicalSize = 0;
     ++stripeIndex_;
   }
 
@@ -638,9 +641,11 @@ void VeloxWriter::flush() {
   }
 }
 
-void VeloxWriter::writeChunk(bool lastChunk) {
+bool VeloxWriter::writeChunk(bool lastChunk) {
   uint64_t previousFlushWallTime = context_->stripeFlushTiming.wallNanos;
   std::atomic<uint64_t> chunkSize = 0;
+  std::atomic<uint64_t> logicalSizeBeforeEncoding = 0;
+  std::atomic<bool> wroteChunk = false;
   {
     LoggingScope scope{*context_->logger};
     velox::CpuWallTimer veloxTimer{context_->stripeFlushTiming};
@@ -702,6 +707,8 @@ void VeloxWriter::writeChunk(bool lastChunk) {
           stream.content.push_back(std::move(buffer));
         }
       }
+      wroteChunk = true;
+      logicalSizeBeforeEncoding += streamData.memoryUsed();
       streamData.reset();
     };
 
@@ -711,6 +718,7 @@ void VeloxWriter::writeChunk(bool lastChunk) {
       const auto* context =
           streamData.descriptor().context<WriterStreamContext>();
 
+      // TODO: Break down large streams above a threshold into smaller chunks.
       const auto minStreamSize =
           lastChunk ? 0 : context_->options.minStreamChunkRawSize;
 
@@ -779,6 +787,8 @@ void VeloxWriter::writeChunk(bool lastChunk) {
     }
 
     context_->stripeSize += chunkSize;
+    context_->stripeEncodedLogicalSize += logicalSizeBeforeEncoding;
+    context_->memoryUsed -= logicalSizeBeforeEncoding;
   }
 
   // Consider getting this from flush timing.
@@ -787,6 +797,7 @@ void VeloxWriter::writeChunk(bool lastChunk) {
       1'000'000;
   VLOG(1) << "writeChunk milliseconds: " << flushWallTimeMs
           << ", chunk bytes: " << chunkSize;
+  return wroteChunk.load();
 }
 
 uint32_t VeloxWriter::writeStripe() {
@@ -840,37 +851,43 @@ bool VeloxWriter::tryWriteStripe(bool force) {
     return false;
   }
 
+  auto flushPolicy = context_->flushPolicyFactory();
+  NIMBLE_DASSERT(flushPolicy != nullptr, "Flush policy must not be null");
+
   auto shouldFlush = [&]() {
-    return context_->flushPolicy->shouldFlush(StripeProgress{
+    return flushPolicy->shouldFlush(StripeProgress{
         .stripeRawSize = context_->memoryUsed,
-        .stripeEncodedSize = context_->stripeSize});
+        .stripeEncodedSize = context_->stripeSize,
+        .stripeEncodedLogicalSize = context_->stripeEncodedLogicalSize});
   };
 
   auto shouldChunk = [&]() {
-    return context_->flushPolicy->shouldChunk(StripeProgress{
+    return flushPolicy->shouldChunk(StripeProgress{
         .stripeRawSize = context_->memoryUsed,
-        .stripeEncodedSize = context_->stripeSize});
+        .stripeEncodedSize = context_->stripeSize,
+        .stripeEncodedLogicalSize = context_->stripeEncodedLogicalSize,
+    });
   };
 
   try {
     // TODO: we can improve merge the last chunk write with stripe
-    if (context_->options.enableChunking &&
-        shouldChunk() == ChunkDecision::Chunk) {
-      writeChunk(false);
+    if (context_->options.enableChunking) {
+      while (shouldChunk() == ChunkDecision::Chunk && writeChunk(false)) {
+      }
     }
 
     auto decision = force ? FlushDecision::Stripe : shouldFlush();
     if (decision != FlushDecision::Stripe) {
       return false;
     }
 
+    uint32_t stripeSize = writeStripe();
     StripeFlushMetrics metrics{
         .inputSize = context_->stripeSize,
         .rowCount = context_->rowsInStripe,
+        .stripeSize = stripeSize,
         .trackedMemory = context_->memoryUsed,
     };
-
-    metrics.stripeSize = writeStripe();
     context_->logger->logStripeFlush(metrics);
 
     context_->nextStripe();
diff --git a/dwio/nimble/velox/VeloxWriter.h b/dwio/nimble/velox/VeloxWriter.h
@@ -86,7 +86,8 @@ class VeloxWriter {
 
   // Returning 'true' if stripe was written.
   bool tryWriteStripe(bool force = false);
-  void writeChunk(bool lastChunk = true);
+  // Returns 'true' if chunk was written.
+  bool writeChunk(bool lastChunk = true);
   uint32_t writeStripe();
 };
 
diff --git a/dwio/nimble/velox/tests/VeloxWriterTests.cpp b/dwio/nimble/velox/tests/VeloxWriterTests.cpp
@@ -22,6 +22,7 @@
 #include "dwio/nimble/tablet/Constants.h"
 #include "dwio/nimble/velox/ChunkedStream.h"
 #include "dwio/nimble/velox/EncodingLayoutTree.h"
+#include "dwio/nimble/velox/FlushPolicy.h"
 #include "dwio/nimble/velox/SchemaSerialization.h"
 #include "dwio/nimble/velox/StatsGenerated.h"
 #include "dwio/nimble/velox/VeloxReader.h"
@@ -289,7 +290,7 @@ std::vector<velox::RowVectorPtr> generateBatches(
   velox::VectorFuzzer fuzzer(
       {.vectorSize = size, .nullRatio = 0.1}, &pool, seed);
   std::vector<velox::RowVectorPtr> batches;
-
+  batches.reserve(batchCount);
   for (size_t i = 0; i < batchCount; ++i) {
     batches.push_back(fuzzer.fuzzInputFlatRow(type));
   }
@@ -1953,6 +1954,109 @@ TEST_F(VeloxWriterTests, RawSizeWritten) {
   ASSERT_EQ(expectedRawSize, rawSize);
 }
 
+struct ChunkFlushPolicyTestCase {
+  const size_t batchCount{20};
+  const bool enableChunking{true};
+  const uint64_t targetStripeSizeBytes{256 << 10};
+  const uint64_t writerMemoryHighThreshold{80 << 10};
+  const uint64_t writerMemoryLowThreshold{75 << 10};
+  const double compressionRatioFactor{1.0};
+  const uint32_t minStreamChunkRawSize{100};
+  const uint32_t expectedStripeCount{0};
+  const uint32_t expectedMaxChunkCount{0};
+  const uint32_t expectedMinChunkCount{0};
+};
+
+class ChunkFlushPolicyTest
+    : public VeloxWriterTests,
+      public ::testing::WithParamInterface<ChunkFlushPolicyTestCase> {};
+
+TEST_P(ChunkFlushPolicyTest, ChunkFlushPolicyIntegration) {
+  auto type = velox::ROW(
+      {{"BIGINT", velox::BIGINT()}, {"SMALLINT", velox::SMALLINT()}});
+  nimble::VeloxWriterOptions writerOptions{
+      .minStreamChunkRawSize = GetParam().minStreamChunkRawSize,
+      .flushPolicyFactory = GetParam().enableChunking
+          ? []() -> std::unique_ptr<nimble::FlushPolicy> {
+              return std::make_unique<nimble::ChunkFlushPolicy>(
+                  std::make_shared<const nimble::ChunkFlushPolicyConfig>(
+                      nimble::ChunkFlushPolicyConfig{
+                          .writerMemoryHighThreshold = GetParam().writerMemoryHighThreshold,
+                          .writerMemoryLowThreshold = GetParam().writerMemoryLowThreshold,
+                          .targetStripeSizeBytes = GetParam().targetStripeSizeBytes,
+                          .compressionRatioFactor =
+                              GetParam().compressionRatioFactor,
+                      }));
+            }
+          : []() -> std::unique_ptr<nimble::FlushPolicy> {
+              return std::make_unique<nimble::StripeRawSizeFlushPolicy>(
+                  GetParam().targetStripeSizeBytes);
+            },
+      .enableChunking = GetParam().enableChunking,
+  };
+
+  std::string file;
+  auto writeFile = std::make_unique<velox::InMemoryWriteFile>(&file);
+
+  nimble::VeloxWriter writer(
+      *rootPool_, type, std::move(writeFile), std::move(writerOptions));
+  auto batches = generateBatches(
+      type,
+      GetParam().batchCount,
+      /*size=*/4000,
+      /*seed=*/20221110,
+      *leafPool_);
+
+  for (const auto& batch : batches) {
+    writer.write(batch);
+  }
+  writer.close();
+
+  velox::InMemoryReadFile readFile(file);
+  auto selector = std::make_shared<velox::dwio::common::ColumnSelector>(type);
+  nimble::VeloxReader reader(*leafPool_, &readFile, std::move(selector));
+
+  // Verify stripe count
+  auto expectedStripeCount = GetParam().expectedStripeCount;
+  auto actualStripeCount = reader.tabletReader().stripeCount();
+  EXPECT_EQ(expectedStripeCount, actualStripeCount);
+
+  // Verify chunk count
+  auto chunkCountPair = [&]() {
+    nimble::TabletReader tablet{*leafPool_, &readFile};
+    uint32_t maxChunkCount = 0;
+    uint32_t minChunkCount = std::numeric_limits<uint32_t>::max();
+
+    for (uint32_t index = 0; index < actualStripeCount; ++index) {
+      auto stripeIdentifier = tablet.getStripeIdentifier(index);
+      auto streamCount = tablet.streamCount(stripeIdentifier);
+
+      std::vector<uint32_t> streamIds(streamCount);
+      std::iota(streamIds.begin(), streamIds.end(), 0);
+      auto streamLoaders = tablet.load(stripeIdentifier, streamIds);
+
+      for (auto& streamLoader : streamLoaders) {
+        if (!streamLoader) {
+          continue;
+        }
+        nimble::InMemoryChunkedStream chunked{
+            *leafPool_, std::move(streamLoader)};
+        uint32_t chunkCount = 0;
+        while (chunked.hasNext()) {
+          chunked.nextChunk();
+          chunkCount++;
+        }
+        maxChunkCount = std::max(maxChunkCount, chunkCount);
+        minChunkCount = std::min(minChunkCount, chunkCount);
+      }
+    }
+    return std::make_pair(maxChunkCount, minChunkCount);
+  };
+  auto [maxChunkCount, minChunkCount] = chunkCountPair();
+  EXPECT_EQ(GetParam().expectedMaxChunkCount, maxChunkCount);
+  EXPECT_EQ(GetParam().expectedMinChunkCount, minChunkCount);
+}
+
 INSTANTIATE_TEST_CASE_P(
     StripeRawSizeFlushPolicyTestSuite,
     StripeRawSizeFlushPolicyTest,
@@ -1977,4 +2081,91 @@ INSTANTIATE_TEST_CASE_P(
             .batchCount = 100,
             .rawStripeSize = 256 << 20,
             .stripeCount = 1}));
+
+INSTANTIATE_TEST_CASE_P(
+    ChunkFlushPolicyTestSuite,
+    ChunkFlushPolicyTest,
+    ::testing::Values(
+        // Base case (no chunking, StripeRawSizeFlushPolicy)
+        ChunkFlushPolicyTestCase{
+            .batchCount = 20,
+            .enableChunking = false,
+            .targetStripeSizeBytes = 250 << 10, // 250KB
+            .writerMemoryHighThreshold = 80 << 10,
+            .writerMemoryLowThreshold = 75 << 10,
+            .compressionRatioFactor = 1.0,
+            .minStreamChunkRawSize = 100,
+            .expectedStripeCount = 4,
+            .expectedMaxChunkCount = 1,
+            .expectedMinChunkCount = 1,
+        },
+        // Base case with default settings (has chunking)
+        ChunkFlushPolicyTestCase{
+            .batchCount = 20,
+            .enableChunking = true,
+            .targetStripeSizeBytes = 250 << 10, // 250KB
+            .writerMemoryHighThreshold = 80 << 10,
+            .writerMemoryLowThreshold = 75 << 10,
+            .compressionRatioFactor = 1.0,
+            .minStreamChunkRawSize = 100,
+            .expectedStripeCount = 3,
+            .expectedMaxChunkCount = 7,
+            .expectedMinChunkCount = 3,
+        },
+        // High memory regression threshold
+        // Produces file identical to StripeRawSizeFlushPolicy
+        ChunkFlushPolicyTestCase{
+            .batchCount = 20,
+            .enableChunking = true,
+            .targetStripeSizeBytes = 256 << 10,
+            .writerMemoryHighThreshold = 500 << 10, // +420KB
+            .writerMemoryLowThreshold = 495 << 10, // +420KB
+            .compressionRatioFactor = 1.0,
+            .minStreamChunkRawSize = 100,
+            .expectedStripeCount = 4,
+            .expectedMaxChunkCount = 1,
+            .expectedMinChunkCount = 1,
+        },
+        // Low memory regression threshold
+        // Produces file with more chunks per stripe
+        ChunkFlushPolicyTestCase{
+            .batchCount = 20,
+            .enableChunking = true,
+            .targetStripeSizeBytes = 256 << 10,
+            .writerMemoryHighThreshold = 40 << 10, // -40KB
+            .writerMemoryLowThreshold = 35 << 10, // -40KB
+            .compressionRatioFactor = 1.0,
+            .minStreamChunkRawSize = 100,
+            .expectedStripeCount = 3,
+            .expectedMaxChunkCount = 8,
+            .expectedMinChunkCount = 4,
+        },
+        // High target stripe size bytes (with disabled memory pressure
+        // optimization) produces fewer stripes. Single chunks.
+        ChunkFlushPolicyTestCase{
+            .batchCount = 20,
+            .enableChunking = true,
+            .targetStripeSizeBytes = 900 << 10, // +900KB
+            .writerMemoryHighThreshold = 2 << 20, // +2MB
+            .writerMemoryLowThreshold = 1 << 20, // +1MB
+            .compressionRatioFactor = 1.0,
+            .minStreamChunkRawSize = 100,
+            .expectedStripeCount = 1, // -2 stripes
+            .expectedMaxChunkCount = 1,
+            .expectedMinChunkCount = 1,
+        },
+        // Low target stripe size bytes (with disabled memory pressure
+        // optimization) produces more stripes. Single chunks.
+        ChunkFlushPolicyTestCase{
+            .batchCount = 20,
+            .enableChunking = true,
+            .targetStripeSizeBytes = 90 << 10, // -160KB
+            .writerMemoryHighThreshold = 2 << 20, // +2MB
+            .writerMemoryLowThreshold = 1 << 20, // +1MB
+            .compressionRatioFactor = 1.0,
+            .minStreamChunkRawSize = 100,
+            .expectedStripeCount = 7, // +6 stripes
+            .expectedMaxChunkCount = 1,
+            .expectedMinChunkCount = 1,
+        }));
 } // namespace facebook