
Commit 92978c7

macvincent authored and meta-codesync[bot] committed
Refactor Velox Writer to Use New Flush Policy Contract (facebookincubator#242)
Summary:
Pull Request resolved: facebookincubator#242

This should be a no-op, since no chunking flush policy is currently used in prod, but we make three changes in this diff:
1. `writeChunk` now returns a boolean to indicate whether any stream was successfully chunked.
2. The raw (logical) size of the encoded stripe data is now stored in the writer context.
3. We update and pass down the memory stats needed by the new flush policy contract.

TODO: We will introduce two more VeloxWriter changes in the next diffs in this stack to:
1. Support per-stream chunking instead of always chunking all eligible streams.
2. Support breaking large streams into multiple smaller chunks.

Differential Revision: D81545433
1 parent 6d8dfd3 commit 92978c7
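
For context on the flush policy contract referenced above: `tryWriteStripe` now builds a policy via `options.flushPolicyFactory()` and consults `shouldChunk`/`shouldFlush` with a `StripeProgress` snapshot that carries both the physical and the logical encoded stripe sizes. The sketch below illustrates that contract with simplified stand-in types; the class name `ThresholdFlushPolicy`, the `None` enum values, and the thresholds are assumptions for illustration and not part of this commit, while the `StripeProgress` field names, `FlushDecision::Stripe`, and `ChunkDecision::Chunk` come from the diff.

#include <cstdint>

// Simplified stand-ins for the contract consulted in tryWriteStripe(). The
// real types live in Nimble's flush policy header and may carry more members;
// the field names mirror the StripeProgress initializers in the diff below.
enum class FlushDecision { None, Stripe };
enum class ChunkDecision { None, Chunk };

struct StripeProgress {
  uint64_t stripeRawSize{0};            // raw data buffered but not yet encoded
  uint64_t stripeEncodedSize{0};        // physical size of already-encoded chunks
  uint64_t stripeEncodedLogicalSize{0}; // logical size of already-encoded chunks
};

// Hypothetical threshold policy: ask for a chunk once enough raw data is
// buffered, and ask for a stripe flush once encoded plus buffered data passes
// a hard limit. The thresholds are arbitrary illustration values.
class ThresholdFlushPolicy {
 public:
  ChunkDecision shouldChunk(const StripeProgress& progress) const {
    return progress.stripeRawSize > (32ULL << 20) ? ChunkDecision::Chunk
                                                  : ChunkDecision::None;
  }

  FlushDecision shouldFlush(const StripeProgress& progress) const {
    const uint64_t total = progress.stripeEncodedSize + progress.stripeRawSize;
    return total > (256ULL << 20) ? FlushDecision::Stripe
                                  : FlushDecision::None;
  }
};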

File tree

3 files changed: +373 −63 lines


dwio/nimble/velox/VeloxWriter.cpp

Lines changed: 180 additions & 62 deletions
@@ -44,7 +44,6 @@ namespace detail {
 class WriterContext : public FieldWriterContext {
  public:
   const VeloxWriterOptions options;
-  std::unique_ptr<FlushPolicy> flushPolicy;
   velox::CpuWallTiming totalFlushTiming;
   velox::CpuWallTiming stripeFlushTiming;
   velox::CpuWallTiming encodingSelectionTiming;
@@ -56,8 +55,11 @@ class WriterContext : public FieldWriterContext {
   uint64_t bytesWritten{0};
   uint64_t rowsInFile{0};
   uint64_t rowsInStripe{0};
-  uint64_t stripeSize{0};
-  uint64_t rawSize{0};
+  // Physical size of the encoded stripe data.
+  uint64_t stripeEncodedPhysicalSize{0};
+  // Logical size of the encoded stripe data.
+  uint64_t stripeEncodedLogicalSize{0};
+  uint64_t fileRawSize{0};
   std::vector<uint64_t> rowsPerStripe;

   WriterContext(
@@ -66,7 +68,6 @@ class WriterContext : public FieldWriterContext {
       : FieldWriterContext{memoryPool, options.reclaimerFactory(), options.vectorDecoderVisitor},
         options{std::move(options)},
         logger{this->options.metricsLogger} {
-    flushPolicy = this->options.flushPolicyFactory();
     inputBufferGrowthPolicy = this->options.lowMemoryMode
         ? std::make_unique<ExactGrowthPolicy>()
         : this->options.inputGrowthPolicyFactory();
@@ -81,7 +82,8 @@ class WriterContext : public FieldWriterContext {
     rowsPerStripe.push_back(rowsInStripe);
     memoryUsed = 0;
     rowsInStripe = 0;
-    stripeSize = 0;
+    stripeEncodedPhysicalSize = 0;
+    stripeEncodedLogicalSize = 0;
     ++stripeIndex_;
   }

@@ -99,6 +101,45 @@ namespace {

 constexpr uint32_t kInitialSchemaSectionSize = 1 << 20; // 1MB

+// When writing null streams, we write the nulls as data, and the stream itself
+// is non-nullable. This adpater class is how we expose the nulls as values.
+class NullsAsDataStreamData : public StreamData {
+ public:
+  explicit NullsAsDataStreamData(StreamData& streamData)
+      : StreamData(streamData.descriptor()), streamData_{streamData} {
+    streamData_.materialize();
+  }
+
+  inline virtual std::string_view data() const override {
+    return {
+        reinterpret_cast<const char*>(streamData_.nonNulls().data()),
+        streamData_.nonNulls().size()};
+  }
+
+  inline virtual std::span<const bool> nonNulls() const override {
+    return {};
+  }
+
+  inline virtual bool hasNulls() const override {
+    return false;
+  }
+
+  inline virtual bool empty() const override {
+    return streamData_.empty();
+  }
+
+  inline virtual uint64_t memoryUsed() const override {
+    return streamData_.memoryUsed();
+  }
+
+  inline virtual void reset() override {
+    streamData_.reset();
+  }
+
+ private:
+  StreamData& streamData_;
+};
+
 class WriterStreamContext : public StreamContext {
  public:
   bool isNullStream = false;
@@ -132,7 +173,7 @@ std::string_view encode(
   std::unique_ptr<EncodingSelectionPolicy<T>> policy;
   if (encodingLayout.has_value()) {
     policy = std::make_unique<ReplayedEncodingSelectionPolicy<T>>(
-        encodingLayout.value(),
+        std::move(encodingLayout).value(),
         context.options.compressionOptions,
         context.options.encodingSelectionPolicyFactory);

@@ -167,7 +208,7 @@ std::string_view encodeStreamTyped(
   }

   try {
-    return encode<T>(encodingLayout, context, buffer, streamData);
+    return encode<T>(std::move(encodingLayout), context, buffer, streamData);
   } catch (const NimbleUserError& e) {
     if (e.errorCode() != error_code::IncompatibleEncoding ||
         !encodingLayout.has_value()) {
@@ -214,7 +255,8 @@ template <typename Set>
 void findNodeIds(
     const velox::dwio::common::TypeWithId& typeWithId,
     Set& output,
-    std::function<bool(const velox::dwio::common::TypeWithId&)> predicate) {
+    const std::function<bool(const velox::dwio::common::TypeWithId&)>&
+        predicate) {
   if (predicate(typeWithId)) {
     output.insert(typeWithId.id());
   }
@@ -515,7 +557,7 @@ bool VeloxWriter::write(const velox::VectorPtr& vector) {
     auto rawSize = nimble::getRawSizeFromVector(
         vector, velox::common::Ranges::of(0, size));
     DWIO_ENSURE_GE(rawSize, 0, "Invalid raw size");
-    context_->rawSize += rawSize;
+    context_->fileRawSize += rawSize;

     if (context_->options.writeExecutor) {
       velox::dwio::common::ExecutorBarrier barrier{
@@ -580,7 +622,8 @@ void VeloxWriter::close() {
           *context_->schemaBuilder.getRoot(), context_->columnStats);
       // TODO(T228118622): Write column stats to file.
       flatbuffers::FlatBufferBuilder builder;
-      builder.Finish(serialization::CreateStats(builder, context_->rawSize));
+      builder.Finish(
+          serialization::CreateStats(builder, context_->fileRawSize));
       writer_.writeOptionalSection(
           std::string(kStatsSection),
           {reinterpret_cast<const char*>(builder.GetBufferPointer()),
@@ -650,45 +693,6 @@ void VeloxWriter::writeChunk(bool lastChunk) {
     }
     streams_.resize(context_->schemaBuilder.nodeCount());

-    // When writing null streams, we write the nulls as data, and the stream
-    // itself is non-nullable. This adpater class is how we expose the nulls as
-    // values.
-    class NullsAsDataStreamData : public StreamData {
-     public:
-      explicit NullsAsDataStreamData(StreamData& streamData)
-          : StreamData(streamData.descriptor()), streamData_{streamData} {
-        streamData_.materialize();
-      }
-
-      inline virtual std::string_view data() const override {
-        return {
-            reinterpret_cast<const char*>(streamData_.nonNulls().data()),
-            streamData_.nonNulls().size()};
-      }
-
-      inline virtual std::span<const bool> nonNulls() const override {
-        return {};
-      }
-
-      inline virtual bool hasNulls() const override {
-        return false;
-      }
-
-      inline virtual bool empty() const override {
-        return streamData_.empty();
-      }
-      inline virtual uint64_t memoryUsed() const override {
-        return streamData_.memoryUsed();
-      }
-
-      inline virtual void reset() override {
-        streamData_.reset();
-      }
-
-     private:
-      StreamData& streamData_;
-    };
-
     auto encode = [&](StreamData& streamData, uint64_t& streamSize) {
       const auto offset = streamData.descriptor().offset();
       auto encoded = encodeStream(*context_, *encodingBuffer_, streamData);
@@ -777,8 +781,110 @@ void VeloxWriter::writeChunk(bool lastChunk) {
     if (lastChunk) {
       root_->reset();
     }
+  }
+
+  // Consider getting this from flush timing.
+  auto flushWallTimeMs =
+      (context_->stripeFlushTiming.wallNanos - previousFlushWallTime) /
+      1'000'000;
+  VLOG(1) << "writeChunk milliseconds: " << flushWallTimeMs
+          << ", chunk bytes: " << chunkSize;
+}
+
+bool VeloxWriter::writeChunks(bool lastChunk) {
+  uint64_t previousFlushWallTime = context_->stripeFlushTiming.wallNanos;
+  std::atomic<uint64_t> chunkSize = 0;
+  std::atomic<uint64_t> logicalSizeBeforeEncoding = 0;
+  std::atomic<bool> wroteChunk = false;
+  {
+    LoggingScope scope{*context_->logger};
+    velox::CpuWallTimer veloxTimer{context_->stripeFlushTiming};
+
+    if (!encodingBuffer_) {
+      encodingBuffer_ = std::make_unique<Buffer>(*encodingMemoryPool_);
+    }
+    streams_.resize(context_->schemaBuilder.nodeCount());

-    context_->stripeSize += chunkSize;
+    auto encode = [&](StreamData& streamData) {
+      const auto* context =
+          streamData.descriptor().context<WriterStreamContext>();
+      std::string_view encoded;
+      if (context && context->isNullStream) {
+        encoded = encodeStream(
+            *context_, *encodingBuffer_, NullsAsDataStreamData{streamData});
+      } else {
+        encoded = encodeStream(*context_, *encodingBuffer_, streamData);
+      }
+
+      if (!encoded.empty()) {
+        const auto offset = streamData.descriptor().offset();
+        NIMBLE_DASSERT(offset < streams_.size(), "Stream offset out of range.");
+        auto& stream = streams_[offset];
+        auto& streamSize = context_->columnStats[offset].physicalSize;
+        ChunkedStreamWriter chunkWriter{*encodingBuffer_};
+        for (auto& buffer : chunkWriter.encode(encoded)) {
+          streamSize += buffer.size();
+          chunkSize += buffer.size();
+          stream.content.push_back(std::move(buffer));
+        }
+      }
+      wroteChunk = true;
+      logicalSizeBeforeEncoding += streamData.memoryUsed();
+      streamData.reset();
+    };
+
+    auto processStream = [&](StreamData& streamData) {
+      // TODO: Breakdown large streams above a threshold into smaller chunks.
+      // For null streams we will promote the null values to be written as
+      // boolean data. We still apply the same null logic, where if all values
+      // are non-nulls, we omit the entire stream.
+      const auto minStreamSize =
+          lastChunk ? 0 : context_->options.minStreamChunkRawSize;
+      const auto* context =
+          streamData.descriptor().context<WriterStreamContext>();
+      bool isNullStream = context && context->isNullStream;
+
+      bool shouldChunkStream = false;
+      if (isNullStream) {
+        shouldChunkStream = streamData.hasNulls() &&
+            streamData.nonNulls().size() > minStreamSize;
+      } else {
+        shouldChunkStream = streamData.data().size() > minStreamSize;
+      }
+
+      // If we have previous written chunks for this stream, during final
+      // chunk, always write any remaining data.
+      if (lastChunk && !shouldChunkStream &&
+          !streams_[streamData.descriptor().offset()].content.empty()) {
+        shouldChunkStream =
+            !streamData.empty() || !streamData.nonNulls().empty();
+      }
+
+      if (shouldChunkStream) {
+        encode(streamData);
+      }
+    };
+
+    if (context_->options.encodingExecutor) {
+      velox::dwio::common::ExecutorBarrier barrier{
+          context_->options.encodingExecutor};
+      for (auto& streamData : context_->streams()) {
+        barrier.add([&] { processStream(*streamData); });
+      }
+      barrier.waitAll();
+    } else {
+      for (auto& streamData : context_->streams()) {
+        processStream(*streamData);
+      }
+    }
+
+    if (lastChunk) {
+      root_->reset();
+    }
+
+    context_->stripeEncodedPhysicalSize += chunkSize;
+    context_->stripeEncodedLogicalSize += logicalSizeBeforeEncoding;
+    context_->memoryUsed -= logicalSizeBeforeEncoding;
   }

   // Consider getting this from flush timing.
@@ -787,10 +893,16 @@ void VeloxWriter::writeChunk(bool lastChunk) {
       1'000'000;
   VLOG(1) << "writeChunk milliseconds: " << flushWallTimeMs
           << ", chunk bytes: " << chunkSize;
+  return wroteChunk.load();
 }

 uint32_t VeloxWriter::writeStripe() {
-  writeChunk(true);
+  if (context_->options.enableChunking) {
+    writeChunks(true);
+
+  } else {
+    writeChunk(true);
+  }

   uint64_t previousFlushWallTime = context_->stripeFlushTiming.wallNanos;
   uint64_t stripeSize = 0;
@@ -840,37 +952,43 @@ bool VeloxWriter::tryWriteStripe(bool force) {
    return false;
  }

+  auto flushPolicy = context_->options.flushPolicyFactory();
+  NIMBLE_DASSERT(flushPolicy != nullptr, "Flush policy must not be null");
+
  auto shouldFlush = [&]() {
-    return context_->flushPolicy->shouldFlush(StripeProgress{
+    return flushPolicy->shouldFlush(StripeProgress{
        .stripeRawSize = context_->memoryUsed,
-        .stripeEncodedSize = context_->stripeSize});
+        .stripeEncodedSize = context_->stripeEncodedPhysicalSize,
+        .stripeEncodedLogicalSize = context_->stripeEncodedLogicalSize});
  };

  auto shouldChunk = [&]() {
-    return context_->flushPolicy->shouldChunk(StripeProgress{
+    return flushPolicy->shouldChunk(StripeProgress{
        .stripeRawSize = context_->memoryUsed,
-        .stripeEncodedSize = context_->stripeSize});
+        .stripeEncodedSize = context_->stripeEncodedPhysicalSize,
+        .stripeEncodedLogicalSize = context_->stripeEncodedLogicalSize,
+    });
  };

  try {
    // TODO: we can improve merge the last chunk write with stripe
-    if (context_->options.enableChunking &&
-        shouldChunk() == ChunkDecision::Chunk) {
-      writeChunk(false);
+    if (context_->options.enableChunking) {
+      while (shouldChunk() == ChunkDecision::Chunk && writeChunks(false)) {
+      }
    }

    auto decision = force ? FlushDecision::Stripe : shouldFlush();
    if (decision != FlushDecision::Stripe) {
      return false;
    }

+    uint32_t stripeSize = writeStripe();
    StripeFlushMetrics metrics{
-        .inputSize = context_->stripeSize,
+        .inputSize = context_->stripeEncodedPhysicalSize,
        .rowCount = context_->rowsInStripe,
+        .stripeSize = stripeSize,
        .trackedMemory = context_->memoryUsed,
    };
-
-    metrics.stripeSize = writeStripe();
    context_->logger->logStripeFlush(metrics);

    context_->nextStripe();
@@ -890,7 +1008,7 @@ VeloxWriter::RunStats VeloxWriter::getRunStats() const {
  return RunStats{
      .bytesWritten = context_->bytesWritten,
      .stripeCount = folly::to<uint32_t>(context_->getStripeIndex()),
-      .rawSize = context_->rawSize,
+      .rawSize = context_->fileRawSize,
      .rowsPerStripe = context_->rowsPerStripe,
      .flushCpuTimeUsec = context_->totalFlushTiming.cpuNanos / 1000,
      .flushWallTimeUsec = context_->totalFlushTiming.wallNanos / 1000,

dwio/nimble/velox/VeloxWriter.h

Lines changed: 2 additions & 0 deletions
@@ -87,6 +87,8 @@ class VeloxWriter {
   // Returning 'true' if stripe was written.
   bool tryWriteStripe(bool force = false);
   void writeChunk(bool lastChunk = true);
+  // Returns 'true' if chunks were written.
+  bool writeChunks(bool lastChunk = true);
   uint32_t writeStripe();
 };
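
Under the new contract, the chunking loop in `tryWriteStripe` keeps calling `writeChunks(false)` while the policy returns `ChunkDecision::Chunk` and the previous call actually chunked something. Below is a minimal sketch of that control flow, reusing the stand-in types from the earlier sketch; the function name and the lambda parameters stand in for the writer's internal state and are not part of the Nimble API.

#include <functional>

// Sketch of the chunk-then-flush decision flow exercised by this diff.
// writeChunks() mirrors VeloxWriter::writeChunks(false): its boolean return
// (whether any stream was chunked) is what terminates the loop when the
// policy still wants a chunk but no stream is eligible.
bool tryWriteStripeSketch(
    bool force,
    bool enableChunking,
    const ThresholdFlushPolicy& policy,
    const std::function<StripeProgress()>& progress,
    const std::function<bool()>& writeChunks,
    const std::function<void()>& writeStripe) {
  if (enableChunking) {
    // Keep chunking until the policy is satisfied or nothing was chunked.
    while (policy.shouldChunk(progress()) == ChunkDecision::Chunk &&
           writeChunks()) {
    }
  }

  const auto decision =
      force ? FlushDecision::Stripe : policy.shouldFlush(progress());
  if (decision != FlushDecision::Stripe) {
    return false;
  }

  writeStripe();
  return true;
}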
