diff --git a/modules/parquet-data-format/src/main/java/com/parquet/parquetdataformat/bridge/NativeParquetWriter.java b/modules/parquet-data-format/src/main/java/com/parquet/parquetdataformat/bridge/NativeParquetWriter.java
index 1ffa170c0ad98..bc8932c3c8833 100644
--- a/modules/parquet-data-format/src/main/java/com/parquet/parquetdataformat/bridge/NativeParquetWriter.java
+++ b/modules/parquet-data-format/src/main/java/com/parquet/parquetdataformat/bridge/NativeParquetWriter.java
@@ -10,12 +10,14 @@
import java.io.Closeable;
import java.io.IOException;
+import java.util.concurrent.atomic.AtomicBoolean;
/**
* Type-safe handle for native Parquet writer with lifecycle management.
*/
public class NativeParquetWriter implements Closeable {
+ private final AtomicBoolean writerClosed = new AtomicBoolean(false);
private final String filePath;
/**
@@ -47,15 +49,23 @@ public void flush() throws IOException {
RustBridge.flushToDisk(filePath);
}
+ private ParquetFileMetadata metadata;
+
@Override
public void close() {
- try {
- RustBridge.closeWriter(filePath);
- } catch (IOException e) {
- throw new RuntimeException("Failed to close Parquet writer for " + filePath, e);
+ if (writerClosed.compareAndSet(false, true)) {
+ try {
+ metadata = RustBridge.closeWriter(filePath);
+ } catch (IOException e) {
+ throw new RuntimeException("Failed to close Parquet writer for " + filePath, e);
+ }
}
}
+ public ParquetFileMetadata getMetadata() {
+ return metadata;
+ }
+
public String getFilePath() {
return filePath;
}
diff --git a/modules/parquet-data-format/src/main/java/com/parquet/parquetdataformat/bridge/ParquetFileMetadata.java b/modules/parquet-data-format/src/main/java/com/parquet/parquetdataformat/bridge/ParquetFileMetadata.java
new file mode 100644
index 0000000000000..fc309857be290
--- /dev/null
+++ b/modules/parquet-data-format/src/main/java/com/parquet/parquetdataformat/bridge/ParquetFileMetadata.java
@@ -0,0 +1,17 @@
+package com.parquet.parquetdataformat.bridge;
+
+/**
+ * Immutable metadata describing a Parquet file, as reported by the native
+ * layer when a writer is closed.
+ *
+ * <p>Being a {@code record}, the canonical constructor, the accessors,
+ * {@code equals}, {@code hashCode} and {@code toString} are all generated
+ * by the compiler; no hand-written boilerplate is required.
+ *
+ * @param version   the Parquet format version used
+ * @param numRows   the total number of rows in the file
+ * @param createdBy the application/library that created the file (may be
+ *                  {@code null} when not available)
+ */
+public record ParquetFileMetadata(int version, long numRows, String createdBy) {
+}
diff --git a/modules/parquet-data-format/src/main/java/com/parquet/parquetdataformat/bridge/RustBridge.java b/modules/parquet-data-format/src/main/java/com/parquet/parquetdataformat/bridge/RustBridge.java
index 408ef74ea44f7..ebc7af2f7a2bd 100644
--- a/modules/parquet-data-format/src/main/java/com/parquet/parquetdataformat/bridge/RustBridge.java
+++ b/modules/parquet-data-format/src/main/java/com/parquet/parquetdataformat/bridge/RustBridge.java
@@ -31,8 +31,9 @@ public class RustBridge {
// Enhanced native methods that handle validation and provide better error reporting
public static native void createWriter(String file, long schemaAddress) throws IOException;
public static native void write(String file, long arrayAddress, long schemaAddress) throws IOException;
- public static native void closeWriter(String file) throws IOException;
+ public static native ParquetFileMetadata closeWriter(String file) throws IOException;
public static native void flushToDisk(String file) throws IOException;
+ public static native ParquetFileMetadata getFileMetadata(String file) throws IOException;
public static native long getFilteredNativeBytesUsed(String pathPrefix);
diff --git a/modules/parquet-data-format/src/main/java/com/parquet/parquetdataformat/vsr/VSRManager.java b/modules/parquet-data-format/src/main/java/com/parquet/parquetdataformat/vsr/VSRManager.java
index 8f900a4084821..5c404ce0ff586 100644
--- a/modules/parquet-data-format/src/main/java/com/parquet/parquetdataformat/vsr/VSRManager.java
+++ b/modules/parquet-data-format/src/main/java/com/parquet/parquetdataformat/vsr/VSRManager.java
@@ -10,19 +10,16 @@
import com.parquet.parquetdataformat.bridge.ArrowExport;
import com.parquet.parquetdataformat.bridge.NativeParquetWriter;
+import com.parquet.parquetdataformat.bridge.ParquetFileMetadata;
import com.parquet.parquetdataformat.memory.ArrowBufferPool;
import com.parquet.parquetdataformat.writer.ParquetDocumentInput;
-import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.opensearch.index.engine.exec.FlushIn;
import org.opensearch.index.engine.exec.WriteResult;
-import java.io.Closeable;
import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;
/**
@@ -107,7 +104,7 @@ public WriteResult addToManagedVSR(ParquetDocumentInput document) throws IOExcep
}
}
- public String flush(FlushIn flushIn) throws IOException {
+ public ParquetFileMetadata flush(FlushIn flushIn) throws IOException {
ManagedVSR currentVSR = managedVSR.get();
logger.info("Flush called for {}, row count: {}", fileName, currentVSR.getRowCount());
try {
@@ -120,15 +117,17 @@ public String flush(FlushIn flushIn) throws IOException {
// Transition VSR to FROZEN state before flushing
currentVSR.moveToFrozen();
logger.info("Flushing {} rows for {}", currentVSR.getRowCount(), fileName);
+ ParquetFileMetadata metadata;
// Write through native writer handle
try (ArrowExport export = currentVSR.exportToArrow()) {
writer.write(export.getArrayAddress(), export.getSchemaAddress());
writer.close();
+ metadata = writer.getMetadata();
}
- logger.info("Successfully flushed data for {}", fileName);
+ logger.debug("Successfully flushed data for {} with metadata: {}", fileName, metadata);
- return fileName;
+ return metadata;
} catch (Exception e) {
logger.error("Error in flush for {}: {}", fileName, e.getMessage(), e);
throw new IOException("Failed to flush data: " + e.getMessage(), e);
diff --git a/modules/parquet-data-format/src/main/java/com/parquet/parquetdataformat/writer/ParquetWriter.java b/modules/parquet-data-format/src/main/java/com/parquet/parquetdataformat/writer/ParquetWriter.java
index 84df70879e550..d6820d4df5aec 100644
--- a/modules/parquet-data-format/src/main/java/com/parquet/parquetdataformat/writer/ParquetWriter.java
+++ b/modules/parquet-data-format/src/main/java/com/parquet/parquetdataformat/writer/ParquetWriter.java
@@ -1,5 +1,6 @@
package com.parquet.parquetdataformat.writer;
+import com.parquet.parquetdataformat.bridge.ParquetFileMetadata;
import com.parquet.parquetdataformat.memory.ArrowBufferPool;
import com.parquet.parquetdataformat.vsr.VSRManager;
import org.apache.arrow.vector.types.pojo.Schema;
@@ -58,16 +59,17 @@ public WriteResult addDoc(ParquetDocumentInput d) throws IOException {
@Override
public FileInfos flush(FlushIn flushIn) throws IOException {
- String fileName = vsrManager.flush(flushIn);
+ ParquetFileMetadata parquetFileMetadata = vsrManager.flush(flushIn);
// no data flushed
- if (fileName == null) {
+ if (parquetFileMetadata == null) {
return FileInfos.empty();
}
- Path file = Path.of(fileName);
+ Path filePath = Path.of(file);
WriterFileSet writerFileSet = WriterFileSet.builder()
- .directory(file.getParent())
+ .directory(filePath.getParent())
.writerGeneration(writerGeneration)
- .addFile(file.getFileName().toString())
+ .addFile(filePath.getFileName().toString())
+ .addNumRows(parquetFileMetadata.numRows())
.build();
return FileInfos.builder().putWriterFileSet(PARQUET_DATA_FORMAT, writerFileSet).build();
}
diff --git a/modules/parquet-data-format/src/main/rust/src/lib.rs b/modules/parquet-data-format/src/main/rust/src/lib.rs
index 1ef5e7c40da6e..8e1e8f19e323d 100644
--- a/modules/parquet-data-format/src/main/rust/src/lib.rs
+++ b/modules/parquet-data-format/src/main/rust/src/lib.rs
@@ -1,8 +1,8 @@
use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema};
use arrow::record_batch::RecordBatch;
use dashmap::DashMap;
-use jni::objects::{JClass, JString};
-use jni::sys::{jint, jlong};
+use jni::objects::{JClass, JString, JObject};
+use jni::sys::{jint, jlong, jobject};
use jni::JNIEnv;
use lazy_static::lazy_static;
use parquet::arrow::ArrowWriter;
@@ -10,6 +10,9 @@ use parquet::basic::{Compression, ZstdLevel};
use parquet::file::properties::WriterProperties;
use std::fs::File;
use std::sync::{Arc, Mutex};
+use parquet::format::FileMetaData as FormatFileMetaData;
+use parquet::file::metadata::FileMetaData as FileFileMetaData;
+use parquet::file::reader::{FileReader, SerializedFileReader};
pub mod logger;
pub mod parquet_merge;
@@ -118,7 +121,7 @@ impl NativeParquetWriter {
}
}
- fn close_writer(filename: String) -> Result<(), Box> {
+ fn close_writer(filename: String) -> Result