Update documentation

vectordotdev · Dec 18, 2024 · ae39cc0 · ae39cc0
1 parent 5d11fd8
commit ae39cc0
Show file tree

Hide file tree

Showing 4 changed files with 12 additions and 3 deletions.
diff --git a/changelog.d/22050-fingerprint-uncompressed-file-content.fix.md b/changelog.d/22050-fingerprint-uncompressed-file-content.fix.md
@@ -0,0 +1,6 @@
+Changes the fingerprinter for file sources to use uncompressed file content
+as a source of truth when fingerprinting lines and checking
+ignored_header_bytes. Previously, the compressed bytes were used. Only gzip is
+supported for now.
+
+authors: roykim98 
diff --git a/src/sources/file.rs b/src/sources/file.rs
@@ -298,6 +298,7 @@ pub enum FingerprintConfig {
         bytes: Option<usize>,
 
         /// The number of bytes to skip ahead (or ignore) when reading the data used for generating the checksum.
+        /// If the file is compressed, the number of bytes refers to the header in the uncompressed content.
         ///
         /// This can be helpful if all files share a common header that should be skipped.
         #[serde(default = "default_ignored_header_bytes")]
@@ -306,7 +307,7 @@ pub enum FingerprintConfig {
 
         /// The number of lines to read for generating the checksum.
         ///
-        /// If your files share a common header that is not always a fixed size,
+        /// The number of lines is determined from the uncompressed content if the file is compressed.
         ///
         /// If the file has less than this amount of lines, it won’t be read at all.
         #[serde(default = "default_lines")]

diff --git a/website/cue/reference/components/sources/base/file.cue b/website/cue/reference/components/sources/base/file.cue
@@ -98,6 +98,7 @@ base: components: sources: file: configuration: {
 			ignored_header_bytes: {
 				description: """
 					The number of bytes to skip ahead (or ignore) when reading the data used for generating the checksum.
+					If the file is compressed, the number of bytes refers to the header in the uncompressed content.
 
 					This can be helpful if all files share a common header that should be skipped.
 					"""
@@ -112,7 +113,7 @@ base: components: sources: file: configuration: {
 				description: """
 					The number of lines to read for generating the checksum.
 
-					If your files share a common header that is not always a fixed size,
+					The number of lines is determined from the uncompressed content if the file is compressed.
 
 					If the file has less than this amount of lines, it won’t be read at all.
 					"""

diff --git a/website/cue/reference/components/sources/file.cue b/website/cue/reference/components/sources/file.cue
@@ -219,7 +219,8 @@ components: sources: file: {
 				check](\(urls.crc)) (CRC) on the first N lines of the file. This serves as a
 				*fingerprint* that uniquely identifies the file. The number of lines, N, that are
 				read can be set using the [`fingerprint.lines`](#fingerprint.lines) and
-				[`fingerprint.ignored_header_bytes`](#fingerprint.ignored_header_bytes) options.
+				[`fingerprint.ignored_header_bytes`](#fingerprint.ignored_header_bytes) options. Note
+				that for compressed files, these lines and header bytes refer to the uncompressed content.
 
 				This strategy avoids the common pitfalls associated with using device and inode
 				names since inode names can be reused across files. This enables Vector to properly