111 changes: 111 additions & 0 deletions ml_samples/lmdb/lmdb_read_pattern.py
@@ -0,0 +1,111 @@
import lmdb
import numpy as np
Contributor (medium)

The numpy module is imported but is not used anywhere in the script. Unused imports should be removed to keep the code clean and avoid confusion.

import time
import os
import random

DB_DIR = '/home/princer_google_com/gcs/lmdb_test_env'
Contributor (critical)

The database directory DB_DIR is hardcoded to a user-specific absolute path. This makes the script non-portable and will cause it to fail on any other machine or in a different environment. It is highly recommended to use a relative path or make the path configurable, for example, through command-line arguments (see the sketch after the suggested change below).

Suggested change
DB_DIR = '/home/princer_google_com/gcs/lmdb_test_env'
DB_DIR = 'lmdb_test_env'
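
As an illustration of the command-line option mentioned above, a minimal argparse sketch (the flag name and default below are assumptions, not part of this PR):

import argparse

# Hypothetical CLI wiring for the benchmark script; --db-dir and its
# default value are illustrative only.
parser = argparse.ArgumentParser(description="LMDB read-pattern benchmark")
parser.add_argument("--db-dir", default="lmdb_test_env",
                    help="Directory for the LMDB environment")
args = parser.parse_args()
DB_DIR = args.db_dir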

full_path = os.path.abspath(DB_DIR)
Contributor (medium)

The full_path variable is defined but never used in the script. It should be removed to improve code clarity.

NUM_SAMPLES = 1000
SAMPLE_SIZE = 128 * 1024 # 128KB data block

def create_lmdb_db():
    """Creates an LMDB database, storing data as key-value pairs."""
    if os.path.exists(DB_DIR):
        # Clean up previous run
        import shutil
        # shutil.rmtree(DB_DIR)

    print(f"Creating LMDB database with {NUM_SAMPLES} samples...")
    # map_size is crucial, must be large enough for all data
    env = lmdb.open(DB_DIR, map_size=NUM_SAMPLES * SAMPLE_SIZE * 2)

    with env.begin(write=True) as txn:
        for i in range(NUM_SAMPLES):
            # Key is the index (needs to be bytes)
            key = str(i).encode('ascii')

            # Value is the image/feature data + label (serialized)
            label = str(i % 10).encode('ascii')
            data_block = os.urandom(SAMPLE_SIZE)
            value = label + b'_' + data_block

            txn.put(key, value)

    env.close()
Contributor (medium)

This env.close() call is not guaranteed to be executed if an error occurs earlier in the function. To ensure the LMDB environment is always closed properly, it's recommended to use lmdb.open as a context manager.

For example, you could change line 21 to with lmdb.open(DB_DIR, map_size=NUM_SAMPLES * SAMPLE_SIZE * 2) as env:, indent the following block (lines 23-33), and then remove this env.close() call.
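
For reference, a minimal sketch of that context-manager form, reusing the constants from this script (the same pattern applies to the read-only opens in run_random_read and run_sequential_read):

# Sketch only: lmdb.open() used as a context manager so the environment
# is closed even if an error is raised inside the block.
with lmdb.open(DB_DIR, map_size=NUM_SAMPLES * SAMPLE_SIZE * 2) as env:
    with env.begin(write=True) as txn:
        for i in range(NUM_SAMPLES):
            key = str(i).encode('ascii')
            label = str(i % 10).encode('ascii')
            txn.put(key, label + b'_' + os.urandom(SAMPLE_SIZE))
# env.close() is no longer needed once the with-block exits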

print(f"Database created: {DB_DIR}")

def run_random_read():
"""Simulates a DataLoader requesting a batch of random indices."""
print("\n--- Running LMDB Random Read Pattern (simulate batching) ---")
env = lmdb.open(DB_DIR, readonly=True, lock=False)

# Simulate reading a batch of 100 random samples 100 times (10,0 lookups total)
NUM_BATCHES = 10
BATCH_SIZE = 5

start_time = time.time()

with env.begin() as txn:
for _ in range(NUM_BATCHES):
# --- The LMDBDataset pattern: Random access by key (index) ---
random_indices = random.sample(range(NUM_SAMPLES), BATCH_SIZE)

batch_data = []
for idx in random_indices:
key = str(idx).encode('ascii')
value = txn.get(key)

# Simulate deserialization (getting the actual data)
label, data = value.split(b'_', 1)
batch_data.append((label, data))
Comment on lines +60 to +61
Contributor (medium)

txn.get(key) will return None if the key is not found in the database. The subsequent call to value.split() would then raise an AttributeError. It is safer to add a check to ensure value is not None before attempting to process it.

Suggested change
                label, data = value.split(b'_', 1)
                batch_data.append((label, data))
                if value is not None:
                    label, data = value.split(b'_', 1)
                    batch_data.append((label, data))


            # The model consumes the batch_data here
            pass

    end_time = time.time()
    read_duration = end_time - start_time
    print(f"Total read time (10,000 random samples): {read_duration:.4f} seconds")
Contributor (high)

The log message here, and the comment on line 43, state that 10,000 random samples are being read. However, with NUM_BATCHES = 10 and BATCH_SIZE = 5, only 10 * 5 = 50 samples are actually read. This is misleading and should be corrected to reflect the actual number of operations.

Suggested change
    print(f"Total read time (10,000 random samples): {read_duration:.4f} seconds")
    print(f"Total read time ({NUM_BATCHES * BATCH_SIZE} random samples): {read_duration:.4f} seconds")


    env.close()
Contributor (medium)

This env.close() call is not guaranteed to be executed if an error occurs earlier in the function. To ensure the LMDB environment is always closed properly, it's recommended to use lmdb.open as a context manager.

For example, you could change line 41 to with lmdb.open(DB_DIR, readonly=True, lock=False) as env:, indent the following block, and then remove this env.close() call.


def run_sequential_read():
    """Reads all samples sequentially using an LMDB cursor."""
    if not os.path.exists(DB_DIR):
        print(f"Error: LMDB environment directory '{DB_DIR}' not found. Please run the creation script first.")
        return

    print("\n--- Running LMDB Sequential Read Pattern (Cursor) ---")
    env = lmdb.open(DB_DIR, readonly=True, lock=False)

    count = 0
    start_time = time.time()

    with env.begin() as txn:
        # Create a cursor to iterate through the database
        cursor = txn.cursor()

        # Iterate over all key-value pairs sequentially
        # The .iternext() method is highly efficient
        for key, value in cursor:
            # Simulate processing the data (e.g., deserializing an image)
            # label, data = value.split(b'_', 1)
            count += 1
            if count >= NUM_SAMPLES:
                break  # Ensure we don't go past the expected number of samples
Comment on lines +94 to +95
Contributor (medium)

The break condition if count >= NUM_SAMPLES: is redundant. The cursor iterator (for key, value in cursor:) will naturally stop after iterating over all key-value pairs in the database. Since the database is created by create_lmdb_db with NUM_SAMPLES items, this check is unnecessary and can be removed for better clarity.


    end_time = time.time()
    read_duration = end_time - start_time

    print(f"Total read time ({count} samples sequentially): {read_duration:.4f} seconds")

    env.close()
Contributor (medium)

This env.close() call is not guaranteed to be executed if an error occurs earlier in the function. To ensure the LMDB environment is always closed properly, it's recommended to use lmdb.open as a context manager.

For example, you could change line 79 to with lmdb.open(DB_DIR, readonly=True, lock=False) as env:, indent the following block, and then remove this env.close() call.


if __name__ == '__main__':
    # create_lmdb_db()
    # run_random_read()
    run_sequential_read()

    # # Cleanup
    # import shutil
    # shutil.rmtree(DB_DIR)
75 changes: 75 additions & 0 deletions ml_samples/lmdb/rand_read_pattern.txt
@@ -0,0 +1,75 @@

Reader: lmdb_test_env/data.mdb
Total ranges added: 151
Final ranges (after merge): 60
Max offset: 215359488 bytes (210312.00 KB)

Range# | 0 21031.2K 42062.4K 63093.6K 84124.8K 105156.0K 126187.2K 147218.4K 168249.6K 189280.8K
--------------------------------------------------------------------------------------------------------------
0 | █ | [0, 49152) (len: 49152)
1 | █ | [81104896, 81235968) (len: 131072)
2 | █ | [95862784, 96256000) (len: 393216)
3 | █ | [81383424, 81514496) (len: 131072)
4 | █ | [106270720, 106663936) (len: 393216)
5 | █ | [81661952, 81793024) (len: 131072)
6 | █ | [130600960, 130994176) (len: 393216)
7 | █ | [171831296, 171962368) (len: 131072)
8 | █ | [168316928, 168710144) (len: 393216)
9 | █ | [81522688, 81653760) (len: 131072)
10 | █ | [120868864, 121262080) (len: 393216)
11 | █ | [184541184, 184672256) (len: 131072)
12 | █ | [182648832, 183042048) (len: 393216)
13 | █ | [158986240, 159117312) (len: 131072)
14 | █ | [159395840, 159789056) (len: 393216)
15 | █ | [130994176, 131321856) (len: 327680)
16 | █ | [197251072, 197382144) (len: 131072)
17 | █ | [188059648, 188452864) (len: 393216)
18 | █ | [115056640, 115449856) (len: 393216)
19 | █ | [139792384, 140185600) (len: 393216)
20 | █ | [181702656, 182095872) (len: 393216)
21 | █ | [189681664, 190074880) (len: 393216)
22 | █ | [117895168, 118288384) (len: 393216)
23 | █ | [166154240, 166547456) (len: 393216)
24 | █ | [92078080, 92471296) (len: 393216)
25 | █ | [160882688, 161275904) (len: 393216)
26 | █ | [81653760, 81661952) (len: 8192)
27 | █ | [81793024, 81915904) (len: 122880)
28 | █ | [178458624, 178851840) (len: 393216)
29 | █ | [104108032, 104501248) (len: 393216)
30 | █ | [175890432, 176283648) (len: 393216)
31 | █ | [209825792, 209956864) (len: 131072)
32 | █ | [214966272, 215359488) (len: 393216)
33 | █ | [108568576, 108961792) (len: 393216)
34 | █ | [198877184, 199270400) (len: 393216)
35 | █ | [205905920, 206299136) (len: 393216)
36 | █ | [208068608, 208461824) (len: 393216)
37 | █ | [127086592, 127479808) (len: 393216)
38 | █ | [133439488, 133832704) (len: 393216)
39 | █ | [90456064, 90849280) (len: 393216)
40 | █ | [156553216, 156946432) (len: 393216)
41 | █ | [163991552, 164384768) (len: 393216)
42 | █ | [90849280, 90980352) (len: 131072)
43 | █ | [101675008, 102068224) (len: 393216)
44 | █ | [190074880, 190205952) (len: 131072)
45 | █ | [190357504, 190750720) (len: 393216)
46 | █ | [94375936, 94769152) (len: 393216)
47 | █ | [208461824, 208592896) (len: 131072)
48 | █ | [196304896, 196698112) (len: 393216)
49 | █ | [174809088, 175202304) (len: 393216)
50 | █ | [201445376, 201838592) (len: 393216)
51 | █ | [129654784, 130048000) (len: 393216)
52 | █ | [162369536, 162762752) (len: 393216)
53 | █ | [197931008, 198324224) (len: 393216)
54 | █ | [170344448, 170737664) (len: 393216)
55 | █ | [203067392, 203460608) (len: 393216)
56 | █ | [110460928, 110854144) (len: 393216)
57 | █ | [93159424, 93552640) (len: 393216)
58 | █ | [188870656, 189263872) (len: 393216)
59 | █ | [111001600, 111394816) (len: 393216)

Summary Statistics:
Total bytes read: 19382272 (18928.00 KB)
Average range size: 323037 bytes
Min range size: 8192 bytes
Max range size: 393216 bytes
Read pattern analysis: Random (gaps: 52)
2 changes: 2 additions & 0 deletions ml_samples/lmdb/requirements.txt
@@ -0,0 +1,2 @@
numpy 2.2.6
Contributor (medium)

The numpy dependency is listed here, but it is not actually used in the lmdb_read_pattern.py script (the corresponding import is unused). Unused dependencies should be removed to keep the project's requirements minimal.

lmdb 1.7.3