-
Notifications
You must be signed in to change notification settings - Fork 4
feat: adding script to read from lmdb #task #57
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,111 @@ | ||||||||||||
| import lmdb | ||||||||||||
| import numpy as np | ||||||||||||
| import time | ||||||||||||
| import os | ||||||||||||
| import random | ||||||||||||
|
|
||||||||||||
# Directory holding the LMDB environment used by every function in this script.
# Set the LMDB_TEST_DB_DIR environment variable to point the benchmark at a
# different location; when it is unset the default path below is used.
DB_DIR = os.environ.get('LMDB_TEST_DB_DIR', '/home/princer_google_com/gcs/lmdb_test_env')
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The database directory
Suggested change
|
||||||||||||
# Absolute form of DB_DIR.
# NOTE(review): no code visible in this script reads full_path - confirm it is
# still needed before relying on it.
full_path = os.path.abspath(DB_DIR)
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||||||||||
# Number of records written by create_lmdb_db(); also the index range sampled
# by run_random_read() and the read cap applied by run_sequential_read().
NUM_SAMPLES = 1000
# Size in bytes of the random payload stored in each record (the stored value
# additionally carries a short label prefix and a b'_' separator).
SAMPLE_SIZE = 128 * 1024 # 128KB data block
|
|
||||||||||||
def create_lmdb_db():
    """Populate the LMDB environment at DB_DIR with NUM_SAMPLES records.

    All records are written inside a single write transaction. Each key is
    the sample index as ASCII decimal bytes; each value is
    ``<label> + b'_' + <payload>``, where the label is ``index % 10`` as ASCII
    and the payload is SAMPLE_SIZE bytes from ``os.urandom``.

    An existing directory at DB_DIR is not removed first; the environment is
    opened in place and the records are put into it.
    """
    print(f"Creating LMDB database with {NUM_SAMPLES} samples...")
    # map_size is crucial: it must be large enough for all data. Twice the raw
    # payload size leaves headroom for keys, labels and storage overhead.
    env = lmdb.open(DB_DIR, map_size=NUM_SAMPLES * SAMPLE_SIZE * 2)
    try:
        with env.begin(write=True) as txn:
            for i in range(NUM_SAMPLES):
                # Key is the index (needs to be bytes).
                key = str(i).encode('ascii')

                # Value is the label plus the serialized data block.
                label = str(i % 10).encode('ascii')
                data_block = os.urandom(SAMPLE_SIZE)
                txn.put(key, label + b'_' + data_block)
    finally:
        # Close the environment even if the write transaction fails, so the
        # handle is never leaked.
        env.close()

    print(f"Database created: {DB_DIR}")
|
|
||||||||||||
def run_random_read(num_batches=10, batch_size=5):
    """Time random point lookups, simulating a DataLoader fetching batches.

    For each batch, ``batch_size`` distinct indices are drawn from
    ``range(NUM_SAMPLES)``, each is looked up by key in one shared read
    transaction, and the stored value is split into ``(label, data)``.

    Args:
        num_batches: Number of batches to read.
        batch_size: Number of distinct random indices per batch.

    Raises:
        ValueError: If ``batch_size`` is larger than NUM_SAMPLES (raised by
            ``random.sample``).
        KeyError: If a sampled key is not present in the database.
    """
    if not os.path.exists(DB_DIR):
        print(f"Error: LMDB environment directory '{DB_DIR}' not found. Please run the creation script first.")
        return

    print("\n--- Running LMDB Random Read Pattern (simulate batching) ---")
    env = lmdb.open(DB_DIR, readonly=True, lock=False)

    # Report the number of lookups actually performed rather than a fixed figure.
    total_reads = num_batches * batch_size
    try:
        # perf_counter is monotonic, so the measurement cannot be skewed by
        # wall-clock adjustments.
        start_time = time.perf_counter()

        with env.begin() as txn:
            for _ in range(num_batches):
                # The LMDBDataset pattern: random access by key (index).
                random_indices = random.sample(range(NUM_SAMPLES), batch_size)

                batch_data = []
                for idx in random_indices:
                    key = str(idx).encode('ascii')
                    value = txn.get(key)
                    if value is None:
                        # Fail with a clear message instead of an
                        # AttributeError from calling split() on None.
                        raise KeyError(f"Key {key!r} not found in LMDB database '{DB_DIR}'")

                    # Simulate deserialization (getting the actual data).
                    label, data = value.split(b'_', 1)
                    batch_data.append((label, data))

                # The model would consume batch_data here.

        read_duration = time.perf_counter() - start_time
    finally:
        # Always release the environment, including when a lookup fails.
        env.close()

    print(f"Total read time ({total_reads} random samples): {read_duration:.4f} seconds")
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This For example, you could change line 41 to |
||||||||||||
|
|
||||||||||||
def run_sequential_read():
    """Time a sequential scan of the database using an LMDB cursor.

    Iterates the cursor's ``(key, value)`` pairs, counting records, and stops
    once NUM_SAMPLES records have been read. Prints an error and returns
    without reading if DB_DIR does not exist.
    """
    if not os.path.exists(DB_DIR):
        print(f"Error: LMDB environment directory '{DB_DIR}' not found. Please run the creation script first.")
        return

    print("\n--- Running LMDB Sequential Read Pattern (Cursor) ---")
    env = lmdb.open(DB_DIR, readonly=True, lock=False)

    count = 0
    try:
        # perf_counter is monotonic, so the measurement cannot be skewed by
        # wall-clock adjustments.
        start_time = time.perf_counter()

        with env.begin() as txn:
            # Iterate the cursor directly to walk the key-value pairs in order.
            for _key, _value in txn.cursor():
                # Processing (e.g. splitting the label from the payload) is
                # deliberately skipped; only the read itself is timed.
                count += 1
                if count >= NUM_SAMPLES:
                    break  # Ensure we don't go past the expected number of samples

        read_duration = time.perf_counter() - start_time
    finally:
        # Always release the environment, including when the scan fails.
        env.close()

    print(f"Total read time ({count} samples sequentially): {read_duration:.4f} seconds")
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This For example, you could change line 79 to |
||||||||||||
|
|
||||||||||||
if __name__ == '__main__':
    import shutil
    import sys

    # Steps are chosen on the command line instead of by commenting code in
    # and out, e.g. ``python script.py create random sequential cleanup``.
    # With no arguments only the sequential read runs.
    steps = {
        'create': create_lmdb_db,
        'random': run_random_read,
        'sequential': run_sequential_read,
        # Destructive: removes the whole LMDB directory. Runs only on request.
        'cleanup': lambda: shutil.rmtree(DB_DIR),
    }

    requested = sys.argv[1:] or ['sequential']
    unknown = [name for name in requested if name not in steps]
    if unknown:
        sys.exit(f"Unknown step(s): {', '.join(unknown)}. Choose from: {', '.join(steps)}")

    for name in requested:
        steps[name]()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,75 @@ | ||
|
|
||
| Reader: lmdb_test_env/data.mdb | ||
| Total ranges added: 151 | ||
| Final ranges (after merge): 60 | ||
| Max offset: 215359488 bytes (210312.00 KB) | ||
|
|
||
| Range# | 0 21031.2K 42062.4K 63093.6K 84124.8K 105156.0K 126187.2K 147218.4K 168249.6K 189280.8K | ||
| -------------------------------------------------------------------------------------------------------------- | ||
| 0 | █ | [0, 49152) (len: 49152) | ||
| 1 | █ | [81104896, 81235968) (len: 131072) | ||
| 2 | █ | [95862784, 96256000) (len: 393216) | ||
| 3 | █ | [81383424, 81514496) (len: 131072) | ||
| 4 | █ | [106270720, 106663936) (len: 393216) | ||
| 5 | █ | [81661952, 81793024) (len: 131072) | ||
| 6 | █ | [130600960, 130994176) (len: 393216) | ||
| 7 | █ | [171831296, 171962368) (len: 131072) | ||
| 8 | █ | [168316928, 168710144) (len: 393216) | ||
| 9 | █ | [81522688, 81653760) (len: 131072) | ||
| 10 | █ | [120868864, 121262080) (len: 393216) | ||
| 11 | █ | [184541184, 184672256) (len: 131072) | ||
| 12 | █ | [182648832, 183042048) (len: 393216) | ||
| 13 | █ | [158986240, 159117312) (len: 131072) | ||
| 14 | █ | [159395840, 159789056) (len: 393216) | ||
| 15 | █ | [130994176, 131321856) (len: 327680) | ||
| 16 | █ | [197251072, 197382144) (len: 131072) | ||
| 17 | █ | [188059648, 188452864) (len: 393216) | ||
| 18 | █ | [115056640, 115449856) (len: 393216) | ||
| 19 | █ | [139792384, 140185600) (len: 393216) | ||
| 20 | █ | [181702656, 182095872) (len: 393216) | ||
| 21 | █ | [189681664, 190074880) (len: 393216) | ||
| 22 | █ | [117895168, 118288384) (len: 393216) | ||
| 23 | █ | [166154240, 166547456) (len: 393216) | ||
| 24 | █ | [92078080, 92471296) (len: 393216) | ||
| 25 | █ | [160882688, 161275904) (len: 393216) | ||
| 26 | █ | [81653760, 81661952) (len: 8192) | ||
| 27 | █ | [81793024, 81915904) (len: 122880) | ||
| 28 | █ | [178458624, 178851840) (len: 393216) | ||
| 29 | █ | [104108032, 104501248) (len: 393216) | ||
| 30 | █ | [175890432, 176283648) (len: 393216) | ||
| 31 | █ | [209825792, 209956864) (len: 131072) | ||
| 32 | █ | [214966272, 215359488) (len: 393216) | ||
| 33 | █ | [108568576, 108961792) (len: 393216) | ||
| 34 | █ | [198877184, 199270400) (len: 393216) | ||
| 35 | █ | [205905920, 206299136) (len: 393216) | ||
| 36 | █ | [208068608, 208461824) (len: 393216) | ||
| 37 | █ | [127086592, 127479808) (len: 393216) | ||
| 38 | █ | [133439488, 133832704) (len: 393216) | ||
| 39 | █ | [90456064, 90849280) (len: 393216) | ||
| 40 | █ | [156553216, 156946432) (len: 393216) | ||
| 41 | █ | [163991552, 164384768) (len: 393216) | ||
| 42 | █ | [90849280, 90980352) (len: 131072) | ||
| 43 | █ | [101675008, 102068224) (len: 393216) | ||
| 44 | █ | [190074880, 190205952) (len: 131072) | ||
| 45 | █ | [190357504, 190750720) (len: 393216) | ||
| 46 | █ | [94375936, 94769152) (len: 393216) | ||
| 47 | █ | [208461824, 208592896) (len: 131072) | ||
| 48 | █ | [196304896, 196698112) (len: 393216) | ||
| 49 | █ | [174809088, 175202304) (len: 393216) | ||
| 50 | █ | [201445376, 201838592) (len: 393216) | ||
| 51 | █ | [129654784, 130048000) (len: 393216) | ||
| 52 | █ | [162369536, 162762752) (len: 393216) | ||
| 53 | █ | [197931008, 198324224) (len: 393216) | ||
| 54 | █ | [170344448, 170737664) (len: 393216) | ||
| 55 | █ | [203067392, 203460608) (len: 393216) | ||
| 56 | █ | [110460928, 110854144) (len: 393216) | ||
| 57 | █ | [93159424, 93552640) (len: 393216) | ||
| 58 | █ | [188870656, 189263872) (len: 393216) | ||
| 59 | █ | [111001600, 111394816) (len: 393216) | ||
|
|
||
| Summary Statistics: | ||
| Total bytes read: 19382272 (18928.00 KB) | ||
| Average range size: 323037 bytes | ||
| Min range size: 8192 bytes | ||
| Max range size: 393216 bytes | ||
| Read pattern analysis: Random (gaps: 52) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| numpy 2.2.6 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| lmdb 1.7.3 | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `numpy` module is imported but is not used anywhere in the script. Unused imports should be removed to keep the code clean and avoid confusion.