Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

# Created by https://www.gitignore.io/api/osx,linux,python,windows,pycharm,visualstudiocode

# Exception for headers_keys.json
Expand Down Expand Up @@ -254,11 +253,21 @@ custom_pytest.ini
# End of https://www.gitignore.io/api/osx,linux,python,windows,pycharm,visualstudiocode

run_*.sh
!run_vtdlp-ingest_example.sh
!run_vtdlp-testextract_example.sh
lambda_local.sh
results_files/

bin/
examples/
lib64/
pyvenv.cfg
lib64*
lib64*
PR.md
*.log
test_event.json
run_vtdlp-ingest_testextract.sh
run_vtdlp-testextract.sh
run_vtdlp-ingest_testss.sh
python/
requirements.txt
10 changes: 9 additions & 1 deletion lambda_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
# Absolute path of the directory that contains this script.
env["script_root"] = os.path.abspath(os.path.dirname(__file__))
# The settings below are read from environment variables; os.getenv returns
# None for any variable that is not set.
env["aws_src_bucket"] = os.getenv("AWS_SRC_BUCKET")
env["aws_dest_bucket"] = os.getenv("AWS_DEST_BUCKET")
# Textract settings: one bucket name and two table names.
# NOTE(review): the table names are presumably consumed by the Textract
# Lambda handler -- confirm against textract_lambda_handler.py.
env["textract_bucket"] = os.getenv("TEXTRACT_BUCKET")
env["textract_line_table"] = os.getenv("TEXTRACT_LINE_TABLE")
env["textract_word_table"] = os.getenv("TEXTRACT_WORD_TABLE")
# Collection settings.
env["collection_category"] = os.getenv("COLLECTION_CATEGORY")
env["collection_identifier"] = os.getenv("COLLECTION_IDENTIFIER")
env["collection_subdirectory"] = os.getenv("COLLECTION_SUBDIRECTORY")
Expand Down Expand Up @@ -52,7 +55,12 @@
# Boolean switches: a flag is True only when its environment variable is set
# to "true" (compared case-insensitively); an unset variable yields False.
env["update_metadata"] = os.getenv("UPDATE_METADATA", "").lower() == "true"

# Textract switches follow the same convention.
env["local_textract"] = os.getenv("LOCAL_TEXTRACT", "").lower() == "true"
env["process_textract"] = os.getenv("PROCESS_TEXTRACT", "").lower() == "true"

def new_media_type_handler(env, filename, bucket):
media_type = media_types_map[env["media_type"]]
Expand Down
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,6 @@ pytest
pytest-cov
pytest-env
pytest-mock
requests
requests
opencv-python
numpy
4 changes: 4 additions & 0 deletions run_vtdlp-ingest_example.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
VERBOSE="true" \
AWS_SRC_BUCKET="" \
AWS_DEST_BUCKET="" \
TEXTRACT_BUCKET="" \
TEXTRACT_LINE_TABLE="" \
TEXTRACT_WORD_TABLE="" \
COLLECTION_CATEGORY="" \
COLLECTION_IDENTIFIER="" \
COLLECTION_SUBDIRECTORY="" \
Expand All @@ -22,4 +25,5 @@ METADATA_INGEST="true" \
GENERATE_THUMBNAILS="false" \
DRY_RUN="false" \
UPDATE_METADATA="false" \
PROCESS_TEXTRACT="false" \
python3 lambda_function.py "examples/testss/test_archive_metadata.csv"
4 changes: 4 additions & 0 deletions run_vtdlp-testextract_example.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Example: run the Textract Lambda handler script directly with python3.
# Replace the placeholder values below with your own table and bucket names.
TEXTRACT_LINE_TABLE="your_line_table_name" \
TEXTRACT_WORD_TABLE="your_word_table_name" \
TEXTRACT_BUCKET="your_textract_bucket_name" \
python3 src/media_types/metadata/textract_lambda_handler.py
75 changes: 72 additions & 3 deletions src/media_types/metadata/generic_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from boto3.dynamodb.conditions import Key, Attr
from botocore.response import StreamingBody
import re
from src.media_types.metadata.textract_lambda_handler import lambda_handler

DUPLICATED = "Duplicated"

Expand Down Expand Up @@ -217,7 +218,12 @@ def batch_import_archives(self, response):
"Archive",
idx
)
# Log trying to create an item that already exists
# --- TEXTRACT WORKFLOW ---
if self.env.get("process_textract", False):
print("\033[94mProcessing textract since PROCESS_TEXTRACT is set to True.\033[0m")
self.run_textract_workflow(collection_identifier, archive_dict["identifier"])
else:
print("\033[94mSkipping Textract process as PROCESS_TEXTRACT is set to False.\033[0m")

def get_table_name(self, table_name):
return f"{table_name}-{self.env['dynamodb_table_suffix']}"
Expand Down Expand Up @@ -329,7 +335,7 @@ def print_results(self):
)
.put(Body=open(results_filename, "rb"))
)
print(s3_response)
#print(s3_response)
status = s3_response["ResponseMetadata"]["HTTPStatusCode"]
if status == 200:
print("")
Expand Down Expand Up @@ -896,4 +902,67 @@ def delete_NOID_record(self, noid):
print("delete_NOID: SIMULATED.")
else:
self.env["mint_table"].delete_item(Key={"noid": noid})
print(f"delete_NOID: {noid}")
print(f"delete_NOID: {noid}")

def run_textract_workflow(self, collection_identifier, archive_identifier):
    """
    Trigger the Textract workflow for every image/PDF of one archive.

    The archive's objects live under the S3 prefix
    ``<collection_category>/<collection_identifier>/<archive_identifier>/Access/``.
    The method first checks whether that prefix already holds objects in the
    Textract bucket; if so it prints a warning and returns without doing
    anything else. Otherwise it lists the same prefix in the source bucket and,
    for each key ending in .jpg/.jpeg/.png/.pdf (case-insensitive):

    * local mode (``env["local_textract"]`` truthy): builds an event whose
      single record body is a JSON document naming the source bucket, the key
      and the collection identifier, and passes it to ``lambda_handler``;
    * AWS mode: copies the object into the Textract bucket under the same key.

    Args:
        collection_identifier: Identifier of the collection owning the archive.
        archive_identifier: Identifier of the archive to process.

    Reads ``s3_client``, ``aws_src_bucket``, ``textract_bucket``,
    ``collection_category`` and (optionally) ``local_textract`` from
    ``self.env``. Returns None.
    """
    # Single definition of the extensions that are sent to Textract.
    supported_suffixes = ('.jpg', '.jpeg', '.png', '.pdf')

    s3_client = self.env["s3_client"]
    local_textract = self.env.get("local_textract", False)
    print(f"Running Textract workflow for archive: {archive_identifier} in collection: {collection_identifier}")
    print(f"\033[94mEnvironment settings: local_textract={local_textract}\033[0m")
    source_bucket = self.env["aws_src_bucket"]
    textract_bucket = self.env["textract_bucket"]
    # Build the s3 prefix for the archive
    prefix = f"{self.env['collection_category']}/{collection_identifier}/{archive_identifier}/Access/"

    # Check if the prefix already exists in the Textract bucket
    print(f"Checking if prefix {prefix} exists in Textract bucket {textract_bucket}...")
    textract_objects = s3_client.list_objects_v2(Bucket=textract_bucket, Prefix=prefix)
    if "Contents" in textract_objects and textract_objects["Contents"]:
        print(f"\033[91mWARNING: Identifier '{archive_identifier}' already exists in Textract bucket. Skipping Textract workflow.\033[0m")
        return  # Skip further processing

    # list_objects_v2 returns at most 1000 keys per call, so follow the
    # continuation token until the listing is complete; otherwise large
    # archives would be silently truncated.
    source_keys = []
    list_kwargs = {"Bucket": source_bucket, "Prefix": prefix}
    while True:
        response = s3_client.list_objects_v2(**list_kwargs)
        if "Contents" in response:
            source_keys.extend(obj["Key"] for obj in response["Contents"])
        # Stop unless the service both reports truncation and supplies a token.
        if not response.get("IsTruncated") or "NextContinuationToken" not in response:
            break
        list_kwargs["ContinuationToken"] = response["NextContinuationToken"]

    if not source_keys:
        # Make the no-op visible instead of returning silently.
        print(f"No objects found under prefix {prefix} in source bucket {source_bucket}. Nothing to send to Textract.")
        return

    print("Identifier does not exist in Textract bucket. Proceeding with Textract workflow.")
    if local_textract:
        # Run the Textract Lambda handler locally for each image
        for key in source_keys:
            if not key.lower().endswith(supported_suffixes):
                continue
            print(f"\033[94mRunning Textract Lambda handler locally for {key}...\033[0m")
            event = {
                "Records": [
                    {
                        "body": json.dumps({
                            "s3": {
                                "bucket": {"name": source_bucket},
                                "object": {"key": key}
                            },
                            "collection_identifier": collection_identifier
                        })
                    }
                ]
            }
            lambda_handler(event, None)
    else:
        # Copy files to textract bucket (AWS mode)
        print(f"\033[94mCopying files from source bucket {source_bucket} to Textract bucket {textract_bucket} since local_textract is False...\033[0m")
        for key in source_keys:
            print(f"Found object in source bucket: {key}")
            if not key.lower().endswith(supported_suffixes):
                continue
            print(f"Copying {key} from {source_bucket} to {textract_bucket}...")
            s3_client.copy_object(
                Bucket=textract_bucket,
                Key=key,
                CopySource={'Bucket': source_bucket, 'Key': key}
            )
        print("Copy complete. S3 trigger on Textract bucket will invoke the Textract Lambda.")
Loading