diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index a841c1584..000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,81 +0,0 @@ -version: 2.1 - -orbs: - python: circleci/python@1.4.0 - -jobs: - linting: - executor: python/default - steps: - - checkout - - restore_cache: - key: deps1-{{ .Branch }}-{{ checksum "requirements.txt" }} - - run: - command: | - sudo apt update && sudo apt install libcurl4-openssl-dev - name: Install curl-config from Ubuntu APT - - run: - command: | - python3 install.py --aws --azure --gcp --no-local - name: Install pip dependencies - - run: - command: | - . python-venv/bin/activate - black sebs --check --config .black.toml - name: Python code formatting with black - - run: - command: | - . python-venv/bin/activate - flake8 sebs --config=.flake8.cfg --tee --output-file flake-reports - name: Python code lint with flake8 - - run: - command: | - . python-venv/bin/activate - mypy sebs --config-file=.mypy.ini - name: Python static code verification with mypy - - store_artifacts: - path: flake-reports - destination: flake-reports - test-aws: - executor: python/default - steps: - - checkout - - setup_remote_docker - - restore_cache: - key: deps1-{{ .Branch }}-{{ checksum "requirements.txt" }} - - run: - command: | - if [[ -d $HOME/docker ]]; - then - ls $HOME/docker/*.tar.gz | xargs -I {file} sh -c "zcat {file} | docker load"; - else - docker pull mcopik/serverless-benchmarks:build.aws.python.3.7 - docker pull mcopik/serverless-benchmarks:build.aws.nodejs.12.x - fi - name: Load Docker images - - run: - command: | - python3 install.py --aws - name: Install pip dependencies - - run: - command: | - mkdir -p $HOME/docker - docker images mcopik/serverless-benchmarks --filter='dangling=false' --format '{{.Repository}}:{{.Tag}} {{.ID}}' |\ - xargs -n 2 -t sh -c 'test -e $HOME/docker/$1.tar.gz || docker save $0 | gzip -2 > $HOME/docker/$1.tar.gz' - name: Save Docker images - - save_cache: - key: deps1-{{ .Branch }}-{{ checksum "requirements.txt" }} - paths: - - "sebs-virtualenv" - - $HOME/docker - - run: - command: | - . sebs-virtualenv/bin/activate - tests/test_runner.py --deployment aws - name: Execute AWS tests - -workflows: - main: - jobs: - - linting - diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 000000000..1043be62e --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,55 @@ +name: Lint + +on: + push: + pull_request: + +jobs: + linting: + runs-on: ubuntu-latest + + steps: + - name: Check out code + uses: actions/checkout@v4 + + - name: Set up Python + id: setup-python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + + - name: Cache virtualenv + uses: actions/cache@v4 + with: + path: python-venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('requirements.txt') }}-${{ github.ref_name }} + restore-keys: | + venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('requirements.txt') }}- + venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}- + + - name: Install system packages + run: | + sudo apt-get update + sudo apt-get install -y libcurl4-openssl-dev + + - name: Install Python dependencies (via install.py) + run: | + python3 install.py --no-aws --no-azure --no-gcp --no-openwhisk --no-local + + - name: Black (check) + run: | + . python-venv/bin/activate + black benchmarks --check --config .black.toml + + - name: Flake8 (lint) + run: | + . 
python-venv/bin/activate + # write to file and echo to stdout (requires flake8 with --tee support) + flake8 benchmarks --config=.flake8.cfg --tee --output-file flake-reports + + - name: Upload flake report + if: always() + uses: actions/upload-artifact@v4 + with: + name: flake-reports + path: flake-reports diff --git a/.gitmodules b/.gitmodules index 4feae9bfb..c33a17880 100644 --- a/.gitmodules +++ b/.gitmodules @@ -3,4 +3,4 @@ url = https://github.com/mcopik/pypapi.git [submodule "benchmarks-data"] path = benchmarks-data - url = https://github.com/spcl/serverless-benchmarks-data.git + url = https://github.com/McLavish/serverless-benchmarks-data-dphpc.git diff --git a/.mypy.ini b/.mypy.ini index e202650ed..636105bfa 100644 --- a/.mypy.ini +++ b/.mypy.ini @@ -3,6 +3,9 @@ [mypy-docker] ignore_missing_imports = True +[mypy-docker.*] +ignore_missing_imports = True + [mypy-tzlocal] ignore_missing_imports = True diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..bb0519cd2 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,30 @@ +# .pre-commit-config.yaml +repos: + - repo: local + hooks: + - id: flake8-local + name: flake8 (project env) + language: python + additional_dependencies: ["flake8==7.1.1"] + entry: flake8 + args: ["--config=.flake8.cfg"] + types: [python] + files: ^(sebs/|benchmarks/) + - repo: local + hooks: + - id: black-check-local + name: black --check (project env) + language: python + additional_dependencies: ["black==24.4.2"] + entry: black + args: ["--config=.black.toml", "--check", "--diff"] + types: [python] + files: ^(sebs/|benchmarks/) + # - repo: local + # hooks: + # - id: mypy-local + # name: mypy (project venv) + # language: system + # entry: bash -lc 'python -m mypy --config-file=.mypy.ini sebs' + # types: [python] + diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..127ae8a76 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,15 @@ +{ + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true + }, + + "black-formatter.importStrategy": "fromEnvironment", + "black-formatter.path": [], + "black-formatter.args": ["--config=.black.toml"], + + "flake8.importStrategy": "fromEnvironment", + "flake8.path": [], + "flake8.args": ["--config=.flake8.cfg"], + "flake8.enabled": true +} diff --git a/benchmarks-data b/benchmarks-data index 6a17a460f..25c2bb40b 160000 --- a/benchmarks-data +++ b/benchmarks-data @@ -1 +1 @@ -Subproject commit 6a17a460f289e166abb47ea6298fb939e80e8beb +Subproject commit 25c2bb40b8bde342395534b534ba62f8f0ff3549 diff --git a/benchmarks/000.microbenchmarks/010.sleep/input.py b/benchmarks/000.microbenchmarks/010.sleep/input.py index 041d2ba7f..cd7afc15e 100644 --- a/benchmarks/000.microbenchmarks/010.sleep/input.py +++ b/benchmarks/000.microbenchmarks/010.sleep/input.py @@ -1,12 +1,17 @@ +size_generators = {"test": 1, "small": 100, "large": 1000} -size_generators = { - 'test' : 1, - 'small' : 100, - 'large': 1000 -} def buckets_count(): return (0, 0) -def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): - return { 'sleep': size_generators[size] } + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + return {"sleep": size_generators[size]} diff --git a/benchmarks/000.microbenchmarks/010.sleep/python/function.py b/benchmarks/000.microbenchmarks/010.sleep/python/function.py index 
7dda59a57..64be15557 100644 --- a/benchmarks/000.microbenchmarks/010.sleep/python/function.py +++ b/benchmarks/000.microbenchmarks/010.sleep/python/function.py @@ -1,9 +1,9 @@ - from time import sleep + def handler(event): # start timing - sleep_time = event.get('sleep') + sleep_time = event.get("sleep") sleep(sleep_time) - return { 'result': sleep_time } + return {"result": sleep_time} diff --git a/benchmarks/000.microbenchmarks/020.network-benchmark/input.py b/benchmarks/000.microbenchmarks/020.network-benchmark/input.py index 0d969bc74..216d0604b 100644 --- a/benchmarks/000.microbenchmarks/020.network-benchmark/input.py +++ b/benchmarks/000.microbenchmarks/020.network-benchmark/input.py @@ -2,10 +2,18 @@ def buckets_count(): return 0, 1 -def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): return { - 'bucket': { - 'bucket': benchmarks_bucket, - 'output': output_paths[0], + "bucket": { + "bucket": benchmarks_bucket, + "output": output_paths[0], }, } diff --git a/benchmarks/000.microbenchmarks/020.network-benchmark/python/function.py b/benchmarks/000.microbenchmarks/020.network-benchmark/python/function.py index eb8ccdcf2..58c376a2d 100644 --- a/benchmarks/000.microbenchmarks/020.network-benchmark/python/function.py +++ b/benchmarks/000.microbenchmarks/020.network-benchmark/python/function.py @@ -1,27 +1,26 @@ import csv -import json import os.path import socket from datetime import datetime -from time import sleep from . import storage + def handler(event): - request_id = event['request-id'] - address = event['server-address'] - port = event['server-port'] - repetitions = event['repetitions'] - output_bucket = event.get('bucket').get('bucket') - output_prefix = event.get('bucket').get('output') + request_id = event["request-id"] + address = event["server-address"] + port = event["server-port"] + repetitions = event["repetitions"] + output_bucket = event.get("bucket").get("bucket") + output_prefix = event.get("bucket").get("output") times = [] i = 0 socket.setdefaulttimeout(3) server_socket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server_socket.bind(('', 0)) - message = request_id.encode('utf-8') + server_socket.bind(("", 0)) + message = request_id.encode("utf-8") adr = (address, port) consecutive_failures = 0 while i < repetitions + 1: @@ -43,16 +42,16 @@ def handler(event): consecutive_failures = 0 server_socket.settimeout(2) server_socket.close() - + if consecutive_failures != 5: - with open('/tmp/data.csv', 'w', newline='') as csvfile: - writer = csv.writer(csvfile, delimiter=',') - writer.writerow(["id", "client_send", "client_rcv"]) + with open("/tmp/data.csv", "w", newline="") as csvfile: + writer = csv.writer(csvfile, delimiter=",") + writer.writerow(["id", "client_send", "client_rcv"]) for row in times: writer.writerow(row) - + client = storage.storage.get_instance() - filename = 'results-{}.csv'.format(request_id) - key = client.upload(output_bucket, os.path.join(output_prefix, filename), '/tmp/data.csv') + filename = "results-{}.csv".format(request_id) + key = client.upload(output_bucket, os.path.join(output_prefix, filename), "/tmp/data.csv") - return { 'result': key } + return {"result": key} diff --git a/benchmarks/000.microbenchmarks/030.clock-synchronization/input.py 
b/benchmarks/000.microbenchmarks/030.clock-synchronization/input.py index 427215380..216d0604b 100644 --- a/benchmarks/000.microbenchmarks/030.clock-synchronization/input.py +++ b/benchmarks/000.microbenchmarks/030.clock-synchronization/input.py @@ -1,12 +1,19 @@ - - def buckets_count(): return 0, 1 -def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): return { - 'bucket': { - 'bucket': benchmarks_bucket, - 'output': output_paths[0], + "bucket": { + "bucket": benchmarks_bucket, + "output": output_paths[0], }, } diff --git a/benchmarks/000.microbenchmarks/030.clock-synchronization/python/function.py b/benchmarks/000.microbenchmarks/030.clock-synchronization/python/function.py index 9ffd978ae..9cf93eccf 100644 --- a/benchmarks/000.microbenchmarks/030.clock-synchronization/python/function.py +++ b/benchmarks/000.microbenchmarks/030.clock-synchronization/python/function.py @@ -1,28 +1,27 @@ import csv -import json import os import socket from datetime import datetime -from time import sleep from . import storage + def handler(event): - request_id = event['request-id'] - address = event['server-address'] - port = event['server-port'] - repetitions = event['repetitions'] - output_bucket = event.get('bucket').get('bucket') - output_prefix = event.get('bucket').get('output') + request_id = event["request-id"] + address = event["server-address"] + port = event["server-port"] + repetitions = event["repetitions"] + output_bucket = event.get("bucket").get("bucket") + output_prefix = event.get("bucket").get("output") times = [] print("Starting communication with {}:{}".format(address, port)) i = 0 socket.setdefaulttimeout(4) server_socket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server_socket.bind(('', 0)) - message = request_id.encode('utf-8') + server_socket.bind(("", 0)) + message = request_id.encode("utf-8") adr = (address, port) consecutive_failures = 0 measurements_not_smaller = 0 @@ -43,11 +42,13 @@ def handler(event): if i > 0: times.append([i, send_begin, recv_end]) cur_time = recv_end - send_begin - print("Time {} Min Time {} NotSmaller {}".format(cur_time, cur_min, measurements_not_smaller)) + print( + "Time {} Min Time {} NotSmaller {}".format(cur_time, cur_min, measurements_not_smaller) + ) if cur_time > cur_min and cur_min > 0: measurements_not_smaller += 1 if measurements_not_smaller == repetitions: - message = "stop".encode('utf-8') + message = "stop".encode("utf-8") server_socket.sendto(message, adr) break else: @@ -57,18 +58,18 @@ def handler(event): consecutive_failures = 0 server_socket.settimeout(4) server_socket.close() - + if consecutive_failures != 5: - with open('/tmp/data.csv', 'w', newline='') as csvfile: - writer = csv.writer(csvfile, delimiter=',') - writer.writerow(["id", "client_send", "client_rcv"]) + with open("/tmp/data.csv", "w", newline="") as csvfile: + writer = csv.writer(csvfile, delimiter=",") + writer.writerow(["id", "client_send", "client_rcv"]) for row in times: writer.writerow(row) - + client = storage.storage.get_instance() - filename = 'results-{}.csv'.format(request_id) - key = client.upload(output_bucket, os.path.join(output_prefix, filename), '/tmp/data.csv') + filename = "results-{}.csv".format(request_id) + key = client.upload(output_bucket, os.path.join(output_prefix, filename), 
"/tmp/data.csv") else: key = None - return { 'result': {'bucket-key': key, 'timestamp': event['income-timestamp']} } + return {"result": {"bucket-key": key, "timestamp": event["income-timestamp"]}} diff --git a/benchmarks/000.microbenchmarks/040.server-reply/input.py b/benchmarks/000.microbenchmarks/040.server-reply/input.py index 041d2ba7f..cd7afc15e 100644 --- a/benchmarks/000.microbenchmarks/040.server-reply/input.py +++ b/benchmarks/000.microbenchmarks/040.server-reply/input.py @@ -1,12 +1,17 @@ +size_generators = {"test": 1, "small": 100, "large": 1000} -size_generators = { - 'test' : 1, - 'small' : 100, - 'large': 1000 -} def buckets_count(): return (0, 0) -def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): - return { 'sleep': size_generators[size] } + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + return {"sleep": size_generators[size]} diff --git a/benchmarks/000.microbenchmarks/040.server-reply/python/function.py b/benchmarks/000.microbenchmarks/040.server-reply/python/function.py index fb5b57aa3..4c2a294ba 100644 --- a/benchmarks/000.microbenchmarks/040.server-reply/python/function.py +++ b/benchmarks/000.microbenchmarks/040.server-reply/python/function.py @@ -1,11 +1,10 @@ - import socket -from time import sleep + def handler(event): # start timing - addr = (event.get('ip-address'), event.get('port')) + addr = (event.get("ip-address"), event.get("port")) socket.setdefaulttimeout(20) s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.connect(addr) diff --git a/benchmarks/000.microbenchmarks/050.peak-performance/config.json b/benchmarks/000.microbenchmarks/050.peak-performance/config.json new file mode 100644 index 000000000..93ce2f561 --- /dev/null +++ b/benchmarks/000.microbenchmarks/050.peak-performance/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 120, + "memory": 128, + "languages": ["python", "nodejs"], + "modules": [] +} diff --git a/benchmarks/000.microbenchmarks/050.peak-performance/input.py b/benchmarks/000.microbenchmarks/050.peak-performance/input.py new file mode 100644 index 000000000..cd7afc15e --- /dev/null +++ b/benchmarks/000.microbenchmarks/050.peak-performance/input.py @@ -0,0 +1,17 @@ +size_generators = {"test": 1, "small": 100, "large": 1000} + + +def buckets_count(): + return (0, 0) + + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + return {"sleep": size_generators[size]} diff --git a/benchmarks/000.microbenchmarks/050.peak-performance/python/function.py b/benchmarks/000.microbenchmarks/050.peak-performance/python/function.py new file mode 100644 index 000000000..e949de175 --- /dev/null +++ b/benchmarks/000.microbenchmarks/050.peak-performance/python/function.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +import torch +import datetime + + +def initialize_torch(size, dtype=torch.float16, device="cuda"): + A = torch.randn((size, size), dtype=dtype, device=device) + B = torch.randn((size, size), dtype=dtype, device=device) + return A, B + + +def handler(event): + + size = event.get("size", 1000) + reps = event.get("reps", 100) + + if "seed" in event: + import random + + random.seed(event["seed"]) + seed = event.get("seed") + seed = int(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + sync = torch.cuda.synchronize + + matrix_generating_begin = datetime.datetime.now() + sync() + A, B = initialize_torch(size, 
dtype=torch.float16, device="cuda") + sync() + matrix_generating_end = datetime.datetime.now() + + # Warm up + for _ in range(10): + _ = torch.matmul(A, B) + + matmul_begin = datetime.datetime.now() + sync() + for _ in range(reps): + _ = torch.matmul(A, B) + sync() + matmul_end = datetime.datetime.now() + + matrix_generating_time = (matrix_generating_end - matrix_generating_begin) / datetime.timedelta( + microseconds=1 + ) + matmul_time = (matmul_end - matmul_begin) / datetime.timedelta(microseconds=1) + num_flops = 2 * (size**3) * reps + + return { + "size": size, + "reps": reps, + "measurement": { + "matrix_generating_time_us": f"{matrix_generating_time} microseconds", + "compute_time_us": f"{matmul_time} microseconds", + "avg_compute_time_us": f"{matmul_time / reps} microseconds", + "Total flops": f"{num_flops} FLOPs", + "avg TFLOPS": f"{(num_flops) / (matmul_time * 1e6)} TFLOPS", + }, + } diff --git a/benchmarks/000.microbenchmarks/050.peak-performance/python/requirements.txt b/benchmarks/000.microbenchmarks/050.peak-performance/python/requirements.txt new file mode 100755 index 000000000..d8d966118 --- /dev/null +++ b/benchmarks/000.microbenchmarks/050.peak-performance/python/requirements.txt @@ -0,0 +1 @@ +torch==2.4.1 diff --git a/benchmarks/000.microbenchmarks/0xx.host-device-copy/config.json b/benchmarks/000.microbenchmarks/0xx.host-device-copy/config.json new file mode 100644 index 000000000..93ce2f561 --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.host-device-copy/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 120, + "memory": 128, + "languages": ["python", "nodejs"], + "modules": [] +} diff --git a/benchmarks/000.microbenchmarks/0xx.host-device-copy/input.py b/benchmarks/000.microbenchmarks/0xx.host-device-copy/input.py new file mode 100644 index 000000000..5e7ac46ec --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.host-device-copy/input.py @@ -0,0 +1,17 @@ +size_generators = {"test": 1 << 10, "small": 1 << 26, "large": 1 << 29} + + +def buckets_count(): + return (0, 0) + + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + return {"size": size_generators[size]} diff --git a/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/function.py b/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/function.py new file mode 100644 index 000000000..67db5032d --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/function.py @@ -0,0 +1,117 @@ +import datetime +import torch + + +def get_device_and_sync(device_str: str): + device = torch.device(device_str) + if device.type == "cuda": + sync = torch.cuda.synchronize + elif device.type == "npu": + sync = torch.npu.synchronize + else: + + def sync(): + return None + + return device, sync + + +def initialize(size, device_str="cuda"): + device, _ = get_device_and_sync(device_str) + pin = device.type == "cuda" + host_tensor = torch.randn(size, device="cpu", dtype=torch.float32, pin_memory=pin) + device_tensor = torch.empty(size, device=device, dtype=torch.float32) + return host_tensor, device_tensor + + +def _run_once(size, iters, device_str): + generate_begin = datetime.datetime.now() + host_tensor, device_tensor = initialize(size, device_str=device_str) + generate_end = datetime.datetime.now() + + device, sync = get_device_and_sync(device_str) + + sync() + h2d_begin = datetime.datetime.now() + for _ in range(iters): + _ = host_tensor.to(device, non_blocking=True) + sync() + h2d_end = 
datetime.datetime.now() + + sync() + d2h_begin = datetime.datetime.now() + for _ in range(iters): + _ = device_tensor.to("cpu", non_blocking=True) + sync() + d2h_end = datetime.datetime.now() + + generate_time = (generate_end - generate_begin) / datetime.timedelta(milliseconds=1) + h2d_time = (h2d_end - h2d_begin) / datetime.timedelta(milliseconds=1) + d2h_time = (d2h_end - d2h_begin) / datetime.timedelta(milliseconds=1) + + h2d_avg_ms = h2d_time / iters + d2h_avg_ms = d2h_time / iters + + bytes_per_iter = size * host_tensor.element_size() + + h2d_gbps = bytes_per_iter / (h2d_avg_ms / 1000.0) / 1e9 + d2h_gbps = bytes_per_iter / (d2h_avg_ms / 1000.0) / 1e9 + + return { + "H2D_avg": f"{h2d_avg_ms:.4f} ms / iter", + "H2D_effective_BW": f"{h2d_gbps:.2f} GB/s", + "D2H_avg": f"{d2h_avg_ms:.4f} ms / iter", + "D2H_effective_BW": f"{d2h_gbps:.2f} GB/s", + "measurement": { + "generate_time_ms": generate_time, + "H2D_total_time_ms": h2d_time, + "D2H_total_time_ms": d2h_time, + }, + } + + +def handler(event): + if "size" not in event: + raise ValueError("event must contain 'size'") + + size = event["size"] + iters = event.get("iters", 100) + + success = False + error_msg = None + device_used = "cpu" + result = None + + if torch.cuda.is_available(): + try: + result = _run_once(size, iters, device_str="cuda") + success = True + device_used = "cuda" + except RuntimeError as e: + msg = str(e) + if ( + "no kernel image is available for execution on the device" in msg + or "CUDA error" in msg + ): + result = _run_once(size, iters, device_str="cpu") + success = False + device_used = "cpu" + error_msg = ( + "CUDA GPU not usable, computation was done on CPU. " f"Original error: {msg}" + ) + else: + raise + else: + result = _run_once(size, iters, device_str="cpu") + success = False + device_used = "cpu" + error_msg = "CUDA GPU not available, computation was done on CPU." + + result.update( + { + "device": device_used, + "success": success, + "error": error_msg, + } + ) + return result diff --git a/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/init.sh b/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/init.sh new file mode 100755 index 000000000..9820c84d8 --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/init.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +# No additional initialization required. diff --git a/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/package.sh b/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/package.sh new file mode 100644 index 000000000..edb27ebe0 --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/package.sh @@ -0,0 +1,35 @@ +# Stripping package code is based on https://github.com/ryfeus/lambda-packs repo + +PACKAGE_DIR=$1 +echo "Original size $(du -sh $1 | cut -f1)" + +CUR_DIR=$(pwd) +cd $1 +# cleaning libs +rm -rf external +find . -type d -name "tests" -exec rm -rf {} + +find . -type d -name "test" -exec rm -rf {} + +find . -type d -name "bin" -not -path "*/torch/*" -exec rm -rf {} + + +# cleaning +# stripping some of the numpy libs - libgfortran-2e0d59d6.so.5.0.0 - causes issues on Azure +find -name "*.so" -not -path "*/PIL/*" -not -path "*/Pillow.libs/*" -not -path "*libgfortran*" | xargs strip +find -name "*.so.*" -not -path "*/PIL/*" -not -path "*/Pillow.libs/*" -not -path "*libgfortran*" | xargs strip + +rm -r pip >/dev/null +rm -r pip-* >/dev/null +rm -r wheel >/dev/null +rm -r wheel-* >/dev/null +rm easy_install.py >/dev/null +find . 
-name \*.pyc -delete +cd ${CUR_DIR} +echo "Stripped size $(du -sh $1 | cut -f1)" + +TORCH_DIR=".python_packages/lib/site-packages/torch" +if [ -d "$1/${TORCH_DIR}" ]; then + cd $1 + zip -qr torch.zip ${TORCH_DIR} + rm -rf ${TORCH_DIR} + cd ${CUR_DIR} + echo "Torch-zipped size $(du -sh $1 | cut -f1)" +fi diff --git a/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/requirements.txt b/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/requirements.txt new file mode 100644 index 000000000..37f700a78 --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/requirements.txt @@ -0,0 +1,2 @@ +torch +torchvision \ No newline at end of file diff --git a/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/requirements.txt.3.10 b/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/requirements.txt.3.10 new file mode 100644 index 000000000..216e1c32b --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/requirements.txt.3.10 @@ -0,0 +1,3 @@ +numpy==1.26.4 +torch +torchvision \ No newline at end of file diff --git a/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/requirements.txt.3.11 b/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/requirements.txt.3.11 new file mode 100644 index 000000000..216e1c32b --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/requirements.txt.3.11 @@ -0,0 +1,3 @@ +numpy==1.26.4 +torch +torchvision \ No newline at end of file diff --git a/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/requirements.txt.3.6 b/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/requirements.txt.3.6 new file mode 100644 index 000000000..bdc3d4b18 --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/requirements.txt.3.6 @@ -0,0 +1,3 @@ +numpy==1.24.0 +torch +torchvision \ No newline at end of file diff --git a/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/requirements.txt.3.7 b/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/requirements.txt.3.7 new file mode 100644 index 000000000..bdc3d4b18 --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/requirements.txt.3.7 @@ -0,0 +1,3 @@ +numpy==1.24.0 +torch +torchvision \ No newline at end of file diff --git a/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/requirements.txt.3.8 b/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/requirements.txt.3.8 new file mode 100644 index 000000000..bdc3d4b18 --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/requirements.txt.3.8 @@ -0,0 +1,3 @@ +numpy==1.24.0 +torch +torchvision \ No newline at end of file diff --git a/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/requirements.txt.3.9 b/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/requirements.txt.3.9 new file mode 100644 index 000000000..bdc3d4b18 --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.host-device-copy/python/requirements.txt.3.9 @@ -0,0 +1,3 @@ +numpy==1.24.0 +torch +torchvision \ No newline at end of file diff --git a/benchmarks/000.microbenchmarks/0xx.vector-add/config.json b/benchmarks/000.microbenchmarks/0xx.vector-add/config.json new file mode 100644 index 000000000..93ce2f561 --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.vector-add/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 120, + "memory": 128, + "languages": ["python", "nodejs"], + "modules": [] +} diff --git a/benchmarks/000.microbenchmarks/0xx.vector-add/input.py 
b/benchmarks/000.microbenchmarks/0xx.vector-add/input.py new file mode 100644 index 000000000..5e7ac46ec --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.vector-add/input.py @@ -0,0 +1,17 @@ +size_generators = {"test": 1 << 10, "small": 1 << 26, "large": 1 << 29} + + +def buckets_count(): + return (0, 0) + + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + return {"size": size_generators[size]} diff --git a/benchmarks/000.microbenchmarks/0xx.vector-add/python/function.py b/benchmarks/000.microbenchmarks/0xx.vector-add/python/function.py new file mode 100644 index 000000000..291530a71 --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.vector-add/python/function.py @@ -0,0 +1,143 @@ +import datetime + +import torch +import triton +import triton.language as tl + + +def get_device_and_sync(device_str: str): + device = torch.device(device_str) + if device.type == "cuda": + sync = torch.cuda.synchronize + elif device.type == "npu": + sync = torch.npu.synchronize + else: + + def sync(): + return None + + return device, sync + + +@triton.jit
def vector_add_kernel(x_ptr, y_ptr, out_ptr, n, BLOCK_SIZE: tl.constexpr): + pid = tl.program_id(0) + offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = offsets < n + x = tl.load(x_ptr + offsets, mask=mask) + y = tl.load(y_ptr + offsets, mask=mask) + tl.store(out_ptr + offsets, x + y, mask=mask) + + +def vector_add(num_elems, iters=100, device_str="cuda", x=None, y=None): + + device = torch.device(device_str) + x = x.to(device) + y = y.to(device) + out = torch.empty_like(x) + + if device.type == "cuda": + torch.cuda.synchronize() + elif device.type == "npu": + torch.npu.synchronize() + + for _ in range(iters): + out = x + y  # eager torch add; the Triton vector_add_kernel above is defined but not launched + + if device.type == "cuda": + torch.cuda.synchronize() + elif device.type == "npu": + torch.npu.synchronize() + + return out + + +def initialize(size, device_str="cuda"): + device, sync = get_device_and_sync(device_str) + x = torch.randn(size, device=device, dtype=torch.float32) + y = torch.randn(size, device=device, dtype=torch.float32) + return x, y + + +def _run_once(size, iters, device_str): + + generate_begin = datetime.datetime.now() + array_1, array_2 = initialize(size, device_str=device_str) + generate_end = datetime.datetime.now() + + process_begin = datetime.datetime.now() + _ = vector_add( + x=array_1, + y=array_2, + iters=iters, + device_str=device_str, + num_elems=size, + ) + process_end = datetime.datetime.now() + + process_time = (process_end - process_begin) / datetime.timedelta(milliseconds=1) + generate_time = (generate_end - generate_begin) / datetime.timedelta(milliseconds=1) + avg_ms = process_time / iters + + bytes_per_iter = size * 3 * array_1.element_size() + gbps = bytes_per_iter / (avg_ms / 1000.0) / 1e9 + + return { + "avg": f"{avg_ms:.4f} ms / iter", + "effective BW": f"{gbps:.2f} GB/s", + "measurement": { + "compute_time": process_time, + "generate_time": generate_time, + }, + } + + +def handler(event): + if "size" not in event: + raise ValueError("event must contain 'size'") + size = event["size"] + iters = event.get("iters", 100) + + success = False + error_msg = None + device_used = "cpu" + + result = None + + if torch.cuda.is_available(): + try: + result = _run_once(size, iters, device_str="cuda") + success = True + device_used = "cuda" + except RuntimeError as e: + + msg = str(e) + if ( + "no kernel image is available for execution on the device" in msg + or "CUDA error" in msg + ): + # Fall back to CPU + result = 
_run_once(size, iters, device_str="cpu") + success = False + device_used = "cpu" + error_msg = ( + "CUDA GPU not usable, computation was done on CPU. " f"Original error: {msg}" + ) + else: + + raise + else: + + result = _run_once(size, iters, device_str="cpu") + success = False + device_used = "cpu" + error_msg = "CUDA GPU not available, computation was done on CPU." + + result.update( + { + "device": device_used, + "success": success, + "error": error_msg, + } + ) + return result diff --git a/benchmarks/000.microbenchmarks/0xx.vector-add/python/init.sh b/benchmarks/000.microbenchmarks/0xx.vector-add/python/init.sh new file mode 100755 index 000000000..9820c84d8 --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.vector-add/python/init.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +# No additional initialization required. diff --git a/benchmarks/000.microbenchmarks/0xx.vector-add/python/package.sh b/benchmarks/000.microbenchmarks/0xx.vector-add/python/package.sh new file mode 100644 index 000000000..edb27ebe0 --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.vector-add/python/package.sh @@ -0,0 +1,35 @@ +# Stripping package code is based on https://github.com/ryfeus/lambda-packs repo + +PACKAGE_DIR=$1 +echo "Original size $(du -sh $1 | cut -f1)" + +CUR_DIR=$(pwd) +cd $1 +# cleaning libs +rm -rf external +find . -type d -name "tests" -exec rm -rf {} + +find . -type d -name "test" -exec rm -rf {} + +find . -type d -name "bin" -not -path "*/torch/*" -exec rm -rf {} + + +# cleaning +# stripping some of the numpy libs - libgfortran-2e0d59d6.so.5.0.0 - causes issues on Azure +find -name "*.so" -not -path "*/PIL/*" -not -path "*/Pillow.libs/*" -not -path "*libgfortran*" | xargs strip +find -name "*.so.*" -not -path "*/PIL/*" -not -path "*/Pillow.libs/*" -not -path "*libgfortran*" | xargs strip + +rm -r pip >/dev/null +rm -r pip-* >/dev/null +rm -r wheel >/dev/null +rm -r wheel-* >/dev/null +rm easy_install.py >/dev/null +find . 
-name \*.pyc -delete +cd ${CUR_DIR} +echo "Stripped size $(du -sh $1 | cut -f1)" + +TORCH_DIR=".python_packages/lib/site-packages/torch" +if [ -d "$1/${TORCH_DIR}" ]; then + cd $1 + zip -qr torch.zip ${TORCH_DIR} + rm -rf ${TORCH_DIR} + cd ${CUR_DIR} + echo "Torch-zipped size $(du -sh $1 | cut -f1)" +fi diff --git a/benchmarks/000.microbenchmarks/0xx.vector-add/python/requirements.txt b/benchmarks/000.microbenchmarks/0xx.vector-add/python/requirements.txt new file mode 100644 index 000000000..37f700a78 --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.vector-add/python/requirements.txt @@ -0,0 +1,2 @@ +torch +torchvision \ No newline at end of file diff --git a/benchmarks/000.microbenchmarks/0xx.vector-add/python/requirements.txt.3.10 b/benchmarks/000.microbenchmarks/0xx.vector-add/python/requirements.txt.3.10 new file mode 100644 index 000000000..216e1c32b --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.vector-add/python/requirements.txt.3.10 @@ -0,0 +1,3 @@ +numpy==1.26.4 +torch +torchvision \ No newline at end of file diff --git a/benchmarks/000.microbenchmarks/0xx.vector-add/python/requirements.txt.3.11 b/benchmarks/000.microbenchmarks/0xx.vector-add/python/requirements.txt.3.11 new file mode 100644 index 000000000..216e1c32b --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.vector-add/python/requirements.txt.3.11 @@ -0,0 +1,3 @@ +numpy==1.26.4 +torch +torchvision \ No newline at end of file diff --git a/benchmarks/000.microbenchmarks/0xx.vector-add/python/requirements.txt.3.6 b/benchmarks/000.microbenchmarks/0xx.vector-add/python/requirements.txt.3.6 new file mode 100644 index 000000000..bdc3d4b18 --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.vector-add/python/requirements.txt.3.6 @@ -0,0 +1,3 @@ +numpy==1.24.0 +torch +torchvision \ No newline at end of file diff --git a/benchmarks/000.microbenchmarks/0xx.vector-add/python/requirements.txt.3.7 b/benchmarks/000.microbenchmarks/0xx.vector-add/python/requirements.txt.3.7 new file mode 100644 index 000000000..bdc3d4b18 --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.vector-add/python/requirements.txt.3.7 @@ -0,0 +1,3 @@ +numpy==1.24.0 +torch +torchvision \ No newline at end of file diff --git a/benchmarks/000.microbenchmarks/0xx.vector-add/python/requirements.txt.3.8 b/benchmarks/000.microbenchmarks/0xx.vector-add/python/requirements.txt.3.8 new file mode 100644 index 000000000..bdc3d4b18 --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.vector-add/python/requirements.txt.3.8 @@ -0,0 +1,3 @@ +numpy==1.24.0 +torch +torchvision \ No newline at end of file diff --git a/benchmarks/000.microbenchmarks/0xx.vector-add/python/requirements.txt.3.9 b/benchmarks/000.microbenchmarks/0xx.vector-add/python/requirements.txt.3.9 new file mode 100644 index 000000000..bdc3d4b18 --- /dev/null +++ b/benchmarks/000.microbenchmarks/0xx.vector-add/python/requirements.txt.3.9 @@ -0,0 +1,3 @@ +numpy==1.24.0 +torch +torchvision \ No newline at end of file diff --git a/benchmarks/100.webapps/110.dynamic-html/input.py b/benchmarks/100.webapps/110.dynamic-html/input.py index 98dac88b2..cf3c8ff0e 100644 --- a/benchmarks/100.webapps/110.dynamic-html/input.py +++ b/benchmarks/100.webapps/110.dynamic-html/input.py @@ -1,11 +1,15 @@ +size_generators = {"test": 10, "small": 1000, "large": 100000} -size_generators = { - 'test' : 10, - 'small' : 1000, - 'large': 100000 -} -def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): - input_config = {'username': 'testname'} - 
input_config['random_len'] = size_generators[size] +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + input_config = {"username": "testname"} + input_config["random_len"] = size_generators[size] return input_config diff --git a/benchmarks/100.webapps/110.dynamic-html/python/function.py b/benchmarks/100.webapps/110.dynamic-html/python/function.py index 7c990f4eb..6f7b42bc5 100644 --- a/benchmarks/100.webapps/110.dynamic-html/python/function.py +++ b/benchmarks/100.webapps/110.dynamic-html/python/function.py @@ -1,22 +1,21 @@ -from datetime import datetime -from random import sample +from datetime import datetime +from random import sample from os import path -from time import time -import os from jinja2 import Template SCRIPT_DIR = path.abspath(path.join(path.dirname(__file__))) + def handler(event): # start timing - name = event.get('username') - size = event.get('random_len') + name = event.get("username") + size = event.get("random_len") cur_time = datetime.now() random_numbers = sample(range(0, 1000000), size) - template = Template( open(path.join(SCRIPT_DIR, 'templates', 'template.html'), 'r').read()) - html = template.render(username = name, cur_time = cur_time, random_numbers = random_numbers) + template = Template(open(path.join(SCRIPT_DIR, "templates", "template.html"), "r").read()) + html = template.render(username=name, cur_time=cur_time, random_numbers=random_numbers) # end timing - # dump stats - return {'result': html} + # dump stats + return {"result": html} diff --git a/benchmarks/100.webapps/120.uploader/input.py b/benchmarks/100.webapps/120.uploader/input.py index ce6169ccb..7c40e674b 100644 --- a/benchmarks/100.webapps/120.uploader/input.py +++ b/benchmarks/100.webapps/120.uploader/input.py @@ -1,19 +1,31 @@ - url_generators = { # source: mlperf fake_imagenet.sh. 230 kB - 'test' : 'https://upload.wikimedia.org/wikipedia/commons/thumb/e/e7/Jammlich_crop.jpg/800px-Jammlich_crop.jpg', + "test": ( + "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e7/" + "Jammlich_crop.jpg/800px-Jammlich_crop.jpg" + ), # video: HPX source code, 6.7 MB - 'small': 'https://github.com/STEllAR-GROUP/hpx/archive/refs/tags/1.4.0.zip', + "small": "https://github.com/STEllAR-GROUP/hpx/archive/refs/tags/1.4.0.zip", # resnet model from pytorch. 
98M - 'large': 'https://download.pytorch.org/models/resnet50-19c8e357.pth' + "large": "https://download.pytorch.org/models/resnet50-19c8e357.pth", } + def buckets_count(): return (0, 1) -def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buckets, upload_func, nosql_func): - input_config = {'object': {}, 'bucket': {}} - input_config['object']['url'] = url_generators[size] - input_config['bucket']['bucket'] = benchmarks_bucket - input_config['bucket']['output'] = output_buckets[0] + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_buckets, + output_buckets, + upload_func, + nosql_func, +): + input_config = {"object": {}, "bucket": {}} + input_config["object"]["url"] = url_generators[size] + input_config["bucket"]["bucket"] = benchmarks_bucket + input_config["bucket"]["output"] = output_buckets[0] return input_config diff --git a/benchmarks/100.webapps/120.uploader/python/function.py b/benchmarks/100.webapps/120.uploader/python/function.py index d032bbdb6..cb17131f1 100755 --- a/benchmarks/100.webapps/120.uploader/python/function.py +++ b/benchmarks/100.webapps/120.uploader/python/function.py @@ -1,26 +1,29 @@ - import datetime import os import urllib.request from . import storage + client = storage.storage.get_instance() -SEBS_USER_AGENT = "SeBS/1.2 (https://github.com/spcl/serverless-benchmarks) SeBS Benchmark Suite/1.2" +SEBS_USER_AGENT = ( + "SeBS/1.2 (https://github.com/spcl/serverless-benchmarks) SeBS Benchmark Suite/1.2" +) + def handler(event): - bucket = event.get('bucket').get('bucket') - output_prefix = event.get('bucket').get('output') - url = event.get('object').get('url') + bucket = event.get("bucket").get("bucket") + output_prefix = event.get("bucket").get("output") + url = event.get("object").get("url") name = os.path.basename(url) - download_path = '/tmp/{}'.format(name) + download_path = "/tmp/{}".format(name) process_begin = datetime.datetime.now() req = urllib.request.Request(url) - req.add_header('User-Agent', SEBS_USER_AGENT) - with open(download_path, 'wb') as f: + req.add_header("User-Agent", SEBS_USER_AGENT) + with open(download_path, "wb") as f: with urllib.request.urlopen(req) as response: f.write(response.read()) size = os.path.getsize(download_path) @@ -33,16 +36,12 @@ def handler(event): process_time = (process_end - process_begin) / datetime.timedelta(microseconds=1) upload_time = (upload_end - upload_begin) / datetime.timedelta(microseconds=1) return { - 'result': { - 'bucket': bucket, - 'url': url, - 'key': key_name - }, - 'measurement': { - 'download_time': 0, - 'download_size': 0, - 'upload_time': upload_time, - 'upload_size': size, - 'compute_time': process_time - } + "result": {"bucket": bucket, "url": url, "key": key_name}, + "measurement": { + "download_time": 0, + "download_size": 0, + "upload_time": upload_time, + "upload_size": size, + "compute_time": process_time, + }, } diff --git a/benchmarks/100.webapps/130.crud-api/input.py b/benchmarks/100.webapps/130.crud-api/input.py index c019e7e8b..44f0a945d 100644 --- a/benchmarks/100.webapps/130.crud-api/input.py +++ b/benchmarks/100.webapps/130.crud-api/input.py @@ -6,7 +6,13 @@ def allocate_nosql() -> dict: def generate_input( - data_dir, size, benchmarks_bucket, input_buckets, output_buckets, upload_func, nosql_upload + data_dir, + size, + benchmarks_bucket, + input_buckets, + output_buckets, + upload_func, + nosql_upload, ): input_config = {} diff --git a/benchmarks/100.webapps/130.crud-api/python/function.py 
b/benchmarks/100.webapps/130.crud-api/python/function.py index 0b5e0e8c0..e045eed48 100644 --- a/benchmarks/100.webapps/130.crud-api/python/function.py +++ b/benchmarks/100.webapps/130.crud-api/python/function.py @@ -52,7 +52,11 @@ def handler(event): if route == "PUT /cart": add_product( - body["cart"], body["product_id"], body["name"], body["price"], body["quantity"] + body["cart"], + body["product_id"], + body["name"], + body["price"], + body["quantity"], ) res = {} elif route == "GET /cart/{id}": diff --git a/benchmarks/200.multimedia/210.thumbnailer/input.py b/benchmarks/200.multimedia/210.thumbnailer/input.py index 8943effed..e8cad832a 100644 --- a/benchmarks/200.multimedia/210.thumbnailer/input.py +++ b/benchmarks/200.multimedia/210.thumbnailer/input.py @@ -1,9 +1,12 @@ -import glob, os +import glob +import os + def buckets_count(): return (1, 1) -''' + +""" Generate test, small and large workload for thumbnailer. :param data_dir: directory where benchmark data is placed @@ -11,19 +14,29 @@ def buckets_count(): :param input_buckets: input storage containers for this benchmark :param output_buckets: :param upload_func: upload function taking three params(bucket_idx, key, filepath) -''' -def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): +""" + + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): - for file in glob.glob(os.path.join(data_dir, '*.jpg')): + for file in glob.glob(os.path.join(data_dir, "*.jpg")): img = os.path.relpath(file, data_dir) upload_func(0, img, file) - #TODO: multiple datasets - input_config = {'object': {}, 'bucket': {}} - input_config['object']['key'] = img - input_config['object']['width'] = 200 - input_config['object']['height'] = 200 - input_config['bucket']['bucket'] = benchmarks_bucket - input_config['bucket']['input'] = input_paths[0] - input_config['bucket']['output'] = output_paths[0] + # TODO: multiple datasets + input_config = {"object": {}, "bucket": {}} + input_config["object"]["key"] = img + input_config["object"]["width"] = 200 + input_config["object"]["height"] = 200 + input_config["bucket"]["bucket"] = benchmarks_bucket + input_config["bucket"]["input"] = input_paths[0] + input_config["bucket"]["output"] = output_paths[0] return input_config diff --git a/benchmarks/200.multimedia/210.thumbnailer/python/function.py b/benchmarks/200.multimedia/210.thumbnailer/python/function.py index 20527067b..2df0a7bfb 100755 --- a/benchmarks/200.multimedia/210.thumbnailer/python/function.py +++ b/benchmarks/200.multimedia/210.thumbnailer/python/function.py @@ -1,44 +1,45 @@ import datetime import io import os -import sys -import uuid from urllib.parse import unquote_plus from PIL import Image from . 
import storage + client = storage.storage.get_instance() # Disk-based solution -#def resize_image(image_path, resized_path, w, h): +# def resize_image(image_path, resized_path, w, h): # with Image.open(image_path) as image: # image.thumbnail((w,h)) # image.save(resized_path) + # Memory-based solution def resize_image(image_bytes, w, h): with Image.open(io.BytesIO(image_bytes)) as image: - image.thumbnail((w,h)) + image.thumbnail((w, h)) out = io.BytesIO() - image.save(out, format='jpeg') + image.save(out, format="jpeg") # necessary to rewind to the beginning of the buffer out.seek(0) return out + def handler(event): - - bucket = event.get('bucket').get('bucket') - input_prefix = event.get('bucket').get('input') - output_prefix = event.get('bucket').get('output') - key = unquote_plus(event.get('object').get('key')) - width = event.get('object').get('width') - height = event.get('object').get('height') + + bucket = event.get("bucket").get("bucket") + input_prefix = event.get("bucket").get("input") + output_prefix = event.get("bucket").get("output") + key = unquote_plus(event.get("object").get("key")) + width = event.get("object").get("width") + height = event.get("object").get("height") # UUID to handle multiple calls - #download_path = '/tmp/{}-{}'.format(uuid.uuid4(), key) - #upload_path = '/tmp/resized-{}'.format(key) - #client.download(input_bucket, key, download_path) - #resize_image(download_path, upload_path, width, height) - #client.upload(output_bucket, key, upload_path) + # download_path = '/tmp/{}-{}'.format(uuid.uuid4(), key) + # upload_path = '/tmp/resized-{}'.format(key) + # client.download(input_bucket, key, download_path) + # resize_image(download_path, upload_path, width, height) + # client.upload(output_bucket, key, upload_path) download_begin = datetime.datetime.now() img = client.download_stream(bucket, os.path.join(input_prefix, key)) download_end = datetime.datetime.now() @@ -56,15 +57,12 @@ def handler(event): upload_time = (upload_end - upload_begin) / datetime.timedelta(microseconds=1) process_time = (process_end - process_begin) / datetime.timedelta(microseconds=1) return { - 'result': { - 'bucket': bucket, - 'key': key_name - }, - 'measurement': { - 'download_time': download_time, - 'download_size': len(img), - 'upload_time': upload_time, - 'upload_size': resized_size, - 'compute_time': process_time - } + "result": {"bucket": bucket, "key": key_name}, + "measurement": { + "download_time": download_time, + "download_size": len(img), + "upload_time": upload_time, + "upload_size": resized_size, + "compute_time": process_time, + }, } diff --git a/benchmarks/200.multimedia/220.video-processing-gpu/config.json b/benchmarks/200.multimedia/220.video-processing-gpu/config.json new file mode 100644 index 000000000..94ede7925 --- /dev/null +++ b/benchmarks/200.multimedia/220.video-processing-gpu/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 60, + "memory": 512, + "languages": ["python"], + "modules": ["storage"] +} diff --git a/benchmarks/200.multimedia/220.video-processing-gpu/init.sh b/benchmarks/200.multimedia/220.video-processing-gpu/init.sh new file mode 100755 index 000000000..2553852d0 --- /dev/null +++ b/benchmarks/200.multimedia/220.video-processing-gpu/init.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# Arguments required by SeBS, even if unused +DIR=$1 +VERBOSE=$2 +TARGET_ARCHITECTURE=$3 + +# This benchmark does not need any special init step. +# All dependencies (torch, opencv-python, etc.) are installed via requirements.txt. 
+exit 0 diff --git a/benchmarks/200.multimedia/220.video-processing-gpu/input.py b/benchmarks/200.multimedia/220.video-processing-gpu/input.py new file mode 100644 index 000000000..4ae479ba0 --- /dev/null +++ b/benchmarks/200.multimedia/220.video-processing-gpu/input.py @@ -0,0 +1,41 @@ +import glob +import os + + +def buckets_count(): + return (1, 1) + + +""" + Generate test, small and large workload for thumbnailer. + + :param data_dir: directory where benchmark data is placed + :param size: workload size + :param input_buckets: input storage containers for this benchmark + :param output_buckets: + :param upload_func: upload function taking three params(bucket_idx, key, filepath) +""" + + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + for file in glob.glob(os.path.join(data_dir, "*.mp4")): + img = os.path.relpath(file, data_dir) + upload_func(0, img, file) + # TODO: multiple datasets + input_config = {"object": {}, "bucket": {}} + input_config["object"]["key"] = img + # The function supports only "gpu-filter" at the moment. + input_config["object"]["op"] = "gpu-filter" + input_config["object"]["duration"] = 1 + input_config["bucket"]["bucket"] = benchmarks_bucket + input_config["bucket"]["input"] = input_paths[0] + input_config["bucket"]["output"] = output_paths[0] + return input_config diff --git a/benchmarks/200.multimedia/220.video-processing-gpu/python/function.py b/benchmarks/200.multimedia/220.video-processing-gpu/python/function.py new file mode 100755 index 000000000..4b9813ef4 --- /dev/null +++ b/benchmarks/200.multimedia/220.video-processing-gpu/python/function.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python + +import datetime +import os +import stat # can also be removed if you drop ffmpeg entirely +from typing import Dict, Any + +import numpy as np +import cv2 +import torch +import torch.nn as nn + +from . import storage + +client = storage.storage.get_instance() +SCRIPT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__))) + + +def gpu_video_filter(video_path: str, duration: float, event: Dict[str, Any]) -> str: + """ + Decode a video on CPU (OpenCV), run a heavy GPU filter with PyTorch, + and re-encode the processed video. + + This gives you a realistic FaaS workload: + - I/O via storage + - CPU video decode/encode + - GPU-heavy tensor processing + """ + + cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + raise RuntimeError(f"Could not open input video: {video_path}") + + fps = cap.get(cv2.CAP_PROP_FPS) + if not fps or fps <= 0: + fps = 25.0 # fallback + + max_frames = int(fps * duration) + frames = [] + + for i in range(max_frames): + ret, frame_bgr = cap.read() + if not ret: + break + # Convert BGR (OpenCV default) to RGB + frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB) + frames.append(frame_rgb) + + cap.release() + + if not frames: + raise RuntimeError("No frames decoded from video (empty or too short?)") + + # Stack into (T, H, W, C) + video_np = np.stack(frames, axis=0) # uint8, 0–255 + T, H, W, C = video_np.shape + + # Convert to torch tensor: (T, C, H, W), float32 in [0, 1] + video = torch.from_numpy(video_np).permute(0, 3, 1, 2).float() / 255.0 + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + video = video.to(device) + + # Simple heavy-ish GPU workload: repeated 3x3 conv + ReLU + # You can tweak num_channels, num_iters, etc. 
via the event + num_iters = event.get("object", {}).get("num_iters", 10) + num_channels = 3 # keep 3 so we can write back as RGB + + conv = nn.Conv2d( + in_channels=num_channels, + out_channels=num_channels, + kernel_size=3, + padding=1, + bias=False, + ).to(device) + + with torch.no_grad(): + for _ in range(num_iters): + video = torch.relu(conv(video)) + + # Back to uint8 on CPU: (T, H, W, C) + video = (video.clamp(0.0, 1.0) * 255.0).byte() + video_np_out = video.permute(0, 2, 3, 1).cpu().numpy() + + # Encode processed video with OpenCV (CPU) + base = os.path.splitext(os.path.basename(video_path))[0] + out_path = f"/tmp/processed-{base}.mp4" + + fourcc = cv2.VideoWriter_fourcc(*"mp4v") + writer = cv2.VideoWriter(out_path, fourcc, fps, (W, H)) + if not writer.isOpened(): + raise RuntimeError(f"Could not open VideoWriter for: {out_path}") + + for frame_rgb in video_np_out: + frame_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR) + writer.write(frame_bgr) + + writer.release() + return out_path + + +# You can still support multiple ops if you want in the future. +# For now, we map "gpu-filter" (or "transcode" if you want to reuse the old name) +operations = { + "gpu-filter": gpu_video_filter, + # If you want to keep old names: + # "transcode": gpu_video_filter, + # "watermark": gpu_video_filter, + # "extract-gif": gpu_video_filter, +} + + +def handler(event: Dict[str, Any]): + """ + FaaS entrypoint. + + Expected event structure (SeBS-style): + + { + "bucket": { + "bucket": "<bucket name>", + "input": "<input prefix>", + "output": "<output prefix>" + }, + "object": { + "key": "<video object key>", + "duration": <seconds to process>, + "op": "gpu-filter", + // optional: + // "num_iters": 20 + } + } + """ + + bucket = event.get("bucket", {}).get("bucket") + input_prefix = event.get("bucket", {}).get("input") + output_prefix = event.get("bucket", {}).get("output") + + obj = event.get("object", {}) + key = obj.get("key") + duration = obj.get("duration", 5) # default: 5 seconds + op = obj.get("op", "gpu-filter") + + if op not in operations: + raise ValueError(f"Unknown operation '{op}'. Supported: {', '.join(operations.keys())}") + + download_path = f"/tmp/{key}" + + # If you no longer ship ffmpeg/ffmpeg, you can remove this chmod block completely. + # Leaving it here is harmless if the file doesn't exist (the chmod raises OSError, which is caught and ignored). + ffmpeg_binary = os.path.join(SCRIPT_DIR, "ffmpeg", "ffmpeg") + try: + st = os.stat(ffmpeg_binary) + os.chmod(ffmpeg_binary, st.st_mode | stat.S_IEXEC) + except OSError: + # Ignore if ffmpeg is not present or filesystem is read-only. 
+ pass + + # --- Download phase --- + download_begin = datetime.datetime.now() + client.download(bucket, os.path.join(input_prefix, key), download_path) + download_size = os.path.getsize(download_path) + download_stop = datetime.datetime.now() + + # --- Compute phase (GPU via PyTorch) --- + process_begin = datetime.datetime.now() + upload_path = operations[op](download_path, duration, event) + process_end = datetime.datetime.now() + + # --- Upload phase --- + upload_begin = datetime.datetime.now() + filename = os.path.basename(upload_path) + upload_size = os.path.getsize(upload_path) + upload_key = client.upload(bucket, os.path.join(output_prefix, filename), upload_path) + upload_stop = datetime.datetime.now() + + # Convert timedeltas to microseconds + download_time = (download_stop - download_begin) / datetime.timedelta(microseconds=1) + upload_time = (upload_stop - upload_begin) / datetime.timedelta(microseconds=1) + process_time = (process_end - process_begin) / datetime.timedelta(microseconds=1) + + return { + "result": { + "bucket": bucket, + "key": upload_key, + }, + "measurement": { + "download_time": download_time, + "download_size": download_size, + "upload_time": upload_time, + "upload_size": upload_size, + "compute_time": process_time, + }, + } diff --git a/benchmarks/200.multimedia/220.video-processing-gpu/python/requirements.txt b/benchmarks/200.multimedia/220.video-processing-gpu/python/requirements.txt new file mode 100644 index 000000000..a7c9b6d20 --- /dev/null +++ b/benchmarks/200.multimedia/220.video-processing-gpu/python/requirements.txt @@ -0,0 +1,3 @@ +torch +opencv-python-headless +numpy diff --git a/benchmarks/200.multimedia/220.video-processing-gpu/resources/watermark.png b/benchmarks/200.multimedia/220.video-processing-gpu/resources/watermark.png new file mode 100755 index 000000000..32d07cedb Binary files /dev/null and b/benchmarks/200.multimedia/220.video-processing-gpu/resources/watermark.png differ diff --git a/benchmarks/200.multimedia/220.video-processing/input.py b/benchmarks/200.multimedia/220.video-processing/input.py index 6da31647f..25aad6ae4 100644 --- a/benchmarks/200.multimedia/220.video-processing/input.py +++ b/benchmarks/200.multimedia/220.video-processing/input.py @@ -1,9 +1,13 @@ -import glob, os +import glob +import os + def buckets_count(): + # one input bucket, one output bucket return (1, 1) -''' + +""" Generate test, small and large workload for thumbnailer. 
:param data_dir: directory where benchmark data is placed @@ -11,17 +15,27 @@ def buckets_count(): :param input_buckets: input storage containers for this benchmark :param output_buckets: :param upload_func: upload function taking three params(bucket_idx, key, filepath) -''' -def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): - for file in glob.glob(os.path.join(data_dir, '*.mp4')): +""" + + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + for file in glob.glob(os.path.join(data_dir, "*.mp4")): img = os.path.relpath(file, data_dir) upload_func(0, img, file) - #TODO: multiple datasets - input_config = {'object': {}, 'bucket': {}} - input_config['object']['key'] = img - input_config['object']['op'] = 'watermark' - input_config['object']['duration'] = 1 - input_config['bucket']['bucket'] = benchmarks_bucket - input_config['bucket']['input'] = input_paths[0] - input_config['bucket']['output'] = output_paths[0] + # TODO: multiple datasets + input_config = {"object": {}, "bucket": {}} + input_config["object"]["key"] = img + input_config["object"]["op"] = "watermark" + input_config["object"]["duration"] = 1 + input_config["bucket"]["bucket"] = benchmarks_bucket + input_config["bucket"]["input"] = input_paths[0] + input_config["bucket"]["output"] = output_paths[0] return input_config diff --git a/benchmarks/200.multimedia/220.video-processing/python/function.py b/benchmarks/200.multimedia/220.video-processing/python/function.py index 9f8a869aa..ab132ba2e 100755 --- a/benchmarks/200.multimedia/220.video-processing/python/function.py +++ b/benchmarks/200.multimedia/220.video-processing/python/function.py @@ -7,62 +7,84 @@ from . 
import storage + client = storage.storage.get_instance() SCRIPT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__))) + def call_ffmpeg(args): - ret = subprocess.run([os.path.join(SCRIPT_DIR, 'ffmpeg', 'ffmpeg'), '-y'] + args, - #subprocess might inherit Lambda's input for some reason - stdin=subprocess.DEVNULL, - stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ret = subprocess.run( + [os.path.join(SCRIPT_DIR, "ffmpeg", "ffmpeg"), "-y"] + args, + # subprocess might inherit Lambda's input for some reason + stdin=subprocess.DEVNULL, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, ) if ret.returncode != 0: - print('Invocation of ffmpeg failed!') - print('Out: ', ret.stdout.decode('utf-8')) + print("Invocation of ffmpeg failed!") + print("Out: ", ret.stdout.decode("utf-8")) raise RuntimeError() + # https://superuser.com/questions/556029/how-do-i-convert-a-video-to-gif-using-ffmpeg-with-reasonable-quality def to_gif(video, duration, event): - output = '/tmp/processed-{}.gif'.format(os.path.basename(video)) - call_ffmpeg(["-i", video, - "-t", - "{0}".format(duration), - "-vf", - "fps=10,scale=320:-1:flags=lanczos,split[s0][s1];[s0]palettegen[p];[s1][p]paletteuse", - "-loop", "0", - output]) + output = "/tmp/processed-{}.gif".format(os.path.basename(video)) + call_ffmpeg( + [ + "-i", + video, + "-t", + "{0}".format(duration), + "-vf", + "fps=10,scale=320:-1:flags=lanczos,split[s0][s1];[s0]palettegen[p];[s1][p]paletteuse", + "-loop", + "0", + output, + ] + ) return output + # https://devopstar.com/2019/01/28/serverless-watermark-using-aws-lambda-layers-ffmpeg/ def watermark(video, duration, event): - output = '/tmp/processed-{}'.format(os.path.basename(video)) + output = "/tmp/processed-{}".format(os.path.basename(video)) watermark_file = os.path.dirname(os.path.realpath(__file__)) - call_ffmpeg([ - "-i", video, - "-i", os.path.join(watermark_file, os.path.join('resources', 'watermark.png')), - "-t", "{0}".format(duration), - "-filter_complex", "overlay=main_w/2-overlay_w/2:main_h/2-overlay_h/2", - output]) + call_ffmpeg( + [ + "-i", + video, + "-i", + os.path.join(watermark_file, os.path.join("resources", "watermark.png")), + "-t", + "{0}".format(duration), + "-filter_complex", + "overlay=main_w/2-overlay_w/2:main_h/2-overlay_h/2", + output, + ] + ) return output + def transcode_mp3(video, duration, event): pass -operations = { 'transcode' : transcode_mp3, 'extract-gif' : to_gif, 'watermark' : watermark } + +operations = {"transcode": transcode_mp3, "extract-gif": to_gif, "watermark": watermark} + def handler(event): - bucket = event.get('bucket').get('bucket') - input_prefix = event.get('bucket').get('input') - output_prefix = event.get('bucket').get('output') - key = event.get('object').get('key') - duration = event.get('object').get('duration') - op = event.get('object').get('op') - download_path = '/tmp/{}'.format(key) + bucket = event.get("bucket").get("bucket") + input_prefix = event.get("bucket").get("input") + output_prefix = event.get("bucket").get("output") + key = event.get("object").get("key") + duration = event.get("object").get("duration") + op = event.get("object").get("op") + download_path = "/tmp/{}".format(key) # Restore executable permission - ffmpeg_binary = os.path.join(SCRIPT_DIR, 'ffmpeg', 'ffmpeg') + ffmpeg_binary = os.path.join(SCRIPT_DIR, "ffmpeg", "ffmpeg") # needed on Azure but read-only filesystem on AWS try: st = os.stat(ffmpeg_binary) @@ -89,16 +111,12 @@ def handler(event): upload_time = (upload_stop - upload_begin) / 
datetime.timedelta(microseconds=1) process_time = (process_end - process_begin) / datetime.timedelta(microseconds=1) return { - 'result': { - 'bucket': bucket, - 'key': upload_key - }, - 'measurement': { - 'download_time': download_time, - 'download_size': download_size, - 'upload_time': upload_time, - 'upload_size': upload_size, - 'compute_time': process_time - } - } - + "result": {"bucket": bucket, "key": upload_key}, + "measurement": { + "download_time": download_time, + "download_size": download_size, + "upload_time": upload_time, + "upload_size": upload_size, + "compute_time": process_time, + }, + } diff --git a/benchmarks/300.utilities/311.compression/input.py b/benchmarks/300.utilities/311.compression/input.py index 5f88bc91a..c929eda27 100644 --- a/benchmarks/300.utilities/311.compression/input.py +++ b/benchmarks/300.utilities/311.compression/input.py @@ -1,4 +1,5 @@ -import glob, os +import os + def buckets_count(): return (1, 1) @@ -9,11 +10,12 @@ def upload_files(data_root, data_dir, upload_func): for root, dirs, files in os.walk(data_dir): prefix = os.path.relpath(root, data_root) for file in files: - file_name = prefix + '/' + file + file_name = prefix + "/" + file filepath = os.path.join(root, file) upload_func(0, file_name, filepath) -''' + +""" Generate test, small and large workload for compression test. :param data_dir: directory where benchmark data is placed @@ -21,8 +23,18 @@ def upload_files(data_root, data_dir, upload_func): :param input_buckets: input storage containers for this benchmark :param output_buckets: :param upload_func: upload function taking three params(bucket_idx, key, filepath) -''' -def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): +""" + + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): # upload different datasets datasets = [] @@ -30,9 +42,9 @@ def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, datasets.append(dir) upload_files(data_dir, os.path.join(data_dir, dir), upload_func) - input_config = {'object': {}, 'bucket': {}} - input_config['object']['key'] = datasets[0] - input_config['bucket']['bucket'] = benchmarks_bucket - input_config['bucket']['input'] = input_paths[0] - input_config['bucket']['output'] = output_paths[0] + input_config = {"object": {}, "bucket": {}} + input_config["object"]["key"] = datasets[0] + input_config["bucket"]["bucket"] = benchmarks_bucket + input_config["bucket"]["input"] = input_paths[0] + input_config["bucket"]["output"] = output_paths[0] return input_config diff --git a/benchmarks/300.utilities/311.compression/python/function.py b/benchmarks/300.utilities/311.compression/python/function.py index f758e14e4..d8c8fa53f 100755 --- a/benchmarks/300.utilities/311.compression/python/function.py +++ b/benchmarks/300.utilities/311.compression/python/function.py @@ -1,13 +1,13 @@ import datetime -import io import os import shutil import uuid -import zlib from . 
import storage
+
 client = storage.storage.get_instance()
+
 
 def parse_directory(directory):
     size = 0
@@ -16,13 +16,14 @@ def parse_directory(directory):
             size += os.path.getsize(os.path.join(root, file))
     return size
 
+
 def handler(event):
-
-    bucket = event.get('bucket').get('bucket')
-    input_prefix = event.get('bucket').get('input')
-    output_prefix = event.get('bucket').get('output')
-    key = event.get('object').get('key')
-    download_path = '/tmp/{}-{}'.format(key, uuid.uuid4())
+
+    bucket = event.get("bucket").get("bucket")
+    input_prefix = event.get("bucket").get("input")
+    output_prefix = event.get("bucket").get("output")
+    key = event.get("object").get("key")
+    download_path = "/tmp/{}-{}".format(key, uuid.uuid4())
     os.makedirs(download_path)
 
     s3_download_begin = datetime.datetime.now()
@@ -31,29 +32,29 @@ def handler(event):
     size = parse_directory(download_path)
 
     compress_begin = datetime.datetime.now()
-    shutil.make_archive(os.path.join(download_path, key), 'zip', root_dir=download_path)
+    shutil.make_archive(os.path.join(download_path, key), "zip", root_dir=download_path)
     compress_end = datetime.datetime.now()
 
     s3_upload_begin = datetime.datetime.now()
-    archive_name = '{}.zip'.format(key)
+    archive_name = "{}.zip".format(key)
     archive_size = os.path.getsize(os.path.join(download_path, archive_name))
-    key_name = client.upload(bucket, os.path.join(output_prefix, archive_name), os.path.join(download_path, archive_name))
+    key_name = client.upload(
+        bucket,
+        os.path.join(output_prefix, archive_name),
+        os.path.join(download_path, archive_name),
+    )
     s3_upload_stop = datetime.datetime.now()
 
     download_time = (s3_download_stop - s3_download_begin) / datetime.timedelta(microseconds=1)
     upload_time = (s3_upload_stop - s3_upload_begin) / datetime.timedelta(microseconds=1)
     process_time = (compress_end - compress_begin) / datetime.timedelta(microseconds=1)
     return {
-        'result': {
-            'bucket': bucket,
-            'key': key_name
-        },
-        'measurement': {
-            'download_time': download_time,
-            'download_size': size,
-            'upload_time': upload_time,
-            'upload_size': archive_size,
-            'compute_time': process_time
-        }
-    }
-
+        "result": {"bucket": bucket, "key": key_name},
+        "measurement": {
+            "download_time": download_time,
+            "download_size": size,
+            "upload_time": upload_time,
+            "upload_size": archive_size,
+            "compute_time": process_time,
+        },
+    }
diff --git a/benchmarks/400.inference/411.image-recognition/input.py b/benchmarks/400.inference/411.image-recognition/input.py
index 45d7215a6..2a1332f98 100644
--- a/benchmarks/400.inference/411.image-recognition/input.py
+++ b/benchmarks/400.inference/411.image-recognition/input.py
@@ -1,18 +1,21 @@
-import glob, os
+import os
+
 
 def buckets_count():
     return (2, 0)
 
+
 def upload_files(data_root, data_dir, upload_func):
     for root, dirs, files in os.walk(data_dir):
         prefix = os.path.relpath(root, data_root)
         for file in files:
-            file_name = prefix + '/' + file
+            file_name = prefix + "/" + file
             filepath = os.path.join(root, file)
             upload_func(0, file_name, filepath)
 
-'''
+
+"""
-Generate test, small and large workload for compression test.
+Generate test, small and large workload for the image recognition benchmark.
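+Uploads the ResNet-50 weights (resnet50-19c8e357.pth) to the model bucket and the
+fake-resnet validation images to the input bucket.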
:param data_dir: directory where benchmark data is placed @@ -20,25 +23,35 @@ def upload_files(data_root, data_dir, upload_func): :param input_buckets: input storage containers for this benchmark :param output_buckets: :param upload_func: upload function taking three params(bucket_idx, key, filepath) -''' -def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): +""" + + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): # upload model - model_name = 'resnet50-19c8e357.pth' - upload_func(0, model_name, os.path.join(data_dir, 'model', model_name)) + model_name = "resnet50-19c8e357.pth" + upload_func(0, model_name, os.path.join(data_dir, "model", model_name)) input_images = [] - resnet_path = os.path.join(data_dir, 'fake-resnet') - with open(os.path.join(resnet_path, 'val_map.txt'), 'r') as f: + resnet_path = os.path.join(data_dir, "fake-resnet") + with open(os.path.join(resnet_path, "val_map.txt"), "r") as f: for line in f: img, img_class = line.split() input_images.append((img, img_class)) upload_func(1, img, os.path.join(resnet_path, img)) - - input_config = {'object': {}, 'bucket': {}} - input_config['object']['model'] = model_name - input_config['object']['input'] = input_images[0][0] - input_config['bucket']['bucket'] = benchmarks_bucket - input_config['bucket']['input'] = input_paths[1] - input_config['bucket']['model'] = input_paths[0] + + input_config = {"object": {}, "bucket": {}} + input_config["object"]["model"] = model_name + input_config["object"]["input"] = input_images[0][0] + input_config["bucket"]["bucket"] = benchmarks_bucket + input_config["bucket"]["input"] = input_paths[1] + input_config["bucket"]["model"] = input_paths[0] return input_config diff --git a/benchmarks/400.inference/411.image-recognition/python/function.py b/benchmarks/400.inference/411.image-recognition/python/function.py index 411386419..0cfa1c57f 100644 --- a/benchmarks/400.inference/411.image-recognition/python/function.py +++ b/benchmarks/400.inference/411.image-recognition/python/function.py @@ -1,14 +1,20 @@ - -import datetime, json, os, uuid +import datetime +import json +import os +import uuid # Extract zipped torch model - used in Python 3.8 and 3.9 # The reason is that torch versions supported for these Python # versions are too large for Lambda packages. -if os.path.exists('function/torch.zip'): - import zipfile, sys +if os.path.exists("function/torch.zip"): + import sys + import zipfile + # we cannot write to the read-only filesystem - zipfile.ZipFile('function/torch.zip').extractall('/tmp/') - sys.path.append(os.path.join(os.path.dirname(__file__), '/tmp/.python_packages/lib/site-packages')) + zipfile.ZipFile("function/torch.zip").extractall("/tmp/") + sys.path.append( + os.path.join(os.path.dirname(__file__), "/tmp/.python_packages/lib/site-packages") + ) from PIL import Image import torch @@ -16,21 +22,23 @@ from torchvision.models import resnet50 from . 
import storage + client = storage.storage.get_instance() SCRIPT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__))) -class_idx = json.load(open(os.path.join(SCRIPT_DIR, "imagenet_class_index.json"), 'r')) +class_idx = json.load(open(os.path.join(SCRIPT_DIR, "imagenet_class_index.json"), "r")) idx2label = [class_idx[str(k)][1] for k in range(len(class_idx))] model = None + def handler(event): - - bucket = event.get('bucket').get('bucket') - input_prefix = event.get('bucket').get('input') - model_prefix = event.get('bucket').get('model') - key = event.get('object').get('input') - model_key = event.get('object').get('model') - download_path = '/tmp/{}-{}'.format(key, uuid.uuid4()) + + bucket = event.get("bucket").get("bucket") + input_prefix = event.get("bucket").get("input") + model_prefix = event.get("bucket").get("model") + key = event.get("object").get("input") + model_key = event.get("object").get("model") + download_path = "/tmp/{}-{}".format(key, uuid.uuid4()) image_download_begin = datetime.datetime.now() image_path = download_path @@ -40,7 +48,7 @@ def handler(event): global model if not model: model_download_begin = datetime.datetime.now() - model_path = os.path.join('/tmp', model_key) + model_path = os.path.join("/tmp", model_key) client.download(bucket, os.path.join(model_prefix, model_key), model_path) model_download_end = datetime.datetime.now() model_process_begin = datetime.datetime.now() @@ -53,36 +61,38 @@ def handler(event): model_download_end = model_download_begin model_process_begin = datetime.datetime.now() model_process_end = model_process_begin - + process_begin = datetime.datetime.now() input_image = Image.open(image_path) - preprocess = transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ]) + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) input_tensor = preprocess(input_image) - input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model + input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model output = model(input_batch) _, index = torch.max(output, 1) - # The output has unnormalized scores. To get probabilities, you can run a softmax on it. 
- prob = torch.nn.functional.softmax(output[0], dim=0) - _, indices = torch.sort(output, descending = True) ret = idx2label[index] process_end = datetime.datetime.now() - download_time = (image_download_end- image_download_begin) / datetime.timedelta(microseconds=1) - model_download_time = (model_download_end - model_download_begin) / datetime.timedelta(microseconds=1) - model_process_time = (model_process_end - model_process_begin) / datetime.timedelta(microseconds=1) + download_time = (image_download_end - image_download_begin) / datetime.timedelta(microseconds=1) + model_download_time = (model_download_end - model_download_begin) / datetime.timedelta( + microseconds=1 + ) + model_process_time = (model_process_end - model_process_begin) / datetime.timedelta( + microseconds=1 + ) process_time = (process_end - process_begin) / datetime.timedelta(microseconds=1) return { - 'result': {'idx': index.item(), 'class': ret}, - 'measurement': { - 'download_time': download_time + model_download_time, - 'compute_time': process_time + model_process_time, - 'model_time': model_process_time, - 'model_download_time': model_download_time - } - } - + "result": {"idx": index.item(), "class": ret}, + "measurement": { + "download_time": download_time + model_download_time, + "compute_time": process_time + model_process_time, + "model_time": model_process_time, + "model_download_time": model_download_time, + }, + } diff --git a/benchmarks/400.inference/412.language-bert/config.json b/benchmarks/400.inference/412.language-bert/config.json new file mode 100644 index 000000000..94ede7925 --- /dev/null +++ b/benchmarks/400.inference/412.language-bert/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 60, + "memory": 512, + "languages": ["python"], + "modules": ["storage"] +} diff --git a/benchmarks/400.inference/412.language-bert/input.py b/benchmarks/400.inference/412.language-bert/input.py new file mode 100644 index 000000000..2b46ed4bd --- /dev/null +++ b/benchmarks/400.inference/412.language-bert/input.py @@ -0,0 +1,39 @@ +import os + + +def buckets_count(): + # model bucket and text bucket + return (2, 0) + + +def upload_files(data_root, data_dir, upload_func): + for root, _, files in os.walk(data_dir): + prefix = os.path.relpath(root, data_root) + for file in files: + filepath = os.path.join(root, file) + relative_key = os.path.join(prefix, file) + upload_func(0, relative_key, filepath) + + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + model_archive = "bert-tiny-onnx.tar.gz" + upload_func(0, model_archive, os.path.join(data_dir, "model", model_archive)) + + text_filename = "sentences.jsonl" + upload_func(1, text_filename, os.path.join(data_dir, "text", text_filename)) + + input_config = {"object": {}, "bucket": {}} + input_config["object"]["model"] = model_archive + input_config["object"]["input"] = text_filename + input_config["bucket"]["bucket"] = benchmarks_bucket + input_config["bucket"]["model"] = input_paths[0] + input_config["bucket"]["text"] = input_paths[1] + return input_config diff --git a/benchmarks/400.inference/412.language-bert/python/function.py b/benchmarks/400.inference/412.language-bert/python/function.py new file mode 100644 index 000000000..7e4f981ef --- /dev/null +++ b/benchmarks/400.inference/412.language-bert/python/function.py @@ -0,0 +1,157 @@ +import datetime +import json +import os +import tarfile +import uuid +from typing import Dict, List, Optional + +import numpy as np +import onnxruntime as ort +from 
tokenizers import Tokenizer + +from . import storage + +client = storage.storage.get_instance() + +MODEL_ARCHIVE = "bert-tiny-onnx.tar.gz" +MODEL_DIRECTORY = "/tmp/bert_language_model" +MODEL_SUBDIR = "bert-tiny-onnx" + +_session: Optional[ort.InferenceSession] = None +_tokenizer: Optional[Tokenizer] = None +_labels: Optional[Dict[int, str]] = None + + +def _ensure_model(bucket: str, model_prefix: str): + """ + Lazily download and initialize the ONNX model and tokenizer. + """ + global _session, _tokenizer, _labels + + model_path = os.path.join(MODEL_DIRECTORY, MODEL_SUBDIR) + model_download_begin = datetime.datetime.now() + model_download_end = model_download_begin + + if _session is None or _tokenizer is None or _labels is None: + if not os.path.exists(model_path): + os.makedirs(MODEL_DIRECTORY, exist_ok=True) + archive_path = os.path.join("/tmp", f"{uuid.uuid4()}-{MODEL_ARCHIVE}") + client.download(bucket, os.path.join(model_prefix, MODEL_ARCHIVE), archive_path) + model_download_end = datetime.datetime.now() + + with tarfile.open(archive_path, "r:gz") as tar: + tar.extractall(MODEL_DIRECTORY) + os.remove(archive_path) + else: + model_download_begin = datetime.datetime.now() + model_download_end = model_download_begin + + model_process_begin = datetime.datetime.now() + tokenizer_path = os.path.join(model_path, "tokenizer.json") + _tokenizer = Tokenizer.from_file(tokenizer_path) + _tokenizer.enable_truncation(max_length=128) + _tokenizer.enable_padding(length=128) + + label_map_path = os.path.join(model_path, "label_map.json") + with open(label_map_path, "r") as f: + raw_labels = json.load(f) + _labels = {int(idx): label for idx, label in raw_labels.items()} + + onnx_path = os.path.join(model_path, "model.onnx") + + available = ort.get_available_providers() + if "CUDAExecutionProvider" not in available: + raise RuntimeError(f"CUDAExecutionProvider unavailable (have: {available})") + + _session = ort.InferenceSession(onnx_path, providers=["CUDAExecutionProvider"]) + model_process_end = datetime.datetime.now() + else: + model_process_begin = datetime.datetime.now() + model_process_end = model_process_begin + + model_download_time = (model_download_end - model_download_begin) / datetime.timedelta( + microseconds=1 + ) + model_process_time = (model_process_end - model_process_begin) / datetime.timedelta( + microseconds=1 + ) + + return model_download_time, model_process_time + + +def _prepare_inputs(sentences: List[str]): + assert _tokenizer is not None + + encodings = _tokenizer.encode_batch(sentences) + + input_ids = np.array([enc.ids for enc in encodings], dtype=np.int64) + attention_mask = np.array([enc.attention_mask for enc in encodings], dtype=np.int64) + token_type_ids = np.array( + [enc.type_ids if enc.type_ids else [0] * len(enc.ids) for enc in encodings], + dtype=np.int64, + ) + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + } + + +def _softmax(logits: np.ndarray) -> np.ndarray: + shifted = logits - np.max(logits, axis=1, keepdims=True) + exp = np.exp(shifted) + return exp / np.sum(exp, axis=1, keepdims=True) + + +def handler(event): + bucket = event.get("bucket", {}).get("bucket") + model_prefix = event.get("bucket", {}).get("model") + text_prefix = event.get("bucket", {}).get("text") + text_key = event.get("object", {}).get("input") + + download_begin = datetime.datetime.now() + text_download_path = os.path.join("/tmp", f"{uuid.uuid4()}-{os.path.basename(text_key)}") + client.download(bucket, 
os.path.join(text_prefix, text_key), text_download_path) + download_end = datetime.datetime.now() + + model_download_time, model_process_time = _ensure_model(bucket, model_prefix) + assert _session is not None and _labels is not None and _tokenizer is not None + + with open(text_download_path, "r") as f: + sentences = [json.loads(line)["text"] for line in f if line.strip()] + + os.remove(text_download_path) + + inference_begin = datetime.datetime.now() + inputs = _prepare_inputs(sentences) + outputs = _session.run(None, inputs) + logits = outputs[0] + probabilities = _softmax(logits) + inference_end = datetime.datetime.now() + + results = [] + for sentence, probs in zip(sentences, probabilities): + label_idx = int(np.argmax(probs)) + label = _labels.get(label_idx, str(label_idx)) + results.append( + { + "text": sentence, + "label": label, + "confidence": float(probs[label_idx]), + "raw_scores": probs.tolist(), + } + ) + + download_time = (download_end - download_begin) / datetime.timedelta(microseconds=1) + compute_time = (inference_end - inference_begin) / datetime.timedelta(microseconds=1) + + return { + "result": {"predictions": results}, + "measurement": { + "download_time": download_time + model_download_time, + "compute_time": compute_time + model_process_time, + "model_time": model_process_time, + "model_download_time": model_download_time, + }, + } diff --git a/benchmarks/400.inference/412.language-bert/python/init.sh b/benchmarks/400.inference/412.language-bert/python/init.sh new file mode 100755 index 000000000..160852abe --- /dev/null +++ b/benchmarks/400.inference/412.language-bert/python/init.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +# No additional initialization required for the BERT inference benchmark. diff --git a/benchmarks/400.inference/412.language-bert/python/package.sh b/benchmarks/400.inference/412.language-bert/python/package.sh new file mode 100644 index 000000000..edb27ebe0 --- /dev/null +++ b/benchmarks/400.inference/412.language-bert/python/package.sh @@ -0,0 +1,35 @@ +# Stripping package code is based on https://github.com/ryfeus/lambda-packs repo + +PACKAGE_DIR=$1 +echo "Original size $(du -sh $1 | cut -f1)" + +CUR_DIR=$(pwd) +cd $1 +# cleaning libs +rm -rf external +find . -type d -name "tests" -exec rm -rf {} + +find . -type d -name "test" -exec rm -rf {} + +find . -type d -name "bin" -not -path "*/torch/*" -exec rm -rf {} + + +# cleaning +# stripping some of the numpy libs - libgfortran-2e0d59d6.so.5.0.0 - causes issues on Azure +find -name "*.so" -not -path "*/PIL/*" -not -path "*/Pillow.libs/*" -not -path "*libgfortran*" | xargs strip +find -name "*.so.*" -not -path "*/PIL/*" -not -path "*/Pillow.libs/*" -not -path "*libgfortran*" | xargs strip + +rm -r pip >/dev/null +rm -r pip-* >/dev/null +rm -r wheel >/dev/null +rm -r wheel-* >/dev/null +rm easy_install.py >/dev/null +find . 
-name \*.pyc -delete
+cd ${CUR_DIR}
+echo "Stripped size $(du -sh $1 | cut -f1)"
+
+TORCH_DIR=".python_packages/lib/site-packages/torch"
+if [ -d "$1/${TORCH_DIR}" ]; then
+    cd $1
+    zip -qr torch.zip ${TORCH_DIR}
+    rm -rf ${TORCH_DIR}
+    cd ${CUR_DIR}
+    echo "Torch-zipped size $(du -sh $1 | cut -f1)"
+fi
diff --git a/benchmarks/400.inference/412.language-bert/python/requirements.txt b/benchmarks/400.inference/412.language-bert/python/requirements.txt
new file mode 100644
index 000000000..67a8c1e18
--- /dev/null
+++ b/benchmarks/400.inference/412.language-bert/python/requirements.txt
@@ -0,0 +1,3 @@
+numpy==1.24.4
+onnxruntime-gpu==1.16.3
+tokenizers==0.13.3
diff --git a/benchmarks/400.inference/412.language-bert/python/requirements.txt.3.10 b/benchmarks/400.inference/412.language-bert/python/requirements.txt.3.10
new file mode 100644
index 000000000..67a8c1e18
--- /dev/null
+++ b/benchmarks/400.inference/412.language-bert/python/requirements.txt.3.10
@@ -0,0 +1,3 @@
+numpy==1.24.4
+onnxruntime-gpu==1.16.3
+tokenizers==0.13.3
diff --git a/benchmarks/400.inference/412.language-bert/python/requirements.txt.3.11 b/benchmarks/400.inference/412.language-bert/python/requirements.txt.3.11
new file mode 100644
index 000000000..67a8c1e18
--- /dev/null
+++ b/benchmarks/400.inference/412.language-bert/python/requirements.txt.3.11
@@ -0,0 +1,3 @@
+numpy==1.24.4
+onnxruntime-gpu==1.16.3
+tokenizers==0.13.3
diff --git a/benchmarks/400.inference/412.language-bert/python/requirements.txt.3.8 b/benchmarks/400.inference/412.language-bert/python/requirements.txt.3.8
new file mode 100644
index 000000000..67a8c1e18
--- /dev/null
+++ b/benchmarks/400.inference/412.language-bert/python/requirements.txt.3.8
@@ -0,0 +1,3 @@
+numpy==1.24.4
+onnxruntime-gpu==1.16.3
+tokenizers==0.13.3
diff --git a/benchmarks/400.inference/412.language-bert/python/requirements.txt.3.9 b/benchmarks/400.inference/412.language-bert/python/requirements.txt.3.9
new file mode 100644
index 000000000..67a8c1e18
--- /dev/null
+++ b/benchmarks/400.inference/412.language-bert/python/requirements.txt.3.9
@@ -0,0 +1,3 @@
+numpy==1.24.4
+onnxruntime-gpu==1.16.3
+tokenizers==0.13.3
diff --git a/benchmarks/400.inference/413.image-classification/config.json b/benchmarks/400.inference/413.image-classification/config.json
new file mode 100644
index 000000000..94ede7925
--- /dev/null
+++ b/benchmarks/400.inference/413.image-classification/config.json
@@ -0,0 +1,6 @@
+{
+  "timeout": 60,
+  "memory": 512,
+  "languages": ["python"],
+  "modules": ["storage"]
+}
diff --git a/benchmarks/400.inference/413.image-classification/input.py b/benchmarks/400.inference/413.image-classification/input.py
new file mode 100644
index 000000000..6c88f8d64
--- /dev/null
+++ b/benchmarks/400.inference/413.image-classification/input.py
@@ -0,0 +1,57 @@
+import os
+
+
+def buckets_count():
+    return (2, 0)
+
+
+def upload_files(data_root, data_dir, upload_func):
+
+    for root, dirs, files in os.walk(data_dir):
+        prefix = os.path.relpath(root, data_root)
+        for file in files:
+            file_name = prefix + "/" + file
+            filepath = os.path.join(root, file)
+            upload_func(0, file_name, filepath)
+
+
+"""
+    Generate test, small and large workload for the image classification benchmark.
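+    Uploads the resnet50.tar.gz ONNX archive to the model bucket and the validation
+    images listed in val_map.txt to the image input bucket.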
+
+    :param data_dir: directory where benchmark data is placed
+    :param size: workload size
+    :param benchmarks_bucket: bucket name hosting the benchmark data
+    :param input_paths: input storage prefixes for this benchmark
+    :param output_paths: output storage prefixes (unused here)
+    :param upload_func: upload function taking three params (bucket_idx, key, filepath)
+    :param nosql_func: NoSQL upload function (unused here)
+"""
+
+
+def generate_input(
+    data_dir,
+    size,
+    benchmarks_bucket,
+    input_paths,
+    output_paths,
+    upload_func,
+    nosql_func,
+):
+
+    # upload model
+    model_name = "resnet50.tar.gz"
+    upload_func(0, model_name, os.path.join(data_dir, "model", model_name))
+
+    input_images = []
+    resnet_path = os.path.join(data_dir, "data")
+    with open(os.path.join(resnet_path, "val_map.txt"), "r") as f:
+        for line in f:
+            img, img_class = line.split()
+            input_images.append((img, img_class))
+            upload_func(1, img, os.path.join(resnet_path, img))
+
+    input_config = {"object": {}, "bucket": {}}
+    input_config["object"]["model"] = model_name
+    input_config["object"]["input"] = input_images[0][0]
+    input_config["bucket"]["bucket"] = benchmarks_bucket
+    input_config["bucket"]["input"] = input_paths[1]
+    input_config["bucket"]["model"] = input_paths[0]
+    return input_config
diff --git a/benchmarks/400.inference/413.image-classification/python/function.py b/benchmarks/400.inference/413.image-classification/python/function.py
new file mode 100644
index 000000000..64795612d
--- /dev/null
+++ b/benchmarks/400.inference/413.image-classification/python/function.py
@@ -0,0 +1,178 @@
+import datetime
+import json
+import os
+import shutil
+import tarfile
+import uuid
+from typing import List, Optional, Tuple
+
+import numpy as np
+import onnxruntime as ort
+from PIL import Image
+
+from . import storage
+
+client = storage.storage.get_instance()
+
+SCRIPT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__)))
+class_idx = json.load(open(os.path.join(SCRIPT_DIR, "imagenet_class_index.json"), "r"))
+idx2label = [class_idx[str(k)][1] for k in range(len(class_idx))]
+
+MODEL_ARCHIVE = "resnet50.tar.gz"
+MODEL_DIRECTORY = "/tmp/image_classification_model"
+MODEL_SUBDIR = "resnet50"
+
+_session: Optional[ort.InferenceSession] = None
+_session_input: Optional[str] = None
+_session_output: Optional[str] = None
+_cached_model_key: Optional[str] = None
+
+_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
+_STD = np.array([0.229, 0.224, 0.225], dtype=np.float32)
+
+
+def _ensure_model(bucket: str, model_prefix: str, model_key: str) -> Tuple[float, float]:
+    """
+    Lazily download, extract, and initialize the ONNX ResNet model.
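+
+    Returns a (model_download_time, model_process_time) tuple, measured in
+    microseconds to match the other SeBS measurement fields.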
+ """ + global _session, _session_input, _session_output, _cached_model_key + + effective_model_key = model_key or MODEL_ARCHIVE + model_download_begin = datetime.datetime.now() + model_download_end = model_download_begin + + if _session is None or _cached_model_key != effective_model_key: + archive_basename = os.path.basename(effective_model_key) + archive_path = os.path.join("/tmp", f"{uuid.uuid4()}-{archive_basename}") + model_dir = os.path.join(MODEL_DIRECTORY, MODEL_SUBDIR) + + if os.path.exists(model_dir): + shutil.rmtree(model_dir) + os.makedirs(MODEL_DIRECTORY, exist_ok=True) + + client.download(bucket, os.path.join(model_prefix, effective_model_key), archive_path) + model_download_end = datetime.datetime.now() + + with tarfile.open(archive_path, "r:gz") as tar: + tar.extractall(MODEL_DIRECTORY) + os.remove(archive_path) + + model_process_begin = datetime.datetime.now() + onnx_path = os.path.join(model_dir, "model.onnx") + if not os.path.exists(onnx_path): + raise FileNotFoundError(f"Expected ONNX model at {onnx_path}") + + available = ort.get_available_providers() + if "CUDAExecutionProvider" not in available: + raise RuntimeError(f"CUDAExecutionProvider unavailable (providers: {available})") + + _session = ort.InferenceSession(onnx_path, providers=["CUDAExecutionProvider"]) + _session_input = _session.get_inputs()[0].name + _session_output = _session.get_outputs()[0].name + _cached_model_key = effective_model_key + model_process_end = datetime.datetime.now() + else: + model_process_begin = datetime.datetime.now() + model_process_end = model_process_begin + + model_download_time = (model_download_end - model_download_begin) / datetime.timedelta( + microseconds=1 + ) + model_process_time = (model_process_end - model_process_begin) / datetime.timedelta( + microseconds=1 + ) + + return model_download_time, model_process_time + + +def _resize_shorter_side(image: Image.Image, size: int) -> Image.Image: + width, height = image.size + if width < height: + new_width = size + new_height = int(round(size * height / width)) + else: + new_height = size + new_width = int(round(size * width / height)) + resample = getattr(Image, "Resampling", Image).BILINEAR + return image.resize((new_width, new_height), resample=resample) + + +def _center_crop(image: Image.Image, size: int) -> Image.Image: + width, height = image.size + left = max(0, int(round((width - size) / 2))) + top = max(0, int(round((height - size) / 2))) + right = left + size + bottom = top + size + return image.crop((left, top, right, bottom)) + + +def _prepare_tensor(image_path: str) -> np.ndarray: + image = Image.open(image_path).convert("RGB") + image = _resize_shorter_side(image, 256) + image = _center_crop(image, 224) + + np_image = np.asarray(image).astype(np.float32) / 255.0 + np_image = (np_image - _MEAN) / _STD + np_image = np.transpose(np_image, (2, 0, 1)) + return np_image[np.newaxis, :] + + +def _softmax(logits: np.ndarray) -> np.ndarray: + shifted = logits - np.max(logits, axis=1, keepdims=True) + exp = np.exp(shifted) + return exp / np.sum(exp, axis=1, keepdims=True) + + +def _run_inference(batch: np.ndarray) -> Tuple[int, float, List[int]]: + assert _session is not None and _session_input is not None and _session_output is not None + + outputs = _session.run([_session_output], {_session_input: batch}) + logits = outputs[0] + probs = _softmax(logits) + top1_idx = int(np.argmax(probs, axis=1)[0]) + top1_conf = float(probs[0, top1_idx]) + top5_idx = np.argsort(probs[0])[::-1][:5].tolist() + + return top1_idx, top1_conf, 
top5_idx + + +def handler(event): + bucket = event.get("bucket", {}).get("bucket") + input_prefix = event.get("bucket", {}).get("input") + model_prefix = event.get("bucket", {}).get("model") + key = event.get("object", {}).get("input") + model_key = event.get("object", {}).get("model") + + download_path = os.path.join("/tmp", f"{uuid.uuid4()}-{os.path.basename(key)}") + image_download_begin = datetime.datetime.now() + client.download(bucket, os.path.join(input_prefix, key), download_path) + image_download_end = datetime.datetime.now() + + model_download_time, model_process_time = _ensure_model(bucket, model_prefix, model_key) + + inference_begin = datetime.datetime.now() + input_batch = _prepare_tensor(download_path) + top1_idx, top1_conf, top5_idx = _run_inference(input_batch) + inference_end = datetime.datetime.now() + + os.remove(download_path) + + download_time = (image_download_end - image_download_begin) / datetime.timedelta(microseconds=1) + compute_time = (inference_end - inference_begin) / datetime.timedelta(microseconds=1) + # gpu_time_ms = 0.0 + + return { + "result": { + "idx": top1_idx, + "class": idx2label[top1_idx], + "confidence": top1_conf, + "top5_idx": top5_idx, + }, + "measurement": { + "download_time": download_time + model_download_time, + "compute_time": compute_time + model_process_time, + "model_time": model_process_time, + "model_download_time": model_download_time, + # "gpu_time_ms": round(gpu_time_ms, 3), + }, + } diff --git a/benchmarks/400.inference/413.image-classification/python/imagenet_class_index.json b/benchmarks/400.inference/413.image-classification/python/imagenet_class_index.json new file mode 100755 index 000000000..5fe0dfefc --- /dev/null +++ b/benchmarks/400.inference/413.image-classification/python/imagenet_class_index.json @@ -0,0 +1 @@ +{"0": ["n01440764", "tench"], "1": ["n01443537", "goldfish"], "2": ["n01484850", "great_white_shark"], "3": ["n01491361", "tiger_shark"], "4": ["n01494475", "hammerhead"], "5": ["n01496331", "electric_ray"], "6": ["n01498041", "stingray"], "7": ["n01514668", "cock"], "8": ["n01514859", "hen"], "9": ["n01518878", "ostrich"], "10": ["n01530575", "brambling"], "11": ["n01531178", "goldfinch"], "12": ["n01532829", "house_finch"], "13": ["n01534433", "junco"], "14": ["n01537544", "indigo_bunting"], "15": ["n01558993", "robin"], "16": ["n01560419", "bulbul"], "17": ["n01580077", "jay"], "18": ["n01582220", "magpie"], "19": ["n01592084", "chickadee"], "20": ["n01601694", "water_ouzel"], "21": ["n01608432", "kite"], "22": ["n01614925", "bald_eagle"], "23": ["n01616318", "vulture"], "24": ["n01622779", "great_grey_owl"], "25": ["n01629819", "European_fire_salamander"], "26": ["n01630670", "common_newt"], "27": ["n01631663", "eft"], "28": ["n01632458", "spotted_salamander"], "29": ["n01632777", "axolotl"], "30": ["n01641577", "bullfrog"], "31": ["n01644373", "tree_frog"], "32": ["n01644900", "tailed_frog"], "33": ["n01664065", "loggerhead"], "34": ["n01665541", "leatherback_turtle"], "35": ["n01667114", "mud_turtle"], "36": ["n01667778", "terrapin"], "37": ["n01669191", "box_turtle"], "38": ["n01675722", "banded_gecko"], "39": ["n01677366", "common_iguana"], "40": ["n01682714", "American_chameleon"], "41": ["n01685808", "whiptail"], "42": ["n01687978", "agama"], "43": ["n01688243", "frilled_lizard"], "44": ["n01689811", "alligator_lizard"], "45": ["n01692333", "Gila_monster"], "46": ["n01693334", "green_lizard"], "47": ["n01694178", "African_chameleon"], "48": ["n01695060", "Komodo_dragon"], "49": ["n01697457", 
"African_crocodile"], "50": ["n01698640", "American_alligator"], "51": ["n01704323", "triceratops"], "52": ["n01728572", "thunder_snake"], "53": ["n01728920", "ringneck_snake"], "54": ["n01729322", "hognose_snake"], "55": ["n01729977", "green_snake"], "56": ["n01734418", "king_snake"], "57": ["n01735189", "garter_snake"], "58": ["n01737021", "water_snake"], "59": ["n01739381", "vine_snake"], "60": ["n01740131", "night_snake"], "61": ["n01742172", "boa_constrictor"], "62": ["n01744401", "rock_python"], "63": ["n01748264", "Indian_cobra"], "64": ["n01749939", "green_mamba"], "65": ["n01751748", "sea_snake"], "66": ["n01753488", "horned_viper"], "67": ["n01755581", "diamondback"], "68": ["n01756291", "sidewinder"], "69": ["n01768244", "trilobite"], "70": ["n01770081", "harvestman"], "71": ["n01770393", "scorpion"], "72": ["n01773157", "black_and_gold_garden_spider"], "73": ["n01773549", "barn_spider"], "74": ["n01773797", "garden_spider"], "75": ["n01774384", "black_widow"], "76": ["n01774750", "tarantula"], "77": ["n01775062", "wolf_spider"], "78": ["n01776313", "tick"], "79": ["n01784675", "centipede"], "80": ["n01795545", "black_grouse"], "81": ["n01796340", "ptarmigan"], "82": ["n01797886", "ruffed_grouse"], "83": ["n01798484", "prairie_chicken"], "84": ["n01806143", "peacock"], "85": ["n01806567", "quail"], "86": ["n01807496", "partridge"], "87": ["n01817953", "African_grey"], "88": ["n01818515", "macaw"], "89": ["n01819313", "sulphur-crested_cockatoo"], "90": ["n01820546", "lorikeet"], "91": ["n01824575", "coucal"], "92": ["n01828970", "bee_eater"], "93": ["n01829413", "hornbill"], "94": ["n01833805", "hummingbird"], "95": ["n01843065", "jacamar"], "96": ["n01843383", "toucan"], "97": ["n01847000", "drake"], "98": ["n01855032", "red-breasted_merganser"], "99": ["n01855672", "goose"], "100": ["n01860187", "black_swan"], "101": ["n01871265", "tusker"], "102": ["n01872401", "echidna"], "103": ["n01873310", "platypus"], "104": ["n01877812", "wallaby"], "105": ["n01882714", "koala"], "106": ["n01883070", "wombat"], "107": ["n01910747", "jellyfish"], "108": ["n01914609", "sea_anemone"], "109": ["n01917289", "brain_coral"], "110": ["n01924916", "flatworm"], "111": ["n01930112", "nematode"], "112": ["n01943899", "conch"], "113": ["n01944390", "snail"], "114": ["n01945685", "slug"], "115": ["n01950731", "sea_slug"], "116": ["n01955084", "chiton"], "117": ["n01968897", "chambered_nautilus"], "118": ["n01978287", "Dungeness_crab"], "119": ["n01978455", "rock_crab"], "120": ["n01980166", "fiddler_crab"], "121": ["n01981276", "king_crab"], "122": ["n01983481", "American_lobster"], "123": ["n01984695", "spiny_lobster"], "124": ["n01985128", "crayfish"], "125": ["n01986214", "hermit_crab"], "126": ["n01990800", "isopod"], "127": ["n02002556", "white_stork"], "128": ["n02002724", "black_stork"], "129": ["n02006656", "spoonbill"], "130": ["n02007558", "flamingo"], "131": ["n02009229", "little_blue_heron"], "132": ["n02009912", "American_egret"], "133": ["n02011460", "bittern"], "134": ["n02012849", "crane"], "135": ["n02013706", "limpkin"], "136": ["n02017213", "European_gallinule"], "137": ["n02018207", "American_coot"], "138": ["n02018795", "bustard"], "139": ["n02025239", "ruddy_turnstone"], "140": ["n02027492", "red-backed_sandpiper"], "141": ["n02028035", "redshank"], "142": ["n02033041", "dowitcher"], "143": ["n02037110", "oystercatcher"], "144": ["n02051845", "pelican"], "145": ["n02056570", "king_penguin"], "146": ["n02058221", "albatross"], "147": ["n02066245", "grey_whale"], "148": 
["n02071294", "killer_whale"], "149": ["n02074367", "dugong"], "150": ["n02077923", "sea_lion"], "151": ["n02085620", "Chihuahua"], "152": ["n02085782", "Japanese_spaniel"], "153": ["n02085936", "Maltese_dog"], "154": ["n02086079", "Pekinese"], "155": ["n02086240", "Shih-Tzu"], "156": ["n02086646", "Blenheim_spaniel"], "157": ["n02086910", "papillon"], "158": ["n02087046", "toy_terrier"], "159": ["n02087394", "Rhodesian_ridgeback"], "160": ["n02088094", "Afghan_hound"], "161": ["n02088238", "basset"], "162": ["n02088364", "beagle"], "163": ["n02088466", "bloodhound"], "164": ["n02088632", "bluetick"], "165": ["n02089078", "black-and-tan_coonhound"], "166": ["n02089867", "Walker_hound"], "167": ["n02089973", "English_foxhound"], "168": ["n02090379", "redbone"], "169": ["n02090622", "borzoi"], "170": ["n02090721", "Irish_wolfhound"], "171": ["n02091032", "Italian_greyhound"], "172": ["n02091134", "whippet"], "173": ["n02091244", "Ibizan_hound"], "174": ["n02091467", "Norwegian_elkhound"], "175": ["n02091635", "otterhound"], "176": ["n02091831", "Saluki"], "177": ["n02092002", "Scottish_deerhound"], "178": ["n02092339", "Weimaraner"], "179": ["n02093256", "Staffordshire_bullterrier"], "180": ["n02093428", "American_Staffordshire_terrier"], "181": ["n02093647", "Bedlington_terrier"], "182": ["n02093754", "Border_terrier"], "183": ["n02093859", "Kerry_blue_terrier"], "184": ["n02093991", "Irish_terrier"], "185": ["n02094114", "Norfolk_terrier"], "186": ["n02094258", "Norwich_terrier"], "187": ["n02094433", "Yorkshire_terrier"], "188": ["n02095314", "wire-haired_fox_terrier"], "189": ["n02095570", "Lakeland_terrier"], "190": ["n02095889", "Sealyham_terrier"], "191": ["n02096051", "Airedale"], "192": ["n02096177", "cairn"], "193": ["n02096294", "Australian_terrier"], "194": ["n02096437", "Dandie_Dinmont"], "195": ["n02096585", "Boston_bull"], "196": ["n02097047", "miniature_schnauzer"], "197": ["n02097130", "giant_schnauzer"], "198": ["n02097209", "standard_schnauzer"], "199": ["n02097298", "Scotch_terrier"], "200": ["n02097474", "Tibetan_terrier"], "201": ["n02097658", "silky_terrier"], "202": ["n02098105", "soft-coated_wheaten_terrier"], "203": ["n02098286", "West_Highland_white_terrier"], "204": ["n02098413", "Lhasa"], "205": ["n02099267", "flat-coated_retriever"], "206": ["n02099429", "curly-coated_retriever"], "207": ["n02099601", "golden_retriever"], "208": ["n02099712", "Labrador_retriever"], "209": ["n02099849", "Chesapeake_Bay_retriever"], "210": ["n02100236", "German_short-haired_pointer"], "211": ["n02100583", "vizsla"], "212": ["n02100735", "English_setter"], "213": ["n02100877", "Irish_setter"], "214": ["n02101006", "Gordon_setter"], "215": ["n02101388", "Brittany_spaniel"], "216": ["n02101556", "clumber"], "217": ["n02102040", "English_springer"], "218": ["n02102177", "Welsh_springer_spaniel"], "219": ["n02102318", "cocker_spaniel"], "220": ["n02102480", "Sussex_spaniel"], "221": ["n02102973", "Irish_water_spaniel"], "222": ["n02104029", "kuvasz"], "223": ["n02104365", "schipperke"], "224": ["n02105056", "groenendael"], "225": ["n02105162", "malinois"], "226": ["n02105251", "briard"], "227": ["n02105412", "kelpie"], "228": ["n02105505", "komondor"], "229": ["n02105641", "Old_English_sheepdog"], "230": ["n02105855", "Shetland_sheepdog"], "231": ["n02106030", "collie"], "232": ["n02106166", "Border_collie"], "233": ["n02106382", "Bouvier_des_Flandres"], "234": ["n02106550", "Rottweiler"], "235": ["n02106662", "German_shepherd"], "236": ["n02107142", "Doberman"], "237": ["n02107312", 
"miniature_pinscher"], "238": ["n02107574", "Greater_Swiss_Mountain_dog"], "239": ["n02107683", "Bernese_mountain_dog"], "240": ["n02107908", "Appenzeller"], "241": ["n02108000", "EntleBucher"], "242": ["n02108089", "boxer"], "243": ["n02108422", "bull_mastiff"], "244": ["n02108551", "Tibetan_mastiff"], "245": ["n02108915", "French_bulldog"], "246": ["n02109047", "Great_Dane"], "247": ["n02109525", "Saint_Bernard"], "248": ["n02109961", "Eskimo_dog"], "249": ["n02110063", "malamute"], "250": ["n02110185", "Siberian_husky"], "251": ["n02110341", "dalmatian"], "252": ["n02110627", "affenpinscher"], "253": ["n02110806", "basenji"], "254": ["n02110958", "pug"], "255": ["n02111129", "Leonberg"], "256": ["n02111277", "Newfoundland"], "257": ["n02111500", "Great_Pyrenees"], "258": ["n02111889", "Samoyed"], "259": ["n02112018", "Pomeranian"], "260": ["n02112137", "chow"], "261": ["n02112350", "keeshond"], "262": ["n02112706", "Brabancon_griffon"], "263": ["n02113023", "Pembroke"], "264": ["n02113186", "Cardigan"], "265": ["n02113624", "toy_poodle"], "266": ["n02113712", "miniature_poodle"], "267": ["n02113799", "standard_poodle"], "268": ["n02113978", "Mexican_hairless"], "269": ["n02114367", "timber_wolf"], "270": ["n02114548", "white_wolf"], "271": ["n02114712", "red_wolf"], "272": ["n02114855", "coyote"], "273": ["n02115641", "dingo"], "274": ["n02115913", "dhole"], "275": ["n02116738", "African_hunting_dog"], "276": ["n02117135", "hyena"], "277": ["n02119022", "red_fox"], "278": ["n02119789", "kit_fox"], "279": ["n02120079", "Arctic_fox"], "280": ["n02120505", "grey_fox"], "281": ["n02123045", "tabby"], "282": ["n02123159", "tiger_cat"], "283": ["n02123394", "Persian_cat"], "284": ["n02123597", "Siamese_cat"], "285": ["n02124075", "Egyptian_cat"], "286": ["n02125311", "cougar"], "287": ["n02127052", "lynx"], "288": ["n02128385", "leopard"], "289": ["n02128757", "snow_leopard"], "290": ["n02128925", "jaguar"], "291": ["n02129165", "lion"], "292": ["n02129604", "tiger"], "293": ["n02130308", "cheetah"], "294": ["n02132136", "brown_bear"], "295": ["n02133161", "American_black_bear"], "296": ["n02134084", "ice_bear"], "297": ["n02134418", "sloth_bear"], "298": ["n02137549", "mongoose"], "299": ["n02138441", "meerkat"], "300": ["n02165105", "tiger_beetle"], "301": ["n02165456", "ladybug"], "302": ["n02167151", "ground_beetle"], "303": ["n02168699", "long-horned_beetle"], "304": ["n02169497", "leaf_beetle"], "305": ["n02172182", "dung_beetle"], "306": ["n02174001", "rhinoceros_beetle"], "307": ["n02177972", "weevil"], "308": ["n02190166", "fly"], "309": ["n02206856", "bee"], "310": ["n02219486", "ant"], "311": ["n02226429", "grasshopper"], "312": ["n02229544", "cricket"], "313": ["n02231487", "walking_stick"], "314": ["n02233338", "cockroach"], "315": ["n02236044", "mantis"], "316": ["n02256656", "cicada"], "317": ["n02259212", "leafhopper"], "318": ["n02264363", "lacewing"], "319": ["n02268443", "dragonfly"], "320": ["n02268853", "damselfly"], "321": ["n02276258", "admiral"], "322": ["n02277742", "ringlet"], "323": ["n02279972", "monarch"], "324": ["n02280649", "cabbage_butterfly"], "325": ["n02281406", "sulphur_butterfly"], "326": ["n02281787", "lycaenid"], "327": ["n02317335", "starfish"], "328": ["n02319095", "sea_urchin"], "329": ["n02321529", "sea_cucumber"], "330": ["n02325366", "wood_rabbit"], "331": ["n02326432", "hare"], "332": ["n02328150", "Angora"], "333": ["n02342885", "hamster"], "334": ["n02346627", "porcupine"], "335": ["n02356798", "fox_squirrel"], "336": ["n02361337", "marmot"], 
"337": ["n02363005", "beaver"], "338": ["n02364673", "guinea_pig"], "339": ["n02389026", "sorrel"], "340": ["n02391049", "zebra"], "341": ["n02395406", "hog"], "342": ["n02396427", "wild_boar"], "343": ["n02397096", "warthog"], "344": ["n02398521", "hippopotamus"], "345": ["n02403003", "ox"], "346": ["n02408429", "water_buffalo"], "347": ["n02410509", "bison"], "348": ["n02412080", "ram"], "349": ["n02415577", "bighorn"], "350": ["n02417914", "ibex"], "351": ["n02422106", "hartebeest"], "352": ["n02422699", "impala"], "353": ["n02423022", "gazelle"], "354": ["n02437312", "Arabian_camel"], "355": ["n02437616", "llama"], "356": ["n02441942", "weasel"], "357": ["n02442845", "mink"], "358": ["n02443114", "polecat"], "359": ["n02443484", "black-footed_ferret"], "360": ["n02444819", "otter"], "361": ["n02445715", "skunk"], "362": ["n02447366", "badger"], "363": ["n02454379", "armadillo"], "364": ["n02457408", "three-toed_sloth"], "365": ["n02480495", "orangutan"], "366": ["n02480855", "gorilla"], "367": ["n02481823", "chimpanzee"], "368": ["n02483362", "gibbon"], "369": ["n02483708", "siamang"], "370": ["n02484975", "guenon"], "371": ["n02486261", "patas"], "372": ["n02486410", "baboon"], "373": ["n02487347", "macaque"], "374": ["n02488291", "langur"], "375": ["n02488702", "colobus"], "376": ["n02489166", "proboscis_monkey"], "377": ["n02490219", "marmoset"], "378": ["n02492035", "capuchin"], "379": ["n02492660", "howler_monkey"], "380": ["n02493509", "titi"], "381": ["n02493793", "spider_monkey"], "382": ["n02494079", "squirrel_monkey"], "383": ["n02497673", "Madagascar_cat"], "384": ["n02500267", "indri"], "385": ["n02504013", "Indian_elephant"], "386": ["n02504458", "African_elephant"], "387": ["n02509815", "lesser_panda"], "388": ["n02510455", "giant_panda"], "389": ["n02514041", "barracouta"], "390": ["n02526121", "eel"], "391": ["n02536864", "coho"], "392": ["n02606052", "rock_beauty"], "393": ["n02607072", "anemone_fish"], "394": ["n02640242", "sturgeon"], "395": ["n02641379", "gar"], "396": ["n02643566", "lionfish"], "397": ["n02655020", "puffer"], "398": ["n02666196", "abacus"], "399": ["n02667093", "abaya"], "400": ["n02669723", "academic_gown"], "401": ["n02672831", "accordion"], "402": ["n02676566", "acoustic_guitar"], "403": ["n02687172", "aircraft_carrier"], "404": ["n02690373", "airliner"], "405": ["n02692877", "airship"], "406": ["n02699494", "altar"], "407": ["n02701002", "ambulance"], "408": ["n02704792", "amphibian"], "409": ["n02708093", "analog_clock"], "410": ["n02727426", "apiary"], "411": ["n02730930", "apron"], "412": ["n02747177", "ashcan"], "413": ["n02749479", "assault_rifle"], "414": ["n02769748", "backpack"], "415": ["n02776631", "bakery"], "416": ["n02777292", "balance_beam"], "417": ["n02782093", "balloon"], "418": ["n02783161", "ballpoint"], "419": ["n02786058", "Band_Aid"], "420": ["n02787622", "banjo"], "421": ["n02788148", "bannister"], "422": ["n02790996", "barbell"], "423": ["n02791124", "barber_chair"], "424": ["n02791270", "barbershop"], "425": ["n02793495", "barn"], "426": ["n02794156", "barometer"], "427": ["n02795169", "barrel"], "428": ["n02797295", "barrow"], "429": ["n02799071", "baseball"], "430": ["n02802426", "basketball"], "431": ["n02804414", "bassinet"], "432": ["n02804610", "bassoon"], "433": ["n02807133", "bathing_cap"], "434": ["n02808304", "bath_towel"], "435": ["n02808440", "bathtub"], "436": ["n02814533", "beach_wagon"], "437": ["n02814860", "beacon"], "438": ["n02815834", "beaker"], "439": ["n02817516", "bearskin"], "440": ["n02823428", 
"beer_bottle"], "441": ["n02823750", "beer_glass"], "442": ["n02825657", "bell_cote"], "443": ["n02834397", "bib"], "444": ["n02835271", "bicycle-built-for-two"], "445": ["n02837789", "bikini"], "446": ["n02840245", "binder"], "447": ["n02841315", "binoculars"], "448": ["n02843684", "birdhouse"], "449": ["n02859443", "boathouse"], "450": ["n02860847", "bobsled"], "451": ["n02865351", "bolo_tie"], "452": ["n02869837", "bonnet"], "453": ["n02870880", "bookcase"], "454": ["n02871525", "bookshop"], "455": ["n02877765", "bottlecap"], "456": ["n02879718", "bow"], "457": ["n02883205", "bow_tie"], "458": ["n02892201", "brass"], "459": ["n02892767", "brassiere"], "460": ["n02894605", "breakwater"], "461": ["n02895154", "breastplate"], "462": ["n02906734", "broom"], "463": ["n02909870", "bucket"], "464": ["n02910353", "buckle"], "465": ["n02916936", "bulletproof_vest"], "466": ["n02917067", "bullet_train"], "467": ["n02927161", "butcher_shop"], "468": ["n02930766", "cab"], "469": ["n02939185", "caldron"], "470": ["n02948072", "candle"], "471": ["n02950826", "cannon"], "472": ["n02951358", "canoe"], "473": ["n02951585", "can_opener"], "474": ["n02963159", "cardigan"], "475": ["n02965783", "car_mirror"], "476": ["n02966193", "carousel"], "477": ["n02966687", "carpenter's_kit"], "478": ["n02971356", "carton"], "479": ["n02974003", "car_wheel"], "480": ["n02977058", "cash_machine"], "481": ["n02978881", "cassette"], "482": ["n02979186", "cassette_player"], "483": ["n02980441", "castle"], "484": ["n02981792", "catamaran"], "485": ["n02988304", "CD_player"], "486": ["n02992211", "cello"], "487": ["n02992529", "cellular_telephone"], "488": ["n02999410", "chain"], "489": ["n03000134", "chainlink_fence"], "490": ["n03000247", "chain_mail"], "491": ["n03000684", "chain_saw"], "492": ["n03014705", "chest"], "493": ["n03016953", "chiffonier"], "494": ["n03017168", "chime"], "495": ["n03018349", "china_cabinet"], "496": ["n03026506", "Christmas_stocking"], "497": ["n03028079", "church"], "498": ["n03032252", "cinema"], "499": ["n03041632", "cleaver"], "500": ["n03042490", "cliff_dwelling"], "501": ["n03045698", "cloak"], "502": ["n03047690", "clog"], "503": ["n03062245", "cocktail_shaker"], "504": ["n03063599", "coffee_mug"], "505": ["n03063689", "coffeepot"], "506": ["n03065424", "coil"], "507": ["n03075370", "combination_lock"], "508": ["n03085013", "computer_keyboard"], "509": ["n03089624", "confectionery"], "510": ["n03095699", "container_ship"], "511": ["n03100240", "convertible"], "512": ["n03109150", "corkscrew"], "513": ["n03110669", "cornet"], "514": ["n03124043", "cowboy_boot"], "515": ["n03124170", "cowboy_hat"], "516": ["n03125729", "cradle"], "517": ["n03126707", "crane"], "518": ["n03127747", "crash_helmet"], "519": ["n03127925", "crate"], "520": ["n03131574", "crib"], "521": ["n03133878", "Crock_Pot"], "522": ["n03134739", "croquet_ball"], "523": ["n03141823", "crutch"], "524": ["n03146219", "cuirass"], "525": ["n03160309", "dam"], "526": ["n03179701", "desk"], "527": ["n03180011", "desktop_computer"], "528": ["n03187595", "dial_telephone"], "529": ["n03188531", "diaper"], "530": ["n03196217", "digital_clock"], "531": ["n03197337", "digital_watch"], "532": ["n03201208", "dining_table"], "533": ["n03207743", "dishrag"], "534": ["n03207941", "dishwasher"], "535": ["n03208938", "disk_brake"], "536": ["n03216828", "dock"], "537": ["n03218198", "dogsled"], "538": ["n03220513", "dome"], "539": ["n03223299", "doormat"], "540": ["n03240683", "drilling_platform"], "541": ["n03249569", "drum"], "542": 
["n03250847", "drumstick"], "543": ["n03255030", "dumbbell"], "544": ["n03259280", "Dutch_oven"], "545": ["n03271574", "electric_fan"], "546": ["n03272010", "electric_guitar"], "547": ["n03272562", "electric_locomotive"], "548": ["n03290653", "entertainment_center"], "549": ["n03291819", "envelope"], "550": ["n03297495", "espresso_maker"], "551": ["n03314780", "face_powder"], "552": ["n03325584", "feather_boa"], "553": ["n03337140", "file"], "554": ["n03344393", "fireboat"], "555": ["n03345487", "fire_engine"], "556": ["n03347037", "fire_screen"], "557": ["n03355925", "flagpole"], "558": ["n03372029", "flute"], "559": ["n03376595", "folding_chair"], "560": ["n03379051", "football_helmet"], "561": ["n03384352", "forklift"], "562": ["n03388043", "fountain"], "563": ["n03388183", "fountain_pen"], "564": ["n03388549", "four-poster"], "565": ["n03393912", "freight_car"], "566": ["n03394916", "French_horn"], "567": ["n03400231", "frying_pan"], "568": ["n03404251", "fur_coat"], "569": ["n03417042", "garbage_truck"], "570": ["n03424325", "gasmask"], "571": ["n03425413", "gas_pump"], "572": ["n03443371", "goblet"], "573": ["n03444034", "go-kart"], "574": ["n03445777", "golf_ball"], "575": ["n03445924", "golfcart"], "576": ["n03447447", "gondola"], "577": ["n03447721", "gong"], "578": ["n03450230", "gown"], "579": ["n03452741", "grand_piano"], "580": ["n03457902", "greenhouse"], "581": ["n03459775", "grille"], "582": ["n03461385", "grocery_store"], "583": ["n03467068", "guillotine"], "584": ["n03476684", "hair_slide"], "585": ["n03476991", "hair_spray"], "586": ["n03478589", "half_track"], "587": ["n03481172", "hammer"], "588": ["n03482405", "hamper"], "589": ["n03483316", "hand_blower"], "590": ["n03485407", "hand-held_computer"], "591": ["n03485794", "handkerchief"], "592": ["n03492542", "hard_disc"], "593": ["n03494278", "harmonica"], "594": ["n03495258", "harp"], "595": ["n03496892", "harvester"], "596": ["n03498962", "hatchet"], "597": ["n03527444", "holster"], "598": ["n03529860", "home_theater"], "599": ["n03530642", "honeycomb"], "600": ["n03532672", "hook"], "601": ["n03534580", "hoopskirt"], "602": ["n03535780", "horizontal_bar"], "603": ["n03538406", "horse_cart"], "604": ["n03544143", "hourglass"], "605": ["n03584254", "iPod"], "606": ["n03584829", "iron"], "607": ["n03590841", "jack-o'-lantern"], "608": ["n03594734", "jean"], "609": ["n03594945", "jeep"], "610": ["n03595614", "jersey"], "611": ["n03598930", "jigsaw_puzzle"], "612": ["n03599486", "jinrikisha"], "613": ["n03602883", "joystick"], "614": ["n03617480", "kimono"], "615": ["n03623198", "knee_pad"], "616": ["n03627232", "knot"], "617": ["n03630383", "lab_coat"], "618": ["n03633091", "ladle"], "619": ["n03637318", "lampshade"], "620": ["n03642806", "laptop"], "621": ["n03649909", "lawn_mower"], "622": ["n03657121", "lens_cap"], "623": ["n03658185", "letter_opener"], "624": ["n03661043", "library"], "625": ["n03662601", "lifeboat"], "626": ["n03666591", "lighter"], "627": ["n03670208", "limousine"], "628": ["n03673027", "liner"], "629": ["n03676483", "lipstick"], "630": ["n03680355", "Loafer"], "631": ["n03690938", "lotion"], "632": ["n03691459", "loudspeaker"], "633": ["n03692522", "loupe"], "634": ["n03697007", "lumbermill"], "635": ["n03706229", "magnetic_compass"], "636": ["n03709823", "mailbag"], "637": ["n03710193", "mailbox"], "638": ["n03710637", "maillot"], "639": ["n03710721", "maillot"], "640": ["n03717622", "manhole_cover"], "641": ["n03720891", "maraca"], "642": ["n03721384", "marimba"], "643": ["n03724870", "mask"], 
"644": ["n03729826", "matchstick"], "645": ["n03733131", "maypole"], "646": ["n03733281", "maze"], "647": ["n03733805", "measuring_cup"], "648": ["n03742115", "medicine_chest"], "649": ["n03743016", "megalith"], "650": ["n03759954", "microphone"], "651": ["n03761084", "microwave"], "652": ["n03763968", "military_uniform"], "653": ["n03764736", "milk_can"], "654": ["n03769881", "minibus"], "655": ["n03770439", "miniskirt"], "656": ["n03770679", "minivan"], "657": ["n03773504", "missile"], "658": ["n03775071", "mitten"], "659": ["n03775546", "mixing_bowl"], "660": ["n03776460", "mobile_home"], "661": ["n03777568", "Model_T"], "662": ["n03777754", "modem"], "663": ["n03781244", "monastery"], "664": ["n03782006", "monitor"], "665": ["n03785016", "moped"], "666": ["n03786901", "mortar"], "667": ["n03787032", "mortarboard"], "668": ["n03788195", "mosque"], "669": ["n03788365", "mosquito_net"], "670": ["n03791053", "motor_scooter"], "671": ["n03792782", "mountain_bike"], "672": ["n03792972", "mountain_tent"], "673": ["n03793489", "mouse"], "674": ["n03794056", "mousetrap"], "675": ["n03796401", "moving_van"], "676": ["n03803284", "muzzle"], "677": ["n03804744", "nail"], "678": ["n03814639", "neck_brace"], "679": ["n03814906", "necklace"], "680": ["n03825788", "nipple"], "681": ["n03832673", "notebook"], "682": ["n03837869", "obelisk"], "683": ["n03838899", "oboe"], "684": ["n03840681", "ocarina"], "685": ["n03841143", "odometer"], "686": ["n03843555", "oil_filter"], "687": ["n03854065", "organ"], "688": ["n03857828", "oscilloscope"], "689": ["n03866082", "overskirt"], "690": ["n03868242", "oxcart"], "691": ["n03868863", "oxygen_mask"], "692": ["n03871628", "packet"], "693": ["n03873416", "paddle"], "694": ["n03874293", "paddlewheel"], "695": ["n03874599", "padlock"], "696": ["n03876231", "paintbrush"], "697": ["n03877472", "pajama"], "698": ["n03877845", "palace"], "699": ["n03884397", "panpipe"], "700": ["n03887697", "paper_towel"], "701": ["n03888257", "parachute"], "702": ["n03888605", "parallel_bars"], "703": ["n03891251", "park_bench"], "704": ["n03891332", "parking_meter"], "705": ["n03895866", "passenger_car"], "706": ["n03899768", "patio"], "707": ["n03902125", "pay-phone"], "708": ["n03903868", "pedestal"], "709": ["n03908618", "pencil_box"], "710": ["n03908714", "pencil_sharpener"], "711": ["n03916031", "perfume"], "712": ["n03920288", "Petri_dish"], "713": ["n03924679", "photocopier"], "714": ["n03929660", "pick"], "715": ["n03929855", "pickelhaube"], "716": ["n03930313", "picket_fence"], "717": ["n03930630", "pickup"], "718": ["n03933933", "pier"], "719": ["n03935335", "piggy_bank"], "720": ["n03937543", "pill_bottle"], "721": ["n03938244", "pillow"], "722": ["n03942813", "ping-pong_ball"], "723": ["n03944341", "pinwheel"], "724": ["n03947888", "pirate"], "725": ["n03950228", "pitcher"], "726": ["n03954731", "plane"], "727": ["n03956157", "planetarium"], "728": ["n03958227", "plastic_bag"], "729": ["n03961711", "plate_rack"], "730": ["n03967562", "plow"], "731": ["n03970156", "plunger"], "732": ["n03976467", "Polaroid_camera"], "733": ["n03976657", "pole"], "734": ["n03977966", "police_van"], "735": ["n03980874", "poncho"], "736": ["n03982430", "pool_table"], "737": ["n03983396", "pop_bottle"], "738": ["n03991062", "pot"], "739": ["n03992509", "potter's_wheel"], "740": ["n03995372", "power_drill"], "741": ["n03998194", "prayer_rug"], "742": ["n04004767", "printer"], "743": ["n04005630", "prison"], "744": ["n04008634", "projectile"], "745": ["n04009552", "projector"], "746": 
["n04019541", "puck"], "747": ["n04023962", "punching_bag"], "748": ["n04026417", "purse"], "749": ["n04033901", "quill"], "750": ["n04033995", "quilt"], "751": ["n04037443", "racer"], "752": ["n04039381", "racket"], "753": ["n04040759", "radiator"], "754": ["n04041544", "radio"], "755": ["n04044716", "radio_telescope"], "756": ["n04049303", "rain_barrel"], "757": ["n04065272", "recreational_vehicle"], "758": ["n04067472", "reel"], "759": ["n04069434", "reflex_camera"], "760": ["n04070727", "refrigerator"], "761": ["n04074963", "remote_control"], "762": ["n04081281", "restaurant"], "763": ["n04086273", "revolver"], "764": ["n04090263", "rifle"], "765": ["n04099969", "rocking_chair"], "766": ["n04111531", "rotisserie"], "767": ["n04116512", "rubber_eraser"], "768": ["n04118538", "rugby_ball"], "769": ["n04118776", "rule"], "770": ["n04120489", "running_shoe"], "771": ["n04125021", "safe"], "772": ["n04127249", "safety_pin"], "773": ["n04131690", "saltshaker"], "774": ["n04133789", "sandal"], "775": ["n04136333", "sarong"], "776": ["n04141076", "sax"], "777": ["n04141327", "scabbard"], "778": ["n04141975", "scale"], "779": ["n04146614", "school_bus"], "780": ["n04147183", "schooner"], "781": ["n04149813", "scoreboard"], "782": ["n04152593", "screen"], "783": ["n04153751", "screw"], "784": ["n04154565", "screwdriver"], "785": ["n04162706", "seat_belt"], "786": ["n04179913", "sewing_machine"], "787": ["n04192698", "shield"], "788": ["n04200800", "shoe_shop"], "789": ["n04201297", "shoji"], "790": ["n04204238", "shopping_basket"], "791": ["n04204347", "shopping_cart"], "792": ["n04208210", "shovel"], "793": ["n04209133", "shower_cap"], "794": ["n04209239", "shower_curtain"], "795": ["n04228054", "ski"], "796": ["n04229816", "ski_mask"], "797": ["n04235860", "sleeping_bag"], "798": ["n04238763", "slide_rule"], "799": ["n04239074", "sliding_door"], "800": ["n04243546", "slot"], "801": ["n04251144", "snorkel"], "802": ["n04252077", "snowmobile"], "803": ["n04252225", "snowplow"], "804": ["n04254120", "soap_dispenser"], "805": ["n04254680", "soccer_ball"], "806": ["n04254777", "sock"], "807": ["n04258138", "solar_dish"], "808": ["n04259630", "sombrero"], "809": ["n04263257", "soup_bowl"], "810": ["n04264628", "space_bar"], "811": ["n04265275", "space_heater"], "812": ["n04266014", "space_shuttle"], "813": ["n04270147", "spatula"], "814": ["n04273569", "speedboat"], "815": ["n04275548", "spider_web"], "816": ["n04277352", "spindle"], "817": ["n04285008", "sports_car"], "818": ["n04286575", "spotlight"], "819": ["n04296562", "stage"], "820": ["n04310018", "steam_locomotive"], "821": ["n04311004", "steel_arch_bridge"], "822": ["n04311174", "steel_drum"], "823": ["n04317175", "stethoscope"], "824": ["n04325704", "stole"], "825": ["n04326547", "stone_wall"], "826": ["n04328186", "stopwatch"], "827": ["n04330267", "stove"], "828": ["n04332243", "strainer"], "829": ["n04335435", "streetcar"], "830": ["n04336792", "stretcher"], "831": ["n04344873", "studio_couch"], "832": ["n04346328", "stupa"], "833": ["n04347754", "submarine"], "834": ["n04350905", "suit"], "835": ["n04355338", "sundial"], "836": ["n04355933", "sunglass"], "837": ["n04356056", "sunglasses"], "838": ["n04357314", "sunscreen"], "839": ["n04366367", "suspension_bridge"], "840": ["n04367480", "swab"], "841": ["n04370456", "sweatshirt"], "842": ["n04371430", "swimming_trunks"], "843": ["n04371774", "swing"], "844": ["n04372370", "switch"], "845": ["n04376876", "syringe"], "846": ["n04380533", "table_lamp"], "847": ["n04389033", "tank"], 
"848": ["n04392985", "tape_player"], "849": ["n04398044", "teapot"], "850": ["n04399382", "teddy"], "851": ["n04404412", "television"], "852": ["n04409515", "tennis_ball"], "853": ["n04417672", "thatch"], "854": ["n04418357", "theater_curtain"], "855": ["n04423845", "thimble"], "856": ["n04428191", "thresher"], "857": ["n04429376", "throne"], "858": ["n04435653", "tile_roof"], "859": ["n04442312", "toaster"], "860": ["n04443257", "tobacco_shop"], "861": ["n04447861", "toilet_seat"], "862": ["n04456115", "torch"], "863": ["n04458633", "totem_pole"], "864": ["n04461696", "tow_truck"], "865": ["n04462240", "toyshop"], "866": ["n04465501", "tractor"], "867": ["n04467665", "trailer_truck"], "868": ["n04476259", "tray"], "869": ["n04479046", "trench_coat"], "870": ["n04482393", "tricycle"], "871": ["n04483307", "trimaran"], "872": ["n04485082", "tripod"], "873": ["n04486054", "triumphal_arch"], "874": ["n04487081", "trolleybus"], "875": ["n04487394", "trombone"], "876": ["n04493381", "tub"], "877": ["n04501370", "turnstile"], "878": ["n04505470", "typewriter_keyboard"], "879": ["n04507155", "umbrella"], "880": ["n04509417", "unicycle"], "881": ["n04515003", "upright"], "882": ["n04517823", "vacuum"], "883": ["n04522168", "vase"], "884": ["n04523525", "vault"], "885": ["n04525038", "velvet"], "886": ["n04525305", "vending_machine"], "887": ["n04532106", "vestment"], "888": ["n04532670", "viaduct"], "889": ["n04536866", "violin"], "890": ["n04540053", "volleyball"], "891": ["n04542943", "waffle_iron"], "892": ["n04548280", "wall_clock"], "893": ["n04548362", "wallet"], "894": ["n04550184", "wardrobe"], "895": ["n04552348", "warplane"], "896": ["n04553703", "washbasin"], "897": ["n04554684", "washer"], "898": ["n04557648", "water_bottle"], "899": ["n04560804", "water_jug"], "900": ["n04562935", "water_tower"], "901": ["n04579145", "whiskey_jug"], "902": ["n04579432", "whistle"], "903": ["n04584207", "wig"], "904": ["n04589890", "window_screen"], "905": ["n04590129", "window_shade"], "906": ["n04591157", "Windsor_tie"], "907": ["n04591713", "wine_bottle"], "908": ["n04592741", "wing"], "909": ["n04596742", "wok"], "910": ["n04597913", "wooden_spoon"], "911": ["n04599235", "wool"], "912": ["n04604644", "worm_fence"], "913": ["n04606251", "wreck"], "914": ["n04612504", "yawl"], "915": ["n04613696", "yurt"], "916": ["n06359193", "web_site"], "917": ["n06596364", "comic_book"], "918": ["n06785654", "crossword_puzzle"], "919": ["n06794110", "street_sign"], "920": ["n06874185", "traffic_light"], "921": ["n07248320", "book_jacket"], "922": ["n07565083", "menu"], "923": ["n07579787", "plate"], "924": ["n07583066", "guacamole"], "925": ["n07584110", "consomme"], "926": ["n07590611", "hot_pot"], "927": ["n07613480", "trifle"], "928": ["n07614500", "ice_cream"], "929": ["n07615774", "ice_lolly"], "930": ["n07684084", "French_loaf"], "931": ["n07693725", "bagel"], "932": ["n07695742", "pretzel"], "933": ["n07697313", "cheeseburger"], "934": ["n07697537", "hotdog"], "935": ["n07711569", "mashed_potato"], "936": ["n07714571", "head_cabbage"], "937": ["n07714990", "broccoli"], "938": ["n07715103", "cauliflower"], "939": ["n07716358", "zucchini"], "940": ["n07716906", "spaghetti_squash"], "941": ["n07717410", "acorn_squash"], "942": ["n07717556", "butternut_squash"], "943": ["n07718472", "cucumber"], "944": ["n07718747", "artichoke"], "945": ["n07720875", "bell_pepper"], "946": ["n07730033", "cardoon"], "947": ["n07734744", "mushroom"], "948": ["n07742313", "Granny_Smith"], "949": ["n07745940", "strawberry"], 
"950": ["n07747607", "orange"], "951": ["n07749582", "lemon"], "952": ["n07753113", "fig"], "953": ["n07753275", "pineapple"], "954": ["n07753592", "banana"], "955": ["n07754684", "jackfruit"], "956": ["n07760859", "custard_apple"], "957": ["n07768694", "pomegranate"], "958": ["n07802026", "hay"], "959": ["n07831146", "carbonara"], "960": ["n07836838", "chocolate_sauce"], "961": ["n07860988", "dough"], "962": ["n07871810", "meat_loaf"], "963": ["n07873807", "pizza"], "964": ["n07875152", "potpie"], "965": ["n07880968", "burrito"], "966": ["n07892512", "red_wine"], "967": ["n07920052", "espresso"], "968": ["n07930864", "cup"], "969": ["n07932039", "eggnog"], "970": ["n09193705", "alp"], "971": ["n09229709", "bubble"], "972": ["n09246464", "cliff"], "973": ["n09256479", "coral_reef"], "974": ["n09288635", "geyser"], "975": ["n09332890", "lakeside"], "976": ["n09399592", "promontory"], "977": ["n09421951", "sandbar"], "978": ["n09428293", "seashore"], "979": ["n09468604", "valley"], "980": ["n09472597", "volcano"], "981": ["n09835506", "ballplayer"], "982": ["n10148035", "groom"], "983": ["n10565667", "scuba_diver"], "984": ["n11879895", "rapeseed"], "985": ["n11939491", "daisy"], "986": ["n12057211", "yellow_lady's_slipper"], "987": ["n12144580", "corn"], "988": ["n12267677", "acorn"], "989": ["n12620546", "hip"], "990": ["n12768682", "buckeye"], "991": ["n12985857", "coral_fungus"], "992": ["n12998815", "agaric"], "993": ["n13037406", "gyromitra"], "994": ["n13040303", "stinkhorn"], "995": ["n13044778", "earthstar"], "996": ["n13052670", "hen-of-the-woods"], "997": ["n13054560", "bolete"], "998": ["n13133613", "ear"], "999": ["n15075141", "toilet_tissue"]} \ No newline at end of file diff --git a/benchmarks/400.inference/413.image-classification/python/init.sh b/benchmarks/400.inference/413.image-classification/python/init.sh new file mode 100755 index 000000000..71a2e39c0 --- /dev/null +++ b/benchmarks/400.inference/413.image-classification/python/init.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +DIR=$1 +VERBOSE=$2 +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +path="${SCRIPT_DIR}/imagenet_class_index.json" +if [ "$VERBOSE" = true ]; then + echo "Update ${DIR} with json ${path}" +fi +cp ${path} ${DIR} diff --git a/benchmarks/400.inference/413.image-classification/python/package.sh b/benchmarks/400.inference/413.image-classification/python/package.sh new file mode 100644 index 000000000..038fac7c5 --- /dev/null +++ b/benchmarks/400.inference/413.image-classification/python/package.sh @@ -0,0 +1,32 @@ +# Stripping package code is based on https://github.com/ryfeus/lambda-packs repo + +PACKAGE_DIR=$1 +echo "Original size $(du -sh $1 | cut -f1)" + +CUR_DIR=$(pwd) +cd $1 +# cleaning libs +rm -rf external +find . -type d -name "tests" -exec rm -rf {} + +find . -type d -name "test" -exec rm -rf {} + +find . -type d -name "bin" -not -path "*/torch/*" -exec rm -rf {} + + +# cleaning +# stripping some of the numpy libs - libgfortran-2e0d59d6.so.5.0.0 - causes issues on Azure +find -name "*.so" -not -path "*/PIL/*" -not -path "*/Pillow.libs/*" -not -path "*libgfortran*" | xargs strip +find -name "*.so.*" -not -path "*/PIL/*" -not -path "*/Pillow.libs/*" -not -path "*libgfortran*" | xargs strip + +rm -r pip >/dev/null +rm -r pip-* >/dev/null +rm -r wheel >/dev/null +rm -r wheel-* >/dev/null +rm easy_install.py >/dev/null +find . 
-name \*.pyc -delete +cd ${CUR_DIR} +echo "Stripped size $(du -sh $1 | cut -f1)" + +if ([[ "${PLATFORM}" == "AWS" ]] || [[ "${PLATFORM}" == "GCP" ]]) && ([[ "${PYTHON_VERSION}" == "3.8" ]] || [[ "${PYTHON_VERSION}" == "3.9" ]]); then + zip -qr torch.zip $1/torch + rm -rf $1/torch + echo "Torch-zipped size $(du -sh ${CUR_DIR} | cut -f1)" +fi diff --git a/benchmarks/400.inference/413.image-classification/python/requirements.txt b/benchmarks/400.inference/413.image-classification/python/requirements.txt new file mode 100755 index 000000000..01d9a45b4 --- /dev/null +++ b/benchmarks/400.inference/413.image-classification/python/requirements.txt @@ -0,0 +1,5 @@ +numpy>=1.22,<2.0 +pillow>=9.5,<10.0 +torch==2.4.1 +torchvision==0.19.1 +typing-extensions>=4.8 diff --git a/benchmarks/400.inference/413.image-classification/python/requirements.txt.3.10 b/benchmarks/400.inference/413.image-classification/python/requirements.txt.3.10 new file mode 100644 index 000000000..96299cb57 --- /dev/null +++ b/benchmarks/400.inference/413.image-classification/python/requirements.txt.3.10 @@ -0,0 +1,4 @@ +numpy>=2.0 +pillow>=10.0 +torch==2.5.1 +torchvision==0.20.1 diff --git a/benchmarks/400.inference/413.image-classification/python/requirements.txt.3.11 b/benchmarks/400.inference/413.image-classification/python/requirements.txt.3.11 new file mode 100644 index 000000000..96299cb57 --- /dev/null +++ b/benchmarks/400.inference/413.image-classification/python/requirements.txt.3.11 @@ -0,0 +1,4 @@ +numpy>=2.0 +pillow>=10.0 +torch==2.5.1 +torchvision==0.20.1 diff --git a/benchmarks/400.inference/413.image-classification/python/requirements.txt.3.12 b/benchmarks/400.inference/413.image-classification/python/requirements.txt.3.12 new file mode 100644 index 000000000..96299cb57 --- /dev/null +++ b/benchmarks/400.inference/413.image-classification/python/requirements.txt.3.12 @@ -0,0 +1,4 @@ +numpy>=2.0 +pillow>=10.0 +torch==2.5.1 +torchvision==0.20.1 diff --git a/benchmarks/400.inference/413.image-classification/python/requirements.txt.3.8 b/benchmarks/400.inference/413.image-classification/python/requirements.txt.3.8 new file mode 100755 index 000000000..01d9a45b4 --- /dev/null +++ b/benchmarks/400.inference/413.image-classification/python/requirements.txt.3.8 @@ -0,0 +1,5 @@ +numpy>=1.22,<2.0 +pillow>=9.5,<10.0 +torch==2.4.1 +torchvision==0.19.1 +typing-extensions>=4.8 diff --git a/benchmarks/400.inference/413.image-classification/python/requirements.txt.3.9 b/benchmarks/400.inference/413.image-classification/python/requirements.txt.3.9 new file mode 100755 index 000000000..96299cb57 --- /dev/null +++ b/benchmarks/400.inference/413.image-classification/python/requirements.txt.3.9 @@ -0,0 +1,4 @@ +numpy>=2.0 +pillow>=10.0 +torch==2.5.1 +torchvision==0.20.1 diff --git a/benchmarks/400.inference/413.image-classification/python/requirements.txt.arm.3.8 b/benchmarks/400.inference/413.image-classification/python/requirements.txt.arm.3.8 new file mode 100644 index 000000000..01d9a45b4 --- /dev/null +++ b/benchmarks/400.inference/413.image-classification/python/requirements.txt.arm.3.8 @@ -0,0 +1,5 @@ +numpy>=1.22,<2.0 +pillow>=9.5,<10.0 +torch==2.4.1 +torchvision==0.19.1 +typing-extensions>=4.8 diff --git a/benchmarks/400.inference/413.image-classification/python/requirements.txt.arm.3.9 b/benchmarks/400.inference/413.image-classification/python/requirements.txt.arm.3.9 new file mode 100644 index 000000000..96299cb57 --- /dev/null +++ b/benchmarks/400.inference/413.image-classification/python/requirements.txt.arm.3.9 @@ -0,0 
+1,4 @@ +numpy>=2.0 +pillow>=10.0 +torch==2.5.1 +torchvision==0.20.1 diff --git a/benchmarks/400.inference/413.recommendation/config.json b/benchmarks/400.inference/413.recommendation/config.json new file mode 100644 index 000000000..649bb78d6 --- /dev/null +++ b/benchmarks/400.inference/413.recommendation/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 60, + "memory": 1024, + "languages": ["python"], + "modules": ["storage"] +} diff --git a/benchmarks/400.inference/413.recommendation/input.py b/benchmarks/400.inference/413.recommendation/input.py new file mode 100644 index 000000000..b810b2293 --- /dev/null +++ b/benchmarks/400.inference/413.recommendation/input.py @@ -0,0 +1,36 @@ +import os + + +def buckets_count(): + return (2, 0) + + +def upload_files(data_root, data_dir, upload_func): + for root, _, files in os.walk(data_dir): + prefix = os.path.relpath(root, data_root) + for file in files: + upload_func(0, os.path.join(prefix, file), os.path.join(root, file)) + + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + model_file = "dlrm_tiny.pt" + upload_func(0, model_file, os.path.join(data_dir, "model", model_file)) + + requests_file = "requests.jsonl" + upload_func(1, requests_file, os.path.join(data_dir, "data", requests_file)) + + cfg = {"object": {}, "bucket": {}} + cfg["object"]["model"] = model_file + cfg["object"]["requests"] = requests_file + cfg["bucket"]["bucket"] = benchmarks_bucket + cfg["bucket"]["model"] = input_paths[0] + cfg["bucket"]["requests"] = input_paths[1] + return cfg diff --git a/benchmarks/400.inference/413.recommendation/python/function.py b/benchmarks/400.inference/413.recommendation/python/function.py new file mode 100644 index 000000000..ec6a8455f --- /dev/null +++ b/benchmarks/400.inference/413.recommendation/python/function.py @@ -0,0 +1,146 @@ +import datetime +import json +import os +import uuid + +import torch +import torch.nn as nn + +from . 
import storage
+
+client = storage.storage.get_instance()
+
+MODEL_FILE = "dlrm_tiny.pt"
+MODEL_CACHE = "/tmp/dlrm_gpu_model"
+
+_model = None
+_device = torch.device("cpu")
+
+
+class TinyDLRM(nn.Module):
+    def __init__(self, num_users, num_items, num_categories, embed_dim=8):
+        super().__init__()
+        self.user_emb = nn.Embedding(num_users, embed_dim)
+        self.item_emb = nn.Embedding(num_items, embed_dim)
+        self.category_emb = nn.Embedding(num_categories, embed_dim)
+        in_dim = embed_dim * 3 + 2
+        hidden = 16
+        self.mlp = nn.Sequential(
+            nn.Linear(in_dim, hidden),
+            nn.ReLU(),
+            nn.Linear(hidden, 1),
+        )
+
+    def forward(self, user_id, item_id, category_id, dense):
+        features = torch.cat(
+            [
+                self.user_emb(user_id),
+                self.item_emb(item_id),
+                self.category_emb(category_id),
+                dense,
+            ],
+            dim=-1,
+        )
+        return torch.sigmoid(self.mlp(features))
+
+
+def _select_device():
+    # GPU benchmark: fail fast instead of silently measuring on CPU.
+    if torch.cuda.is_available():
+        return torch.device("cuda")
+    raise RuntimeError("CUDA is not available")
+
+
+def _load_model(bucket, prefix):
+    global _model, _device
+
+    # The model is cached in a module-level global, so repeated invocations
+    # of a warm container skip both the download and the deserialization.
+    if _model is not None:
+        return 0.0, 0.0
+
+    download_begin = datetime.datetime.now()
+    os.makedirs(MODEL_CACHE, exist_ok=True)
+    tmp_path = os.path.join("/tmp", f"{uuid.uuid4()}-{MODEL_FILE}")
+    client.download(bucket, os.path.join(prefix, MODEL_FILE), tmp_path)
+    download_end = datetime.datetime.now()
+
+    process_begin = datetime.datetime.now()
+    checkpoint = torch.load(tmp_path, map_location="cpu")
+    meta = checkpoint["meta"]
+    _device = _select_device()
+    model = TinyDLRM(
+        meta["num_users"], meta["num_items"], meta["num_categories"], meta["embed_dim"]
+    )
+    model.load_state_dict(checkpoint["state_dict"])
+    model.to(_device)
+    model.eval()
+    _model = model
+    os.remove(tmp_path)
+    process_end = datetime.datetime.now()
+
+    download_time = (download_end - download_begin) / datetime.timedelta(microseconds=1)
+    process_time = (process_end - process_begin) / datetime.timedelta(microseconds=1)
+    return download_time, process_time
+
+
+def _prepare_batch(requests):
+    user_ids = torch.tensor([req["user_id"] for req in requests], dtype=torch.long, device=_device)
+    item_ids = torch.tensor([req["item_id"] for req in requests], dtype=torch.long, device=_device)
+    category_ids = torch.tensor(
+        [req["category_id"] for req in requests], dtype=torch.long, device=_device
+    )
+    dense = torch.tensor(
+        [req.get("dense", [0.0, 0.0]) for req in requests],
+        dtype=torch.float32,
+        device=_device,
+    )
+    return user_ids, item_ids, category_ids, dense
+
+
+def handler(event):
+    bucket = event.get("bucket", {}).get("bucket")
+    model_prefix = event.get("bucket", {}).get("model")
+    requests_prefix = event.get("bucket", {}).get("requests")
+    requests_key = event.get("object", {}).get("requests")
+
+    download_begin = datetime.datetime.now()
+    req_path = os.path.join("/tmp", f"{uuid.uuid4()}-{os.path.basename(requests_key)}")
+    client.download(bucket, os.path.join(requests_prefix, requests_key), req_path)
+    download_end = datetime.datetime.now()
+
+    model_download_time, model_process_time = _load_model(bucket, model_prefix)
+
+    with open(req_path, "r") as f:
+        payloads = [json.loads(line) for line in f if line.strip()]
+    os.remove(req_path)
+
+    inference_begin = datetime.datetime.now()
+    user_ids, item_ids, category_ids, dense = _prepare_batch(payloads)
+
+    with torch.no_grad():
+        scores = _model(user_ids, item_ids, category_ids, dense).squeeze(-1).tolist()
+    inference_end = datetime.datetime.now()
+
+    predictions = []
+    for req, score in 
zip(payloads, scores): + predictions.append( + { + "user_id": req["user_id"], + "item_id": req["item_id"], + "category_id": req["category_id"], + "score": score, + "device": str(_device), + } + ) + + download_time = (download_end - download_begin) / datetime.timedelta(microseconds=1) + compute_time = (inference_end - inference_begin) / datetime.timedelta(microseconds=1) + + return { + "result": {"predictions": predictions}, + "measurement": { + "download_time": download_time + model_download_time, + "compute_time": compute_time + model_process_time, + "model_time": model_process_time, + "model_download_time": model_download_time, + }, + } diff --git a/benchmarks/400.inference/413.recommendation/python/init.sh b/benchmarks/400.inference/413.recommendation/python/init.sh new file mode 100644 index 000000000..f42329404 --- /dev/null +++ b/benchmarks/400.inference/413.recommendation/python/init.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +# No additional initialization required for GPU recommendation benchmark. diff --git a/benchmarks/400.inference/413.recommendation/python/package.sh b/benchmarks/400.inference/413.recommendation/python/package.sh new file mode 100644 index 000000000..64e9deacb --- /dev/null +++ b/benchmarks/400.inference/413.recommendation/python/package.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +PACKAGE_DIR=$1 +echo "DLRM GPU package size $(du -sh $1 | cut -f1)" diff --git a/benchmarks/400.inference/413.recommendation/python/requirements.txt b/benchmarks/400.inference/413.recommendation/python/requirements.txt new file mode 100644 index 000000000..c5ddafe5b --- /dev/null +++ b/benchmarks/400.inference/413.recommendation/python/requirements.txt @@ -0,0 +1 @@ +torch==2.2.2 diff --git a/benchmarks/400.inference/413.recommendation/python/requirements.txt.3.10 b/benchmarks/400.inference/413.recommendation/python/requirements.txt.3.10 new file mode 100644 index 000000000..c5ddafe5b --- /dev/null +++ b/benchmarks/400.inference/413.recommendation/python/requirements.txt.3.10 @@ -0,0 +1 @@ +torch==2.2.2 diff --git a/benchmarks/400.inference/413.recommendation/python/requirements.txt.3.11 b/benchmarks/400.inference/413.recommendation/python/requirements.txt.3.11 new file mode 100644 index 000000000..c5ddafe5b --- /dev/null +++ b/benchmarks/400.inference/413.recommendation/python/requirements.txt.3.11 @@ -0,0 +1 @@ +torch==2.2.2 diff --git a/benchmarks/400.inference/413.recommendation/python/requirements.txt.3.8 b/benchmarks/400.inference/413.recommendation/python/requirements.txt.3.8 new file mode 100644 index 000000000..c5ddafe5b --- /dev/null +++ b/benchmarks/400.inference/413.recommendation/python/requirements.txt.3.8 @@ -0,0 +1 @@ +torch==2.2.2 diff --git a/benchmarks/400.inference/413.recommendation/python/requirements.txt.3.9 b/benchmarks/400.inference/413.recommendation/python/requirements.txt.3.9 new file mode 100644 index 000000000..c5ddafe5b --- /dev/null +++ b/benchmarks/400.inference/413.recommendation/python/requirements.txt.3.9 @@ -0,0 +1 @@ +torch==2.2.2 diff --git a/benchmarks/500.scientific/501.graph-pagerank/input.py b/benchmarks/500.scientific/501.graph-pagerank/input.py index e20a6dcd1..1d42cd73e 100644 --- a/benchmarks/500.scientific/501.graph-pagerank/input.py +++ b/benchmarks/500.scientific/501.graph-pagerank/input.py @@ -1,8 +1,13 @@ -size_generators = { - 'test' : 10, - 'small' : 10000, - 'large': 100000 -} +size_generators = {"test": 10, "small": 10000, "large": 100000} -def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): 
- return { 'size': size_generators[size], 'seed': 42} + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + return {"size": size_generators[size], "seed": 42} diff --git a/benchmarks/500.scientific/501.graph-pagerank/python/function.py b/benchmarks/500.scientific/501.graph-pagerank/python/function.py index 0e462e9b4..461fc14a9 100755 --- a/benchmarks/500.scientific/501.graph-pagerank/python/function.py +++ b/benchmarks/500.scientific/501.graph-pagerank/python/function.py @@ -1,9 +1,10 @@ import datetime import igraph + def handler(event): - size = event.get('size') + size = event.get("size") if "seed" in event: import random @@ -17,13 +18,15 @@ def handler(event): result = graph.pagerank() process_end = datetime.datetime.now() - graph_generating_time = (graph_generating_end - graph_generating_begin) / datetime.timedelta(microseconds=1) + graph_generating_time = (graph_generating_end - graph_generating_begin) / datetime.timedelta( + microseconds=1 + ) process_time = (process_end - process_begin) / datetime.timedelta(microseconds=1) return { - 'result': result[0], - 'measurement': { - 'graph_generating_time': graph_generating_time, - 'compute_time': process_time - } + "result": result[0], + "measurement": { + "graph_generating_time": graph_generating_time, + "compute_time": process_time, + }, } diff --git a/benchmarks/500.scientific/502.graph-mst/input.py b/benchmarks/500.scientific/502.graph-mst/input.py index e20a6dcd1..1d42cd73e 100644 --- a/benchmarks/500.scientific/502.graph-mst/input.py +++ b/benchmarks/500.scientific/502.graph-mst/input.py @@ -1,8 +1,13 @@ -size_generators = { - 'test' : 10, - 'small' : 10000, - 'large': 100000 -} +size_generators = {"test": 10, "small": 10000, "large": 100000} -def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): - return { 'size': size_generators[size], 'seed': 42} + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + return {"size": size_generators[size], "seed": 42} diff --git a/benchmarks/500.scientific/502.graph-mst/python/function.py b/benchmarks/500.scientific/502.graph-mst/python/function.py index b63fbdce2..69ad77678 100755 --- a/benchmarks/500.scientific/502.graph-mst/python/function.py +++ b/benchmarks/500.scientific/502.graph-mst/python/function.py @@ -1,9 +1,10 @@ import datetime import igraph + def handler(event): - size = event.get('size') + size = event.get("size") if "seed" in event: import random @@ -17,13 +18,15 @@ def handler(event): result = graph.spanning_tree(None, False) process_end = datetime.datetime.now() - graph_generating_time = (graph_generating_end - graph_generating_begin) / datetime.timedelta(microseconds=1) + graph_generating_time = (graph_generating_end - graph_generating_begin) / datetime.timedelta( + microseconds=1 + ) process_time = (process_end - process_begin) / datetime.timedelta(microseconds=1) return { - 'result': result[0], - 'measurement': { - 'graph_generating_time': graph_generating_time, - 'compute_time': process_time - } + "result": result[0], + "measurement": { + "graph_generating_time": graph_generating_time, + "compute_time": process_time, + }, } diff --git a/benchmarks/500.scientific/503.graph-bfs/input.py b/benchmarks/500.scientific/503.graph-bfs/input.py index e20a6dcd1..1d42cd73e 100644 --- a/benchmarks/500.scientific/503.graph-bfs/input.py +++ 
b/benchmarks/500.scientific/503.graph-bfs/input.py @@ -1,8 +1,13 @@ -size_generators = { - 'test' : 10, - 'small' : 10000, - 'large': 100000 -} +size_generators = {"test": 10, "small": 10000, "large": 100000} -def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): - return { 'size': size_generators[size], 'seed': 42} + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + return {"size": size_generators[size], "seed": 42} diff --git a/benchmarks/500.scientific/503.graph-bfs/python/function.py b/benchmarks/500.scientific/503.graph-bfs/python/function.py index 18423ae1a..51a37346b 100755 --- a/benchmarks/500.scientific/503.graph-bfs/python/function.py +++ b/benchmarks/500.scientific/503.graph-bfs/python/function.py @@ -1,9 +1,10 @@ import datetime import igraph + def handler(event): - size = event.get('size') + size = event.get("size") if "seed" in event: import random @@ -17,13 +18,15 @@ def handler(event): result = graph.bfs(0) process_end = datetime.datetime.now() - graph_generating_time = (graph_generating_end - graph_generating_begin) / datetime.timedelta(microseconds=1) + graph_generating_time = (graph_generating_end - graph_generating_begin) / datetime.timedelta( + microseconds=1 + ) process_time = (process_end - process_begin) / datetime.timedelta(microseconds=1) return { - 'result': result, - 'measurement': { - 'graph_generating_time': graph_generating_time, - 'compute_time': process_time - } + "result": result, + "measurement": { + "graph_generating_time": graph_generating_time, + "compute_time": process_time, + }, } diff --git a/benchmarks/500.scientific/504.dna-visualisation/input.py b/benchmarks/500.scientific/504.dna-visualisation/input.py index a9f376ea2..1f2c06a10 100644 --- a/benchmarks/500.scientific/504.dna-visualisation/input.py +++ b/benchmarks/500.scientific/504.dna-visualisation/input.py @@ -1,16 +1,27 @@ -import glob, os +import glob +import os + def buckets_count(): return (1, 1) -def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): - for file in glob.glob(os.path.join(data_dir, '*.fasta')): +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + + for file in glob.glob(os.path.join(data_dir, "*.fasta")): data = os.path.relpath(file, data_dir) upload_func(0, data, file) - input_config = {'object': {}, 'bucket': {}} - input_config['object']['key'] = data - input_config['bucket']['bucket'] = benchmarks_bucket - input_config['bucket']['input'] = input_paths[0] - input_config['bucket']['output'] = output_paths[0] + input_config = {"object": {}, "bucket": {}} + input_config["object"]["key"] = data + input_config["bucket"]["bucket"] = benchmarks_bucket + input_config["bucket"]["input"] = input_paths[0] + input_config["bucket"]["output"] = output_paths[0] return input_config diff --git a/benchmarks/500.scientific/504.dna-visualisation/python/function.py b/benchmarks/500.scientific/504.dna-visualisation/python/function.py index 8362a73a1..ca9f5975e 100755 --- a/benchmarks/500.scientific/504.dna-visualisation/python/function.py +++ b/benchmarks/500.scientific/504.dna-visualisation/python/function.py @@ -1,17 +1,23 @@ -import datetime, io, json, os +import datetime +import io +import json +import os + # using https://squiggle.readthedocs.io/en/latest/ from squiggle import transform from . 
import storage + client = storage.storage.get_instance() + def handler(event): - bucket = event.get('bucket').get('bucket') - input_prefix = event.get('bucket').get('input') - output_prefix = event.get('bucket').get('output') - key = event.get('object').get('key') - download_path = '/tmp/{}'.format(key) + bucket = event.get("bucket").get("bucket") + input_prefix = event.get("bucket").get("input") + output_prefix = event.get("bucket").get("output") + key = event.get("object").get("key") + download_path = "/tmp/{}".format(key) download_begin = datetime.datetime.now() client.download(bucket, os.path.join(input_prefix, key), download_path) @@ -34,13 +40,10 @@ def handler(event): process_time = (process_end - process_begin) / datetime.timedelta(microseconds=1) return { - 'result': { - 'bucket': bucket, - 'key': key_name - }, - 'measurement': { - 'download_time': download_time, - 'compute_time': process_time, - 'upload_time': process_time - } + "result": {"bucket": bucket, "key": key_name}, + "measurement": { + "download_time": download_time, + "compute_time": process_time, + "upload_time": upload_time, + }, } diff --git a/benchmarks/500.scientific/5xx.channel_flow_jax_npbench/config.json b/benchmarks/500.scientific/5xx.channel_flow_jax_npbench/config.json new file mode 100644 index 000000000..ff297ac5b --- /dev/null +++ b/benchmarks/500.scientific/5xx.channel_flow_jax_npbench/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 60, + "memory": 2048, + "languages": ["python"], + "modules": ["storage"] +} diff --git a/benchmarks/500.scientific/5xx.channel_flow_jax_npbench/input.py b/benchmarks/500.scientific/5xx.channel_flow_jax_npbench/input.py new file mode 100644 index 000000000..bb53694c9 --- /dev/null +++ b/benchmarks/500.scientific/5xx.channel_flow_jax_npbench/input.py @@ -0,0 +1,17 @@ +size_generators = { + "test": {"ny": 61, "nx": 61, "nit": 5, "rho": 1.0, "nu": 0.1, "F": 1.0}, + "small": {"ny": 121, "nx": 121, "nit": 10, "rho": 1.0, "nu": 0.1, "F": 1.0}, + "large": {"ny": 201, "nx": 201, "nit": 20, "rho": 1.0, "nu": 0.1, "F": 1.0}, +} + + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + return {"size": size_generators[size]} diff --git a/benchmarks/500.scientific/5xx.channel_flow_jax_npbench/python/function.py b/benchmarks/500.scientific/5xx.channel_flow_jax_npbench/python/function.py new file mode 100644 index 000000000..5788880b2 --- /dev/null +++ b/benchmarks/500.scientific/5xx.channel_flow_jax_npbench/python/function.py @@ -0,0 +1,279 @@ +# Barba, Lorena A., and Forsyth, Gilbert F. (2018). +# CFD Python: the 12 steps to Navier-Stokes equations. +# Journal of Open Source Education, 1(9), 21, +# https://doi.org/10.21105/jose.00021 +# TODO: License +# (c) 2017 Lorena A. Barba, Gilbert F. Forsyth. +# All content is under Creative Commons Attribution CC-BY 4.0, +# and all code is under BSD-3 clause (previously under MIT, and changed on March 8, 2018). 
+
+import datetime
+
+import jax.numpy as jnp
+import jax
+from jax import lax
+from functools import partial
+
+
+@partial(jax.jit, static_argnums=(0,))
+def build_up_b(rho, dt, dx, dy, u, v):
+    b = jnp.zeros_like(u)
+    b = b.at[1:-1, 1:-1].set(
+        (
+            rho
+            * (
+                1
+                / dt
+                * (
+                    (u[1:-1, 2:] - u[1:-1, 0:-2]) / (2 * dx)
+                    + (v[2:, 1:-1] - v[0:-2, 1:-1]) / (2 * dy)
+                )
+                - ((u[1:-1, 2:] - u[1:-1, 0:-2]) / (2 * dx)) ** 2
+                - 2
+                * (
+                    (u[2:, 1:-1] - u[0:-2, 1:-1])
+                    / (2 * dy)
+                    * (v[1:-1, 2:] - v[1:-1, 0:-2])
+                    / (2 * dx)
+                )
+                - ((v[2:, 1:-1] - v[0:-2, 1:-1]) / (2 * dy)) ** 2
+            )
+        )
+    )
+
+    # Periodic BC Pressure @ x = 2
+    b = b.at[1:-1, -1].set(
+        (
+            rho
+            * (
+                1
+                / dt
+                * ((u[1:-1, 0] - u[1:-1, -2]) / (2 * dx) + (v[2:, -1] - v[0:-2, -1]) / (2 * dy))
+                - ((u[1:-1, 0] - u[1:-1, -2]) / (2 * dx)) ** 2
+                - 2 * ((u[2:, -1] - u[0:-2, -1]) / (2 * dy) * (v[1:-1, 0] - v[1:-1, -2]) / (2 * dx))
+                - ((v[2:, -1] - v[0:-2, -1]) / (2 * dy)) ** 2
+            )
+        )
+    )
+
+    # Periodic BC Pressure @ x = 0
+    b = b.at[1:-1, 0].set(
+        (
+            rho
+            * (
+                1
+                / dt
+                * ((u[1:-1, 1] - u[1:-1, -1]) / (2 * dx) + (v[2:, 0] - v[0:-2, 0]) / (2 * dy))
+                - ((u[1:-1, 1] - u[1:-1, -1]) / (2 * dx)) ** 2
+                - 2 * ((u[2:, 0] - u[0:-2, 0]) / (2 * dy) * (v[1:-1, 1] - v[1:-1, -1]) / (2 * dx))
+                - ((v[2:, 0] - v[0:-2, 0]) / (2 * dy)) ** 2
+            )
+        )
+    )
+
+    return b
+
+
+@partial(jax.jit, static_argnums=(0,))
+def pressure_poisson_periodic(nit, p, dx, dy, b):
+    def body_func(p, q):
+        pn = p.copy()
+        p = p.at[1:-1, 1:-1].set(
+            ((pn[1:-1, 2:] + pn[1:-1, 0:-2]) * dy**2 + (pn[2:, 1:-1] + pn[0:-2, 1:-1]) * dx**2)
+            / (2 * (dx**2 + dy**2))
+            - dx**2 * dy**2 / (2 * (dx**2 + dy**2)) * b[1:-1, 1:-1]
+        )
+
+        # Periodic BC Pressure @ x = 2
+        p = p.at[1:-1, -1].set(
+            ((pn[1:-1, 0] + pn[1:-1, -2]) * dy**2 + (pn[2:, -1] + pn[0:-2, -1]) * dx**2)
+            / (2 * (dx**2 + dy**2))
+            - dx**2 * dy**2 / (2 * (dx**2 + dy**2)) * b[1:-1, -1]
+        )
+
+        # Periodic BC Pressure @ x = 0
+        p = p.at[1:-1, 0].set(
+            (
+                ((pn[1:-1, 1] + pn[1:-1, -1]) * dy**2 + (pn[2:, 0] + pn[0:-2, 0]) * dx**2)
+                / (2 * (dx**2 + dy**2))
+                - dx**2 * dy**2 / (2 * (dx**2 + dy**2)) * b[1:-1, 0]
+            )
+        )
+
+        # Wall boundary conditions, pressure
+        p = p.at[-1, :].set(p[-2, :])  # dp/dy = 0 at y = 2
+        p = p.at[0, :].set(p[1, :])  # dp/dy = 0 at y = 0
+
+        return p, None
+
+    p, _ = lax.scan(body_func, p, jnp.arange(nit))
+    # Return the relaxed pressure field; without this the function returns
+    # None and the Jacobi sweeps have no effect on the caller's state.
+    return p
+
+
+@partial(jax.jit, static_argnums=(0, 7, 8, 9))
+def channel_flow(nit, u, v, dt, dx, dy, p, rho, nu, F):
+    udiff = 1
+    stepcount = 0
+
+    array_vals = (udiff, stepcount, u, v, p)
+
+    def conf_func(array_vals):
+        udiff, _, _, _, _ = array_vals
+        return udiff > 0.001
+
+    def body_func(array_vals):
+        _, stepcount, u, v, p = array_vals
+
+        un = u.copy()
+        vn = v.copy()
+
+        b = build_up_b(rho, dt, dx, dy, u, v)
+        # JAX arrays are immutable, so the updated pressure must be captured.
+        p = pressure_poisson_periodic(nit, p, dx, dy, b)
+
+        u = u.at[1:-1, 1:-1].set(
+            un[1:-1, 1:-1]
+            - un[1:-1, 1:-1] * dt / dx * (un[1:-1, 1:-1] - un[1:-1, 0:-2])
+            - vn[1:-1, 1:-1] * dt / dy * (un[1:-1, 1:-1] - un[0:-2, 1:-1])
+            - dt / (2 * rho * dx) * (p[1:-1, 2:] - p[1:-1, 0:-2])
+            + nu
+            * (
+                dt / dx**2 * (un[1:-1, 2:] - 2 * un[1:-1, 1:-1] + un[1:-1, 0:-2])
+                + dt / dy**2 * (un[2:, 1:-1] - 2 * un[1:-1, 1:-1] + un[0:-2, 1:-1])
+            )
+            + F * dt
+        )
+
+        v = v.at[1:-1, 1:-1].set(
+            vn[1:-1, 1:-1]
+            - un[1:-1, 1:-1] * dt / dx * (vn[1:-1, 1:-1] - vn[1:-1, 0:-2])
+            - vn[1:-1, 1:-1] * dt / dy * (vn[1:-1, 1:-1] - vn[0:-2, 1:-1])
+            - dt / (2 * rho * dy) * (p[2:, 1:-1] - p[0:-2, 1:-1])
+            + nu
+            * (
+                dt / dx**2 * (vn[1:-1, 2:] - 2 * vn[1:-1, 1:-1] + vn[1:-1, 0:-2])
+                + dt / dy**2 * (vn[2:, 1:-1] - 2 
* vn[1:-1, 1:-1] + vn[0:-2, 1:-1]) + ) + ) + + # Periodic BC u @ x = 2 + u = u.at[1:-1, -1].set( + un[1:-1, -1] + - un[1:-1, -1] * dt / dx * (un[1:-1, -1] - un[1:-1, -2]) + - vn[1:-1, -1] * dt / dy * (un[1:-1, -1] - un[0:-2, -1]) + - dt / (2 * rho * dx) * (p[1:-1, 0] - p[1:-1, -2]) + + nu + * ( + dt / dx**2 * (un[1:-1, 0] - 2 * un[1:-1, -1] + un[1:-1, -2]) + + dt / dy**2 * (un[2:, -1] - 2 * un[1:-1, -1] + un[0:-2, -1]) + ) + + F * dt + ) + + # Periodic BC u @ x = 0 + u = u.at[1:-1, 0].set( + un[1:-1, 0] + - un[1:-1, 0] * dt / dx * (un[1:-1, 0] - un[1:-1, -1]) + - vn[1:-1, 0] * dt / dy * (un[1:-1, 0] - un[0:-2, 0]) + - dt / (2 * rho * dx) * (p[1:-1, 1] - p[1:-1, -1]) + + nu + * ( + dt / dx**2 * (un[1:-1, 1] - 2 * un[1:-1, 0] + un[1:-1, -1]) + + dt / dy**2 * (un[2:, 0] - 2 * un[1:-1, 0] + un[0:-2, 0]) + ) + + F * dt + ) + + # Periodic BC v @ x = 2 + v = v.at[1:-1, -1].set( + vn[1:-1, -1] + - un[1:-1, -1] * dt / dx * (vn[1:-1, -1] - vn[1:-1, -2]) + - vn[1:-1, -1] * dt / dy * (vn[1:-1, -1] - vn[0:-2, -1]) + - dt / (2 * rho * dy) * (p[2:, -1] - p[0:-2, -1]) + + nu + * ( + dt / dx**2 * (vn[1:-1, 0] - 2 * vn[1:-1, -1] + vn[1:-1, -2]) + + dt / dy**2 * (vn[2:, -1] - 2 * vn[1:-1, -1] + vn[0:-2, -1]) + ) + ) + + # Periodic BC v @ x = 0 + v = v.at[1:-1, 0].set( + vn[1:-1, 0] + - un[1:-1, 0] * dt / dx * (vn[1:-1, 0] - vn[1:-1, -1]) + - vn[1:-1, 0] * dt / dy * (vn[1:-1, 0] - vn[0:-2, 0]) + - dt / (2 * rho * dy) * (p[2:, 0] - p[0:-2, 0]) + + nu + * ( + dt / dx**2 * (vn[1:-1, 1] - 2 * vn[1:-1, 0] + vn[1:-1, -1]) + + dt / dy**2 * (vn[2:, 0] - 2 * vn[1:-1, 0] + vn[0:-2, 0]) + ) + ) + + # Wall BC: u,v = 0 @ y = 0,2 + u = u.at[0, :].set(0) + u = u.at[-1, :].set(0) + v = v.at[0, :].set(0) + v = v.at[-1, :].set(0) + + udiff = (jnp.sum(u) - jnp.sum(un)) / jnp.sum(u) + stepcount += 1 + + return (udiff, stepcount, u, v, p) + + _, stepcount, _, _, _ = lax.while_loop(conf_func, body_func, array_vals) + + return stepcount + + +def initialize(ny, nx): + u = jnp.zeros((ny, nx), dtype=jnp.float64) + v = jnp.zeros((ny, nx), dtype=jnp.float64) + p = jnp.ones((ny, nx), dtype=jnp.float64) + dx = 2 / (nx - 1) + dy = 2 / (ny - 1) + dt = 0.1 / ((nx - 1) * (ny - 1)) + return u, v, p, dx, dy, dt + + +def handler(event): + + if "size" in event: + size = event["size"] + ny = size["ny"] + nx = size["nx"] + nit = size["nit"] + rho = size["rho"] + nu = size["nu"] + F = size["F"] + + generate_begin = datetime.datetime.now() + + u, v, p, dx, dy, dt = initialize(ny, nx) + + generate_end = datetime.datetime.now() + + process_begin = datetime.datetime.now() + + results = channel_flow(nit, u, v, dt, dx, dy, p, rho, nu, F) + + process_end = datetime.datetime.now() + + # y_re_im = jnp.stack([jnp.real(result), jnp.imag(result)], axis=-1).tolist() + + process_time = (process_end - process_begin) / datetime.timedelta(milliseconds=1) + generate_time = (generate_end - generate_begin) / datetime.timedelta(milliseconds=1) + + try: + results = jax.device_get(results) + except Exception: + pass + + if hasattr(results, "item"): + results = results.item() + elif hasattr(results, "tolist"): + results = results.tolist() + + return { + "size": size, + "result": results, + "measurement": {"compute_time": process_time, "generate_time": generate_time}, + } diff --git a/benchmarks/500.scientific/5xx.channel_flow_jax_npbench/python/requirements.txt b/benchmarks/500.scientific/5xx.channel_flow_jax_npbench/python/requirements.txt new file mode 100644 index 000000000..f31e1afe0 --- /dev/null +++ 
b/benchmarks/500.scientific/5xx.channel_flow_jax_npbench/python/requirements.txt @@ -0,0 +1 @@ +jax[cuda12] \ No newline at end of file diff --git a/benchmarks/500.scientific/5xx.compute_jax_npbench/config.json b/benchmarks/500.scientific/5xx.compute_jax_npbench/config.json new file mode 100644 index 000000000..ff297ac5b --- /dev/null +++ b/benchmarks/500.scientific/5xx.compute_jax_npbench/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 60, + "memory": 2048, + "languages": ["python"], + "modules": ["storage"] +} diff --git a/benchmarks/500.scientific/5xx.compute_jax_npbench/input.py b/benchmarks/500.scientific/5xx.compute_jax_npbench/input.py new file mode 100644 index 000000000..56f136720 --- /dev/null +++ b/benchmarks/500.scientific/5xx.compute_jax_npbench/input.py @@ -0,0 +1,17 @@ +size_generators = { + "test": {"M": 2000, "N": 2000}, + "small": {"M": 5000, "N": 5000}, + "large": {"M": 16000, "N": 16000}, +} + + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + return {"size": size_generators[size]} diff --git a/benchmarks/500.scientific/5xx.compute_jax_npbench/python/function.py b/benchmarks/500.scientific/5xx.compute_jax_npbench/python/function.py new file mode 100644 index 000000000..2e16b320d --- /dev/null +++ b/benchmarks/500.scientific/5xx.compute_jax_npbench/python/function.py @@ -0,0 +1,62 @@ +import datetime + +import jax.numpy as jnp +import jax + + +@jax.jit +def compute(array_1, array_2, a, b, c): + return jnp.clip(array_1, 2, 10) * a + array_2 * b + c + + +def initialize(M, N): + from numpy.random import default_rng + + rng = default_rng(42) + array_1 = rng.uniform(0, 1000, size=(M, N)).astype(jnp.int64) + array_2 = rng.uniform(0, 1000, size=(M, N)).astype(jnp.int64) + a = jnp.int64(4) + b = jnp.int64(3) + c = jnp.int64(9) + return array_1, array_2, a, b, c + + +def handler(event): + + if "size" in event: + size = event["size"] + M = size["M"] + N = size["N"] + + generate_begin = datetime.datetime.now() + + array_1, array_2, a, b, c = initialize(M, N) + + generate_end = datetime.datetime.now() + + process_begin = datetime.datetime.now() + + results = compute(array_1, array_2, a, b, c) + + process_end = datetime.datetime.now() + + # y_re_im = jnp.stack([jnp.real(result), jnp.imag(result)], axis=-1).tolist() + + process_time = (process_end - process_begin) / datetime.timedelta(milliseconds=1) + generate_time = (generate_end - generate_begin) / datetime.timedelta(milliseconds=1) + + try: + results = jax.device_get(results) + except Exception: + pass + + if getattr(results, "ndim", 0) == 0 or getattr(results, "size", 0) == 1: + results = results.item() + else: + results = results.tolist() + + return { + "size": size, + "result": results, + "measurement": {"compute_time": process_time, "generate_time": generate_time}, + } diff --git a/benchmarks/500.scientific/5xx.compute_jax_npbench/python/requirements.txt b/benchmarks/500.scientific/5xx.compute_jax_npbench/python/requirements.txt new file mode 100644 index 000000000..f31e1afe0 --- /dev/null +++ b/benchmarks/500.scientific/5xx.compute_jax_npbench/python/requirements.txt @@ -0,0 +1 @@ +jax[cuda12] \ No newline at end of file diff --git a/benchmarks/500.scientific/5xx.deep_learning_resnet_jax_npbench/config.json b/benchmarks/500.scientific/5xx.deep_learning_resnet_jax_npbench/config.json new file mode 100644 index 000000000..ff297ac5b --- /dev/null +++ b/benchmarks/500.scientific/5xx.deep_learning_resnet_jax_npbench/config.json @@ -0,0 +1,6 @@ +{ + 
"timeout": 60, + "memory": 2048, + "languages": ["python"], + "modules": ["storage"] +} diff --git a/benchmarks/500.scientific/5xx.deep_learning_resnet_jax_npbench/input.py b/benchmarks/500.scientific/5xx.deep_learning_resnet_jax_npbench/input.py new file mode 100644 index 000000000..937e96e44 --- /dev/null +++ b/benchmarks/500.scientific/5xx.deep_learning_resnet_jax_npbench/input.py @@ -0,0 +1,17 @@ +size_generators = { + "test": {"N": 8, "W": 14, "H": 14, "C1": 32, "C2": 8}, + "small": {"N": 8, "W": 28, "H": 28, "C1": 64, "C2": 16}, + "large": {"N": 8, "W": 56, "H": 56, "C1": 128, "C2": 32}, +} + + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + return {"size": size_generators[size]} diff --git a/benchmarks/500.scientific/5xx.deep_learning_resnet_jax_npbench/python/function.py b/benchmarks/500.scientific/5xx.deep_learning_resnet_jax_npbench/python/function.py new file mode 100644 index 000000000..f24b2cc71 --- /dev/null +++ b/benchmarks/500.scientific/5xx.deep_learning_resnet_jax_npbench/python/function.py @@ -0,0 +1,123 @@ +import datetime + +import jax.numpy as jnp +import jax +from jax import lax + + +@jax.jit +def relu(x): + return jnp.maximum(x, 0) + + +# Deep learning convolutional operator (stride = 1) +@jax.jit +def conv2d(input, weights): + K = weights.shape[0] # Assuming square kernel + N = input.shape[0] + H_out = input.shape[1] - K + 1 + W_out = input.shape[2] - K + 1 + C_out = weights.shape[3] + output = jnp.empty((N, H_out, W_out, C_out), dtype=jnp.float32) + + def row_update(output, i): + def col_update(output, j): + input_slice = lax.dynamic_slice(input, (0, i, j, 0), (N, K, K, input.shape[-1])) + conv_result = jnp.sum( + input_slice[:, :, :, :, None] * weights[None, :, :, :], axis=(1, 2, 3) + ) + output = lax.dynamic_update_slice(output, conv_result[:, None, None, :], (0, i, j, 0)) + return output, None + + output, _ = lax.scan(col_update, output, jnp.arange(W_out)) + return output, None + + output, _ = lax.scan(row_update, output, jnp.arange(H_out)) + return output + + +# Batch normalization operator, as used in ResNet +@jax.jit +def batchnorm2d(x, eps=1e-5): + mean = jnp.mean(x, axis=0, keepdims=True) + std = jnp.std(x, axis=0, keepdims=True) + return (x - mean) / jnp.sqrt(std + eps) + + +# Bottleneck residual block (after initial convolution, without downsampling) +# in the ResNet-50 CNN (inference) +@jax.jit +def resnet_basicblock(input, conv1, conv2, conv3): + # Pad output of first convolution for second convolution + padded = jnp.zeros( + (input.shape[0], input.shape[1] + 2, input.shape[2] + 2, conv1.shape[3]), + dtype=jnp.float32, + ) + padded = lax.dynamic_update_slice(padded, conv2d(input, conv1), (0, 1, 1, 0)) + x = batchnorm2d(padded) + x = relu(x) + + x = conv2d(x, conv2) + x = batchnorm2d(x) + x = relu(x) + x = conv2d(x, conv3) + x = batchnorm2d(x) + return relu(x + input) + + +def initialize(N, W, H, C1, C2): + from numpy.random import default_rng + + rng = default_rng(42) + + # Input + input = rng.random((N, H, W, C1), dtype=jnp.float32) + # Weights + conv1 = rng.random((1, 1, C1, C2), dtype=jnp.float32) + conv2 = rng.random((3, 3, C2, C2), dtype=jnp.float32) + conv3 = rng.random((1, 1, C2, C1), dtype=jnp.float32) + return (input, conv1, conv2, conv3) + + +def handler(event): + + if "size" in event: + size = event["size"] + N = size["N"] + W = size["W"] + H = size["H"] + C1 = size["C1"] + C2 = size["C2"] + + generate_begin = datetime.datetime.now() + + input, conv1, conv2, 
conv3 = initialize(N, W, H, C1, C2) + + generate_end = datetime.datetime.now() + + process_begin = datetime.datetime.now() + + results = resnet_basicblock(input, conv1, conv2, conv3) + + process_end = datetime.datetime.now() + + # y_re_im = jnp.stack([jnp.real(result), jnp.imag(result)], axis=-1).tolist() + + process_time = (process_end - process_begin) / datetime.timedelta(milliseconds=1) + generate_time = (generate_end - generate_begin) / datetime.timedelta(milliseconds=1) + + try: + results = jax.device_get(results) + except Exception: + pass + + if getattr(results, "ndim", 0) == 0 or getattr(results, "size", 0) == 1: + results = results.item() + else: + results = results.tolist() + + return { + "size": size, + "result": results, + "measurement": {"compute_time": process_time, "generate_time": generate_time}, + } diff --git a/benchmarks/500.scientific/5xx.deep_learning_resnet_jax_npbench/python/requirements.txt b/benchmarks/500.scientific/5xx.deep_learning_resnet_jax_npbench/python/requirements.txt new file mode 100644 index 000000000..f31e1afe0 --- /dev/null +++ b/benchmarks/500.scientific/5xx.deep_learning_resnet_jax_npbench/python/requirements.txt @@ -0,0 +1 @@ +jax[cuda12] \ No newline at end of file diff --git a/benchmarks/600.linearalgebra/601.matmul/config.json b/benchmarks/600.linearalgebra/601.matmul/config.json new file mode 100644 index 000000000..e80fb4351 --- /dev/null +++ b/benchmarks/600.linearalgebra/601.matmul/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 120, + "memory": 512, + "languages": ["python"], + "modules": [] +} diff --git a/benchmarks/600.linearalgebra/601.matmul/input.py b/benchmarks/600.linearalgebra/601.matmul/input.py new file mode 100644 index 000000000..ec8cd9d59 --- /dev/null +++ b/benchmarks/600.linearalgebra/601.matmul/input.py @@ -0,0 +1,13 @@ +size_generators = {"test": 10, "small": 100, "large": 1000} + + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + return {"size": size_generators[size], "seed": 42} diff --git a/benchmarks/600.linearalgebra/601.matmul/python/function.py b/benchmarks/600.linearalgebra/601.matmul/python/function.py new file mode 100755 index 000000000..ee88b2e58 --- /dev/null +++ b/benchmarks/600.linearalgebra/601.matmul/python/function.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +import torch +import datetime + + +def initialize_torch(NI, NJ, NK, dtype=torch.float32, device="cuda"): + alpha = torch.tensor(1.5, dtype=dtype, device=device) + beta = torch.tensor(1.2, dtype=dtype, device=device) + i = torch.arange(NI, device=device) + j = torch.arange(NJ, device=device) + k = torch.arange(NK, device=device) + C = ((i[:, None] * j[None, :] + 1) % NI).to(dtype) / NI + A = ((i[:, None] * (k[None, :] + 1)) % NK).to(dtype) / NK + B = ((k[:, None] * (j[None, :] + 2)) % NJ).to(dtype) / NJ + return alpha, beta, C, A, B + + +def kernel_gemm(alpha, beta, C, A, B, reps=1): + torch.cuda.synchronize() + _ = alpha * (A @ B) + beta * C # warmup + torch.cuda.synchronize() + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + for _ in range(reps): + C = alpha * (A @ B) + beta * C + end.record() + torch.cuda.synchronize() + return C, float(start.elapsed_time(end)) # ms for all reps + + +def handler(event): + + size = event.get("size") + if "seed" in event: + import random + + random.seed(event["seed"]) + + seed = event.get("seed", 42) + seed = int(seed) + + matrix_generating_begin = datetime.datetime.now() + alpha, 
beta, C, A, B = initialize_torch(size, size, size, dtype=torch.float32, device="cuda") + matrix_generating_end = datetime.datetime.now() + + matmul_begin = datetime.datetime.now() + C_out, gpu_ms = kernel_gemm(alpha, beta, C, A, B, reps=1) + matmul_end = datetime.datetime.now() + + matrix_generating_time = (matrix_generating_end - matrix_generating_begin) / datetime.timedelta( + microseconds=1 + ) + matmul_time = (matmul_end - matmul_begin) / datetime.timedelta(microseconds=1) + + return { + # "result": result[0], + "measurement": { + "generating_time": matrix_generating_time, + "compute_time": matmul_time, + }, + } diff --git a/benchmarks/600.linearalgebra/601.matmul/python/requirements.txt b/benchmarks/600.linearalgebra/601.matmul/python/requirements.txt new file mode 100755 index 000000000..d8d966118 --- /dev/null +++ b/benchmarks/600.linearalgebra/601.matmul/python/requirements.txt @@ -0,0 +1 @@ +torch==2.4.1 diff --git a/benchmarks/600.linearalgebra/601.matmul/python/requirements.txt.3.10 b/benchmarks/600.linearalgebra/601.matmul/python/requirements.txt.3.10 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/601.matmul/python/requirements.txt.3.11 b/benchmarks/600.linearalgebra/601.matmul/python/requirements.txt.3.11 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/601.matmul/python/requirements.txt.3.12 b/benchmarks/600.linearalgebra/601.matmul/python/requirements.txt.3.12 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/601.matmul/python/requirements.txt.3.7 b/benchmarks/600.linearalgebra/601.matmul/python/requirements.txt.3.7 new file mode 100755 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/601.matmul/python/requirements.txt.3.8 b/benchmarks/600.linearalgebra/601.matmul/python/requirements.txt.3.8 new file mode 100755 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/601.matmul/python/requirements.txt.3.9 b/benchmarks/600.linearalgebra/601.matmul/python/requirements.txt.3.9 new file mode 100755 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/601.matmul/python/requirements.txt.arm.3.8 b/benchmarks/600.linearalgebra/601.matmul/python/requirements.txt.arm.3.8 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/601.matmul/python/requirements.txt.arm.3.9 b/benchmarks/600.linearalgebra/601.matmul/python/requirements.txt.arm.3.9 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/602.axpy/config.json b/benchmarks/600.linearalgebra/602.axpy/config.json new file mode 100644 index 000000000..e80fb4351 --- /dev/null +++ b/benchmarks/600.linearalgebra/602.axpy/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 120, + "memory": 512, + "languages": ["python"], + "modules": [] +} diff --git a/benchmarks/600.linearalgebra/602.axpy/input.py b/benchmarks/600.linearalgebra/602.axpy/input.py new file mode 100644 index 000000000..ec8cd9d59 --- /dev/null +++ b/benchmarks/600.linearalgebra/602.axpy/input.py @@ -0,0 +1,13 @@ +size_generators = {"test": 10, "small": 100, "large": 1000} + + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + return {"size": size_generators[size], "seed": 42} diff --git a/benchmarks/600.linearalgebra/602.axpy/python/function.py b/benchmarks/600.linearalgebra/602.axpy/python/function.py new file mode 100755 index 000000000..79117fa1b --- /dev/null +++ 
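All of the new 600.linearalgebra kernels follow the same measurement pattern as `kernel_gemm` above: one warm-up call, then a measured loop bracketed by CUDA events and a final synchronize. A generic sketch of that harness, assuming a CUDA-capable device (`time_on_gpu` is an illustrative name, not benchmark code):

```python
# Illustrative generalization of the warmup + CUDA-event timing pattern.
import torch


def time_on_gpu(fn, *args, reps=100):
    """Run fn once as warmup, then time `reps` calls; returns (result, total_ms)."""
    torch.cuda.synchronize()
    out = fn(*args)  # warmup: lazy CUDA init, autotuning, allocator caching
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(reps):
        out = fn(*args)
    end.record()
    torch.cuda.synchronize()  # elapsed_time is valid only after both events complete
    return out, float(start.elapsed_time(end))
```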
b/benchmarks/600.linearalgebra/602.axpy/python/function.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +import torch +import datetime + + +def initialize_torch(N, dtype=torch.float32, device="cuda", seed=42): + if seed is not None: + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + alpha = torch.randn((), dtype=dtype, device=device) + x = torch.randn(N, dtype=dtype, device=device) + y = torch.randn(N, dtype=dtype, device=device) + return alpha, x, y + + +def kernel_axpy(alpha, x, y, reps=100): + torch.cuda.synchronize() + _ = alpha * x + y # warmup + torch.cuda.synchronize() + + start_evt = torch.cuda.Event(enable_timing=True) + end_evt = torch.cuda.Event(enable_timing=True) + start_evt.record() + for _ in range(reps): + y = alpha * x + y + end_evt.record() + torch.cuda.synchronize() + gpu_ms = float(start_evt.elapsed_time(end_evt)) + return y, gpu_ms + + +def handler(event): + size = event.get("size") + if "seed" in event: + import random + + random.seed(event["seed"]) + + seed = event.get("seed", 42) + seed = int(seed) + + gen_begin = datetime.datetime.now() + alpha, x, y = initialize_torch(size, dtype=torch.float32, device="cuda", seed=seed) + gen_end = datetime.datetime.now() + + comp_begin = datetime.datetime.now() + y_out, gpu_ms = kernel_axpy(alpha, x, y, reps=100) + comp_end = datetime.datetime.now() + + gen_us = (gen_end - gen_begin) / datetime.timedelta(microseconds=1) + comp_us = (comp_end - comp_begin) / datetime.timedelta(microseconds=1) + + return { + "measurement": { + "generating_time": gen_us, + "compute_time": comp_us, + "gpu_time": gpu_ms, + } + } diff --git a/benchmarks/600.linearalgebra/602.axpy/python/requirements.txt b/benchmarks/600.linearalgebra/602.axpy/python/requirements.txt new file mode 100755 index 000000000..d8d966118 --- /dev/null +++ b/benchmarks/600.linearalgebra/602.axpy/python/requirements.txt @@ -0,0 +1 @@ +torch==2.4.1 diff --git a/benchmarks/600.linearalgebra/602.axpy/python/requirements.txt.3.10 b/benchmarks/600.linearalgebra/602.axpy/python/requirements.txt.3.10 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/602.axpy/python/requirements.txt.3.11 b/benchmarks/600.linearalgebra/602.axpy/python/requirements.txt.3.11 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/602.axpy/python/requirements.txt.3.12 b/benchmarks/600.linearalgebra/602.axpy/python/requirements.txt.3.12 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/602.axpy/python/requirements.txt.3.7 b/benchmarks/600.linearalgebra/602.axpy/python/requirements.txt.3.7 new file mode 100755 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/602.axpy/python/requirements.txt.3.8 b/benchmarks/600.linearalgebra/602.axpy/python/requirements.txt.3.8 new file mode 100755 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/602.axpy/python/requirements.txt.3.9 b/benchmarks/600.linearalgebra/602.axpy/python/requirements.txt.3.9 new file mode 100755 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/602.axpy/python/requirements.txt.arm.3.8 b/benchmarks/600.linearalgebra/602.axpy/python/requirements.txt.arm.3.8 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/602.axpy/python/requirements.txt.arm.3.9 b/benchmarks/600.linearalgebra/602.axpy/python/requirements.txt.arm.3.9 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/603.jacobi2d/config.json 
b/benchmarks/600.linearalgebra/603.jacobi2d/config.json new file mode 100644 index 000000000..e80fb4351 --- /dev/null +++ b/benchmarks/600.linearalgebra/603.jacobi2d/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 120, + "memory": 512, + "languages": ["python"], + "modules": [] +} diff --git a/benchmarks/600.linearalgebra/603.jacobi2d/input.py b/benchmarks/600.linearalgebra/603.jacobi2d/input.py new file mode 100644 index 000000000..ec8cd9d59 --- /dev/null +++ b/benchmarks/600.linearalgebra/603.jacobi2d/input.py @@ -0,0 +1,13 @@ +size_generators = {"test": 10, "small": 100, "large": 1000} + + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + return {"size": size_generators[size], "seed": 42} diff --git a/benchmarks/600.linearalgebra/603.jacobi2d/python/function.py b/benchmarks/600.linearalgebra/603.jacobi2d/python/function.py new file mode 100755 index 000000000..4dc37e2c6 --- /dev/null +++ b/benchmarks/600.linearalgebra/603.jacobi2d/python/function.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +import torch +import datetime + + +def initialize_torch(N, dtype=torch.float32, device="cuda"): + i = torch.arange(N, device=device, dtype=dtype).view(-1, 1) + j = torch.arange(N, device=device, dtype=dtype).view(1, -1) + + A = i * (j + 2) / N + B = i * (j + 3) / N + return A, B + + +def kernel_jacobi2d(A, B, iters=50): + torch.cuda.synchronize() + # warmup + if A.shape[0] > 2 and A.shape[1] > 2: + B_inner = 0.2 * (A[1:-1, 1:-1] + A[1:-1, :-2] + A[1:-1, 2:] + A[2:, 1:-1] + A[:-2, 1:-1]) + B[1:-1, 1:-1].copy_(B_inner) + + A_inner = 0.2 * (B[1:-1, 1:-1] + B[1:-1, :-2] + B[1:-1, 2:] + B[2:, 1:-1] + B[:-2, 1:-1]) + A[1:-1, 1:-1].copy_(A_inner) + torch.cuda.synchronize() + + start_evt = torch.cuda.Event(enable_timing=True) + end_evt = torch.cuda.Event(enable_timing=True) + start_evt.record() + for _ in range(iters): + B_inner = 0.2 * (A[1:-1, 1:-1] + A[1:-1, :-2] + A[1:-1, 2:] + A[2:, 1:-1] + A[:-2, 1:-1]) + B[1:-1, 1:-1].copy_(B_inner) + + A_inner = 0.2 * (B[1:-1, 1:-1] + B[1:-1, :-2] + B[1:-1, 2:] + B[2:, 1:-1] + B[:-2, 1:-1]) + A[1:-1, 1:-1].copy_(A_inner) + end_evt.record() + torch.cuda.synchronize() + gpu_ms = float(start_evt.elapsed_time(end_evt)) + return A, B, gpu_ms + + +def handler(event): + + size = event.get("size") + if "seed" in event: + import random + + random.seed(event["seed"]) + + seed = event.get("seed", 42) + seed = int(seed) + + matrix_generating_begin = datetime.datetime.now() + A, B = initialize_torch(size, dtype=torch.float32, device="cuda") + matrix_generating_end = datetime.datetime.now() + + matmul_begin = datetime.datetime.now() + A_out, B_out, gpu_ms = kernel_jacobi2d(A, B, iters=50) + matmul_end = datetime.datetime.now() + + matrix_generating_time = (matrix_generating_end - matrix_generating_begin) / datetime.timedelta( + microseconds=1 + ) + matmul_time = (matmul_end - matmul_begin) / datetime.timedelta(microseconds=1) + + return { + # "result": result[0], + "measurement": { + "generating_time": matrix_generating_time, + "compute_time": matmul_time, + "gpu_time": gpu_ms, + }, + } diff --git a/benchmarks/600.linearalgebra/603.jacobi2d/python/requirements.txt b/benchmarks/600.linearalgebra/603.jacobi2d/python/requirements.txt new file mode 100755 index 000000000..d8d966118 --- /dev/null +++ b/benchmarks/600.linearalgebra/603.jacobi2d/python/requirements.txt @@ -0,0 +1 @@ +torch==2.4.1 diff --git a/benchmarks/600.linearalgebra/603.jacobi2d/python/requirements.txt.3.10 
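The Jacobi update in `kernel_jacobi2d` is a 5-point stencil written with slicing; the same inner-region update can be cross-checked against a plus-shaped convolution. A small validation sketch (illustrative only, assuming a CUDA device):

```python
import torch
import torch.nn.functional as F


def jacobi_step_conv(A):
    # Plus-shaped kernel matching 0.2 * (center + left + right + up + down).
    k = torch.tensor([[0.0, 0.2, 0.0],
                      [0.2, 0.2, 0.2],
                      [0.0, 0.2, 0.0]], device=A.device)
    return F.conv2d(A[None, None], k[None, None])[0, 0]  # shape (N-2, N-2)


A = torch.rand(16, 16, device="cuda")
ref = 0.2 * (A[1:-1, 1:-1] + A[1:-1, :-2] + A[1:-1, 2:] + A[2:, 1:-1] + A[:-2, 1:-1])
assert torch.allclose(jacobi_step_conv(A), ref, atol=1e-6)
```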
b/benchmarks/600.linearalgebra/603.jacobi2d/python/requirements.txt.3.10 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/603.jacobi2d/python/requirements.txt.3.11 b/benchmarks/600.linearalgebra/603.jacobi2d/python/requirements.txt.3.11 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/603.jacobi2d/python/requirements.txt.3.12 b/benchmarks/600.linearalgebra/603.jacobi2d/python/requirements.txt.3.12 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/603.jacobi2d/python/requirements.txt.3.7 b/benchmarks/600.linearalgebra/603.jacobi2d/python/requirements.txt.3.7 new file mode 100755 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/603.jacobi2d/python/requirements.txt.3.8 b/benchmarks/600.linearalgebra/603.jacobi2d/python/requirements.txt.3.8 new file mode 100755 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/603.jacobi2d/python/requirements.txt.3.9 b/benchmarks/600.linearalgebra/603.jacobi2d/python/requirements.txt.3.9 new file mode 100755 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/603.jacobi2d/python/requirements.txt.arm.3.8 b/benchmarks/600.linearalgebra/603.jacobi2d/python/requirements.txt.arm.3.8 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/603.jacobi2d/python/requirements.txt.arm.3.9 b/benchmarks/600.linearalgebra/603.jacobi2d/python/requirements.txt.arm.3.9 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/604.cholesky/config.json b/benchmarks/600.linearalgebra/604.cholesky/config.json new file mode 100644 index 000000000..e80fb4351 --- /dev/null +++ b/benchmarks/600.linearalgebra/604.cholesky/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 120, + "memory": 512, + "languages": ["python"], + "modules": [] +} diff --git a/benchmarks/600.linearalgebra/604.cholesky/input.py b/benchmarks/600.linearalgebra/604.cholesky/input.py new file mode 100644 index 000000000..ec8cd9d59 --- /dev/null +++ b/benchmarks/600.linearalgebra/604.cholesky/input.py @@ -0,0 +1,13 @@ +size_generators = {"test": 10, "small": 100, "large": 1000} + + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + return {"size": size_generators[size], "seed": 42} diff --git a/benchmarks/600.linearalgebra/604.cholesky/python/function.py b/benchmarks/600.linearalgebra/604.cholesky/python/function.py new file mode 100755 index 000000000..5a7ac77d5 --- /dev/null +++ b/benchmarks/600.linearalgebra/604.cholesky/python/function.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +import torch +import datetime + + +def initialize_torch(N, dtype=torch.float32, device="cuda"): + j = torch.arange(N, device=device) + v = (torch.remainder(-j, N).to(dtype) / N) + 1 + + L = v.expand(N, -1).clone() + L = torch.tril(L) + L.fill_diagonal_(1.0) + + A = L @ L.transpose(-1, -2) + return A + + +def kernel_cholesky(A): + torch.cuda.synchronize() + _ = torch.linalg.cholesky(A) # warmup + torch.cuda.synchronize() + + start_evt = torch.cuda.Event(enable_timing=True) + end_evt = torch.cuda.Event(enable_timing=True) + start_evt.record() + for _ in range(A.size(0)): + L = torch.linalg.cholesky(A) + end_evt.record() + torch.cuda.synchronize() + gpu_ms = float(start_evt.elapsed_time(end_evt)) + return L, gpu_ms + + +def handler(event): + size = event.get("size") + if "seed" in event: + import random + + random.seed(event["seed"]) 
+ + seed = event.get("seed", 42) + seed = int(seed) + + gen_begin = datetime.datetime.now() + A = initialize_torch(size, dtype=torch.float32, device="cuda") + gen_end = datetime.datetime.now() + + comp_begin = datetime.datetime.now() + L, gpu_ms = kernel_cholesky(A) + comp_end = datetime.datetime.now() + + gen_us = (gen_end - gen_begin) / datetime.timedelta(microseconds=1) + comp_us = (comp_end - comp_begin) / datetime.timedelta(microseconds=1) + + return { + "measurement": { + "generating_time": gen_us, + "compute_time": comp_us, + "gpu_time": gpu_ms, + } + } diff --git a/benchmarks/600.linearalgebra/604.cholesky/python/requirements.txt b/benchmarks/600.linearalgebra/604.cholesky/python/requirements.txt new file mode 100755 index 000000000..d8d966118 --- /dev/null +++ b/benchmarks/600.linearalgebra/604.cholesky/python/requirements.txt @@ -0,0 +1 @@ +torch==2.4.1 diff --git a/benchmarks/600.linearalgebra/604.cholesky/python/requirements.txt.3.10 b/benchmarks/600.linearalgebra/604.cholesky/python/requirements.txt.3.10 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/604.cholesky/python/requirements.txt.3.11 b/benchmarks/600.linearalgebra/604.cholesky/python/requirements.txt.3.11 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/604.cholesky/python/requirements.txt.3.12 b/benchmarks/600.linearalgebra/604.cholesky/python/requirements.txt.3.12 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/604.cholesky/python/requirements.txt.3.7 b/benchmarks/600.linearalgebra/604.cholesky/python/requirements.txt.3.7 new file mode 100755 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/604.cholesky/python/requirements.txt.3.8 b/benchmarks/600.linearalgebra/604.cholesky/python/requirements.txt.3.8 new file mode 100755 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/604.cholesky/python/requirements.txt.3.9 b/benchmarks/600.linearalgebra/604.cholesky/python/requirements.txt.3.9 new file mode 100755 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/604.cholesky/python/requirements.txt.arm.3.8 b/benchmarks/600.linearalgebra/604.cholesky/python/requirements.txt.arm.3.8 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/604.cholesky/python/requirements.txt.arm.3.9 b/benchmarks/600.linearalgebra/604.cholesky/python/requirements.txt.arm.3.9 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/605.lu/config.json b/benchmarks/600.linearalgebra/605.lu/config.json new file mode 100644 index 000000000..e80fb4351 --- /dev/null +++ b/benchmarks/600.linearalgebra/605.lu/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 120, + "memory": 512, + "languages": ["python"], + "modules": [] +} diff --git a/benchmarks/600.linearalgebra/605.lu/input.py b/benchmarks/600.linearalgebra/605.lu/input.py new file mode 100644 index 000000000..ec8cd9d59 --- /dev/null +++ b/benchmarks/600.linearalgebra/605.lu/input.py @@ -0,0 +1,13 @@ +size_generators = {"test": 10, "small": 100, "large": 1000} + + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + return {"size": size_generators[size], "seed": 42} diff --git a/benchmarks/600.linearalgebra/605.lu/python/function.py b/benchmarks/600.linearalgebra/605.lu/python/function.py new file mode 100755 index 000000000..fc99a3ab9 --- /dev/null +++ 
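Since 604.cholesky builds A as L·Lᵀ with a unit diagonal, the input is symmetric positive definite by construction, which is exactly what `torch.linalg.cholesky` requires. A quick sanity check mirroring the benchmark's initializer (a sketch, not part of the benchmark):

```python
# Rebuild the benchmark's SPD input at a small N and verify the factorization.
import torch

N, device, dtype = 64, "cuda", torch.float32
j = torch.arange(N, device=device)
v = (torch.remainder(-j, N).to(dtype) / N) + 1
L0 = torch.tril(v.expand(N, -1).clone())
L0.fill_diagonal_(1.0)
A = L0 @ L0.T  # symmetric positive definite by construction

L = torch.linalg.cholesky(A)  # lower-triangular Cholesky factor
assert torch.allclose(L @ L.T, A, atol=1e-3)
```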
b/benchmarks/600.linearalgebra/605.lu/python/function.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +import torch +import datetime + + +def initialize_torch(N, dtype=torch.float32, device="cuda"): + col = torch.arange(N, device=device) + base = (torch.remainder(-col, N).to(dtype) / N) + 1 + + A = torch.tril(base.expand(N, N)).clone() + + A.fill_diagonal_(torch.tensor(1.0, dtype=dtype, device=device)) + + A = A @ A.T + return A + + +def _kernel_lu(B: torch.Tensor) -> torch.Tensor: + n = B.shape[0] + for i in range(n): + for j in range(i): + B[i, j] = B[i, j] - (B[i, :j] @ B[:j, j]) + B[i, j] = B[i, j] / B[j, j] + for j in range(i, n): + B[i, j] = B[i, j] - (B[i, :i] @ B[:i, j]) + return B + + +def kernel(A: torch.Tensor): + torch.cuda.synchronize() + + _ = _kernel_lu(A.clone()) # Warm-up + + torch.cuda.synchronize() + + start_evt = torch.cuda.Event(enable_timing=True) + end_evt = torch.cuda.Event(enable_timing=True) + + start_evt.record() + B = None + for _ in range(A.size(0)): + B = _kernel_lu(A.clone()) + end_evt.record() + + torch.cuda.synchronize() + + gpu_ms = float(start_evt.elapsed_time(end_evt)) + return B, gpu_ms + + +def handler(event): + size = event.get("size") + if "seed" in event: + import random + + random.seed(event["seed"]) + + seed = event.get("seed", 42) + seed = int(seed) + + gen_begin = datetime.datetime.now() + A = initialize_torch(size, dtype=torch.float32, device="cuda") + gen_end = datetime.datetime.now() + + comp_begin = datetime.datetime.now() + B, gpu_ms = kernel(A) + comp_end = datetime.datetime.now() + + gen_us = (gen_end - gen_begin) / datetime.timedelta(microseconds=1) + comp_us = (comp_end - comp_begin) / datetime.timedelta(microseconds=1) + + return { + "measurement": { + "generating_time": gen_us, + "compute_time": comp_us, + "gpu_time": gpu_ms, + } + } diff --git a/benchmarks/600.linearalgebra/605.lu/python/requirements.txt b/benchmarks/600.linearalgebra/605.lu/python/requirements.txt new file mode 100755 index 000000000..d8d966118 --- /dev/null +++ b/benchmarks/600.linearalgebra/605.lu/python/requirements.txt @@ -0,0 +1 @@ +torch==2.4.1 diff --git a/benchmarks/600.linearalgebra/605.lu/python/requirements.txt.3.10 b/benchmarks/600.linearalgebra/605.lu/python/requirements.txt.3.10 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/605.lu/python/requirements.txt.3.11 b/benchmarks/600.linearalgebra/605.lu/python/requirements.txt.3.11 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/605.lu/python/requirements.txt.3.12 b/benchmarks/600.linearalgebra/605.lu/python/requirements.txt.3.12 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/605.lu/python/requirements.txt.3.7 b/benchmarks/600.linearalgebra/605.lu/python/requirements.txt.3.7 new file mode 100755 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/605.lu/python/requirements.txt.3.8 b/benchmarks/600.linearalgebra/605.lu/python/requirements.txt.3.8 new file mode 100755 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/605.lu/python/requirements.txt.3.9 b/benchmarks/600.linearalgebra/605.lu/python/requirements.txt.3.9 new file mode 100755 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/605.lu/python/requirements.txt.arm.3.8 b/benchmarks/600.linearalgebra/605.lu/python/requirements.txt.arm.3.8 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/605.lu/python/requirements.txt.arm.3.9 
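`_kernel_lu` above is a Doolittle-style in-place LU without pivoting: the multipliers are stored strictly below the diagonal (the unit diagonal of L is implicit) and U occupies the diagonal and above. Because the generated A is SPD, no pivoting is needed, so the packed result can be validated by reconstruction. A sketch under those assumptions (`_kernel_lu` is the function from the file above; N is arbitrary here):

```python
import torch

N, device = 32, "cuda"
col = torch.arange(N, device=device)
base = (torch.remainder(-col, N).float() / N) + 1
A = torch.tril(base.expand(N, N)).clone()
A.fill_diagonal_(1.0)
A = A @ A.T  # SPD, so Doolittle LU needs no pivoting

B = _kernel_lu(A.clone())  # packed: L strictly below the diagonal, U on/above it
L = torch.tril(B, diagonal=-1) + torch.eye(N, device=device)
U = torch.triu(B)
assert torch.allclose(L @ U, A, atol=1e-3)  # reconstruction matches A
```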
b/benchmarks/600.linearalgebra/605.lu/python/requirements.txt.arm.3.9 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/606.spmv/config.json b/benchmarks/600.linearalgebra/606.spmv/config.json new file mode 100644 index 000000000..e80fb4351 --- /dev/null +++ b/benchmarks/600.linearalgebra/606.spmv/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 120, + "memory": 512, + "languages": ["python"], + "modules": [] +} diff --git a/benchmarks/600.linearalgebra/606.spmv/input.py b/benchmarks/600.linearalgebra/606.spmv/input.py new file mode 100644 index 000000000..5cb42a0d9 --- /dev/null +++ b/benchmarks/600.linearalgebra/606.spmv/input.py @@ -0,0 +1,13 @@ +size_generators = {"test": 10, "small": 100, "large": 1000} + + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + return {"size": size_generators[size], "seed": 42, "density": 0.01} diff --git a/benchmarks/600.linearalgebra/606.spmv/python/function.py b/benchmarks/600.linearalgebra/606.spmv/python/function.py new file mode 100755 index 000000000..e2c4b0218 --- /dev/null +++ b/benchmarks/600.linearalgebra/606.spmv/python/function.py @@ -0,0 +1,71 @@ +import torch +import datetime + + +def initialize_torch(N, density=0.01, dtype=torch.float32, device="cuda", seed=42): + if seed is not None: + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + nnz = int(N * N * density) + row_indices = torch.randint(0, N, (nnz,), device=device) + col_indices = torch.randint(0, N, (nnz,), device=device) + values = torch.randn(nnz, dtype=dtype, device=device) + + indices = torch.stack([row_indices, col_indices]) + sparse_matrix = torch.sparse_coo_tensor(indices, values, (N, N), dtype=dtype, device=device) + + sparse_matrix_csr = sparse_matrix.to_sparse_csr() + + x = torch.randn(N, dtype=dtype, device=device) + + return sparse_matrix_csr, x + + +def kernel_spmv(A, x, reps=100): + torch.cuda.synchronize() + _ = torch.sparse.mm(A, x.unsqueeze(1)).squeeze() # warmup + torch.cuda.synchronize() + + start_evt = torch.cuda.Event(enable_timing=True) + end_evt = torch.cuda.Event(enable_timing=True) + start_evt.record() + for _ in range(reps): + y = torch.sparse.mm(A, x.unsqueeze(1)).squeeze() + end_evt.record() + torch.cuda.synchronize() + gpu_ms = float(start_evt.elapsed_time(end_evt)) + return y, gpu_ms + + +def handler(event): + size = event.get("size") + density = event.get("density", 0.01) # default 1% density + + if "seed" in event: + import random + + random.seed(event["seed"]) + seed = event.get("seed", 42) + seed = int(seed) + else: + seed = 42 + + gen_begin = datetime.datetime.now() + A, x = initialize_torch(size, density=density, dtype=torch.float32, device="cuda", seed=seed) + gen_end = datetime.datetime.now() + + comp_begin = datetime.datetime.now() + y_out, gpu_ms = kernel_spmv(A, x, reps=100) + comp_end = datetime.datetime.now() + + gen_us = (gen_end - gen_begin) / datetime.timedelta(microseconds=1) + comp_us = (comp_end - comp_begin) / datetime.timedelta(microseconds=1) + + return { + "measurement": { + "generating_time": gen_us, + "compute_time": comp_us, + "gpu_time": gpu_ms, + } + } diff --git a/benchmarks/600.linearalgebra/606.spmv/python/requirements.txt b/benchmarks/600.linearalgebra/606.spmv/python/requirements.txt new file mode 100755 index 000000000..d8d966118 --- /dev/null +++ b/benchmarks/600.linearalgebra/606.spmv/python/requirements.txt @@ -0,0 +1 @@ +torch==2.4.1 diff --git 
a/benchmarks/600.linearalgebra/606.spmv/python/requirements.txt.3.10 b/benchmarks/600.linearalgebra/606.spmv/python/requirements.txt.3.10 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/606.spmv/python/requirements.txt.3.11 b/benchmarks/600.linearalgebra/606.spmv/python/requirements.txt.3.11 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/606.spmv/python/requirements.txt.3.12 b/benchmarks/600.linearalgebra/606.spmv/python/requirements.txt.3.12 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/606.spmv/python/requirements.txt.3.7 b/benchmarks/600.linearalgebra/606.spmv/python/requirements.txt.3.7 new file mode 100755 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/606.spmv/python/requirements.txt.3.8 b/benchmarks/600.linearalgebra/606.spmv/python/requirements.txt.3.8 new file mode 100755 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/606.spmv/python/requirements.txt.3.9 b/benchmarks/600.linearalgebra/606.spmv/python/requirements.txt.3.9 new file mode 100755 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/606.spmv/python/requirements.txt.arm.3.8 b/benchmarks/600.linearalgebra/606.spmv/python/requirements.txt.arm.3.8 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/606.spmv/python/requirements.txt.arm.3.9 b/benchmarks/600.linearalgebra/606.spmv/python/requirements.txt.arm.3.9 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/607.fw/config.json b/benchmarks/600.linearalgebra/607.fw/config.json new file mode 100644 index 000000000..e80fb4351 --- /dev/null +++ b/benchmarks/600.linearalgebra/607.fw/config.json @@ -0,0 +1,6 @@ +{ + "timeout": 120, + "memory": 512, + "languages": ["python"], + "modules": [] +} diff --git a/benchmarks/600.linearalgebra/607.fw/input.py b/benchmarks/600.linearalgebra/607.fw/input.py new file mode 100644 index 000000000..ec8cd9d59 --- /dev/null +++ b/benchmarks/600.linearalgebra/607.fw/input.py @@ -0,0 +1,13 @@ +size_generators = {"test": 10, "small": 100, "large": 1000} + + +def generate_input( + data_dir, + size, + benchmarks_bucket, + input_paths, + output_paths, + upload_func, + nosql_func, +): + return {"size": size_generators[size], "seed": 42} diff --git a/benchmarks/600.linearalgebra/607.fw/python/function.py b/benchmarks/600.linearalgebra/607.fw/python/function.py new file mode 100755 index 000000000..bee06dd03 --- /dev/null +++ b/benchmarks/600.linearalgebra/607.fw/python/function.py @@ -0,0 +1,71 @@ +import torch +import datetime + + +def initialize_torch(N, dtype=torch.int32, device="cuda", seed=42): + if seed is not None: + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + i, j = torch.meshgrid( + torch.arange(N, device=device), torch.arange(N, device=device), indexing="ij" + ) + path = ((i * j) % 7 + 1).to(dtype) + + mask = ((i + j) % 13 == 0) | ((i + j) % 7 == 0) | ((i + j) % 11 == 0) + path = path.masked_fill(mask, torch.as_tensor(999, dtype=dtype, device=device)) + return path + + +def kernel_fw(path): + torch.cuda.synchronize() + path2 = path.clone() + n = path2.size(0) + for k in range(n): + for i in range(n): + path2[i, :] = torch.minimum(path2[i, :], path2[i, k] + path2[k, :]) # warmup + torch.cuda.synchronize() + + start_evt = torch.cuda.Event(enable_timing=True) + end_evt = torch.cuda.Event(enable_timing=True) + start_evt.record() + n = path.size(0) + for k in range(n): 
+ for i in range(n): + path[i, :] = torch.minimum(path[i, :], path[i, k] + path[k, :]) + end_evt.record() + torch.cuda.synchronize() + gpu_ms = float(start_evt.elapsed_time(end_evt)) + return path, gpu_ms + + +def handler(event): + size = event.get("size") + + if "seed" in event: + import random + + random.seed(event["seed"]) + seed = event.get("seed", 42) + seed = int(seed) + else: + seed = 42 + + gen_begin = datetime.datetime.now() + path = initialize_torch(size, dtype=torch.float32, device="cuda", seed=seed) + gen_end = datetime.datetime.now() + + comp_begin = datetime.datetime.now() + path_out, gpu_ms = kernel_fw(path) + comp_end = datetime.datetime.now() + + gen_us = (gen_end - gen_begin) / datetime.timedelta(microseconds=1) + comp_us = (comp_end - comp_begin) / datetime.timedelta(microseconds=1) + + return { + "measurement": { + "generating_time": gen_us, + "compute_time": comp_us, + "gpu_time": gpu_ms, + } + } diff --git a/benchmarks/600.linearalgebra/607.fw/python/requirements.txt b/benchmarks/600.linearalgebra/607.fw/python/requirements.txt new file mode 100755 index 000000000..d8d966118 --- /dev/null +++ b/benchmarks/600.linearalgebra/607.fw/python/requirements.txt @@ -0,0 +1 @@ +torch==2.4.1 diff --git a/benchmarks/600.linearalgebra/607.fw/python/requirements.txt.3.10 b/benchmarks/600.linearalgebra/607.fw/python/requirements.txt.3.10 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/607.fw/python/requirements.txt.3.11 b/benchmarks/600.linearalgebra/607.fw/python/requirements.txt.3.11 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/607.fw/python/requirements.txt.3.12 b/benchmarks/600.linearalgebra/607.fw/python/requirements.txt.3.12 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/607.fw/python/requirements.txt.3.7 b/benchmarks/600.linearalgebra/607.fw/python/requirements.txt.3.7 new file mode 100755 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/607.fw/python/requirements.txt.3.8 b/benchmarks/600.linearalgebra/607.fw/python/requirements.txt.3.8 new file mode 100755 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/607.fw/python/requirements.txt.3.9 b/benchmarks/600.linearalgebra/607.fw/python/requirements.txt.3.9 new file mode 100755 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/607.fw/python/requirements.txt.arm.3.8 b/benchmarks/600.linearalgebra/607.fw/python/requirements.txt.arm.3.8 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/600.linearalgebra/607.fw/python/requirements.txt.arm.3.9 b/benchmarks/600.linearalgebra/607.fw/python/requirements.txt.arm.3.9 new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/wrappers/aws/python/handler.py b/benchmarks/wrappers/aws/python/handler.py index 907b2c612..a4ce0a6b9 100644 --- a/benchmarks/wrappers/aws/python/handler.py +++ b/benchmarks/wrappers/aws/python/handler.py @@ -1,39 +1,48 @@ - -import datetime, io, json, os, sys, uuid +import datetime +import io +import json +import os +import sys +import uuid # Add current directory to allow location of packages -sys.path.append(os.path.join(os.path.dirname(__file__), '.python_packages/lib/site-packages')) +sys.path.append(os.path.join(os.path.dirname(__file__), ".python_packages/lib/site-packages")) # TODO: usual trigger # implement support for S3 and others + + def handler(event, context): income_timestamp = datetime.datetime.now().timestamp() # HTTP trigger with 
API Gateway - if 'body' in event: - event = json.loads(event['body']) + if "body" in event: + event = json.loads(event["body"]) req_id = context.aws_request_id - event['request-id'] = req_id - event['income-timestamp'] = income_timestamp + event["request-id"] = req_id + event["income-timestamp"] = income_timestamp begin = datetime.datetime.now() from function import function + ret = function.handler(event) end = datetime.datetime.now() - log_data = { - 'output': ret['result'] - } - if 'measurement' in ret: - log_data['measurement'] = ret['measurement'] - if 'logs' in event: - log_data['time'] = (end - begin) / datetime.timedelta(microseconds=1) + log_data = {"output": ret["result"]} + if "measurement" in ret: + log_data["measurement"] = ret["measurement"] + if "logs" in event: + log_data["time"] = (end - begin) / datetime.timedelta(microseconds=1) results_begin = datetime.datetime.now() from function import storage + storage_inst = storage.storage.get_instance() - b = event.get('logs').get('bucket') - storage_inst.upload_stream(b, '{}.json'.format(req_id), - io.BytesIO(json.dumps(log_data).encode('utf-8'))) + b = event.get("logs").get("bucket") + storage_inst.upload_stream( + b, + "{}.json".format(req_id), + io.BytesIO(json.dumps(log_data).encode("utf-8")), + ) results_end = datetime.datetime.now() results_time = (results_end - results_begin) / datetime.timedelta(microseconds=1) else: @@ -41,14 +50,14 @@ def handler(event, context): # cold test is_cold = False - fname = os.path.join('/tmp', 'cold_run') + fname = os.path.join("/tmp", "cold_run") if not os.path.exists(fname): is_cold = True container_id = str(uuid.uuid4())[0:8] - with open(fname, 'a') as f: + with open(fname, "a") as f: f.write(container_id) else: - with open(fname, 'r') as f: + with open(fname, "r") as f: container_id = f.read() cold_start_var = "" @@ -56,16 +65,17 @@ def handler(event, context): cold_start_var = os.environ["cold_start"] return { - 'statusCode': 200, - 'body': json.dumps({ - 'begin': begin.strftime('%s.%f'), - 'end': end.strftime('%s.%f'), - 'results_time': results_time, - 'is_cold': is_cold, - 'result': log_data, - 'request_id': context.aws_request_id, - 'cold_start_var': cold_start_var, - 'container_id': container_id, - }) + "statusCode": 200, + "body": json.dumps( + { + "begin": begin.strftime("%s.%f"), + "end": end.strftime("%s.%f"), + "results_time": results_time, + "is_cold": is_cold, + "result": log_data, + "request_id": context.aws_request_id, + "cold_start_var": cold_start_var, + "container_id": container_id, + } + ), } - diff --git a/benchmarks/wrappers/aws/python/nosql.py b/benchmarks/wrappers/aws/python/nosql.py index 72bc2d9da..ad300071d 100644 --- a/benchmarks/wrappers/aws/python/nosql.py +++ b/benchmarks/wrappers/aws/python/nosql.py @@ -57,7 +57,10 @@ def insert( self._get_table(table_name).put_item(Item=data) def get( - self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str] + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], ) -> dict: data = {} @@ -107,7 +110,12 @@ def query(self, table_name: str, primary_key: Tuple[str, str], _: str) -> List[d )["Items"] return self._remove_decimals(res) - def delete(self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str]): + def delete( + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], + ): data = {} for key in (primary_key, secondary_key): data[key[0]] = key[1] diff --git a/benchmarks/wrappers/aws/python/setup.py
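The reformatted AWS wrapper keeps its original contract: unwrap an API Gateway body, stamp `request-id` and `income-timestamp`, call `function.handler`, and detect cold starts via a `/tmp/cold_run` marker. Note that it reads `ret["result"]` unconditionally, so handlers that return only a `measurement` key (as several 600.x benchmarks above do) would need a `result` entry when deployed behind it. A local smoke-test sketch, with a hypothetical `FakeContext` and the `function` package assumed importable next to the wrapper, as in a real deployment bundle:

```python
import json

import handler  # the AWS wrapper module above


class FakeContext:
    aws_request_id = "local-0001"  # the only field the wrapper reads from context


resp = handler.handler({"body": json.dumps({"sleep": 1})}, FakeContext())
body = json.loads(resp["body"])
print(body["is_cold"], body["container_id"])
```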
b/benchmarks/wrappers/aws/python/setup.py index b3d878351..016974465 100644 --- a/benchmarks/wrappers/aws/python/setup.py +++ b/benchmarks/wrappers/aws/python/setup.py @@ -2,14 +2,13 @@ from glob import glob from pkg_resources import parse_requirements -with open('requirements.txt') as f: +with open("requirements.txt") as f: requirements = [str(r) for r in parse_requirements(f)] setup( - name='function', + name="function", install_requires=requirements, - packages=['function'], - package_dir={'function': '.'}, - package_data={'function': glob('**', recursive=True)}, + packages=["function"], + package_dir={"function": "."}, + package_data={"function": glob("**", recursive=True)}, ) - diff --git a/benchmarks/wrappers/aws/python/storage.py b/benchmarks/wrappers/aws/python/storage.py index 4be0025e8..50875fbfc 100644 --- a/benchmarks/wrappers/aws/python/storage.py +++ b/benchmarks/wrappers/aws/python/storage.py @@ -10,16 +10,14 @@ class storage: client = None def __init__(self): - self.client = boto3.client('s3') + self.client = boto3.client("s3") @staticmethod def unique_name(name): name, extension = os.path.splitext(name) - return '{name}.{random}{extension}'.format( - name=name, - extension=extension, - random=str(uuid.uuid4()).split('-')[0] - ) + return "{name}.{random}{extension}".format( + name=name, extension=extension, random=str(uuid.uuid4()).split("-")[0] + ) def upload(self, bucket, file, filepath): key_name = storage.unique_name(file) @@ -31,8 +29,8 @@ def download(self, bucket, file, filepath): def download_directory(self, bucket, prefix, path): objects = self.client.list_objects_v2(Bucket=bucket, Prefix=prefix) - for obj in objects['Contents']: - file_name = obj['Key'] + for obj in objects["Contents"]: + file_name = obj["Key"] path_to_file = os.path.dirname(file_name) os.makedirs(os.path.join(path, path_to_file), exist_ok=True) self.download(bucket, file_name, os.path.join(path, file_name)) @@ -46,7 +44,7 @@ def download_stream(self, bucket, file): data = io.BytesIO() self.client.download_fileobj(bucket, file, data) return data.getbuffer() - + def get_instance(): if storage.instance is None: storage.instance = storage() diff --git a/benchmarks/wrappers/azure/python/handler.py b/benchmarks/wrappers/azure/python/handler.py index 88e44baf6..9f04930f6 100644 --- a/benchmarks/wrappers/azure/python/handler.py +++ b/benchmarks/wrappers/azure/python/handler.py @@ -1,52 +1,62 @@ - -import datetime, io, json, os, uuid +import datetime +import io +import json +import os +import uuid import azure.functions as func -if 'NOSQL_STORAGE_DATABASE' in os.environ: +if "NOSQL_STORAGE_DATABASE" in os.environ: from . import nosql nosql.nosql.get_instance( - os.environ['NOSQL_STORAGE_DATABASE'], - os.environ['NOSQL_STORAGE_URL'], - os.environ['NOSQL_STORAGE_CREDS'] + os.environ["NOSQL_STORAGE_DATABASE"], + os.environ["NOSQL_STORAGE_URL"], + os.environ["NOSQL_STORAGE_CREDS"], ) -if 'STORAGE_CONNECTION_STRING' in os.environ: +if "STORAGE_CONNECTION_STRING" in os.environ: from . 
import storage - client = storage.storage.get_instance(os.environ['STORAGE_CONNECTION_STRING']) + + client = storage.storage.get_instance(os.environ["STORAGE_CONNECTION_STRING"]) + # TODO: usual trigger # implement support for blob and others + + def main(req: func.HttpRequest, context: func.Context) -> func.HttpResponse: income_timestamp = datetime.datetime.now().timestamp() req_json = req.get_json() - req_json['request-id'] = context.invocation_id - req_json['income-timestamp'] = income_timestamp + req_json["request-id"] = context.invocation_id + req_json["income-timestamp"] = income_timestamp begin = datetime.datetime.now() # We are deployed in the same directory from . import function + ret = function.handler(req_json) end = datetime.datetime.now() - log_data = { - 'output': ret['result'] - } - if 'measurement' in ret: - log_data['measurement'] = ret['measurement'] - if 'logs' in req_json: - log_data['time'] = (end - begin) / datetime.timedelta(microseconds=1) + log_data = {"output": ret["result"]} + if "measurement" in ret: + log_data["measurement"] = ret["measurement"] + if "logs" in req_json: + log_data["time"] = (end - begin) / datetime.timedelta(microseconds=1) results_begin = datetime.datetime.now() from . import storage + storage_inst = storage.storage.get_instance() - b = req_json.get('logs').get('bucket') + b = req_json.get("logs").get("bucket") req_id = context.invocation_id - storage_inst.upload_stream(b, '{}.json'.format(req_id), - io.BytesIO(json.dumps(log_data).encode('utf-8'))) + storage_inst.upload_stream( + b, + "{}.json".format(req_id), + io.BytesIO(json.dumps(log_data).encode("utf-8")), + ) results_end = datetime.datetime.now() results_time = (results_end - results_begin) / datetime.timedelta(microseconds=1) else: @@ -54,14 +64,14 @@ def main(req: func.HttpRequest, context: func.Context) -> func.HttpResponse: # cold test is_cold = False - fname = os.path.join('/tmp','cold_run') + fname = os.path.join("/tmp", "cold_run") if not os.path.exists(fname): is_cold = True container_id = str(uuid.uuid4())[0:8] - with open(fname, 'a') as f: + with open(fname, "a") as f: f.write(container_id) else: - with open(fname, 'r') as f: + with open(fname, "r") as f: container_id = f.read() is_cold_worker = False @@ -73,17 +83,18 @@ def main(req: func.HttpRequest, context: func.Context) -> func.HttpResponse: is_cold_worker = True return func.HttpResponse( - json.dumps({ - 'begin': begin.strftime('%s.%f'), - 'end': end.strftime('%s.%f'), - 'results_time': results_time, - 'result': log_data, - 'is_cold': is_cold, - 'is_cold_worker': is_cold_worker, - 'container_id': container_id, - 'environ_container_id': os.environ['CONTAINER_NAME'], - 'request_id': context.invocation_id - }), - mimetype="application/json" + json.dumps( + { + "begin": begin.strftime("%s.%f"), + "end": end.strftime("%s.%f"), + "results_time": results_time, + "result": log_data, + "is_cold": is_cold, + "is_cold_worker": is_cold_worker, + "container_id": container_id, + "environ_container_id": os.environ["CONTAINER_NAME"], + "request_id": context.invocation_id, + } + ), + mimetype="application/json", ) - diff --git a/benchmarks/wrappers/azure/python/nosql.py b/benchmarks/wrappers/azure/python/nosql.py index f7dd94851..26c8608d1 100644 --- a/benchmarks/wrappers/azure/python/nosql.py +++ b/benchmarks/wrappers/azure/python/nosql.py @@ -34,7 +34,10 @@ def insert( self._get_table(table_name).upsert_item(data) def get( - self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str] + self, + table_name: str, 
+ primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], ) -> dict: res = self._get_table(table_name).read_item( item=secondary_key[1], partition_key=primary_key[1] @@ -80,13 +83,20 @@ def query( return res - def delete(self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str]): + def delete( + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], + ): self._get_table(table_name).delete_item(item=secondary_key[1], partition_key=primary_key[1]) @staticmethod def get_instance( - database: Optional[str] = None, url: Optional[str] = None, credential: Optional[str] = None + database: Optional[str] = None, + url: Optional[str] = None, + credential: Optional[str] = None, ): if nosql.instance is None: assert database is not None and url is not None and credential is not None diff --git a/benchmarks/wrappers/azure/python/storage.py b/benchmarks/wrappers/azure/python/storage.py index 42b129c89..fabd8e6a1 100644 --- a/benchmarks/wrappers/azure/python/storage.py +++ b/benchmarks/wrappers/azure/python/storage.py @@ -1,10 +1,10 @@ - import os import uuid from typing import Optional from azure.storage.blob import BlobServiceClient + class storage: instance = None client = None @@ -15,20 +15,18 @@ def __init__(self, connection_string: str): @staticmethod def unique_name(name): name, extension = os.path.splitext(name) - return '{name}.{random}{extension}'.format( - name=name, - extension=extension, - random=str(uuid.uuid4()).split('-')[0] - ) + return "{name}.{random}{extension}".format( + name=name, extension=extension, random=str(uuid.uuid4()).split("-")[0] + ) def upload(self, container, file, filepath): - with open(filepath, 'rb') as data: + with open(filepath, "rb") as data: return self.upload_stream(container, file, data) def download(self, container, file, filepath): - with open(filepath, 'wb') as download_file: - download_file.write( self.download_stream(container, file) ) - + with open(filepath, "wb") as download_file: + download_file.write(self.download_stream(container, file)) + def download_directory(self, container, prefix, path): client = self.client.get_container_client(container=container) objects = client.list_blobs(name_starts_with=prefix) @@ -37,20 +35,17 @@ def download_directory(self, container, prefix, path): path_to_file = os.path.dirname(file_name) os.makedirs(os.path.join(path, path_to_file), exist_ok=True) self.download(container, file_name, os.path.join(path, file_name)) - + def upload_stream(self, container, file, data): key_name = storage.unique_name(file) - client = self.client.get_blob_client( - container=container, - blob=key_name - ) + client = self.client.get_blob_client(container=container, blob=key_name) client.upload_blob(data) return key_name def download_stream(self, container, file): client = self.client.get_blob_client(container=container, blob=file) return client.download_blob().readall() - + @staticmethod def get_instance(connection_string: Optional[str] = None): if storage.instance is None: diff --git a/benchmarks/wrappers/gcp/python/handler.py b/benchmarks/wrappers/gcp/python/handler.py index 9b6989611..378540dcc 100644 --- a/benchmarks/wrappers/gcp/python/handler.py +++ b/benchmarks/wrappers/gcp/python/handler.py @@ -1,44 +1,48 @@ -import datetime, io, json, os, uuid, sys +import datetime +import io +import json +import os +import sys +import uuid -sys.path.append(os.path.join(os.path.dirname(__file__), '.python_packages/lib/site-packages')) 
+sys.path.append(os.path.join(os.path.dirname(__file__), ".python_packages/lib/site-packages")) # This variable is defined by SeBS during function creation. -if 'NOSQL_STORAGE_DATABASE' in os.environ: +if "NOSQL_STORAGE_DATABASE" in os.environ: from function import nosql - nosql.nosql.get_instance( - os.environ['NOSQL_STORAGE_DATABASE'] - ) + nosql.nosql.get_instance(os.environ["NOSQL_STORAGE_DATABASE"]) def handler(req): income_timestamp = datetime.datetime.now().timestamp() - req_id = req.headers.get('Function-Execution-Id') - + req_id = req.headers.get("Function-Execution-Id") req_json = req.get_json() - req_json['request-id'] = req_id - req_json['income-timestamp'] = income_timestamp + req_json["request-id"] = req_id + req_json["income-timestamp"] = income_timestamp begin = datetime.datetime.now() # We are deployed in the same directory from function import function + ret = function.handler(req_json) end = datetime.datetime.now() - - log_data = { - 'output': ret['result'] - } - if 'measurement' in ret: - log_data['measurement'] = ret['measurement'] - if 'logs' in req_json: - log_data['time'] = (end - begin) / datetime.timedelta(microseconds=1) + log_data = {"output": ret["result"]} + if "measurement" in ret: + log_data["measurement"] = ret["measurement"] + if "logs" in req_json: + log_data["time"] = (end - begin) / datetime.timedelta(microseconds=1) results_begin = datetime.datetime.now() from function import storage + storage_inst = storage.storage.get_instance() - b = req_json.get('logs').get('bucket') - storage_inst.upload_stream(b, '{}.json'.format(req_id), - io.BytesIO(json.dumps(log_data).encode('utf-8'))) + b = req_json.get("logs").get("bucket") + storage_inst.upload_stream( + b, + "{}.json".format(req_id), + io.BytesIO(json.dumps(log_data).encode("utf-8")), + ) results_end = datetime.datetime.now() results_time = (results_end - results_begin) / datetime.timedelta(microseconds=1) else: @@ -46,27 +50,33 @@ def handler(req): # cold test is_cold = False - fname = os.path.join('/tmp', 'cold_run') + fname = os.path.join("/tmp", "cold_run") if not os.path.exists(fname): is_cold = True container_id = str(uuid.uuid4())[0:8] - with open(fname, 'a') as f: + with open(fname, "a") as f: f.write(container_id) else: - with open(fname, 'r') as f: + with open(fname, "r") as f: container_id = f.read() cold_start_var = "" if "cold_start" in os.environ: cold_start_var = os.environ["cold_start"] - return json.dumps({ - 'begin': begin.strftime('%s.%f'), - 'end': end.strftime('%s.%f'), - 'results_time': results_time, - 'is_cold': is_cold, - 'result': log_data, - 'request_id': req_id, - 'cold_start_var': cold_start_var, - 'container_id': container_id, - }), 200, {'ContentType': 'application/json'} + return ( + json.dumps( + { + "begin": begin.strftime("%s.%f"), + "end": end.strftime("%s.%f"), + "results_time": results_time, + "is_cold": is_cold, + "result": log_data, + "request_id": req_id, + "cold_start_var": cold_start_var, + "container_id": container_id, + } + ), + 200, + {"ContentType": "application/json"}, + ) diff --git a/benchmarks/wrappers/gcp/python/nosql.py b/benchmarks/wrappers/gcp/python/nosql.py index 408712857..1eeba638c 100644 --- a/benchmarks/wrappers/gcp/python/nosql.py +++ b/benchmarks/wrappers/gcp/python/nosql.py @@ -70,7 +70,10 @@ def update( self._client.put(res) def get( - self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str] + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], ) -> Optional[dict]:
parent_key = self._client.key(primary_key[0], primary_key[1]) @@ -110,7 +113,12 @@ def query( return res - def delete(self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str]): + def delete( + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], + ): parent_key = self._client.key(primary_key[0], primary_key[1]) key = self._client.key( # kind determines the table diff --git a/benchmarks/wrappers/gcp/python/storage.py b/benchmarks/wrappers/gcp/python/storage.py index 81163cb34..70f182618 100644 --- a/benchmarks/wrappers/gcp/python/storage.py +++ b/benchmarks/wrappers/gcp/python/storage.py @@ -15,11 +15,9 @@ def __init__(self): @staticmethod def unique_name(name): name, extension = os.path.splitext(name) - return '{name}.{random}{extension}'.format( - name=name, - extension=extension, - random=str(uuid.uuid4()).split('-')[0] - ) + return "{name}.{random}{extension}".format( + name=name, extension=extension, random=str(uuid.uuid4()).split("-")[0] + ) def upload(self, bucket, file, filepath): key_name = storage.unique_name(file) diff --git a/benchmarks/wrappers/local/python/nosql.py b/benchmarks/wrappers/local/python/nosql.py index 0e816954c..fc6379491 100644 --- a/benchmarks/wrappers/local/python/nosql.py +++ b/benchmarks/wrappers/local/python/nosql.py @@ -67,7 +67,10 @@ def insert( self._get_table(table_name).put_item(Item=data) def get( - self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str] + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], ) -> dict: data = {} @@ -117,7 +120,12 @@ def query(self, table_name: str, primary_key: Tuple[str, str], _: str) -> List[d )["Items"] return self._remove_decimals(res) - def delete(self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str]): + def delete( + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], + ): data = {} for key in (primary_key, secondary_key): data[key[0]] = key[1] diff --git a/benchmarks/wrappers/local/python/storage.py b/benchmarks/wrappers/local/python/storage.py index b44968408..d25583a13 100644 --- a/benchmarks/wrappers/local/python/storage.py +++ b/benchmarks/wrappers/local/python/storage.py @@ -1,32 +1,28 @@ -import io import os import uuid import minio + class storage: instance = None client = None def __init__(self): - if 'MINIO_ADDRESS' in os.environ: - address = os.environ['MINIO_ADDRESS'] - access_key = os.environ['MINIO_ACCESS_KEY'] - secret_key = os.environ['MINIO_SECRET_KEY'] + if "MINIO_ADDRESS" in os.environ: + address = os.environ["MINIO_ADDRESS"] + access_key = os.environ["MINIO_ACCESS_KEY"] + secret_key = os.environ["MINIO_SECRET_KEY"] self.client = minio.Minio( - address, - access_key=access_key, - secret_key=secret_key, - secure=False) + address, access_key=access_key, secret_key=secret_key, secure=False + ) @staticmethod def unique_name(name): name, extension = os.path.splitext(name) - return '{name}.{random}{extension}'.format( - name=name, - extension=extension, - random=str(uuid.uuid4()).split('-')[0] - ) + return "{name}.{random}{extension}".format( + name=name, extension=extension, random=str(uuid.uuid4()).split("-")[0] + ) def upload(self, bucket, file, filepath): key_name = storage.unique_name(file) @@ -55,4 +51,3 @@ def get_instance(): if storage.instance is None: storage.instance = storage() return storage.instance - diff --git a/benchmarks/wrappers/openwhisk/python/__main__.py 
b/benchmarks/wrappers/openwhisk/python/__main__.py index 3ae44f9c2..3833bff8c 100644 --- a/benchmarks/wrappers/openwhisk/python/__main__.py +++ b/benchmarks/wrappers/openwhisk/python/__main__.py @@ -2,24 +2,30 @@ import datetime import os + def main(args): logging.getLogger().setLevel(logging.INFO) begin = datetime.datetime.now() - args['request-id'] = os.getenv('__OW_ACTIVATION_ID') - args['income-timestamp'] = begin.timestamp() + args["request-id"] = os.getenv("__OW_ACTIVATION_ID") + args["income-timestamp"] = begin.timestamp() - for arg in ["MINIO_STORAGE_CONNECTION_URL", "MINIO_STORAGE_ACCESS_KEY", "MINIO_STORAGE_SECRET_KEY"]: + for arg in [ + "MINIO_STORAGE_CONNECTION_URL", + "MINIO_STORAGE_ACCESS_KEY", + "MINIO_STORAGE_SECRET_KEY", + ]: os.environ[arg] = args[arg] del args[arg] key_list = list(args.keys()) for arg in key_list: - if 'NOSQL_STORAGE_' in arg: + if "NOSQL_STORAGE_" in arg: os.environ[arg] = args[arg] del args[arg] try: from function import function + ret = function.handler(args) end = datetime.datetime.now() logging.info("Function result: {}".format(ret)) @@ -38,7 +44,7 @@ def main(args): return { "begin": begin.strftime("%s.%f"), "end": end.strftime("%s.%f"), - "request_id": os.getenv('__OW_ACTIVATION_ID'), + "request_id": os.getenv("__OW_ACTIVATION_ID"), "results_time": results_time, "is_cold": is_cold, "result": log_data, @@ -49,7 +55,7 @@ def main(args): return { "begin": begin.strftime("%s.%f"), "end": end.strftime("%s.%f"), - "request_id": os.getenv('__OW_ACTIVATION_ID'), + "request_id": os.getenv("__OW_ACTIVATION_ID"), "results_time": results_time, - "result": f"Error - invocation failed! Reason: {e}" + "result": f"Error - invocation failed! Reason: {e}", } diff --git a/benchmarks/wrappers/openwhisk/python/nosql.py b/benchmarks/wrappers/openwhisk/python/nosql.py index da8245009..e100d3b2c 100644 --- a/benchmarks/wrappers/openwhisk/python/nosql.py +++ b/benchmarks/wrappers/openwhisk/python/nosql.py @@ -5,6 +5,7 @@ import boto3 from botocore.client import Config + class nosql: instance: Optional["nosql"] = None @@ -14,14 +15,14 @@ def __init__(self): if environ["NOSQL_STORAGE_TYPE"] != "scylladb": raise RuntimeError(f"Unsupported NoSQL storage type: {environ['NOSQL_STORAGE_TYPE']}!") - config = Config(connect_timeout=5, retries={'max_attempts': 0}) + config = Config(connect_timeout=5, retries={"max_attempts": 0}) self.client = boto3.resource( "dynamodb", region_name="None", aws_access_key_id="None", aws_secret_access_key="None", endpoint_url=f"http://{environ['NOSQL_STORAGE_ENDPOINT']}", - config=config + config=config, ) self._tables = {} @@ -69,7 +70,10 @@ def insert( self._get_table(table_name).put_item(Item=data) def get( - self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str] + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], ) -> dict: data = {} @@ -119,7 +123,12 @@ def query(self, table_name: str, primary_key: Tuple[str, str], _: str) -> List[d )["Items"] return self._remove_decimals(res) - def delete(self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str]): + def delete( + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], + ): data = {} for key in (primary_key, secondary_key): data[key[0]] = key[1] diff --git a/benchmarks/wrappers/openwhisk/python/setup.py b/benchmarks/wrappers/openwhisk/python/setup.py index b942d059b..016974465 100644 --- a/benchmarks/wrappers/openwhisk/python/setup.py +++ 
b/benchmarks/wrappers/openwhisk/python/setup.py @@ -2,13 +2,13 @@ from glob import glob from pkg_resources import parse_requirements -with open('requirements.txt') as f: +with open("requirements.txt") as f: requirements = [str(r) for r in parse_requirements(f)] setup( - name='function', + name="function", install_requires=requirements, - packages=['function'], - package_dir={'function': '.'}, - package_data={'function': glob('**', recursive=True)}, -) \ No newline at end of file + packages=["function"], + package_dir={"function": "."}, + package_data={"function": glob("**", recursive=True)}, +) diff --git a/benchmarks/wrappers/openwhisk/python/storage.py b/benchmarks/wrappers/openwhisk/python/storage.py index 76c7e3e8e..09b9e78a7 100644 --- a/benchmarks/wrappers/openwhisk/python/storage.py +++ b/benchmarks/wrappers/openwhisk/python/storage.py @@ -1,8 +1,8 @@ +import logging import os import uuid -import json + import minio -import logging class storage: @@ -25,14 +25,14 @@ def __init__(self): maxsize=10, retries=urllib3.Retry( total=5, backoff_factor=0.2, status_forcelist=[500, 502, 503, 504] - ) + ), ) self.client = minio.Minio( os.getenv("MINIO_STORAGE_CONNECTION_URL"), access_key=os.getenv("MINIO_STORAGE_ACCESS_KEY"), secret_key=os.getenv("MINIO_STORAGE_SECRET_KEY"), secure=False, - http_client=mgr + http_client=mgr, ) except Exception as e: logging.info(e) @@ -41,12 +41,9 @@ def __init__(self): @staticmethod def unique_name(name): name, extension = os.path.splitext(name) - return '{name}.{random}{extension}'.format( - name=name, - extension=extension, - random=str(uuid.uuid4()).split('-')[0] - ) - + return "{name}.{random}{extension}".format( + name=name, extension=extension, random=str(uuid.uuid4()).split("-")[0] + ) def upload(self, bucket, file, filepath): key_name = storage.unique_name(file) @@ -64,9 +61,7 @@ def download_directory(self, bucket, prefix, path): def upload_stream(self, bucket, file, bytes_data): key_name = storage.unique_name(file) - self.client.put_object( - bucket, key_name, bytes_data, bytes_data.getbuffer().nbytes - ) + self.client.put_object(bucket, key_name, bytes_data, bytes_data.getbuffer().nbytes) return key_name def download_stream(self, bucket, file): diff --git a/docs/benchmarks.md b/docs/benchmarks.md index e292a4b04..6977672d6 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -10,6 +10,8 @@ | Multimedia | 220.video-processing | Python | x64, arm64 | Add a watermark and generate gif of a video file. | | Utilities | 311.compression | Python | x64, arm64 | Create a .zip file for a group of files in storage and return to user to download. | | Inference | 411.image-recognition | Python | x64 | Image recognition with ResNet and pytorch. | +| Inference | 412.language-bert | Python | x64 | Sentence classification with a compact BERT model served via ONNX Runtime. | +| Inference | 413.recommendation | Python | x64 | GPU DLRM-inspired recommender scoring implemented in PyTorch. | | Scientific | 501.graph-pagerank | Python | x64, arm64 | PageRank implementation with igraph. | | Scientific | 502.graph-mst | Python | x64, arm64 | Minimum spanning tree (MST) implementation with igraph. | | Scientific | 503.graph-bfs | Python | x64, arm64 | Breadth-first search (BFS) implementation with igraph. | @@ -70,6 +72,14 @@ It implements the .zip file creation with the help of the `shutil` standard libr The benchmark is inspired by MLPerf and implements image recognition with Resnet50. 
It downloads the input and model from the storage and uses the CPU-only `pytorch` library in Python. +### Language Inference + +This benchmark runs sequence classification with a compact BERT model exported to ONNX. The function downloads the model archive and text samples from storage, tokenizes the sentences, executes the ONNX Runtime session, and returns the predicted labels together with confidences. + +### Recommendation + +Inspired by MLPerf’s DLRM v2, this benchmark ships a tiny PyTorch DLRM model that optionally runs on CUDA when available. The function downloads the model and request batch, moves the network to GPU if possible, performs batched inference, and reports recommendation scores alongside timing measurements. + ## Scientific ### Graph PageRank, BFS, MST @@ -87,4 +97,3 @@ This benchmark is inspired by the [DNAVisualization](https://github.com/Benjamin ## Applications **(WiP)** Coming soon! - diff --git a/experiments.json b/experiments.json new file mode 100644 index 000000000..69f9c9cd3 --- /dev/null +++ b/experiments.json @@ -0,0 +1,295 @@ +{ + "_invocations": { + "sebd-220.video-processing-gpu-python-3.10": { + "3b50a32d-3014-4c44-a0b2-039560cdff7b": { + "billing": { + "_billed_time": null, + "_gb_seconds": 0, + "_memory": null + }, + "output": { + "begin": "1763907631.718880", + "end": "1763907635.443804", + "is_cold": false, + "request_id": "3b50a32d-3014-4c44-a0b2-039560cdff7b", + "result": { + "output": { + "measurement": { + "compute_time": 1889760.0, + "download_size": 15350231, + "download_time": 121586.0, + "upload_size": 42661, + "upload_time": 8043.0 + }, + "result": { + "bucket": "sebs-benchmarks-local-9e8a1fb2", + "key": "220.video-processing-gpu-0-output/processed-city.606b0d90.mp4" + } + } + } + }, + "provider_times": { + "execution": 0, + "initialization": 0 + }, + "request_id": "3b50a32d-3014-4c44-a0b2-039560cdff7b", + "stats": { + "cold_start": false, + "failure": false, + "memory_used": null + }, + "times": { + "benchmark": 3724924, + "client": 5749513, + "client_begin": "2025-11-23 15:20:31.716253", + "client_end": "2025-11-23 15:20:37.465766", + "http_first_byte_return": 5.749141, + "http_startup": 0.000296, + "initialization": 0 + } + }, + "51ef80d2-70e3-427a-ac8d-61fd9269878e": { + "billing": { + "_billed_time": null, + "_gb_seconds": 0, + "_memory": null + }, + "output": { + "begin": "1763907640.430133", + "end": "1763907640.430140", + "is_cold": false, + "request_id": "51ef80d2-70e3-427a-ac8d-61fd9269878e", + "result": { + "output": { + "measurement": { + "compute_time": 1404551.0, + "download_size": 15350231, + "download_time": 72988.0, + "upload_size": 42661, + "upload_time": 4235.0 + }, + "result": { + "bucket": "sebs-benchmarks-local-9e8a1fb2", + "key": "220.video-processing-gpu-0-output/processed-city.606b0d90.mp4" + } + } + } + }, + "provider_times": { + "execution": 0, + "initialization": 0 + }, + "request_id": "51ef80d2-70e3-427a-ac8d-61fd9269878e", + "stats": { + "cold_start": false, + "failure": false, + "memory_used": null + }, + "times": { + "benchmark": 7, + "client": 1483008, + "client_begin": "2025-11-23 15:20:40.433191", + "client_end": "2025-11-23 15:20:41.916199", + "http_first_byte_return": 1.482853, + "http_startup": 0.000191, + "initialization": 0 + } + }, + "9ca32b60-2f1d-45d5-905a-abc2da650f56": { + "billing": { + "_billed_time": null, + "_gb_seconds": 0, + "_memory": null + }, + "output": { + "begin": "1763907638.930096", + "end": "1763907638.930101", + "is_cold": false, + "request_id": 
"9ca32b60-2f1d-45d5-905a-abc2da650f56", + "result": { + "output": { + "measurement": { + "compute_time": 1419573.0, + "download_size": 15350231, + "download_time": 69220.0, + "upload_size": 42661, + "upload_time": 9621.0 + }, + "result": { + "bucket": "sebs-benchmarks-local-9e8a1fb2", + "key": "220.video-processing-gpu-0-output/processed-city.5fa4adb4.mp4" + } + } + } + }, + "provider_times": { + "execution": 0, + "initialization": 0 + }, + "request_id": "9ca32b60-2f1d-45d5-905a-abc2da650f56", + "stats": { + "cold_start": false, + "failure": false, + "memory_used": null + }, + "times": { + "benchmark": 5, + "client": 1499886, + "client_begin": "2025-11-23 15:20:38.932968", + "client_end": "2025-11-23 15:20:40.432854", + "http_first_byte_return": 1.499677, + "http_startup": 0.000399, + "initialization": 0 + } + }, + "a36c549f-9bb7-49a7-860e-f0f415db37f2": { + "billing": { + "_billed_time": null, + "_gb_seconds": 0, + "_memory": null + }, + "output": { + "begin": "1763907641.913196", + "end": "1763907641.913202", + "is_cold": false, + "request_id": "a36c549f-9bb7-49a7-860e-f0f415db37f2", + "result": { + "output": { + "measurement": { + "compute_time": 1430172.0, + "download_size": 15350231, + "download_time": 82087.0, + "upload_size": 42661, + "upload_time": 5858.0 + }, + "result": { + "bucket": "sebs-benchmarks-local-9e8a1fb2", + "key": "220.video-processing-gpu-0-output/processed-city.5cd4d46a.mp4" + } + } + } + }, + "provider_times": { + "execution": 0, + "initialization": 0 + }, + "request_id": "a36c549f-9bb7-49a7-860e-f0f415db37f2", + "stats": { + "cold_start": false, + "failure": false, + "memory_used": null + }, + "times": { + "benchmark": 6, + "client": 1519133, + "client_begin": "2025-11-23 15:20:41.916414", + "client_end": "2025-11-23 15:20:43.435547", + "http_first_byte_return": 1.51901, + "http_startup": 0.000124, + "initialization": 0 + } + }, + "c97d051d-838c-4261-bda0-5dd5dc8e5bd5": { + "billing": { + "_billed_time": null, + "_gb_seconds": 0, + "_memory": null + }, + "output": { + "begin": "1763907637.467845", + "end": "1763907637.467854", + "is_cold": false, + "request_id": "c97d051d-838c-4261-bda0-5dd5dc8e5bd5", + "result": { + "output": { + "measurement": { + "compute_time": 1386454.0, + "download_size": 15350231, + "download_time": 68422.0, + "upload_size": 42661, + "upload_time": 5302.0 + }, + "result": { + "bucket": "sebs-benchmarks-local-9e8a1fb2", + "key": "220.video-processing-gpu-0-output/processed-city.94a14cbc.mp4" + } + } + } + }, + "provider_times": { + "execution": 0, + "initialization": 0 + }, + "request_id": "c97d051d-838c-4261-bda0-5dd5dc8e5bd5", + "stats": { + "cold_start": false, + "failure": false, + "memory_used": null + }, + "times": { + "benchmark": 9, + "client": 1464927, + "client_begin": "2025-11-23 15:20:37.467411", + "client_end": "2025-11-23 15:20:38.932338", + "http_first_byte_return": 1.464707, + "http_startup": 0.000377, + "initialization": 0 + } + } + } + }, + "_metrics": {}, + "begin_time": 1763907631.64001, + "config": { + "deployment": { + "name": "local", + "region": "", + "resources": { + "allocated_ports": [ + 9000, + 9001 + ], + "nosql": { + "access_key": "None", + "address": "host.docker.internal:9012", + "alternator_port": 8000, + "cpus": 1, + "data_volume": "scylladb-volume", + "instance_id": "1b7969ee333b5fc962e8223946be900aa4bf0b3532fb111cd62b6a1d6618c2b9", + "mapped_port": 9012, + "memory": "750", + "region": "None", + "secret_key": "None", + "version": "6.0" + }, + "storage": { + "access_key": 
"cStL2xQ1C1c2wpB_25M3QnnNO4aizEjhHEH203q_a7s", + "address": "host.docker.internal:9011", + "data_volume": "minio-volume", + "input_buckets": [], + "instance_id": "0571b3121a5e06773187dff7ffc30b4c2a9f63ae7c05ff592e3c45b9b3a6d1e9", + "mapped_port": 9011, + "output_buckets": [], + "secret_key": "22a1207b6c97d3669271e9896097e3d8fddd6160e3c75444fb0eabf8b9ef8681", + "type": "minio", + "version": "RELEASE.2024-07-16T23-46-41Z" + } + } + }, + "experiments": { + "architecture": "x64", + "container_deployment": false, + "download_results": false, + "experiments": {}, + "flags": {}, + "runtime": { + "language": "python", + "version": "3.10" + }, + "update_code": true, + "update_storage": false + } + }, + "end_time": 1763907643.435708, + "result_bucket": null +} diff --git a/install.py b/install.py index 57f047d23..b856e45b7 100755 --- a/install.py +++ b/install.py @@ -86,7 +86,7 @@ def execute(cmd, cwd=None): execute(f"git pull", cwd=data_dir) # clone else: - execute(f"git clone https://github.com/spcl/serverless-benchmarks-data.git {data_dir}") + execute(f"git clone https://github.com/McLavish/serverless-benchmarks-data-dphpc.git {data_dir}") else: raise error @@ -99,4 +99,3 @@ def execute(cmd, cwd=None): execute("python3 setup.py build") execute("python3 pypapi/papi_build.py") os.chdir(cur_dir) - diff --git a/requirements.txt b/requirements.txt index b8c1517f0..bee695ee6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ tzlocal>=2.1 requests #linting flake8 -black==22.8.0 +black==24.4.2 mypy types-pycurl types-requests diff --git a/sebs/__init__.py b/sebs/__init__.py index b92b9f25c..bd1b0b43f 100644 --- a/sebs/__init__.py +++ b/sebs/__init__.py @@ -1,5 +1,5 @@ """ - SeBS +SeBS """ from .version import __version__ # noqa diff --git a/sebs/aws/aws.py b/sebs/aws/aws.py index 243a6f0f9..a50353154 100644 --- a/sebs/aws/aws.py +++ b/sebs/aws/aws.py @@ -131,7 +131,12 @@ def package_code( if container_deployment: # build base image and upload to ECR _, container_uri = self.ecr_client.build_base_image( - directory, language_name, language_version, architecture, benchmark, is_cached + directory, + language_name, + language_version, + architecture, + benchmark, + is_cached, ) CONFIG_FILES = { diff --git a/sebs/aws/config.py b/sebs/aws/config.py index 2d05e842e..f779b2e91 100644 --- a/sebs/aws/config.py +++ b/sebs/aws/config.py @@ -21,7 +21,9 @@ def __init__(self, access_key: str, secret_key: str): self._secret_key = secret_key client = boto3.client( - "sts", aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key + "sts", + aws_access_key_id=self.access_key, + aws_secret_access_key=self.secret_key, ) self._account_id = client.get_caller_identity()["Account"] @@ -334,7 +336,8 @@ def update_cache(self, cache: Cache): val=self.docker_username, keys=["aws", "resources", "docker", "username"] ) cache.update_config( - val=self.container_repository, keys=["aws", "resources", "container_repository"] + val=self.container_repository, + keys=["aws", "resources", "container_repository"], ) cache.update_config(val=self._lambda_role, keys=["aws", "resources", "lambda-role"]) for name, api in self._http_apis.items(): diff --git a/sebs/aws/container.py b/sebs/aws/container.py index e7c2cbe69..2d9280e57 100644 --- a/sebs/aws/container.py +++ b/sebs/aws/container.py @@ -36,7 +36,11 @@ def client(self) -> ECRClient: return self.ecr_client def registry_name( - self, benchmark: str, language_name: str, language_version: str, architecture: str + self, + benchmark: str, + language_name: str, + 
language_version: str, + architecture: str, ) -> Tuple[str, str, str, str]: account_id = self.config.credentials.account_id diff --git a/sebs/aws/dynamodb.py b/sebs/aws/dynamodb.py index 0f3cc8782..1f66cf89c 100644 --- a/sebs/aws/dynamodb.py +++ b/sebs/aws/dynamodb.py @@ -103,7 +103,11 @@ def write_to_table( """ def create_table( - self, benchmark: str, name: str, primary_key: str, secondary_key: Optional[str] = None + self, + benchmark: str, + name: str, + primary_key: str, + secondary_key: Optional[str] = None, ) -> str: table_name = f"sebs-benchmarks-{self._cloud_resources.resources_id}-{benchmark}-{name}" diff --git a/sebs/aws/resources.py b/sebs/aws/resources.py index 5913c3928..b76a7283a 100644 --- a/sebs/aws/resources.py +++ b/sebs/aws/resources.py @@ -60,7 +60,7 @@ def get_storage(self, replace_existing: Optional[bool] = None) -> PersistentStor self.config.region, access_key=self.config.credentials.access_key, secret_key=self.config.credentials.secret_key, - replace_existing=replace_existing if replace_existing is not None else False, + replace_existing=(replace_existing if replace_existing is not None else False), ) self._storage.logging_handlers = self._logging_handlers elif replace_existing is not None: diff --git a/sebs/azure/azure.py b/sebs/azure/azure.py index d848d724a..a28e80150 100644 --- a/sebs/azure/azure.py +++ b/sebs/azure/azure.py @@ -85,7 +85,6 @@ def shutdown(self): super().shutdown() def find_deployments(self) -> List[str]: - """ Look for duplicated resource groups. """ @@ -288,7 +287,8 @@ def update_function( if not found_trigger: trigger = HTTPTrigger( - function_url, self.config.resources.data_storage_account(self.cli_instance) + function_url, + self.config.resources.data_storage_account(self.cli_instance), ) trigger.logging_handlers = self.logging_handlers function.add_trigger(trigger) diff --git a/sebs/azure/cli.py b/sebs/azure/cli.py index b875ee029..3ecefdc8a 100644 --- a/sebs/azure/cli.py +++ b/sebs/azure/cli.py @@ -85,7 +85,6 @@ def login(self, appId: str, tenant: str, password: str) -> bytes: return result def upload_package(self, directory: str, dest: str): - """ This is not an efficient and memory-intensive implementation. So far, we didn't have very large functions that require many gigabytes. 
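Both NoSQL backends touched by this patch derive cloud-side table names from the same pattern: the DynamoDB create_table above and the ScyllaDB create_table further below both build f"sebs-benchmarks-{resources_id}-{benchmark}-{name}". A minimal sketch of that scheme; the function name and the example arguments are illustrative only:

def qualified_table_name(resources_id: str, benchmark: str, name: str) -> str:
    # Mirrors the f-string shared by sebs/aws/dynamodb.py and
    # sebs/storage/scylladb.py: one table per (resources id, benchmark, logical name).
    return f"sebs-benchmarks-{resources_id}-{benchmark}-{name}"

# Example (hypothetical ids):
# qualified_table_name("9e8a1fb2", "130.crud-api", "users")
# -> "sebs-benchmarks-9e8a1fb2-130.crud-api-users"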
diff --git a/sebs/azure/cloud_resources.py b/sebs/azure/cloud_resources.py index e0d2a1ddd..a80051161 100644 --- a/sebs/azure/cloud_resources.py +++ b/sebs/azure/cloud_resources.py @@ -35,7 +35,10 @@ def from_cache(account_name: str, url: str, credential: str) -> "CosmosDBAccount @staticmethod def from_allocation( - account_name: str, resource_group: str, cli_instance: AzureCLI, url: Optional[str] + account_name: str, + resource_group: str, + cli_instance: AzureCLI, + url: Optional[str], ) -> "CosmosDBAccount": if url is None: diff --git a/sebs/azure/config.py b/sebs/azure/config.py index 9aef0d8c0..ede4f1b60 100644 --- a/sebs/azure/config.py +++ b/sebs/azure/config.py @@ -20,7 +20,11 @@ class AzureCredentials(Credentials): _password: str def __init__( - self, appId: str, tenant: str, password: str, subscription_id: Optional[str] = None + self, + appId: str, + tenant: str, + password: str, + subscription_id: Optional[str] = None, ): super().__init__() self._appId = appId diff --git a/sebs/azure/cosmosdb.py b/sebs/azure/cosmosdb.py index 52f8086b1..3f4cc5168 100644 --- a/sebs/azure/cosmosdb.py +++ b/sebs/azure/cosmosdb.py @@ -77,7 +77,9 @@ def retrieve_cache(self, benchmark: str) -> bool: def update_cache(self, benchmark: str): self.cache_client.update_nosql( - self.deployment_name(), benchmark, self._benchmark_resources[benchmark].serialize() + self.deployment_name(), + benchmark, + self._benchmark_resources[benchmark].serialize(), ) def cosmos_client(self) -> CosmosClient: @@ -89,7 +91,8 @@ def cosmos_client(self) -> CosmosClient: ) self._cosmos_client = CosmosClient( - url=self._cosmosdb_account.url, credential=self._cosmosdb_account.credential + url=self._cosmosdb_account.url, + credential=self._cosmosdb_account.credential, ) return self._cosmos_client diff --git a/sebs/azure/system_resources.py b/sebs/azure/system_resources.py index 0e3494d1c..95eff859a 100644 --- a/sebs/azure/system_resources.py +++ b/sebs/azure/system_resources.py @@ -56,7 +56,7 @@ def get_storage(self, replace_existing: Optional[bool] = None) -> BlobStorage: self._cache_client, self.config.resources, self.config.resources.data_storage_account(self.cli_instance).connection_string, - replace_existing=replace_existing if replace_existing is not None else False, + replace_existing=(replace_existing if replace_existing is not None else False), ) self._storage.logging_handlers = self.logging_handlers elif replace_existing is not None: @@ -66,7 +66,10 @@ def get_storage(self, replace_existing: Optional[bool] = None) -> BlobStorage: def get_nosql_storage(self) -> CosmosDB: if self._nosql_storage is None: self._nosql_storage = CosmosDB( - self.cli_instance, self._cache_client, self.config.resources, self.config.region + self.cli_instance, + self._cache_client, + self.config.resources, + self.config.region, ) return self._nosql_storage diff --git a/sebs/benchmark.py b/sebs/benchmark.py index f159e820c..9379443e4 100644 --- a/sebs/benchmark.py +++ b/sebs/benchmark.py @@ -24,7 +24,11 @@ class BenchmarkConfig: def __init__( - self, timeout: int, memory: int, languages: List["Language"], modules: List[BenchmarkModule] + self, + timeout: int, + memory: int, + languages: List["Language"], + modules: List[BenchmarkModule], ): self._timeout = timeout self._memory = memory @@ -591,7 +595,12 @@ def build( "Using cached benchmark {} at {}".format(self.benchmark, self.code_location) ) if self.container_deployment: - return False, self.code_location, self.container_deployment, self.container_uri + return ( + False, + self.code_location, + 
self.container_deployment, + self.container_uri, + ) return False, self.code_location, self.container_deployment, "" @@ -642,7 +651,12 @@ def build( self._cache_client.add_code_package(self._deployment_name, self) self.query_cache() - return True, self._code_location, self._container_deployment, self._container_uri + return ( + True, + self._code_location, + self._container_deployment, + self._container_uri, + ) """ Locates benchmark input generator, inspect how many storage buckets @@ -655,9 +669,11 @@ def build( """ def prepare_input( - self, system_resources: SystemResources, size: str, replace_existing: bool = False + self, + system_resources: SystemResources, + size: str, + replace_existing: bool = False, ): - """ Handle object storage buckets. """ @@ -684,7 +700,10 @@ def prepare_input( if hasattr(self._benchmark_input_module, "allocate_nosql"): nosql_storage = system_resources.get_nosql_storage() - for name, table_properties in self._benchmark_input_module.allocate_nosql().items(): + for ( + name, + table_properties, + ) in self._benchmark_input_module.allocate_nosql().items(): nosql_storage.create_benchmark_tables( self._benchmark, name, @@ -701,7 +720,13 @@ def prepare_input( # storage.allocate_buckets(self.benchmark, buckets) # Get JSON and upload data as required by benchmark input_config = self._benchmark_input_module.generate_input( - self._benchmark_data_path, size, bucket, input, output, storage_func, nosql_func + self._benchmark_data_path, + size, + bucket, + input, + output, + storage_func, + nosql_func, ) # Cache only once we data is in the cloud. diff --git a/sebs/experiments/perf_cost.py b/sebs/experiments/perf_cost.py index 7b940f8df..8e7efff13 100644 --- a/sebs/experiments/perf_cost.py +++ b/sebs/experiments/perf_cost.py @@ -354,7 +354,10 @@ def process( ) as out_f: out_f.write( serialize( - {**json.loads(serialize(experiments)), "statistics": statistics} + { + **json.loads(serialize(experiments)), + "statistics": statistics, + } ) ) for func in experiments.functions(): diff --git a/sebs/faas/container.py b/sebs/faas/container.py index b17525f7b..0be66f60a 100644 --- a/sebs/faas/container.py +++ b/sebs/faas/container.py @@ -98,7 +98,10 @@ def push_image(self, repository_uri, image_tag): self.logging.info(f"Pushing image {image_tag} to {repository_uri}") ret = self.docker_client.images.push( - repository=repository_uri, tag=image_tag, stream=True, decode=True + repository=repository_uri, + tag=image_tag, + stream=True, + decode=True, ) for line in ret: self.show_progress(line, progress, layer_tasks) @@ -122,7 +125,11 @@ def push_image(self, repository_uri, image_tag): @abstractmethod def registry_name( - self, benchmark: str, language_name: str, language_version: str, architecture: str + self, + benchmark: str, + language_name: str, + language_version: str, + architecture: str, ) -> Tuple[str, str, str, str]: pass diff --git a/sebs/faas/nosql.py b/sebs/faas/nosql.py index 16f9ab119..a45b4072c 100644 --- a/sebs/faas/nosql.py +++ b/sebs/faas/nosql.py @@ -59,7 +59,11 @@ def envs(self) -> dict: """ def create_benchmark_tables( - self, benchmark: str, name: str, primary_key: str, secondary_key: Optional[str] = None + self, + benchmark: str, + name: str, + primary_key: str, + secondary_key: Optional[str] = None, ): if self.retrieve_cache(benchmark): @@ -84,7 +88,11 @@ def create_benchmark_tables( @abstractmethod def create_table( - self, benchmark: str, name: str, primary_key: str, secondary_key: Optional[str] = None + self, + benchmark: str, + name: str, + primary_key: str, 
+ secondary_key: Optional[str] = None, ) -> str: pass diff --git a/sebs/faas/storage.py b/sebs/faas/storage.py index 5b93c0539..6b36cbc27 100644 --- a/sebs/faas/storage.py +++ b/sebs/faas/storage.py @@ -37,7 +37,11 @@ def region(self): return self._region def __init__( - self, region: str, cache_client: Cache, resources: Resources, replace_existing: bool + self, + region: str, + cache_client: Cache, + resources: Resources, + replace_existing: bool, ): super().__init__() self._cache_client = cache_client @@ -143,7 +147,6 @@ def remove_bucket(self, bucket: str): def benchmark_data( self, benchmark: str, requested_buckets: Tuple[int, int] ) -> Tuple[List[str], List[str]]: - """ Add an input path inside benchmarks bucket. Bucket name format: name-idx-input diff --git a/sebs/faas/system.py b/sebs/faas/system.py index 9fbe0e273..5872f176a 100644 --- a/sebs/faas/system.py +++ b/sebs/faas/system.py @@ -75,7 +75,6 @@ def function_type() -> "Type[Function]": pass def find_deployments(self) -> List[str]: - """ Default implementation that uses storage buckets. data storage accounts. @@ -187,7 +186,6 @@ def create_function( container_deployment: bool, container_uri: str, ) -> Function: - """ Create a new function in the FaaS platform. The implementation is responsible for creating all necessary diff --git a/sebs/gcp/cli.py b/sebs/gcp/cli.py index 65ca33bc2..893d8331d 100644 --- a/sebs/gcp/cli.py +++ b/sebs/gcp/cli.py @@ -14,7 +14,10 @@ def typename() -> str: return "GCP.CLI" def __init__( - self, credentials: GCPCredentials, system_config: SeBSConfig, docker_client: docker.client + self, + credentials: GCPCredentials, + system_config: SeBSConfig, + docker_client: docker.client, ): super().__init__() diff --git a/sebs/gcp/datastore.py b/sebs/gcp/datastore.py index ae747fb17..8e2ab2f2f 100644 --- a/sebs/gcp/datastore.py +++ b/sebs/gcp/datastore.py @@ -35,7 +35,11 @@ def deployment_name(): return "gcp" def __init__( - self, cli_instance: GCloudCLI, cache_client: Cache, resources: Resources, region: str + self, + cli_instance: GCloudCLI, + cache_client: Cache, + resources: Resources, + region: str, ): super().__init__(region, cache_client, resources) self._cli_instance = cli_instance @@ -76,7 +80,9 @@ def retrieve_cache(self, benchmark: str) -> bool: def update_cache(self, benchmark: str): self._cache_client.update_nosql( - self.deployment_name(), benchmark, self._benchmark_resources[benchmark].serialize() + self.deployment_name(), + benchmark, + self._benchmark_resources[benchmark].serialize(), ) def benchmark_database(self, benchmark: str) -> str: diff --git a/sebs/gcp/gcp.py b/sebs/gcp/gcp.py index 6525034c2..aff182305 100644 --- a/sebs/gcp/gcp.py +++ b/sebs/gcp/gcp.py @@ -181,7 +181,11 @@ def package_code( # rename the main.py back to handler.py shutil.move(new_path, old_path) - return os.path.join(directory, "{}.zip".format(benchmark)), bytes_size, container_uri + return ( + os.path.join(directory, "{}.zip".format(benchmark)), + bytes_size, + container_uri, + ) def create_function( self, @@ -256,7 +260,10 @@ def create_function( body={ "policy": { "bindings": [ - {"role": "roles/cloudfunctions.invoker", "members": ["allUsers"]} + { + "role": "roles/cloudfunctions.invoker", + "members": ["allUsers"], + } ] } }, @@ -538,7 +545,12 @@ def shutdown(self) -> None: super().shutdown() def download_metrics( - self, function_name: str, start_time: int, end_time: int, requests: dict, metrics: dict + self, + function_name: str, + start_time: int, + end_time: int, + requests: dict, + metrics: dict, ): from 
google.api_core import exceptions diff --git a/sebs/gcp/resources.py b/sebs/gcp/resources.py index 0a7d5c14d..6d613a199 100644 --- a/sebs/gcp/resources.py +++ b/sebs/gcp/resources.py @@ -61,7 +61,10 @@ def get_storage(self, replace_existing: Optional[bool] = None) -> GCPStorage: def get_nosql_storage(self) -> Datastore: if not self._nosql_storage: self._nosql_storage = Datastore( - self.cli_instance, self._cache_client, self.config.resources, self.config.region + self.cli_instance, + self._cache_client, + self.config.resources, + self.config.region, ) return self._nosql_storage diff --git a/sebs/gcp/storage.py b/sebs/gcp/storage.py index c578966f1..aeddd498b 100644 --- a/sebs/gcp/storage.py +++ b/sebs/gcp/storage.py @@ -29,7 +29,11 @@ def replace_existing(self, val: bool): self._replace_existing = val def __init__( - self, region: str, cache_client: Cache, resources: Resources, replace_existing: bool + self, + region: str, + cache_client: Cache, + resources: Resources, + replace_existing: bool, ): super().__init__(region, cache_client, resources, replace_existing) self.replace_existing = replace_existing diff --git a/sebs/local/config.py b/sebs/local/config.py index 0b512c67c..478af6596 100644 --- a/sebs/local/config.py +++ b/sebs/local/config.py @@ -53,7 +53,8 @@ def initialize(res: Resources, config: dict): def update_cache(self, cache: Cache): super().update_cache(cache) cache.update_config( - val=list(self._allocated_ports), keys=["local", "resources", "allocated_ports"] + val=list(self._allocated_ports), + keys=["local", "resources", "allocated_ports"], ) @staticmethod @@ -113,7 +114,11 @@ def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Config return config_obj def serialize(self) -> dict: - out = {"name": "local", "region": self._region, "resources": self._resources.serialize()} + out = { + "name": "local", + "region": self._region, + "resources": self._resources.serialize(), + } return out def update_cache(self, cache: Cache): diff --git a/sebs/local/deployment.py b/sebs/local/deployment.py index 85f7df8e7..267088487 100644 --- a/sebs/local/deployment.py +++ b/sebs/local/deployment.py @@ -69,7 +69,9 @@ def deserialize(path: str, cache_client: Cache) -> "Deployment": deployment._memory_measurement_pids = input_data["memory_measurements"]["pids"] deployment._measurement_file = input_data["memory_measurements"]["file"] deployment._storage = Minio.deserialize( - MinioConfig.deserialize(input_data["storage"]), cache_client, LocalResources() + MinioConfig.deserialize(input_data["storage"]), + cache_client, + LocalResources(), ) return deployment diff --git a/sebs/local/local.py b/sebs/local/local.py index 32b9f9ffb..7f49974e5 100644 --- a/sebs/local/local.py +++ b/sebs/local/local.py @@ -222,7 +222,12 @@ def _start_container( container_kwargs["command"] = f"/bin/bash /sebs/run_server.sh {port}" container_kwargs["ports"] = {f"{port}/tcp": port} - container = self._docker_client.containers.run(**container_kwargs) + from docker.types import DeviceRequest + + container = self._docker_client.containers.run( + **container_kwargs, + device_requests=[DeviceRequest(driver="nvidia", count=-1, capabilities=[["gpu"]])], + ) pid: Optional[int] = None if self.measurements_enabled and self._memory_measurement_path is not None: diff --git a/sebs/openwhisk/config.py b/sebs/openwhisk/config.py index bba54f7c7..44b031b7a 100644 --- a/sebs/openwhisk/config.py +++ b/sebs/openwhisk/config.py @@ -111,13 +111,16 @@ def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) 
-> Resour def update_cache(self, cache: Cache): super().update_cache(cache) cache.update_config( - val=self.docker_registry, keys=["openwhisk", "resources", "docker", "registry"] + val=self.docker_registry, + keys=["openwhisk", "resources", "docker", "registry"], ) cache.update_config( - val=self.docker_username, keys=["openwhisk", "resources", "docker", "username"] + val=self.docker_username, + keys=["openwhisk", "resources", "docker", "username"], ) cache.update_config( - val=self.docker_password, keys=["openwhisk", "resources", "docker", "password"] + val=self.docker_password, + keys=["openwhisk", "resources", "docker", "password"], ) def serialize(self) -> dict: diff --git a/sebs/openwhisk/container.py b/sebs/openwhisk/container.py index 2dd27717e..a5c7688a7 100644 --- a/sebs/openwhisk/container.py +++ b/sebs/openwhisk/container.py @@ -26,7 +26,11 @@ def __init__( self.config = config def registry_name( - self, benchmark: str, language_name: str, language_version: str, architecture: str + self, + benchmark: str, + language_name: str, + language_version: str, + architecture: str, ) -> Tuple[str, str, str, str]: registry_name = self.config.resources.docker_registry diff --git a/sebs/openwhisk/function.py b/sebs/openwhisk/function.py index daf851ca6..5c4e62c27 100644 --- a/sebs/openwhisk/function.py +++ b/sebs/openwhisk/function.py @@ -38,7 +38,11 @@ def from_benchmark(benchmark: Benchmark) -> OpenWhiskFunctionConfig: class OpenWhiskFunction(Function): def __init__( - self, name: str, benchmark: str, code_package_hash: str, cfg: OpenWhiskFunctionConfig + self, + name: str, + benchmark: str, + code_package_hash: str, + cfg: OpenWhiskFunctionConfig, ): super().__init__(benchmark, name, code_package_hash, cfg) @@ -60,7 +64,10 @@ def deserialize(cached_config: dict) -> OpenWhiskFunction: cfg = OpenWhiskFunctionConfig.deserialize(cached_config["config"]) ret = OpenWhiskFunction( - cached_config["name"], cached_config["benchmark"], cached_config["hash"], cfg + cached_config["name"], + cached_config["benchmark"], + cached_config["hash"], + cfg, ) for trigger in cached_config["triggers"]: trigger_type = cast( diff --git a/sebs/openwhisk/openwhisk.py b/sebs/openwhisk/openwhisk.py index 9c196fe25..5f471cfc0 100644 --- a/sebs/openwhisk/openwhisk.py +++ b/sebs/openwhisk/openwhisk.py @@ -43,7 +43,10 @@ def __init__( self.logging_handlers = logger_handlers self.container_client = OpenWhiskContainer( - self.system_config, self.config, self.docker_client, self.config.experimentalManifest + self.system_config, + self.config, + self.docker_client, + self.config.experimentalManifest, ) if self.config.resources.docker_username: @@ -107,7 +110,12 @@ def package_code( # Regardless of Docker image status, we need to create .zip file # to allow registration of function with OpenWhisk _, image_uri = self.container_client.build_base_image( - directory, language_name, language_version, architecture, benchmark, is_cached + directory, + language_name, + language_version, + architecture, + benchmark, + is_cached, ) # We deploy Minio config in code package since this depends on local @@ -120,7 +128,9 @@ def package_code( benchmark_archive = os.path.join(directory, f"{benchmark}.zip") subprocess.run( - ["zip", benchmark_archive] + package_config, stdout=subprocess.DEVNULL, cwd=directory + ["zip", benchmark_archive] + package_config, + stdout=subprocess.DEVNULL, + cwd=directory, ) self.logging.info(f"Created {benchmark_archive} archive") bytes_size = os.path.getsize(benchmark_archive) @@ -230,7 +240,10 @@ def 
create_function( ) function_cfg.docker_image = docker_image res = OpenWhiskFunction( - func_name, code_package.benchmark, code_package.hash, function_cfg + func_name, + code_package.benchmark, + code_package.hash, + function_cfg, ) except subprocess.CalledProcessError as e: self.logging.error(f"Cannot create action {func_name}.") diff --git a/sebs/regression.py b/sebs/regression.py index 579760a1c..01dc8d071 100644 --- a/sebs/regression.py +++ b/sebs/regression.py @@ -21,6 +21,8 @@ "220.video-processing", "311.compression", "411.image-recognition", + "412.language-bert", + "413.recommendation", "501.graph-pagerank", "502.graph-mst", "503.graph-bfs", diff --git a/sebs/sebs.py b/sebs/sebs.py index 309c0b253..34b0b2197 100644 --- a/sebs/sebs.py +++ b/sebs/sebs.py @@ -199,14 +199,18 @@ def get_benchmark( return benchmark @staticmethod - def get_storage_implementation(storage_type: types.Storage) -> Type[PersistentStorage]: + def get_storage_implementation( + storage_type: types.Storage, + ) -> Type[PersistentStorage]: _storage_implementations = {types.Storage.MINIO: minio.Minio} impl = _storage_implementations.get(storage_type) assert impl return impl @staticmethod - def get_nosql_implementation(storage_type: types.NoSQLStorage) -> Type[NoSQLStorage]: + def get_nosql_implementation( + storage_type: types.NoSQLStorage, + ) -> Type[NoSQLStorage]: _storage_implementations = {types.NoSQLStorage.SCYLLADB: scylladb.ScyllaDB} impl = _storage_implementations.get(storage_type) assert impl diff --git a/sebs/storage/scylladb.py b/sebs/storage/scylladb.py index aae97815d..2d2f23af1 100644 --- a/sebs/storage/scylladb.py +++ b/sebs/storage/scylladb.py @@ -144,7 +144,8 @@ def configure_connection(self): if platform.system() == "Linux" and "microsoft" not in platform.release().lower(): networks = self._storage_container.attrs["NetworkSettings"]["Networks"] self._cfg.address = "{IPAddress}:{Port}".format( - IPAddress=networks["bridge"]["IPAddress"], Port=self._cfg.alternator_port + IPAddress=networks["bridge"]["IPAddress"], + Port=self._cfg.alternator_port, ) else: # System is either WSL, Windows, or Mac @@ -169,7 +170,10 @@ def stop(self): self.logging.error("Stopping ScyllaDB was not succesful, storage container not known!") def envs(self) -> dict: - return {"NOSQL_STORAGE_TYPE": "scylladb", "NOSQL_STORAGE_ENDPOINT": self._cfg.address} + return { + "NOSQL_STORAGE_TYPE": "scylladb", + "NOSQL_STORAGE_ENDPOINT": self._cfg.address, + } def serialize(self) -> Tuple[StorageType, dict]: return StorageType.SCYLLADB, self._cfg.serialize() @@ -186,7 +190,10 @@ def serialize(self) -> Tuple[StorageType, dict]: @staticmethod def _deserialize( - cached_config: ScyllaDBConfig, cache_client: Cache, resources: Resources, obj_type: Type[T] + cached_config: ScyllaDBConfig, + cache_client: Cache, + resources: Resources, + obj_type: Type[T], ) -> T: docker_client = docker.from_env() obj = obj_type(docker_client, cache_client, cached_config, resources) @@ -269,7 +276,11 @@ def write_to_table( """ def create_table( - self, benchmark: str, name: str, primary_key: str, secondary_key: Optional[str] = None + self, + benchmark: str, + name: str, + primary_key: str, + secondary_key: Optional[str] = None, ) -> str: table_name = f"sebs-benchmarks-{self._cloud_resources.resources_id}-{benchmark}-{name}" diff --git a/watermarking_bench/.gitkeep b/watermarking_bench/.gitkeep new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/watermarking_bench/.gitkeep @@ -0,0 +1 @@ + diff --git a/watermarking_bench/gpu_bench.py 
b/watermarking_bench/gpu_bench.py new file mode 100644 index 000000000..0f45a3a12 --- /dev/null +++ b/watermarking_bench/gpu_bench.py @@ -0,0 +1,293 @@ +#!/usr/bin/env python3 +import argparse, datetime, json, os, re, shutil, subprocess, sys, tempfile, csv +from typing import List, Dict, Any, Optional, Tuple + +# --- helpers --------------------------------------------------------------- + +def which_ffmpeg() -> str: + p = shutil.which("ffmpeg") + if not p: + sys.exit("ffmpeg not found on PATH. Use Docker image with NVENC or install FFmpeg with NVENC.") + return p + +def run(cmd: List[str]) -> subprocess.CompletedProcess: + return subprocess.run(cmd, stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + +def has_encoder(ffmpeg: str, enc: str) -> bool: + out = run([ffmpeg, "-hide_banner", "-encoders"]).stdout + return re.search(rf"\b{re.escape(enc)}\b", out) is not None + +def has_filter(ffmpeg: str, name: str) -> bool: + out = run([ffmpeg, "-hide_banner", "-filters"]).stdout + return (f" {name} " in out) + +def gpu_info() -> Dict[str, Any]: + try: + out = run(["nvidia-smi", "--query-gpu=name,memory.total,driver_version", "--format=csv,noheader,nounits"]).stdout.strip() + name, mem, drv = [x.strip() for x in out.splitlines()[0].split(",")] + return {"name": name, "memory_total_mb": int(mem), "driver_version": drv} + except Exception: + return {"name": None, "memory_total_mb": None, "driver_version": None} + +def parse_progress(log: str) -> Dict[str, Any]: + lines = [ln for ln in log.splitlines() if ("fps=" in ln or "speed=" in ln or "frame=" in ln)] + fps = speed = frames = None + if lines: + last = lines[-1] + m = re.search(r"fps=\s*([0-9]+(?:\.[0-9]+)?)", last); fps = float(m.group(1)) if m else None + m = re.search(r"speed=\s*([0-9]+(?:\.[0-9]+)?)x", last); speed = float(m.group(1)) if m else None + m = re.search(r"frame=\s*([0-9]+)", last); frames = int(m.group(1)) if m else None + return {"fps": fps, "speed_x": speed, "frames": frames} + +# --- filter planning ------------------------------------------------------- + +def build_vf_or_complex( + ffmpeg: str, + scale: Optional[str], + wm_path: Optional[str], + overlay: str, + want_gpu_decode: bool +) -> Tuple[List[str], str]: + """ + Returns (ffmpeg_args_for_filters, filter_used_string). + + Priority: + - Prefer GPU filters: scale_npp, then scale_cuda, then CPU scale with explicit bridges. + - Prefer overlay_cuda; else CPU overlay with explicit bridges. + - Never place 'format=nv12' *after* 'hwupload_cuda'. 
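+      - Caveat: the CPU bridges assume CUDA-decoded input. With --decode cpu the
+        scale/overlay graphs still insert hwdownload/hwupload_cuda without any
+        -init_hw_device, so those paths are likely to fail on software frames.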
+ """ + used = [] + vf_args: List[str] = [] + complex_graph = "" + + have_scale_npp = has_filter(ffmpeg, "scale_npp") + have_scale_cuda = has_filter(ffmpeg, "scale_cuda") + have_overlay_cuda= has_filter(ffmpeg, "overlay_cuda") + + # No watermark case + if not wm_path: + if scale: + if want_gpu_decode and have_scale_npp: + vf_args = ["-vf", f"scale_npp={scale}"] + used.append("scale_npp") + elif want_gpu_decode and have_scale_cuda: + vf_args = ["-vf", f"scale_cuda={scale}"] + used.append("scale_cuda") + else: + # CPU scale with explicit bridges + # hw frames -> CPU: hwdownload,format=nv12 + # CPU scale -> back to GPU: hwupload_cuda + vf_args = ["-vf", f"hwdownload,format=nv12,scale={scale},hwupload_cuda"] + used.append("scale(cpu)+hwdownload+hwupload_cuda") + else: + vf_args = [] + return (vf_args, "+".join(used)) + + # Watermark case + if want_gpu_decode and have_overlay_cuda: + if scale and have_scale_npp: + complex_graph = f"[0:v]scale_npp={scale}[v0];[v0][1:v]overlay_cuda={overlay}[vout]" + used += ["scale_npp","overlay_cuda"] + elif scale and have_scale_cuda: + complex_graph = f"[0:v]scale_cuda={scale}[v0];[v0][1:v]overlay_cuda={overlay}[vout]" + used += ["scale_cuda","overlay_cuda"] + elif scale: + complex_graph = ( + f"[0:v]hwdownload,format=nv12,scale={scale},hwupload_cuda[v0];" + f"[v0][1:v]overlay_cuda={overlay}[vout]" + ) + used += ["scale(cpu)+hwdownload+hwupload_cuda","overlay_cuda"] + else: + complex_graph = f"[0:v][1:v]overlay_cuda={overlay}[vout]" + used += ["overlay_cuda"] + return (["-filter_complex", complex_graph, "-map", "[vout]"], "+".join(used)) + + # CPU overlay fallback + if scale and want_gpu_decode and (have_scale_npp or have_scale_cuda): + scaler = "scale_npp" if have_scale_npp else "scale_cuda" + complex_graph = ( + f"[0:v]{scaler}={scale}[v0gpu];" + f"[v0gpu]hwdownload,format=nv12[v0cpu];" + f"[v0cpu][1:v]overlay={overlay}[mix];" + f"[mix]hwupload_cuda[vout]" + ) + used += [scaler, "hwdownload+overlay(cpu)+hwupload_cuda"] + elif scale: + complex_graph = ( + f"[0:v]hwdownload,format=nv12,scale={scale}[v0cpu];" + f"[v0cpu][1:v]overlay={overlay}[mix];" + f"[mix]hwupload_cuda[vout]" + ) + used += ["scale(cpu)+overlay(cpu)+hwupload_cuda"] + else: + complex_graph = ( + f"[0:v]hwdownload,format=nv12[v0cpu];" + f"[v0cpu][1:v]overlay={overlay}[mix];" + f"[mix]hwupload_cuda[vout]" + ) + used += ["overlay(cpu)+hwupload_cuda"] + + return (["-filter_complex", complex_graph, "-map", "[vout]"], "+".join(used)) + +# --- core ------------------------------------------------------------------ + +def transcode_once( + ffmpeg: str, + inp: str, + outp: str, + codec: str, + bitrate: str, + preset: str, + duration: Optional[float], + scale: Optional[str], + wm_path: Optional[str], + overlay_pos: str, + decode_mode: str = "gpu" # "gpu" or "cpu" +) -> Dict[str, Any]: + + if not has_encoder(ffmpeg, codec): + raise RuntimeError(f"encoder '{codec}' not available; check your ffmpeg build (NVENC/AV1).") + + want_gpu_decode = (decode_mode == "gpu") + + args = [ffmpeg, "-hide_banner", "-y", "-vsync", "0"] + + if want_gpu_decode: + # Keep decode on GPU & use CUDA frames. Give NVDEC extra surfaces. 
+ args += ["-hwaccel", "cuda", "-hwaccel_output_format", "cuda", "-extra_hw_frames", "16"] + # Helpful on some builds to make filters pick the right device + args += ["-init_hw_device", "cuda=cuda", "-filter_hw_device", "cuda"] + + # inputs + args += ["-i", inp] + if wm_path: + args += ["-loop", "1", "-i", wm_path] + + if duration: + args += ["-t", str(duration)] + + # Build filters + filt_args, filter_used = build_vf_or_complex(ffmpeg, scale, wm_path, overlay_pos, want_gpu_decode) + args += filt_args + + # encoder params + args += ["-c:v", codec, "-b:v", bitrate, "-preset", preset, "-rc", "vbr", "-movflags", "+faststart"] + # audio: copy if present + args += ["-c:a", "copy"] + + # Output path + args += [outp] + + t0 = datetime.datetime.now() + proc = run(args) + t1 = datetime.datetime.now() + if proc.returncode != 0: + raise RuntimeError("ffmpeg failed:\n" + proc.stdout + f"\n\nARGS:\n{' '.join(args)}") + + parsed = parse_progress(proc.stdout) + size = os.path.getsize(outp) if os.path.exists(outp) else 0 + return { + "args": args, + "filter_used": filter_used, + "stdout_tail": "\n".join(proc.stdout.splitlines()[-15:]), + "compute_time_us": (t1 - t0) / datetime.timedelta(microseconds=1), + "fps": parsed["fps"], + "speed_x": parsed["speed_x"], + "frames": parsed["frames"], + "output_size_bytes": size + } + +def main(): + ap = argparse.ArgumentParser(description="GPU NVENC benchmark.") + ap.add_argument("--input", required=True, help="Path to input video") + ap.add_argument("--duration", type=float, default=None, help="Trim to first N seconds") + ap.add_argument("--repeat", type=int, default=1, help="Repeat each trial") + ap.add_argument("--warmup", action="store_true", help="Run one warmup trial (not recorded)") + ap.add_argument("--csv", default=None, help="Optional path to write CSV summary") + ap.add_argument("--watermark", default=None, help="Path to watermark PNG (optional)") + ap.add_argument("--overlay", default="main_w/2-overlay_w/2:main_h/2-overlay_h/2", + help="Overlay position (ffmpeg expr), e.g. 
'10:10' or 'main_w-overlay_w-10:10'") + ap.add_argument("--decode", choices=["gpu","cpu"], default="gpu", + help="Decode on GPU (default) or CPU.") + ap.add_argument("--trials", nargs="+", default=[ + "codec=h264_nvenc,bitrate=5M,preset=p5", + "codec=h264_nvenc,bitrate=12M,preset=p1,scale=1920:1080", + "codec=hevc_nvenc,bitrate=6M,preset=p4", + "codec=av1_nvenc,bitrate=3M,preset=p5" + ], help="List like codec=h264_nvenc,bitrate=5M,preset=p5[,scale=WxH]") + args = ap.parse_args() + + ffmpeg = which_ffmpeg() + gi = gpu_info() + + def parse_trial(s: str) -> Dict[str, str]: + d: Dict[str, str] = {} + for kv in s.split(","): + k, v = kv.split("=", 1) + d[k.strip()] = v.strip() + return d + + trial_specs = [parse_trial(s) for s in args.trials] + + # optional warmup + if args.warmup: + with tempfile.NamedTemporaryFile(suffix=".mp4", delete=True) as tmp: + _ = transcode_once(ffmpeg, args.input, tmp.name, + trial_specs[0].get("codec","h264_nvenc"), + trial_specs[0].get("bitrate","5M"), + trial_specs[0].get("preset","p5"), + args.duration, + trial_specs[0].get("scale"), + args.watermark, + args.overlay, + args.decode) + + results = [] + idx = 0 + for spec in trial_specs: + for _ in range(args.repeat): + with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp: + outp = tmp.name + res = transcode_once(ffmpeg, args.input, outp, + spec.get("codec","h264_nvenc"), + spec.get("bitrate","5M"), + spec.get("preset","p5"), + args.duration, + spec.get("scale"), + args.watermark, + args.overlay, + args.decode) + results.append({ + "trial_index": idx, + "codec": spec.get("codec"), + "bitrate": spec.get("bitrate"), + "preset": spec.get("preset"), + "scale_filter": res["filter_used"], + "fps": res["fps"], + "speed_x": res["speed_x"], + "frames": res["frames"], + "compute_time_us": res["compute_time_us"], + "output_size_bytes": res["output_size_bytes"], + "stdout_tail": res["stdout_tail"], + "argv": " ".join(res["args"]), + }) + idx += 1 + try: os.remove(outp) + except OSError: pass + + report = { + "gpu": gi, + "ffmpeg_path": ffmpeg, + "trial_count": len(results), + "results": results + } + print(json.dumps(report, indent=2)) + + if args.csv and results: + with open(args.csv, "w", newline="") as f: + w = csv.DictWriter(f, fieldnames=list(results[0].keys())) + w.writeheader() + w.writerows(results) + +if __name__ == "__main__": + main() diff --git a/watermarking_bench/read.me b/watermarking_bench/read.me new file mode 100644 index 000000000..efe5b1ab9 --- /dev/null +++ b/watermarking_bench/read.me @@ -0,0 +1,3 @@ +chmod +x run_nvenc_bench.sh +./run_nvenc_bench.sh # uses ~/bench/sample.mp4 (auto-creates) +./run_nvenc_bench.sh /path/video.mp4 # use your own file diff --git a/watermarking_bench/results.csv b/watermarking_bench/results.csv new file mode 100644 index 000000000..7dcd68aa1 --- /dev/null +++ b/watermarking_bench/results.csv @@ -0,0 +1,46 @@ +trial_index,codec,bitrate,preset,scale_filter,fps,speed_x,frames,compute_time_us,output_size_bytes,stdout_tail,argv +0,h264_nvenc,5M,p5,,73.0,2.44,240,5879259.0,2272623," Side data: + cpb: bitrate max/min/avg: 0/0/5000000 buffer size: 10000000 vbv_delay: N/A + Stream #0:1(und): Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, mono, fltp, 69 kb/s (default) + Metadata: + handler_name : SoundHandler + vendor_id : [0][0][0][0] +frame= 1 fps=0.0 q=0.0 size= 0kB time=00:00:00.12 bitrate= 3.0kbits/s speed=1.44x +frame= 41 fps=0.0 q=22.0 size= 0kB time=00:00:01.47 bitrate= 0.3kbits/s speed=2.49x +frame= 81 fps= 74 q=12.0 size= 256kB time=00:00:02.81 bitrate= 
744.9kbits/s speed=2.58x +frame= 121 fps= 76 q=12.0 size= 768kB time=00:00:04.13 bitrate=1520.3kbits/s speed=2.59x +frame= 161 fps= 77 q=12.0 size= 1024kB time=00:00:05.48 bitrate=1530.1kbits/s speed=2.61x +frame= 201 fps= 77 q=13.0 size= 1536kB time=00:00:06.80 bitrate=1849.0kbits/s speed=2.62x +[mp4 @ 0x601c5da3d280] Starting second pass: moving the moov atom to the beginning of the file +frame= 240 fps= 73 q=13.0 Lsize= 2219kB time=00:00:07.97 bitrate=2278.7kbits/s speed=2.44x +video:2142kB audio:68kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.409259%",/usr/bin/ffmpeg -hide_banner -y -vsync 0 -hwaccel cuda -hwaccel_output_format cuda -extra_hw_frames 16 -init_hw_device cuda=cuda -filter_hw_device cuda -i ./sample.mp4 -t 8.0 -c:v h264_nvenc -b:v 5M -preset p5 -rc vbr -movflags +faststart -c:a copy /tmp/tmpy5hxojjv.mp4 +1,h264_nvenc,12M,p1,scale_cuda,191.0,6.34,240,3748632.0,3041922," handler_name : VideoHandler + vendor_id : [0][0][0][0] + encoder : Lavc58.134.100 h264_nvenc + Side data: + cpb: bitrate max/min/avg: 0/0/12000000 buffer size: 24000000 vbv_delay: N/A + Stream #0:1(und): Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, mono, fltp, 69 kb/s (default) + Metadata: + handler_name : SoundHandler + vendor_id : [0][0][0][0] +frame= 1 fps=0.0 q=0.0 size= 0kB time=00:00:00.12 bitrate= 3.0kbits/s speed=1.51x +frame= 102 fps=0.0 q=7.0 size= 768kB time=00:00:03.52 bitrate=1787.5kbits/s speed=5.93x +frame= 209 fps=191 q=7.0 size= 2304kB time=00:00:07.08 bitrate=2664.9kbits/s speed=6.46x +[mp4 @ 0x5c6c573cf740] Starting second pass: moving the moov atom to the beginning of the file +frame= 240 fps=191 q=7.0 Lsize= 2971kB time=00:00:07.97 bitrate=3050.1kbits/s speed=6.34x +video:2895kB audio:68kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.274427%",/usr/bin/ffmpeg -hide_banner -y -vsync 0 -hwaccel cuda -hwaccel_output_format cuda -extra_hw_frames 16 -init_hw_device cuda=cuda -filter_hw_device cuda -i ./sample.mp4 -t 8.0 -vf scale_cuda=1920:1080 -c:v h264_nvenc -b:v 12M -preset p1 -rc vbr -movflags +faststart -c:a copy /tmp/tmp68ay0l6q.mp4 +2,hevc_nvenc,6M,p4,,101.0,3.37,240,4821593.0,2393406," encoder : Lavc58.134.100 hevc_nvenc + Side data: + cpb: bitrate max/min/avg: 0/0/6000000 buffer size: 12000000 vbv_delay: N/A + Stream #0:1(und): Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, mono, fltp, 69 kb/s (default) + Metadata: + handler_name : SoundHandler + vendor_id : [0][0][0][0] +frame= 1 fps=0.0 q=0.0 size= 0kB time=00:00:00.12 bitrate= 2.8kbits/s speed=1.18x +frame= 52 fps=0.0 q=17.0 size= 0kB time=00:00:01.83 bitrate= 0.2kbits/s speed=2.98x +frame= 110 fps= 98 q=12.0 size= 512kB time=00:00:03.77 bitrate=1110.9kbits/s speed=3.36x +frame= 168 fps=103 q=9.0 size= 1280kB time=00:00:05.71 bitrate=1834.1kbits/s speed=3.52x +frame= 226 fps=106 q=12.0 size= 1792kB time=00:00:07.63 bitrate=1922.2kbits/s speed=3.59x +[mp4 @ 0x62016db565c0] Starting second pass: moving the moov atom to the beginning of the file +frame= 240 fps=101 q=12.0 Lsize= 2337kB time=00:00:07.97 bitrate=2399.8kbits/s speed=3.37x +video:2260kB audio:68kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.392147%",/usr/bin/ffmpeg -hide_banner -y -vsync 0 -hwaccel cuda -hwaccel_output_format cuda -extra_hw_frames 16 -init_hw_device cuda=cuda -filter_hw_device cuda -i ./sample.mp4 -t 8.0 -c:v hevc_nvenc -b:v 6M -preset p4 -rc vbr -movflags +faststart -c:a copy /tmp/tmpkjy5g24f.mp4 diff --git a/watermarking_bench/results_long.csv 
b/watermarking_bench/results_long.csv new file mode 100644 index 000000000..e0dc05606 --- /dev/null +++ b/watermarking_bench/results_long.csv @@ -0,0 +1,136 @@ +trial_index,codec,bitrate,preset,scale_filter,fps,speed_x,frames,compute_time_us,output_size_bytes,stdout_tail,argv +0,h264_nvenc,5M,p5,,77.0,2.57,600,10411441.0,6064925,"frame= 121 fps= 76 q=12.0 size= 768kB time=00:00:04.13 bitrate=1520.3kbits/s speed= 2.6x +frame= 161 fps= 77 q=12.0 size= 1024kB time=00:00:05.48 bitrate=1530.1kbits/s speed=2.62x +frame= 201 fps= 77 q=13.0 size= 1536kB time=00:00:06.80 bitrate=1849.0kbits/s speed=2.62x +frame= 241 fps= 78 q=12.0 size= 2048kB time=00:00:08.14 bitrate=2058.8kbits/s speed=2.63x +frame= 281 fps= 78 q=13.0 size= 2304kB time=00:00:09.47 bitrate=1992.7kbits/s speed=2.63x +frame= 321 fps= 78 q=12.0 size= 2816kB time=00:00:10.81 bitrate=2132.9kbits/s speed=2.64x +frame= 361 fps= 78 q=12.0 size= 3072kB time=00:00:12.13 bitrate=2073.2kbits/s speed=2.64x +frame= 401 fps= 79 q=12.0 size= 3584kB time=00:00:13.48 bitrate=2177.6kbits/s speed=2.64x +frame= 441 fps= 79 q=13.0 size= 4096kB time=00:00:14.80 bitrate=2266.4kbits/s speed=2.64x +frame= 481 fps= 79 q=13.0 size= 4352kB time=00:00:16.14 bitrate=2207.6kbits/s speed=2.64x +frame= 521 fps= 79 q=14.0 size= 4864kB time=00:00:17.47 bitrate=2280.6kbits/s speed=2.64x +frame= 561 fps= 79 q=12.0 size= 5120kB time=00:00:18.81 bitrate=2229.1kbits/s speed=2.65x +[mp4 @ 0x62a7ce754b00] Starting second pass: moving the moov atom to the beginning of the file +frame= 600 fps= 77 q=12.0 Lsize= 5923kB time=00:00:19.98 bitrate=2427.3kbits/s speed=2.57x +video:5733kB audio:170kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.348635%",/usr/bin/ffmpeg -hide_banner -y -vsync 0 -hwaccel cuda -hwaccel_output_format cuda -extra_hw_frames 16 -init_hw_device cuda=cuda -filter_hw_device cuda -i ./sample.mp4 -t 30.0 -c:v h264_nvenc -b:v 5M -preset p5 -rc vbr -movflags +faststart -c:a copy /tmp/tmpzb_ed9aq.mp4 +1,h264_nvenc,5M,p5,,76.0,2.52,600,10294387.0,6064925,"frame= 150 fps= 71 q=12.0 size= 1024kB time=00:00:05.12 bitrate=1638.5kbits/s speed=2.44x +frame= 190 fps= 73 q=12.0 size= 1280kB time=00:00:06.44 bitrate=1627.6kbits/s speed=2.48x +frame= 230 fps= 74 q=13.0 size= 1792kB time=00:00:07.78 bitrate=1885.3kbits/s speed=2.51x +frame= 270 fps= 75 q=9.0 size= 2304kB time=00:00:09.10 bitrate=2072.0kbits/s speed=2.53x +frame= 310 fps= 75 q=9.0 size= 2560kB time=00:00:10.45 bitrate=2006.2kbits/s speed=2.54x +frame= 350 fps= 76 q=9.0 size= 3072kB time=00:00:11.77 bitrate=2137.1kbits/s speed=2.55x +frame= 390 fps= 76 q=9.0 size= 3584kB time=00:00:13.12 bitrate=2237.8kbits/s speed=2.57x +frame= 430 fps= 77 q=9.0 size= 3840kB time=00:00:14.44 bitrate=2178.1kbits/s speed=2.57x +frame= 470 fps= 77 q=9.0 size= 4352kB time=00:00:15.78 bitrate=2258.4kbits/s speed=2.58x +frame= 510 fps= 77 q=9.0 size= 4608kB time=00:00:17.10 bitrate=2206.3kbits/s speed=2.59x +frame= 550 fps= 77 q=12.0 size= 5120kB time=00:00:18.45 bitrate=2272.9kbits/s speed=2.59x +frame= 590 fps= 77 q=12.0 size= 5632kB time=00:00:19.77 bitrate=2333.0kbits/s speed=2.59x +[mp4 @ 0x60d928a16bc0] Starting second pass: moving the moov atom to the beginning of the file +frame= 600 fps= 76 q=12.0 Lsize= 5923kB time=00:00:19.98 bitrate=2427.3kbits/s speed=2.52x +video:5733kB audio:170kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.348635%",/usr/bin/ffmpeg -hide_banner -y -vsync 0 -hwaccel cuda -hwaccel_output_format cuda -extra_hw_frames 16 -init_hw_device cuda=cuda 
-filter_hw_device cuda -i ./sample.mp4 -t 30.0 -c:v h264_nvenc -b:v 5M -preset p5 -rc vbr -movflags +faststart -c:a copy /tmp/tmpy9va1dr6.mp4 +2,h264_nvenc,5M,p5,,77.0,2.57,600,10306400.0,6064925,"frame= 121 fps= 76 q=12.0 size= 768kB time=00:00:04.13 bitrate=1520.3kbits/s speed=2.59x +frame= 161 fps= 77 q=12.0 size= 1024kB time=00:00:05.48 bitrate=1530.1kbits/s speed=2.61x +frame= 201 fps= 77 q=13.0 size= 1536kB time=00:00:06.80 bitrate=1849.0kbits/s speed=2.62x +frame= 241 fps= 78 q=12.0 size= 2048kB time=00:00:08.14 bitrate=2058.8kbits/s speed=2.63x +frame= 281 fps= 78 q=13.0 size= 2304kB time=00:00:09.47 bitrate=1992.7kbits/s speed=2.63x +frame= 321 fps= 78 q=12.0 size= 2816kB time=00:00:10.81 bitrate=2132.9kbits/s speed=2.63x +frame= 361 fps= 78 q=12.0 size= 3072kB time=00:00:12.13 bitrate=2073.2kbits/s speed=2.63x +frame= 401 fps= 78 q=12.0 size= 3584kB time=00:00:13.48 bitrate=2177.6kbits/s speed=2.64x +frame= 441 fps= 79 q=13.0 size= 4096kB time=00:00:14.80 bitrate=2266.4kbits/s speed=2.64x +frame= 481 fps= 79 q=13.0 size= 4352kB time=00:00:16.14 bitrate=2207.6kbits/s speed=2.64x +frame= 521 fps= 79 q=14.0 size= 4864kB time=00:00:17.47 bitrate=2280.6kbits/s speed=2.64x +frame= 561 fps= 79 q=12.0 size= 5120kB time=00:00:18.81 bitrate=2229.1kbits/s speed=2.64x +[mp4 @ 0x5c30462b0640] Starting second pass: moving the moov atom to the beginning of the file +frame= 600 fps= 77 q=12.0 Lsize= 5923kB time=00:00:19.98 bitrate=2427.3kbits/s speed=2.57x +video:5733kB audio:170kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.348635%",/usr/bin/ffmpeg -hide_banner -y -vsync 0 -hwaccel cuda -hwaccel_output_format cuda -extra_hw_frames 16 -init_hw_device cuda=cuda -filter_hw_device cuda -i ./sample.mp4 -t 30.0 -c:v h264_nvenc -b:v 5M -preset p5 -rc vbr -movflags +faststart -c:a copy /tmp/tmp8540g4_n.mp4 +3,h264_nvenc,12M,p1,scale_cuda,195.0,6.48,600,5529076.0,8430659," Side data: + cpb: bitrate max/min/avg: 0/0/12000000 buffer size: 24000000 vbv_delay: N/A + Stream #0:1(und): Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, mono, fltp, 69 kb/s (default) + Metadata: + handler_name : SoundHandler + vendor_id : [0][0][0][0] +frame= 1 fps=0.0 q=0.0 size= 0kB time=00:00:00.12 bitrate= 3.0kbits/s speed=1.48x +frame= 102 fps=0.0 q=7.0 size= 768kB time=00:00:03.52 bitrate=1787.5kbits/s speed=5.92x +frame= 209 fps=191 q=7.0 size= 2304kB time=00:00:07.08 bitrate=2664.9kbits/s speed=6.46x +frame= 290 fps=181 q=7.0 size= 3584kB time=00:00:09.77 bitrate=3005.0kbits/s speed=6.11x +frame= 396 fps=188 q=7.0 size= 5120kB time=00:00:13.31 bitrate=3150.8kbits/s speed=6.32x +frame= 502 fps=192 q=7.0 size= 6656kB time=00:00:16.85 bitrate=3235.3kbits/s speed=6.46x +[mp4 @ 0x587cfb879b80] Starting second pass: moving the moov atom to the beginning of the file +frame= 600 fps=195 q=7.0 Lsize= 8233kB time=00:00:19.98 bitrate=3374.1kbits/s speed=6.48x +video:8045kB audio:170kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.222219%",/usr/bin/ffmpeg -hide_banner -y -vsync 0 -hwaccel cuda -hwaccel_output_format cuda -extra_hw_frames 16 -init_hw_device cuda=cuda -filter_hw_device cuda -i ./sample.mp4 -t 30.0 -vf scale_cuda=1920:1080 -c:v h264_nvenc -b:v 12M -preset p1 -rc vbr -movflags +faststart -c:a copy /tmp/tmp5_mf5dkd.mp4 +4,h264_nvenc,12M,p1,scale_cuda,203.0,6.75,600,5264378.0,8430659," Side data: + cpb: bitrate max/min/avg: 0/0/12000000 buffer size: 24000000 vbv_delay: N/A + Stream #0:1(und): Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, mono, fltp, 69 kb/s (default) + 
Metadata:
+ handler_name : SoundHandler
+ vendor_id : [0][0][0][0]
+frame= 1 fps=0.0 q=0.0 size= 0kB time=00:00:00.12 bitrate= 3.0kbits/s speed= 1.5x
+frame= 102 fps=0.0 q=7.0 size= 768kB time=00:00:03.52 bitrate=1787.5kbits/s speed=5.95x
+frame= 209 fps=191 q=7.0 size= 2304kB time=00:00:07.08 bitrate=2664.9kbits/s speed=6.48x
+frame= 315 fps=197 q=7.0 size= 3840kB time=00:00:10.60 bitrate=2967.0kbits/s speed=6.65x
+frame= 420 fps=200 q=7.0 size= 5376kB time=00:00:14.10 bitrate=3123.1kbits/s speed=6.72x
+frame= 526 fps=202 q=7.0 size= 6912kB time=00:00:17.64 bitrate=3209.5kbits/s speed=6.79x
+[mp4 @ 0x64e038e965c0] Starting second pass: moving the moov atom to the beginning of the file
+frame= 600 fps=203 q=7.0 Lsize= 8233kB time=00:00:19.98 bitrate=3374.1kbits/s speed=6.75x
+video:8045kB audio:170kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.222219%",/usr/bin/ffmpeg -hide_banner -y -vsync 0 -hwaccel cuda -hwaccel_output_format cuda -extra_hw_frames 16 -init_hw_device cuda=cuda -filter_hw_device cuda -i ./sample.mp4 -t 30.0 -vf scale_cuda=1920:1080 -c:v h264_nvenc -b:v 12M -preset p1 -rc vbr -movflags +faststart -c:a copy /tmp/tmp7t8tpliz.mp4
+5,h264_nvenc,12M,p1,scale_cuda,203.0,6.75,600,5273983.0,8430659," Side data:
+ cpb: bitrate max/min/avg: 0/0/12000000 buffer size: 24000000 vbv_delay: N/A
+ Stream #0:1(und): Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, mono, fltp, 69 kb/s (default)
+ Metadata:
+ handler_name : SoundHandler
+ vendor_id : [0][0][0][0]
+frame= 1 fps=0.0 q=0.0 size= 0kB time=00:00:00.12 bitrate= 3.0kbits/s speed=1.51x
+frame= 102 fps=0.0 q=7.0 size= 768kB time=00:00:03.52 bitrate=1787.5kbits/s speed=5.95x
+frame= 209 fps=191 q=7.0 size= 2304kB time=00:00:07.08 bitrate=2664.9kbits/s speed=6.47x
+frame= 315 fps=197 q=7.0 size= 3840kB time=00:00:10.60 bitrate=2967.0kbits/s speed=6.64x
+frame= 420 fps=200 q=7.0 size= 5376kB time=00:00:14.10 bitrate=3123.1kbits/s speed=6.72x
+frame= 526 fps=202 q=7.0 size= 6912kB time=00:00:17.64 bitrate=3209.5kbits/s speed=6.79x
+[mp4 @ 0x5d9076a76740] Starting second pass: moving the moov atom to the beginning of the file
+frame= 600 fps=203 q=7.0 Lsize= 8233kB time=00:00:19.98 bitrate=3374.1kbits/s speed=6.75x
+video:8045kB audio:170kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.222219%",/usr/bin/ffmpeg -hide_banner -y -vsync 0 -hwaccel cuda -hwaccel_output_format cuda -extra_hw_frames 16 -init_hw_device cuda=cuda -filter_hw_device cuda -i ./sample.mp4 -t 30.0 -vf scale_cuda=1920:1080 -c:v h264_nvenc -b:v 12M -preset p1 -rc vbr -movflags +faststart -c:a copy /tmp/tmpe5tfql7r.mp4
+6,hevc_nvenc,6M,p4,,110.0,3.67,600,7783224.0,6248386," vendor_id : [0][0][0][0]
+frame= 1 fps=0.0 q=0.0 size= 0kB time=00:00:00.12 bitrate= 2.8kbits/s speed=1.52x
+frame= 55 fps=0.0 q=18.0 size= 0kB time=00:00:01.94 bitrate= 0.2kbits/s speed=3.29x
+frame= 113 fps=103 q=12.0 size= 768kB time=00:00:03.88 bitrate=1620.5kbits/s speed=3.54x
+frame= 171 fps=107 q=12.0 size= 1280kB time=00:00:05.80 bitrate=1807.1kbits/s speed=3.63x
+frame= 229 fps=109 q=12.0 size= 1792kB time=00:00:07.74 bitrate=1895.7kbits/s speed=3.69x
+frame= 287 fps=110 q=12.0 size= 2560kB time=00:00:09.68 bitrate=2165.3kbits/s speed=3.72x
+frame= 345 fps=111 q=12.0 size= 3072kB time=00:00:11.60 bitrate=2168.5kbits/s speed=3.74x
+frame= 403 fps=112 q=12.0 size= 3840kB time=00:00:13.54 bitrate=2322.2kbits/s speed=3.76x
+frame= 461 fps=112 q=12.0 size= 4352kB time=00:00:15.46 bitrate=2305.1kbits/s speed=3.77x
+frame= 519 fps=113 q=12.0 size= 5120kB time=00:00:17.40 bitrate=2409.4kbits/s speed=3.78x
+frame= 577 fps=113 q=12.0 size= 5632kB time=00:00:19.34 bitrate=2384.5kbits/s speed=3.79x
+[mp4 @ 0x5dbdbab92440] Starting second pass: moving the moov atom to the beginning of the file
+frame= 600 fps=110 q=12.0 Lsize= 6102kB time=00:00:19.98 bitrate=2500.7kbits/s speed=3.67x
+video:5912kB audio:170kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.339750%",/usr/bin/ffmpeg -hide_banner -y -vsync 0 -hwaccel cuda -hwaccel_output_format cuda -extra_hw_frames 16 -init_hw_device cuda=cuda -filter_hw_device cuda -i ./sample.mp4 -t 30.0 -c:v hevc_nvenc -b:v 6M -preset p4 -rc vbr -movflags +faststart -c:a copy /tmp/tmph3v5xk4u.mp4
+7,hevc_nvenc,6M,p4,,110.0,3.67,600,7939115.0,6248386," vendor_id : [0][0][0][0]
+frame= 1 fps=0.0 q=0.0 size= 0kB time=00:00:00.12 bitrate= 2.8kbits/s speed=1.49x
+frame= 55 fps=0.0 q=18.0 size= 0kB time=00:00:01.94 bitrate= 0.2kbits/s speed=3.28x
+frame= 113 fps=103 q=12.0 size= 768kB time=00:00:03.88 bitrate=1620.5kbits/s speed=3.54x
+frame= 171 fps=107 q=12.0 size= 1280kB time=00:00:05.80 bitrate=1807.1kbits/s speed=3.63x
+frame= 229 fps=109 q=12.0 size= 1792kB time=00:00:07.74 bitrate=1895.7kbits/s speed=3.68x
+frame= 287 fps=110 q=12.0 size= 2560kB time=00:00:09.68 bitrate=2165.3kbits/s speed=3.72x
+frame= 345 fps=111 q=12.0 size= 3072kB time=00:00:11.60 bitrate=2168.5kbits/s speed=3.74x
+frame= 403 fps=112 q=12.0 size= 3840kB time=00:00:13.54 bitrate=2322.2kbits/s speed=3.76x
+frame= 461 fps=112 q=12.0 size= 4352kB time=00:00:15.46 bitrate=2305.1kbits/s speed=3.76x
+frame= 519 fps=113 q=12.0 size= 5120kB time=00:00:17.40 bitrate=2409.4kbits/s speed=3.77x
+frame= 577 fps=113 q=12.0 size= 5632kB time=00:00:19.34 bitrate=2384.5kbits/s speed=3.78x
+[mp4 @ 0x56aa54a3cb00] Starting second pass: moving the moov atom to the beginning of the file
+frame= 600 fps=110 q=12.0 Lsize= 6102kB time=00:00:19.98 bitrate=2500.7kbits/s speed=3.67x
+video:5912kB audio:170kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.339750%",/usr/bin/ffmpeg -hide_banner -y -vsync 0 -hwaccel cuda -hwaccel_output_format cuda -extra_hw_frames 16 -init_hw_device cuda=cuda -filter_hw_device cuda -i ./sample.mp4 -t 30.0 -c:v hevc_nvenc -b:v 6M -preset p4 -rc vbr -movflags +faststart -c:a copy /tmp/tmpbo93lv7k.mp4
+8,hevc_nvenc,6M,p4,,110.0,3.67,600,8011795.0,6248386," vendor_id : [0][0][0][0]
+frame= 1 fps=0.0 q=0.0 size= 0kB time=00:00:00.12 bitrate= 2.8kbits/s speed=1.51x
+frame= 55 fps=0.0 q=18.0 size= 0kB time=00:00:01.94 bitrate= 0.2kbits/s speed=3.28x
+frame= 113 fps=103 q=12.0 size= 768kB time=00:00:03.88 bitrate=1620.5kbits/s speed=3.54x
+frame= 171 fps=107 q=12.0 size= 1280kB time=00:00:05.80 bitrate=1807.1kbits/s speed=3.63x
+frame= 229 fps=109 q=12.0 size= 1792kB time=00:00:07.74 bitrate=1895.7kbits/s speed=3.69x
+frame= 287 fps=110 q=12.0 size= 2560kB time=00:00:09.68 bitrate=2165.3kbits/s speed=3.72x
+frame= 345 fps=111 q=12.0 size= 3072kB time=00:00:11.60 bitrate=2168.5kbits/s speed=3.74x
+frame= 403 fps=112 q=12.0 size= 3840kB time=00:00:13.54 bitrate=2322.2kbits/s speed=3.76x
+frame= 461 fps=112 q=12.0 size= 4352kB time=00:00:15.46 bitrate=2305.1kbits/s speed=3.76x
+frame= 519 fps=113 q=12.0 size= 5120kB time=00:00:17.40 bitrate=2409.4kbits/s speed=3.78x
+frame= 577 fps=113 q=12.0 size= 5632kB time=00:00:19.34 bitrate=2384.5kbits/s speed=3.78x
+[mp4 @ 0x55882f1e1400] Starting second pass: moving the moov atom to the beginning of the file
+frame= 600 fps=110 q=12.0 Lsize= 6102kB time=00:00:19.98 bitrate=2500.7kbits/s speed=3.67x
+video:5912kB audio:170kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.339750%",/usr/bin/ffmpeg -hide_banner -y -vsync 0 -hwaccel cuda -hwaccel_output_format cuda -extra_hw_frames 16 -init_hw_device cuda=cuda -filter_hw_device cuda -i ./sample.mp4 -t 30.0 -c:v hevc_nvenc -b:v 6M -preset p4 -rc vbr -movflags +faststart -c:a copy /tmp/tmpehv_nft4.mp4
diff --git a/watermarking_bench/run_nvenc_bench.sh b/watermarking_bench/run_nvenc_bench.sh
new file mode 100644
index 000000000..c6ed88d31
--- /dev/null
+++ b/watermarking_bench/run_nvenc_bench.sh
@@ -0,0 +1,300 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# --- Config ---
+BENCH_DIR="${HOME}/bench"
+INPUT="${1:-${BENCH_DIR}/sample.mp4}"
+DURATION="${DURATION:-8}"   # seconds per trial
+REPEAT="${REPEAT:-1}"       # repeats per trial
+
+mkdir -p "$BENCH_DIR"
+cd "$BENCH_DIR"
+
+echo "==[ NVENC Bench Repro ]=="
+command -v nvidia-smi >/dev/null || { echo "nvidia-smi missing"; exit 1; }
+command -v ffmpeg >/dev/null || { echo "ffmpeg missing"; exit 1; }
+command -v python3 >/dev/null || { echo "python3 missing"; exit 1; }
+
+echo "GPU:"
+nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv,noheader
+echo
+echo "Encoders (ffmpeg):"
+ffmpeg -hide_banner -encoders | grep -E 'nvenc|av1' || true
+echo
+echo "Filters (ffmpeg):"
+ffmpeg -hide_banner -filters | grep -E 'scale_(npp|cuda)|overlay_cuda' || true
+echo
+
+# --- Make or update a working gpu_bench.py (GPU-first, safe bridges, skip missing encoders) ---
+cat > gpu_bench.py <<'PY'
+#!/usr/bin/env python3
+import argparse, datetime, json, os, re, shutil, subprocess, sys, tempfile, csv
+from typing import List, Dict, Any, Optional, Tuple
+
+def which_ffmpeg() -> str:
+    p = shutil.which("ffmpeg")
+    if not p:
+        sys.exit("ffmpeg not found on PATH. Use a Docker image with NVENC or install FFmpeg with NVENC.")
+    return p
+
+def run(cmd: List[str]) -> subprocess.CompletedProcess:
+    return subprocess.run(cmd, stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
+
+def has_encoder(ffmpeg: str, enc: str) -> bool:
+    out = run([ffmpeg, "-hide_banner", "-encoders"]).stdout
+    return re.search(rf"\b{re.escape(enc)}\b", out) is not None
+
+def has_filter(ffmpeg: str, name: str) -> bool:
+    out = run([ffmpeg, "-hide_banner", "-filters"]).stdout
+    return (f" {name} " in out)
+
+def gpu_info() -> Dict[str, Any]:
+    try:
+        out = run(["nvidia-smi", "--query-gpu=name,memory.total,driver_version", "--format=csv,noheader,nounits"]).stdout.strip()
+        name, mem, drv = [x.strip() for x in out.splitlines()[0].split(",")]
+        return {"name": name, "memory_total_mb": int(mem), "driver_version": drv}
+    except Exception:
+        return {"name": None, "memory_total_mb": None, "driver_version": None}
+
+def parse_progress(log: str) -> Dict[str, Any]:
+    lines = [ln for ln in log.splitlines() if ("fps=" in ln or "speed=" in ln or "frame=" in ln)]
+    fps = speed = frames = None
+    if lines:
+        last = lines[-1]
+        m = re.search(r"fps=\s*([0-9]+(?:\.[0-9]+)?)", last); fps = float(m.group(1)) if m else None
+        m = re.search(r"speed=\s*([0-9]+(?:\.[0-9]+)?)x", last); speed = float(m.group(1)) if m else None
+        m = re.search(r"frame=\s*([0-9]+)", last); frames = int(m.group(1)) if m else None
+    return {"fps": fps, "speed_x": speed, "frames": frames}
+
+def build_vf_or_complex(ffmpeg: str, scale: Optional[str], wm_path: Optional[str], overlay: str, want_gpu_decode: bool) -> Tuple[List[str], str]:
+    used = []
+    vf_args: List[str] = []
+    complex_graph = ""
+
+    have_scale_npp = has_filter(ffmpeg, "scale_npp")
+    have_scale_cuda = has_filter(ffmpeg, "scale_cuda")
+    have_overlay_cuda = has_filter(ffmpeg, "overlay_cuda")
+
+    if not wm_path:
+        if scale:
+            if want_gpu_decode and have_scale_npp:
+                vf_args = ["-vf", f"scale_npp={scale}"]; used.append("scale_npp")
+            elif want_gpu_decode and have_scale_cuda:
+                vf_args = ["-vf", f"scale_cuda={scale}"]; used.append("scale_cuda")
+            else:
+                vf_args = ["-vf", f"hwdownload,format=nv12,scale={scale},hwupload_cuda"]  # assumes CUDA frames from GPU decode
+                used.append("scale(cpu)+hwdownload+hwupload_cuda")
+        return (vf_args, "+".join(used))
+
+    # watermark path
+    if want_gpu_decode and have_overlay_cuda:
+        if scale and have_scale_npp:
+            complex_graph = f"[0:v]scale_npp={scale}[v0];[v0][1:v]overlay_cuda={overlay}[vout]"
+            used += ["scale_npp","overlay_cuda"]
+        elif scale and have_scale_cuda:
+            complex_graph = f"[0:v]scale_cuda={scale}[v0];[v0][1:v]overlay_cuda={overlay}[vout]"
+            used += ["scale_cuda","overlay_cuda"]
+        elif scale:
+            complex_graph = f"[0:v]hwdownload,format=nv12,scale={scale},hwupload_cuda[v0];[v0][1:v]overlay_cuda={overlay}[vout]"
+            used += ["scale(cpu)+hwdownload+hwupload_cuda","overlay_cuda"]
+        else:
+            complex_graph = f"[0:v][1:v]overlay_cuda={overlay}[vout]"
+            used += ["overlay_cuda"]
+        return (["-filter_complex", complex_graph, "-map", "[vout]", "-map", "0:a?"], "+".join(used))  # "0:a?" keeps input audio (optional); -map [vout] alone would drop it
+
+    # CPU overlay fallback (explicit bridges; assumes CUDA frames from GPU decode)
+    if scale and want_gpu_decode and (have_scale_npp or have_scale_cuda):
+        scaler = "scale_npp" if have_scale_npp else "scale_cuda"
+        complex_graph = (
+            f"[0:v]{scaler}={scale}[v0gpu];"
+            f"[v0gpu]hwdownload,format=nv12[v0cpu];"
+            f"[v0cpu][1:v]overlay={overlay}[mix];"
+            f"[mix]hwupload_cuda[vout]"
+        )
+        used += [scaler, "hwdownload+overlay(cpu)+hwupload_cuda"]
+    elif scale:
+        complex_graph = (
+            f"[0:v]hwdownload,format=nv12,scale={scale}[v0cpu];"
+            f"[v0cpu][1:v]overlay={overlay}[mix];"
+            f"[mix]hwupload_cuda[vout]"
+        )
+        used += ["scale(cpu)+overlay(cpu)+hwupload_cuda"]
+    else:
+        complex_graph = (
+            f"[0:v]hwdownload,format=nv12[v0cpu];"
+            f"[v0cpu][1:v]overlay={overlay}[mix];"
+            f"[mix]hwupload_cuda[vout]"
+        )
+        used += ["overlay(cpu)+hwupload_cuda"]
+
+    return (["-filter_complex", complex_graph, "-map", "[vout]", "-map", "0:a?"], "+".join(used))  # "0:a?" keeps input audio (optional); -map [vout] alone would drop it
+
+def transcode_once(ffmpeg: str, inp: str, outp: str, codec: str, bitrate: str, preset: str,
+                   duration: Optional[float], scale: Optional[str], wm_path: Optional[str],
+                   overlay_pos: str, decode_mode: str = "gpu") -> Dict[str, Any]:
+
+    if not has_encoder(ffmpeg, codec):
+        raise RuntimeError(f"encoder '{codec}' not available; check your ffmpeg build (NVENC/AV1).")
+
+    want_gpu_decode = (decode_mode == "gpu")
+    args = [ffmpeg, "-hide_banner", "-y", "-vsync", "0"]
+
+    if want_gpu_decode:
+        args += ["-hwaccel", "cuda", "-hwaccel_output_format", "cuda", "-extra_hw_frames", "16",
+                 "-init_hw_device", "cuda=cuda", "-filter_hw_device", "cuda"]
+
+    args += ["-i", inp]
+    if wm_path:
+        args += ["-loop", "1", "-i", wm_path]
+    if duration:
+        args += ["-t", str(duration)]
+
+    filt_args, filter_used = build_vf_or_complex(ffmpeg, scale, wm_path, overlay_pos, want_gpu_decode)
+    args += filt_args
+
+    args += ["-c:v", codec, "-b:v", bitrate, "-preset", preset, "-rc", "vbr", "-movflags", "+faststart"]
+    args += ["-c:a", "copy"]
+    args += [outp]
+
+    t0 = datetime.datetime.now()
+    proc = run(args)
+    t1 = datetime.datetime.now()
+    if proc.returncode != 0:
+        raise RuntimeError("ffmpeg failed:\n" + proc.stdout + f"\n\nARGS:\n{' '.join(args)}")
+
+    parsed = parse_progress(proc.stdout)
+    size = os.path.getsize(outp) if os.path.exists(outp) else 0
+    return {
+        "args": args,
+        "filter_used": filter_used,
+        "stdout_tail": "\n".join(proc.stdout.splitlines()[-15:]),
+        "compute_time_us": (t1 - t0) / datetime.timedelta(microseconds=1),
+        "fps": parsed["fps"],
+        "speed_x": parsed["speed_x"],
+        "frames": parsed["frames"],
+        "output_size_bytes": size
+    }
+
+def main():
+    ap = argparse.ArgumentParser(description="GPU NVENC benchmark.")
+    ap.add_argument("--input", required=True)
+    ap.add_argument("--duration", type=float, default=None)
+    ap.add_argument("--repeat", type=int, default=1)
+    ap.add_argument("--warmup", action="store_true")
+    ap.add_argument("--csv", default=None)
+    ap.add_argument("--watermark", default=None)
+    ap.add_argument("--overlay", default="main_w/2-overlay_w/2:main_h/2-overlay_h/2")
+    ap.add_argument("--decode", choices=["gpu","cpu"], default="gpu")
+    ap.add_argument("--trials", nargs="+", default=[
+        "codec=h264_nvenc,bitrate=5M,preset=p5",
+        "codec=h264_nvenc,bitrate=12M,preset=p1,scale=1920:1080",
+        "codec=hevc_nvenc,bitrate=6M,preset=p4",
+        # "codec=av1_nvenc,bitrate=3M,preset=p5",  # only if available
+    ])
+    args = ap.parse_args()
+
+    ffmpeg = which_ffmpeg()
+    gi = gpu_info()
+
+    def parse_trial(s: str) -> Dict[str, str]:
+        d: Dict[str, str] = {}
+        for kv in s.split(","):
+            k, v = kv.split("=", 1); d[k.strip()] = v.strip()
+        return d
+
+    trial_specs = [parse_trial(s) for s in args.trials]
+
+    # Warmup with first available encoder
+    if args.warmup:
+        warm = next((t for t in trial_specs if has_encoder(ffmpeg, t.get("codec","h264_nvenc"))), None)
+        if warm:
+            with tempfile.NamedTemporaryFile(suffix=".mp4", delete=True) as tmp:
+                _ = transcode_once(ffmpeg, args.input, tmp.name,
+                                   warm.get("codec","h264_nvenc"),
+                                   warm.get("bitrate","5M"),
+                                   warm.get("preset","p5"),
+                                   args.duration,
+                                   warm.get("scale"),
+                                   args.watermark,
+                                   args.overlay,
+                                   args.decode)
+
+    results = []; idx = 0
+    for spec in trial_specs:
+        for _ in range(args.repeat):
+            if not has_encoder(ffmpeg, spec.get("codec","h264_nvenc")):
+                results.append({
+                    "trial_index": idx, "codec": spec.get("codec"), "bitrate": spec.get("bitrate"),
+                    "preset": spec.get("preset"), "scale_filter": "", "fps": None, "speed_x": None,
+                    "frames": None, "compute_time_us": 0, "output_size_bytes": 0,
+                    "stdout_tail": "SKIPPED: encoder not available", "argv": "", "status": "skipped"
+                }); idx += 1; continue
+            with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
+                outp = tmp.name
+            res = transcode_once(ffmpeg, args.input, outp,
+                                 spec.get("codec","h264_nvenc"),
+                                 spec.get("bitrate","5M"),
+                                 spec.get("preset","p5"),
+                                 args.duration,
+                                 spec.get("scale"),
+                                 args.watermark,
+                                 args.overlay,
+                                 args.decode)
+            results.append({
+                "trial_index": idx, "codec": spec.get("codec"), "bitrate": spec.get("bitrate"),
+                "preset": spec.get("preset"), "scale_filter": res["filter_used"], "fps": res["fps"],
+                "speed_x": res["speed_x"], "frames": res["frames"],
+                "compute_time_us": res["compute_time_us"], "output_size_bytes": res["output_size_bytes"],
+                "stdout_tail": res["stdout_tail"], "argv": " ".join(res["args"]), "status": "ok"
+            })
+            idx += 1
+            try: os.remove(outp)
+            except OSError: pass
+
+    report = {"gpu": gi, "ffmpeg_path": ffmpeg, "trial_count": len(results), "results": results}
+    print(json.dumps(report, indent=2))
+
+    if args.csv and results:
+        with open(args.csv, "w", newline="") as f:
+            w = csv.DictWriter(f, fieldnames=list(results[0].keys()))
+            w.writeheader(); w.writerows(results)
+
+if __name__ == "__main__":
+    main()
+PY
+
+chmod +x gpu_bench.py
+
+# --- Provide a sample 4K clip if missing ---
+if [[ ! -f "$INPUT" ]]; then
+  echo "No input provided or file missing. Creating ${INPUT} (4K, 20s, tone + test pattern)..."
+  ffmpeg -hide_banner -y \
+    -f lavfi -i testsrc2=size=3840x2160:rate=30 \
+    -f lavfi -i sine=frequency=1000:sample_rate=48000 \
+    -t 20 \
+    -c:v libx264 -pix_fmt yuv420p -b:v 600k \
+    -c:a aac -b:a 96k \
+    "$INPUT"
+fi
+
+# --- Run the benchmark ---
+TS="$(date +%Y%m%d_%H%M%S)"
+CSV="results_${TS}.csv"
+
+echo
+echo "Running GPU NVENC benchmark..."
+./gpu_bench.py \
+  --input "$INPUT" \
+  --duration "$DURATION" \
+  --repeat "$REPEAT" \
+  --decode gpu \
+  --csv "$CSV" \
+  --trials \
+    "codec=h264_nvenc,bitrate=5M,preset=p5" \
+    "codec=h264_nvenc,bitrate=12M,preset=p1,scale=1920:1080" \
+    "codec=hevc_nvenc,bitrate=6M,preset=p4"
+
+echo
+echo "Done. CSV saved at: $BENCH_DIR/$CSV"
+echo "Preview:"
+(head -n 1 "$CSV" && tail -n +2 "$CSV" | sed -n '1,3p') | sed 's/,/ | /g'
diff --git a/watermarking_bench/sample.mp4 b/watermarking_bench/sample.mp4
new file mode 100644
index 000000000..909679fc3
Binary files /dev/null and b/watermarking_bench/sample.mp4 differ
diff --git a/watermarking_bench/watermark.png b/watermarking_bench/watermark.png
new file mode 100644
index 000000000..8a6d9e541
Binary files /dev/null and b/watermarking_bench/watermark.png differ
diff --git a/watermarking_bench/watermarking_readme.md b/watermarking_bench/watermarking_readme.md
new file mode 100644
index 000000000..f522c717b
--- /dev/null
+++ b/watermarking_bench/watermarking_readme.md
@@ -0,0 +1,25 @@
+### Running the NVENC Benchmark
+
+```bash
+chmod +x run_nvenc_bench.sh
+# Run it with the default test video (auto-generated at ~/bench/sample.mp4 if missing):
+./run_nvenc_bench.sh
+# Run it on your own input video:
+./run_nvenc_bench.sh /path/to/your/video.mp4
+```
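+
+Since the wrapper regenerates `gpu_bench.py` under `~/bench` on every run, the benchmark can also be invoked directly. A minimal sketch of a direct invocation (the flags shown exist in `gpu_bench.py`; the watermark path, CSV name, and trial values are illustrative placeholders):
+
+```bash
+cd ~/bench
+# Watermarked 1080p trial; the watermark path and CSV name are placeholders.
+./gpu_bench.py \
+  --input sample.mp4 \
+  --duration 8 \
+  --repeat 2 \
+  --watermark /path/to/watermark.png \
+  --csv results_watermark.csv \
+  --trials "codec=h264_nvenc,bitrate=5M,preset=p5,scale=1920:1080"
+```
+
+The wrapper itself reads `DURATION` (seconds per trial) and `REPEAT` (repeats per trial) from the environment, e.g. `DURATION=20 REPEAT=3 ./run_nvenc_bench.sh`.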