diff --git a/01-models/Qwen3/Qwen3-VL/Qwen3-VL-2B-Instruct.ipynb b/01-models/Qwen3/Qwen3-VL/Qwen3-VL-2B-Instruct.ipynb new file mode 100644 index 0000000..4efddd3 --- /dev/null +++ b/01-models/Qwen3/Qwen3-VL/Qwen3-VL-2B-Instruct.ipynb @@ -0,0 +1,575 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "94d68fb3-00f2-447c-b5e0-28fa18562392", + "metadata": {}, + "source": [ + "# Deploy the Qwen3-VL-2B-Instruct for inference using Amazon SageMaker AI\n", + "**Recommended kernel(s):** This notebook can be run with any Amazon SageMaker Studio kernel.\n", + "\n", + "In this notebook, you will learn how to deploy the Qwen3-VL-2B-Instruct model (HuggingFace model ID: [Qwen/Qwen3-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct)) using Amazon SageMaker AI. \n", + "\n", + "Let's install or upgrade these dependencies using the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fed1c92f-9fbc-47c9-b940-7dd07f962ef8", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -Uq huggingface_hub sagemaker transformers==4.57.0" + ] + }, + { + "cell_type": "markdown", + "id": "fc56cad7-ea97-43c4-835d-a56729549023", + "metadata": {}, + "source": [ + "### Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "c128cf97-3f1f-4176-87b4-1193cd9e9c0f", + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-28T19:19:08.231499Z", + "iopub.status.busy": "2025-10-28T19:19:08.231306Z", + "iopub.status.idle": "2025-10-28T19:19:09.646052Z", + "shell.execute_reply": "2025-10-28T19:19:09.645642Z", + "shell.execute_reply.started": "2025-10-28T19:19:08.231484Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml\n", + "sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml\n", + "2.253.1\n" + ] + } + ], +
"source": [ + "import os\n", + "import datetime\n", + "import sagemaker\n", + "import boto3\n", + "import logging\n", + "import json\n", + "import time\n", + "import shutil\n", + "import tarfile\n", + "\n", + "import sagemaker\n", + "from sagemaker.huggingface import HuggingFaceModel\n", + "from sagemaker.session import Session\n", + "from sagemaker.s3 import S3Uploader\n", + "\n", + "from huggingface_hub import snapshot_download\n", + "\n", + "print(sagemaker.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bba6d4bf-5df9-4e3e-8a9e-a03897e0cb0f", + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-28T19:19:09.646609Z", + "iopub.status.busy": "2025-10-28T19:19:09.646472Z", + "iopub.status.idle": "2025-10-28T19:19:10.114037Z", + "shell.execute_reply": "2025-10-28T19:19:10.113669Z", + "shell.execute_reply.started": "2025-10-28T19:19:09.646594Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Qwen-Qwen3-VL-2B-Instruct-endpoint-1761679149-892584\n", + "Saving model artifacts to sagemaker-us-east-1-329542461890/models/Qwen_Qwen3-VL-2B-Instruct\n" + ] + } + ], + "source": [ + "session = sagemaker.Session()\n", + "role = sagemaker.get_execution_role()\n", + "\n", + "instance_type = \"ml.g5.4xlarge\"\n", + "instance_count = 1\n", + "\n", + "model_id = \"Qwen/Qwen3-VL-2B-Instruct\"\n", + "model_id_filesafe = model_id.replace(\"/\", \"_\").replace(\".\", \"_\")\n", + "endpoint_name = f\"{model_id_filesafe.replace(\"_\", \"-\")}-endpoint-{str(datetime.datetime.now().timestamp()).replace(\".\", \"-\")}\"\n", + "print(endpoint_name)\n", + "\n", + "image_uri = \"763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.34.0-lmi16.0.0-cu128-v1.2\"\n", + "\n", + "base_name = model_id.split('/')[-1].replace('.', '-').lower()\n", + "model_lineage = model_id.split('/')[0]\n", + "base_name\n", + "\n", + "bucket_name = session.default_bucket()\n", + "default_prefix = 
session.default_bucket_prefix or f\"models/{model_id_filesafe}\"\n", + "print(f\"Saving model artifacts to {bucket_name}/{default_prefix}\")\n", + "\n", + "os.makedirs(\"code\", exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "id": "0a7d4737-12ca-4cb7-aee4-06f7c0328b6d", + "metadata": {}, + "source": [ + "## Local Model Test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51db134d-19bb-4702-949e-cf8160fe917f", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from transformers import Qwen3VLForConditionalGeneration, AutoProcessor\n", + "\n", + "model = Qwen3VLForConditionalGeneration.from_pretrained(\n", + " \"Qwen/Qwen3-VL-2B-Instruct\",\n", + " dtype=torch.float16,\n", + " device_map=\"auto\",\n", + " attn_implementation=\"sdpa\"\n", + ")\n", + "processor = AutoProcessor.from_pretrained(\"Qwen/Qwen3-VL-2B-Instruct\")\n", + "messages = [\n", + " {\n", + " \"role\":\"user\",\n", + " \"content\":[\n", + " {\n", + " \"type\":\"image\",\n", + " \"url\": \"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg\"\n", + " },\n", + " {\n", + " \"type\":\"text\",\n", + " \"text\":\"Describe this image.\"\n", + " }\n", + " ]\n", + " }\n", + "\n", + "]\n", + "\n", + "# device_map=\"auto\" may place the model on GPU, so the inputs must follow it.\n", + "inputs = processor.apply_chat_template(\n", + " messages,\n", + " tokenize=True,\n", + " add_generation_prompt=True,\n", + " return_dict=True,\n", + " return_tensors=\"pt\",\n", + ").to(model.device)\n", + "inputs.pop(\"token_type_ids\", None)\n", + "\n", + "generated_ids = model.generate(**inputs, max_new_tokens=128)\n", + "generated_ids_trimmed = [\n", + " out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n", + "]\n", + "output_text = processor.batch_decode(\n", + " generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n", + ")\n", + "print(output_text)" + ] + }, + { + "cell_type": "markdown", + "id": "7245e684-aa6e-44ca-9451-10e0bee4b96e", + "metadata": {}, +
"source": [ + "## Create SageMaker Model\n", + "Here we define the custom requirements and inference logic to be run by this model. We download the model assets from HuggingFace, zip them up and upload them to S3. We then deploy the model as a `HuggingFaceModel`." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "410cb9a8-9fd0-408d-bebb-af8b8c09d398", + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-28T19:19:13.048393Z", + "iopub.status.busy": "2025-10-28T19:19:13.048051Z", + "iopub.status.idle": "2025-10-28T19:19:13.051031Z", + "shell.execute_reply": "2025-10-28T19:19:13.050649Z", + "shell.execute_reply.started": "2025-10-28T19:19:13.048378Z" + } + }, + "outputs": [], + "source": [ + "env = {\n", + " 'HF_MODEL_ID': model_id,\n", + " 'HF_TASK':'image-text-to-text',\n", + " 'SM_NUM_GPUS': json.dumps(1),\n", + " 'OPTION_TRUST_REMOTE_CODE': 'true',\n", + " 'OPTION_MODEL_LOADING_TIMEOUT': '3600',\n", + " \"OPTION_ROLLING_BATCH\": \"disable\",\n", + " \"OPTION_TENSOR_PARALLEL_DEGREE\": \"1\",\n", + " \"OPTION_MAX_MODEL_LEN\": \"5000\",\n", + " \"OPTION_ASYNC_MODE\": \"true\",\n", + " \"OPTION_TRUST_REMOTE_CODE\": \"true\",\n", + " \"SERVING_FAIL_FAST\": \"true\",\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "5d441971-621c-4d52-a7db-d67020a8667e", + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-28T19:19:13.916283Z", + "iopub.status.busy": "2025-10-28T19:19:13.915921Z", + "iopub.status.idle": "2025-10-28T19:19:13.919378Z", + "shell.execute_reply": "2025-10-28T19:19:13.919024Z", + "shell.execute_reply.started": "2025-10-28T19:19:13.916253Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting code/requirements.txt\n" + ] + } + ], + "source": [ + "%%writefile code/requirements.txt\n", + "transformers==4.57.0\n", + "torch\n", + "torchvision\n", + "torchaudio\n", + "pillow\n", + "requests" + ] + }, + { + "cell_type": "code", + 
"execution_count": 12, + "id": "a6468cba-5d10-484d-898a-19802612925c", + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-28T19:51:30.543812Z", + "iopub.status.busy": "2025-10-28T19:51:30.543609Z", + "iopub.status.idle": "2025-10-28T19:51:30.546908Z", + "shell.execute_reply": "2025-10-28T19:51:30.546525Z", + "shell.execute_reply.started": "2025-10-28T19:51:30.543795Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting code/inference.py\n" + ] + } + ], + "source": [ + "%%writefile code/inference.py\n", + "# This code comes from HuggingFace\n", + "# https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct\n", + "import logging\n", + "import torch\n", + "from transformers import Qwen3VLForConditionalGeneration, AutoProcessor\n", + "\n", + "logger = logging.getLogger()\n", + "logger.setLevel(logging.INFO)\n", + "\n", + "def model_fn(model_dir):\n", + "\n", + " model = Qwen3VLForConditionalGeneration.from_pretrained(\n", + " model_dir,\n", + " dtype=torch.float16,\n", + " device_map=\"auto\",\n", + " attn_implementation=\"sdpa\"\n", + " )\n", + "\n", + " processor = AutoProcessor.from_pretrained(\n", + " model_dir,\n", + " trust_remote_code=True\n", + " )\n", + "\n", + " return {\"processor\": processor, \"model\": model}\n", + "\n", + "\n", + "def predict_fn(data, model_obj):\n", + " processor = model_obj[\"processor\"]\n", + " model = model_obj[\"model\"]\n", + " messages = [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"type\": \"image\",\n", + " \"image\": \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg\",\n", + " },\n", + " {\"type\": \"text\", \"text\": \"Describe this image.\"},\n", + " ],\n", + " }\n", + " ]\n", + "\n", + " inputs = processor.apply_chat_template(\n", + " messages,\n", + " tokenize=True,\n", + " add_generation_prompt=True,\n", + " return_dict=True,\n", + " return_tensors=\"pt\"\n", + " )\n", + " inputs = 
inputs.to(model.device)\n", + " generated_ids = model.generate(**inputs, max_new_tokens=128)\n", + " generated_ids_trimmed = [\n", + " out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n", + " ]\n", + " output_text = processor.batch_decode(\n", + " generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n", + " )\n", + " print(output_text)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "3ba9db6b-3a56-462a-a513-770ed7f4bbcc", + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-28T19:51:31.602686Z", + "iopub.status.busy": "2025-10-28T19:51:31.602482Z", + "iopub.status.idle": "2025-10-28T19:51:31.605398Z", + "shell.execute_reply": "2025-10-28T19:51:31.604909Z", + "shell.execute_reply.started": "2025-10-28T19:51:31.602672Z" + } + }, + "outputs": [], + "source": [ + "def filter_function(tarinfo):\n", + " \"\"\"Filter function to exclude .cache files and directories\"\"\"\n", + " if '.cache' in tarinfo.name or '.gitattributes' in tarinfo.name:\n", + " return None\n", + " return tarinfo" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "ae9b8cdf-ebf9-4d05-837d-8a3f412b73d4", + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-28T19:51:33.065578Z", + "iopub.status.busy": "2025-10-28T19:51:33.065191Z", + "iopub.status.idle": "2025-10-28T19:57:37.284971Z", + "shell.execute_reply": "2025-10-28T19:57:37.284528Z", + "shell.execute_reply.started": "2025-10-28T19:51:33.065543Z" + } + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7dc899529b1b47008ef16bc6303b5b18", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Fetching 12 files: 0%| | 0/12 [00:00 ⚠️ **Important**: \n", + "> - Deployment can take up to 15 minutes\n", + "> - Monitor the CloudWatch logs for progress" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "7207d0d4-f2be-41c6-a989-3db36da256cf", + 
"metadata": { + "execution": { + "iopub.execute_input": "2025-10-28T19:57:37.288133Z", + "iopub.status.busy": "2025-10-28T19:57:37.287995Z", + "iopub.status.idle": "2025-10-28T20:10:17.947197Z", + "shell.execute_reply": "2025-10-28T20:10:17.946723Z", + "shell.execute_reply.started": "2025-10-28T19:57:37.288117Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-----------!" + ] + } + ], + "source": [ + "# Hub Model configuration. https://huggingface.co/models\n", + "hub = {\n", + "\t'HF_MODEL_ID':'Qwen/Qwen3-VL-2B-Instruct',\n", + "\t'HF_TASK':'image-text-to-text'\n", + "}\n", + "\n", + "# create Hugging Face Model Class\n", + "huggingface_model = HuggingFaceModel(\n", + " model_data=f\"s3://{bucket_name}/{default_prefix}/model.tar.gz\",\n", + "\ttransformers_version='4.49.0',\n", + "\tpytorch_version='2.6.0',\n", + "\tpy_version='py312',\n", + "\tenv=env,\n", + "\trole=role, \n", + " entry_point=\"inference.py\",\n", + " enable_network_isolation=False\n", + ")\n", + "\n", + "# deploy model to SageMaker Inference\n", + "predictor = huggingface_model.deploy(\n", + "\tinitial_instance_count=1, # number of instances\n", + "\tinstance_type='ml.g5.4xlarge'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8ef9603-a16d-4195-9702-b920762e2e9b", + "metadata": {}, + "outputs": [], + "source": [ + "# Using DJL Serving\n", + "# UNDER CONSTRUCTION\n", + "\n", + "# model = HuggingFaceModel(\n", + "# model_data=f\"s3://{bucket_name}/{default_prefix}/model.tar.gz\",\n", + "# image_uri=image_uri,\n", + "# env=env,\n", + "# role=role,\n", + "# entry_point=\"inference.py\",\n", + "# enable_network_isolation=False\n", + "# )\n", + "\n", + "# predictor = model.deploy(\n", + "# initial_instance_count=instance_count,\n", + "# instance_type=instance_type,\n", + "# endpoint_name=endpoint_name\n", + "# )\n", + "\n", + "# predictor.predict()" + ] + }, + { + "cell_type": "markdown", + "id": 
"f7b85c32-d4df-45f1-84f4-86a296ad1c0d", + "metadata": { + "execution": { + "iopub.execute_input": "2025-09-15T19:14:57.135928Z", + "iopub.status.busy": "2025-09-15T19:14:57.135661Z", + "iopub.status.idle": "2025-09-15T19:14:57.139468Z", + "shell.execute_reply": "2025-09-15T19:14:57.138566Z", + "shell.execute_reply.started": "2025-09-15T19:14:57.135907Z" + } + }, + "source": [ + "# Clean up" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcef37c7-6b47-4c3b-b6b7-b266245b492e", + "metadata": {}, + "outputs": [], + "source": [ + "huggingface_model.delete_model()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/01-models/google/Owl/Owlv2-base-patch16.ipynb b/01-models/google/Owl/Owlv2-base-patch16.ipynb new file mode 100644 index 0000000..8adbe11 --- /dev/null +++ b/01-models/google/Owl/Owlv2-base-patch16.ipynb @@ -0,0 +1,464 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "94d68fb3-00f2-447c-b5e0-28fa18562392", + "metadata": {}, + "source": [ + "# Deploy the Owlv2-base-patch16 for inference using Amazon SageMakerAI\n", + "**Recommended kernel(s):** This notebook can be run with any Amazon SageMaker Studio kernel.\n", + "\n", + "In this notebook, you will learn how to deploy the Qwen3-VL-2B-Instruct model (HuggingFace model ID: [google/owlv2-base-patch16](https://huggingface.co/google/owlv2-base-patch16)) using Amazon SageMaker AI. 
\n", + "\n", + "Let's install or upgrade these dependencies using the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fed1c92f-9fbc-47c9-b940-7dd07f962ef8", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -Uq huggingface_hub sagemaker transformers==4.57.0" + ] + }, + { + "cell_type": "markdown", + "id": "fc56cad7-ea97-43c4-835d-a56729549023", + "metadata": {}, + "source": [ + "### Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c128cf97-3f1f-4176-87b4-1193cd9e9c0f", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import datetime\n", + "import sagemaker\n", + "import boto3\n", + "import logging\n", + "import json\n", + "import time\n", + "import shutil\n", + "import tarfile\n", + "\n", + "import sagemaker\n", + "from sagemaker.huggingface import HuggingFaceModel\n", + "from sagemaker.session import Session\n", + "from sagemaker.s3 import S3Uploader\n", + "\n", + "from huggingface_hub import snapshot_download\n", + "\n", + "print(sagemaker.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bba6d4bf-5df9-4e3e-8a9e-a03897e0cb0f", + "metadata": {}, + "outputs": [], + "source": [ + "session = sagemaker.Session()\n", + "role = sagemaker.get_execution_role()\n", + "\n", + "instance_type = \"ml.g5.4xlarge\"\n", + "instance_count = 1\n", + "\n", + "model_id = \"google/owlv2-base-patch16\"\n", + "model_id_filesafe = model_id.replace(\"/\", \"_\").replace(\".\", \"_\")\n", + "endpoint_name = f\"{model_id_filesafe.replace(\"_\", \"-\")}-endpoint-{str(datetime.datetime.now().timestamp()).replace(\".\", \"-\")}\"\n", + "print(endpoint_name)\n", + "\n", + "base_name = model_id.split('/')[-1].replace('.', '-').lower()\n", + "model_lineage = model_id.split('/')[0]\n", + "base_name\n", + "\n", + "bucket_name = session.default_bucket()\n", + "default_prefix = session.default_bucket_prefix or f\"models/{model_id_filesafe}\"\n", +
"print(f\"Saving model artifacts to {bucket_name}/{default_prefix}\")\n", + "\n", + "os.makedirs(\"code\", exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "id": "0a7d4737-12ca-4cb7-aee4-06f7c0328b6d", + "metadata": {}, + "source": [ + "## Local Model Test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51db134d-19bb-4702-949e-cf8160fe917f", + "metadata": {}, + "outputs": [], + "source": [ + "# This code is adapted from https://huggingface.co/google/owlv2-base-patch16\n", + "\n", + "import requests\n", + "from PIL import Image\n", + "import numpy as np\n", + "import torch\n", + "from transformers import AutoProcessor, Owlv2ForObjectDetection\n", + "from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD\n", + "\n", + "processor = AutoProcessor.from_pretrained(\"google/owlv2-base-patch16\")\n", + "model = Owlv2ForObjectDetection.from_pretrained(\"google/owlv2-base-patch16\")\n", + "\n", + "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", + "image = Image.open(requests.get(url, stream=True).raw)\n", + "texts = [[\"a photo of a cat\", \"a photo of a dog\"]]\n", + "inputs = processor(text=texts, images=image, return_tensors=\"pt\")\n", + "\n", + "# forward pass\n", + "with torch.no_grad():\n", + " outputs = model(**inputs)\n", + "\n", + "# Note: boxes need to be visualized on the padded, unnormalized image\n", + "# hence we'll set the target image sizes (height, width) based on that\n", + "\n", + "def get_preprocessed_image(pixel_values):\n", + " pixel_values = pixel_values.squeeze().numpy()\n", + " unnormalized_image = (pixel_values * np.array(OPENAI_CLIP_STD)[:, None, None]) + np.array(OPENAI_CLIP_MEAN)[:, None, None]\n", + " unnormalized_image = (unnormalized_image * 255).astype(np.uint8)\n", + " unnormalized_image = np.moveaxis(unnormalized_image, 0, -1)\n", + " unnormalized_image = Image.fromarray(unnormalized_image)\n", + " return unnormalized_image\n", + "\n", + "unnormalized_image = 
get_preprocessed_image(inputs.pixel_values)\n", + "\n", + "target_sizes = torch.Tensor([unnormalized_image.size[::-1]])\n", + "# Convert outputs (bounding boxes and class logits) to final bounding boxes and scores\n", + "results = processor.post_process_object_detection(\n", + " outputs=outputs, threshold=0.2, target_sizes=target_sizes\n", + ")\n", + "\n", + "i = 0 # Retrieve predictions for the first image for the corresponding text queries\n", + "text = texts[i]\n", + "boxes, scores, labels = results[i][\"boxes\"], results[i][\"scores\"], results[i][\"labels\"]\n", + "\n", + "for box, score, label in zip(boxes, scores, labels):\n", + " box = [round(i, 2) for i in box.tolist()]\n", + " print(f\"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}\")" + ] + }, + { + "cell_type": "markdown", + "id": "7245e684-aa6e-44ca-9451-10e0bee4b96e", + "metadata": {}, + "source": [ + "## Create SageMaker Model\n", + "Here we define the custom requirements and inference logic to be run by this model. We download the model assets from HuggingFace, zip them up and upload them to S3. We then deploy the model as a `HuggingFaceModel`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "410cb9a8-9fd0-408d-bebb-af8b8c09d398", + "metadata": {}, + "outputs": [], + "source": [ + "env = {\n", + " 'HF_MODEL_ID': model_id,\n", + " 'HF_TASK':'zero-shot-object-detection',\n", + " 'SM_NUM_GPUS': json.dumps(1),\n", + " 'OPTION_MODEL_LOADING_TIMEOUT': '3600',\n", + " \"OPTION_ROLLING_BATCH\": \"disable\",\n", + " \"OPTION_TENSOR_PARALLEL_DEGREE\": \"1\",\n", + " \"OPTION_MAX_MODEL_LEN\": \"5000\",\n", + " \"OPTION_ASYNC_MODE\": \"true\",\n", + " \"OPTION_TRUST_REMOTE_CODE\": \"true\",\n", + " \"SERVING_FAIL_FAST\": \"true\",\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d441971-621c-4d52-a7db-d67020a8667e", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile code/requirements.txt\n", + "transformers==4.57.0\n", + "torch\n", + "torchvision\n", + "torchaudio\n", + "pillow\n", + "requests" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6468cba-5d10-484d-898a-19802612925c", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile code/inference.py\n", + "# This code comes from HuggingFace\n", + "# https://huggingface.co/google/owlv2-base-patch16\n", + "\n", + "import logging\n", + "import requests\n", + "from PIL import Image\n", + "import numpy as np\n", + "import torch\n", + "from transformers import AutoProcessor, Owlv2ForObjectDetection\n", + "from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD\n", + "\n", + "logger = logging.getLogger()\n", + "logger.setLevel(logging.INFO)\n", + "\n", + "def model_fn(model_dir):\n", + "\n", + " model = Owlv2ForObjectDetection.from_pretrained(\n", + " model_dir,\n", + " device_map=\"auto\"\n", + " )\n", + " \n", + " processor = AutoProcessor.from_pretrained(\n", + " model_dir,\n", + " trust_remote_code=True\n", + " )\n", + "\n", + " return {\"processor\": processor, \"model\": model}\n", + "\n", + "\n", + "def
predict_fn(data, model_obj):\n", + " processor = model_obj[\"processor\"]\n", + " model = model_obj[\"model\"]\n", + " \n", + " # Read the image URL and text queries from the request payload; fall back to the demo inputs.\n", + " payload = data if isinstance(data, dict) else {}\n", + " url = payload.get(\"image_url\", \"http://images.cocodataset.org/val2017/000000039769.jpg\")\n", + " image = Image.open(requests.get(url, stream=True).raw)\n", + " texts = payload.get(\"texts\", [[\"a photo of a cat\", \"a photo of a dog\"]])\n", + " # device_map=\"auto\" may place the model on GPU, so the inputs must follow it.\n", + " inputs = processor(text=texts, images=image, return_tensors=\"pt\").to(model.device)\n", + " \n", + " # forward pass\n", + " with torch.no_grad():\n", + " outputs = model(**inputs)\n", + " \n", + " # Note: boxes need to be visualized on the padded, unnormalized image\n", + " # hence we'll set the target image sizes (height, width) based on that\n", + " \n", + " def get_preprocessed_image(pixel_values):\n", + " pixel_values = pixel_values.squeeze().cpu().numpy()\n", + " unnormalized_image = (pixel_values * np.array(OPENAI_CLIP_STD)[:, None, None]) + np.array(OPENAI_CLIP_MEAN)[:, None, None]\n", + " unnormalized_image = (unnormalized_image * 255).astype(np.uint8)\n", + " unnormalized_image = np.moveaxis(unnormalized_image, 0, -1)\n", + " unnormalized_image = Image.fromarray(unnormalized_image)\n", + " return unnormalized_image\n", + " \n", + " unnormalized_image = get_preprocessed_image(inputs.pixel_values)\n", + " \n", + " target_sizes = torch.Tensor([unnormalized_image.size[::-1]])\n", + " # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores\n", + " results = processor.post_process_object_detection(\n", + " outputs=outputs, threshold=0.2, target_sizes=target_sizes\n", + " )\n", + " \n", + " i = 0 # Retrieve predictions for the first image for the corresponding text queries\n", + " text = texts[i]\n", + " boxes, scores, labels = results[i][\"boxes\"], results[i][\"scores\"], results[i][\"labels\"]\n", + " \n", + " # Return (not print) JSON-serializable detections so they become the endpoint response.\n", + " return [\n", + " {\"label\": text[label], \"score\": round(score.item(), 3), \"box\": [round(v, 2) for v in box.tolist()]}\n", + " for box, score, label in zip(boxes, scores, labels)\n", + " ]" + ] + }, + { +
"cell_type": "code", + "execution_count": null, + "id": "3ba9db6b-3a56-462a-a513-770ed7f4bbcc", + "metadata": {}, + "outputs": [], + "source": [ + "def filter_function(tarinfo):\n", + " \"\"\"Filter function to exclude .cache files and directories\"\"\"\n", + " if '.cache' in tarinfo.name or '.gitattributes' in tarinfo.name:\n", + " return None\n", + " return tarinfo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae9b8cdf-ebf9-4d05-837d-8a3f412b73d4", + "metadata": {}, + "outputs": [], + "source": [ + "s3_client = boto3.client('s3')\n", + "key = f\"{default_prefix}/model.tar.gz\"\n", + "force_rebuild_tarball = True\n", + "\n", + "if force_rebuild_tarball or not s3_client.head_object(Bucket=bucket_name, Key=key):\n", + " try:\n", + " model_path = snapshot_download(repo_id=model_id, local_dir=\"./model\")\n", + " print(f\"Successfully downloaded to {model_path}\")\n", + " except Exception as e:\n", + " print(f\"Failed to download after retries: {str(e)}\")\n", + " \n", + " print(\"Building gzipped tarball...\")\n", + " with tarfile.open(\"./model.tar.gz\", \"w:gz\") as tar:\n", + " tar.add(model_path, arcname=\".\", filter=filter_function)\n", + " tar.add(\"./code\", filter=filter_function)\n", + " print(\"Successfully tarred the ball.\")\n", + " \n", + " print(f\"Uploading tarball to {bucket_name}/{default_prefix}...\")\n", + " s3_client.upload_file(\"./model.tar.gz\", bucket_name, f\"{default_prefix}/model.tar.gz\")\n", + " shutil.rmtree(\"./model\")\n", + " os.remove(\"./model.tar.gz\")\n", + " print(\"Successfully uploaded, working directory cleaned\")" + ] + }, + { + "cell_type": "markdown", + "id": "40af4b17-30e9-4138-93f3-0589e4770815", + "metadata": {}, + "source": [ + "## Deploy Model to SageMaker Endpoint\n", + "\n", + "Now we'll deploy our model to a SageMaker endpoint for real-time inference. This is a significant step that:\n", + "1. Provisions the specified compute resources (G5 instance)\n", + "2. 
Deploys the model container\n", + "3. Sets up the endpoint for API access\n", + "\n", + "### Deployment Configuration\n", + "- **Instance Count**: 1 instance for single-node deployment\n", + "- **Instance Type**: `ml.g5.4xlarge` for high-performance inference\n", + "\n", + "> ⚠️ **Important**: \n", + "> - Deployment can take up to 15 minutes\n", + "> - Monitor the CloudWatch logs for progress" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7cbeaf71-b51c-4b65-8196-ba0f403eb2a8", + "metadata": {}, + "outputs": [], + "source": [ + "# Hub Model configuration. https://huggingface.co/models\n", + "hub = {\n", + "\t'HF_MODEL_ID':'google/owlv2-base-patch16',\n", + "\t'HF_TASK':'zero-shot-object-detection'\n", + "}\n", + "\n", + "# create Hugging Face Model Class\n", + "huggingface_model = HuggingFaceModel(\n", + " model_data=f\"s3://{bucket_name}/{default_prefix}/model.tar.gz\",\n", + "\ttransformers_version='4.49.0',\n", + "\tpytorch_version='2.6.0',\n", + "\tpy_version='py312',\n", + "\tenv=env,\n", + "\trole=role, \n", + " entry_point=\"inference.py\",\n", + " enable_network_isolation=False\n", + ")\n", + "\n", + "# deploy model to SageMaker Inference\n", + "predictor = huggingface_model.deploy(\n", + "\tinitial_instance_count=1, # number of instances\n", + "\tinstance_type='ml.m5.xlarge' # ec2 instance type\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8ef9603-a16d-4195-9702-b920762e2e9b", + "metadata": {}, + "outputs": [], + "source": [ + "# Using DJL Serving\n", + "# UNDER CONSTRUCTION\n", + "\n", + "# image_uri = \"763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.34.0-lmi16.0.0-cu128-v1.2\"\n", + "\n", + "# model = HuggingFaceModel(\n", + "# model_data=f\"s3://{bucket_name}/{default_prefix}/model.tar.gz\",\n", + "# image_uri=image_uri,\n", + "# env=env,\n", + "# role=role,\n", + "# entry_point=\"inference.py\",\n", + "# enable_network_isolation=False\n", + "# )\n", + "\n", + "# predictor = 
model.deploy(\n", + "# initial_instance_count=instance_count,\n", + "# instance_type=instance_type,\n", + "# endpoint_name=endpoint_name\n", + "# )\n", + "\n", + "# predictor.predict()" + ] + }, + { + "cell_type": "markdown", + "id": "f7b85c32-d4df-45f1-84f4-86a296ad1c0d", + "metadata": { + "execution": { + "iopub.execute_input": "2025-09-15T19:14:57.135928Z", + "iopub.status.busy": "2025-09-15T19:14:57.135661Z", + "iopub.status.idle": "2025-09-15T19:14:57.139468Z", + "shell.execute_reply": "2025-09-15T19:14:57.138566Z", + "shell.execute_reply.started": "2025-09-15T19:14:57.135907Z" + } + }, + "source": [ + "# Clean up" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcef37c7-6b47-4c3b-b6b7-b266245b492e", + "metadata": {}, + "outputs": [], + "source": [ + "predictor.delete_endpoint(True)\n", + "huggingface_model.delete_model()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac15a012-fa80-43d0-a8d2-cdfbf2082e2e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/01-models/google/Vault-Gemma/.ipynb_checkpoints/Vault-Gemma-1B-checkpoint.ipynb b/01-models/google/Vault-Gemma/.ipynb_checkpoints/Vault-Gemma-1B-checkpoint.ipynb new file mode 100644 index 0000000..cd7a1e9 --- /dev/null +++ b/01-models/google/Vault-Gemma/.ipynb_checkpoints/Vault-Gemma-1B-checkpoint.ipynb @@ -0,0 +1,321 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "94d68fb3-00f2-447c-b5e0-28fa18562392", + "metadata": {}, + "source": [ + "# How to deploy the VaultGemma 1B for inference using Amazon 
SageMaker AI\n", + "**Recommended kernel(s):** This notebook can be run with any Amazon SageMaker Studio kernel.\n", + "\n", + "In this notebook, you will learn how to deploy the VaultGemma 1B model (HuggingFace model ID: [google/vaultgemma-1b](https://huggingface.co/google/vaultgemma-1b)) using Amazon SageMaker AI. \n", + "\n", + "VaultGemma is a variant of the Gemma family of lightweight, state-of-the-art open models from Google. It is pre-trained from the ground up using Differential Privacy (DP). This provides strong, mathematically backed privacy guarantees for its training data, limiting the extent to which the model's outputs can reveal information about any single training example.\n", + "\n", + "VaultGemma uses an architecture similar to Gemma 2. VaultGemma is a pretrained model that can be instruction tuned for a variety of language understanding and generation tasks. Its relatively small size (< 1B parameters) makes it possible to deploy in environments with limited resources, democratizing access to state-of-the-art AI models that are built with privacy at their core.\n", + "\n", + "### License agreement\n", + "* This model is gated on HuggingFace; please refer to the original [model card](https://huggingface.co/google/vaultgemma-1b) for the license.\n", + "* This notebook is a sample notebook and not intended for production use.\n", + "\n", + "### Execution environment setup\n", + "This notebook requires the following third-party Python dependencies:\n", + "* AWS [`sagemaker`](https://sagemaker.readthedocs.io/en/stable/index.html) with a version greater than or equal to 2.242.0\n", + "\n", + "Let's install or upgrade these dependencies using the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "fed1c92f-9fbc-47c9-b940-7dd07f962ef8", + "metadata": { + "execution": { + "iopub.execute_input": "2025-09-15T19:04:31.124480Z", + "iopub.status.busy": "2025-09-15T19:04:31.124212Z", + "iopub.status.idle":
"2025-09-15T19:04:37.189319Z", + "shell.execute_reply": "2025-09-15T19:04:37.188352Z", + "shell.execute_reply.started": "2025-09-15T19:04:31.124456Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "autogluon-multimodal 1.4.0 requires nvidia-ml-py3<8.0,>=7.352.0, which is not installed.\n", + "aiobotocore 2.21.1 requires botocore<1.37.2,>=1.37.0, but you have botocore 1.40.30 which is incompatible.\n", + "autogluon-multimodal 1.4.0 requires transformers[sentencepiece]<4.50,>=4.38.0, but you have transformers 4.57.0.dev0 which is incompatible.\n", + "autogluon-timeseries 1.4.0 requires transformers[sentencepiece]<4.50,>=4.38.0, but you have transformers 4.57.0.dev0 which is incompatible.\n", + "sagemaker-studio-analytics-extension 0.2.0 requires sparkmagic==0.22.0, but you have sparkmagic 0.21.0 which is incompatible.\n", + "sparkmagic 0.21.0 requires pandas<2.0.0,>=0.17.1, but you have pandas 2.3.1 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -Uq sagemaker" + ] + }, + { + "cell_type": "markdown", + "id": "fc56cad7-ea97-43c4-835d-a56729549023", + "metadata": {}, + "source": [ + "### Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c128cf97-3f1f-4176-87b4-1193cd9e9c0f", + "metadata": { + "execution": { + "iopub.execute_input": "2025-09-15T19:04:37.191175Z", + "iopub.status.busy": "2025-09-15T19:04:37.190795Z", + "iopub.status.idle": "2025-09-15T19:04:37.196080Z", + "shell.execute_reply": "2025-09-15T19:04:37.195223Z", + "shell.execute_reply.started": "2025-09-15T19:04:37.191132Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.245.0\n" + ] + } + 
], + "source": [ + "import sagemaker\n", + "import boto3\n", + "import logging\n", + "import time\n", + "from sagemaker.session import Session\n", + "from sagemaker.s3 import S3Uploader\n", + "\n", + "print(sagemaker.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bba6d4bf-5df9-4e3e-8a9e-a03897e0cb0f", + "metadata": { + "execution": { + "iopub.execute_input": "2025-09-15T19:04:37.197423Z", + "iopub.status.busy": "2025-09-15T19:04:37.197103Z", + "iopub.status.idle": "2025-09-15T19:04:38.010473Z", + "shell.execute_reply": "2025-09-15T19:04:38.009714Z", + "shell.execute_reply.started": "2025-09-15T19:04:37.197392Z" + } + }, + "outputs": [], + "source": [ + "try:\n", + " role = sagemaker.get_execution_role()\n", + " sagemaker_session = sagemaker.Session()\n", + " \n", + "except ValueError:\n", + " iam = boto3.client('iam')\n", + " role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "7e6b15cd-34e5-45f4-a68a-728aa9cc930d", + "metadata": { + "execution": { + "iopub.execute_input": "2025-09-15T19:09:45.382881Z", + "iopub.status.busy": "2025-09-15T19:09:45.382602Z", + "iopub.status.idle": "2025-09-15T19:09:45.388802Z", + "shell.execute_reply": "2025-09-15T19:09:45.387863Z", + "shell.execute_reply.started": "2025-09-15T19:09:45.382860Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'vaultgemma-1b'" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "HF_MODEL_ID = \"google/vaultgemma-1b\"\n", + "HUGGING_FACE_HUB_TOKEN = \"\"\n", + "\n", + "base_name = HF_MODEL_ID.split('/')[-1].replace('.', '-').lower()\n", + "model_lineage = HF_MODEL_ID.split('/')[0]\n", + "base_name" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "af864803-2921-4dda-95d7-6177b7d720ec", + "metadata": { + "execution": { + "iopub.execute_input": "2025-09-15T19:07:30.037543Z", + 
"iopub.status.busy": "2025-09-15T19:07:30.037241Z", + "iopub.status.idle": "2025-09-15T19:07:30.041271Z", + "shell.execute_reply": "2025-09-15T19:07:30.040218Z", + "shell.execute_reply.started": "2025-09-15T19:07:30.037519Z" + } + }, + "outputs": [], + "source": [ + "instance_type = \"ml.m5.xlarge\"\n", + "instance_count = 1" + ] + }, + { + "cell_type": "markdown", + "id": "7245e684-aa6e-44ca-9451-10e0bee4b96e", + "metadata": {}, + "source": [ + "## Create SageMaker Model\n", + "\n", + "#### HUGGING_FACE_HUB_TOKEN \n", + "VaultGemma-1B is a gated model. Therefore, if you deploy model files hosted on the Hub, you need to provide your HuggingFace token as environment variable. This enables SageMaker AI to download the files at runtime." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "2e196ece-bada-486a-8f20-b78a381de41b", + "metadata": { + "execution": { + "iopub.execute_input": "2025-09-15T19:13:36.536292Z", + "iopub.status.busy": "2025-09-15T19:13:36.535990Z", + "iopub.status.idle": "2025-09-15T19:13:36.714432Z", + "shell.execute_reply": "2025-09-15T19:13:36.713736Z", + "shell.execute_reply.started": "2025-09-15T19:13:36.536270Z" + } + }, + "outputs": [], + "source": [ + "from sagemaker.huggingface import HuggingFaceModel\n", + "\n", + "# Hub Model configuration. https://huggingface.co/models\n", + "hub = {\n", + "\t'HF_MODEL_ID':'google/vaultgemma-1b',\n", + "\t'HF_TASK':'text-generation',\n", + " 'HF_TOKEN':HUGGING_FACE_HUB_TOKEN\n", + "}\n", + "\n", + "# create Hugging Face Model Class\n", + "huggingface_model = HuggingFaceModel(\n", + "\ttransformers_version='4.49.0',\n", + "\tpytorch_version='2.6.0',\n", + "\tpy_version='py312',\n", + "\tenv=hub,\n", + "\trole=role, \n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "40af4b17-30e9-4138-93f3-0589e4770815", + "metadata": {}, + "source": [ + "## Deploy Model to SageMaker Endpoint\n", + "\n", + "Now we'll deploy our model to a SageMaker endpoint for real-time inference. 
This is a significant step that:\n", + "1. Provisions the specified compute resources (M5 instance)\n", + "2. Deploys the model container\n", + "3. Sets up the endpoint for API access\n", + "\n", + "### Deployment Configuration\n", + "- **Instance Count**: 1 instance for single-node deployment\n", + "- **Instance Type**: `ml.m5.xlarge` (general-purpose CPU instance, no GPU) for low-cost inference with this small model\n", + "\n", + "> ⚠️ **Important**: \n", + "> - Deployment can take up to 15 minutes\n", + "> - Monitor the CloudWatch logs for progress" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "786fa8a9-d635-412f-adc5-4941a80b2b72", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "# deploy model to SageMaker Inference\n", + "predictor = huggingface_model.deploy(\n", + "\tinitial_instance_count=instance_count, # number of instances\n", + "\tinstance_type=instance_type # ec2 instance type\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc38898a-290b-482b-968c-f04b8674300c", + "metadata": {}, + "outputs": [], + "source": [ + "predictor.predict({\n", + "\t\"inputs\": \"Can you please let us know more details about your training using differential privacy?\",\n", + "})" + ] + }, + { + "cell_type": "markdown", + "id": "f7b85c32-d4df-45f1-84f4-86a296ad1c0d", + "metadata": { + "execution": { + "iopub.execute_input": "2025-09-15T19:14:57.135928Z", + "iopub.status.busy": "2025-09-15T19:14:57.135661Z", + "iopub.status.idle": "2025-09-15T19:14:57.139468Z", + "shell.execute_reply": "2025-09-15T19:14:57.138566Z", + "shell.execute_reply.started": "2025-09-15T19:14:57.135907Z" + } + }, + "source": [ + "# Clean up" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcef37c7-6b47-4c3b-b6b7-b266245b492e", + "metadata": {}, + "outputs": [], + "source": [ + "huggingface_model.delete_model()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, +
"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/01-models/google/Vault-Gemma/Vault-Gemma-1B.ipynb b/01-models/google/Vault-Gemma/Vault-Gemma-1B.ipynb new file mode 100644 index 0000000..2b01394 --- /dev/null +++ b/01-models/google/Vault-Gemma/Vault-Gemma-1B.ipynb @@ -0,0 +1,533 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "94d68fb3-00f2-447c-b5e0-28fa18562392", + "metadata": {}, + "source": [ + "# How to deploy the VaultGemma 1B for inference using Amazon SageMakerAI\n", + "**Recommended kernel(s):** This notebook can be run with any Amazon SageMaker Studio kernel.\n", + "\n", + "In this notebook, you will learn how to deploy the Vault Gemma 1B model (HuggingFace model ID: [google/vaultgemma-1b](https://huggingface.co/google/vaultgemma-1b)) using Amazon SageMaker AI. \n", + "\n", + "VaultGemma is a variant of the Gemma family of lightweight, state-of-the-art open models from Google. It is pre-trained from the ground up using Differential Privacy (DP). This provides strong, mathematically-backed privacy guarantees for its training data, limiting the extent to which the model's outputs can reveal information about any single training example.\n", + "\n", + "VaultGemma uses a similar architecture as Gemma 2. VaultGemma is a pretrained model that can be instruction tuned for a variety of language understanding and generation tasks. 
Its relatively small size (< 1B parameters) makes it possible to deploy in environments with limited resources, democratizing access to state-of-the-art AI models that are built with privacy at their core.\n", + "\n", + "### License agreement\n", + "* This model is gated on HuggingFace; please refer to the original [model card](https://huggingface.co/google/vaultgemma-1b) for the license.\n", + "* This notebook is a sample notebook and not intended for production use.\n", + "\n", + "### Execution environment setup\n", + "This notebook requires the following third-party Python dependencies:\n", + "* AWS [`sagemaker`](https://sagemaker.readthedocs.io/en/stable/index.html) with a version greater than or equal to 2.242.0, plus `huggingface_hub` and `transformers` 4.57.0 (all installed by the command below)\n", + "\n", + "Let's install or upgrade these dependencies using the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fed1c92f-9fbc-47c9-b940-7dd07f962ef8", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -Uq huggingface_hub sagemaker transformers==4.57.0" + ] + }, + { + "cell_type": "markdown", + "id": "fc56cad7-ea97-43c4-835d-a56729549023", + "metadata": {}, + "source": [ + "### Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c128cf97-3f1f-4176-87b4-1193cd9e9c0f", + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-28T20:33:42.848333Z", + "iopub.status.busy": "2025-10-28T20:33:42.848186Z", + "iopub.status.idle": "2025-10-28T20:33:42.851101Z", + "shell.execute_reply": "2025-10-28T20:33:42.850698Z", + "shell.execute_reply.started": "2025-10-28T20:33:42.848317Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.253.1\n" + ] + } + ], + "source": [ + "import os\n", + "import datetime\n", + "import sagemaker\n", + "import boto3\n", + "import logging\n", + "import json\n", + "import time\n", + "import shutil\n", + "import tarfile\n", + "\n", + "import sagemaker\n", + "from sagemaker.huggingface import
HuggingFaceModel\n", + "from sagemaker.session import Session\n", + "from sagemaker.s3 import S3Uploader\n", + "\n", + "from huggingface_hub import snapshot_download\n", + "\n", + "print(sagemaker.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bba6d4bf-5df9-4e3e-8a9e-a03897e0cb0f", + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-28T20:38:20.172482Z", + "iopub.status.busy": "2025-10-28T20:38:20.172255Z", + "iopub.status.idle": "2025-10-28T20:38:20.624643Z", + "shell.execute_reply": "2025-10-28T20:38:20.624085Z", + "shell.execute_reply.started": "2025-10-28T20:38:20.172467Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "google-vaultgemma-1b-endpoint-1761683900-420591\n", + "Saving model artifacts to sagemaker-us-east-1-329542461890/models/google_vaultgemma-1b\n" + ] + } + ], + "source": [ + "session = sagemaker.Session()\n", + "role = sagemaker.get_execution_role()\n", + "\n", + "instance_type = \"ml.m5.xlarge\"\n", + "instance_count = 1\n", + "\n", + "HUGGING_FACE_HUB_TOKEN = \"\"\n", + "model_id = \"google/vaultgemma-1b\"\n", + "model_id_filesafe = model_id.replace(\"/\", \"_\").replace(\".\", \"_\")\n", + "endpoint_name = f\"{model_id_filesafe.replace(\"_\", \"-\")}-endpoint-{str(datetime.datetime.now().timestamp()).replace(\".\", \"-\")}\"\n", + "print(endpoint_name)\n", + "\n", + "base_name = model_id.split('/')[-1].replace('.', '-').lower()\n", + "model_lineage = model_id.split('/')[0]\n", + "base_name\n", + "\n", + "bucket_name = session.default_bucket()\n", + "default_prefix = session.default_bucket_prefix or f\"models/{model_id_filesafe}\"\n", + "print(f\"Saving model artifacts to {bucket_name}/{default_prefix}\")\n", + "\n", + "os.makedirs(\"code\", exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "id": "51c1ee59-9644-42d3-8f3f-45dd7e8c124d", + "metadata": {}, + "source": [ + "### Local Model Test" + ] + }, + { + "cell_type": "code", + 
"execution_count": 10, + "id": "d8b33210-6e04-4885-ad35-513f257124d9", + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-28T20:42:00.175305Z", + "iopub.status.busy": "2025-10-28T20:42:00.175098Z", + "iopub.status.idle": "2025-10-28T20:42:00.193771Z", + "shell.execute_reply": "2025-10-28T20:42:00.193345Z", + "shell.execute_reply.started": "2025-10-28T20:42:00.175290Z" + } + }, + "outputs": [], + "source": [ + "from huggingface_hub import login\n", + "login(HUGGING_FACE_HUB_TOKEN)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "7f67cb1f-77b4-4348-884e-cccf5a4f845a", + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-28T20:42:04.762066Z", + "iopub.status.busy": "2025-10-28T20:42:04.761680Z", + "iopub.status.idle": "2025-10-28T20:42:17.726388Z", + "shell.execute_reply": "2025-10-28T20:42:17.725770Z", + "shell.execute_reply.started": "2025-10-28T20:42:04.762034Z" + } + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "40cb769b1a894fc580d2b283d11ed516", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "config.json: 0%| | 0.00/1.37k [00:00 ⚠️ **Important**: \n", + "> - Deployment can take up to 15 minutes\n", + "> - Monitor the CloudWatch logs for progress" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "794b5a0a-bfb4-4659-962b-9523ce38de1b", + "metadata": {}, + "outputs": [], + "source": [ + "# Hub Model configuration. 
https://huggingface.co/models\n", + "hub = {\n", + "\t'HF_MODEL_ID':'google/vaultgemma-1b',\n", + "\t'HF_TASK':'image-text-to-text',\n", + " 'HF_TOKEN': HUGGING_FACE_HUB_TOKEN,\n", + "}\n", + "\n", + "# create Hugging Face Model Class\n", + "huggingface_model = HuggingFaceModel(\n", + " model_data=f\"s3://{bucket_name}/{default_prefix}/model.tar.gz\",\n", + "\ttransformers_version='4.49.0',\n", + "\tpytorch_version='2.6.0',\n", + "\tpy_version='py312',\n", + "\tenv=env,\n", + "\trole=role, \n", + " entry_point=\"inference.py\",\n", + " enable_network_isolation=False\n", + ")\n", + "\n", + "# deploy model to SageMaker Inference\n", + "predictor = huggingface_model.deploy(\n", + "\tinitial_instance_count=1, # number of instances\n", + "\tinstance_type='ml.m5.xlarge' # ec2 instance type\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e196ece-bada-486a-8f20-b78a381de41b", + "metadata": {}, + "outputs": [], + "source": [ + "# Using DJL Serving\n", + "# UNDER CONSTRUCTION\n", + "# %%time\n", + "\n", + "# image_uri = \"763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.34.0-lmi16.0.0-cu128-v1.2\"\n", + "# model = HuggingFaceModel(\n", + "# model_data=f\"s3://{bucket_name}/{default_prefix}/model.tar.gz\",\n", + "# image_uri=image_uri,\n", + "# env=env,\n", + "# role=role,\n", + "# entry_point=\"inference.py\",\n", + "# enable_network_isolation=False\n", + "# )\n", + "\n", + "# predictor = model.deploy(\n", + "# initial_instance_count=instance_count,\n", + "# instance_type=instance_type,\n", + "# endpoint_name=endpoint_name\n", + "# )\n", + "\n", + "# predictor.predict({\n", + "# \t\"inputs\": \"Can you please let us know more details about your training using differential privacy?\",\n", + "# })" + ] + }, + { + "cell_type": "markdown", + "id": "f7b85c32-d4df-45f1-84f4-86a296ad1c0d", + "metadata": { + "execution": { + "iopub.execute_input": "2025-09-15T19:14:57.135928Z", + "iopub.status.busy": "2025-09-15T19:14:57.135661Z", + 
"iopub.status.idle": "2025-09-15T19:14:57.139468Z", + "shell.execute_reply": "2025-09-15T19:14:57.138566Z", + "shell.execute_reply.started": "2025-09-15T19:14:57.135907Z" + } + }, + "source": [ + "# Clean up" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcef37c7-6b47-4c3b-b6b7-b266245b492e", + "metadata": {}, + "outputs": [], + "source": [ + "predictor.delete_endpoint(True)\n", + "huggingface_model.delete_model()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Vault-Gemma/.ipynb_checkpoints/Vault-Gemma-1B-checkpoint.ipynb b/Vault-Gemma/.ipynb_checkpoints/Vault-Gemma-1B-checkpoint.ipynb new file mode 100644 index 0000000..cd7a1e9 --- /dev/null +++ b/Vault-Gemma/.ipynb_checkpoints/Vault-Gemma-1B-checkpoint.ipynb @@ -0,0 +1,321 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "94d68fb3-00f2-447c-b5e0-28fa18562392", + "metadata": {}, + "source": [ + "# How to deploy the VaultGemma 1B for inference using Amazon SageMakerAI\n", + "**Recommended kernel(s):** This notebook can be run with any Amazon SageMaker Studio kernel.\n", + "\n", + "In this notebook, you will learn how to deploy the Vault Gemma 1B model (HuggingFace model ID: [google/vaultgemma-1b](https://huggingface.co/google/vaultgemma-1b)) using Amazon SageMaker AI. \n", + "\n", + "VaultGemma is a variant of the Gemma family of lightweight, state-of-the-art open models from Google. It is pre-trained from the ground up using Differential Privacy (DP). 
This provides strong, mathematically-backed privacy guarantees for its training data, limiting the extent to which the model's outputs can reveal information about any single training example.\n", + "\n", + "VaultGemma uses a similar architecture as Gemma 2. VaultGemma is a pretrained model that can be instruction tuned for a variety of language understanding and generation tasks. Its relatively small size (< 1B parameters) makes it possible to deploy in environments with limited resources, democratizing access to state-of-the-art AI models that are built with privacy at their core.\n", + "\n", + "### License agreement\n", + "* This model is gated on HuggingFace, please refer to the original [model card](https://huggingface.co/google/vaultgemma-1b) for license.\n", + "* This notebook is a sample notebook and not intended for production use.\n", + "\n", + "### Execution environment setup\n", + "This notebook requires the following third-party Python dependencies:\n", + "* AWS [`sagemaker`](https://sagemaker.readthedocs.io/en/stable/index.html) with a version greater than or equal to 2.242.0\n", + "\n", + "Let's install or upgrade these dependencies using the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "fed1c92f-9fbc-47c9-b940-7dd07f962ef8", + "metadata": { + "execution": { + "iopub.execute_input": "2025-09-15T19:04:31.124480Z", + "iopub.status.busy": "2025-09-15T19:04:31.124212Z", + "iopub.status.idle": "2025-09-15T19:04:37.189319Z", + "shell.execute_reply": "2025-09-15T19:04:37.188352Z", + "shell.execute_reply.started": "2025-09-15T19:04:31.124456Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "autogluon-multimodal 1.4.0 requires nvidia-ml-py3<8.0,>=7.352.0, which is not installed.\n", + "aiobotocore 2.21.1 requires botocore<1.37.2,>=1.37.0, but you have botocore 1.40.30 which is incompatible.\n", + "autogluon-multimodal 1.4.0 requires transformers[sentencepiece]<4.50,>=4.38.0, but you have transformers 4.57.0.dev0 which is incompatible.\n", + "autogluon-timeseries 1.4.0 requires transformers[sentencepiece]<4.50,>=4.38.0, but you have transformers 4.57.0.dev0 which is incompatible.\n", + "sagemaker-studio-analytics-extension 0.2.0 requires sparkmagic==0.22.0, but you have sparkmagic 0.21.0 which is incompatible.\n", + "sparkmagic 0.21.0 requires pandas<2.0.0,>=0.17.1, but you have pandas 2.3.1 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -Uq sagemaker" + ] + }, + { + "cell_type": "markdown", + "id": "fc56cad7-ea97-43c4-835d-a56729549023", + "metadata": {}, + "source": [ + "### Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c128cf97-3f1f-4176-87b4-1193cd9e9c0f", + "metadata": { + "execution": { + "iopub.execute_input": "2025-09-15T19:04:37.191175Z", + "iopub.status.busy": "2025-09-15T19:04:37.190795Z", + "iopub.status.idle": "2025-09-15T19:04:37.196080Z", + "shell.execute_reply": "2025-09-15T19:04:37.195223Z", + "shell.execute_reply.started": "2025-09-15T19:04:37.191132Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.245.0\n" + ] + } + ], + "source": [ + "import sagemaker\n", + "import boto3\n", + "import logging\n", + "import time\n", + "from sagemaker.session import Session\n", + "from sagemaker.s3 import S3Uploader\n", + "\n", + "print(sagemaker.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bba6d4bf-5df9-4e3e-8a9e-a03897e0cb0f", + "metadata": { + 
"execution": { + "iopub.execute_input": "2025-09-15T19:04:37.197423Z", + "iopub.status.busy": "2025-09-15T19:04:37.197103Z", + "iopub.status.idle": "2025-09-15T19:04:38.010473Z", + "shell.execute_reply": "2025-09-15T19:04:38.009714Z", + "shell.execute_reply.started": "2025-09-15T19:04:37.197392Z" + } + }, + "outputs": [], + "source": [ + "try:\n", + " role = sagemaker.get_execution_role()\n", + " sagemaker_session = sagemaker.Session()\n", + " \n", + "except ValueError:\n", + " iam = boto3.client('iam')\n", + " role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "7e6b15cd-34e5-45f4-a68a-728aa9cc930d", + "metadata": { + "execution": { + "iopub.execute_input": "2025-09-15T19:09:45.382881Z", + "iopub.status.busy": "2025-09-15T19:09:45.382602Z", + "iopub.status.idle": "2025-09-15T19:09:45.388802Z", + "shell.execute_reply": "2025-09-15T19:09:45.387863Z", + "shell.execute_reply.started": "2025-09-15T19:09:45.382860Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'vaultgemma-1b'" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "HF_MODEL_ID = \"google/vaultgemma-1b\"\n", + "HUGGING_FACE_HUB_TOKEN = \"\"\n", + "\n", + "base_name = HF_MODEL_ID.split('/')[-1].replace('.', '-').lower()\n", + "model_lineage = HF_MODEL_ID.split('/')[0]\n", + "base_name" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "af864803-2921-4dda-95d7-6177b7d720ec", + "metadata": { + "execution": { + "iopub.execute_input": "2025-09-15T19:07:30.037543Z", + "iopub.status.busy": "2025-09-15T19:07:30.037241Z", + "iopub.status.idle": "2025-09-15T19:07:30.041271Z", + "shell.execute_reply": "2025-09-15T19:07:30.040218Z", + "shell.execute_reply.started": "2025-09-15T19:07:30.037519Z" + } + }, + "outputs": [], + "source": [ + "instance_type = \"ml.m5.xlarge\"\n", + "instance_count = 1" + ] + }, + { + "cell_type": "markdown", + 
"id": "7245e684-aa6e-44ca-9451-10e0bee4b96e", + "metadata": {}, + "source": [ + "## Create SageMaker Model\n", + "\n", + "#### HUGGING_FACE_HUB_TOKEN \n", + "VaultGemma-1B is a gated model. Therefore, if you deploy model files hosted on the Hub, you need to provide your HuggingFace token as environment variable. This enables SageMaker AI to download the files at runtime." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "2e196ece-bada-486a-8f20-b78a381de41b", + "metadata": { + "execution": { + "iopub.execute_input": "2025-09-15T19:13:36.536292Z", + "iopub.status.busy": "2025-09-15T19:13:36.535990Z", + "iopub.status.idle": "2025-09-15T19:13:36.714432Z", + "shell.execute_reply": "2025-09-15T19:13:36.713736Z", + "shell.execute_reply.started": "2025-09-15T19:13:36.536270Z" + } + }, + "outputs": [], + "source": [ + "from sagemaker.huggingface import HuggingFaceModel\n", + "\n", + "# Hub Model configuration. https://huggingface.co/models\n", + "hub = {\n", + "\t'HF_MODEL_ID':'google/vaultgemma-1b',\n", + "\t'HF_TASK':'text-generation',\n", + " 'HF_TOKEN':HUGGING_FACE_HUB_TOKEN\n", + "}\n", + "\n", + "# create Hugging Face Model Class\n", + "huggingface_model = HuggingFaceModel(\n", + "\ttransformers_version='4.49.0',\n", + "\tpytorch_version='2.6.0',\n", + "\tpy_version='py312',\n", + "\tenv=hub,\n", + "\trole=role, \n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "40af4b17-30e9-4138-93f3-0589e4770815", + "metadata": {}, + "source": [ + "## Deploy Model to SageMaker Endpoint\n", + "\n", + "Now we'll deploy our model to a SageMaker endpoint for real-time inference. This is a significant step that:\n", + "1. Provisions the specified compute resources (M5 instance)\n", + "2. Deploys the model container\n", + "3. 
Sets up the endpoint for API access\n", + "\n", + "### Deployment Configuration\n", + "- **Instance Count**: 1 instance for single-node deployment\n", + "- **Instance Type**: `ml.m5.xlarge` (general-purpose CPU instance, no GPU) for low-cost inference with this small model\n", + "\n", + "> ⚠️ **Important**: \n", + "> - Deployment can take up to 15 minutes\n", + "> - Monitor the CloudWatch logs for progress" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "786fa8a9-d635-412f-adc5-4941a80b2b72", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "# deploy model to SageMaker Inference\n", + "predictor = huggingface_model.deploy(\n", + "\tinitial_instance_count=instance_count, # number of instances\n", + "\tinstance_type=instance_type # ec2 instance type\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc38898a-290b-482b-968c-f04b8674300c", + "metadata": {}, + "outputs": [], + "source": [ + "predictor.predict({\n", + "\t\"inputs\": \"Can you please let us know more details about your training using differential privacy?\",\n", + "})" + ] + }, + { + "cell_type": "markdown", + "id": "f7b85c32-d4df-45f1-84f4-86a296ad1c0d", + "metadata": { + "execution": { + "iopub.execute_input": "2025-09-15T19:14:57.135928Z", + "iopub.status.busy": "2025-09-15T19:14:57.135661Z", + "iopub.status.idle": "2025-09-15T19:14:57.139468Z", + "shell.execute_reply": "2025-09-15T19:14:57.138566Z", + "shell.execute_reply.started": "2025-09-15T19:14:57.135907Z" + } + }, + "source": [ + "# Clean up" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcef37c7-6b47-4c3b-b6b7-b266245b492e", + "metadata": {}, + "outputs": [], + "source": [ + "huggingface_model.delete_model()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", +
"name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Vault-Gemma/Vault-Gemma-1B.ipynb b/Vault-Gemma/Vault-Gemma-1B.ipynb new file mode 100644 index 0000000..cd7a1e9 --- /dev/null +++ b/Vault-Gemma/Vault-Gemma-1B.ipynb @@ -0,0 +1,321 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "94d68fb3-00f2-447c-b5e0-28fa18562392", + "metadata": {}, + "source": [ + "# How to deploy the VaultGemma 1B for inference using Amazon SageMakerAI\n", + "**Recommended kernel(s):** This notebook can be run with any Amazon SageMaker Studio kernel.\n", + "\n", + "In this notebook, you will learn how to deploy the Vault Gemma 1B model (HuggingFace model ID: [google/vaultgemma-1b](https://huggingface.co/google/vaultgemma-1b)) using Amazon SageMaker AI. \n", + "\n", + "VaultGemma is a variant of the Gemma family of lightweight, state-of-the-art open models from Google. It is pre-trained from the ground up using Differential Privacy (DP). This provides strong, mathematically-backed privacy guarantees for its training data, limiting the extent to which the model's outputs can reveal information about any single training example.\n", + "\n", + "VaultGemma uses a similar architecture as Gemma 2. VaultGemma is a pretrained model that can be instruction tuned for a variety of language understanding and generation tasks. 
Its relatively small size (< 1B parameters) makes it possible to deploy in environments with limited resources, democratizing access to state-of-the-art AI models that are built with privacy at their core.\n", + "\n", + "### License agreement\n", + "* This model is gated on HuggingFace, please refer to the original [model card](https://huggingface.co/google/vaultgemma-1b) for license.\n", + "* This notebook is a sample notebook and not intended for production use.\n", + "\n", + "### Execution environment setup\n", + "This notebook requires the following third-party Python dependencies:\n", + "* AWS [`sagemaker`](https://sagemaker.readthedocs.io/en/stable/index.html) with a version greater than or equal to 2.242.0\n", + "\n", + "Let's install or upgrade these dependencies using the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "fed1c92f-9fbc-47c9-b940-7dd07f962ef8", + "metadata": { + "execution": { + "iopub.execute_input": "2025-09-15T19:04:31.124480Z", + "iopub.status.busy": "2025-09-15T19:04:31.124212Z", + "iopub.status.idle": "2025-09-15T19:04:37.189319Z", + "shell.execute_reply": "2025-09-15T19:04:37.188352Z", + "shell.execute_reply.started": "2025-09-15T19:04:31.124456Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "autogluon-multimodal 1.4.0 requires nvidia-ml-py3<8.0,>=7.352.0, which is not installed.\n", + "aiobotocore 2.21.1 requires botocore<1.37.2,>=1.37.0, but you have botocore 1.40.30 which is incompatible.\n", + "autogluon-multimodal 1.4.0 requires transformers[sentencepiece]<4.50,>=4.38.0, but you have transformers 4.57.0.dev0 which is incompatible.\n", + "autogluon-timeseries 1.4.0 requires transformers[sentencepiece]<4.50,>=4.38.0, but you have transformers 4.57.0.dev0 which is incompatible.\n", + "sagemaker-studio-analytics-extension 0.2.0 requires sparkmagic==0.22.0, but you have sparkmagic 0.21.0 which is incompatible.\n", + "sparkmagic 0.21.0 requires pandas<2.0.0,>=0.17.1, but you have pandas 2.3.1 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -Uq sagemaker" + ] + }, + { + "cell_type": "markdown", + "id": "fc56cad7-ea97-43c4-835d-a56729549023", + "metadata": {}, + "source": [ + "### Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c128cf97-3f1f-4176-87b4-1193cd9e9c0f", + "metadata": { + "execution": { + "iopub.execute_input": "2025-09-15T19:04:37.191175Z", + "iopub.status.busy": "2025-09-15T19:04:37.190795Z", + "iopub.status.idle": "2025-09-15T19:04:37.196080Z", + "shell.execute_reply": "2025-09-15T19:04:37.195223Z", + "shell.execute_reply.started": "2025-09-15T19:04:37.191132Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.245.0\n" + ] + } + ], + "source": [ + "import sagemaker\n", + "import boto3\n", + "import logging\n", + "import time\n", + "from sagemaker.session import Session\n", + "from sagemaker.s3 import S3Uploader\n", + "\n", + "print(sagemaker.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bba6d4bf-5df9-4e3e-8a9e-a03897e0cb0f", + "metadata": { + 
"execution": { + "iopub.execute_input": "2025-09-15T19:04:37.197423Z", + "iopub.status.busy": "2025-09-15T19:04:37.197103Z", + "iopub.status.idle": "2025-09-15T19:04:38.010473Z", + "shell.execute_reply": "2025-09-15T19:04:38.009714Z", + "shell.execute_reply.started": "2025-09-15T19:04:37.197392Z" + } + }, + "outputs": [], + "source": [ + "try:\n", + " role = sagemaker.get_execution_role()\n", + " sagemaker_session = sagemaker.Session()\n", + " \n", + "except ValueError:\n", + " iam = boto3.client('iam')\n", + " role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "7e6b15cd-34e5-45f4-a68a-728aa9cc930d", + "metadata": { + "execution": { + "iopub.execute_input": "2025-09-15T19:09:45.382881Z", + "iopub.status.busy": "2025-09-15T19:09:45.382602Z", + "iopub.status.idle": "2025-09-15T19:09:45.388802Z", + "shell.execute_reply": "2025-09-15T19:09:45.387863Z", + "shell.execute_reply.started": "2025-09-15T19:09:45.382860Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'vaultgemma-1b'" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "HF_MODEL_ID = \"google/vaultgemma-1b\"\n", + "HUGGING_FACE_HUB_TOKEN = \"\"\n", + "\n", + "base_name = HF_MODEL_ID.split('/')[-1].replace('.', '-').lower()\n", + "model_lineage = HF_MODEL_ID.split('/')[0]\n", + "base_name" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "af864803-2921-4dda-95d7-6177b7d720ec", + "metadata": { + "execution": { + "iopub.execute_input": "2025-09-15T19:07:30.037543Z", + "iopub.status.busy": "2025-09-15T19:07:30.037241Z", + "iopub.status.idle": "2025-09-15T19:07:30.041271Z", + "shell.execute_reply": "2025-09-15T19:07:30.040218Z", + "shell.execute_reply.started": "2025-09-15T19:07:30.037519Z" + } + }, + "outputs": [], + "source": [ + "instance_type = \"ml.m5.xlarge\"\n", + "instance_count = 1" + ] + }, + { + "cell_type": "markdown", + 
"id": "7245e684-aa6e-44ca-9451-10e0bee4b96e", + "metadata": {}, + "source": [ + "## Create SageMaker Model\n", + "\n", + "#### HUGGING_FACE_HUB_TOKEN \n", + "VaultGemma-1B is a gated model. Therefore, if you deploy model files hosted on the Hub, you need to provide your Hugging Face token as an environment variable. This enables SageMaker AI to download the files at runtime." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "2e196ece-bada-486a-8f20-b78a381de41b", + "metadata": { + "execution": { + "iopub.execute_input": "2025-09-15T19:13:36.536292Z", + "iopub.status.busy": "2025-09-15T19:13:36.535990Z", + "iopub.status.idle": "2025-09-15T19:13:36.714432Z", + "shell.execute_reply": "2025-09-15T19:13:36.713736Z", + "shell.execute_reply.started": "2025-09-15T19:13:36.536270Z" + } + }, + "outputs": [], + "source": [ + "from sagemaker.huggingface import HuggingFaceModel\n", + "\n", + "# Hub Model configuration. https://huggingface.co/models\n", + "hub = {\n", + "\t'HF_MODEL_ID':'google/vaultgemma-1b',\n", + "\t'HF_TASK':'text-generation',\n", + " 'HF_TOKEN':HUGGING_FACE_HUB_TOKEN\n", + "}\n", + "\n", + "# create Hugging Face Model Class\n", + "huggingface_model = HuggingFaceModel(\n", + "\ttransformers_version='4.49.0',\n", + "\tpytorch_version='2.6.0',\n", + "\tpy_version='py312',\n", + "\tenv=hub,\n", + "\trole=role, \n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "40af4b17-30e9-4138-93f3-0589e4770815", + "metadata": {}, + "source": [ + "## Deploy Model to SageMaker Endpoint\n", + "\n", + "Now we'll deploy our model to a SageMaker endpoint for real-time inference. This is a significant step that:\n", + "1. Provisions the specified compute resources (M5 instance)\n", + "2. Deploys the model container\n", + "3. 
Sets up the endpoint for API access\n", + "\n", + "### Deployment Configuration\n", + "- **Instance Count**: 1 instance for single-node deployment\n", + "- **Instance Type**: `ml.m5.xlarge`, a general-purpose CPU instance, for inference\n", + "\n", + "> ⚠️ **Important**: \n", + "> - Deployment can take up to 15 minutes\n", + "> - Monitor the CloudWatch logs for progress" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "786fa8a9-d635-412f-adc5-4941a80b2b72", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "# deploy model to SageMaker Inference\n", + "predictor = huggingface_model.deploy(\n", + "\tinitial_instance_count=instance_count, # number of instances\n", + "\tinstance_type=instance_type # ec2 instance type\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc38898a-290b-482b-968c-f04b8674300c", + "metadata": {}, + "outputs": [], + "source": [ + "predictor.predict({\n", + "\t\"inputs\": \"Can you please let us know more details about your training using differential privacy?\",\n", + "})" + ] + }, + { + "cell_type": "markdown", + "id": "f7b85c32-d4df-45f1-84f4-86a296ad1c0d", + "metadata": { + "execution": { + "iopub.execute_input": "2025-09-15T19:14:57.135928Z", + "iopub.status.busy": "2025-09-15T19:14:57.135661Z", + "iopub.status.idle": "2025-09-15T19:14:57.139468Z", + "shell.execute_reply": "2025-09-15T19:14:57.138566Z", + "shell.execute_reply.started": "2025-09-15T19:14:57.135907Z" + } + }, + "source": [ + "## Clean up" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcef37c7-6b47-4c3b-b6b7-b266245b492e", + "metadata": {}, + "outputs": [], + "source": [ + "huggingface_model.delete_model()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + 
"name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}