diff --git a/ build_and_train_models/sm-finetune_qwen_for_code_embedding/inference_code/inference.py b/ build_and_train_models/sm-finetune_qwen_for_code_embedding/inference_code/inference.py new file mode 100644 index 0000000000..d82a2da4f5 --- /dev/null +++ b/ build_and_train_models/sm-finetune_qwen_for_code_embedding/inference_code/inference.py @@ -0,0 +1,153 @@ +# inference.py - Custom inference script for embedding operations +import json +import torch +from sentence_transformers import SentenceTransformer +from sklearn.metrics.pairwise import cosine_similarity +import logging + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class EmbeddingInferenceHandler: + """ + Custom inference handler that supports both encoding and similarity operations. + """ + + def __init__(self): + self.model = None + self.device = None + + def model_fn(self, model_dir): + """ + Load the fine-tuned model from the model directory. + """ + logger.info(f"Loading model from: {model_dir}") + + # Detect device + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + logger.info(f"Using device: {self.device}") + + # Load the sentence transformer model + self.model = SentenceTransformer(model_dir, device=self.device) + + return self.model + + def input_fn(self, request_body, content_type="application/json"): + """ + Parse input data for inference. + """ + if content_type == "application/json": + logger.info(f"Inference request: {request_body}") + data = json.loads(request_body) + else: + logger.warn(f"Wrong content type: {content_type}") + raise ValueError(f"Unsupported content type: {content_type}") + + return data + + def predict_fn(self, data, model): + """ + Perform inference based on the operation type. + """ + operation = data.get("operation", "encode") + logger.info(f"Prediction input: {data}") + if operation == "encode": + return self._encode_operation(data, model) + elif operation == "similarity": + return self._similarity_operation(data, model) + else: + raise ValueError(f"Unsupported operation: {operation}") + + def _encode_operation(self, data, model): + """ + Encode text inputs to embeddings. + """ + inputs = data.get("inputs", []) + if not inputs: + raise ValueError("No inputs provided for encoding") + + # Get target dimension (default to 512 for Matryoshka) + target_dim = data.get("dimension", 512) + + # Encode inputs + embeddings = model.encode( + inputs, + batch_size=data.get("batch_size", 32), + show_progress_bar=False, + normalize_embeddings=data.get("normalize", True), + ) + + # Truncate to target dimension if specified + if target_dim and target_dim < embeddings.shape[1]: + embeddings = embeddings[:, :target_dim] + + return { + "embeddings": embeddings.tolist(), + "dimension": embeddings.shape[1], + "num_texts": len(inputs), + } + + def _similarity_operation(self, data, model): + """ + Calculate similarity between text pairs. 
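+
+ Expects "text1", "text2" and an optional "dimension" (default 512) in the request payload;
+ returns the cosine similarity together with both (possibly truncated) embeddings.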
+ """ + text1 = data.get("text1") + text2 = data.get("text2") + + if not text1 or not text2: + raise ValueError("Both text1 and text2 required for similarity") + + # Get target dimension + target_dim = data.get("dimension", 512) + + # Encode both texts + embeddings = model.encode([text1, text2], normalize_embeddings=True) + + # Truncate if needed + if target_dim and target_dim < embeddings.shape[1]: + embeddings = embeddings[:, :target_dim] + + # Calculate cosine similarity + similarity = cosine_similarity(embeddings[0].reshape(1, -1), embeddings[1].reshape(1, -1))[ + 0 + ][0] + + return { + "similarity": float(similarity), + "dimension": embeddings.shape[1], + "text1_embedding": embeddings[0].tolist(), + "text2_embedding": embeddings[1].tolist(), + } + + def output_fn(self, prediction, accept="application/json"): + """ + Format the prediction output. + """ + if accept == "application/json": + return json.dumps(prediction), "application/json" + else: + raise ValueError(f"Unsupported accept type: {accept}") + + +# Global handler instance +handler = EmbeddingInferenceHandler() + + +# SageMaker inference functions +def model_fn(model_dir): + return handler.model_fn(model_dir) + + +def input_fn(request_body, content_type): + return handler.input_fn(request_body, content_type) + + +def predict_fn(data, model): + logger.info(f"predict_fn data: {data}") + return handler.predict_fn(data, model) + + +def output_fn(prediction, accept): + return handler.output_fn(prediction, accept) diff --git a/ build_and_train_models/sm-finetune_qwen_for_code_embedding/inference_code/requirements.txt b/ build_and_train_models/sm-finetune_qwen_for_code_embedding/inference_code/requirements.txt new file mode 100644 index 0000000000..143b22b790 --- /dev/null +++ b/ build_and_train_models/sm-finetune_qwen_for_code_embedding/inference_code/requirements.txt @@ -0,0 +1,2 @@ +sentence-transformers>=4.1.0 +transformers>=4.51.0 \ No newline at end of file diff --git a/ build_and_train_models/sm-finetune_qwen_for_code_embedding/scripts/dataset.py b/ build_and_train_models/sm-finetune_qwen_for_code_embedding/scripts/dataset.py new file mode 100644 index 0000000000..67b2f346ae --- /dev/null +++ b/ build_and_train_models/sm-finetune_qwen_for_code_embedding/scripts/dataset.py @@ -0,0 +1,82 @@ +# Create code embedding training dataset +from datasets import Dataset + + +def create_code_embedding_dataset(): + """ + Create a comprehensive dataset with code-description pairs for embedding fine-tuning. + This dataset follows contrastive learning principles for code embeddings. 
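+
+ Returns a `datasets.Dataset` with two string columns: "text1" (code snippet)
+ and "text2" (natural-language description).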
+ """ + code_pairs = [ + { + "text1": "def fibonacci(n): return n if n <= 1 else fibonacci(n-1) + fibonacci(n-2)", + "text2": "recursive function to calculate fibonacci numbers in Python", + }, + { + "text1": "SELECT * FROM users WHERE age > 18 AND status = 'active'", + "text2": "SQL query to find all active adult users from database", + }, + { + "text1": "class Stack: def __init__(self): self.items = []", + "text2": "stack data structure implementation with initialization method", + }, + { + "text1": "for i in range(len(arr)): for j in range(len(arr)-1-i): if arr[j] > arr[j+1]: arr[j], arr[j+1] = arr[j+1], arr[j]", + "text2": "bubble sort algorithm implementation using nested loops", + }, + { + "text1": "import pandas as pd; df = pd.read_csv('data.csv')", + "text2": "load CSV file into pandas DataFrame for data analysis", + }, + { + "text1": "def quicksort(arr): return [] if not arr else quicksort([x for x in arr[1:] if x <= arr[0]]) + [arr[0]] + quicksort([x for x in arr[1:] if x > arr[0]])", + "text2": "quicksort algorithm implementation using list comprehension", + }, + { + "text1": "try: result = func() except Exception as e: print(f'Error: {e}')", + "text2": "error handling with try-except block and formatted output", + }, + {"text1": "lambda x: x ** 2", "text2": "lambda function to calculate square of a number"}, + { + "text1": "def binary_search(arr, target): left, right = 0, len(arr) - 1", + "text2": "binary search algorithm initialization with left and right pointers", + }, + { + "text1": "class LinkedList: def __init__(self): self.head = None", + "text2": "linked list data structure class definition with head pointer", + }, + ] + + # Expand dataset with more programming language examples + additional_pairs = [ + { + "text1": "function addNumbers(a, b) { return a + b; }", + "text2": "JavaScript function to add two numbers and return result", + }, + { + "text1": "public class Calculator { private int result; }", + "text2": "Java class definition for calculator with private result field", + }, + { + "text1": "def __str__(self): return f'{self.name}: {self.value}'", + "text2": "Python string representation method for object display", + }, + { + "text1": "CREATE TABLE users (id INT PRIMARY KEY, name VARCHAR(100))", + "text2": "SQL table creation statement with primary key and name field", + }, + { + "text1": "const result = await fetch('/api/data').then(res => res.json())", + "text2": "JavaScript async API call with JSON response parsing", + }, + ] + + # Combine both lists + code_pairs.extend(additional_pairs) + + # Transform combined list to the format for Dataset.from_dict() + dataset_dict = { + "text1": [pair["text1"] for pair in code_pairs], + "text2": [pair["text2"] for pair in code_pairs], + } + return Dataset.from_dict(dataset_dict) diff --git a/ build_and_train_models/sm-finetune_qwen_for_code_embedding/sm-training_qwen3_embedding_for_coding_tasks.ipynb b/ build_and_train_models/sm-finetune_qwen_for_code_embedding/sm-training_qwen3_embedding_for_coding_tasks.ipynb new file mode 100644 index 0000000000..e7c0c8d9a4 --- /dev/null +++ b/ build_and_train_models/sm-finetune_qwen_for_code_embedding/sm-training_qwen3_embedding_for_coding_tasks.ipynb @@ -0,0 +1,515 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1ba333d0-58b1-45e8-829c-2175c46b4de0", + "metadata": {}, + "source": [ + "# Fine-tune Qwen3-Embedding for code embeddings using Amazon SageMaker" + ] + }, + { + "cell_type": "markdown", + "id": "894bfaa5-7c0d-4b07-a78f-6e3c817ebdbc", + "metadata": {}, + "source": [ 
+ "---\n",
+ "\n",
+ "This notebook's CI test result for us-west-2 is as follows. CI test results in other regions can be found at the end of the notebook.\n",
+ "\n",
+ "![This us-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-2/build_and_train_models|sm-finetune_qwen_for_code_embedding|sm-training_qwen3_embedding_for_coding_tasks.ipynb)\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b910c91f-6ff9-4e39-a45c-30946330dd80",
+ "metadata": {},
+ "source": [
+ "## Introduction\n",
+ "Embedding models for code have become essential components in modern software development workflows, powering applications like semantic code search, retrieval-augmented generation (RAG), and intelligent coding assistants. These models capture the semantic and functional relationships between code snippets by transforming them into numerical vector representations, enabling more effective code retrieval and understanding compared to traditional methods that treat code as sequences of characters.\n",
+ "\n",
+ "While general-purpose embedding models perform well on natural language tasks, code has unique characteristics - such as syntax, structure, and domain-specific semantics - that benefit from specialized fine-tuning. This notebook demonstrates how to fine-tune the latest [Qwen3-Embedding](https://github.com/QwenLM/Qwen3-Embedding) models specifically for code embeddings using Amazon SageMaker, showcasing three key innovations that make this approach particularly effective: (1) leveraging the state-of-the-art Qwen3-Embedding foundation model, (2) implementing Matryoshka Representation Learning to flexibly handle different dimensionality requirements, and (3) applying Contrastive Learning specifically optimized for code relationships.\n",
+ "\n",
+ "First, we'll explore the Qwen3-Embedding model family, representing the latest advancement in embedding technology with state-of-the-art performance on code benchmarks and built-in instruction awareness. Next, we'll implement Matryoshka Representation Learning, an innovative technique that creates flexible, nested embeddings allowing you to balance performance with computational efficiency by truncating vectors to optimal dimensions. Finally, we'll leverage contrastive learning, a powerful training paradigm that teaches models to understand code semantics by learning from naturally occurring code-documentation pairs found in real-world repositories.\n",
+ "\n",
+ "Together, these three approaches create a comprehensive solution for building high-performance code embedding models that can adapt to various deployment scenarios while maintaining superior accuracy on code understanding tasks.\n",
+ "## Introducing Qwen3-Embedding: The Next-Generation Model Family for Code Embeddings\n",
+ "The Qwen3-Embedding series represents one of the latest advancements in Alibaba's Qwen model family, specifically engineered for text embedding, retrieval, and ranking tasks with exceptional performance in code understanding applications. Built upon the robust foundational architecture of the Qwen3 series, these models inherit exceptional multilingual capabilities, long-text understanding, and advanced reasoning skills that make them particularly well-suited for fine-tuning on code embedding tasks.
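+ "\n",
+ "As a quick orientation, a minimal sketch of producing embeddings with the smallest model via `sentence-transformers` might look like the following (illustrative only, assuming the `Qwen/Qwen3-Embedding-0.6B` checkpoint that we fine-tune later in this notebook):\n",
+ "```python\n",
+ "from sentence_transformers import SentenceTransformer\n",
+ "\n",
+ "# Load the base 0.6B embedding model (the same checkpoint used by the training job below)\n",
+ "model = SentenceTransformer(\"Qwen/Qwen3-Embedding-0.6B\")\n",
+ "\n",
+ "# Encode a code snippet and a natural-language query into dense vectors\n",
+ "embeddings = model.encode([\"def add(a, b): return a + b\", \"function that adds two numbers\"])\n",
+ "print(embeddings.shape)  # (2, 1024) for the 0.6B model\n",
+ "```\n",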
The series offers comprehensive flexibility through three distinct model sizes-0.6B, 4B, and 8B parameters-allowing developers to choose the optimal balance between computational efficiency and performance for their specific deployment requirements .\n", + "\n", + "### Key Benefits of Qwen3-Embedding model for code embedding fine-tuning\n", + "__Model Size Options__ \n", + "Qwen3-Embedding provides three model configurations (0.6B, 4B, and 8B parameters) that cater to diverse computational constraints and performance requirements. This flexibility enables selecting the most appropriate model size based on infrastructure limitations, latency requirements, and accuracy targets, with even the smallest 0.6B model demonstrating competitive performance on code retrieval benchmarks.\n", + "\n", + "__Built-in Instruction Awareness__ \n", + "All Qwen3-Embedding models feature native instruction awareness, supporting user-defined instructions to enhance performance for specific tasks, languages, or scenarios without requiring separate \"instruct\" variants. Qwen3-Embedding evaluation indicates that utilizing instructions typically yields performance improvements of 1% to 5% compared to non-instructed approaches, making this capability invaluable for optimizing code embedding tasks.\n", + "\n", + "__Extended Context Length__ \n", + "The series supports an impressive 32K token context length across all model sizes, significantly exceeding many competing embedding models and enabling comprehensive processing of large code files, extensive documentation, and complex multi-file codebases. This extended context capability is particularly beneficial for code embedding applications where understanding broader code structure and relationships is crucial for generating meaningful representations.\n", + "\n", + "__Superior Code Retrieval Capabilities__ \n", + "Qwen3-Embedding models have achieved state-of-the-art performance on code-specific benchmarks, with the flagship 8B model scoring high on the [MTEB](https://huggingface.co/spaces/mteb/leaderboard) Code benchmark. The models excel across multiple code-related applications including code retrieval, code classification, and code clustering, demonstrating their versatility for various code embedding use cases .\n", + "\n", + "__Multilingual Programming Language Support__ \n", + "The Qwen3-Embedding series offers robust support for over 100 languages, including comprehensive coverage of major programming languages such as Python, Java, JavaScript, C++, SQL, and many others. This extensive programming language support, inherited from the multilingual capabilities of the Qwen3 foundation models, enables effective cross-language code understanding and retrieval, making the models ideal for diverse development environments and polyglot codebases. The models provide strong multilingual, cross-lingual, and code retrieval capabilities that are essential for modern software development workflows involving multiple programming languages and frameworks.\n", + "## Matryoshka Representation Learning: Flexible Embedding Dimensions with Qwen3 Models\n", + "Traditional embedding models face a fundamental limitation: they generate fixed-dimensional vectors where all dimensions carry equal importance, requiring organizations to deploy separate models for different computational requirements. 
For instance, mobile applications might need lightweight 256-dimensional embeddings for real-time processing, while server-side analytics could benefit from rich 1024-dimensional representations for maximum accuracy.\n", + "\n", + "[Matryoshka Representation Learning](https://arxiv.org/abs/2205.13147) (MRL) addresses this challenge by training embedding models to prioritize information hierarchically, with the most critical semantic content concentrated in the earlier dimensions. This approach, inspired by Russian nesting dolls, enables a single model to produce high-quality embeddings at multiple dimensional scales. For example, when processing the code snippet ```def quicksort(arr): return sorted(arr)```, with an MRL-trained model:\n", + "- First 256 dimensions: Contain 80% of the important information (core semantic meaning)\n", + "- First 512 dimensions: Contain 90% of the important information (additional contextual information)\n", + "- All 1024 dimensions: Contain 100% of the information (fine-grained details)\n", + "\n", + "The key advantage is deployment flexibility: organizations can deploy one model that serves multiple use cases by specifying the desired output dimension at inference time. A mobile app can request 256-dimensional embeddings for fast similarity searches, while a code analysis system can request 1024-dimensional embeddings for comprehensive semantic understanding-all from the same deployed model. This \"one model fits all\" approach significantly reduces infrastructure complexity, maintenance overhead, and deployment costs while providing the flexibility to optimize performance based on real-time computational constraints.\n", + "\n", + "__Motivation for using Matryoshka with Qwen3 embedding models__ \n", + "The primary motivation for Matryoshka embeddings stems from the practical challenges of deploying high-dimensional embedding models in resource-constrained environments. Consider the Qwen3-Embedding model family we're working with: the smallest Qwen3-Embedding-0.6B model produces embeddings with up to 1,024 dimensions, the Qwen3-Embedding-4B model generates vectors up to 2,560 dimensions, and the flagship Qwen3-Embedding-8B model creates massive 4,096-dimensional vectors. These substantial vector sizes consume significant memory and computational resources - for instance, storing just one million 4,096-dimensional embeddings requires approximately 16GB of memory in float32 precision, making them expensive to deploy and slow to process at scale.\n", + "\n", + "Matryoshka addresses this challenge by training the Qwen3 models to frontload the most important semantic information into the first few dimensions, creating a hierarchy where earlier dimensions capture broad semantic meaning while later dimensions add progressively finer details. This design philosophy allows developers to choose the appropriate level of detail needed for specific tasks - you could truncate the 4,096-dimensional output from Qwen3-Embedding-8B down to 512 dimensions by passing a dimension value parameter during inference request.\n", + "\n", + "__Key Benefits for Qwen3 Embedding Deployment__ \n", + "Matryoshka embeddings offer several compelling advantages that make them particularly valuable for production applications using Qwen3 models. 
Storage and Speed Efficiency: For some use-cases, the technique can achieve [up to 14x smaller embedding sizes](https://arxiv.org/abs/2205.13147) while maintaining comparable accuracy, resulting in significant cost savings for storage and faster retrieval operations - particularly important when dealing with Qwen3's large dimensional outputs ranging from 1,024 to 4,096 dimensions. Flexible Performance Trade-offs: Organizations can scale their Qwen3 embedding solutions to match desired storage costs, processing speeds, and performance requirements by simply truncating embeddings to appropriate dimensions - for example, reducing Qwen3-Embedding-8B's 4,096 dimensions to 512 for memory-constrained environments.\n", + "\n", + "Deployment Simplicity: Unlike traditional dimensionality reduction techniques that require retraining or additional processing steps, Matryoshka optimization is built into the Qwen3 training process, eliminating extra pipeline complexity. Robust Performance Retention: Research demonstrates that Matryoshka models can preserve up to 98.37% of their original performance even when truncated to just 8.3% of the original embedding size ([ref](https://huggingface.co/blog/matryoshka)). This means you could potentially reduce Qwen3-Embedding-8B's 4,096 dimensions down to approximately 340 dimensions while retaining nearly all semantic quality - making them remarkably efficient for applications requiring real-time processing of large code corpora where the full dimensional output would be prohibitively expensive to deploy and manage.\n", + "## Contrastive Learning for Code Embeddings\n", + "Contrastive learning has emerged as a powerful training paradigm for embedding models, particularly effective for code representation learning. The core principle involves training the model to pull semantically similar code snippets closer together in the embedding space while pushing dissimilar ones apart.\n", + "\n", + "During training, the model learns from carefully constructed pairs: positive examples include functionally equivalent code written in different styles, refactored versions of the same function, or code snippets that solve the same problem using different approaches. Negative examples consist of code with different functionality or purpose. This approach is especially valuable for code embeddings because it helps the model distinguish between superficial syntactic differences and deeper semantic similarities-for instance, learning that ```for i in range(len(list))``` and ```for item in list``` represent similar iteration patterns despite different syntax.\n", + "\n", + "By optimizing contrastive objectives like [InfoNCE loss](https://paperswithcode.com/method/infonce) or triplet loss, the fine-tuned Qwen3-embedding model develops a more nuanced understanding of code semantics, resulting in embeddings that better capture functional relationships and enable more accurate code search, similarity detection, and retrieval-augmented generation tasks.\n", + "\n", + "__How Contrastive Learning Works in Practice__ \n", + "The contrastive learning framework trains models through structured comparison of code pairs within each training batch. 
Rather than learning from isolated examples, the model simultaneously processes multiple code-description pairs, learning to associate functionally equivalent implementations while differentiating them from unrelated code snippets.\n",
+ "\n",
+ "For example, the system learns to connect various sorting algorithm implementations-bubble sort, quicksort, merge sort-with descriptions like \"arranges array elements in ascending order\" while distinguishing these from functionally different operations like database queries or string manipulations. This approach leverages techniques such as Multiple Negatives Ranking Loss, where each positive code-description pair in a training batch serves as a negative example for all other pairs in the same batch. This creates dense training signals that efficiently teach the model to recognize functional similarity across different syntactic representations."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6a7bb224-0bd8-4541-a7a8-04b3fbebd0b0",
+ "metadata": {},
+ "source": [
+ "# Let's build\n",
+ "## Start with a few imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a3fb5237-b1e5-48d8-9a66-45544676ec47",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sagemaker\n",
+ "import boto3\n",
+ "import os\n",
+ "from datetime import datetime\n",
+ "from sagemaker.huggingface import HuggingFace, HuggingFaceModel\n",
+ "from sagemaker.s3 import S3Uploader"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "577e137c-10b5-445c-bfa5-46c54c9ff19a",
+ "metadata": {},
+ "source": [
+ "## Configure the SageMaker environment \n",
+ "Configure the IAM role for SageMaker so it can interact with services such as EC2, S3, etc.\n",
+ "We use the SageMaker default S3 bucket, which is created automatically if you don't specify one when using SageMaker's Python SDK."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ffa6713f-c674-46f9-a6ee-f85db2d0aace",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sess = sagemaker.Session()\n",
+ "\n",
+ "# S3 bucket information\n",
+ "s3_bucket = sess.default_bucket()\n",
+ "prefix = \"qwen3-embedding-code\"\n",
+ "s3_path = f\"s3://{s3_bucket}/{prefix}\"\n",
+ "\n",
+ "# Role for SageMaker to access S3, EC2 and other services\n",
+ "try:\n",
+ "    role = sagemaker.get_execution_role()\n",
+ "except ValueError:\n",
+ "    iam = boto3.client(\"iam\")\n",
+ "    role = iam.get_role(RoleName=\"sagemaker_execution_role\")[\"Role\"][\"Arn\"]\n",
+ "\n",
+ "print(f\"SageMaker role ARN: {role}\")\n",
+ "print(f\"SageMaker session bucket: {s3_path}\")\n",
+ "print(f\"AWS region: {sess.boto_region_name}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6e4c0a59-8450-42e7-93fe-99c84b8d593c",
+ "metadata": {},
+ "source": [
+ "## Create a training dataset\n",
+ "We'll create a training dataset specifically designed for code embedding tasks using code-description pairs that reflect real-world development scenarios.\n",
+ "The script that generates the code samples can be found at `scripts/dataset.py`.\n",
+ "The generated dataset is transformed to `jsonl` format and uploaded to S3, to be used later by the training job.\n",
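+ "\n",
+ "Each line of the resulting `train_dataset.jsonl` is one code-description pair; for illustration, a record built from one of the pairs in `scripts/dataset.py` would look like this:\n",
+ "```python\n",
+ "# One record of train_dataset.jsonl, before JSON serialization\n",
+ "record = {\n",
+ "    \"text1\": \"lambda x: x ** 2\",\n",
+ "    \"text2\": \"lambda function to calculate square of a number\",\n",
+ "}\n",
+ "```"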
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "649cdbcf-750d-4695-9481-d1647ca6bf35",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create and display dataset\n",
+ "from scripts.dataset import create_code_embedding_dataset\n",
+ "\n",
+ "train_dataset = create_code_embedding_dataset()\n",
+ "print(f\"Created training dataset with {len(train_dataset)} code-description pairs\")\n",
+ "print(\"Sample entries:\")\n",
+ "for i in range(5):\n",
+ "    print(f\" Code: {train_dataset[i]['text1'][:60]}...\")\n",
+ "    print(f\" Description: {train_dataset[i]['text2']}\")\n",
+ "    print()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e55ed584-25e8-4818-9a52-06bbfed8398c",
+ "metadata": {},
+ "source": [
+ "Prepare and upload the training dataset to S3 for SageMaker access"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0087c0ad-7f31-411a-a37c-6089a883d442",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Save dataset locally\n",
+ "train_dataset.to_json(\"train_dataset.jsonl\")\n",
+ "\n",
+ "# Upload training data to S3\n",
+ "training_input_path = S3Uploader.upload(\n",
+ "    local_path=\"train_dataset.jsonl\", desired_s3_uri=f\"{s3_path}/train\"\n",
+ ")\n",
+ "\n",
+ "os.remove(\"train_dataset.jsonl\")\n",
+ "print(f\"Training data uploaded to: {training_input_path}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "23d9d1c3-205c-466f-9e49-b2a6d0b4a4de",
+ "metadata": {},
+ "source": [
+ "## Training configurations\n",
+ "Set up the HuggingFace estimator with appropriate configurations for Qwen3-Embedding fine-tuning. \n",
+ "We use `requirements.txt` to override the pre-built [Hugging Face deep learning container](https://docs.aws.amazon.com/sagemaker/latest/dg/hugging-face.html) (DLC) with the package versions we need, because the current Hugging Face DLC doesn't satisfy `transformers>=4.51.0`. Overriding the pre-built container with `requirements.txt` lets us avoid writing a [custom container](https://docs.aws.amazon.com/sagemaker/latest/dg/docker-containers-adapt-your-own.html). \n",
+ "\n",
+ "The target_dimension hyperparameter serves to guide Matryoshka Loss on which embedding size to prioritize during multi-dimensional training. Although the base Qwen3-Embedding model produces full-length vectors (e.g., 1,024 dimensions for the 0.6B variant), specifying target_dimension=512 during training helps the model front-load its most critical semantic information into the first 512 dimensions. This ensures that, when truncated at inference time, the 512-dimensional slice retains maximal performance without requiring a separate projection step.\n",
+ "- Without this guidance, the model would distribute key features uniformly across all 1,024 dimensions, making any fixed truncation suboptimal.\n",
+ "- Training with a target_dimension directs Matryoshka Loss to weight those dimensions more heavily, improving downstream retrieval accuracy at 512 dimensions.\n",
+ "\n",
+ "__Note__: Training will take approximately 8 minutes.\n",
+ "### SageMaker Training Managed Warm Pools\n",
+ "Every time you start a SageMaker training job, the service provisions the training instances and pulls the specified Docker image (often a few GB in size). This is followed by downloading the model, which can also be a few GB in size. This alone can add tens of seconds to minutes of overhead per run.
SageMaker provides [AI Managed Warm Pools](https://docs.aws.amazon.com/sagemaker/latest/dg/train-warm-pools.html) to keep infrastructure - including the container image - “warm” between jobs, significantly reducing startup latency. \n",
+ "\n",
+ "__Warm Pools__ allow you to retain provisioned resources for a configurable period after a training job finishes. When a new training job with matching configuration parameters starts within the warm pool’s lifetime, SageMaker reuses the existing instances-and crucially, the already-pulled container image-avoiding repeated downloads.\n",
+ "You can opt in to warm pools through the SageMaker Python SDK by specifying `keep_alive_period_in_seconds` when creating your estimator. \n",
+ "\n",
+ "When you enable SageMaker AI Managed Warm Pools, the underlying EC2 instances and the pulled container image remain “warm” for the specified `keep_alive_period_in_seconds`, but your training job’s configuration-including hyperparameters and your training script-always comes from the new job request you submit. \n",
+ "For more detail, see the [Best practices for Amazon SageMaker Training Managed Warm Pools](https://aws.amazon.com/blogs/machine-learning/best-practices-for-amazon-sagemaker-training-managed-warm-pools/) blog.\n",
+ "\n",
+ "__Note__: If a training job is created with `keep_alive_period_in_seconds` specified, but you did not request a warm pool limit increase, then a warm pool is not retained after the completion of the training job. A warm pool is only created if your warm pool limit has sufficient resources."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cec8c77d-89c4-462a-b902-a588753760c3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Configure hyperparameters for Qwen3-Embedding-0.6B fine-tuning.
Will pass to train.py\n", + "hyperparameters = {\n", + " \"model_name\": \"Qwen/Qwen3-Embedding-0.6B\",\n", + " \"num_train_epochs\": 3,\n", + " \"per_device_train_batch_size\": 12,\n", + " \"learning_rate\": 2e-5,\n", + " \"warmup_ratio\": 0.1,\n", + " \"target_dimension\": 512,\n", + " \"logging_steps\": 100,\n", + " \"fp16\": True,\n", + " \"save_strategy\": \"no\", # disables saving checkpoints to reduce model packaging size (not for prod)\n", + "}\n", + "\n", + "# Create HuggingFace Estimator\n", + "# Note:\n", + "# - Qwen3 models require transformers>=4.51.0; Below code will result container with transformers_version='4.49.0'\n", + "# - The requirements.txt under training_code directory will be used by SageMaker to upgrade the framework to the required version\n", + "huggingface_estimator = HuggingFace(\n", + " entry_point=\"train.py\",\n", + " source_dir=\"training_code\", # Directory with train.py & requirements.txt\n", + " instance_type=\"ml.g4dn.xlarge\",\n", + " instance_count=1,\n", + " role=role,\n", + " transformers_version=\"4.49.0\", # Upgraded via requirements.txt\n", + " pytorch_version=\"2.5.1\",\n", + " py_version=\"py311\",\n", + " hyperparameters=hyperparameters,\n", + " base_job_name=\"qwen3-embed-finetune\", # Prefix for job naming\n", + " volume_size=100,\n", + " keep_alive_period_in_seconds=1800, # Retain provisioned resources for 30 minutes\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a7dbbcc-c708-4271-a7aa-1e6658651854", + "metadata": {}, + "outputs": [], + "source": [ + "# Start the fine-tuning job\n", + "print(\"Starting Qwen3-Embedding fine-tuning job...\")\n", + "print(f\"Timestamp: {datetime.now()}\")\n", + "\n", + "# Start the training job with the jsonl dataset location in S3\n", + "huggingface_estimator.fit({\"train\": training_input_path})\n", + "\n", + "training_job_name = huggingface_estimator.latest_training_job.name\n", + "model_data = huggingface_estimator.model_data\n", + "print(f\"Model artifacts location: {model_data}\")\n", + "print(f\"\\nTraining job launched: {training_job_name}\")" + ] + }, + { + "cell_type": "markdown", + "id": "97479e43-93a5-402f-aee2-75ad825c7a23", + "metadata": {}, + "source": [ + "## That's it\n", + "Our fine-tuned model is waiting for us in S3\n", + "## Let's deploy the model and put it to the test\n", + "Deploying a model with SageMaker's HuggingFaceModel involves a multi-stage process that begins on your local environment and completes in the AWS cloud. When you call the `.deploy()` method, SageMaker initiates a model repacking operation that downloads your model artifacts from S3, combines them with your custom inference code (`inference.py` and other files in the `inference_code` directory), and creates a consolidated model package. This repacking process occurs on your local environment - the notebook instance's root filesystem (overlay mount) in our case. This step requires sufficient disk space—typically 2-3 times the model size. After repacking, SageMaker uploads the combined artifact to a new S3 location and provisions the specified compute resources (`ml.g4dn.xlarge` in this example). \n", + "Once the infrastructure is ready, SageMaker pulls the appropriate Deep Learning Container (DLC) image based on your framework specifications (`transformers_version='4.49.0'`, `pytorch_version='2.6.0'`, `py_version='py312'`), downloads the repacked model artifact from S3, and extracts it to the container's file system. 
The container then initializes the model server, which loads your model into memory and executes your inference handler code specified in `entry_point='inference.py'`. This handler implements standardized functions like `model_fn()`, `input_fn()`, `predict_fn()`, and `output_fn()` that process incoming requests and return predictions. After successful initialization, SageMaker exposes the endpoint for real-time inference requests through a REST API, allowing you to invoke predictions using the returned predictor object.\n",
+ "\n",
+ "__Note__: Model repacking and endpoint deployment might take a long time. Be patient."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c03d9cb7-071c-4ff8-9096-d03cb0ce0d87",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create HuggingFace model for deployment\n",
+ "huggingface_model = HuggingFaceModel(\n",
+ "    model_data=model_data,\n",
+ "    role=role,\n",
+ "    entry_point=\"inference.py\",\n",
+ "    source_dir=\"./inference_code\",\n",
+ "    transformers_version=\"4.49.0\",\n",
+ "    pytorch_version=\"2.6.0\",\n",
+ "    py_version=\"py312\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "515fdab5-0695-4389-9a46-bb12c6bedd55",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Deploy the model\n",
+ "predictor = huggingface_model.deploy(\n",
+ "    initial_instance_count=1,\n",
+ "    instance_type=\"ml.g4dn.xlarge\",\n",
+ "    endpoint_name=\"qwen3-embedding-code-finetune\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dabc8dfb-6917-47f1-8db2-27810fdc58a6",
+ "metadata": {},
+ "source": [
+ "## Predictions\n",
+ "Create a `Predictor` from the deployed endpoint name. This lets us build a predictor without needing to run the deployment API again. \n",
+ "The endpoint supports two operations:\n",
+ "* encode - Outputs an embedding for the given input and dimension\n",
+ "* similarity - Outputs a similarity score for the given text1 & text2, along with their embeddings"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ba4f6091-d90f-495e-816e-58b1ad203985",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sagemaker\n",
+ "from sagemaker.deserializers import JSONDeserializer\n",
+ "from sagemaker.serializers import JSONSerializer\n",
+ "\n",
+ "predictor = sagemaker.Predictor(\n",
+ "    endpoint_name=\"qwen3-embedding-code-finetune\",\n",
+ "    sagemaker_session=sagemaker.Session(),\n",
+ "    serializer=JSONSerializer(),\n",
+ "    deserializer=JSONDeserializer(),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ca2aaa61-b6db-462c-8407-ff57ba0bb851",
+ "metadata": {},
+ "source": [
+ "### Similarity tests with dynamic embedding dimensions\n",
+ "By adjusting the dimension parameter, you can see how truncating the same underlying embedding model at 1024, 768, 512, 256, or 128 dimensions influences both the cosine similarity score and the returned embedding length. This dynamic resizing is a direct benefit of the Matryoshka training approach, which frontloads semantic information into the earliest vector dimensions. We encourage you to run these examples, observe how similarity varies, and then craft your own code pairs-experimenting with related and unrelated snippets to deepen your intuition about embedding truncation and performance.\n",
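+ "\n",
+ "For example, a small sweep over the `dimension` parameter for one fixed pair could look like the sketch below (it assumes the endpoint above is still deployed; the `similarity` field comes from the response schema in `inference.py`):\n",
+ "```python\n",
+ "pair = {\n",
+ "    \"operation\": \"similarity\",\n",
+ "    \"text1\": \"def add(a, b): return a + b\",\n",
+ "    \"text2\": \"function that returns the sum of two numbers\",\n",
+ "}\n",
+ "for dim in [128, 256, 512, 1024]:\n",
+ "    response = predictor.predict({**pair, \"dimension\": dim})\n",
+ "    print(dim, round(response[\"similarity\"], 4))\n",
+ "```"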
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b0ffb3b-ff51-4494-acd2-aea04f2fa723", + "metadata": {}, + "outputs": [], + "source": [ + "# Encode a Python function at 128 dimensions\n", + "predictor.predict(\n", + " {\"operation\": \"encode\", \"inputs\": [\"def quicksort(arr): return sorted(arr)\"], \"dimension\": 128}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f02b19a-a7da-4c24-9662-b1d3b870ca16", + "metadata": {}, + "outputs": [], + "source": [ + "# Perfect match at minimal dimension\n", + "predictor.predict(\n", + " {\n", + " \"operation\": \"similarity\",\n", + " \"text1\": \"def add(a, b): return a + b\",\n", + " \"text2\": \"def add(a, b): return a + b\",\n", + " \"dimension\": 128,\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcf72aba-fa73-4484-a2b3-2e452b077b29", + "metadata": {}, + "outputs": [], + "source": [ + "# Dissimilar snippets comparison\n", + "predictor.predict(\n", + " {\n", + " \"operation\": \"similarity\",\n", + " \"text1\": \"print('Hello World')\",\n", + " \"text2\": \"SELECT * FROM users\",\n", + " \"dimension\": 512,\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe841fea-5a36-4ba7-89ec-c84bdc67bcc0", + "metadata": {}, + "outputs": [], + "source": [ + "# Compute similarity for the same functionality, but different programming languages\n", + "# Test the effect of non-equivalent function names\n", + "factorial_py = \"def factorial(n): return 1 if n <= 1 else n * factorial(n-1)\"\n", + "factorial_java = \"public static int factorial(int n) {\\n if (n <= 1) return 1;\\n return n * factorial(n - 1);\\n}\"\n", + "predictor.predict(\n", + " {\"operation\": \"similarity\", \"text1\": factorial_py, \"text2\": factorial_java, \"dimension\": 128}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "01c7a0ce-98a1-42f1-9126-cbadef6ca055", + "metadata": {}, + "source": [ + "## Notebook CI Test Results\n", + "\n", + "This notebook was tested in multiple regions. The test results are as follows, except for us-west-2 which is shown at the top of the notebook.\n", + "\n", + "\n", + "![This us-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-1/build_and_train_models|sm-finetune_qwen_for_code_embedding|sm-training_qwen3_embedding_for_coding_tasks.ipynb)\n", + "\n", + "![This us-east-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-2/build_and_train_models|sm-finetune_qwen_for_code_embedding|sm-training_qwen3_embedding_for_coding_tasks.ipynb)\n", + "\n", + "![This us-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-1/build_and_train_models|sm-finetune_qwen_for_code_embedding|sm-training_qwen3_embedding_for_coding_tasks.ipynb)\n", + "\n", + "![This ca-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ca-central-1/build_and_train_models|sm-finetune_qwen_for_code_embedding|sm-training_qwen3_embedding_for_coding_tasks.ipynb)\n", + "\n", + "![This sa-east-1 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/sa-east-1/build_and_train_models|sm-finetune_qwen_for_code_embedding|sm-training_qwen3_embedding_for_coding_tasks.ipynb)\n", + "\n", + "![This eu-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-1/build_and_train_models|sm-finetune_qwen_for_code_embedding|sm-training_qwen3_embedding_for_coding_tasks.ipynb)\n", + "\n", + "![This eu-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-2/build_and_train_models|sm-finetune_qwen_for_code_embedding|sm-training_qwen3_embedding_for_coding_tasks.ipynb)\n", + "\n", + "![This eu-west-3 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-3/build_and_train_models|sm-finetune_qwen_for_code_embedding|sm-training_qwen3_embedding_for_coding_tasks.ipynb)\n", + "\n", + "![This eu-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-central-1/build_and_train_models|sm-finetune_qwen_for_code_embedding|sm-training_qwen3_embedding_for_coding_tasks.ipynb)\n", + "\n", + "![This eu-north-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-north-1/build_and_train_models|sm-finetune_qwen_for_code_embedding|sm-training_qwen3_embedding_for_coding_tasks.ipynb)\n", + "\n", + "![This ap-southeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-1/build_and_train_models|sm-finetune_qwen_for_code_embedding|sm-training_qwen3_embedding_for_coding_tasks.ipynb)\n", + "\n", + "![This ap-southeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-2/build_and_train_models|sm-finetune_qwen_for_code_embedding|sm-training_qwen3_embedding_for_coding_tasks.ipynb)\n", + "\n", + "![This ap-northeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-1/build_and_train_models|sm-finetune_qwen_for_code_embedding|sm-training_qwen3_embedding_for_coding_tasks.ipynb)\n", + "\n", + "![This ap-northeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-2/build_and_train_models|sm-finetune_qwen_for_code_embedding|sm-training_qwen3_embedding_for_coding_tasks.ipynb)\n", + "\n", + "![This ap-south-1 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-south-1/build_and_train_models|sm-finetune_qwen_for_code_embedding|sm-training_qwen3_embedding_for_coding_tasks.ipynb)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cd4e3fa-26ec-4f12-8008-8659e89fb547", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ build_and_train_models/sm-finetune_qwen_for_code_embedding/training_code/requirements.txt b/ build_and_train_models/sm-finetune_qwen_for_code_embedding/training_code/requirements.txt new file mode 100644 index 0000000000..04cc6a2878 --- /dev/null +++ b/ build_and_train_models/sm-finetune_qwen_for_code_embedding/training_code/requirements.txt @@ -0,0 +1,2 @@ +transformers>=4.51.0 +sentence-transformers>=3.0.0 \ No newline at end of file diff --git a/ build_and_train_models/sm-finetune_qwen_for_code_embedding/training_code/train.py b/ build_and_train_models/sm-finetune_qwen_for_code_embedding/training_code/train.py new file mode 100644 index 0000000000..f7f5f3f0a7 --- /dev/null +++ b/ build_and_train_models/sm-finetune_qwen_for_code_embedding/training_code/train.py @@ -0,0 +1,99 @@ +import argparse +import os +import logging +from datasets import load_dataset +from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer +from sentence_transformers.losses import MultipleNegativesRankingLoss, MatryoshkaLoss +from sentence_transformers.training_args import SentenceTransformerTrainingArguments + +# Initialize logger +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser() + + # SageMaker channels and directories + # SM_CHANNEL_TRAIN is a preconfigured environment variable that points to the directory where training data is made + # available to your training script. SageMaker uses channels to organize input data; each channel corresponds to a named + # input location in Amazon S3 that SageMaker downloads into the container before running your entry script. + # This ensures the script dynamically references the correct data location without hardcoding S3 paths. 
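+ # SM_MODEL_DIR points to the directory (by default /opt/ml/model) whose contents SageMaker
+ # packages as model.tar.gz and uploads to S3 when the training job completes.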
+ parser.add_argument("--train", type=str, default=os.getenv("SM_CHANNEL_TRAIN")) + parser.add_argument("--model_dir", type=str, default=os.getenv("SM_MODEL_DIR")) + + # Delegated hyperparameters + parser.add_argument("--model_name", type=str, required=True) + parser.add_argument("--num_train_epochs", type=int, default=3) + parser.add_argument("--per_device_train_batch_size", type=int, default=16) + parser.add_argument("--learning_rate", type=float, default=2e-5) + parser.add_argument("--warmup_ratio", type=float, default=0.1) + parser.add_argument("--target_dimension", type=int, default=512) + parser.add_argument("--logging_steps", type=int, default=100) + parser.add_argument("--fp16", type=lambda x: x.lower() in ("true", "1"), default=False) + parser.add_argument( + "--save_strategy", + type=str, + choices=["no", "steps", "epoch"], + default="no", + ) + args = parser.parse_args() + + # Load the base embedding model + logger.info(f"Loading model {args.model_name}") + model = SentenceTransformer(args.model_name) + + # Prepare the training dataset + logger.info(f"Loading dataset from {args.train}") + train_dataset = load_dataset( + "json", data_files=f"{args.train}/train_dataset.jsonl", split="train" + ) + + """ + Construct Matryoshka dimensions + Rather than hard-coding, dynamically build and sort dimensions: + This approach adapts automatically to any new base model size and ensures correct hierarchical ordering for robust Matryoshka training. + When configuring matryoshka_dims for Matryoshka Representation Learning, the order of dimensions matters for how the model prioritizes embedding information. Sorting ensures a logical progression from smallest to largest subspace, which aids training stability and performance. + """ + full_dim = model.get_sentence_embedding_dimension() + dims = [full_dim] + sorted(d for d in [768, args.target_dimension, 256, 128] if d < full_dim) + logger.info(f"Matryoshka dimensions: {dims}") + + # Define loss functions + inner_loss = MultipleNegativesRankingLoss(model) + train_loss = MatryoshkaLoss( + model=model, + loss=inner_loss, + matryoshka_dims=dims, + matryoshka_weights=[2.0 if d == args.target_dimension else 1.0 for d in dims], + ) + + # Configure training arguments + training_args = SentenceTransformerTrainingArguments( + output_dir="/opt/ml/model", + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + learning_rate=args.learning_rate, + warmup_ratio=args.warmup_ratio, + fp16=args.fp16, + logging_steps=args.logging_steps, + eval_strategy="no", + gradient_checkpointing=True, + save_strategy=args.save_strategy, + ) + + # Initialize and run trainer + trainer = SentenceTransformerTrainer( + model=model, args=training_args, train_dataset=train_dataset, loss=train_loss + ) + + logger.info("Starting training") + trainer.train() + + # Save the fine-tuned model + logger.info(f"Saving model to {args.model_dir}") + model.save_pretrained(args.model_dir) + + +if __name__ == "__main__": + main()