|
| 1 | +import os |
| 2 | +from typing import Annotated, List |
| 3 | +from fastapi import Request, HTTPException, Form |
| 4 | + |
| 5 | +import modal |
| 6 | +import torch |
| 7 | +import torch.nn.functional as F |
| 8 | +from torch import Tensor |
| 9 | +from transformers import AutoTokenizer, AutoModel |
| 10 | + |
# Container image for the embedding endpoint: Debian slim + Python 3.11 with
# the pip packages listed below.
# NOTE(review): `torch` is imported by this module but is not listed here
# explicitly; presumably it arrives as a transitive dependency — confirm.
# NOTE(review): nothing in the visible code uses flashinfer directly; verify
# that the package and its extra index URL are still required.
_PIP_PACKAGES = (
    "transformers",
    "huggingface_hub[hf_transfer]==0.26.2",
    "flashinfer-python==0.2.0.post2",  # pinning, very unstable
    "fastapi[standard]",
)

image = modal.Image.debian_slim(python_version="3.11")
image = image.pip_install(
    *_PIP_PACKAGES,
    extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
)
image = image.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
| 22 | + |
# NOTE(review): MODELS_DIR is not referenced anywhere in the visible code —
# confirm whether it is still needed.
MODELS_DIR: str = "/llamas"

# Hugging Face Hub repository id of the embedding model.
MODEL_NAME: str = "intfloat/multilingual-e5-large-instruct"

# Commit hash intended to pin the exact model snapshot.
MODEL_REVISION: str = "84344a23ee1820ac951bc365f1e91d094a911763"
| 26 | + |
# Named Modal volumes, created on first use if they do not exist yet.
# They are mounted into the container by the `embed` function's `volumes=`.
hf_cache_vol = modal.Volume.from_name(
    "huggingface-cache",
    create_if_missing=True,
)
vllm_cache_vol = modal.Volume.from_name(
    "vllm-cache",
    create_if_missing=True,
)

# The Modal application that owns the embedding endpoint.
app = modal.App("intfloat-multilingual-e5-large-instruct-embeddings")
| 31 | + |
| 32 | + |
def get_device():
    """Pick the torch device to run inference on.

    Returns:
        ``torch.device("cuda")`` when CUDA is available, otherwise
        ``torch.device("cpu")``.
    """
    if torch.cuda.is_available():
        device_name = "cuda"
    else:
        device_name = "cpu"
    return torch.device(device_name)
| 35 | + |
# Process-wide cache of the loaded model and its device, filled on first use.
_MODEL_CACHE = None


def load_model():
    """Return the tokenizer, model and device used for embedding.

    The model weights are loaded once per process and reused on later calls,
    so a request handler that calls this function on every request does not
    reload the weights each time. Both the tokenizer and the model are loaded
    from ``MODEL_NAME`` pinned to ``MODEL_REVISION``.

    Returns:
        A ``(tokenizer, model, device)`` tuple. ``model`` already lives on
        ``device`` and is in eval mode.
    """
    global _MODEL_CACHE

    if _MODEL_CACHE is None:
        print("Loading model...")
        device = get_device()
        print(f"Using device: {device}")

        model = AutoModel.from_pretrained(
            MODEL_NAME, revision=MODEL_REVISION
        ).to(device)
        model.eval()
        print("Model loaded successfully.")

        # Two concurrent first calls may both load the model; the last
        # assignment wins, which is no worse than loading on every call.
        _MODEL_CACHE = (model, device)

    model, device = _MODEL_CACHE

    # The tokenizer is deliberately created per call rather than cached:
    # callers may run concurrently, and sharing one fast tokenizer instance
    # across threads is not assumed to be safe here.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, revision=MODEL_REVISION)

    return tokenizer, model, device
| 46 | + |
| 47 | + |
# Number of GPUs requested per container (interpolated into the `gpu=` spec).
N_GPU: int = 1

# Seconds per minute; multiply by it to express durations in seconds.
MINUTES: int = 60

# NOTE(review): VLLM_PORT is not referenced in the visible code — confirm
# whether it is still needed.
VLLM_PORT: int = 8000
| 51 | + |
| 52 | + |
def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Mean-pool token vectors over the unmasked positions of each sequence.

    Args:
        last_hidden_states: Per-token hidden states; dimension 1 is the
            token axis that gets reduced.
        attention_mask: Mask aligned with the leading dimensions of
            ``last_hidden_states``; non-zero marks a token to include.

    Returns:
        ``last_hidden_states`` summed over dimension 1 with masked positions
        zeroed, divided by the number of unmasked tokens per sequence. A
        sequence with no unmasked tokens yields an all-zero vector rather
        than NaN.
    """
    keep = attention_mask[..., None].bool()
    summed = last_hidden_states.masked_fill(~keep, 0.0).sum(dim=1)
    # Clamp so a fully masked row divides by 1 instead of 0 (0 / 0 -> NaN).
    token_counts = attention_mask.sum(dim=1)[..., None].clamp(min=1)
    return summed / token_counts
| 57 | + |
| 58 | + |
@app.function(
    image=image,
    gpu=f"L40S:{N_GPU}",
    # gpu=f"A10G:{N_GPU}",
    # how long should we stay up with no requests?
    scaledown_window=3 * MINUTES,
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
    secrets=[modal.Secret.from_name("document-qa-embedding-key")],
)
@modal.concurrent(
    max_inputs=5
)  # how many requests can one replica handle? tune carefully!
@modal.fastapi_endpoint(method="POST")
def embed(request: Request, text: Annotated[str, Form()]):
    """Embed each non-blank line of ``text`` and return the vectors.

    The caller must send an ``x-api-key`` header equal to the ``API_KEY``
    environment variable.

    Args:
        request: Incoming request; only its ``x-api-key`` header is read.
        text: Form field holding the input; it is split on ``"\\n"`` and
            lines that are empty after stripping whitespace are skipped.

    Returns:
        A list with one L2-normalized embedding (a list of floats) per
        non-blank line, in input order; an empty list when there are none.

    Raises:
        HTTPException: 401 when the API key header is missing or wrong.
        RuntimeError: Re-raised after logging when tokenization or the
            forward pass fails (for example CUDA out of memory).
    """
    import hmac

    api_key = request.headers.get("x-api-key")
    expected_key = os.environ["API_KEY"]

    # Compare in constant time so response timing does not leak how much of
    # the key matched. Both sides are encoded to bytes because
    # compare_digest only accepts ASCII-only str values.
    if api_key is None or not hmac.compare_digest(
        api_key.encode("utf-8"), expected_key.encode("utf-8")
    ):
        raise HTTPException(status_code=401, detail="Unauthorized")

    texts = [t for t in text.split("\n") if t.strip()]
    if not texts:
        return []

    tokenizer, model, device = load_model()
    model.eval()

    print(f"Start embedding {len(texts)} texts")
    try:
        with torch.no_grad():
            # Move inputs to the same device as model
            batch_dict = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
            batch_dict = {k: v.to(device) for k, v in batch_dict.items()}

            # Forward pass
            outputs = model(**batch_dict)

            # Mean-pool over real tokens, then L2-normalize each vector
            embeddings = average_pool(
                outputs.last_hidden_state,
                batch_dict['attention_mask']
            )
            embeddings = F.normalize(embeddings, p=2, dim=1)

            # Move to CPU and convert to nested lists for serialization
            embeddings = embeddings.cpu().tolist()

        print("Finished embedding texts.")
        return embeddings

    except RuntimeError as e:
        print(f"Error during embedding: {str(e)}")
        if "CUDA out of memory" in str(e):
            print("CUDA out of memory error. Try reducing batch size or using a smaller model.")
        raise
0 commit comments