move to modal.com for embeddings as well

lfoppiano · lfoppiano · commit 9897f0eff400 · 2025-06-21T17:14:05.000+02:00
diff --git a/document_qa/custom_embeddings.py b/document_qa/custom_embeddings.py
@@ -0,0 +1,58 @@
+from typing import List
+import requests
+from langchain_core.embeddings import Embeddings
+
+
+class ModalEmbeddings(Embeddings):
+    """LangChain ``Embeddings`` client that POSTs texts to a remote HTTP embedding endpoint."""
+
+    def __init__(self, url: str, model_name: str, api_key: str = None):
+        self.url = url  # endpoint receiving the form-encoded POST
+        self.model_name = model_name  # only reported back by get_model_name()
+        self.api_key = api_key  # sent as the 'x-api-key' header when truthy
+
+    def embed(self, text: List[str]) -> List[List[float]]:
+        """Return one vector per entry of ``text``; raise ValueError if the counts differ."""
+        if not text:
+            return []  # nothing to embed: skip the network round trip
+
+        # All texts travel as ONE newline-delimited form field, so newlines inside
+        # a text are flattened to spaces to keep exactly one text per line.
+        payload = {'text': "\n".join(t.replace("\n", " ") for t in text)}
+        headers = {'x-api-key': self.api_key} if self.api_key else {}
+
+        response = requests.post(self.url, data=payload, files=[], headers=headers)
+        response.raise_for_status()  # HTTP errors (e.g. 401) become exceptions
+
+        vectors = response.json()
+        # A blank text yields a blank line, which the receiving side may drop; that
+        # would silently pair every following vector with the wrong text.
+        if len(vectors) != len(text):
+            raise ValueError(
+                f"Embedding endpoint returned {len(vectors)} vectors for {len(text)} texts"
+            )
+        return vectors
+
+    def embed_documents(self, text: List[str]) -> List[List[float]]:
+        """Embed a list of documents using the embedding model."""
+        return self.embed(text)
+
+    def embed_query(self, text: str) -> List[float]:
+        """Embed a single query and return its vector."""
+        return self.embed([text])[0]
+
+    def get_model_name(self) -> str:
+        """Return the model name given at construction."""
+        return self.model_name
+
+
+if __name__ == "__main__":
+    # Manual smoke test: embed two sample sentences and print the raw response.
+    sample_sentences = ["We are surrounded by stupid kids",
+                        "We are interested in the future of AI"]
+    client = ModalEmbeddings(
+        url="https://lfoppiano--intfloat-multilingual-e5-large-instruct-embed-5da184.modal.run/",
+        model_name="intfloat/multilingual-e5-large-instruct"
+    )
+
+    print(client.embed(sample_sentences))
diff --git a/document_qa/deployment/modal_embeddings.py b/document_qa/deployment/modal_embeddings.py
@@ -0,0 +1,117 @@
+import os
+from typing import Annotated, List
+from fastapi import Request, HTTPException, Form
+
+import modal
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+from transformers import AutoTokenizer, AutoModel
+
+image = (  # container image the endpoint function below is declared with
+    modal.Image.debian_slim(python_version="3.11")
+    .pip_install(
+        "transformers",  # NOTE(review): torch is imported at the top of this module but is not listed here; confirm it is installed transitively
+        "huggingface_hub[hf_transfer]==0.26.2",
+        "flashinfer-python==0.2.0.post2",  # pinning, very unstable
+        "fastapi[standard]",
+        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
+    )
+    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
+)
+
+MODELS_DIR = "/llamas"  # NOTE(review): not referenced anywhere else in this file
+MODEL_NAME = "intfloat/multilingual-e5-large-instruct"
+MODEL_REVISION = "84344a23ee1820ac951bc365f1e91d094a911763"  # NOTE(review): never passed to from_pretrained in this file, so the pin has no effect
+
+hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)  # mounted at /root/.cache/huggingface below
+vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)  # mounted at /root/.cache/vllm below
+
+app = modal.App("intfloat-multilingual-e5-large-instruct-embeddings")
+
+
+def get_device():  # CUDA when torch reports it as available, CPU otherwise
+    return torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+def load_model():
+    """Load the MODEL_NAME tokenizer and model, move the model to get_device(), return both plus the device."""
+    print("Loading model...")
+    device = get_device()
+    print(f"Using device: {device}")
+
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)  # module constant instead of a repeated literal
+    model = AutoModel.from_pretrained(MODEL_NAME).to(device)
+    print("Model loaded successfully.")
+    return tokenizer, model, device
+
+
+N_GPU = 1  # GPU count interpolated into the gpu= spec of the endpoint function below
+MINUTES = 60  # seconds
+VLLM_PORT = 8000  # NOTE(review): not referenced anywhere else in this file
+
+
+def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
+    """Zero the positions where the mask is 0, sum over dim 1, divide by the mask's sum over dim 1."""
+    summed = last_hidden_states.masked_fill(~attention_mask.unsqueeze(-1).bool(), 0.0).sum(dim=1)
+    return summed / attention_mask.sum(dim=1).unsqueeze(-1)
+
+
+@app.function(
+    image=image,
+    gpu=f"L40S:{N_GPU}",
+    # gpu=f"A10G:{N_GPU}",
+    # how long should we stay up with no requests?
+    scaledown_window=3 * MINUTES,
+    volumes={
+        "/root/.cache/huggingface": hf_cache_vol,
+        "/root/.cache/vllm": vllm_cache_vol,
+    },
+    secrets=[modal.Secret.from_name("document-qa-embedding-key")]
+)
+@modal.concurrent(
+    max_inputs=5
+)  # how many requests can one replica handle? tune carefully!
+@modal.fastapi_endpoint(method="POST")
+def embed(request: Request, text: Annotated[str, Form()]):
+    """Embed each non-blank line of the ``text`` form field.
+
+    Returns one L2-normalised vector per kept line (``[]`` if none); replies 401 unless the
+    ``x-api-key`` header equals the ``API_KEY`` environment variable."""
+    import hmac  # function-scope import: only this handler needs it
+
+    api_key = request.headers.get("x-api-key")
+    expected_key = os.environ["API_KEY"]
+    # Constant-time comparison so response timing does not reveal how much of the key
+    # matched; bytes are compared because compare_digest rejects non-ASCII str.
+    if api_key is None or not hmac.compare_digest(api_key.encode(), expected_key.encode()):
+        raise HTTPException(status_code=401, detail="Unauthorized")
+
+    texts = [t for t in text.split("\n") if t.strip()]
+    if not texts:
+        return []
+
+    # NOTE(review): the model is loaded again on every request; consider loading once per container.
+    tokenizer, model, device = load_model()
+    model.eval()
+
+    print(f"Start embedding {len(texts)} texts")
+    try:
+        with torch.no_grad():
+            batch_dict = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
+            # Move inputs to the same device as model
+            batch_dict = {k: v.to(device) for k, v in batch_dict.items()}
+            outputs = model(**batch_dict)
+            # Masked mean pooling, then scale each row to unit L2 norm
+            embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
+            embeddings = F.normalize(embeddings, p=2, dim=1)
+
+            # Move to CPU and convert to list for serialization
+            embeddings = embeddings.cpu().numpy().tolist()
+
+        print("Finished embedding texts.")
+        return embeddings
+    except RuntimeError as e:
+        print(f"Error during embedding: {str(e)}")
+        if "CUDA out of memory" in str(e):
+            print("CUDA out of memory error. Try reducing batch size or using a smaller model.")
+        raise
diff --git a/streamlit_app.py b/streamlit_app.py
@@ -6,10 +6,10 @@
 import dotenv
 from grobid_quantities.quantities import QuantitiesAPI
 from langchain.memory import ConversationBufferMemory
-from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpointEmbeddings
 from langchain_openai import ChatOpenAI
 from streamlit_pdf_viewer import pdf_viewer
 
+from document_qa.custom_embeddings import ModalEmbeddings
 from document_qa.ner_client_generic import NERClientGeneric
 
 dotenv.load_dotenv(override=True)
@@ -19,11 +19,11 @@
 from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations
 
 API_MODELS = {
-    "microsoft/Phi-4-mini-instruct": os.environ["MODAL_1_URL"]
+    "microsoft/Phi-4-mini-instruct": os.environ["LM_URL"]
 }
 
 API_EMBEDDINGS = {
-    'intfloat/multilingual-e5-large-instruct': 'intfloat/multilingual-e5-large-instruct'
+    'intfloat/multilingual-e5-large-instruct-modal': os.environ['EMBEDS_URL']
 }
 
 if 'rqa' not in st.session_state:
@@ -112,6 +112,7 @@ def new_file():
     st.session_state['loaded_embeddings'] = None
     st.session_state['doc_id'] = None
     st.session_state['uploaded'] = True
+    st.session_state['annotations'] = []
     if st.session_state['memory']:
         st.session_state['memory'].clear()
 
@@ -133,8 +134,10 @@ def init_qa(model_name, embeddings_name):
         api_key=os.environ.get('API_KEY')
     )
 
-    embeddings = HuggingFaceEndpointEmbeddings(
-        repo_id=API_EMBEDDINGS[embeddings_name]
+    embeddings = ModalEmbeddings(
+        url=API_EMBEDDINGS[embeddings_name],
+        model_name=embeddings_name,
+        api_key=os.environ.get('EMBEDS_API_KEY')
     )
 
     storage = DataStorage(embeddings)
@@ -195,7 +198,7 @@ def play_old_messages(container):
     st.markdown("Upload a scientific article in PDF, ask questions, get insights.")
     st.markdown(
         ":warning: [Usage disclaimer](https://github.com/lfoppiano/document-qa?tab=readme-ov-file#disclaimer-on-data-security-and-privacy-%EF%B8%8F) :warning: ")
-    st.markdown("Powered by [Huggingface](https://huggingface.co) and [Modal.com](https://modal.com/)")
+    st.markdown("LM and Embeddings are powered by [Modal.com](https://modal.com/)")
 
     st.divider()
     st.session_state['model'] = model = st.selectbox(