33"""
44
55from dataclasses import dataclass
6- from datetime import datetime
76from functools import singledispatchmethod
87from io import BytesIO
8+ import json
99from logging import getLogger
1010from pathlib import Path
1111import tempfile
@@ -56,7 +56,7 @@ class GeminiModels(StrEnum):
 
     gemini_15_pro = "gemini-1.5-pro"
     gemini_15_flash = "gemini-1.5-flash-002"
-    gemini_20_flash = "gemini-2.0-flash"
+    gemini_20_flash = "gemini-2.0-flash-001"
     gemini_20_flash_lite = "gemini-2.0-flash-lite-001"
 
 
@@ -115,23 +115,29 @@ def _execute_multi_turn_req(req: MultiTurnRequest) -> str:
         raise ValueError("Only the first message can have file(s)")
 
     # Prepare Inputs. Use context caching for media
-    paths = filepaths(msg=req.messages[0])
-    use_caching = req.use_context_caching and is_long_enough_to_cache(paths)
+    client = create_client()
+    contents = [convert_to_gemini_format(msg) for msg in req.messages]
+
+    files: list[Path] = filepaths(msg=req.messages[0])
+    use_caching = req.use_context_caching and is_long_enough_to_cache(files)
     if use_caching:
-        cached_content, blobs = cache_content(req.model_name, tuple(paths))
-    else:
+        # Assume caching was done before
+        cached_content, success = get_cached_content(client, req.model_name, files)
+        blobs = []
+        if not success:
+            cached_content, blobs = cache_content(client, req.model_name, files)
+    else:  # Add files to the content
+        blobs = upload_files(files=files)
+        contents = [*blobs_to_parts(blobs), *contents]
         cached_content = None
-        blobs = upload_files(files=paths)
-    contents = [convert_to_gemini_format(msg) for msg in req.messages]
 
     # Call Gemini
-    client = create_client()
     response: GenerateContentResponse = _call_gemini(
         client, req, contents, cached_content
     )
 
     # Cleanup
-    if req.delete_files_after_use and not use_caching:
+    if req.delete_files_after_use:
         delete_blobs(blobs)
     return response.text
 
@@ -161,26 +167,37 @@ def video_duration_in_sec(filename: Path) -> float:
     return duration
 
 
-@ttl_cache(ttl=60 * 60)
 def cache_content(
-    model_id: str, paths: list[Path]
+    client: genai.Client, model_id: str, paths: list[Path], ttl: str = f"{60 * 20}s"
 ) -> tuple[CachedContent, list[storage.Blob]]:
     """Caches the content on Google as described here: https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache/context-cache-create"""
     logger.info("Caching content for paths: %s", paths)
-    client = create_client()
     blobs = upload_files(files=paths)
     parts = blobs_to_parts(blobs)
     content = Content(role="user", parts=parts)
-    cached_content = client.caches.create(
-        model=model_id,
-        config=CreateCachedContentConfig(
-            contents=[content],
-            display_name="multiturn cache for req at %s" % datetime.now(),
-        ),
+    config = CreateCachedContentConfig(
+        contents=[content], display_name=cache_id(model_id, paths), ttl=ttl
     )
+    cached_content = client.caches.create(model=model_id, config=config)
     return cached_content, blobs
 
 
+def cache_id(model_id: str, paths: list[Path]) -> str:
+    return json.dumps(dict(model=model_id, paths=str(paths)))
+
+
+def get_cached_content(
+    client: genai.Client, model_id: str, paths: list[Path]
+) -> tuple[CachedContent, bool]:
+    for cache in client.caches.list():
+        if cache.display_name == cache_id(model_id, paths):
+            logger.info(
+                "Found cached content for model_id='%s' and paths='%s'", model_id, paths
+            )
+            return cache, True
+    return None, False
+
+
 def convert_to_gemini_format(msg: Message) -> tuple[Content, list[storage.Blob]]:
     role_map = dict(user="user", assistant="model")
     role = role_map[msg.role]
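
Note on the lookup introduced above (illustrative snippet, not part of the diff): get_cached_content recovers an earlier cache entry purely by comparing display_name strings, so cache_id must be deterministic for a given model and file list. A minimal sketch of the key it produces, using invented file names:

    import json
    from pathlib import Path

    # Same construction as cache_id() above: a JSON object over the model id and
    # the str() of the path list, so identical inputs always yield identical keys.
    paths = [Path("clip_a.mp4"), Path("clip_b.mp4")]
    key = json.dumps(dict(model="gemini-2.0-flash-001", paths=str(paths)))
    # On a POSIX system this produces:
    # {"model": "gemini-2.0-flash-001", "paths": "[PosixPath('clip_a.mp4'), PosixPath('clip_b.mp4')]"}
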
@@ -324,6 +341,7 @@ class GeminiAPI(LLM):
     model_id: str = GeminiModels.gemini_20_flash_lite
     max_output_tokens: int = 1000
     use_context_caching: bool = False
+    delete_files_after_use: bool = True
 
     requires_gpu_exclusively = False
     model_ids = available_models
@@ -333,13 +351,23 @@ def complete_msgs(self, msgs: list[Message]) -> str:
             msg = msgs[0]
             paths = filepaths(msg)
             req = SingleTurnRequest(
-                model_name=self.model_id, media_files=paths, prompt=msg.msg
+                model_name=self.model_id,
+                media_files=paths,
+                prompt=msg.msg,
+                max_output_tokens=self.max_output_tokens,
+                delete_files_after_use=self.delete_files_after_use,
             )
         else:
+            delete_files_after_use = self.delete_files_after_use
+            if self.use_context_caching:
+                delete_files_after_use = False
+
             req = MultiTurnRequest(
                 model_name=self.model_id,
                 messages=msgs,
                 use_context_caching=self.use_context_caching,
+                max_output_tokens=self.max_output_tokens,
+                delete_files_after_use=delete_files_after_use,
             )
         return req.fetch_media_description()
 
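
Taken together, the updated entry point can be exercised roughly as follows. This is an illustrative sketch, not part of the diff: GeminiAPI and GeminiModels are names from the code above, but the keyword-argument construction and the sample values are assumptions.

    # Hypothetical usage of the changed configuration (values are invented).
    llm = GeminiAPI(
        model_id=GeminiModels.gemini_20_flash,
        max_output_tokens=1000,
        use_context_caching=True,     # media from the first message is cached on Vertex AI
        delete_files_after_use=True,  # complete_msgs() forces this to False for cached multi-turn runs
    )
    description = llm.complete_msgs(msgs)  # msgs: list[Message], only the first message carries files

The design point is that when use_context_caching is enabled, complete_msgs overrides delete_files_after_use to False for the multi-turn request, presumably so the uploaded blobs referenced by the still-live cache entry are not deleted immediately after the call.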