5 changes: 5 additions & 0 deletions .changeset/bright-stars-smoke.md
@@ -0,0 +1,5 @@
---
"gradio": minor
---

feat:Allow applying `gr.cache()` to intermediate functions directly

Copilot AI Apr 23, 2026

The changeset summary line is missing a space after `feat:`; most tooling that parses conventional commits expects `feat: ...` (with a space). Updating this improves changelog readability and consistency.

Suggested change
feat:Allow applying `gr.cache()` to intermediate functions directly
feat: Allow applying `gr.cache()` to intermediate functions directly

155 changes: 155 additions & 0 deletions demo/cache_intermediate_demo/run.py
@@ -0,0 +1,155 @@
"""Demo showcasing runtime gr.cache(fn)(...) for an intermediate helper call."""

import re
import time

import gradio as gr

POLICIES = {
    "Billing": [
        {
            "title": "Refund windows",
            "text": "Customers can request a refund within 30 days of purchase. "
            "Annual subscriptions can be prorated if the request arrives after the "
            "first 30 days but before day 90.",
        },
        {
            "title": "Invoice corrections",
            "text": "Billing agents can correct invoice email addresses, company "
            "names, and tax identifiers, but cannot alter the purchase date.",
        },
        {
            "title": "Duplicate charges",
            "text": "If two charges appear within 24 hours for the same plan and "
            "customer account, billing should confirm card fingerprint and issue a "
            "same-day reversal.",
        },
    ],
    "IT": [
        {
            "title": "Password resets",
            "text": "IT can revoke all active sessions and force a password reset "
            "after verifying the employee through the HR directory.",
        },
        {
            "title": "VPN access",
            "text": "New VPN access is approved only for employees with manager "
            "approval and a registered MFA device.",
        },
        {
            "title": "Laptop replacement",
            "text": "Broken laptops should be tagged with the device asset number "
            "and shipped to the repair center before a replacement is issued.",
        },
    ],
    "HR": [
        {
            "title": "Parental leave",
            "text": "Full-time employees are eligible for 16 weeks of paid parental "
            "leave after six months of employment.",
        },
        {
            "title": "Address changes",
            "text": "Employees should update their home address in the HR portal "
            "before payroll closes on the 20th of each month.",
        },
        {
            "title": "Interview scheduling",
"text": "Recruiters should provide interview panels at least 48 hours "
"to review candidate materials before the session starts.",
},
],
}

TONE_PREFIX = {
    "Concise": "Give a short answer.",
    "Friendly": "Give a warm, helpful answer.",
    "Formal": "Give a professional and policy-focused answer.",
}


def _tokens(text: str) -> set[str]:
    return set(re.findall(r"[a-z0-9]+", text.lower()))


def retrieve_passages(question: str, team: str) -> list[dict[str, str | int]]:
    """Pretend retrieval is the expensive deterministic step."""
    time.sleep(2)
    query_tokens = _tokens(question)
    ranked = []
    for doc in POLICIES[team]:
        combined_text = f"{doc['title']} {doc['text']}"
        score = len(query_tokens & _tokens(combined_text))
        ranked.append(
            {
                "title": doc["title"],
                "text": doc["text"],
                "score": score,
            }
        )

    ranked.sort(key=lambda item: item["score"], reverse=True)
    return ranked[:2]


def draft_answer(question: str, team: str, tone: str) -> tuple[str, str, str]:
    start = time.time()

    passages = gr.cache(retrieve_passages)(question, team)
    top_match = passages[0]
    bullets = "\n".join(
        f"- {match['title']}: {match['text']}" for match in passages
    )

    answer = (
        f"{TONE_PREFIX[tone]}\n\n"
        f"For a {team.lower()} request about '{question}', the strongest match is "
        f"**{top_match['title']}**.\n\n"
        f"Suggested reply: Based on the current {team.lower()} policy, {top_match['text']}"
    )
    debug = (
        f"Retrieved {len(passages)} passages in {time.time() - start:.2f}s.\n"
        "Try the same question twice, or change only the tone. "
        "The retrieval helper should be reused from cache."
    )
    return answer, bullets, debug


with gr.Blocks(title="Intermediate gr.cache() Demo") as demo:
    gr.Markdown(
        "# Intermediate `gr.cache()` Demo\n"
        "This simulates a support assistant where the **retrieval** step is expensive "
        "but deterministic, while the final answer still recomputes. "
        "Submit the same question twice, or change only the tone, and watch Gradio "
        "show the `used cache` badge when the cached helper is reused."
    )

    with gr.Row():
        with gr.Column():
            team = gr.Dropdown(
                choices=list(POLICIES.keys()),
                value="Billing",
                label="Team",
            )
            tone = gr.Dropdown(
                choices=["Concise", "Friendly", "Formal"],
                value="Friendly",
                label="Answer style",
            )
            question = gr.Textbox(
                label="Customer question",
                lines=3,
                value="Can this customer get a refund after being charged twice?",
            )
    draft = gr.Markdown()
    retrieved = gr.Markdown()
    debug = gr.Textbox(label="Debug", lines=3)
    gr.Button("Draft reply").click(
        draft_answer,
        [question, team, tone],
        outputs=[draft, retrieved, debug],
    )


if __name__ == "__main__":
    demo.launch()
74 changes: 67 additions & 7 deletions gradio/caching.py
@@ -254,6 +254,14 @@ def __len__(self) -> int:


_per_session_stores: weakref.WeakSet[_CacheStore] = weakref.WeakSet()
# We need this store because runtime `gr.cache(fn)(...)` may be evaluated on
# every request, and we want one shared wrapper/cache per function+config
# instead of recreating an empty cache each time.
_cache_wrappers: dict[tuple[Any, ...], Callable] = {}
# We need this lock because requests can resolve `gr.cache(fn)` concurrently,
# and wrapper creation must be atomic so only one shared wrapper/cache is
# installed for a given cache key.
_runtime_cache_lock = threading.Lock()


def _normalize_kwargs(signature: inspect.Signature, args: tuple, kwargs: dict) -> dict:
@@ -331,7 +339,11 @@ def clear_session_caches(session_hash: str | None) -> None:


def _make_wrapper(
    func: Callable, store: _CacheStore, key: Callable | None = None
    func: Callable,
    store: _CacheStore,
    key: Callable | None = None,
    *,
    track_cache_hits: bool = False,
) -> Callable:
    signature = inspect.signature(func)

@@ -344,6 +356,10 @@ def _on_miss():
        if _probe_mode_active.get():
            raise CacheMissError()

    def _on_hit():
        if track_cache_hits and not _probe_mode_active.get():
            mark_manual_cache_hit()

    if inspect.isgeneratorfunction(func):

        @functools.wraps(func)
@@ -352,6 +368,7 @@ def sync_gen_wrapper(*args, **kwargs):
            key_hash = _compute_hash(normalized)
            entry = store.get(key_hash)
            if entry is not None:
                _on_hit()
                yield from entry["yields"]
                return
            _on_miss()
@@ -371,6 +388,7 @@ async def async_gen_wrapper(*args, **kwargs):
            key_hash = _compute_hash(normalized)
            entry = store.get(key_hash)
            if entry is not None:
                _on_hit()
                for value in entry["yields"]:
                    yield value
                return
@@ -391,6 +409,7 @@ async def async_wrapper(*args, **kwargs):
            key_hash = _compute_hash(normalized)
            entry = store.get(key_hash)
            if entry is not None:
                _on_hit()
                return entry["value"]
            _on_miss()
            result = await func(**normalized)
@@ -407,6 +426,7 @@ def sync_wrapper(*args, **kwargs):
        key_hash = _compute_hash(normalized)
        entry = store.get(key_hash)
        if entry is not None:
            _on_hit()
            return entry["value"]
        _on_miss()
        result = func(**normalized)
@@ -416,6 +436,38 @@ def sync_wrapper(*args, **kwargs):
    return sync_wrapper


def _get_cached_wrapper(
    func: Callable,
    *,
    key: Callable | None,
    max_size: int,
    max_memory: str | int | None,
    per_session: bool,
) -> Callable:
    from gradio.utils import _parse_file_size

    registry_key = (
        id(func),
        id(key) if key is not None else None,
        max_size,
        _parse_file_size(max_memory),
        per_session,
    )
    with _runtime_cache_lock:
        wrapper = _cache_wrappers.get(registry_key)
        if wrapper is None:
            store = _make_store(max_size, max_memory, per_session)
            wrapper = _make_wrapper(
                func,
                store,
                key=key,
                track_cache_hits=True,
            )
            wrapper.cache = store  # type: ignore
            _cache_wrappers[registry_key] = wrapper
        return wrapper


@document()
def cache(
    fn: Callable | None = None,
@@ -426,9 +478,9 @@ def cache(
    per_session: bool = False,
):
    """
    Decorator that auto-caches function results based on content-hashed inputs. Works with sync/async functions and sync/async generators. For generators, all yielded values are cached and replayed on hit. Cache hits bypass the Gradio queue.
    Decorator that auto-caches function results based on content-hashed inputs. Works with sync/async functions and sync/async generators. For generators, all yielded values are cached and replayed on hit. Cache hits bypass the Gradio queue. It can also be called at runtime as `gr.cache(fn)(*args)` to cache intermediate helper calls.
    Parameters:
        fn: The function to cache. When used as @gr.cache without parentheses, this is the decorated function. When used as @gr.cache(...), this is None.
        fn: The function to cache. When used as @gr.cache without parentheses, this is the decorated function. When used as @gr.cache(...), this is None. When used as `gr.cache(fn)(...)`, this must be a callable.
        key: Optional function that receives the kwargs dict and returns a hashable cache key, e.g. to only cache based on the prompt, pass in: lambda kw: kw["prompt"]
        max_size: Maximum number of cache entries. Least-recently-used entries are evicted when full. Set to 0 for unlimited. Default: 128.
        max_memory: Maximum total memory usage before eviction. Accepts strings like "512mb", "2gb" or integer bytes. When exceeded, least-recently-used entries are evicted. If None, no memory limit is applied. If both max_size and max_memory are set, the cache will evict entries when either limit is reached.
@@ -442,14 +494,22 @@ def classify(image):
        def generate(prompt):
            return llm(prompt)
    """
    store = _make_store(max_size, max_memory, per_session)

    def decorator(func: Callable) -> Callable:
        wrapper = _make_wrapper(func, store, key=key)
        wrapper.cache = store  # type: ignore
        return wrapper
        return _get_cached_wrapper(
            func,
            key=key,
            max_size=max_size,
            max_memory=max_memory,
            per_session=per_session,
        )

    if fn is not None:
        if not callable(fn):
            raise TypeError(
                "gr.cache(...) expected a callable when used at runtime. "
                "Use gr.cache(fn)(*args) instead of gr.cache(fn(*args))."
            )
        return decorator(fn)
    return decorator

17 changes: 17 additions & 0 deletions guides/04_additional-features/17_caching.md
@@ -67,6 +67,23 @@ print(len(generate.cache))

When a queued event is served from `@gr.cache`, Gradio shows a small `from cache` timing badge in the UI which appears temporarily in the relevant output components.

### Caching intermediate helper calls

You can also apply `gr.cache()` to a callable at runtime to cache an intermediate step inside a larger Gradio callback:

```python
def embed(text):
    return embedding_model(text)

def predict(text):
    embedding = gr.cache(embed, per_session=True)(text)
    return rerank(embedding)
```

This is especially useful when only part of your function is deterministic or reusable. Runtime `gr.cache(fn)(...)` reuses the same cache store for repeated calls to that helper and shows the same `used cache` badge as `gr.Cache()` (see below) when a cached result is reused during a request.
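
Because the wrapper is keyed on the helper function and its cache options, it is fine to resolve `gr.cache(...)` inside the event handler on every request: Gradio reuses one wrapper and one store per helper/configuration instead of creating a fresh cache each time. Here is a minimal sketch, reusing the hypothetical `embed` and `rerank` helpers from above and assuming the runtime wrapper exposes the same `.cache` mapping as the decorator form:

```python
def predict(text):
    cached_embed = gr.cache(embed, per_session=True)  # same shared wrapper on every call
    embedding = cached_embed(text)                     # cache hit after the first call with this text
    print(len(cached_embed.cache))                     # inspect the shared store, like generate.cache earlier
    return rerank(embedding)
```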

`gr.cache()` must wrap a callable. If you accidentally write `gr.cache(fn(...))`, Gradio raises a `TypeError` telling you to use `gr.cache(fn)(...)` instead.
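
For instance, a quick sketch of the failure mode with the hypothetical `embed` helper from above:

```python
# Wrong: embed("hello") runs first, so gr.cache receives a result
# rather than a callable and raises a TypeError.
embedding = gr.cache(embed("hello"))

# Right: wrap the callable, then call the wrapper with the arguments.
embedding = gr.cache(embed)("hello")
```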

## Manual cache control with `gr.Cache()`

For full control over what gets cached and when, use `gr.Cache()` as an injectable parameter (like `gr.Progress`). Gradio injects the same instance on every call, giving you a thread-safe `get`/`set` interface: