diff --git a/BLOG.md b/BLOG.md
index 9232a20..38c4f20 100644
--- a/BLOG.md
+++ b/BLOG.md
@@ -8,7 +8,7 @@ By leveraging the raw power of the **Snapdragon X Elite** and its high-performan
 
 ## 🚀 The Vision: AI at the Edge
 
-Most AI-integrated IDEs rely on heavy cloud APIs. This introduces latency, subscription costs, and—most importantly—privacy concerns. **Nova IDE** flips the script. It uses **MediaPipe GenAI** and **WebGPU** to run Large Language Models (LLMs) locally.
+Most AI-integrated IDEs rely on heavy cloud APIs. This introduces latency, subscription costs, and—most importantly—privacy concerns. **Nova IDE** flips the script. It uses **LiteRT LM** and **WebGPU** to run Large Language Models (LLMs) locally.
 
 When you run Nova IDE on a machine powered by the **Snapdragon X Elite**, you're not just running a web app; you're utilizing one of the most efficient NPU/GPU architectures ever designed for portable computing.
 
@@ -29,7 +29,7 @@ graph TD
     end
 
     subgraph AI_Runtime ["AI Execution Layer"]
-        MP["MediaPipe GenAI (WASM)"]
+        MP["LiteRT LM (WASM)"]
         WG["WebGPU API"]
     end
 
@@ -47,8 +47,8 @@ graph TD
     MP -->|Streaming Tokens| UI
 ```
 
-### 1. The Inference Engine (MediaPipe + WebGPU)
-At the heart of Nova IDE is the MediaPipe GenAI runtime. Unlike traditional JavaScript which runs on the CPU, Nova IDE uses **WebGPU** to talk directly to the **Qualcomm Adreno GPU**. 
+### 1. The Inference Engine (LiteRT LM + WebGPU)
+At the heart of Nova IDE is the LiteRT LM runtime. Unlike traditional JavaScript, which runs on the CPU, Nova IDE uses **WebGPU** to talk directly to the **Qualcomm Adreno GPU**.
 - **WebGPU** allows for massively parallel tensor operations required by transformers.
 - On the Snapdragon X Elite, the Adreno GPU provides the floating-point performance needed to generate tokens at lightning speed, rivaling cloud-based solutions.
 
@@ -74,9 +74,9 @@ The Snapdragon X Elite is a breakthrough for web-based AI. While Intel and AMD h
 
 When you click **"⚡ Load Local AI Model"** in Nova IDE, the following sequence occurs:
 
-1. **WASM Initialization**: The IDE loads the MediaPipe GenAI WebAssembly runtime.
+1. **WASM Initialization**: The IDE loads the LiteRT LM WebAssembly runtime.
 2. **GPU Adapter Request**: The browser requests a WebGPU adapter. On an X Elite machine, this identifies the **Qualcomm Adreno GPU**.
-3. **Model Loading**: The model (in `.task` format) is fetched into a `SharedArrayBuffer`.
+3. **Model Loading**: The model (in `.litertlm` format) is fetched into a `SharedArrayBuffer`.
 4. **GPU Compilation**: The model's computation graph is compiled into GPU-specific kernels.
 5. **Streaming Inference**: When you type a prompt, the tokens are generated on the Adreno GPU and streamed back to the UI in real-time.
 
diff --git a/README.md b/README.md
index 45b7984..d99872e 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # ⚡ Nova IDE
 
-Nova IDE is a lightweight, browser-based coding environment designed for the future of **on-device AI**. It leverages WebGPU and MediaPipe to run high-performance Large Language Models (LLMs) entirely within your browser—no cloud, no APIs, and total privacy.
+Nova IDE is a lightweight, browser-based coding environment designed for the future of **on-device AI**. It leverages WebGPU and LiteRT LM to run high-performance Large Language Models (LLMs) entirely within your browser—no cloud, no APIs, and total privacy.
 
 ![Nova IDE Screenshot](src/assets/hero.png)
 
@@ -60,7 +60,7 @@ Nova IDE is a lightweight, browser-based coding environment designed for the fut
 
 - **Core**: JavaScript (ESM), HTML5, CSS3
 - **Editor**: [CodeMirror 6](https://codemirror.net/)
-- **AI Engine**: [MediaPipe GenAI](https://ai.google.dev/edge/mediapipe/solutions/genai/llm_inference)
+- **AI Engine**: [LiteRT LM](https://ai.google.dev/edge/litert-lm/js)
 - **Bundler**: [Vite](https://vitejs.dev/)
 - **Storage**: IndexedDB (Virtual File System)
 
diff --git a/package-lock.json b/package-lock.json
index d89dfb7..a367c52 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,11 +1,11 @@
 {
-  "name": "local-ide-litertjs",
+  "name": "nova-ide",
   "version": "0.0.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
-      "name": "local-ide-litertjs",
+      "name": "nova-ide",
       "version": "0.0.0",
       "dependencies": {
         "@codemirror/autocomplete": "^6.20.2",
@@ -15,7 +15,7 @@
         "@codemirror/lang-python": "^6.2.1",
         "@codemirror/lint": "^6.9.6",
         "@codemirror/theme-one-dark": "^6.1.3",
-        "@mediapipe/tasks-genai": "^0.10.27",
+        "@litert-lm/core": "^0.12.1",
         "codemirror": "^6.0.2"
       },
       "devDependencies": {
@@ -174,29 +174,6 @@
         "w3c-keyname": "^2.2.4"
       }
     },
-    "node_modules/@emnapi/core": {
-      "version": "1.10.0",
-      "resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.10.0.tgz",
-      "integrity": "sha512-yq6OkJ4p82CAfPl0u9mQebQHKPJkY7WrIuk205cTYnYe+k2Z8YBh11FrbRG/H6ihirqcacOgl2BIO8oyMQLeXw==",
-      "dev": true,
-      "license": "MIT",
-      "optional": true,
-      "dependencies": {
-        "@emnapi/wasi-threads": "1.2.1",
-        "tslib": "^2.4.0"
-      }
-    },
-    "node_modules/@emnapi/runtime": {
-      "version": "1.10.0",
-      "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.10.0.tgz",
-      "integrity": "sha512-ewvYlk86xUoGI0zQRNq/mC+16R1QeDlKQy21Ki3oSYXNgLb45GV1P6A0M+/s6nyCuNDqe5VpaY84BzXGwVbwFA==",
-      "dev": true,
-      "license": "MIT",
-      "optional": true,
-      "dependencies": {
-        "tslib": "^2.4.0"
-      }
-    },
     "node_modules/@emnapi/wasi-threads": {
       "version": "1.2.1",
       "resolved": "https://registry.npmjs.org/@emnapi/wasi-threads/-/wasi-threads-1.2.1.tgz",
@@ -276,18 +253,27 @@
         "@lezer/lr": "^1.0.0"
       }
     },
+    "node_modules/@litert-lm/core": {
+      "version": "0.12.1",
+      "resolved": "https://registry.npmjs.org/@litert-lm/core/-/core-0.12.1.tgz",
+      "integrity": "sha512-lLF+Hvg52TIN0ekHQ+RLpjyDt3pPuEqMFvE/6xYCvcnsJ3R03dz3GmVJ3mGgumPZ7gxc+iqLUF3Be5v5L1K1Xw==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@litertjs/wasm-utils": "^2.0.0"
+      }
+    },
+    "node_modules/@litertjs/wasm-utils": {
+      "version": "2.5.0",
+      "resolved": "https://registry.npmjs.org/@litertjs/wasm-utils/-/wasm-utils-2.5.0.tgz",
+      "integrity": "sha512-zhMAqJRJ3ROi48flZxYx+K2MiMllJVuH7oeumpSIfQMBeOb6JyLV/7ltLbY6E+nERUAfNwzIBqjslWAeXcO6iQ==",
+      "license": "Apache-2.0"
+    },
     "node_modules/@marijn/find-cluster-break": {
       "version": "1.0.2",
       "resolved": "https://registry.npmjs.org/@marijn/find-cluster-break/-/find-cluster-break-1.0.2.tgz",
       "integrity": "sha512-l0h88YhZFyKdXIFNfSWpyjStDjGHwZ/U7iobcK1cQQD8sejsONdQtTVU+1wVN1PBw40PiiHB1vA5S7VTfQiP9g==",
       "license": "MIT"
     },
-    "node_modules/@mediapipe/tasks-genai": {
-      "version": "0.10.27",
-      "resolved": "https://registry.npmjs.org/@mediapipe/tasks-genai/-/tasks-genai-0.10.27.tgz",
-      "integrity": "sha512-cv69CPPAtEDBUs6dGZft2S+sBqde1XvEMST367siSyxrhffdWtm4uQIsfdedAbhJ33BwAjuMnAdxDrO9WrzIAQ==",
-      "license": "Apache-2.0"
-    },
     "node_modules/@napi-rs/wasm-runtime": {
       "version": "1.1.4",
       "resolved": "https://registry.npmjs.org/@napi-rs/wasm-runtime/-/wasm-runtime-1.1.4.tgz",
@@ -949,6 +935,7 @@
       "integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "engines": {
         "node": ">=12"
       },
diff --git a/package.json b/package.json
index dc9c9dd..61e02b8 100644
--- a/package.json
+++ b/package.json
@@ -20,7 +20,7 @@
     "@codemirror/lang-python": "^6.2.1",
     "@codemirror/lint": "^6.9.6",
     "@codemirror/theme-one-dark": "^6.1.3",
-    "@mediapipe/tasks-genai": "^0.10.27",
+    "@litert-lm/core": "^0.12.1",
     "codemirror": "^6.0.2"
   }
 }
diff --git a/packages/web-agent-core/README.md b/packages/web-agent-core/README.md
index 0d35c6c..cda5a2b 100644
--- a/packages/web-agent-core/README.md
+++ b/packages/web-agent-core/README.md
@@ -2,7 +2,7 @@
 
 A purely in-browser, WebGPU-accelerated, zero-dependency autonomous ReAct (Reason+Act) Agent Framework.
 
-Built for local-first, privacy-respecting AI applications using MediaPipe GenAI and lightweight models like Gemma.
+Built for local-first, privacy-respecting AI applications using LiteRT LM and lightweight models like Gemma.
 
 ## Features
 - **Zero Backend**: Runs entirely in the browser using WebGPU.
@@ -41,7 +41,7 @@ const myTools = [
 // Your LLM object just needs a \`generateRaw(history, onToken)\` method
 const aiEngine = {
   generateRaw: async (prompt, onToken) => {
-    // Call MediaPipe or Transformers.js here
+    // Call LiteRT LM, Transformers.js, or another local model runtime here
     // stream tokens to onToken(token)
   }
 };
diff --git a/packages/web-agent-core/package.json b/packages/web-agent-core/package.json
index 1137a15..00d82f7 100644
--- a/packages/web-agent-core/package.json
+++ b/packages/web-agent-core/package.json
@@ -15,7 +15,7 @@
     "agent",
     "llm",
     "webgpu",
-    "mediapipe",
+    "litert-lm",
     "react",
     "autonomous",
     "browser",
@@ -26,4 +26,4 @@
   "publishConfig": {
     "registry": "https://npm.pkg.github.com"
   }
-}
\ No newline at end of file
+}
diff --git a/packages/web-agent-core/website/docs/intro.md b/packages/web-agent-core/website/docs/intro.md
index 9589fb6..dd17a23 100644
--- a/packages/web-agent-core/website/docs/intro.md
+++ b/packages/web-agent-core/website/docs/intro.md
@@ -6,7 +6,7 @@ sidebar_position: 1
 
 Welcome to **Nova Web Agent Core**!
 
-This package provides a purely in-browser, WebGPU-accelerated, zero-dependency autonomous ReAct (Reason+Act) Agent Framework. It is built for local-first, privacy-respecting AI applications using MediaPipe GenAI and lightweight models like Gemma.
+This package provides a purely in-browser, WebGPU-accelerated, zero-dependency autonomous ReAct (Reason+Act) Agent Framework. It is built for local-first, privacy-respecting AI applications using LiteRT LM and lightweight models like Gemma.
 
 ## Why this framework?
 
@@ -80,7 +80,7 @@ Your LLM wrapper only needs one method: `generateRaw(prompt, onTokenCallback)`.
 ```javascript
 const aiEngine = {
   generateRaw: async (prompt, onToken) => {
-    // Call MediaPipe, Transformers.js, or even a cloud API here
+    // Call LiteRT LM, Transformers.js, or even a cloud API here
     // stream tokens to onToken(token)
   }
 };
diff --git a/packages/web-agent-core/website/src/components/HomepageFeatures/index.js b/packages/web-agent-core/website/src/components/HomepageFeatures/index.js
index 13e00da..4457077 100644
--- a/packages/web-agent-core/website/src/components/HomepageFeatures/index.js
+++ b/packages/web-agent-core/website/src/components/HomepageFeatures/index.js
@@ -17,7 +17,7 @@ const FeatureList = [
     icon: '🧠',
     description: (
       <>
-        Powered by the cutting-edge WebGPU backend and MediaPipe, execute your agent's reasoning loops completely client-side without API keys.
+        Powered by the cutting-edge WebGPU backend and LiteRT LM, execute your agent's reasoning loops completely client-side without API keys.
       </>
     ),
   },
diff --git a/src/ai-engine.js b/src/ai-engine.js
index 01df7ad..d3eaca0 100644
--- a/src/ai-engine.js
+++ b/src/ai-engine.js
@@ -1,6 +1,6 @@
 /**
- * ai-engine.js — Local AI Inference wrapper
- * Uses @mediapipe/tasks-genai for on-device WebGPU inference.
+ * ai-engine.js - Local AI inference wrapper.
+ * Uses LiteRT LM for on-device WebGPU inference.
  */
 
 const MODEL_STATUS = {
@@ -12,13 +12,26 @@ const MODEL_STATUS = {
   GENERATING: 'generating',
 };
 
+const SYSTEM_PROMPT = 'You are an expert coding assistant inside an IDE. Provide concise, correct code and explanations. Use markdown code blocks for code snippets.';
+const MIN_MODEL_BYTES = 1024 * 1024;
+const MODEL_DOWNLOAD_MEMORY_LIMIT_BYTES = 1024 * 1024 * 1024;
+const LITERT_LM_CONFIG = {
+  maxTokens: 8192,
+  samplerParams: {
+    k: 40,
+    temperature: 0.7,
+    seed: 42,
+  },
+};
+
 class AIEngine {
   constructor() {
-    this.llmInference = null;
+    this.engine = null;
+    this.conversation = null;
     this.status = MODEL_STATUS.IDLE;
     this.statusMessage = 'No model loaded';
     this.listeners = new Set();
-    this.modelName = 'Local AI Engine';
+    this.modelName = 'LiteRT LM';
     this._opfsRoot = null;
   }
 
@@ -29,19 +42,23 @@ class AIEngine {
     return this._opfsRoot;
   }
 
-  /** Check if a model exists in OPFS cache */
+  /** Check if a model exists in OPFS cache. */
   async getCachedModel(filename) {
     try {
       const root = await this._getOpfs();
       const fileHandle = await root.getFileHandle(filename);
       const file = await fileHandle.getFile();
+      if (file.size < MIN_MODEL_BYTES) {
+        await root.removeEntry(filename);
+        return null;
+      }
       return file;
     } catch (e) {
       return null;
     }
   }
 
-  /** Save a buffer to OPFS cache */
+  /** Save a buffer to OPFS cache. */
   async saveToCache(filename, buffer) {
     try {
       const root = await this._getOpfs();
@@ -55,7 +72,54 @@ class AIEngine {
     }
   }
 
-  /** Delete a model from OPFS cache */
+  /**
+   * Stream a response body directly into OPFS without buffering the model in RAM.
+   * Resolves with the cached File; if streaming fails or the result is truncated, the partial file is deleted and an error is thrown.
+   */
+  async saveResponseToCache(filename, response, onProgress) {
+    const root = await this._getOpfs();
+    const fileHandle = await root.getFileHandle(filename, { create: true });
+    const writable = await fileHandle.createWritable();
+    const reader = response.body.getReader();
+    const total = Number.parseInt(response.headers.get('content-length') || '0', 10);
+    let received = 0;
+
+    try {
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) break;
+
+        await writable.write(value);
+        received += value.length;
+
+        if (onProgress) onProgress(received, total);
+        const receivedMb = (received / 1048576).toFixed(0);
+        const suffix = total ? ` / ${(total / 1048576).toFixed(0)} MB` : ' MB';
+        this._setStatus(MODEL_STATUS.DOWNLOADING, `Downloading... ${receivedMb}${suffix}`);
+      }
+    } catch (err) {
+      // Best-effort cleanup: stop the download too; a failing abort()/cancel() must not mask err.
+      await writable.abort().catch(() => {});
+      await reader.cancel().catch(() => {});
+      await this.deleteFromCache(filename);
+      throw err;
+    }
+
+    await writable.close();
+    const file = await fileHandle.getFile();
+    const contentEncoding = (response.headers.get('content-encoding') || 'identity').toLowerCase();
+    // Content-Length counts encoded bytes, so only compare it when the body is not compressed.
+    const canValidateLength = total > 0 && contentEncoding === 'identity';
+
+    if (file.size < MIN_MODEL_BYTES || (canValidateLength && file.size !== total)) {
+      await this.deleteFromCache(filename);
+      const detail = canValidateLength ? ` (${file.size} of ${total} bytes)` : '';
+      throw new Error(`Downloaded model is incomplete${detail}. Please retry.`);
+    }
+    return file;
+  }
+
+  /** Delete a model from OPFS cache. */
   async deleteFromCache(filename) {
     try {
       const root = await this._getOpfs();
@@ -63,7 +127,7 @@ class AIEngine {
     } catch (e) {}
   }
 
-  /** Subscribe to status changes */
+  /** Subscribe to status changes. */
   onStatusChange(fn) {
     this.listeners.add(fn);
     return () => this.listeners.delete(fn);
@@ -84,59 +148,70 @@ class AIEngine {
     this._emit();
   }
 
+  /** Release the chat conversation; it is detached first so an overlapping call cannot delete it twice. */
+  async _deleteConversation() {
+    const conversation = this.conversation;
+    this.conversation = null;
+    if (conversation) await conversation.delete();
+  }
+
+  /** Release the conversation, then the engine; the engine is detached before its delete() is awaited. */
+  async _deleteEngine() {
+    await this._deleteConversation();
+    const engine = this.engine;
+    this.engine = null; // Cleared first so a second caller arriving mid-delete sees nothing to free.
+    if (engine) await engine.delete();
+  }
+
+  async _createConversation() {
+    if (!this.engine) {
+      throw new Error('Model not loaded');
+    }
+
+    return this.engine.createConversation({
+      sessionConfig: {
+        samplerParams: LITERT_LM_CONFIG.samplerParams,
+        maxOutputTokens: LITERT_LM_CONFIG.maxTokens,
+      },
+      preface: {
+        messages: [
+          { role: 'system', content: SYSTEM_PROMPT },
+        ],
+      },
+    });
+  }
+
   /**
-   * Load model from a URL or user-uploaded file.
+   * Load model from a URL, OPFS File, or user-uploaded file.
    * @param {string|File} modelSource - URL string or File object
    */
   async loadModel(modelSource) {
     try {
-      this._setStatus(MODEL_STATUS.LOADING, 'Importing AI Engine…');
-
-      // Dynamic import so initial page load is fast
-      const genai = await import('@mediapipe/tasks-genai');
-      const { FilesetResolver, LlmInference } = genai;
-
-      this._setStatus(MODEL_STATUS.LOADING, 'Loading AI runtime…');
+      this._setStatus(MODEL_STATUS.LOADING, 'Importing LiteRT LM...');
 
-      // Initialize the WASM fileset resolver (required by MediaPipe)
-      const genaiFileset = await FilesetResolver.forGenAiTasks(
-        'https://cdn.jsdelivr.net/npm/@mediapipe/tasks-genai@latest/wasm'
-      );
-
-      this._setStatus(MODEL_STATUS.LOADING, 'Initializing LLM engine…');
+      const { Engine } = await import('@litert-lm/core');
 
-      let modelAssetPath = null;
-      let modelAssetBuffer = null;
+      await this._deleteEngine();
 
-      if (typeof modelSource === 'string') {
-        modelAssetPath = modelSource;
-        this._setStatus(MODEL_STATUS.DOWNLOADING, 'Downloading model…');
-      } else if (modelSource instanceof File) {
-        this._setStatus(MODEL_STATUS.LOADING, 'Reading model file…');
-        const buf = await modelSource.arrayBuffer();
-        modelAssetBuffer = new Uint8Array(buf);
+      let model = modelSource;
+      if (modelSource instanceof File) {
+        this._setStatus(MODEL_STATUS.LOADING, 'Reading LiteRT LM model file...');
+        model = modelSource.stream();
+      } else if (typeof modelSource === 'string') {
+        this._setStatus(MODEL_STATUS.DOWNLOADING, 'Preparing model download...');
       }
 
-      this._setStatus(MODEL_STATUS.LOADING, 'Compiling model for WebGPU…');
+      this._setStatus(MODEL_STATUS.LOADING, 'Initializing LiteRT LM engine...');
 
-      const options = {
-        baseOptions: {},
-        maxTokens: 8192,
-        topK: 40,
-        temperature: 0.7,
-        randomSeed: 42,
-      };
+      this.engine = await Engine.create({
+        model,
+        mainExecutorSettings: {
+          maxNumTokens: LITERT_LM_CONFIG.maxTokens,
+        },
+      });
 
-      if (modelAssetPath) {
-        options.baseOptions.modelAssetPath = modelAssetPath;
-      }
-      if (modelAssetBuffer) {
-        options.baseOptions.modelAssetBuffer = modelAssetBuffer;
-      }
-
-      // FilesetResolver must be passed as the first argument
-      this.llmInference = await LlmInference.createFromOptions(genaiFileset, options);
-      this._setStatus(MODEL_STATUS.READY, 'Model ready — on-device inference active');
+      this.conversation = await this._createConversation();
+      this._setStatus(MODEL_STATUS.READY, 'Model ready - LiteRT LM on-device inference active');
     } catch (err) {
       console.error('[AIEngine] Load error:', err);
       this._setStatus(MODEL_STATUS.ERROR, `Failed: ${err.message}`);
@@ -144,41 +219,62 @@ class AIEngine {
     }
   }
 
+  _chunkText(chunk) {
+    if (!chunk) return '';
+    if (typeof chunk.content === 'string') return chunk.content;
+    if (!Array.isArray(chunk.content)) return '';
+
+    return chunk.content
+      .filter(item => item && item.type === 'text' && typeof item.text === 'string')
+      .map(item => item.text)
+      .join('');
+  }
+
+  async _streamMessage(conversation, message, onToken) {
+    let fullResponse = '';
+    const stream = conversation.sendMessageStreaming(message);
+
+    for await (const chunk of stream) {
+      const text = this._chunkText(chunk);
+      if (!text) continue;
+      fullResponse += text;
+      if (onToken) onToken(fullResponse);
+    }
+
+    return fullResponse;
+  }
+
+  _messagesFromGemmaTranscript(rawPrompt) {
+    const turnRegex = /<start_of_turn>(user|model)\n([\s\S]*?)(?:<end_of_turn>|$)/g;
+    const messages = [];
+    let match;
+
+    while ((match = turnRegex.exec(rawPrompt)) !== null) {
+      const role = match[1] === 'model' ? 'assistant' : 'user';
+      const content = match[2].trim();
+      if (!content) continue;
+      messages.push({ role, content });
+    }
+
+    return messages.length ? messages : rawPrompt;
+  }
+
   /**
-   * Generate a response (streaming).
+   * Generate a response in the main chat conversation.
    * @param {string} prompt
-   * @param {(partial: string) => void} onToken - called with each partial result
+   * @param {(partial: string) => void} onToken - called with accumulated text
    * @returns {Promise<string>} full response
    */
   async generate(prompt, onToken) {
-    if (!this.llmInference) {
+    if (!this.conversation) {
       throw new Error('Model not loaded');
     }
 
-    this._setStatus(MODEL_STATUS.GENERATING, 'Generating…');
+    this._setStatus(MODEL_STATUS.GENERATING, 'Generating...');
 
     try {
-      // Format as instruction-tuned prompt
-      const formattedPrompt = this._formatPrompt(prompt);
-
-      let fullResponse = '';
-
-      // Use streaming API
-      const response = await this.llmInference.generateResponse(
-        formattedPrompt,
-        (partialResult, done) => {
-          fullResponse = partialResult;
-          if (onToken) onToken(partialResult);
-        }
-      );
-
-      // If streaming callback didn't fire, use direct result
-      if (!fullResponse && response) {
-        fullResponse = response;
-        if (onToken) onToken(response);
-      }
-
-      this._setStatus(MODEL_STATUS.READY, 'Model ready — on-device inference active');
+      const fullResponse = await this._streamMessage(this.conversation, prompt, onToken);
+      this._setStatus(MODEL_STATUS.READY, 'Model ready - LiteRT LM on-device inference active');
       return fullResponse;
     } catch (err) {
       console.error('[AIEngine] Generation error:', err);
@@ -188,46 +284,33 @@ class AIEngine {
   }
 
   /**
-   * Generate a response using raw formatted string (for multi-turn/agents).
+   * Generate a response using a raw formatted string for the agent loop.
    */
   async generateRaw(rawPrompt, onToken) {
-    if (!this.llmInference) throw new Error('Model not loaded');
-    this._setStatus(MODEL_STATUS.GENERATING, 'Agent thinking…');
+    if (!this.engine) {
+      throw new Error('Model not loaded');
+    }
+
+    this._setStatus(MODEL_STATUS.GENERATING, 'Agent thinking...');
 
+    let conversation = null;
     try {
-      let fullResponse = '';
-      const response = await this.llmInference.generateResponse(
-        rawPrompt,
-        (partialResult, done) => {
-          fullResponse = partialResult;
-          if (onToken) onToken(partialResult);
-        }
-      );
-      if (!fullResponse && response) {
-        fullResponse = response;
-        if (onToken) onToken(response);
-      }
-      this._setStatus(MODEL_STATUS.READY, 'Model ready — on-device inference active');
+      conversation = await this._createConversation();
+      const message = this._messagesFromGemmaTranscript(rawPrompt);
+      const fullResponse = await this._streamMessage(conversation, message, onToken);
+      this._setStatus(MODEL_STATUS.READY, 'Model ready - LiteRT LM on-device inference active');
       return fullResponse;
     } catch (err) {
       console.error('[AIEngine] Raw generation error:', err);
       this._setStatus(MODEL_STATUS.READY, 'Generation completed with errors');
       throw err;
+    } finally {
+      if (conversation) {
+        await conversation.delete();
+      }
     }
   }
 
-  /**
-   * Format prompt using instruction template.
-   */
-  _formatPrompt(userMessage) {
-    return `<start_of_turn>user
-You are an expert coding assistant inside an IDE. Provide concise, correct code and explanations. Use markdown code blocks for code snippets.
-
-${userMessage}<end_of_turn>
-<start_of_turn>model
-`;
-  }
-
   get isReady() {
     return this.status === MODEL_STATUS.READY;
   }
@@ -246,22 +329,38 @@ ${userMessage}<end_of_turn>
     try {
       const filename = url.split('/').pop();
 
-      // Check cache first if persist is requested
       if (persist) {
         const cached = await this.getCachedModel(filename);
         if (cached) {
-          this._setStatus(MODEL_STATUS.LOADING, 'Loading from local disk…');
-          await this.loadModel(cached);
-          return;
+          this._setStatus(MODEL_STATUS.LOADING, 'Loading from local disk...');
+          try {
+            await this.loadModel(cached);
+            return;
+          } catch (err) {
+            await this.deleteFromCache(filename);
+            console.warn('[AIEngine] Removed corrupt cached model:', err);
+            this._setStatus(MODEL_STATUS.DOWNLOADING, 'Cached model was corrupt. Downloading again...');
+          }
         }
       }
 
-      this._setStatus(MODEL_STATUS.DOWNLOADING, 'Connecting…');
+      this._setStatus(MODEL_STATUS.DOWNLOADING, 'Connecting...');
 
       const response = await fetch(url);
       if (!response.ok) throw new Error(`HTTP ${response.status} ${response.statusText}`);
 
       const total = parseInt(response.headers.get('content-length') || '0');
+
+      if (persist) {
+        const file = await this.saveResponseToCache(filename, response, onProgress);
+        await this.loadModel(file);
+        return;
+      }
+
+      if (total > MODEL_DOWNLOAD_MEMORY_LIMIT_BYTES) {
+        throw new Error('This model is too large for memory-only loading. Enable "Save to local disk (OPFS)" and try again.');
+      }
+
       const reader = response.body.getReader();
       const chunks = [];
       let received = 0;
@@ -269,18 +368,23 @@ ${userMessage}<end_of_turn>
       while (true) {
         const { done, value } = await reader.read();
         if (done) break;
+
+        if (received + value.length > MODEL_DOWNLOAD_MEMORY_LIMIT_BYTES) {
+          await reader.cancel();
+          throw new Error('This model is too large for memory-only loading. Enable "Save to local disk (OPFS)" and try again.');
+        }
+
         chunks.push(value);
         received += value.length;
         if (onProgress) onProgress(received, total);
         this._setStatus(
           MODEL_STATUS.DOWNLOADING,
           total
-            ? `Downloading… ${(received / 1048576).toFixed(0)} / ${(total / 1048576).toFixed(0)} MB`
-            : `Downloading… ${(received / 1048576).toFixed(0)} MB`
+            ? `Downloading... ${(received / 1048576).toFixed(0)} / ${(total / 1048576).toFixed(0)} MB`
+            : `Downloading... ${(received / 1048576).toFixed(0)} MB`
         );
       }
 
-      // Concatenate chunks into a single buffer
       const buffer = new Uint8Array(received);
       let offset = 0;
       for (const chunk of chunks) {
@@ -290,10 +394,6 @@ ${userMessage}<end_of_turn>
 
       const file = new File([buffer], filename);
 
-      if (persist) {
-        await this.saveToCache(filename, buffer);
-      }
-
       await this.loadModel(file);
     } catch (err) {
       console.error('[AIEngine] Download error:', err);
@@ -302,12 +402,16 @@ ${userMessage}<end_of_turn>
     }
   }
 
+  async disposeAsync() {
+    await this._deleteEngine();
+    this._setStatus(MODEL_STATUS.IDLE, 'Engine disposed');
+  }
+
   dispose() {
-    if (this.llmInference) {
-      this.llmInference.close();
-      this.llmInference = null;
-    }
     this._setStatus(MODEL_STATUS.IDLE, 'Engine disposed');
+    this._deleteEngine().catch(err => {
+      console.error('[AIEngine] Dispose error:', err);
+    });
   }
 }
 
diff --git a/src/chat.js b/src/chat.js
index ef83e52..923c35b 100644
--- a/src/chat.js
+++ b/src/chat.js
@@ -10,17 +10,17 @@ import { vfs } from './file-system.js';
 const MODELS = [
   {
     id: 'gemma4-e2b',
-    label: 'Gemma 4 E2B',
+    label: 'Gemma 4 E2B LiteRT LM',
     size: '2 GB',
     badge: '★ Recommended',
-    url: 'https://huggingface.co/litert-community/gemma-4-E2B-it-litert-lm/resolve/main/gemma-4-E2B-it-web.task',
+    url: 'https://huggingface.co/litert-community/gemma-4-E2B-it-litert-lm/resolve/main/gemma-4-E2B-it-web.litertlm',
   },
   {
     id: 'gemma4-e4b',
-    label: 'Gemma 4 E4B',
+    label: 'Gemma 4 E4B LiteRT LM',
     size: '3 GB',
     badge: '🔥 Best',
-    url: 'https://huggingface.co/litert-community/gemma-4-E4B-it-litert-lm/resolve/main/gemma-4-E4B-it-web.task',
+    url: 'https://huggingface.co/litert-community/gemma-4-E4B-it-litert-lm/resolve/main/gemma-4-E4B-it-web.litertlm',
   },
 ];
 
@@ -94,7 +94,7 @@ export class ChatPanel {
           </summary>
           <div style="margin-top: 8px; display: flex; flex-direction: column; gap: 8px;">
             <div class="model-status__row" style="gap: 8px; flex-wrap: wrap;">
-              <input type="file" id="model-file-input" accept=".task,.bin,.tflite" style="display:none">
+              <input type="file" id="model-file-input" accept=".litertlm" style="display:none">
               <button class="model-status__btn model-status__btn--primary" id="model-upload-btn"
                 style="flex: 1;">
                 📁 Upload File
@@ -102,7 +102,7 @@ export class ChatPanel {
             </div>
             <div class="model-status__row" style="gap: 6px;">
               <input type="text" id="model-url-input"
-                placeholder="https://...model.task"
+                placeholder="https://...model.litertlm"
                 style="flex:1; background: var(--bg-primary); border: 1px solid var(--border-primary); 
                        border-radius: 6px; padding: 5px 10px; color: var(--text-primary); 
                        font-family: var(--font-mono); font-size: 11px; outline: none;">
diff --git a/vite.config.js b/vite.config.js
index 96b6a13..374b87e 100644
--- a/vite.config.js
+++ b/vite.config.js
@@ -1,7 +1,7 @@
 import { defineConfig } from 'vite';
 
 export default defineConfig({
-  // Allow serving the large .task model file from public/
+  // Allow serving large LiteRT LM model files from public/
   server: {
     headers: {
       'Cross-Origin-Embedder-Policy': 'require-corp',
@@ -12,9 +12,9 @@ export default defineConfig({
       timeout: 60000,
     },
   },
-  // Exclude model files from being processed by Vite's pipeline
-  assetsInclude: ['**/*.task'],
+  // Exclude model files from being processed by Vite's pipeline.
+  assetsInclude: ['**/*.litertlm'],
   optimizeDeps: {
-    exclude: ['@mediapipe/tasks-genai'],
+    exclude: ['@litert-lm/core'],
   },
 });