Commit ff7a7ba (parent ade0c30)

Adds GPTQ Quantization Documentation

4 files changed: +553 -0 lines changed

Lines changed: 141 additions & 0 deletions
"""
Title: GPTQ Quantization in Keras
Author: [Jyotinder Singh](https://x.com/Jyotinder_Singh)
Date created: 2025/10/16
Last modified: 2025/10/16
Description: How to run weight-only GPTQ quantization for Keras & KerasHub models.
Accelerator: GPU
"""

"""
## What is GPTQ?

GPTQ ("Generative Pre-Training Quantization") is a post-training, weight-only
quantization method that uses a second-order approximation of the loss (via a
Hessian estimate) to minimize the error introduced when compressing weights to
lower precision, typically 4-bit integers.

Unlike standard post-training techniques, GPTQ keeps activations in higher
precision and quantizes only the weights. This often preserves model quality
at low bit widths while still providing large storage and memory savings.

Keras supports GPTQ quantization for KerasHub models via the
`keras.quantizers.GPTQConfig` class.
"""
"""
## Load a KerasHub model

This guide uses the `Gemma3CausalLM` model from KerasHub, a small (1B
parameter) causal language model.
"""

import keras
from keras_hub.models import Gemma3CausalLM
from datasets import load_dataset


prompt = "Keras is a"

model = Gemma3CausalLM.from_preset("gemma3_1b")

outputs = model.generate(prompt, max_length=30)
print(outputs)
"""
47+
## Configure & run GPTQ quantization
48+
49+
You can configure GPTQ quantization via the `keras.quantizers.GPTQConfig` class.
50+
51+
The GPTQ configuration requires a calibration dataset and tokenizer, which it
52+
uses to estimate the Hessian and quantization error. Here, we use a small slice
53+
of the WikiText-2 dataset for calibration.
54+
55+
You can tune several parameters to trade off speed, memory, and accuracy. The
56+
most important of these are `weight_bits` (the bit-width to quantize weights to)
57+
and `group_size` (the number of weights to quantize together). The group size
58+
controls the granularity of quantization: smaller groups typically yield better
59+
accuracy but are slower to quantize and may use more memory. A good starting
60+
point is `group_size=128` for 4-bit quantization (`weight_bits=4`).
61+
62+
In this example, we first prepare a tiny calibration set, and then run GPTQ on
63+
the model using the `.quantize(...)` API.
64+
"""
# Calibration slice (use a larger/representative set in practice)
texts = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")["text"]

calibration_dataset = [
    s + "." for text in texts for s in map(str.strip, text.split(".")) if s
]

gptq_config = keras.quantizers.GPTQConfig(
    dataset=calibration_dataset,
    tokenizer=model.preprocessor.tokenizer,
    weight_bits=4,
    group_size=128,
    num_samples=256,
    sequence_length=256,
    hessian_damping=0.01,
    symmetric=False,
    activation_order=False,
)

model.quantize("gptq", config=gptq_config)

outputs = model.generate(prompt, max_length=30)
print(outputs)
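As a back-of-envelope check on what `weight_bits=4` with `group_size=128` buys, assume each group stores one float16 scale plus one 4-bit zero-point alongside its packed 4-bit weights; the exact packing Keras uses may differ, so treat the overhead figures as assumptions:

```python
weight_bits, group_size = 4, 128
overhead_bits = 16 + 4  # assumed per-group float16 scale + 4-bit zero-point
bits_per_weight = weight_bits + overhead_bits / group_size
compression_vs_fp32 = 32 / bits_per_weight

print(f"{bits_per_weight:.3f} effective bits per weight")
print(f"~{compression_vs_fp32:.1f}x smaller weight storage than FP32")
```

This theoretical ratio is larger than the roughly 50% on-disk savings measured in the benchmarks below, because presets bundle tokenizer assets and metadata in addition to the packed weights.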
"""
91+
## Model Export
92+
93+
The GPTQ quantized model can be saved to a preset and reloaded elsewhere, just
94+
like any other KerasHub model.
95+
"""
96+
97+
model.save_to_preset("gemma3_gptq_w4gs128_preset")
98+
model_from_preset = Gemma3CausalLM.from_preset("gemma3_gptq_w4gs128_preset")
99+
output = model_from_preset.generate(prompt, max_length=30)
100+
print(output)
101+
102+
"""
103+
## Performance & Benchmarking
104+
105+
Micro-benchmarks collected on a single NVIDIA 4070 Ti Super (16 GB).
106+
Baselines are FP32.
107+
108+
Dataset: WikiText-2.
109+
110+
111+
| Model (preset) | Perplexity Increase % (↓ better) | Disk Storage Reduction Δ % (↓ better) | VRAM Reduction Δ % (↓ better) | First-token Latency Δ % (↓ better) | Throughput Δ % (↑ better) |
112+
| --------------------------------- | -------------------------------: | ------------------------------------: | ----------------------------: | ---------------------------------: | ------------------------: |
113+
| GPT2 (gpt2_base_en_cnn_dailymail) | 1.0% | -50.1% ↓ | -41.1% ↓ | +0.7% ↑ | +20.1% ↑ |
114+
| OPT (opt_125m_en) | 10.0% | -49.8% ↓ | -47.0% ↓ | +6.7% ↑ | -15.7% ↓ |
115+
| Bloom (bloom_1.1b_multi) | 7.0% | -47.0% ↓ | -54.0% ↓ | +1.8% ↑ | -15.7% ↓ |
116+
| Gemma3 (gemma3_1b) | 3.0% | -51.5% ↓ | -51.8% ↓ | +39.5% ↑ | +5.7% ↑ |
117+
118+
119+
Detailed benchmarking numbers and scripts are available
120+
[here](https://github.com/keras-team/keras/pull/21641).
121+
122+
### Analysis
123+
124+
There is notable reduction in disk space and VRAM usage across all models, with
125+
disk space savings around 50% and VRAM savings ranging from 41% to 54%. The
126+
reported disk savings understate the true weight compression because presets
127+
also include non-weight assets.
128+
129+
Perplexity increases only marginally, indicating model quality is largely
130+
preserved after quantization.
131+
"""
"""
## Practical tips

* GPTQ is weight-only; training after quantization is not supported.
* Always use the model's own tokenizer for calibration.
* Use a representative calibration set; small slices are only for demos.
* Start with `weight_bits=4` and `group_size=128`; tune per model and task.
* Save to `.keras` or to a preset for reuse elsewhere.
"""
