From d525163d5344d2368ed0bab1636a2b5552260559 Mon Sep 17 00:00:00 2001
From: idostyle
Date: Wed, 9 Apr 2025 22:24:42 +0200
Subject: [PATCH 1/2] Support DistillT5

---
 t5.hpp | 55 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 41 insertions(+), 14 deletions(-)

diff --git a/t5.hpp b/t5.hpp
index 2a53e2743..b8a48641b 100644
--- a/t5.hpp
+++ b/t5.hpp
@@ -648,6 +648,25 @@ struct T5Block : public GGMLBlock {
     }
 };
 
+struct T5Projection : public UnaryBlock {
+public:
+    T5Projection(int64_t model_dim, int64_t projection_dim) {
+        blocks["0"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, projection_dim, false));
+        blocks["3"] = std::shared_ptr<GGMLBlock>(new Linear(projection_dim, projection_dim, false));
+    }
+
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        // x: [N, n_token, model_dim]
+        auto wi = std::dynamic_pointer_cast<Linear>(blocks["0"]);
+        auto wo = std::dynamic_pointer_cast<Linear>(blocks["3"]);
+
+        x = wi->forward(ctx, x);
+        x = ggml_relu_inplace(ctx, x);
+        x = wo->forward(ctx, x);
+        return x;
+    }
+};
+
 struct T5Stack : public GGMLBlock {
     int64_t num_layers;
 
@@ -682,6 +701,7 @@ struct T5Stack : public GGMLBlock {
 
         auto final_layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["final_layer_norm"]);
         x = final_layer_norm->forward(ctx, x);
+        
         return x;
     }
 };
@@ -692,9 +712,11 @@ struct T5 : public GGMLBlock {
        int64_t model_dim,
        int64_t ff_dim,
        int64_t num_heads,
-       int64_t vocab_size) {
+       int64_t vocab_size,
+       int64_t projection_dim) {
         blocks["encoder"] = std::shared_ptr<GGMLBlock>(new T5Stack(num_layers, model_dim, model_dim, ff_dim, num_heads));
         blocks["shared"] = std::shared_ptr<GGMLBlock>(new Embedding(vocab_size, model_dim));
+        blocks["final_projection"] = std::shared_ptr<GGMLBlock>(new T5Projection(model_dim, projection_dim));
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx,
@@ -709,6 +731,9 @@ struct T5 : public GGMLBlock {
 
         auto x = shared->forward(ctx, input_ids);
         x = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket);
+
+        auto final_projection = std::dynamic_pointer_cast<T5Projection>(blocks["final_projection"]);
+        x = final_projection->forward(ctx, x);
         return x;
     }
 };
@@ -720,12 +745,13 @@ struct T5Runner : public GGMLRunner {
 
     T5Runner(ggml_backend_t backend,
              std::map<std::string, enum ggml_type>& tensor_types,
             const std::string prefix,
-             int64_t num_layers = 24,
-             int64_t model_dim = 4096,
-             int64_t ff_dim = 10240,
-             int64_t num_heads = 64,
-             int64_t vocab_size = 32128)
-        : GGMLRunner(backend), model(num_layers, model_dim, ff_dim, num_heads, vocab_size) {
+             int64_t num_layers = 12,
+             int64_t model_dim = 768,
+             int64_t ff_dim = 2048,
+             int64_t num_heads = 12,
+             int64_t vocab_size = 32128,
+             int64_t projection_dim = 4096)
+        : GGMLRunner(backend), model(num_layers, model_dim, ff_dim, num_heads, vocab_size, projection_dim) {
         model.init(params_ctx, tensor_types, prefix);
     }
@@ -861,12 +887,13 @@ struct T5Embedder {
     T5Embedder(ggml_backend_t backend,
               std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types,
               const std::string prefix = "",
-              int64_t num_layers = 24,
-              int64_t model_dim = 4096,
-              int64_t ff_dim = 10240,
-              int64_t num_heads = 64,
-              int64_t vocab_size = 32128)
-        : model(backend, tensor_types, prefix, num_layers, model_dim, ff_dim, num_heads, vocab_size) {
+              int64_t num_layers = 12,
+              int64_t model_dim = 768,
+              int64_t ff_dim = 2048,
+              int64_t num_heads = 12,
+              int64_t vocab_size = 32128,
+              int64_t projection_dim = 4096)
+        : model(backend, tensor_types, prefix, num_layers, model_dim, ff_dim, num_heads, vocab_size, projection_dim) {
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
@@ -983,4 +1010,4 @@ struct T5Embedder {
     }
 };
 
-#endif  // __T5_HPP__
\ No newline at end of file
+#endif  // __T5_HPP__

From 875faf7635807546cba24708a9a97c1b395ef19a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Fri, 11 Apr 2025 01:09:05 +0200
Subject: [PATCH 2/2] t5: detect distillT5

---
 t5.hpp | 44 ++++++++++++++++++++++++++++++--------------
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/t5.hpp b/t5.hpp
index b8a48641b..b1b0e7d03 100644
--- a/t5.hpp
+++ b/t5.hpp
@@ -357,7 +357,7 @@ class T5UniGramTokenizer {
         BuildTrie(&pieces);
     }
 
-    ~T5UniGramTokenizer(){};
+    ~T5UniGramTokenizer() {};
 
     std::string Normalize(const std::string& input) const {
         // Ref: https://github.com/huggingface/tokenizers/blob/1ff56c0c70b045f0cd82da1af9ac08cd4c7a6f9f/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py#L29
@@ -701,22 +701,27 @@ struct T5Stack : public GGMLBlock {
 
         auto final_layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["final_layer_norm"]);
         x = final_layer_norm->forward(ctx, x);
-        
+
         return x;
     }
 };
 
 struct T5 : public GGMLBlock {
+    bool final_proj = false;
+
 public:
+    T5() {}
     T5(int64_t num_layers,
        int64_t model_dim,
        int64_t ff_dim,
        int64_t num_heads,
        int64_t vocab_size,
-       int64_t projection_dim) {
+       int64_t projection_dim) : final_proj(projection_dim > 0) {
         blocks["encoder"] = std::shared_ptr<GGMLBlock>(new T5Stack(num_layers, model_dim, model_dim, ff_dim, num_heads));
         blocks["shared"] = std::shared_ptr<GGMLBlock>(new Embedding(vocab_size, model_dim));
-        blocks["final_projection"] = std::shared_ptr<GGMLBlock>(new T5Projection(model_dim, projection_dim));
+        if (final_proj) {
+            blocks["final_projection"] = std::shared_ptr<GGMLBlock>(new T5Projection(model_dim, projection_dim));
+        }
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx,
@@ -731,9 +736,10 @@ struct T5 : public GGMLBlock {
 
         auto x = shared->forward(ctx, input_ids);
         x = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket);
-
-        auto final_projection = std::dynamic_pointer_cast<T5Projection>(blocks["final_projection"]);
-        x = final_projection->forward(ctx, x);
+        if (final_proj) {
+            auto final_projection = std::dynamic_pointer_cast<T5Projection>(blocks["final_projection"]);
+            x = final_projection->forward(ctx, x);
+        }
         return x;
     }
 };
@@ -745,13 +751,23 @@ struct T5Runner : public GGMLRunner {
 
     T5Runner(ggml_backend_t backend,
              std::map<std::string, enum ggml_type>& tensor_types,
             const std::string prefix,
-             int64_t num_layers = 12,
-             int64_t model_dim = 768,
-             int64_t ff_dim = 2048,
-             int64_t num_heads = 12,
-             int64_t vocab_size = 32128,
-             int64_t projection_dim = 4096)
-        : GGMLRunner(backend), model(num_layers, model_dim, ff_dim, num_heads, vocab_size, projection_dim) {
+             int64_t num_layers = 24,
+             int64_t model_dim = 4096,
+             int64_t ff_dim = 10240,
+             int64_t num_heads = 64,
+             int64_t vocab_size = 32128,
+             int64_t projection_dim = -1)
+        : GGMLRunner(backend) {
+        if (tensor_types.find(prefix + ".final_projection.0.weight") != tensor_types.end()) {
+            num_layers = 12;
+            model_dim = 768;
+            ff_dim = 2048;
+            num_heads = 12;
+            vocab_size = 32128;
+            projection_dim = 4096;
+        }
+
+        model = T5(num_layers, model_dim, ff_dim, num_heads, vocab_size, projection_dim);
         model.init(params_ctx, tensor_types, prefix);
     }
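
Note (not part of the patches above): the second commit turns the DistillT5
hyperparameters into an opt-in path by probing the checkpoint's tensor names.
The sketch below restates that detection rule in isolation; T5Params,
detect_t5_variant and the int value type (standing in for enum ggml_type) are
hypothetical, while the ".final_projection.0.weight" key and the two parameter
sets come from the diff.

#include <cstdint>
#include <map>
#include <string>

// Hypothetical bundle of the hyperparameters T5Runner picks between.
struct T5Params {
    int64_t num_layers     = 24;    // stock T5-XXL defaults
    int64_t model_dim      = 4096;
    int64_t ff_dim         = 10240;
    int64_t num_heads      = 64;
    int64_t vocab_size     = 32128;
    int64_t projection_dim = -1;    // <= 0 means: no final_projection block
};

// Same check as in T5Runner's constructor: a DistillT5 checkpoint is recognized
// by the extra projection weight among the checkpoint's tensors.
static T5Params detect_t5_variant(const std::map<std::string, int>& tensor_types,
                                  const std::string& prefix) {
    T5Params p;
    if (tensor_types.find(prefix + ".final_projection.0.weight") != tensor_types.end()) {
        p.num_layers     = 12;      // T5-Base sized encoder...
        p.model_dim      = 768;
        p.ff_dim         = 2048;
        p.num_heads      = 12;
        p.projection_dim = 4096;    // ...projected back up to 4096 channels
    }
    return p;
}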