From d525163d5344d2368ed0bab1636a2b5552260559 Mon Sep 17 00:00:00 2001
From: idostyle
Date: Wed, 9 Apr 2025 22:24:42 +0200
Subject: [PATCH 1/2] Support DistillT5

---
 t5.hpp | 55 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 41 insertions(+), 14 deletions(-)

diff --git a/t5.hpp b/t5.hpp
index 2a53e2743..b8a48641b 100644
--- a/t5.hpp
+++ b/t5.hpp
@@ -648,6 +648,25 @@ struct T5Block : public GGMLBlock {
     }
 };
 
+struct T5Projection : public UnaryBlock {
+public:
+    T5Projection(int64_t model_dim, int64_t projection_dim) {
+        blocks["0"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, projection_dim, false));
+        blocks["3"] = std::shared_ptr<GGMLBlock>(new Linear(projection_dim, projection_dim, false));
+    }
+
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        // x: [N, n_token, model_dim]
+        auto wi = std::dynamic_pointer_cast<Linear>(blocks["0"]);
+        auto wo = std::dynamic_pointer_cast<Linear>(blocks["3"]);
+
+        x = wi->forward(ctx, x);
+        x = ggml_relu_inplace(ctx, x);
+        x = wo->forward(ctx, x);
+        return x;
+    }
+};
+
 struct T5Stack : public GGMLBlock {
     int64_t num_layers;
 
@@ -682,6 +701,7 @@ struct T5Stack : public GGMLBlock {
 
         auto final_layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["final_layer_norm"]);
         x = final_layer_norm->forward(ctx, x);
+        
         return x;
     }
 };
@@ -692,9 +712,11 @@ struct T5 : public GGMLBlock {
        int64_t model_dim,
        int64_t ff_dim,
        int64_t num_heads,
-       int64_t vocab_size) {
+       int64_t vocab_size,
+       int64_t projection_dim) {
         blocks["encoder"] = std::shared_ptr<GGMLBlock>(new T5Stack(num_layers, model_dim, model_dim, ff_dim, num_heads));
         blocks["shared"] = std::shared_ptr<GGMLBlock>(new Embedding(vocab_size, model_dim));
+        blocks["final_projection"] = std::shared_ptr<GGMLBlock>(new T5Projection(model_dim, projection_dim));
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx,
@@ -709,6 +731,9 @@ struct T5 : public GGMLBlock {
 
         auto x = shared->forward(ctx, input_ids);
         x = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket);
+
+        auto final_projection = std::dynamic_pointer_cast<T5Projection>(blocks["final_projection"]);
+        x = final_projection->forward(ctx, x);
         return x;
     }
 };
@@ -720,12 +745,13 @@ struct T5Runner : public GGMLRunner {
 
     T5Runner(ggml_backend_t backend,
              std::map<std::string, enum ggml_type>& tensor_types,
             const std::string prefix,
-             int64_t num_layers = 24,
-             int64_t model_dim = 4096,
-             int64_t ff_dim = 10240,
-             int64_t num_heads = 64,
-             int64_t vocab_size = 32128)
-        : GGMLRunner(backend), model(num_layers, model_dim, ff_dim, num_heads, vocab_size) {
+             int64_t num_layers = 12,
+             int64_t model_dim = 768,
+             int64_t ff_dim = 2048,
+             int64_t num_heads = 12,
+             int64_t vocab_size = 32128,
+             int64_t projection_dim = 4096)
+        : GGMLRunner(backend), model(num_layers, model_dim, ff_dim, num_heads, vocab_size, projection_dim) {
         model.init(params_ctx, tensor_types, prefix);
     }
@@ -861,12 +887,13 @@ struct T5Embedder {
     T5Embedder(ggml_backend_t backend,
               std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types,
               const std::string prefix = "",
-              int64_t num_layers = 24,
-              int64_t model_dim = 4096,
-              int64_t ff_dim = 10240,
-              int64_t num_heads = 64,
-              int64_t vocab_size = 32128)
-        : model(backend, tensor_types, prefix, num_layers, model_dim, ff_dim, num_heads, vocab_size) {
+              int64_t num_layers = 12,
+              int64_t model_dim = 768,
+              int64_t ff_dim = 2048,
+              int64_t num_heads = 12,
+              int64_t vocab_size = 32128,
+              int64_t projection_dim = 4096)
+        : model(backend, tensor_types, prefix, num_layers, model_dim, ff_dim, num_heads, vocab_size, projection_dim) {
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
@@ -983,4 +1010,4 @@ struct T5Embedder {
     }
 };
 
-#endif  // __T5_HPP__
\ No newline at end of file
+#endif  // __T5_HPP__

From 875faf7635807546cba24708a9a97c1b395ef19a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Fri, 11 Apr 2025 01:09:05 +0200
Subject: [PATCH 2/2] t5: detect distillT5

---
 t5.hpp | 44 ++++++++++++++++++++++++++++++--------------
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/t5.hpp b/t5.hpp
index b8a48641b..b1b0e7d03 100644
--- a/t5.hpp
+++ b/t5.hpp
@@ -357,7 +357,7 @@ class T5UniGramTokenizer {
         BuildTrie(&pieces);
     }
 
-    ~T5UniGramTokenizer(){};
+    ~T5UniGramTokenizer() {};
 
     std::string Normalize(const std::string& input) const {
         // Ref: https://github.com/huggingface/tokenizers/blob/1ff56c0c70b045f0cd82da1af9ac08cd4c7a6f9f/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py#L29
@@ -701,22 +701,27 @@ struct T5Stack : public GGMLBlock {
 
         auto final_layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["final_layer_norm"]);
         x = final_layer_norm->forward(ctx, x);
-        
+
         return x;
     }
 };
 
 struct T5 : public GGMLBlock {
+    bool final_proj = false;
+
 public:
+    T5() {}
     T5(int64_t num_layers,
        int64_t model_dim,
        int64_t ff_dim,
        int64_t num_heads,
        int64_t vocab_size,
-       int64_t projection_dim) {
+       int64_t projection_dim) : final_proj(projection_dim > 0) {
         blocks["encoder"] = std::shared_ptr<GGMLBlock>(new T5Stack(num_layers, model_dim, model_dim, ff_dim, num_heads));
         blocks["shared"] = std::shared_ptr<GGMLBlock>(new Embedding(vocab_size, model_dim));
-        blocks["final_projection"] = std::shared_ptr<GGMLBlock>(new T5Projection(model_dim, projection_dim));
+        if (final_proj) {
+            blocks["final_projection"] = std::shared_ptr<GGMLBlock>(new T5Projection(model_dim, projection_dim));
+        }
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx,
@@ -731,9 +736,10 @@ struct T5 : public GGMLBlock {
 
         auto x = shared->forward(ctx, input_ids);
         x = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket);
-
-        auto final_projection = std::dynamic_pointer_cast<T5Projection>(blocks["final_projection"]);
-        x = final_projection->forward(ctx, x);
+        if (final_proj) {
+            auto final_projection = std::dynamic_pointer_cast<T5Projection>(blocks["final_projection"]);
+            x = final_projection->forward(ctx, x);
+        }
         return x;
     }
 };
@@ -745,13 +751,23 @@ struct T5Runner : public GGMLRunner {
 
     T5Runner(ggml_backend_t backend,
              std::map<std::string, enum ggml_type>& tensor_types,
             const std::string prefix,
-             int64_t num_layers = 12,
-             int64_t model_dim = 768,
-             int64_t ff_dim = 2048,
-             int64_t num_heads = 12,
-             int64_t vocab_size = 32128,
-             int64_t projection_dim = 4096)
-        : GGMLRunner(backend), model(num_layers, model_dim, ff_dim, num_heads, vocab_size, projection_dim) {
+             int64_t num_layers = 24,
+             int64_t model_dim = 4096,
+             int64_t ff_dim = 10240,
+             int64_t num_heads = 64,
+             int64_t vocab_size = 32128,
+             int64_t projection_dim = -1)
+        : GGMLRunner(backend) {
+        if (tensor_types.find(prefix + ".final_projection.0.weight") != tensor_types.end()) {
+            num_layers = 12;
+            model_dim = 768;
+            ff_dim = 2048;
+            num_heads = 12;
+            vocab_size = 32128;
+            projection_dim = 4096;
+        }
+
+        model = T5(num_layers, model_dim, ff_dim, num_heads, vocab_size, projection_dim);
         model.init(params_ctx, tensor_types, prefix);
     }
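
Note (not part of the patches above): the second commit turns the DistillT5
hyperparameters into an opt-in path by probing the checkpoint's tensor names.
The sketch below restates that detection rule in isolation; T5Params,
detect_t5_variant and the int value type (standing in for enum ggml_type) are
hypothetical, while the ".final_projection.0.weight" key and the two parameter
sets come from the diff.

#include <cstdint>
#include <map>
#include <string>

// Hypothetical bundle of the hyperparameters T5Runner picks between.
struct T5Params {
    int64_t num_layers     = 24;    // stock T5-XXL defaults
    int64_t model_dim      = 4096;
    int64_t ff_dim         = 10240;
    int64_t num_heads      = 64;
    int64_t vocab_size     = 32128;
    int64_t projection_dim = -1;    // <= 0 means: no final_projection block
};

// Same check as in T5Runner's constructor: a DistillT5 checkpoint is recognized
// by the extra projection weight among the checkpoint's tensors.
static T5Params detect_t5_variant(const std::map<std::string, int>& tensor_types,
                                  const std::string& prefix) {
    T5Params p;
    if (tensor_types.find(prefix + ".final_projection.0.weight") != tensor_types.end()) {
        p.num_layers     = 12;      // T5-Base sized encoder...
        p.model_dim      = 768;
        p.ff_dim         = 2048;
        p.num_heads      = 12;
        p.projection_dim = 4096;    // ...projected back up to 4096 channels
    }
    return p;
}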