1- using LLama . Abstractions ;
1+ using LLama ;
2+ using LLama . Abstractions ;
23using LLama . Common ;
34using LLamaStack . Core . Common ;
5+ using LLamaStack . Core . Extensions ;
6+ using LLamaStack . Core . Models ;
47using LLamaStack . Core . Services ;
58using System . Runtime . CompilerServices ;
9+ using System . Text ;
610
711namespace LLamaStack . Core . Inference
812{
@@ -48,6 +52,11 @@ public abstract class InferenceHandlerBase<T> : IInferenceHandler
4852 /// </summary>
4953 protected ISampleService _sampleService ;
5054
55+ /// <summary>
56+ /// The token decoder
57+ /// </summary>
58+ protected StreamingTokenDecoder _tokenDecoder ;
59+
5160
5261 /// <summary>
5362 /// Initializes a new instance of the <see cref="InferenceHandlerBase{T}"/> class.
@@ -61,6 +70,7 @@ protected InferenceHandlerBase(LLamaStackModel<T> model, LLamaStackContext conte
6170 _pastTokensCount = 0 ;
6271 _consumedTokensCount = 0 ;
6372 _sampleService = new SampleService ( _context ) ;
73+ _tokenDecoder = new StreamingTokenDecoder ( _context . LLamaContext ) ;
6474 _lastTokens = new FixedSizeQueue < TokenData > ( _context . ContextSize ) . FillWith ( new ( 0 ) ) ;
6575 }
6676
@@ -136,6 +146,7 @@ public async virtual IAsyncEnumerable<TokenData> InferAsync(string text, IInfere
136146 {
137147 cancellationToken . ThrowIfCancellationRequested ( ) ;
138148 inferenceParams ??= new InferenceParams ( ) ;
149+ var antipromptProcessor = new AntipromptProcessor ( inferenceParams . AntiPrompts ) ;
139150
140151 InferStateArgs args = new InferStateArgs ( )
141152 {
@@ -158,8 +169,14 @@ public async virtual IAsyncEnumerable<TokenData> InferAsync(string text, IInfere
158169
159170 if ( args . ReturnValue )
160171 {
161- foreach ( var embed in _currentTokens )
162- yield return embed ;
172+ foreach ( var tokenData in ProcessTokens ( _currentTokens ) )
173+ {
174+ // Check if any of the antiprompts have been generated
175+ if ( ! tokenData . IsChild && antipromptProcessor . Add ( tokenData . Content ) )
176+ args . WaitForInput = true ;
177+
178+ yield return tokenData ;
179+ }
163180 }
164181
165182 var breakGeneration = await PostProcess ( inferenceParams , args ) ;
@@ -170,6 +187,18 @@ public async virtual IAsyncEnumerable<TokenData> InferAsync(string text, IInfere
170187 }
171188 }
172189
/// <summary>
/// Decodes a batch of generated tokens into text and marks parent/child relationships.
/// The first token in the batch is the parent and receives the full decoded
/// <c>Content</c>; all remaining tokens are flagged as children (data only, no content).
/// </summary>
/// <param name="tokens">The tokens produced by the current inference step.</param>
/// <returns>The same <paramref name="tokens"/> list, mutated in place.</returns>
protected List<TokenData> ProcessTokens(List<TokenData> tokens)
{
    // Guard: an empty batch has nothing to decode; without this,
    // tokens[0] below throws ArgumentOutOfRangeException.
    if (tokens == null || tokens.Count == 0)
        return tokens;

    // Feed all token ids to the streaming decoder before reading, so text
    // spanning multiple tokens is decoded as one unit.
    // NOTE(review): presumably this handles partial multi-byte sequences
    // split across tokens — confirm against StreamingTokenDecoder docs.
    _tokenDecoder.AddRange(tokens.ToTokenIds());

    // First token is parent and carries the full decoded Content;
    // the rest are children (Data only, no Content).
    tokens[0].Content = _tokenDecoder.Read();
    foreach (var token in tokens.Skip(1))
        token.IsChild = true;

    return tokens;
}
173202
174203 /// <summary>
175204 /// Gets the state.
0 commit comments