From 2ebdf63539ef19ae7c0a5c9240c6fdf0341c6f35 Mon Sep 17 00:00:00 2001 From: Ashton Ohms Date: Wed, 20 Dec 2023 21:51:13 -0800 Subject: [PATCH 1/9] [WIP] Update for latest llama.cpp --- cbits/wrapper.c | 6 +- cbits/wrapper.h | 4 +- examples/Main.hs | 68 +++++------ llama-cpp-bindings.cabal | 14 +-- src/LLaMACPP.chs | 243 ++++++++++++++++++++++++++++----------- 5 files changed, 226 insertions(+), 109 deletions(-) diff --git a/cbits/wrapper.c b/cbits/wrapper.c index 5b1bb98..0ff94d5 100644 --- a/cbits/wrapper.c +++ b/cbits/wrapper.c @@ -3,7 +3,7 @@ #include "wrapper.h" struct llama_model * wrapper_load_model_from_file -( const char * path_model, struct llama_context_params * params) { +( const char * path_model, struct llama_model_params * params) { return llama_load_model_from_file(path_model, *params); } @@ -11,6 +11,10 @@ void wrapper_context_default_params(struct llama_context_params * default_params *default_params = llama_context_default_params(); } +void wrapper_model_default_params(struct llama_model_params * default_params) { + *default_params = llama_model_default_params(); +} + void wrapper_model_quantize_default_params(struct llama_model_quantize_params * default_params) { *default_params = llama_model_quantize_default_params(); } diff --git a/cbits/wrapper.h b/cbits/wrapper.h index b94ebfc..e118d74 100644 --- a/cbits/wrapper.h +++ b/cbits/wrapper.h @@ -3,11 +3,13 @@ struct llama_model * wrapper_load_model_from_file( const char * path_model, - struct llama_context_params * params + struct llama_model_params * params ); void wrapper_context_default_params(struct llama_context_params *); +void wrapper_model_default_params(struct llama_model_params *); + void wrapper_model_quantize_default_params(struct llama_model_quantize_params *); void wrapper_get_timings(struct llama_context * ctx, struct llama_timings * timings); diff --git a/examples/Main.hs b/examples/Main.hs index a498d62..b294291 100644 --- a/examples/Main.hs +++ b/examples/Main.hs @@ -8,7 +8,7 @@ import Control.Monad.IO.Class (MonadIO, liftIO) import Control.Monad.Reader (MonadReader, ReaderT, runReaderT, asks) import Data.Functor ((<&>), void) import qualified Data.Vector.Storable as V -import Foreign.C.String (peekCString, withCString) +import Foreign.C.String (peekCString, withCString, withCStringLen) import Foreign.C.Types (CFloat, CInt) import Foreign.ForeignPtr (newForeignPtr_) import Foreign.Marshal.Alloc (alloca, free, malloc) @@ -147,24 +147,25 @@ main = do params' <- asks _params ctx <- asks _llamaCtx + model <- asks _model -- tokenizing & eval -- "do one empty run to warm up the model" liftIO . allocaArray 1 $ \(tmp :: Ptr L.Token) -> do - bos <- L.tokenBos + bos <- L.tokenBos model pokeArray tmp [bos] - _evalRes <- L.eval ctx tmp 1 0 (_nThreads $ params') + _evalRes <- L.eval ctx tmp 1 0 --(_nThreads $ params') L.resetTimings ctx let -- todo why is this constant here maxTokens = 1024 - tokenize s tks addBos = - L.tokenize ctx s tks (fromIntegral maxTokens) (fromBool addBos) + tokenize s l tks addBos = + L.tokenize model s (fromIntegral l) tks (fromIntegral maxTokens) (fromBool addBos) (fromBool False) (tokenized, tokenizedCount) <- liftIO . 
allocaArray maxTokens $ \tokensPtr -> do - tokenizedCount' <- withCString (_prompt params') $ \ts -> tokenize ts tokensPtr True + tokenizedCount' <- withCStringLen (_prompt params') $ \(ts, l) -> tokenize ts l tokensPtr True putStrLn "\nPrompt" putStrLn $ _prompt params' <> "\n\n" @@ -172,7 +173,7 @@ main = do putStrLn $ "\nTokenized " <> show tokenizedCount' putStrLn "\nRunning first eval of entire prompt" - _evalRes <- L.eval ctx tokensPtr tokenizedCount' 0 (_nThreads params') + _evalRes <- L.eval ctx tokensPtr tokenizedCount' 0 --(_nThreads params') (, tokenizedCount') <$> peekArray (fromIntegral tokenizedCount') tokensPtr @@ -190,29 +191,30 @@ main = do liftIO . putStrLn $ "\nsampling" - eos <- liftIO L.tokenEos + eos <- liftIO $ L.tokenEos model -- I feel like this is not the right way to do this? runEffect $ for (sample >-> takeWhile (/= eos)) $ \id' -> - lift . liftIO $ putStr =<< peekCString =<< L.tokenToStr ctx id' + lift . liftIO $ putStr =<< L.tokenToPiece model id' sample :: Producer L.Token ContextM () sample = do params' <- asks _params ctx <- asks _llamaCtx + model <- asks _model nPastTV <- asks _nPast nPast' <- liftIO . readTVarIO $ nPastTV lastNTokensTV <- asks _lastNTokens lastNTokens' <- liftIO . readTVarIO $ lastNTokensTV - nVocab <- fromIntegral <$> (liftIO . L.nVocab $ ctx) + nVocab <- fromIntegral <$> (liftIO . L.nVocab $ model) - id' <- liftIO $ sample' params' ctx nVocab lastNTokensTV + id' <- liftIO $ sample' model params' ctx nVocab lastNTokensTV void . liftIO . withArray [id'] $ \newTokenArrPtr -> - L.eval ctx newTokenArrPtr 1 (fromIntegral nPast') (_nThreads $ params') + L.eval ctx newTokenArrPtr 1 (fromIntegral nPast') -- (_nThreads $ params') liftIO . atomically $ do writeTVar nPastTV $ @@ -226,8 +228,8 @@ main = do yield id' *> sample - sample' :: Params -> L.Context -> Int -> TVar [L.Token] -> IO L.Token - sample' params' ctx nVocab lastNTokensTV = do + sample' :: L.Model -> Params -> L.Context -> Int -> TVar [L.Token] -> IO L.Token + sample' model params' ctx nVocab lastNTokensTV = do alloca $ \candidatesPPtr -> allocaArray nVocab $ \logitsCopyPtr -> do @@ -262,23 +264,23 @@ main = do lastNTokensLen = fromIntegral . length $ lastNTokens' -- float nl_logit = logits[llama_token_nl()]; - _nlLogit <- V.unsafeIndex logitsCopy . fromIntegral <$> L.tokenNl + _nlLogit <- V.unsafeIndex logitsCopy . 
fromIntegral <$> (L.tokenNl model) -- -- lastNTokensPtr should be a pointer just to the last -- set of tokens at the end of lastNTokens matching the -- count of lastNRepeat, right now it's the entire thing -- - L.sampleRepetitionPenalty - ctx candidatesPPtr lastNTokensPtr lastNTokensLen (_repeatPenalty params') + L.sampleRepetitionPenalties + ctx candidatesPPtr lastNTokensPtr lastNTokensLen (_repeatPenalty params') (1.0) (1.0) - L.sampleFrequencyAndPresencePenalties - ctx - candidatesPPtr - lastNTokensPtr - lastNTokensLen - (_alphaFrequency params') - (_alphaPresence params') + --L.sampleFrequencyAndPresencePenalties + -- ctx + -- candidatesPPtr + -- lastNTokensPtr + -- lastNTokensLen + -- (_alphaFrequency params') + -- (_alphaPresence params') -- todo -- if (!penalize_nl) { @@ -316,21 +318,23 @@ main = do -- putStrLn "\ndefault model quantize params" cpp <- malloc + mpp <- malloc putStrLn "\ninit context params" L.contextDefaultParams cpp - ctxParams' <- peek cpp + L.modelDefaultParams mpp + --ctxParams' <- peek cpp - let - ctxParams = ctxParams' - { L._nCtx = _nCtx params' - , L._nGpuLayers = _nGpuLayers params' - } + --let + -- ctxParams = ctxParams' + -- -- { L._nCtx = _nCtx params' + -- -- , L._nGpuLayers = _nGpuLayers params' + -- -- } - poke cpp ctxParams + --poke cpp ctxParams putStrLn "\nloading model" - model' <- withCString (_modelPath params') $ flip L.loadModelFromFile cpp + model' <- withCString (_modelPath params') $ flip L.loadModelFromFile mpp putStrLn "\nloading context" diff --git a/llama-cpp-bindings.cabal b/llama-cpp-bindings.cabal index 77f50c9..fd2ea27 100644 --- a/llama-cpp-bindings.cabal +++ b/llama-cpp-bindings.cabal @@ -64,10 +64,10 @@ executable examples main-is: Main.hs default-language: GHC2021 -test-suite llama-cpp-bindings-test - import: shared - build-depends: llama-cpp-bindings - hs-source-dirs: test - main-is: Main.hs - default-language: GHC2021 - type: exitcode-stdio-1.0 +-- test-suite llama-cpp-bindings-test +-- import: shared +-- build-depends: llama-cpp-bindings +-- hs-source-dirs: test +-- main-is: Main.hs +-- default-language: GHC2021 +-- type: exitcode-stdio-1.0 diff --git a/src/LLaMACPP.chs b/src/LLaMACPP.chs index d6d07f4..c0a6ab2 100644 --- a/src/LLaMACPP.chs +++ b/src/LLaMACPP.chs @@ -1,7 +1,10 @@ module LLaMACPP where +import Control.Exception (assert) import Data.Word (Word32) +import Foreign +import Foreign.C.String (peekCStringLen) import Foreign.C.Types (CChar, CDouble, CFloat, CInt, CLong, CSize, CUChar, CULong) import Foreign.Marshal.Utils (fromBool, toBool) import Foreign.Ptr (FunPtr, Ptr, castPtr) @@ -103,6 +106,40 @@ instance Storable TokenDataArray where {# set token_data_array->size #} p $ fromIntegral _size {# set token_data_array->sorted #} p _sorted +-- struct llama_model_params { +-- int32_t n_gpu_layers; // number of layers to store in VRAM +-- int32_t main_gpu; // the GPU that is used for scratch and small tensors +-- const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) +-- +-- // called with a progress value between 0 and 1, pass NULL to disable +-- llama_progress_callback progress_callback; +-- +-- // context pointer passed to the progress callback +-- void * progress_callback_user_data; +-- +-- // override key-value pairs of the model meta data +-- const struct llama_model_kv_override * kv_overrides; +-- +-- // Keep the booleans together to avoid misalignment during copy-by-value. 
+-- bool vocab_only; // only load the vocabulary, no weights +-- bool use_mmap; // use mmap if possible +-- bool use_mlock; // force system to keep model in RAM +-- }; +data ModelParams = ModelParams + { _nGpuLayers :: Word32 + , _mainGpu :: Word32 + } +-- TODO, implement storable + +instance Storable ModelParams where + sizeOf _ = {# sizeof model_params #} + alignment _ = {# alignof model_params #} + peek p = ModelParams + <$> (fromIntegral <$> {# get model_params->n_gpu_layers #} p) + <*> (fromIntegral <$> {# get model_params->main_gpu #} p) + poke p mps = do + {# set model_params->n_gpu_layers #} p $ fromIntegral $ _nGpuLayers mps + {# set model_params->main_gpu #} p $ fromIntegral $ _mainGpu mps -- -- typedef void (*llama_progress_callback)(float progress, void *ctx); @@ -138,22 +175,39 @@ data ContextParams = ContextParams { _seed :: Word32 , _nCtx :: Int , _nBatch :: Int - , _nGpuLayers :: Int - , _mainGpu :: Int - , _tensorSplit :: Ptr CFloat - , _progressCallback :: FunPtr ProgressCallback - , _progressCallbackUserData :: Ptr () - , _lowVRAM :: Bool - , _f16KV :: Bool + , _nThreads :: Word32 + , _nThreadsBatch :: Word32 + , _ropeScalingType :: Word8 + , _ropeFreqBase :: CFloat + , _ropeFreqScale :: CFloat + , _yarnExtFactor :: CFloat + , _yarnAttnFactor :: CFloat + , _yarnBetaFast :: CFloat + , _yarnBetaSlow :: CFloat + , _yarnOrigCtx :: Word32 + , _typeK :: CInt + , _typeV :: CInt + , _mulMatQ :: Bool , _logitsAll :: Bool - , _vocabOnly :: Bool - , _useMmap :: Bool - , _useMlock :: Bool , _embedding :: Bool + , _offloadKqv :: Bool + --, _nGpuLayers :: Int + --, _mainGpu :: Int + --, _tensorSplit :: Ptr CFloat + --, _progressCallback :: FunPtr ProgressCallback + --, _progressCallbackUserData :: Ptr () + --, _lowVRAM :: Bool + --, _f16KV :: Bool + --, _logitsAll :: Bool + --, _vocabOnly :: Bool + --, _useMmap :: Bool + --, _useMlock :: Bool + --, _embedding :: Bool } -- deriving (Eq, Show) -- needs ProgressCallback instance {# pointer *context_params as ContextParamsPtr -> ContextParams #} +{# pointer *model_params as ModelParamsPtr -> ModelParams #} instance Storable ContextParams where sizeOf _ = {# sizeof context_params #} @@ -162,34 +216,66 @@ instance Storable ContextParams where <$> (fromIntegral <$> {# get context_params->seed #} p) <*> (fromIntegral <$> {# get context_params->n_ctx #} p) <*> (fromIntegral <$> {# get context_params->n_batch #} p) - <*> (fromIntegral <$> {# get context_params->n_gpu_layers #} p) - <*> (fromIntegral <$> {# get context_params->main_gpu #} p) - <*> {# get context_params->tensor_split #} p - <*> {# get context_params->progress_callback #} p - <*> {# get context_params->progress_callback_user_data #} p - <*> {# get context_params->low_vram #} p - <*> {# get context_params->f16_kv #} p - <*> {# get context_params->logits_all #} p - <*> {# get context_params->vocab_only #} p - <*> {# get context_params->use_mmap #} p - <*> {# get context_params->use_mlock #} p - <*> {# get context_params->embedding #} p + <*> (fromIntegral <$> {# get context_params->n_threads #} p) + <*> (fromIntegral <$> {# get context_params->n_threads_batch #} p) + <*> (fromIntegral <$> {# get context_params->rope_scaling_type #} p) + <*> {# get context_params->rope_freq_base #} p + <*> {# get context_params->rope_freq_scale #} p + <*> {# get context_params->yarn_ext_factor #} p + <*> {# get context_params->yarn_attn_factor #} p + <*> {# get context_params->yarn_beta_fast #} p + <*> {# get context_params->yarn_beta_slow #} p + <*> (fromIntegral <$> {# get 
context_params->yarn_orig_ctx #} p) + <*> {# get context_params->type_k #} p + <*> {# get context_params->type_k #} p + <*> {# get context_params->mul_mat_q #} p + <*> {# get context_params->logits_all #} p + <*> {# get context_params->embedding #} p + <*> {# get context_params->offload_kqv #} p + -- <*> (fromIntegral <$> {# get context_params->n_gpu_layers #} p) + -- <*> (fromIntegral <$> {# get context_params->main_gpu #} p) + -- <*> {# get context_params->tensor_split #} p + -- <*> {# get context_params->progress_callback #} p + -- <*> {# get context_params->progress_callback_user_data #} p + -- <*> {# get context_params->low_vram #} p + -- <*> {# get context_params->f16_kv #} p + -- <*> {# get context_params->logits_all #} p + -- <*> {# get context_params->vocab_only #} p + -- <*> {# get context_params->use_mmap #} p + -- <*> {# get context_params->use_mlock #} p + -- <*> {# get context_params->embedding #} p poke p cps = do {# set context_params->seed #} p $ fromIntegral $ _seed cps {# set context_params->n_ctx #} p $ fromIntegral $ _nCtx cps {# set context_params->n_batch #} p $ fromIntegral $ _nBatch cps - {# set context_params->n_gpu_layers #} p $ fromIntegral $ _nGpuLayers cps - {# set context_params->main_gpu #} p $ fromIntegral $ _mainGpu cps - {# set context_params->tensor_split #} p $ _tensorSplit cps - {# set context_params->progress_callback #} p $ _progressCallback cps - {# set context_params->progress_callback_user_data #} p $ _progressCallbackUserData cps - {# set context_params->low_vram #} p $ _lowVRAM cps - {# set context_params->f16_kv #} p $ _f16KV cps - {# set context_params->logits_all #} p $ _logitsAll cps - {# set context_params->vocab_only #} p $ _vocabOnly cps - {# set context_params->use_mmap #} p $ _useMmap cps - {# set context_params->use_mlock #} p $ _useMlock cps - {# set context_params->embedding #} p $ _embedding cps + {# set context_params-> n_threads #} p $ fromIntegral $ _nThreads cps + {# set context_params-> n_threads_batch #} p $ fromIntegral $ _nThreadsBatch cps + {# set context_params-> rope_scaling_type #} p $ fromIntegral $ _ropeScalingType cps + {# set context_params-> rope_freq_base #} p $ _ropeFreqBase cps + {# set context_params-> rope_freq_scale #} p $ _ropeFreqScale cps + {# set context_params-> yarn_ext_factor #} p $ _yarnExtFactor cps + {# set context_params-> yarn_attn_factor #} p $ _yarnAttnFactor cps + {# set context_params-> yarn_beta_fast #} p $ _yarnBetaFast cps + {# set context_params-> yarn_beta_slow #} p $ _yarnBetaSlow cps + {# set context_params-> yarn_orig_ctx #} p $ fromIntegral $ _yarnOrigCtx cps + {# set context_params-> type_k #} p $ _typeK cps + {# set context_params-> type_v #} p $ _typeV cps + {# set context_params-> mul_mat_q #} p $ _mulMatQ cps + {# set context_params-> logits_all #} p $ _logitsAll cps + {# set context_params-> embedding #} p $ _embedding cps + {# set context_params-> offload_kqv #} p $ _offloadKqv cps + -- {# set context_params->n_gpu_layers #} p $ fromIntegral $ _nGpuLayers cps + -- {# set context_params->main_gpu #} p $ fromIntegral $ _mainGpu cps + -- {# set context_params->tensor_split #} p $ _tensorSplit cps + -- {# set context_params->progress_callback #} p $ _progressCallback cps + -- {# set context_params->progress_callback_user_data #} p $ _progressCallbackUserData cps + -- {# set context_params->low_vram #} p $ _lowVRAM cps + -- {# set context_params->f16_kv #} p $ _f16KV cps + -- {# set context_params->logits_all #} p $ _logitsAll cps + -- {# set context_params->vocab_only #} p $ 
_vocabOnly cps + -- {# set context_params->use_mmap #} p $ _useMmap cps + -- {# set context_params->use_mlock #} p $ _useMlock cps + -- {# set context_params->embedding #} p $ _embedding cps -- @@ -266,6 +352,9 @@ maxDevices = {# call max_devices #} contextDefaultParams :: ContextParamsPtr -> IO () contextDefaultParams = {# call wrapper_context_default_params #} +modelDefaultParams :: ModelParamsPtr -> IO () +modelDefaultParams = {# call wrapper_model_default_params #} + -- -- LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(); @@ -319,9 +408,9 @@ timeUs = {# call time_us #} -- -- Can't pass structs by value via FFI, so wrote a wrapper: -- -loadModelFromFile :: Ptr CChar -> ContextParamsPtr -> IO Model -loadModelFromFile modelPath ctxParamsPtr = - {# call wrapper_load_model_from_file #} modelPath (castPtr ctxParamsPtr) +loadModelFromFile :: Ptr CChar -> ModelParamsPtr -> IO Model +loadModelFromFile modelPath modelParamsPtr = + {# call wrapper_load_model_from_file #} modelPath (castPtr modelParamsPtr) -- @@ -369,7 +458,7 @@ modelQuantize = {# call model_quantize #} -- const char * path_base_model, -- int n_threads); -- -modelApplyLoraFromFile :: Model -> Ptr CChar -> Ptr CChar -> CInt -> IO CInt +modelApplyLoraFromFile :: Model -> Ptr CChar -> CFloat -> Ptr CChar -> CInt -> IO CInt modelApplyLoraFromFile = {# call model_apply_lora_from_file #} @@ -444,7 +533,7 @@ saveSessionFile = {# call save_session_file #} -- int n_past, -- int n_threads); -- -eval :: Context -> Ptr Token -> CInt -> CInt -> CInt -> IO CInt +eval :: Context -> Ptr CInt -> CInt -> CInt -> IO CInt eval = {# call eval #} @@ -457,7 +546,7 @@ eval = {# call eval #} -- int n_past, -- int n_threads); -- -evalEmbd :: Context -> Ptr CFloat -> CInt -> CInt -> CInt -> IO CInt +evalEmbd :: Context -> Ptr CFloat -> CInt -> CInt -> IO CInt evalEmbd = {# call eval_embd #} @@ -468,8 +557,8 @@ evalEmbd = {# call eval_embd #} -- // IMPORTANT: do not use for anything else other than debugging and testing! 
-- LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname); -- -evalExport :: Context -> Ptr CChar -> IO CInt -evalExport = {# call eval_export #} +-- evalExport :: Context -> Ptr CChar -> IO CInt +-- evalExport = {# call eval_export #} -- @@ -485,7 +574,14 @@ evalExport = {# call eval_export #} -- int n_max_tokens, -- bool add_bos); -- -tokenize :: Context -> Ptr CChar -> Ptr Token -> CInt -> CUChar -> IO CInt +tokenize :: Model + -> Ptr CChar + -> CInt + -> Ptr CInt + -> CInt + -> CUChar + -> CUChar + -> IO CInt tokenize = {# call tokenize #} -- LLAMA_API int llama_tokenize_with_model( @@ -494,14 +590,14 @@ tokenize = {# call tokenize #} -- llama_token * tokens, -- int n_max_tokens, -- bool add_bos); -tokenizeWithModel :: Model -> Ptr CChar -> Ptr Token -> CInt -> CUChar -> IO CInt -tokenizeWithModel = {# call tokenize_with_model #} +-- tokenizeWithModel :: Model -> Ptr CChar -> Ptr Token -> CInt -> CUChar -> IO CInt +-- tokenizeWithModel = {# call tokenize_with_model #} -- -- LLAMA_API int llama_n_vocab(const struct llama_context * ctx); -- -nVocab :: Context -> IO CInt +nVocab :: Model -> IO CInt nVocab = {# call n_vocab #} -- @@ -513,28 +609,28 @@ nCtx = {# call n_ctx #} -- -- LLAMA_API int llama_n_embd (const struct llama_context * ctx); -- -nEmbd :: Context -> IO CInt +nEmbd :: Model -> IO CInt nEmbd = {# call n_embd #} -- -- LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model); -- -nVocabFromModel :: Model -> IO CInt -nVocabFromModel = {# call n_vocab_from_model #} +-- nVocabFromModel :: Model -> IO CInt +-- nVocabFromModel = {# call n_vocab_from_model #} -- -- LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model); -- -nCtxFromModel :: Model -> IO CInt -nCtxFromModel = {# call n_ctx_from_model #} +-- nCtxFromModel :: Model -> IO CInt +-- nCtxFromModel = {# call n_ctx_from_model #} -- -- LLAMA_API int llama_n_embd_from_model (const struct llama_model * model); -- -nEmbdFromModel :: Model -> IO CInt -nEmbdFromModel = {# call n_embd_from_model #} +-- nEmbdFromModel :: Model -> IO CInt +-- nEmbdFromModel = {# call n_embd_from_model #} -- @@ -546,8 +642,8 @@ nEmbdFromModel = {# call n_embd_from_model #} -- float * scores, -- int capacity); -- -getVocab :: Context -> Ptr (Ptr CChar) -> Ptr CFloat -> CInt -> IO CInt -getVocab = {# call get_vocab #} +-- getVocab :: Context -> Ptr (Ptr CChar) -> Ptr CFloat -> CInt -> IO CInt +-- getVocab = {# call get_vocab #} -- @@ -571,19 +667,30 @@ getEmbeddings :: Context -> IO (Ptr CFloat) getEmbeddings = {# call llama_get_embeddings #} +tokenToPiece :: Model -> Token -> IO String +tokenToPiece m t = do + allocaBytes 8 $ \buf -> do + nTokens <- fromIntegral <$> tokenToPiece' m t buf 8 + if nTokens >= 0 + then peekCStringLen (buf, nTokens) + else allocaBytes nTokens $ \buf' -> do + nTokens' <- fromIntegral <$> tokenToPiece' m t buf' (fromIntegral (negate nTokens)) + assert (nTokens==nTokens') (pure ()) + peekCStringLen (buf', fromIntegral nTokens) + -- -- // Token Id -> String. 
Uses the vocabulary in the provided context -- LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); -- -tokenToStr :: Context -> Token -> IO (Ptr CChar) -tokenToStr = {# call token_to_str #} +tokenToPiece' :: Model -> Token -> Ptr CChar -> CInt -> IO CInt +tokenToPiece' = {# call token_to_piece #} -- LLAMA_API const char * llama_token_to_str_with_model( -- const struct llama_model * model, -- llama_token token); -tokenToStrWithModel :: Model -> Token -> IO (Ptr CChar) -tokenToStrWithModel = {# call token_to_str_with_model #} +-- tokenToStrWithModel :: Model -> Token -> IO (Ptr CChar) +-- tokenToStrWithModel = {# call token_to_str_with_model #} -- // Special tokens @@ -591,21 +698,21 @@ tokenToStrWithModel = {# call token_to_str_with_model #} -- -- LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence -- -tokenBos :: IO Token +tokenBos :: Model -> IO Token tokenBos = {# call token_bos #} -- -- LLAMA_API llama_token llama_token_eos(); // end-of-sentence -- -tokenEos :: IO Token +tokenEos :: Model -> IO Token tokenEos = {# call token_eos #} -- -- LLAMA_API llama_token llama_token_nl(); // next-line -- -tokenNl :: IO Token +tokenNl :: Model -> IO Token tokenNl = {# call token_nl #} @@ -613,19 +720,19 @@ tokenNl = {# call token_nl #} -- -- /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. --- LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty); +-- LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty, float penalty_freq); -- -sampleRepetitionPenalty :: Context -> Ptr TokenDataArray -> Ptr Token -> CULong -> CFloat -> IO () -sampleRepetitionPenalty = {# call sample_repetition_penalty #} +sampleRepetitionPenalties :: Context -> Ptr TokenDataArray -> Ptr Token -> CULong -> CFloat -> CFloat -> CFloat -> IO () +sampleRepetitionPenalties = {# call sample_repetition_penalties #} -- -- /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. -- LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); -- -sampleFrequencyAndPresencePenalties - :: Context -> Ptr TokenDataArray -> Ptr Token -> CULong -> CFloat -> CFloat -> IO () -sampleFrequencyAndPresencePenalties = {# call sample_frequency_and_presence_penalties #} +-- sampleFrequencyAndPresencePenalties +-- :: Context -> Ptr TokenDataArray -> Ptr Token -> CULong -> CFloat -> CFloat -> IO () +-- sampleFrequencyAndPresencePenalties = {# call sample_frequency_and_presence_penalties #} -- /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 From dd9867320b90aa868a6519b1af8ed10b8ff482af Mon Sep 17 00:00:00 2001 From: Ashton Ohms Date: Thu, 21 Dec 2023 02:03:39 -0800 Subject: [PATCH 2/9] It works! 
Stops + leaks mem after ~2doz tokens though --- cbits/wrapper.c | 7 +++++++ cbits/wrapper.h | 6 ++++++ examples/Main.hs | 26 ++++++++++++++++---------- src/LLaMACPP.chs | 6 +++--- 4 files changed, 32 insertions(+), 13 deletions(-) diff --git a/cbits/wrapper.c b/cbits/wrapper.c index 0ff94d5..55f6cdd 100644 --- a/cbits/wrapper.c +++ b/cbits/wrapper.c @@ -7,6 +7,13 @@ struct llama_model * wrapper_load_model_from_file return llama_load_model_from_file(path_model, *params); } +struct llama_context * wrapper_new_context_with_model( + struct llama_model * model, + struct llama_context_params * params + ) { + return llama_new_context_with_model(model, *params); +} + void wrapper_context_default_params(struct llama_context_params * default_params) { *default_params = llama_context_default_params(); } diff --git a/cbits/wrapper.h b/cbits/wrapper.h index e118d74..9482ecb 100644 --- a/cbits/wrapper.h +++ b/cbits/wrapper.h @@ -6,6 +6,12 @@ struct llama_model * wrapper_load_model_from_file( struct llama_model_params * params ); + +struct llama_context * wrapper_new_context_with_model( + struct llama_model * model, + struct llama_context_params * params + ); + void wrapper_context_default_params(struct llama_context_params *); void wrapper_model_default_params(struct llama_model_params *); diff --git a/examples/Main.hs b/examples/Main.hs index b294291..400fe7e 100644 --- a/examples/Main.hs +++ b/examples/Main.hs @@ -1,7 +1,8 @@ module Main where import Prelude hiding (takeWhile) - +import System.Exit +import System.IO import Control.Applicative ((<**>)) import Control.Exception (bracket) import Control.Monad.IO.Class (MonadIO, liftIO) @@ -196,7 +197,9 @@ main = do -- I feel like this is not the right way to do this? runEffect $ for (sample >-> takeWhile (/= eos)) $ \id' -> - lift . liftIO $ putStr =<< L.tokenToPiece model id' + lift . 
liftIO $ do + putStr =<< L.tokenToPiece model id' + hFlush stdout sample :: Producer L.Token ContextM () @@ -322,15 +325,18 @@ main = do putStrLn "\ninit context params" L.contextDefaultParams cpp L.modelDefaultParams mpp - --ctxParams' <- peek cpp - - --let - -- ctxParams = ctxParams' - -- -- { L._nCtx = _nCtx params' - -- -- , L._nGpuLayers = _nGpuLayers params' - -- -- } + ctxParams' <- peek cpp - --poke cpp ctxParams + let + ctxParams = ctxParams' + { L._nCtx = 2048 + , L._nBatch = 2048 + , L._seed = 1234 + --, L._nThreads = 2 + --, L._nThreadsBatch = 2 + } + + poke cpp ctxParams putStrLn "\nloading model" diff --git a/src/LLaMACPP.chs b/src/LLaMACPP.chs index c0a6ab2..f93c7e1 100644 --- a/src/LLaMACPP.chs +++ b/src/LLaMACPP.chs @@ -130,6 +130,7 @@ data ModelParams = ModelParams , _mainGpu :: Word32 } -- TODO, implement storable +{# pointer *model_params as ModelParamsPtr -> ModelParams #} instance Storable ModelParams where sizeOf _ = {# sizeof model_params #} @@ -204,10 +205,9 @@ data ContextParams = ContextParams --, _useMlock :: Bool --, _embedding :: Bool } - -- deriving (Eq, Show) -- needs ProgressCallback instance + deriving (Eq, Show) -- needs ProgressCallback instance {# pointer *context_params as ContextParamsPtr -> ContextParams #} -{# pointer *model_params as ModelParamsPtr -> ModelParams #} instance Storable ContextParams where sizeOf _ = {# sizeof context_params #} @@ -428,7 +428,7 @@ freeModel = {# call free_model #} newContextWithModel :: Model -> ContextParamsPtr -> IO (Context) newContextWithModel model ctxParamsPtr = - {# call new_context_with_model #} model (castPtr ctxParamsPtr) + {# call wrapper_new_context_with_model #} model (castPtr ctxParamsPtr) -- From ca940083f4d15fa1772e31797cbfbd360ec57952 Mon Sep 17 00:00:00 2001 From: Ashton Ohms Date: Thu, 21 Dec 2023 14:57:12 -0800 Subject: [PATCH 3/9] Fixed mem leak in tokenToPiece --- examples/Main.hs | 6 +++--- src/LLaMACPP.chs | 30 +++++++++++++++++++++--------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/examples/Main.hs b/examples/Main.hs index 400fe7e..02b09d6 100644 --- a/examples/Main.hs +++ b/examples/Main.hs @@ -329,9 +329,9 @@ main = do let ctxParams = ctxParams' - { L._nCtx = 2048 - , L._nBatch = 2048 - , L._seed = 1234 + { L._seed = 1234 + --, L._nCtx = 2048 + --, L._nBatch = 2048 --, L._nThreads = 2 --, L._nThreadsBatch = 2 } diff --git a/src/LLaMACPP.chs b/src/LLaMACPP.chs index f93c7e1..41a1546 100644 --- a/src/LLaMACPP.chs +++ b/src/LLaMACPP.chs @@ -1,6 +1,7 @@ module LLaMACPP where +import Data.Either (fromRight) import Control.Exception (assert) import Data.Word (Word32) import Foreign @@ -668,15 +669,26 @@ getEmbeddings = {# call llama_get_embeddings #} tokenToPiece :: Model -> Token -> IO String -tokenToPiece m t = do - allocaBytes 8 $ \buf -> do - nTokens <- fromIntegral <$> tokenToPiece' m t buf 8 - if nTokens >= 0 - then peekCStringLen (buf, nTokens) - else allocaBytes nTokens $ \buf' -> do - nTokens' <- fromIntegral <$> tokenToPiece' m t buf' (fromIntegral (negate nTokens)) - assert (nTokens==nTokens') (pure ()) - peekCStringLen (buf', fromIntegral nTokens) +tokenToPiece m t = + let alloc' size = allocaBytes size $ \buf -> do + nTokens <- fromIntegral <$> tokenToPiece' m t buf (fromIntegral size) + if nTokens >= 0 + then Right <$> peekCStringLen (buf, nTokens) + else pure $ Left (negate nTokens) + in do + r <- alloc' 8 + either (\nTokens -> do + r' <- alloc' nTokens + pure $ fromRight "??" r' + ) (pure . 
id) r + + + + + --else allocaBytes nTokens $ \buf' -> do + -- nTokens' <- fromIntegral <$> tokenToPiece' m t buf' (fromIntegral (negate nTokens)) + -- assert (nTokens==nTokens') (pure ()) + -- peekCStringLen (buf', fromIntegral nTokens) -- -- // Token Id -> String. Uses the vocabulary in the provided context From 4021e9871777903a840d1c5332a9d2114be7cd32 Mon Sep 17 00:00:00 2001 From: Ashton Ohms Date: Thu, 21 Dec 2023 15:08:36 -0800 Subject: [PATCH 4/9] Populate ModelParams --- src/LLaMACPP.chs | 53 ++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/src/LLaMACPP.chs b/src/LLaMACPP.chs index 41a1546..7f826ec 100644 --- a/src/LLaMACPP.chs +++ b/src/LLaMACPP.chs @@ -107,6 +107,13 @@ instance Storable TokenDataArray where {# set token_data_array->size #} p $ fromIntegral _size {# set token_data_array->sorted #} p _sorted +-- +-- typedef void (*llama_progress_callback)(float progress, void *ctx); +-- +type ProgressCallback = CFloat -> Ptr () -> IO () + +{# pointer progress_callback as ProgressCallbackPtr -> ProgressCallback #} + -- struct llama_model_params { -- int32_t n_gpu_layers; // number of layers to store in VRAM -- int32_t main_gpu; // the GPU that is used for scratch and small tensors @@ -129,8 +136,16 @@ instance Storable TokenDataArray where data ModelParams = ModelParams { _nGpuLayers :: Word32 , _mainGpu :: Word32 + , _tensorSplit :: Ptr CFloat + , _progressCallback :: FunPtr ProgressCallback + , _progressCallbackUserData :: Ptr () + , _kvOverrides :: Ptr () + , _vocabOnly :: Bool + , _useMmap :: Bool + , _useMlock :: Bool } --- TODO, implement storable + deriving (Show, Eq) + {# pointer *model_params as ModelParamsPtr -> ModelParams #} instance Storable ModelParams where @@ -139,17 +154,17 @@ instance Storable ModelParams where peek p = ModelParams <$> (fromIntegral <$> {# get model_params->n_gpu_layers #} p) <*> (fromIntegral <$> {# get model_params->main_gpu #} p) + <*> ({# get model_params->tensor_split #} p) + <*> ({# get model_params->progress_callback #} p) + <*> ({# get model_params->progress_callback_user_data #} p) + <*> ({# get model_params->kv_overrides #} p) + <*> ({# get model_params->vocab_only #} p) + <*> ({# get model_params->use_mmap #} p) + <*> ({# get model_params->use_mlock #} p) poke p mps = do {# set model_params->n_gpu_layers #} p $ fromIntegral $ _nGpuLayers mps {# set model_params->main_gpu #} p $ fromIntegral $ _mainGpu mps --- --- typedef void (*llama_progress_callback)(float progress, void *ctx); --- -type ProgressCallback = CFloat -> Ptr () -> IO () - -{# pointer progress_callback as ProgressCallbackPtr -> ProgressCallback #} - -- --struct llama_context_params { -- uint32_t seed; // RNG seed, -1 for random @@ -193,20 +208,8 @@ data ContextParams = ContextParams , _logitsAll :: Bool , _embedding :: Bool , _offloadKqv :: Bool - --, _nGpuLayers :: Int - --, _mainGpu :: Int - --, _tensorSplit :: Ptr CFloat - --, _progressCallback :: FunPtr ProgressCallback - --, _progressCallbackUserData :: Ptr () - --, _lowVRAM :: Bool - --, _f16KV :: Bool - --, _logitsAll :: Bool - --, _vocabOnly :: Bool - --, _useMmap :: Bool - --, _useMlock :: Bool - --, _embedding :: Bool } - deriving (Eq, Show) -- needs ProgressCallback instance + deriving (Eq, Show) {# pointer *context_params as ContextParamsPtr -> ContextParams #} @@ -682,14 +685,6 @@ tokenToPiece m t = pure $ fromRight "??" r' ) (pure . 
id) r - - - - --else allocaBytes nTokens $ \buf' -> do - -- nTokens' <- fromIntegral <$> tokenToPiece' m t buf' (fromIntegral (negate nTokens)) - -- assert (nTokens==nTokens') (pure ()) - -- peekCStringLen (buf', fromIntegral nTokens) - -- -- // Token Id -> String. Uses the vocabulary in the provided context -- LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); From 9409bbdaeb3a686a8f541795ccf484edb1af5614 Mon Sep 17 00:00:00 2001 From: Ashton Ohms Date: Thu, 21 Dec 2023 15:25:02 -0800 Subject: [PATCH 5/9] Cleanup, update doc comments --- src/LLaMACPP.chs | 118 ++++++++++------------------------------------- 1 file changed, 25 insertions(+), 93 deletions(-) diff --git a/src/LLaMACPP.chs b/src/LLaMACPP.chs index 7f826ec..0363f99 100644 --- a/src/LLaMACPP.chs +++ b/src/LLaMACPP.chs @@ -167,26 +167,31 @@ instance Storable ModelParams where -- --struct llama_context_params { --- uint32_t seed; // RNG seed, -1 for random --- int32_t n_ctx; // text context --- int32_t n_batch; // prompt processing batch size --- int32_t n_gpu_layers; // number of layers to store in VRAM --- int32_t main_gpu; // the GPU that is used for scratch and small tensors --- float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs --- // called with a progress value between 0 and 1, pass NULL to disable --- llama_progress_callback progress_callback; --- // context pointer passed to the progress callback --- void * progress_callback_user_data; --- --- // Keep the booleans together to avoid misalignment during copy-by-value. --- bool low_vram; // if true, reduce VRAM usage at the cost of performance --- bool f16_kv; // use fp16 for KV cache --- bool logits_all; // the llama_eval() call computes all logits, not just the last one --- bool vocab_only; // only load the vocabulary, no weights --- bool use_mmap; // use mmap if possible --- bool use_mlock; // force system to keep model in RAM --- bool embedding; // embedding mode only --- }; +-- uint32_t seed; // RNG seed, -1 for random +-- uint32_t n_ctx; // text context, 0 = from model +-- uint32_t n_batch; // prompt processing maximum batch size +-- uint32_t n_threads; // number of threads to use for generation +-- uint32_t n_threads_batch; // number of threads to use for batch processing +-- int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` +-- +-- // ref: https://github.com/ggerganov/llama.cpp/pull/2054 +-- float rope_freq_base; // RoPE base frequency, 0 = from model +-- float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model +-- float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model +-- float yarn_attn_factor; // YaRN magnitude scaling factor +-- float yarn_beta_fast; // YaRN low correction dim +-- float yarn_beta_slow; // YaRN high correction dim +-- uint32_t yarn_orig_ctx; // YaRN original context size +-- +-- enum ggml_type type_k; // data type for K cache +-- enum ggml_type type_v; // data type for V cache +-- +-- // Keep the booleans together to avoid misalignment during copy-by-value. 
+-- bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) +-- bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) +-- bool embedding; // embedding mode only +-- bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU +-- }; -- data ContextParams = ContextParams { _seed :: Word32 @@ -553,18 +558,6 @@ eval = {# call eval #} evalEmbd :: Context -> Ptr CFloat -> CInt -> CInt -> IO CInt evalEmbd = {# call eval_embd #} - --- --- // Export a static computation graph for context of 511 and batch size of 1 --- // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these --- // parameters here to keep things simple --- // IMPORTANT: do not use for anything else other than debugging and testing! --- LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname); --- --- evalExport :: Context -> Ptr CChar -> IO CInt --- evalExport = {# call eval_export #} - - -- -- // Convert the provided text into tokens. -- // The tokens pointer must be large enough to hold the resulting tokens. @@ -588,16 +581,6 @@ tokenize :: Model -> IO CInt tokenize = {# call tokenize #} --- LLAMA_API int llama_tokenize_with_model( --- const struct llama_model * model, --- const char * text, --- llama_token * tokens, --- int n_max_tokens, --- bool add_bos); --- tokenizeWithModel :: Model -> Ptr CChar -> Ptr Token -> CInt -> CUChar -> IO CInt --- tokenizeWithModel = {# call tokenize_with_model #} - - -- -- LLAMA_API int llama_n_vocab(const struct llama_context * ctx); -- @@ -616,40 +599,6 @@ nCtx = {# call n_ctx #} nEmbd :: Model -> IO CInt nEmbd = {# call n_embd #} - --- --- LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model); --- --- nVocabFromModel :: Model -> IO CInt --- nVocabFromModel = {# call n_vocab_from_model #} - --- --- LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model); --- --- nCtxFromModel :: Model -> IO CInt --- nCtxFromModel = {# call n_ctx_from_model #} - - --- --- LLAMA_API int llama_n_embd_from_model (const struct llama_model * model); --- --- nEmbdFromModel :: Model -> IO CInt --- nEmbdFromModel = {# call n_embd_from_model #} - - --- --- // Get the vocabulary as output parameters. --- // Returns number of results. 
--- LLAMA_API int llama_get_vocab( --- const struct llama_context * ctx, --- const char * * strings, --- float * scores, --- int capacity); --- --- getVocab :: Context -> Ptr (Ptr CChar) -> Ptr CFloat -> CInt -> IO CInt --- getVocab = {# call get_vocab #} - - -- -- // Token logits obtained from the last call to llama_eval() -- // The logits for the last token are stored in the last row @@ -661,7 +610,6 @@ nEmbd = {# call n_embd #} getLogits :: Context -> IO (Ptr CFloat) getLogits = {# call llama_get_logits #} - -- -- // Get the embeddings for the input -- // shape: [n_embd] (1-dimensional) @@ -693,13 +641,6 @@ tokenToPiece' :: Model -> Token -> Ptr CChar -> CInt -> IO CInt tokenToPiece' = {# call token_to_piece #} --- LLAMA_API const char * llama_token_to_str_with_model( --- const struct llama_model * model, --- llama_token token); --- tokenToStrWithModel :: Model -> Token -> IO (Ptr CChar) --- tokenToStrWithModel = {# call token_to_str_with_model #} - - -- // Special tokens -- @@ -733,15 +674,6 @@ sampleRepetitionPenalties :: Context -> Ptr TokenDataArray -> Ptr Token -> CULon sampleRepetitionPenalties = {# call sample_repetition_penalties #} --- --- /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. --- LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); --- --- sampleFrequencyAndPresencePenalties --- :: Context -> Ptr TokenDataArray -> Ptr Token -> CULong -> CFloat -> CFloat -> IO () --- sampleFrequencyAndPresencePenalties = {# call sample_frequency_and_presence_penalties #} - - -- /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 -- /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. -- /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. 
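A note on the wrapper layer these patches keep extending: GHC's C FFI cannot pass or return structs by value, which is why every llama.cpp entry point that takes or returns a llama_*_params struct goes through an out-parameter shim in cbits/wrapper.c. A minimal sketch of the Haskell side of that pattern, written as a plain foreign import rather than c2hs for illustration (ContextParams and its Storable instance are the ones defined above; c2hs generates an equivalent import for {# call wrapper_context_default_params #}):

    import Foreign.Marshal.Alloc (alloca)
    import Foreign.Ptr (Ptr)
    import Foreign.Storable (peek)

    foreign import ccall "wrapper_context_default_params"
      c_contextDefaultParams :: Ptr ContextParams -> IO ()

    -- The C shim writes the by-value result of
    -- llama_context_default_params() through the pointer; the
    -- Storable instance then marshals it into a Haskell value.
    defaultContextParams :: IO ContextParams
    defaultContextParams = alloca $ \p -> do
      c_contextDefaultParams p
      peek p

The same shape covers wrapper_model_default_params and, going the other direction, wrapper_load_model_from_file and wrapper_new_context_with_model, which dereference the pointer and hand the struct by value to llama.cpp.
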
From 1883545933a29d1e58e03860698f379af28791c6 Mon Sep 17 00:00:00 2001 From: Ashton Ohms Date: Thu, 21 Dec 2023 15:42:05 -0800 Subject: [PATCH 6/9] Refactor tokenToPiece as tokenToString in Main --- examples/Main.hs | 35 ++++++++++++++++++++++++++--------- src/LLaMACPP.chs | 29 ++++++----------------------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/examples/Main.hs b/examples/Main.hs index 02b09d6..b1d098b 100644 --- a/examples/Main.hs +++ b/examples/Main.hs @@ -1,15 +1,15 @@ module Main where import Prelude hiding (takeWhile) -import System.Exit -import System.IO import Control.Applicative ((<**>)) import Control.Exception (bracket) import Control.Monad.IO.Class (MonadIO, liftIO) import Control.Monad.Reader (MonadReader, ReaderT, runReaderT, asks) +import Data.Either (fromRight) import Data.Functor ((<&>), void) import qualified Data.Vector.Storable as V -import Foreign.C.String (peekCString, withCString, withCStringLen) +import Foreign (allocaBytes) +import Foreign.C.String (peekCString, peekCStringLen, withCString, withCStringLen) import Foreign.C.Types (CFloat, CInt) import Foreign.ForeignPtr (newForeignPtr_) import Foreign.Marshal.Alloc (alloca, free, malloc) @@ -40,6 +40,7 @@ import Options.Applicative ) import Pipes (Producer, (>->), for, lift, runEffect, yield) import Pipes.Prelude (takeWhile) +import System.IO (hFlush, stdout) data Params = Params { _nCtx :: Int @@ -119,6 +120,24 @@ runContextM :: Context -> ContextM () -> IO () runContextM ctx (ContextM ctxM) = runReaderT ctxM ctx +tokenToString :: L.Model -> L.Token -> IO String +tokenToString m t = + let + -- Returns (Right tokenString) if successful, otherwise + -- (Left len), where len is correct token size to allocate + tokenBufToStr size = allocaBytes size $ \buf -> do + nTokens <- fromIntegral <$> L.tokenToPiece m t buf (fromIntegral size) + if nTokens >= 0 + then Right <$> peekCStringLen (buf, nTokens) + -- tokenToPiece will return the token size, negated, + -- iff the size we allocate is too small. + else pure $ Left (negate nTokens) + in do + r <- tokenBufToStr 8 + either (\nTokens -> do + r' <- tokenBufToStr nTokens + pure $ fromRight (error "token allocation failed") r' + ) (pure . id) r -- -- mimicking call here, somewhat: https://huggingface.co/TheBloke/open-llama-7B-v2-open-instruct-GGML#how-to-run-in-llamacpp @@ -156,7 +175,7 @@ main = do liftIO . allocaArray 1 $ \(tmp :: Ptr L.Token) -> do bos <- L.tokenBos model pokeArray tmp [bos] - _evalRes <- L.eval ctx tmp 1 0 --(_nThreads $ params') + _evalRes <- L.eval ctx tmp 1 0 L.resetTimings ctx let @@ -174,7 +193,7 @@ main = do putStrLn $ "\nTokenized " <> show tokenizedCount' putStrLn "\nRunning first eval of entire prompt" - _evalRes <- L.eval ctx tokensPtr tokenizedCount' 0 --(_nThreads params') + _evalRes <- L.eval ctx tokensPtr tokenizedCount' 0 (, tokenizedCount') <$> peekArray (fromIntegral tokenizedCount') tokensPtr @@ -198,7 +217,7 @@ main = do runEffect $ for (sample >-> takeWhile (/= eos)) $ \id' -> lift . liftIO $ do - putStr =<< L.tokenToPiece model id' + putStr =<< tokenToString model id' hFlush stdout @@ -217,7 +236,7 @@ main = do id' <- liftIO $ sample' model params' ctx nVocab lastNTokensTV void . liftIO . withArray [id'] $ \newTokenArrPtr -> - L.eval ctx newTokenArrPtr 1 (fromIntegral nPast') -- (_nThreads $ params') + L.eval ctx newTokenArrPtr 1 (fromIntegral nPast') liftIO . 
atomically $ do writeTVar nPastTV $ @@ -226,8 +245,6 @@ main = do else nPast' + 1 writeTVar lastNTokensTV $ drop 1 lastNTokens' <> [id'] - -- liftIO $ putStr =<< peekCString =<< L.tokenToStr ctx id' - yield id' *> sample diff --git a/src/LLaMACPP.chs b/src/LLaMACPP.chs index 0363f99..ee6834b 100644 --- a/src/LLaMACPP.chs +++ b/src/LLaMACPP.chs @@ -1,11 +1,8 @@ module LLaMACPP where -import Data.Either (fromRight) import Control.Exception (assert) -import Data.Word (Word32) -import Foreign -import Foreign.C.String (peekCStringLen) +import Data.Word (Word8, Word32) import Foreign.C.Types (CChar, CDouble, CFloat, CInt, CLong, CSize, CUChar, CULong) import Foreign.Marshal.Utils (fromBool, toBool) import Foreign.Ptr (FunPtr, Ptr, castPtr) @@ -619,46 +616,32 @@ getEmbeddings :: Context -> IO (Ptr CFloat) getEmbeddings = {# call llama_get_embeddings #} -tokenToPiece :: Model -> Token -> IO String -tokenToPiece m t = - let alloc' size = allocaBytes size $ \buf -> do - nTokens <- fromIntegral <$> tokenToPiece' m t buf (fromIntegral size) - if nTokens >= 0 - then Right <$> peekCStringLen (buf, nTokens) - else pure $ Left (negate nTokens) - in do - r <- alloc' 8 - either (\nTokens -> do - r' <- alloc' nTokens - pure $ fromRight "??" r' - ) (pure . id) r - -- -- // Token Id -> String. Uses the vocabulary in the provided context -- LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); -- -tokenToPiece' :: Model -> Token -> Ptr CChar -> CInt -> IO CInt -tokenToPiece' = {# call token_to_piece #} +tokenToPiece :: Model -> Token -> Ptr CChar -> CInt -> IO CInt +tokenToPiece = {# call token_to_piece #} -- // Special tokens -- --- LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence +-- LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence -- tokenBos :: Model -> IO Token tokenBos = {# call token_bos #} -- --- LLAMA_API llama_token llama_token_eos(); // end-of-sentence +-- LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence -- tokenEos :: Model -> IO Token tokenEos = {# call token_eos #} -- --- LLAMA_API llama_token llama_token_nl(); // next-line +-- LLAMA_API llama_token llama_token_nl(const struct llama_model * model); // next-line -- tokenNl :: Model -> IO Token tokenNl = {# call token_nl #} From ce4f92f7f64fd57094dedb1e6e770045be1372ce Mon Sep 17 00:00:00 2001 From: Ashton Ohms Date: Thu, 21 Dec 2023 15:44:11 -0800 Subject: [PATCH 7/9] Re-add test suite to cabal --- llama-cpp-bindings.cabal | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llama-cpp-bindings.cabal b/llama-cpp-bindings.cabal index fd2ea27..77f50c9 100644 --- a/llama-cpp-bindings.cabal +++ b/llama-cpp-bindings.cabal @@ -64,10 +64,10 @@ executable examples main-is: Main.hs default-language: GHC2021 --- test-suite llama-cpp-bindings-test --- import: shared --- build-depends: llama-cpp-bindings --- hs-source-dirs: test --- main-is: Main.hs --- default-language: GHC2021 --- type: exitcode-stdio-1.0 +test-suite llama-cpp-bindings-test + import: shared + build-depends: llama-cpp-bindings + hs-source-dirs: test + main-is: Main.hs + default-language: GHC2021 + type: exitcode-stdio-1.0 From ea9d8512fc593f4e13a2116c43d561c3e3da86cb Mon Sep 17 00:00:00 2001 From: Ashton Ohms Date: Thu, 21 Dec 2023 15:46:04 -0800 Subject: [PATCH 8/9] Remove sampleFrequencyAndPresencePenalties (no longer exists) --- examples/Main.hs | 8 -------- 1 file changed, 8 deletions(-) diff --git 
a/examples/Main.hs b/examples/Main.hs index b1d098b..3a7a6a9 100644 --- a/examples/Main.hs +++ b/examples/Main.hs @@ -294,14 +294,6 @@ main = do L.sampleRepetitionPenalties ctx candidatesPPtr lastNTokensPtr lastNTokensLen (_repeatPenalty params') (1.0) (1.0) - --L.sampleFrequencyAndPresencePenalties - -- ctx - -- candidatesPPtr - -- lastNTokensPtr - -- lastNTokensLen - -- (_alphaFrequency params') - -- (_alphaPresence params') - -- todo -- if (!penalize_nl) { -- logits[llama_token_nl()] = nl_logit; From cdb3e391c77e2318c823f6d7a7aa810bca616575 Mon Sep 17 00:00:00 2001 From: Ashton Ohms Date: Thu, 21 Dec 2023 16:21:09 -0800 Subject: [PATCH 9/9] Fixing params --- examples/Main.hs | 21 ++++++++++++--------- src/LLaMACPP.chs | 4 ++-- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/examples/Main.hs b/examples/Main.hs index 3a7a6a9..aa7e4c5 100644 --- a/examples/Main.hs +++ b/examples/Main.hs @@ -44,7 +44,7 @@ import System.IO (hFlush, stdout) data Params = Params { _nCtx :: Int - , _nThreads :: CInt + , _nThreads :: Int , _nPredict :: Int , _nGpuLayers :: Int , _enableNumaOpts :: Bool @@ -329,22 +329,25 @@ main = do -- todo -- putStrLn "\ndefault model quantize params" - cpp <- malloc + putStrLn "\ninit model & context params" mpp <- malloc - putStrLn "\ninit context params" - L.contextDefaultParams cpp L.modelDefaultParams mpp + modelParams' <- peek mpp + + cpp <- malloc + L.contextDefaultParams cpp ctxParams' <- peek cpp let + modelParams = modelParams' + { L._nGpuLayers = _nGpuLayers params' + } ctxParams = ctxParams' - { L._seed = 1234 - --, L._nCtx = 2048 - --, L._nBatch = 2048 - --, L._nThreads = 2 - --, L._nThreadsBatch = 2 + { L._nCtx = _nCtx params' + , L._nThreads = _nThreads params' } + poke mpp modelParams poke cpp ctxParams putStrLn "\nloading model" diff --git a/src/LLaMACPP.chs b/src/LLaMACPP.chs index ee6834b..d440750 100644 --- a/src/LLaMACPP.chs +++ b/src/LLaMACPP.chs @@ -131,7 +131,7 @@ type ProgressCallback = CFloat -> Ptr () -> IO () -- bool use_mlock; // force system to keep model in RAM -- }; data ModelParams = ModelParams - { _nGpuLayers :: Word32 + { _nGpuLayers :: Int , _mainGpu :: Word32 , _tensorSplit :: Ptr CFloat , _progressCallback :: FunPtr ProgressCallback @@ -194,7 +194,7 @@ data ContextParams = ContextParams { _seed :: Word32 , _nCtx :: Int , _nBatch :: Int - , _nThreads :: Word32 + , _nThreads :: Int , _nThreadsBatch :: Word32 , _ropeScalingType :: Word8 , _ropeFreqBase :: CFloat
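With patch 9 applied, model setup and context setup are fully separated. A hedged, minimal end-to-end sketch of the resulting flow (names as defined in src/LLaMACPP.chs and mirroring examples/Main.hs; the path argument and the parameter values are placeholders, and error handling plus freeing of the model and context are omitted):

    import Foreign.C.String (withCString)
    import Foreign.Marshal.Alloc (free, malloc)
    import Foreign.Storable (peek, poke)
    import qualified LLaMACPP as L

    initModelAndContext :: FilePath -> IO (L.Model, L.Context)
    initModelAndContext path = do
      -- model params: defaults from the C wrapper, then adjusted
      mpp <- malloc
      L.modelDefaultParams mpp
      mp <- peek mpp
      poke mpp mp { L._nGpuLayers = 0 }                -- placeholder value

      -- context params: same out-parameter pattern
      cpp <- malloc
      L.contextDefaultParams cpp
      cp <- peek cpp
      poke cpp cp { L._nCtx = 2048, L._nThreads = 4 }  -- placeholder values

      model <- withCString path $ \cpath -> L.loadModelFromFile cpath mpp
      ctx   <- L.newContextWithModel model cpp

      free mpp
      free cpp
      pure (model, ctx)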