Updating to latest llama.cpp #2

Open · wants to merge 9 commits into main

13 changes: 12 additions & 1 deletion cbits/wrapper.c
@@ -3,14 +3,25 @@
#include "wrapper.h"

struct llama_model * wrapper_load_model_from_file
( const char * path_model, struct llama_context_params * params) {
( const char * path_model, struct llama_model_params * params) {
return llama_load_model_from_file(path_model, *params);
}

struct llama_context * wrapper_new_context_with_model(
struct llama_model * model,
struct llama_context_params * params
) {
return llama_new_context_with_model(model, *params);
}

void wrapper_context_default_params(struct llama_context_params * default_params) {
*default_params = llama_context_default_params();
}

void wrapper_model_default_params(struct llama_model_params * default_params) {
*default_params = llama_model_default_params();
}

void wrapper_model_quantize_default_params(struct llama_model_quantize_params * default_params) {
*default_params = llama_model_quantize_default_params();
}
10 changes: 9 additions & 1 deletion cbits/wrapper.h
@@ -3,11 +3,19 @@

struct llama_model * wrapper_load_model_from_file(
const char * path_model,
struct llama_context_params * params
struct llama_model_params * params
);


struct llama_context * wrapper_new_context_with_model(
struct llama_model * model,
struct llama_context_params * params
);

void wrapper_context_default_params(struct llama_context_params *);

void wrapper_model_default_params(struct llama_model_params *);

void wrapper_model_quantize_default_params(struct llama_model_quantize_params *);

void wrapper_get_timings(struct llama_context * ctx, struct llama_timings * timings);
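
These shims exist because GHC's FFI cannot pass or return C structs by value, so each llama.cpp entry point that takes llama_model_params or llama_context_params by value gets a by-pointer wrapper. A minimal sketch of how the Haskell side might import them (the module name, the opaque handle types, and the constructor-less param types below are illustrative assumptions, not part of this PR):

{-# LANGUAGE ForeignFunctionInterface #-}
-- Hypothetical binding module; names are assumptions for illustration.
module LlamaWrapperSketch where

import Foreign.C.String (CString)
import Foreign.Ptr (Ptr)

-- Opaque handles returned by llama.cpp.
data LlamaModel
data LlamaContext

-- Param structs; the real binding would give these Storable instances
-- whose layout matches llama.h.
data LlamaModelParams
data LlamaContextParams

-- Out-parameter shims: the C wrapper fills in the struct behind the pointer.
foreign import ccall "wrapper_model_default_params"
  c_modelDefaultParams :: Ptr LlamaModelParams -> IO ()

foreign import ccall "wrapper_context_default_params"
  c_contextDefaultParams :: Ptr LlamaContextParams -> IO ()

-- Params are taken by pointer and dereferenced inside the wrapper.
foreign import ccall "wrapper_load_model_from_file"
  c_loadModelFromFile :: CString -> Ptr LlamaModelParams -> IO (Ptr LlamaModel)

foreign import ccall "wrapper_new_context_with_model"
  c_newContextWithModel :: Ptr LlamaModel -> Ptr LlamaContextParams -> IO (Ptr LlamaContext)
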
86 changes: 54 additions & 32 deletions examples/Main.hs
@@ -1,14 +1,15 @@
module Main where

import Prelude hiding (takeWhile)

import Control.Applicative ((<**>))
import Control.Exception (bracket)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Control.Monad.Reader (MonadReader, ReaderT, runReaderT, asks)
import Data.Either (fromRight)
import Data.Functor ((<&>), void)
import qualified Data.Vector.Storable as V
import Foreign.C.String (peekCString, withCString)
import Foreign (allocaBytes)
import Foreign.C.String (peekCString, peekCStringLen, withCString, withCStringLen)
import Foreign.C.Types (CFloat, CInt)
import Foreign.ForeignPtr (newForeignPtr_)
import Foreign.Marshal.Alloc (alloca, free, malloc)
@@ -39,10 +40,11 @@ import Options.Applicative
)
import Pipes (Producer, (>->), for, lift, runEffect, yield)
import Pipes.Prelude (takeWhile)
import System.IO (hFlush, stdout)

data Params = Params
{ _nCtx :: Int
, _nThreads :: CInt
, _nThreads :: Int
, _nPredict :: Int
, _nGpuLayers :: Int
, _enableNumaOpts :: Bool
@@ -118,6 +120,24 @@ runContextM :: Context -> ContextM () -> IO ()
runContextM ctx (ContextM ctxM) = runReaderT ctxM ctx


tokenToString :: L.Model -> L.Token -> IO String
tokenToString m t =
let
-- Returns (Right tokenString) if successful, otherwise
-- (Left len), where len is the correct buffer size to allocate
tokenBufToStr size = allocaBytes size $ \buf -> do
nTokens <- fromIntegral <$> L.tokenToPiece m t buf (fromIntegral size)
if nTokens >= 0
then Right <$> peekCStringLen (buf, nTokens)
-- tokenToPiece will return the token size, negated,
-- iff the size we allocate is too small.
else pure $ Left (negate nTokens)
in do
r <- tokenBufToStr 8
either (\nTokens -> do
r' <- tokenBufToStr nTokens
pure $ fromRight (error "token allocation failed") r'
) (pure . id) r
--

-- mimicking call here, somewhat: https://huggingface.co/TheBloke/open-llama-7B-v2-open-instruct-GGML#how-to-run-in-llamacpp
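
The tokenToString helper above makes at most two passes: tokenToPiece reports the required size (negated) whenever the initial 8-byte buffer is too small. A hypothetical use of it, not part of this PR, would be reconstructing a full string from a token list:

-- Hypothetical helper: detokenize a list of tokens by concatenating
-- the per-token pieces. Assumes the same imports as this module.
detokenize :: L.Model -> [L.Token] -> IO String
detokenize m ts = concat <$> mapM (tokenToString m) ts
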
@@ -147,32 +167,33 @@ main = do

params' <- asks _params
ctx <- asks _llamaCtx
model <- asks _model

-- tokenizing & eval

-- "do one empty run to warm up the model"
liftIO . allocaArray 1 $ \(tmp :: Ptr L.Token) -> do
bos <- L.tokenBos
bos <- L.tokenBos model
pokeArray tmp [bos]
_evalRes <- L.eval ctx tmp 1 0 (_nThreads $ params')
_evalRes <- L.eval ctx tmp 1 0
L.resetTimings ctx

let
-- todo why is this constant here
maxTokens = 1024
tokenize s tks addBos =
L.tokenize ctx s tks (fromIntegral maxTokens) (fromBool addBos)
tokenize s l tks addBos =
L.tokenize model s (fromIntegral l) tks (fromIntegral maxTokens) (fromBool addBos) (fromBool False)

(tokenized, tokenizedCount) <- liftIO . allocaArray maxTokens $ \tokensPtr -> do
tokenizedCount' <- withCString (_prompt params') $ \ts -> tokenize ts tokensPtr True
tokenizedCount' <- withCStringLen (_prompt params') $ \(ts, l) -> tokenize ts l tokensPtr True

putStrLn "\nPrompt"
putStrLn $ _prompt params' <> "\n\n"

putStrLn $ "\nTokenized " <> show tokenizedCount'

putStrLn "\nRunning first eval of entire prompt"
_evalRes <- L.eval ctx tokensPtr tokenizedCount' 0 (_nThreads params')
_evalRes <- L.eval ctx tokensPtr tokenizedCount' 0

(, tokenizedCount') <$>
peekArray (fromIntegral tokenizedCount') tokensPtr
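
The updated tokenize call takes the prompt's byte length explicitly plus a trailing flag for parsing special tokens. Factored out of the allocaArray plumbing above, the same call might look like the sketch below (tokenizePrompt is a hypothetical helper; the argument order mirrors the diff):

-- Hypothetical helper, not part of this PR: tokenize a prompt into a
-- temporary buffer of at most maxTokens tokens and return them as a list.
tokenizePrompt :: L.Model -> String -> Int -> IO [L.Token]
tokenizePrompt model prompt maxTokens =
  allocaArray maxTokens $ \tokensPtr ->
    withCStringLen prompt $ \(ts, l) -> do
      n <- L.tokenize model ts (fromIntegral l) tokensPtr
             (fromIntegral maxTokens) (fromBool True) (fromBool False)
      peekArray (fromIntegral n) tokensPtr
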
@@ -190,29 +211,32 @@

liftIO . putStrLn $ "\nsampling"

eos <- liftIO L.tokenEos
eos <- liftIO $ L.tokenEos model

-- I feel like this is not the right way to do this?
runEffect $
for (sample >-> takeWhile (/= eos)) $ \id' ->
lift . liftIO $ putStr =<< peekCString =<< L.tokenToStr ctx id'
lift . liftIO $ do
putStr =<< tokenToString model id'
hFlush stdout


sample :: Producer L.Token ContextM ()
sample = do
params' <- asks _params
ctx <- asks _llamaCtx
model <- asks _model
nPastTV <- asks _nPast
nPast' <- liftIO . readTVarIO $ nPastTV
lastNTokensTV <- asks _lastNTokens
lastNTokens' <- liftIO . readTVarIO $ lastNTokensTV

nVocab <- fromIntegral <$> (liftIO . L.nVocab $ ctx)
nVocab <- fromIntegral <$> (liftIO . L.nVocab $ model)

id' <- liftIO $ sample' params' ctx nVocab lastNTokensTV
id' <- liftIO $ sample' model params' ctx nVocab lastNTokensTV

void . liftIO . withArray [id'] $ \newTokenArrPtr ->
L.eval ctx newTokenArrPtr 1 (fromIntegral nPast') (_nThreads $ params')
L.eval ctx newTokenArrPtr 1 (fromIntegral nPast')

liftIO . atomically $ do
writeTVar nPastTV $
Expand All @@ -221,13 +245,11 @@ main = do
else nPast' + 1
writeTVar lastNTokensTV $ drop 1 lastNTokens' <> [id']

-- liftIO $ putStr =<< peekCString =<< L.tokenToStr ctx id'

yield id' *> sample


sample' :: Params -> L.Context -> Int -> TVar [L.Token] -> IO L.Token
sample' params' ctx nVocab lastNTokensTV = do
sample' :: L.Model -> Params -> L.Context -> Int -> TVar [L.Token] -> IO L.Token
sample' model params' ctx nVocab lastNTokensTV = do
alloca $ \candidatesPPtr ->
allocaArray nVocab $ \logitsCopyPtr -> do

@@ -262,23 +284,15 @@ main = do
lastNTokensLen = fromIntegral . length $ lastNTokens'

-- float nl_logit = logits[llama_token_nl()];
_nlLogit <- V.unsafeIndex logitsCopy . fromIntegral <$> L.tokenNl
_nlLogit <- V.unsafeIndex logitsCopy . fromIntegral <$> (L.tokenNl model)

--
-- lastNTokensPtr should be a pointer just to the last
-- set of tokens at the end of lastNTokens matching the
-- count of lastNRepeat, right now it's the entire thing
--
L.sampleRepetitionPenalty
ctx candidatesPPtr lastNTokensPtr lastNTokensLen (_repeatPenalty params')

L.sampleFrequencyAndPresencePenalties
ctx
candidatesPPtr
lastNTokensPtr
lastNTokensLen
(_alphaFrequency params')
(_alphaPresence params')
L.sampleRepetitionPenalties
ctx candidatesPPtr lastNTokensPtr lastNTokensLen (_repeatPenalty params') (1.0) (1.0)

-- todo
-- if (!penalize_nl) {
@@ -315,22 +329,30 @@ main = do
-- todo
-- putStrLn "\ndefault model quantize params"

putStrLn "\ninit model & context params"
mpp <- malloc
L.modelDefaultParams mpp
modelParams' <- peek mpp

cpp <- malloc
putStrLn "\ninit context params"
L.contextDefaultParams cpp
ctxParams' <- peek cpp

let
modelParams = modelParams'
{ L._nGpuLayers = _nGpuLayers params'
}
ctxParams = ctxParams'
{ L._nCtx = _nCtx params'
, L._nGpuLayers = _nGpuLayers params'
, L._nThreads = _nThreads params'
}

poke mpp modelParams
poke cpp ctxParams

putStrLn "\nloading model"

model' <- withCString (_modelPath params') $ flip L.loadModelFromFile cpp
model' <- withCString (_modelPath params') $ flip L.loadModelFromFile mpp

putStrLn "\nloading context"

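
This hunk tracks upstream llama.cpp splitting llama_context_params into separate model and context parameter structs: the GPU layer count moves to the model params, while context size and thread count stay on the context params. Condensed into one function (the name initFromPath and the L.newContextWithModel call are assumptions based on the C wrapper above; everything else appears in this diff):

-- A sketch of the post-update init order; error handling omitted.
initFromPath :: FilePath -> Params -> IO (L.Model, L.Context)
initFromPath path params' = do
  -- Model params: _nGpuLayers now lives here.
  mpp <- malloc
  L.modelDefaultParams mpp
  mp  <- peek mpp
  poke mpp mp { L._nGpuLayers = _nGpuLayers params' }

  -- Context params: context size and thread count.
  cpp <- malloc
  L.contextDefaultParams cpp
  cp  <- peek cpp
  poke cpp cp { L._nCtx = _nCtx params', L._nThreads = _nThreads params' }

  model <- withCString path $ flip L.loadModelFromFile mpp
  -- Assumed name for the binding over wrapper_new_context_with_model.
  ctx   <- L.newContextWithModel model cpp

  free mpp
  free cpp
  pure (model, ctx)
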