From 2ebdf63539ef19ae7c0a5c9240c6fdf0341c6f35 Mon Sep 17 00:00:00 2001 From: Ashton Ohms Date: Wed, 20 Dec 2023 21:51:13 -0800 Subject: [PATCH 1/9] [WIP] Update for latest llama.cpp --- cbits/wrapper.c | 6 +- cbits/wrapper.h | 4 +- examples/Main.hs | 68 +++++------ llama-cpp-bindings.cabal | 14 +-- src/LLaMACPP.chs | 243 ++++++++++++++++++++++++++++----------- 5 files changed, 226 insertions(+), 109 deletions(-) diff --git a/cbits/wrapper.c b/cbits/wrapper.c index 5b1bb98..0ff94d5 100644 --- a/cbits/wrapper.c +++ b/cbits/wrapper.c @@ -3,7 +3,7 @@ #include "wrapper.h" struct llama_model * wrapper_load_model_from_file -( const char * path_model, struct llama_context_params * params) { +( const char * path_model, struct llama_model_params * params) { return llama_load_model_from_file(path_model, *params); } @@ -11,6 +11,10 @@ void wrapper_context_default_params(struct llama_context_params * default_params *default_params = llama_context_default_params(); } +void wrapper_model_default_params(struct llama_model_params * default_params) { + *default_params = llama_model_default_params(); +} + void wrapper_model_quantize_default_params(struct llama_model_quantize_params * default_params) { *default_params = llama_model_quantize_default_params(); } diff --git a/cbits/wrapper.h b/cbits/wrapper.h index b94ebfc..e118d74 100644 --- a/cbits/wrapper.h +++ b/cbits/wrapper.h @@ -3,11 +3,13 @@ struct llama_model * wrapper_load_model_from_file( const char * path_model, - struct llama_context_params * params + struct llama_model_params * params ); void wrapper_context_default_params(struct llama_context_params *); +void wrapper_model_default_params(struct llama_model_params *); + void wrapper_model_quantize_default_params(struct llama_model_quantize_params *); void wrapper_get_timings(struct llama_context * ctx, struct llama_timings * timings); diff --git a/examples/Main.hs b/examples/Main.hs index a498d62..b294291 100644 --- a/examples/Main.hs +++ b/examples/Main.hs @@ -8,7 +8,7 @@ import Control.Monad.IO.Class (MonadIO, liftIO) import Control.Monad.Reader (MonadReader, ReaderT, runReaderT, asks) import Data.Functor ((<&>), void) import qualified Data.Vector.Storable as V -import Foreign.C.String (peekCString, withCString) +import Foreign.C.String (peekCString, withCString, withCStringLen) import Foreign.C.Types (CFloat, CInt) import Foreign.ForeignPtr (newForeignPtr_) import Foreign.Marshal.Alloc (alloca, free, malloc) @@ -147,24 +147,25 @@ main = do params' <- asks _params ctx <- asks _llamaCtx + model <- asks _model -- tokenizing & eval -- "do one empty run to warm up the model" liftIO . allocaArray 1 $ \(tmp :: Ptr L.Token) -> do - bos <- L.tokenBos + bos <- L.tokenBos model pokeArray tmp [bos] - _evalRes <- L.eval ctx tmp 1 0 (_nThreads $ params') + _evalRes <- L.eval ctx tmp 1 0 --(_nThreads $ params') L.resetTimings ctx let -- todo why is this constant here maxTokens = 1024 - tokenize s tks addBos = - L.tokenize ctx s tks (fromIntegral maxTokens) (fromBool addBos) + tokenize s l tks addBos = + L.tokenize model s (fromIntegral l) tks (fromIntegral maxTokens) (fromBool addBos) (fromBool False) (tokenized, tokenizedCount) <- liftIO . 
allocaArray maxTokens $ \tokensPtr -> do - tokenizedCount' <- withCString (_prompt params') $ \ts -> tokenize ts tokensPtr True + tokenizedCount' <- withCStringLen (_prompt params') $ \(ts, l) -> tokenize ts l tokensPtr True putStrLn "\nPrompt" putStrLn $ _prompt params' <> "\n\n" @@ -172,7 +173,7 @@ main = do putStrLn $ "\nTokenized " <> show tokenizedCount' putStrLn "\nRunning first eval of entire prompt" - _evalRes <- L.eval ctx tokensPtr tokenizedCount' 0 (_nThreads params') + _evalRes <- L.eval ctx tokensPtr tokenizedCount' 0 --(_nThreads params') (, tokenizedCount') <$> peekArray (fromIntegral tokenizedCount') tokensPtr @@ -190,29 +191,30 @@ main = do liftIO . putStrLn $ "\nsampling" - eos <- liftIO L.tokenEos + eos <- liftIO $ L.tokenEos model -- I feel like this is not the right way to do this? runEffect $ for (sample >-> takeWhile (/= eos)) $ \id' -> - lift . liftIO $ putStr =<< peekCString =<< L.tokenToStr ctx id' + lift . liftIO $ putStr =<< L.tokenToPiece model id' sample :: Producer L.Token ContextM () sample = do params' <- asks _params ctx <- asks _llamaCtx + model <- asks _model nPastTV <- asks _nPast nPast' <- liftIO . readTVarIO $ nPastTV lastNTokensTV <- asks _lastNTokens lastNTokens' <- liftIO . readTVarIO $ lastNTokensTV - nVocab <- fromIntegral <$> (liftIO . L.nVocab $ ctx) + nVocab <- fromIntegral <$> (liftIO . L.nVocab $ model) - id' <- liftIO $ sample' params' ctx nVocab lastNTokensTV + id' <- liftIO $ sample' model params' ctx nVocab lastNTokensTV void . liftIO . withArray [id'] $ \newTokenArrPtr -> - L.eval ctx newTokenArrPtr 1 (fromIntegral nPast') (_nThreads $ params') + L.eval ctx newTokenArrPtr 1 (fromIntegral nPast') -- (_nThreads $ params') liftIO . atomically $ do writeTVar nPastTV $ @@ -226,8 +228,8 @@ main = do yield id' *> sample - sample' :: Params -> L.Context -> Int -> TVar [L.Token] -> IO L.Token - sample' params' ctx nVocab lastNTokensTV = do + sample' :: L.Model -> Params -> L.Context -> Int -> TVar [L.Token] -> IO L.Token + sample' model params' ctx nVocab lastNTokensTV = do alloca $ \candidatesPPtr -> allocaArray nVocab $ \logitsCopyPtr -> do @@ -262,23 +264,23 @@ main = do lastNTokensLen = fromIntegral . length $ lastNTokens' -- float nl_logit = logits[llama_token_nl()]; - _nlLogit <- V.unsafeIndex logitsCopy . fromIntegral <$> L.tokenNl + _nlLogit <- V.unsafeIndex logitsCopy . 
fromIntegral <$> (L.tokenNl model) -- -- lastNTokensPtr should be a pointer just to the last -- set of tokens at the end of lastNTokens matching the -- count of lastNRepeat, right now it's the entire thing -- - L.sampleRepetitionPenalty - ctx candidatesPPtr lastNTokensPtr lastNTokensLen (_repeatPenalty params') + L.sampleRepetitionPenalties + ctx candidatesPPtr lastNTokensPtr lastNTokensLen (_repeatPenalty params') (1.0) (1.0) - L.sampleFrequencyAndPresencePenalties - ctx - candidatesPPtr - lastNTokensPtr - lastNTokensLen - (_alphaFrequency params') - (_alphaPresence params') + --L.sampleFrequencyAndPresencePenalties + -- ctx + -- candidatesPPtr + -- lastNTokensPtr + -- lastNTokensLen + -- (_alphaFrequency params') + -- (_alphaPresence params') -- todo -- if (!penalize_nl) { @@ -316,21 +318,23 @@ main = do -- putStrLn "\ndefault model quantize params" cpp <- malloc + mpp <- malloc putStrLn "\ninit context params" L.contextDefaultParams cpp - ctxParams' <- peek cpp + L.modelDefaultParams mpp + --ctxParams' <- peek cpp - let - ctxParams = ctxParams' - { L._nCtx = _nCtx params' - , L._nGpuLayers = _nGpuLayers params' - } + --let + -- ctxParams = ctxParams' + -- -- { L._nCtx = _nCtx params' + -- -- , L._nGpuLayers = _nGpuLayers params' + -- -- } - poke cpp ctxParams + --poke cpp ctxParams putStrLn "\nloading model" - model' <- withCString (_modelPath params') $ flip L.loadModelFromFile cpp + model' <- withCString (_modelPath params') $ flip L.loadModelFromFile mpp putStrLn "\nloading context" diff --git a/llama-cpp-bindings.cabal b/llama-cpp-bindings.cabal index 77f50c9..fd2ea27 100644 --- a/llama-cpp-bindings.cabal +++ b/llama-cpp-bindings.cabal @@ -64,10 +64,10 @@ executable examples main-is: Main.hs default-language: GHC2021 -test-suite llama-cpp-bindings-test - import: shared - build-depends: llama-cpp-bindings - hs-source-dirs: test - main-is: Main.hs - default-language: GHC2021 - type: exitcode-stdio-1.0 +-- test-suite llama-cpp-bindings-test +-- import: shared +-- build-depends: llama-cpp-bindings +-- hs-source-dirs: test +-- main-is: Main.hs +-- default-language: GHC2021 +-- type: exitcode-stdio-1.0 diff --git a/src/LLaMACPP.chs b/src/LLaMACPP.chs index d6d07f4..c0a6ab2 100644 --- a/src/LLaMACPP.chs +++ b/src/LLaMACPP.chs @@ -1,7 +1,10 @@ module LLaMACPP where +import Control.Exception (assert) import Data.Word (Word32) +import Foreign +import Foreign.C.String (peekCStringLen) import Foreign.C.Types (CChar, CDouble, CFloat, CInt, CLong, CSize, CUChar, CULong) import Foreign.Marshal.Utils (fromBool, toBool) import Foreign.Ptr (FunPtr, Ptr, castPtr) @@ -103,6 +106,40 @@ instance Storable TokenDataArray where {# set token_data_array->size #} p $ fromIntegral _size {# set token_data_array->sorted #} p _sorted +-- struct llama_model_params { +-- int32_t n_gpu_layers; // number of layers to store in VRAM +-- int32_t main_gpu; // the GPU that is used for scratch and small tensors +-- const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) +-- +-- // called with a progress value between 0 and 1, pass NULL to disable +-- llama_progress_callback progress_callback; +-- +-- // context pointer passed to the progress callback +-- void * progress_callback_user_data; +-- +-- // override key-value pairs of the model meta data +-- const struct llama_model_kv_override * kv_overrides; +-- +-- // Keep the booleans together to avoid misalignment during copy-by-value. 
+-- bool vocab_only; // only load the vocabulary, no weights +-- bool use_mmap; // use mmap if possible +-- bool use_mlock; // force system to keep model in RAM +-- }; +data ModelParams = ModelParams + { _nGpuLayers :: Word32 + , _mainGpu :: Word32 + } +-- TODO, implement storable + +instance Storable ModelParams where + sizeOf _ = {# sizeof model_params #} + alignment _ = {# alignof model_params #} + peek p = ModelParams + <$> (fromIntegral <$> {# get model_params->n_gpu_layers #} p) + <*> (fromIntegral <$> {# get model_params->main_gpu #} p) + poke p mps = do + {# set model_params->n_gpu_layers #} p $ fromIntegral $ _nGpuLayers mps + {# set model_params->main_gpu #} p $ fromIntegral $ _mainGpu mps -- -- typedef void (*llama_progress_callback)(float progress, void *ctx); @@ -138,22 +175,39 @@ data ContextParams = ContextParams { _seed :: Word32 , _nCtx :: Int , _nBatch :: Int - , _nGpuLayers :: Int - , _mainGpu :: Int - , _tensorSplit :: Ptr CFloat - , _progressCallback :: FunPtr ProgressCallback - , _progressCallbackUserData :: Ptr () - , _lowVRAM :: Bool - , _f16KV :: Bool + , _nThreads :: Word32 + , _nThreadsBatch :: Word32 + , _ropeScalingType :: Word8 + , _ropeFreqBase :: CFloat + , _ropeFreqScale :: CFloat + , _yarnExtFactor :: CFloat + , _yarnAttnFactor :: CFloat + , _yarnBetaFast :: CFloat + , _yarnBetaSlow :: CFloat + , _yarnOrigCtx :: Word32 + , _typeK :: CInt + , _typeV :: CInt + , _mulMatQ :: Bool , _logitsAll :: Bool - , _vocabOnly :: Bool - , _useMmap :: Bool - , _useMlock :: Bool , _embedding :: Bool + , _offloadKqv :: Bool + --, _nGpuLayers :: Int + --, _mainGpu :: Int + --, _tensorSplit :: Ptr CFloat + --, _progressCallback :: FunPtr ProgressCallback + --, _progressCallbackUserData :: Ptr () + --, _lowVRAM :: Bool + --, _f16KV :: Bool + --, _logitsAll :: Bool + --, _vocabOnly :: Bool + --, _useMmap :: Bool + --, _useMlock :: Bool + --, _embedding :: Bool } -- deriving (Eq, Show) -- needs ProgressCallback instance {# pointer *context_params as ContextParamsPtr -> ContextParams #} +{# pointer *model_params as ModelParamsPtr -> ModelParams #} instance Storable ContextParams where sizeOf _ = {# sizeof context_params #} @@ -162,34 +216,66 @@ instance Storable ContextParams where <$> (fromIntegral <$> {# get context_params->seed #} p) <*> (fromIntegral <$> {# get context_params->n_ctx #} p) <*> (fromIntegral <$> {# get context_params->n_batch #} p) - <*> (fromIntegral <$> {# get context_params->n_gpu_layers #} p) - <*> (fromIntegral <$> {# get context_params->main_gpu #} p) - <*> {# get context_params->tensor_split #} p - <*> {# get context_params->progress_callback #} p - <*> {# get context_params->progress_callback_user_data #} p - <*> {# get context_params->low_vram #} p - <*> {# get context_params->f16_kv #} p - <*> {# get context_params->logits_all #} p - <*> {# get context_params->vocab_only #} p - <*> {# get context_params->use_mmap #} p - <*> {# get context_params->use_mlock #} p - <*> {# get context_params->embedding #} p + <*> (fromIntegral <$> {# get context_params->n_threads #} p) + <*> (fromIntegral <$> {# get context_params->n_threads_batch #} p) + <*> (fromIntegral <$> {# get context_params->rope_scaling_type #} p) + <*> {# get context_params->rope_freq_base #} p + <*> {# get context_params->rope_freq_scale #} p + <*> {# get context_params->yarn_ext_factor #} p + <*> {# get context_params->yarn_attn_factor #} p + <*> {# get context_params->yarn_beta_fast #} p + <*> {# get context_params->yarn_beta_slow #} p + <*> (fromIntegral <$> {# get 
context_params->yarn_orig_ctx #} p) + <*> {# get context_params->type_k #} p + <*> {# get context_params->type_k #} p + <*> {# get context_params->mul_mat_q #} p + <*> {# get context_params->logits_all #} p + <*> {# get context_params->embedding #} p + <*> {# get context_params->offload_kqv #} p + -- <*> (fromIntegral <$> {# get context_params->n_gpu_layers #} p) + -- <*> (fromIntegral <$> {# get context_params->main_gpu #} p) + -- <*> {# get context_params->tensor_split #} p + -- <*> {# get context_params->progress_callback #} p + -- <*> {# get context_params->progress_callback_user_data #} p + -- <*> {# get context_params->low_vram #} p + -- <*> {# get context_params->f16_kv #} p + -- <*> {# get context_params->logits_all #} p + -- <*> {# get context_params->vocab_only #} p + -- <*> {# get context_params->use_mmap #} p + -- <*> {# get context_params->use_mlock #} p + -- <*> {# get context_params->embedding #} p poke p cps = do {# set context_params->seed #} p $ fromIntegral $ _seed cps {# set context_params->n_ctx #} p $ fromIntegral $ _nCtx cps {# set context_params->n_batch #} p $ fromIntegral $ _nBatch cps - {# set context_params->n_gpu_layers #} p $ fromIntegral $ _nGpuLayers cps - {# set context_params->main_gpu #} p $ fromIntegral $ _mainGpu cps - {# set context_params->tensor_split #} p $ _tensorSplit cps - {# set context_params->progress_callback #} p $ _progressCallback cps - {# set context_params->progress_callback_user_data #} p $ _progressCallbackUserData cps - {# set context_params->low_vram #} p $ _lowVRAM cps - {# set context_params->f16_kv #} p $ _f16KV cps - {# set context_params->logits_all #} p $ _logitsAll cps - {# set context_params->vocab_only #} p $ _vocabOnly cps - {# set context_params->use_mmap #} p $ _useMmap cps - {# set context_params->use_mlock #} p $ _useMlock cps - {# set context_params->embedding #} p $ _embedding cps + {# set context_params-> n_threads #} p $ fromIntegral $ _nThreads cps + {# set context_params-> n_threads_batch #} p $ fromIntegral $ _nThreadsBatch cps + {# set context_params-> rope_scaling_type #} p $ fromIntegral $ _ropeScalingType cps + {# set context_params-> rope_freq_base #} p $ _ropeFreqBase cps + {# set context_params-> rope_freq_scale #} p $ _ropeFreqScale cps + {# set context_params-> yarn_ext_factor #} p $ _yarnExtFactor cps + {# set context_params-> yarn_attn_factor #} p $ _yarnAttnFactor cps + {# set context_params-> yarn_beta_fast #} p $ _yarnBetaFast cps + {# set context_params-> yarn_beta_slow #} p $ _yarnBetaSlow cps + {# set context_params-> yarn_orig_ctx #} p $ fromIntegral $ _yarnOrigCtx cps + {# set context_params-> type_k #} p $ _typeK cps + {# set context_params-> type_v #} p $ _typeV cps + {# set context_params-> mul_mat_q #} p $ _mulMatQ cps + {# set context_params-> logits_all #} p $ _logitsAll cps + {# set context_params-> embedding #} p $ _embedding cps + {# set context_params-> offload_kqv #} p $ _offloadKqv cps + -- {# set context_params->n_gpu_layers #} p $ fromIntegral $ _nGpuLayers cps + -- {# set context_params->main_gpu #} p $ fromIntegral $ _mainGpu cps + -- {# set context_params->tensor_split #} p $ _tensorSplit cps + -- {# set context_params->progress_callback #} p $ _progressCallback cps + -- {# set context_params->progress_callback_user_data #} p $ _progressCallbackUserData cps + -- {# set context_params->low_vram #} p $ _lowVRAM cps + -- {# set context_params->f16_kv #} p $ _f16KV cps + -- {# set context_params->logits_all #} p $ _logitsAll cps + -- {# set context_params->vocab_only #} p $ 
_vocabOnly cps + -- {# set context_params->use_mmap #} p $ _useMmap cps + -- {# set context_params->use_mlock #} p $ _useMlock cps + -- {# set context_params->embedding #} p $ _embedding cps -- @@ -266,6 +352,9 @@ maxDevices = {# call max_devices #} contextDefaultParams :: ContextParamsPtr -> IO () contextDefaultParams = {# call wrapper_context_default_params #} +modelDefaultParams :: ModelParamsPtr -> IO () +modelDefaultParams = {# call wrapper_model_default_params #} + -- -- LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(); @@ -319,9 +408,9 @@ timeUs = {# call time_us #} -- -- Can't pass structs by value via FFI, so wrote a wrapper: -- -loadModelFromFile :: Ptr CChar -> ContextParamsPtr -> IO Model -loadModelFromFile modelPath ctxParamsPtr = - {# call wrapper_load_model_from_file #} modelPath (castPtr ctxParamsPtr) +loadModelFromFile :: Ptr CChar -> ModelParamsPtr -> IO Model +loadModelFromFile modelPath modelParamsPtr = + {# call wrapper_load_model_from_file #} modelPath (castPtr modelParamsPtr) -- @@ -369,7 +458,7 @@ modelQuantize = {# call model_quantize #} -- const char * path_base_model, -- int n_threads); -- -modelApplyLoraFromFile :: Model -> Ptr CChar -> Ptr CChar -> CInt -> IO CInt +modelApplyLoraFromFile :: Model -> Ptr CChar -> CFloat -> Ptr CChar -> CInt -> IO CInt modelApplyLoraFromFile = {# call model_apply_lora_from_file #} @@ -444,7 +533,7 @@ saveSessionFile = {# call save_session_file #} -- int n_past, -- int n_threads); -- -eval :: Context -> Ptr Token -> CInt -> CInt -> CInt -> IO CInt +eval :: Context -> Ptr CInt -> CInt -> CInt -> IO CInt eval = {# call eval #} @@ -457,7 +546,7 @@ eval = {# call eval #} -- int n_past, -- int n_threads); -- -evalEmbd :: Context -> Ptr CFloat -> CInt -> CInt -> CInt -> IO CInt +evalEmbd :: Context -> Ptr CFloat -> CInt -> CInt -> IO CInt evalEmbd = {# call eval_embd #} @@ -468,8 +557,8 @@ evalEmbd = {# call eval_embd #} -- // IMPORTANT: do not use for anything else other than debugging and testing! 
-- LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname); -- -evalExport :: Context -> Ptr CChar -> IO CInt -evalExport = {# call eval_export #} +-- evalExport :: Context -> Ptr CChar -> IO CInt +-- evalExport = {# call eval_export #} -- @@ -485,7 +574,14 @@ evalExport = {# call eval_export #} -- int n_max_tokens, -- bool add_bos); -- -tokenize :: Context -> Ptr CChar -> Ptr Token -> CInt -> CUChar -> IO CInt +tokenize :: Model + -> Ptr CChar + -> CInt + -> Ptr CInt + -> CInt + -> CUChar + -> CUChar + -> IO CInt tokenize = {# call tokenize #} -- LLAMA_API int llama_tokenize_with_model( @@ -494,14 +590,14 @@ tokenize = {# call tokenize #} -- llama_token * tokens, -- int n_max_tokens, -- bool add_bos); -tokenizeWithModel :: Model -> Ptr CChar -> Ptr Token -> CInt -> CUChar -> IO CInt -tokenizeWithModel = {# call tokenize_with_model #} +-- tokenizeWithModel :: Model -> Ptr CChar -> Ptr Token -> CInt -> CUChar -> IO CInt +-- tokenizeWithModel = {# call tokenize_with_model #} -- -- LLAMA_API int llama_n_vocab(const struct llama_context * ctx); -- -nVocab :: Context -> IO CInt +nVocab :: Model -> IO CInt nVocab = {# call n_vocab #} -- @@ -513,28 +609,28 @@ nCtx = {# call n_ctx #} -- -- LLAMA_API int llama_n_embd (const struct llama_context * ctx); -- -nEmbd :: Context -> IO CInt +nEmbd :: Model -> IO CInt nEmbd = {# call n_embd #} -- -- LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model); -- -nVocabFromModel :: Model -> IO CInt -nVocabFromModel = {# call n_vocab_from_model #} +-- nVocabFromModel :: Model -> IO CInt +-- nVocabFromModel = {# call n_vocab_from_model #} -- -- LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model); -- -nCtxFromModel :: Model -> IO CInt -nCtxFromModel = {# call n_ctx_from_model #} +-- nCtxFromModel :: Model -> IO CInt +-- nCtxFromModel = {# call n_ctx_from_model #} -- -- LLAMA_API int llama_n_embd_from_model (const struct llama_model * model); -- -nEmbdFromModel :: Model -> IO CInt -nEmbdFromModel = {# call n_embd_from_model #} +-- nEmbdFromModel :: Model -> IO CInt +-- nEmbdFromModel = {# call n_embd_from_model #} -- @@ -546,8 +642,8 @@ nEmbdFromModel = {# call n_embd_from_model #} -- float * scores, -- int capacity); -- -getVocab :: Context -> Ptr (Ptr CChar) -> Ptr CFloat -> CInt -> IO CInt -getVocab = {# call get_vocab #} +-- getVocab :: Context -> Ptr (Ptr CChar) -> Ptr CFloat -> CInt -> IO CInt +-- getVocab = {# call get_vocab #} -- @@ -571,19 +667,30 @@ getEmbeddings :: Context -> IO (Ptr CFloat) getEmbeddings = {# call llama_get_embeddings #} +tokenToPiece :: Model -> Token -> IO String +tokenToPiece m t = do + allocaBytes 8 $ \buf -> do + nTokens <- fromIntegral <$> tokenToPiece' m t buf 8 + if nTokens >= 0 + then peekCStringLen (buf, nTokens) + else allocaBytes nTokens $ \buf' -> do + nTokens' <- fromIntegral <$> tokenToPiece' m t buf' (fromIntegral (negate nTokens)) + assert (nTokens==nTokens') (pure ()) + peekCStringLen (buf', fromIntegral nTokens) + -- -- // Token Id -> String. 
Uses the vocabulary in the provided context -- LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); -- -tokenToStr :: Context -> Token -> IO (Ptr CChar) -tokenToStr = {# call token_to_str #} +tokenToPiece' :: Model -> Token -> Ptr CChar -> CInt -> IO CInt +tokenToPiece' = {# call token_to_piece #} -- LLAMA_API const char * llama_token_to_str_with_model( -- const struct llama_model * model, -- llama_token token); -tokenToStrWithModel :: Model -> Token -> IO (Ptr CChar) -tokenToStrWithModel = {# call token_to_str_with_model #} +-- tokenToStrWithModel :: Model -> Token -> IO (Ptr CChar) +-- tokenToStrWithModel = {# call token_to_str_with_model #} -- // Special tokens @@ -591,21 +698,21 @@ tokenToStrWithModel = {# call token_to_str_with_model #} -- -- LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence -- -tokenBos :: IO Token +tokenBos :: Model -> IO Token tokenBos = {# call token_bos #} -- -- LLAMA_API llama_token llama_token_eos(); // end-of-sentence -- -tokenEos :: IO Token +tokenEos :: Model -> IO Token tokenEos = {# call token_eos #} -- -- LLAMA_API llama_token llama_token_nl(); // next-line -- -tokenNl :: IO Token +tokenNl :: Model -> IO Token tokenNl = {# call token_nl #} @@ -613,19 +720,19 @@ tokenNl = {# call token_nl #} -- -- /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. --- LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty); +-- LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty, float penalty_freq); -- -sampleRepetitionPenalty :: Context -> Ptr TokenDataArray -> Ptr Token -> CULong -> CFloat -> IO () -sampleRepetitionPenalty = {# call sample_repetition_penalty #} +sampleRepetitionPenalties :: Context -> Ptr TokenDataArray -> Ptr Token -> CULong -> CFloat -> CFloat -> CFloat -> IO () +sampleRepetitionPenalties = {# call sample_repetition_penalties #} -- -- /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. -- LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); -- -sampleFrequencyAndPresencePenalties - :: Context -> Ptr TokenDataArray -> Ptr Token -> CULong -> CFloat -> CFloat -> IO () -sampleFrequencyAndPresencePenalties = {# call sample_frequency_and_presence_penalties #} +-- sampleFrequencyAndPresencePenalties +-- :: Context -> Ptr TokenDataArray -> Ptr Token -> CULong -> CFloat -> CFloat -> IO () +-- sampleFrequencyAndPresencePenalties = {# call sample_frequency_and_presence_penalties #} -- /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 From dd9867320b90aa868a6519b1af8ed10b8ff482af Mon Sep 17 00:00:00 2001 From: Ashton Ohms Date: Thu, 21 Dec 2023 02:03:39 -0800 Subject: [PATCH 2/9] It works! 
Stops + leaks mem after ~2doz tokens though --- cbits/wrapper.c | 7 +++++++ cbits/wrapper.h | 6 ++++++ examples/Main.hs | 26 ++++++++++++++++---------- src/LLaMACPP.chs | 6 +++--- 4 files changed, 32 insertions(+), 13 deletions(-) diff --git a/cbits/wrapper.c b/cbits/wrapper.c index 0ff94d5..55f6cdd 100644 --- a/cbits/wrapper.c +++ b/cbits/wrapper.c @@ -7,6 +7,13 @@ struct llama_model * wrapper_load_model_from_file return llama_load_model_from_file(path_model, *params); } +struct llama_context * wrapper_new_context_with_model( + struct llama_model * model, + struct llama_context_params * params + ) { + return llama_new_context_with_model(model, *params); +} + void wrapper_context_default_params(struct llama_context_params * default_params) { *default_params = llama_context_default_params(); } diff --git a/cbits/wrapper.h b/cbits/wrapper.h index e118d74..9482ecb 100644 --- a/cbits/wrapper.h +++ b/cbits/wrapper.h @@ -6,6 +6,12 @@ struct llama_model * wrapper_load_model_from_file( struct llama_model_params * params ); + +struct llama_context * wrapper_new_context_with_model( + struct llama_model * model, + struct llama_context_params * params + ); + void wrapper_context_default_params(struct llama_context_params *); void wrapper_model_default_params(struct llama_model_params *); diff --git a/examples/Main.hs b/examples/Main.hs index b294291..400fe7e 100644 --- a/examples/Main.hs +++ b/examples/Main.hs @@ -1,7 +1,8 @@ module Main where import Prelude hiding (takeWhile) - +import System.Exit +import System.IO import Control.Applicative ((<**>)) import Control.Exception (bracket) import Control.Monad.IO.Class (MonadIO, liftIO) @@ -196,7 +197,9 @@ main = do -- I feel like this is not the right way to do this? runEffect $ for (sample >-> takeWhile (/= eos)) $ \id' -> - lift . liftIO $ putStr =<< L.tokenToPiece model id' + lift . 
liftIO $ do + putStr =<< L.tokenToPiece model id' + hFlush stdout sample :: Producer L.Token ContextM () @@ -322,15 +325,18 @@ main = do putStrLn "\ninit context params" L.contextDefaultParams cpp L.modelDefaultParams mpp - --ctxParams' <- peek cpp - - --let - -- ctxParams = ctxParams' - -- -- { L._nCtx = _nCtx params' - -- -- , L._nGpuLayers = _nGpuLayers params' - -- -- } + ctxParams' <- peek cpp - --poke cpp ctxParams + let + ctxParams = ctxParams' + { L._nCtx = 2048 + , L._nBatch = 2048 + , L._seed = 1234 + --, L._nThreads = 2 + --, L._nThreadsBatch = 2 + } + + poke cpp ctxParams putStrLn "\nloading model" diff --git a/src/LLaMACPP.chs b/src/LLaMACPP.chs index c0a6ab2..f93c7e1 100644 --- a/src/LLaMACPP.chs +++ b/src/LLaMACPP.chs @@ -130,6 +130,7 @@ data ModelParams = ModelParams , _mainGpu :: Word32 } -- TODO, implement storable +{# pointer *model_params as ModelParamsPtr -> ModelParams #} instance Storable ModelParams where sizeOf _ = {# sizeof model_params #} @@ -204,10 +205,9 @@ data ContextParams = ContextParams --, _useMlock :: Bool --, _embedding :: Bool } - -- deriving (Eq, Show) -- needs ProgressCallback instance + deriving (Eq, Show) -- needs ProgressCallback instance {# pointer *context_params as ContextParamsPtr -> ContextParams #} -{# pointer *model_params as ModelParamsPtr -> ModelParams #} instance Storable ContextParams where sizeOf _ = {# sizeof context_params #} @@ -428,7 +428,7 @@ freeModel = {# call free_model #} newContextWithModel :: Model -> ContextParamsPtr -> IO (Context) newContextWithModel model ctxParamsPtr = - {# call new_context_with_model #} model (castPtr ctxParamsPtr) + {# call wrapper_new_context_with_model #} model (castPtr ctxParamsPtr) -- From ca940083f4d15fa1772e31797cbfbd360ec57952 Mon Sep 17 00:00:00 2001 From: Ashton Ohms Date: Thu, 21 Dec 2023 14:57:12 -0800 Subject: [PATCH 3/9] Fixed mem leak in tokenToPiece --- examples/Main.hs | 6 +++--- src/LLaMACPP.chs | 30 +++++++++++++++++++++--------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/examples/Main.hs b/examples/Main.hs index 400fe7e..02b09d6 100644 --- a/examples/Main.hs +++ b/examples/Main.hs @@ -329,9 +329,9 @@ main = do let ctxParams = ctxParams' - { L._nCtx = 2048 - , L._nBatch = 2048 - , L._seed = 1234 + { L._seed = 1234 + --, L._nCtx = 2048 + --, L._nBatch = 2048 --, L._nThreads = 2 --, L._nThreadsBatch = 2 } diff --git a/src/LLaMACPP.chs b/src/LLaMACPP.chs index f93c7e1..41a1546 100644 --- a/src/LLaMACPP.chs +++ b/src/LLaMACPP.chs @@ -1,6 +1,7 @@ module LLaMACPP where +import Data.Either (fromRight) import Control.Exception (assert) import Data.Word (Word32) import Foreign @@ -668,15 +669,26 @@ getEmbeddings = {# call llama_get_embeddings #} tokenToPiece :: Model -> Token -> IO String -tokenToPiece m t = do - allocaBytes 8 $ \buf -> do - nTokens <- fromIntegral <$> tokenToPiece' m t buf 8 - if nTokens >= 0 - then peekCStringLen (buf, nTokens) - else allocaBytes nTokens $ \buf' -> do - nTokens' <- fromIntegral <$> tokenToPiece' m t buf' (fromIntegral (negate nTokens)) - assert (nTokens==nTokens') (pure ()) - peekCStringLen (buf', fromIntegral nTokens) +tokenToPiece m t = + let alloc' size = allocaBytes size $ \buf -> do + nTokens <- fromIntegral <$> tokenToPiece' m t buf (fromIntegral size) + if nTokens >= 0 + then Right <$> peekCStringLen (buf, nTokens) + else pure $ Left (negate nTokens) + in do + r <- alloc' 8 + either (\nTokens -> do + r' <- alloc' nTokens + pure $ fromRight "??" r' + ) (pure . 
id) r + + + + + --else allocaBytes nTokens $ \buf' -> do + -- nTokens' <- fromIntegral <$> tokenToPiece' m t buf' (fromIntegral (negate nTokens)) + -- assert (nTokens==nTokens') (pure ()) + -- peekCStringLen (buf', fromIntegral nTokens) -- -- // Token Id -> String. Uses the vocabulary in the provided context From 4021e9871777903a840d1c5332a9d2114be7cd32 Mon Sep 17 00:00:00 2001 From: Ashton Ohms Date: Thu, 21 Dec 2023 15:08:36 -0800 Subject: [PATCH 4/9] Populate ModelParams --- src/LLaMACPP.chs | 53 ++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/src/LLaMACPP.chs b/src/LLaMACPP.chs index 41a1546..7f826ec 100644 --- a/src/LLaMACPP.chs +++ b/src/LLaMACPP.chs @@ -107,6 +107,13 @@ instance Storable TokenDataArray where {# set token_data_array->size #} p $ fromIntegral _size {# set token_data_array->sorted #} p _sorted +-- +-- typedef void (*llama_progress_callback)(float progress, void *ctx); +-- +type ProgressCallback = CFloat -> Ptr () -> IO () + +{# pointer progress_callback as ProgressCallbackPtr -> ProgressCallback #} + -- struct llama_model_params { -- int32_t n_gpu_layers; // number of layers to store in VRAM -- int32_t main_gpu; // the GPU that is used for scratch and small tensors @@ -129,8 +136,16 @@ instance Storable TokenDataArray where data ModelParams = ModelParams { _nGpuLayers :: Word32 , _mainGpu :: Word32 + , _tensorSplit :: Ptr CFloat + , _progressCallback :: FunPtr ProgressCallback + , _progressCallbackUserData :: Ptr () + , _kvOverrides :: Ptr () + , _vocabOnly :: Bool + , _useMmap :: Bool + , _useMlock :: Bool } --- TODO, implement storable + deriving (Show, Eq) + {# pointer *model_params as ModelParamsPtr -> ModelParams #} instance Storable ModelParams where @@ -139,17 +154,17 @@ instance Storable ModelParams where peek p = ModelParams <$> (fromIntegral <$> {# get model_params->n_gpu_layers #} p) <*> (fromIntegral <$> {# get model_params->main_gpu #} p) + <*> ({# get model_params->tensor_split #} p) + <*> ({# get model_params->progress_callback #} p) + <*> ({# get model_params->progress_callback_user_data #} p) + <*> ({# get model_params->kv_overrides #} p) + <*> ({# get model_params->vocab_only #} p) + <*> ({# get model_params->use_mmap #} p) + <*> ({# get model_params->use_mlock #} p) poke p mps = do {# set model_params->n_gpu_layers #} p $ fromIntegral $ _nGpuLayers mps {# set model_params->main_gpu #} p $ fromIntegral $ _mainGpu mps --- --- typedef void (*llama_progress_callback)(float progress, void *ctx); --- -type ProgressCallback = CFloat -> Ptr () -> IO () - -{# pointer progress_callback as ProgressCallbackPtr -> ProgressCallback #} - -- --struct llama_context_params { -- uint32_t seed; // RNG seed, -1 for random @@ -193,20 +208,8 @@ data ContextParams = ContextParams , _logitsAll :: Bool , _embedding :: Bool , _offloadKqv :: Bool - --, _nGpuLayers :: Int - --, _mainGpu :: Int - --, _tensorSplit :: Ptr CFloat - --, _progressCallback :: FunPtr ProgressCallback - --, _progressCallbackUserData :: Ptr () - --, _lowVRAM :: Bool - --, _f16KV :: Bool - --, _logitsAll :: Bool - --, _vocabOnly :: Bool - --, _useMmap :: Bool - --, _useMlock :: Bool - --, _embedding :: Bool } - deriving (Eq, Show) -- needs ProgressCallback instance + deriving (Eq, Show) {# pointer *context_params as ContextParamsPtr -> ContextParams #} @@ -682,14 +685,6 @@ tokenToPiece m t = pure $ fromRight "??" r' ) (pure . 
id) r - - - - --else allocaBytes nTokens $ \buf' -> do - -- nTokens' <- fromIntegral <$> tokenToPiece' m t buf' (fromIntegral (negate nTokens)) - -- assert (nTokens==nTokens') (pure ()) - -- peekCStringLen (buf', fromIntegral nTokens) - -- -- // Token Id -> String. Uses the vocabulary in the provided context -- LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); From 9409bbdaeb3a686a8f541795ccf484edb1af5614 Mon Sep 17 00:00:00 2001 From: Ashton Ohms Date: Thu, 21 Dec 2023 15:25:02 -0800 Subject: [PATCH 5/9] Cleanup, update doc comments --- src/LLaMACPP.chs | 118 ++++++++++------------------------------------- 1 file changed, 25 insertions(+), 93 deletions(-) diff --git a/src/LLaMACPP.chs b/src/LLaMACPP.chs index 7f826ec..0363f99 100644 --- a/src/LLaMACPP.chs +++ b/src/LLaMACPP.chs @@ -167,26 +167,31 @@ instance Storable ModelParams where -- --struct llama_context_params { --- uint32_t seed; // RNG seed, -1 for random --- int32_t n_ctx; // text context --- int32_t n_batch; // prompt processing batch size --- int32_t n_gpu_layers; // number of layers to store in VRAM --- int32_t main_gpu; // the GPU that is used for scratch and small tensors --- float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs --- // called with a progress value between 0 and 1, pass NULL to disable --- llama_progress_callback progress_callback; --- // context pointer passed to the progress callback --- void * progress_callback_user_data; --- --- // Keep the booleans together to avoid misalignment during copy-by-value. --- bool low_vram; // if true, reduce VRAM usage at the cost of performance --- bool f16_kv; // use fp16 for KV cache --- bool logits_all; // the llama_eval() call computes all logits, not just the last one --- bool vocab_only; // only load the vocabulary, no weights --- bool use_mmap; // use mmap if possible --- bool use_mlock; // force system to keep model in RAM --- bool embedding; // embedding mode only --- }; +-- uint32_t seed; // RNG seed, -1 for random +-- uint32_t n_ctx; // text context, 0 = from model +-- uint32_t n_batch; // prompt processing maximum batch size +-- uint32_t n_threads; // number of threads to use for generation +-- uint32_t n_threads_batch; // number of threads to use for batch processing +-- int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` +-- +-- // ref: https://github.com/ggerganov/llama.cpp/pull/2054 +-- float rope_freq_base; // RoPE base frequency, 0 = from model +-- float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model +-- float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model +-- float yarn_attn_factor; // YaRN magnitude scaling factor +-- float yarn_beta_fast; // YaRN low correction dim +-- float yarn_beta_slow; // YaRN high correction dim +-- uint32_t yarn_orig_ctx; // YaRN original context size +-- +-- enum ggml_type type_k; // data type for K cache +-- enum ggml_type type_v; // data type for V cache +-- +-- // Keep the booleans together to avoid misalignment during copy-by-value. 
+-- bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) +-- bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) +-- bool embedding; // embedding mode only +-- bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU +-- }; -- data ContextParams = ContextParams { _seed :: Word32 @@ -553,18 +558,6 @@ eval = {# call eval #} evalEmbd :: Context -> Ptr CFloat -> CInt -> CInt -> IO CInt evalEmbd = {# call eval_embd #} - --- --- // Export a static computation graph for context of 511 and batch size of 1 --- // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these --- // parameters here to keep things simple --- // IMPORTANT: do not use for anything else other than debugging and testing! --- LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname); --- --- evalExport :: Context -> Ptr CChar -> IO CInt --- evalExport = {# call eval_export #} - - -- -- // Convert the provided text into tokens. -- // The tokens pointer must be large enough to hold the resulting tokens. @@ -588,16 +581,6 @@ tokenize :: Model -> IO CInt tokenize = {# call tokenize #} --- LLAMA_API int llama_tokenize_with_model( --- const struct llama_model * model, --- const char * text, --- llama_token * tokens, --- int n_max_tokens, --- bool add_bos); --- tokenizeWithModel :: Model -> Ptr CChar -> Ptr Token -> CInt -> CUChar -> IO CInt --- tokenizeWithModel = {# call tokenize_with_model #} - - -- -- LLAMA_API int llama_n_vocab(const struct llama_context * ctx); -- @@ -616,40 +599,6 @@ nCtx = {# call n_ctx #} nEmbd :: Model -> IO CInt nEmbd = {# call n_embd #} - --- --- LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model); --- --- nVocabFromModel :: Model -> IO CInt --- nVocabFromModel = {# call n_vocab_from_model #} - --- --- LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model); --- --- nCtxFromModel :: Model -> IO CInt --- nCtxFromModel = {# call n_ctx_from_model #} - - --- --- LLAMA_API int llama_n_embd_from_model (const struct llama_model * model); --- --- nEmbdFromModel :: Model -> IO CInt --- nEmbdFromModel = {# call n_embd_from_model #} - - --- --- // Get the vocabulary as output parameters. --- // Returns number of results. 
--- LLAMA_API int llama_get_vocab( --- const struct llama_context * ctx, --- const char * * strings, --- float * scores, --- int capacity); --- --- getVocab :: Context -> Ptr (Ptr CChar) -> Ptr CFloat -> CInt -> IO CInt --- getVocab = {# call get_vocab #} - - -- -- // Token logits obtained from the last call to llama_eval() -- // The logits for the last token are stored in the last row @@ -661,7 +610,6 @@ nEmbd = {# call n_embd #} getLogits :: Context -> IO (Ptr CFloat) getLogits = {# call llama_get_logits #} - -- -- // Get the embeddings for the input -- // shape: [n_embd] (1-dimensional) @@ -693,13 +641,6 @@ tokenToPiece' :: Model -> Token -> Ptr CChar -> CInt -> IO CInt tokenToPiece' = {# call token_to_piece #} --- LLAMA_API const char * llama_token_to_str_with_model( --- const struct llama_model * model, --- llama_token token); --- tokenToStrWithModel :: Model -> Token -> IO (Ptr CChar) --- tokenToStrWithModel = {# call token_to_str_with_model #} - - -- // Special tokens -- @@ -733,15 +674,6 @@ sampleRepetitionPenalties :: Context -> Ptr TokenDataArray -> Ptr Token -> CULon sampleRepetitionPenalties = {# call sample_repetition_penalties #} --- --- /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. --- LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); --- --- sampleFrequencyAndPresencePenalties --- :: Context -> Ptr TokenDataArray -> Ptr Token -> CULong -> CFloat -> CFloat -> IO () --- sampleFrequencyAndPresencePenalties = {# call sample_frequency_and_presence_penalties #} - - -- /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 -- /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. -- /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. 
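A note on the wrapper layer these patches keep extending: GHC's C FFI cannot pass or return structs by value, which is why every llama.cpp entry point that takes or returns a llama_*_params struct goes through an out-parameter shim in cbits/wrapper.c. A minimal sketch of the Haskell side of that pattern, written as a plain foreign import rather than c2hs for illustration (ContextParams and its Storable instance are the ones defined above; c2hs generates an equivalent import for {# call wrapper_context_default_params #}):

    import Foreign.Marshal.Alloc (alloca)
    import Foreign.Ptr (Ptr)
    import Foreign.Storable (peek)

    foreign import ccall "wrapper_context_default_params"
      c_contextDefaultParams :: Ptr ContextParams -> IO ()

    -- The C shim writes the by-value result of
    -- llama_context_default_params() through the pointer; the
    -- Storable instance then marshals it into a Haskell value.
    defaultContextParams :: IO ContextParams
    defaultContextParams = alloca $ \p -> do
      c_contextDefaultParams p
      peek p

The same shape covers wrapper_model_default_params and, going the other direction, wrapper_load_model_from_file and wrapper_new_context_with_model, which dereference the pointer and hand the struct by value to llama.cpp.
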
From 1883545933a29d1e58e03860698f379af28791c6 Mon Sep 17 00:00:00 2001 From: Ashton Ohms Date: Thu, 21 Dec 2023 15:42:05 -0800 Subject: [PATCH 6/9] Refactor tokenToPiece as tokenToString in Main --- examples/Main.hs | 35 ++++++++++++++++++++++++++--------- src/LLaMACPP.chs | 29 ++++++----------------------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/examples/Main.hs b/examples/Main.hs index 02b09d6..b1d098b 100644 --- a/examples/Main.hs +++ b/examples/Main.hs @@ -1,15 +1,15 @@ module Main where import Prelude hiding (takeWhile) -import System.Exit -import System.IO import Control.Applicative ((<**>)) import Control.Exception (bracket) import Control.Monad.IO.Class (MonadIO, liftIO) import Control.Monad.Reader (MonadReader, ReaderT, runReaderT, asks) +import Data.Either (fromRight) import Data.Functor ((<&>), void) import qualified Data.Vector.Storable as V -import Foreign.C.String (peekCString, withCString, withCStringLen) +import Foreign (allocaBytes) +import Foreign.C.String (peekCString, peekCStringLen, withCString, withCStringLen) import Foreign.C.Types (CFloat, CInt) import Foreign.ForeignPtr (newForeignPtr_) import Foreign.Marshal.Alloc (alloca, free, malloc) @@ -40,6 +40,7 @@ import Options.Applicative ) import Pipes (Producer, (>->), for, lift, runEffect, yield) import Pipes.Prelude (takeWhile) +import System.IO (hFlush, stdout) data Params = Params { _nCtx :: Int @@ -119,6 +120,24 @@ runContextM :: Context -> ContextM () -> IO () runContextM ctx (ContextM ctxM) = runReaderT ctxM ctx +tokenToString :: L.Model -> L.Token -> IO String +tokenToString m t = + let + -- Returns (Right tokenString) if successful, otherwise + -- (Left len), where len is correct token size to allocate + tokenBufToStr size = allocaBytes size $ \buf -> do + nTokens <- fromIntegral <$> L.tokenToPiece m t buf (fromIntegral size) + if nTokens >= 0 + then Right <$> peekCStringLen (buf, nTokens) + -- tokenToPiece will return the token size, negated, + -- iff the size we allocate is too small. + else pure $ Left (negate nTokens) + in do + r <- tokenBufToStr 8 + either (\nTokens -> do + r' <- tokenBufToStr nTokens + pure $ fromRight (error "token allocation failed") r' + ) (pure . id) r -- -- mimicking call here, somewhat: https://huggingface.co/TheBloke/open-llama-7B-v2-open-instruct-GGML#how-to-run-in-llamacpp @@ -156,7 +175,7 @@ main = do liftIO . allocaArray 1 $ \(tmp :: Ptr L.Token) -> do bos <- L.tokenBos model pokeArray tmp [bos] - _evalRes <- L.eval ctx tmp 1 0 --(_nThreads $ params') + _evalRes <- L.eval ctx tmp 1 0 L.resetTimings ctx let @@ -174,7 +193,7 @@ main = do putStrLn $ "\nTokenized " <> show tokenizedCount' putStrLn "\nRunning first eval of entire prompt" - _evalRes <- L.eval ctx tokensPtr tokenizedCount' 0 --(_nThreads params') + _evalRes <- L.eval ctx tokensPtr tokenizedCount' 0 (, tokenizedCount') <$> peekArray (fromIntegral tokenizedCount') tokensPtr @@ -198,7 +217,7 @@ main = do runEffect $ for (sample >-> takeWhile (/= eos)) $ \id' -> lift . liftIO $ do - putStr =<< L.tokenToPiece model id' + putStr =<< tokenToString model id' hFlush stdout @@ -217,7 +236,7 @@ main = do id' <- liftIO $ sample' model params' ctx nVocab lastNTokensTV void . liftIO . withArray [id'] $ \newTokenArrPtr -> - L.eval ctx newTokenArrPtr 1 (fromIntegral nPast') -- (_nThreads $ params') + L.eval ctx newTokenArrPtr 1 (fromIntegral nPast') liftIO . 
atomically $ do writeTVar nPastTV $ @@ -226,8 +245,6 @@ main = do else nPast' + 1 writeTVar lastNTokensTV $ drop 1 lastNTokens' <> [id'] - -- liftIO $ putStr =<< peekCString =<< L.tokenToStr ctx id' - yield id' *> sample diff --git a/src/LLaMACPP.chs b/src/LLaMACPP.chs index 0363f99..ee6834b 100644 --- a/src/LLaMACPP.chs +++ b/src/LLaMACPP.chs @@ -1,11 +1,8 @@ module LLaMACPP where -import Data.Either (fromRight) import Control.Exception (assert) -import Data.Word (Word32) -import Foreign -import Foreign.C.String (peekCStringLen) +import Data.Word (Word8, Word32) import Foreign.C.Types (CChar, CDouble, CFloat, CInt, CLong, CSize, CUChar, CULong) import Foreign.Marshal.Utils (fromBool, toBool) import Foreign.Ptr (FunPtr, Ptr, castPtr) @@ -619,46 +616,32 @@ getEmbeddings :: Context -> IO (Ptr CFloat) getEmbeddings = {# call llama_get_embeddings #} -tokenToPiece :: Model -> Token -> IO String -tokenToPiece m t = - let alloc' size = allocaBytes size $ \buf -> do - nTokens <- fromIntegral <$> tokenToPiece' m t buf (fromIntegral size) - if nTokens >= 0 - then Right <$> peekCStringLen (buf, nTokens) - else pure $ Left (negate nTokens) - in do - r <- alloc' 8 - either (\nTokens -> do - r' <- alloc' nTokens - pure $ fromRight "??" r' - ) (pure . id) r - -- -- // Token Id -> String. Uses the vocabulary in the provided context -- LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); -- -tokenToPiece' :: Model -> Token -> Ptr CChar -> CInt -> IO CInt -tokenToPiece' = {# call token_to_piece #} +tokenToPiece :: Model -> Token -> Ptr CChar -> CInt -> IO CInt +tokenToPiece = {# call token_to_piece #} -- // Special tokens -- --- LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence +-- LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence -- tokenBos :: Model -> IO Token tokenBos = {# call token_bos #} -- --- LLAMA_API llama_token llama_token_eos(); // end-of-sentence +-- LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence -- tokenEos :: Model -> IO Token tokenEos = {# call token_eos #} -- --- LLAMA_API llama_token llama_token_nl(); // next-line +-- LLAMA_API llama_token llama_token_nl(const struct llama_model * model); // next-line -- tokenNl :: Model -> IO Token tokenNl = {# call token_nl #} From ce4f92f7f64fd57094dedb1e6e770045be1372ce Mon Sep 17 00:00:00 2001 From: Ashton Ohms Date: Thu, 21 Dec 2023 15:44:11 -0800 Subject: [PATCH 7/9] Re-add test suite to cabal --- llama-cpp-bindings.cabal | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llama-cpp-bindings.cabal b/llama-cpp-bindings.cabal index fd2ea27..77f50c9 100644 --- a/llama-cpp-bindings.cabal +++ b/llama-cpp-bindings.cabal @@ -64,10 +64,10 @@ executable examples main-is: Main.hs default-language: GHC2021 --- test-suite llama-cpp-bindings-test --- import: shared --- build-depends: llama-cpp-bindings --- hs-source-dirs: test --- main-is: Main.hs --- default-language: GHC2021 --- type: exitcode-stdio-1.0 +test-suite llama-cpp-bindings-test + import: shared + build-depends: llama-cpp-bindings + hs-source-dirs: test + main-is: Main.hs + default-language: GHC2021 + type: exitcode-stdio-1.0 From ea9d8512fc593f4e13a2116c43d561c3e3da86cb Mon Sep 17 00:00:00 2001 From: Ashton Ohms Date: Thu, 21 Dec 2023 15:46:04 -0800 Subject: [PATCH 8/9] Remove sampleFrequencyAndPresencePenalties (no longer exists) --- examples/Main.hs | 8 -------- 1 file changed, 8 deletions(-) diff --git 
a/examples/Main.hs b/examples/Main.hs index b1d098b..3a7a6a9 100644 --- a/examples/Main.hs +++ b/examples/Main.hs @@ -294,14 +294,6 @@ main = do L.sampleRepetitionPenalties ctx candidatesPPtr lastNTokensPtr lastNTokensLen (_repeatPenalty params') (1.0) (1.0) - --L.sampleFrequencyAndPresencePenalties - -- ctx - -- candidatesPPtr - -- lastNTokensPtr - -- lastNTokensLen - -- (_alphaFrequency params') - -- (_alphaPresence params') - -- todo -- if (!penalize_nl) { -- logits[llama_token_nl()] = nl_logit; From cdb3e391c77e2318c823f6d7a7aa810bca616575 Mon Sep 17 00:00:00 2001 From: Ashton Ohms Date: Thu, 21 Dec 2023 16:21:09 -0800 Subject: [PATCH 9/9] Fixing params --- examples/Main.hs | 21 ++++++++++++--------- src/LLaMACPP.chs | 4 ++-- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/examples/Main.hs b/examples/Main.hs index 3a7a6a9..aa7e4c5 100644 --- a/examples/Main.hs +++ b/examples/Main.hs @@ -44,7 +44,7 @@ import System.IO (hFlush, stdout) data Params = Params { _nCtx :: Int - , _nThreads :: CInt + , _nThreads :: Int , _nPredict :: Int , _nGpuLayers :: Int , _enableNumaOpts :: Bool @@ -329,22 +329,25 @@ main = do -- todo -- putStrLn "\ndefault model quantize params" - cpp <- malloc + putStrLn "\ninit model & context params" mpp <- malloc - putStrLn "\ninit context params" - L.contextDefaultParams cpp L.modelDefaultParams mpp + modelParams' <- peek mpp + + cpp <- malloc + L.contextDefaultParams cpp ctxParams' <- peek cpp let + modelParams = modelParams' + { L._nGpuLayers = _nGpuLayers params' + } ctxParams = ctxParams' - { L._seed = 1234 - --, L._nCtx = 2048 - --, L._nBatch = 2048 - --, L._nThreads = 2 - --, L._nThreadsBatch = 2 + { L._nCtx = _nCtx params' + , L._nThreads = _nThreads params' } + poke mpp modelParams poke cpp ctxParams putStrLn "\nloading model" diff --git a/src/LLaMACPP.chs b/src/LLaMACPP.chs index ee6834b..d440750 100644 --- a/src/LLaMACPP.chs +++ b/src/LLaMACPP.chs @@ -131,7 +131,7 @@ type ProgressCallback = CFloat -> Ptr () -> IO () -- bool use_mlock; // force system to keep model in RAM -- }; data ModelParams = ModelParams - { _nGpuLayers :: Word32 + { _nGpuLayers :: Int , _mainGpu :: Word32 , _tensorSplit :: Ptr CFloat , _progressCallback :: FunPtr ProgressCallback @@ -194,7 +194,7 @@ data ContextParams = ContextParams { _seed :: Word32 , _nCtx :: Int , _nBatch :: Int - , _nThreads :: Word32 + , _nThreads :: Int , _nThreadsBatch :: Word32 , _ropeScalingType :: Word8 , _ropeFreqBase :: CFloat
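With patch 9 applied, model setup and context setup are fully separated. A hedged, minimal end-to-end sketch of the resulting flow (names as defined in src/LLaMACPP.chs and mirroring examples/Main.hs; the path argument and the parameter values are placeholders, and error handling plus freeing of the model and context are omitted):

    import Foreign.C.String (withCString)
    import Foreign.Marshal.Alloc (free, malloc)
    import Foreign.Storable (peek, poke)
    import qualified LLaMACPP as L

    initModelAndContext :: FilePath -> IO (L.Model, L.Context)
    initModelAndContext path = do
      -- model params: defaults from the C wrapper, then adjusted
      mpp <- malloc
      L.modelDefaultParams mpp
      mp <- peek mpp
      poke mpp mp { L._nGpuLayers = 0 }                -- placeholder value

      -- context params: same out-parameter pattern
      cpp <- malloc
      L.contextDefaultParams cpp
      cp <- peek cpp
      poke cpp cp { L._nCtx = 2048, L._nThreads = 4 }  -- placeholder values

      model <- withCString path $ \cpath -> L.loadModelFromFile cpath mpp
      ctx   <- L.newContextWithModel model cpp

      free mpp
      free cpp
      pure (model, ctx)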