Updating to latest llama.cpp #2

Open · wants to merge 9 commits into main

13 changes: 12 additions & 1 deletion cbits/wrapper.c
@@ -3,14 +3,25 @@
#include "wrapper.h"

struct llama_model * wrapper_load_model_from_file
( const char * path_model, struct llama_context_params * params) {
( const char * path_model, struct llama_model_params * params) {
return llama_load_model_from_file(path_model, *params);
}

struct llama_context * wrapper_new_context_with_model(
struct llama_model * model,
struct llama_context_params * params
) {
return llama_new_context_with_model(model, *params);
}

void wrapper_context_default_params(struct llama_context_params * default_params) {
*default_params = llama_context_default_params();
}

void wrapper_model_default_params(struct llama_model_params * default_params) {
*default_params = llama_model_default_params();
}

void wrapper_model_quantize_default_params(struct llama_model_quantize_params * default_params) {
*default_params = llama_model_quantize_default_params();
}
10 changes: 9 additions & 1 deletion cbits/wrapper.h
@@ -3,11 +3,19 @@

struct llama_model * wrapper_load_model_from_file(
const char * path_model,
struct llama_context_params * params
struct llama_model_params * params
);


struct llama_context * wrapper_new_context_with_model(
struct llama_model * model,
struct llama_context_params * params
);

void wrapper_context_default_params(struct llama_context_params *);

void wrapper_model_default_params(struct llama_model_params *);

void wrapper_model_quantize_default_params(struct llama_model_quantize_params *);

void wrapper_get_timings(struct llama_context * ctx, struct llama_timings * timings);
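
These shims exist because GHC's FFI cannot pass or return C structs by value, so each llama.cpp entry point that takes llama_model_params or llama_context_params by value gets a by-pointer wrapper. A minimal sketch of how the Haskell side might import them (the module name, the opaque handle types, and the constructor-less param types below are illustrative assumptions, not part of this PR):

{-# LANGUAGE ForeignFunctionInterface #-}
-- Hypothetical binding module; names are assumptions for illustration.
module LlamaWrapperSketch where

import Foreign.C.String (CString)
import Foreign.Ptr (Ptr)

-- Opaque handles returned by llama.cpp.
data LlamaModel
data LlamaContext

-- Param structs; the real binding would give these Storable instances
-- whose layout matches llama.h.
data LlamaModelParams
data LlamaContextParams

-- Out-parameter shims: the C wrapper fills in the struct behind the pointer.
foreign import ccall "wrapper_model_default_params"
  c_modelDefaultParams :: Ptr LlamaModelParams -> IO ()

foreign import ccall "wrapper_context_default_params"
  c_contextDefaultParams :: Ptr LlamaContextParams -> IO ()

-- Params are taken by pointer and dereferenced inside the wrapper.
foreign import ccall "wrapper_load_model_from_file"
  c_loadModelFromFile :: CString -> Ptr LlamaModelParams -> IO (Ptr LlamaModel)

foreign import ccall "wrapper_new_context_with_model"
  c_newContextWithModel :: Ptr LlamaModel -> Ptr LlamaContextParams -> IO (Ptr LlamaContext)
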
86 changes: 54 additions & 32 deletions examples/Main.hs
@@ -1,14 +1,15 @@
module Main where

import Prelude hiding (takeWhile)

import Control.Applicative ((<**>))
import Control.Exception (bracket)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Control.Monad.Reader (MonadReader, ReaderT, runReaderT, asks)
import Data.Either (fromRight)
import Data.Functor ((<&>), void)
import qualified Data.Vector.Storable as V
import Foreign.C.String (peekCString, withCString)
import Foreign (allocaBytes)
import Foreign.C.String (peekCString, peekCStringLen, withCString, withCStringLen)
import Foreign.C.Types (CFloat, CInt)
import Foreign.ForeignPtr (newForeignPtr_)
import Foreign.Marshal.Alloc (alloca, free, malloc)
@@ -39,10 +40,11 @@ import Options.Applicative
)
import Pipes (Producer, (>->), for, lift, runEffect, yield)
import Pipes.Prelude (takeWhile)
import System.IO (hFlush, stdout)

data Params = Params
{ _nCtx :: Int
, _nThreads :: CInt
, _nThreads :: Int
, _nPredict :: Int
, _nGpuLayers :: Int
, _enableNumaOpts :: Bool
@@ -118,6 +120,24 @@ runContextM :: Context -> ContextM () -> IO ()
runContextM ctx (ContextM ctxM) = runReaderT ctxM ctx


tokenToString :: L.Model -> L.Token -> IO String
tokenToString m t =
let
-- Returns (Right tokenString) if successful, otherwise
-- (Left len), where len is the correct buffer size to allocate
tokenBufToStr size = allocaBytes size $ \buf -> do
nTokens <- fromIntegral <$> L.tokenToPiece m t buf (fromIntegral size)
if nTokens >= 0
then Right <$> peekCStringLen (buf, nTokens)
-- tokenToPiece will return the token size, negated,
-- iff the size we allocate is too small.
else pure $ Left (negate nTokens)
in do
r <- tokenBufToStr 8
either (\nTokens -> do
r' <- tokenBufToStr nTokens
pure $ fromRight (error "token allocation failed") r'
) (pure . id) r
--

-- mimicking call here, somewhat: https://huggingface.co/TheBloke/open-llama-7B-v2-open-instruct-GGML#how-to-run-in-llamacpp
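
The tokenToString helper above makes at most two passes: tokenToPiece reports the required size (negated) whenever the initial 8-byte buffer is too small. A hypothetical use of it, not part of this PR, would be reconstructing a full string from a token list:

-- Hypothetical helper: detokenize a list of tokens by concatenating
-- the per-token pieces. Assumes the same imports as this module.
detokenize :: L.Model -> [L.Token] -> IO String
detokenize m ts = concat <$> mapM (tokenToString m) ts
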
@@ -147,32 +167,33 @@ main = do

params' <- asks _params
ctx <- asks _llamaCtx
model <- asks _model

-- tokenizing & eval

-- "do one empty run to warm up the model"
liftIO . allocaArray 1 $ \(tmp :: Ptr L.Token) -> do
bos <- L.tokenBos
bos <- L.tokenBos model
pokeArray tmp [bos]
_evalRes <- L.eval ctx tmp 1 0 (_nThreads $ params')
_evalRes <- L.eval ctx tmp 1 0
L.resetTimings ctx

let
-- todo why is this constant here
maxTokens = 1024
tokenize s tks addBos =
L.tokenize ctx s tks (fromIntegral maxTokens) (fromBool addBos)
tokenize s l tks addBos =
L.tokenize model s (fromIntegral l) tks (fromIntegral maxTokens) (fromBool addBos) (fromBool False)

(tokenized, tokenizedCount) <- liftIO . allocaArray maxTokens $ \tokensPtr -> do
tokenizedCount' <- withCString (_prompt params') $ \ts -> tokenize ts tokensPtr True
tokenizedCount' <- withCStringLen (_prompt params') $ \(ts, l) -> tokenize ts l tokensPtr True

putStrLn "\nPrompt"
putStrLn $ _prompt params' <> "\n\n"

putStrLn $ "\nTokenized " <> show tokenizedCount'

putStrLn "\nRunning first eval of entire prompt"
_evalRes <- L.eval ctx tokensPtr tokenizedCount' 0 (_nThreads params')
_evalRes <- L.eval ctx tokensPtr tokenizedCount' 0

(, tokenizedCount') <$>
peekArray (fromIntegral tokenizedCount') tokensPtr
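
The updated tokenize call takes the prompt's byte length explicitly plus a trailing flag for parsing special tokens. Factored out of the allocaArray plumbing above, the same call might look like the sketch below (tokenizePrompt is a hypothetical helper; the argument order mirrors the diff):

-- Hypothetical helper, not part of this PR: tokenize a prompt into a
-- temporary buffer of at most maxTokens tokens and return them as a list.
tokenizePrompt :: L.Model -> String -> Int -> IO [L.Token]
tokenizePrompt model prompt maxTokens =
  allocaArray maxTokens $ \tokensPtr ->
    withCStringLen prompt $ \(ts, l) -> do
      n <- L.tokenize model ts (fromIntegral l) tokensPtr
             (fromIntegral maxTokens) (fromBool True) (fromBool False)
      peekArray (fromIntegral n) tokensPtr
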
@@ -190,29 +211,32 @@

liftIO . putStrLn $ "\nsampling"

eos <- liftIO L.tokenEos
eos <- liftIO $ L.tokenEos model

-- I feel like this is not the right way to do this?
runEffect $
for (sample >-> takeWhile (/= eos)) $ \id' ->
lift . liftIO $ putStr =<< peekCString =<< L.tokenToStr ctx id'
lift . liftIO $ do
putStr =<< tokenToString model id'
hFlush stdout


sample :: Producer L.Token ContextM ()
sample = do
params' <- asks _params
ctx <- asks _llamaCtx
model <- asks _model
nPastTV <- asks _nPast
nPast' <- liftIO . readTVarIO $ nPastTV
lastNTokensTV <- asks _lastNTokens
lastNTokens' <- liftIO . readTVarIO $ lastNTokensTV

nVocab <- fromIntegral <$> (liftIO . L.nVocab $ ctx)
nVocab <- fromIntegral <$> (liftIO . L.nVocab $ model)

id' <- liftIO $ sample' params' ctx nVocab lastNTokensTV
id' <- liftIO $ sample' model params' ctx nVocab lastNTokensTV

void . liftIO . withArray [id'] $ \newTokenArrPtr ->
L.eval ctx newTokenArrPtr 1 (fromIntegral nPast') (_nThreads $ params')
L.eval ctx newTokenArrPtr 1 (fromIntegral nPast')

liftIO . atomically $ do
writeTVar nPastTV $
Expand All @@ -221,13 +245,11 @@ main = do
else nPast' + 1
writeTVar lastNTokensTV $ drop 1 lastNTokens' <> [id']

-- liftIO $ putStr =<< peekCString =<< L.tokenToStr ctx id'

yield id' *> sample


sample' :: Params -> L.Context -> Int -> TVar [L.Token] -> IO L.Token
sample' params' ctx nVocab lastNTokensTV = do
sample' :: L.Model -> Params -> L.Context -> Int -> TVar [L.Token] -> IO L.Token
sample' model params' ctx nVocab lastNTokensTV = do
alloca $ \candidatesPPtr ->
allocaArray nVocab $ \logitsCopyPtr -> do

@@ -262,23 +284,15 @@ main = do
lastNTokensLen = fromIntegral . length $ lastNTokens'

-- float nl_logit = logits[llama_token_nl()];
_nlLogit <- V.unsafeIndex logitsCopy . fromIntegral <$> L.tokenNl
_nlLogit <- V.unsafeIndex logitsCopy . fromIntegral <$> (L.tokenNl model)

--
-- lastNTokensPtr should be a pointer just to the last
-- set of tokens at the end of lastNTokens matching the
-- count of lastNRepeat, right now it's the entire thing
--
L.sampleRepetitionPenalty
ctx candidatesPPtr lastNTokensPtr lastNTokensLen (_repeatPenalty params')

L.sampleFrequencyAndPresencePenalties
ctx
candidatesPPtr
lastNTokensPtr
lastNTokensLen
(_alphaFrequency params')
(_alphaPresence params')
L.sampleRepetitionPenalties
ctx candidatesPPtr lastNTokensPtr lastNTokensLen (_repeatPenalty params') (1.0) (1.0)

-- todo
-- if (!penalize_nl) {
@@ -315,22 +329,30 @@ main = do
-- todo
-- putStrLn "\ndefault model quantize params"

putStrLn "\ninit model & context params"
mpp <- malloc
L.modelDefaultParams mpp
modelParams' <- peek mpp

cpp <- malloc
putStrLn "\ninit context params"
L.contextDefaultParams cpp
ctxParams' <- peek cpp

let
modelParams = modelParams'
{ L._nGpuLayers = _nGpuLayers params'
}
ctxParams = ctxParams'
{ L._nCtx = _nCtx params'
, L._nGpuLayers = _nGpuLayers params'
, L._nThreads = _nThreads params'
}

poke mpp modelParams
poke cpp ctxParams

putStrLn "\nloading model"

model' <- withCString (_modelPath params') $ flip L.loadModelFromFile cpp
model' <- withCString (_modelPath params') $ flip L.loadModelFromFile mpp

putStrLn "\nloading context"

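
This hunk tracks upstream llama.cpp splitting llama_context_params into separate model and context parameter structs: the GPU layer count moves to the model params, while context size and thread count stay on the context params. Condensed into one function (the name initFromPath and the L.newContextWithModel call are assumptions based on the C wrapper above; everything else appears in this diff):

-- A sketch of the post-update init order; error handling omitted.
initFromPath :: FilePath -> Params -> IO (L.Model, L.Context)
initFromPath path params' = do
  -- Model params: _nGpuLayers now lives here.
  mpp <- malloc
  L.modelDefaultParams mpp
  mp  <- peek mpp
  poke mpp mp { L._nGpuLayers = _nGpuLayers params' }

  -- Context params: context size and thread count.
  cpp <- malloc
  L.contextDefaultParams cpp
  cp  <- peek cpp
  poke cpp cp { L._nCtx = _nCtx params', L._nThreads = _nThreads params' }

  model <- withCString path $ flip L.loadModelFromFile mpp
  -- Assumed name for the binding over wrapper_new_context_with_model.
  ctx   <- L.newContextWithModel model cpp

  free mpp
  free cpp
  pure (model, ctx)
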