From 4199dae4156ae3c37b5ac50ad6ed1568577dcec7 Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Sat, 12 Oct 2024 19:34:06 -0700 Subject: [PATCH] Make chat+server hybrid the new default mode --- llama.cpp/main/main.1 | 85 ++++----- llama.cpp/main/main.1.asc | 331 +++++++++++++++++------------------- llama.cpp/main/main.cpp | 11 +- llama.cpp/server/server.cpp | 27 ++- llama.cpp/server/server.h | 7 + llama.cpp/server/utils.h | 3 + llamafile/chatbot.cpp | 81 ++++++--- 7 files changed, 303 insertions(+), 242 deletions(-) diff --git a/llama.cpp/main/main.1 b/llama.cpp/main/main.1 index d2f51bfcd3..b396f3318e 100644 --- a/llama.cpp/main/main.1 +++ b/llama.cpp/main/main.1 @@ -1,4 +1,4 @@ -.Dd January 1, 2024 +.Dd October 12, 2024 .Dt LLAMAFILE 1 .Os Mozilla Ocho .Sh NAME @@ -6,6 +6,10 @@ .Nd large language model runner .Sh SYNOPSIS .Nm +.Op Fl Fl chat +.Op flags... +.Fl m Ar model.gguf +.Nm .Op Fl Fl server .Op flags... .Fl m Ar model.gguf @@ -36,13 +40,20 @@ Chatbot that passes the Turing test .It Text/image summarization and analysis .El -.Sh OPTIONS -The following options are available: +.Sh MODES +.Pp +There's three modes of operation: +.Fl Fl chat , +.Fl Fl server , +and +.Fl Fl cli . +If none of these flags is specified, then llamafile makes its best guess +about which mode is best. By default, the +.Fl Fl chat +interface is launched in the foreground with a +.Fl Fl server +in the background. .Bl -tag -width indent -.It Fl Fl version -Print version and exit. -.It Fl h , Fl Fl help -Show help message and exit. .It Fl Fl cli Puts program in command line interface mode. This flag is implied when a prompt is supplied using either the @@ -50,9 +61,19 @@ prompt is supplied using either the or .Fl f flags. +.It Fl Fl chat +Puts program in command line chatbot only mode. This mode launches an +interactive shell that lets you talk to your LLM, which should be +specified using the +.Fl m +flag. This mode also launches a server in the background. The system +prompt that's displayed at the start of your conversation may be changed +by passing the +.Fl p +flag. .It Fl Fl server -Puts program in server mode. This will launch an HTTP server on a local -port. This server has both a web UI and an OpenAI API compatible +Puts program in server only mode. This will launch an HTTP server on a +local port. This server has both a web UI and an OpenAI API compatible completions endpoint. When the server is run on a desk system, a tab browser tab will be launched automatically that displays the web UI. This @@ -62,6 +83,15 @@ flag is implied if no prompt is specified, i.e. neither the or .Fl f flags are passed. +.El +.Sh OPTIONS +.Pp +The following options are available: +.Bl -tag -width indent +.It Fl Fl version +Print version and exit. +.It Fl h , Fl Fl help +Show help message and exit. .It Fl m Ar FNAME , Fl Fl model Ar FNAME Model path in the GGUF file format. .Pp @@ -83,25 +113,6 @@ Default: -1 Number of threads to use during generation. .Pp Default: $(nproc)/2 -.It Fl tb Ar N , Fl Fl threads-batch Ar N -Set the number of threads to use during batch and prompt processing. In -some systems, it is beneficial to use a higher number of threads during -batch processing than during generation. If not specified, the number of -threads used for batch processing will be the same as the number of -threads used for generation. -.Pp -Default: Same as -.Fl Fl threads -.It Fl td Ar N , Fl Fl threads-draft Ar N -Number of threads to use during generation. 
-.Pp -Default: Same as -.Fl Fl threads -.It Fl tbd Ar N , Fl Fl threads-batch-draft Ar N -Number of threads to use during batch and prompt processing. -.Pp -Default: Same as -.Fl Fl threads-draft .It Fl Fl in-prefix-bos Prefix BOS to user inputs, preceding the .Fl Fl in-prefix @@ -143,21 +154,15 @@ Number of tokens to predict. .Pp Default: -1 .It Fl c Ar N , Fl Fl ctx-size Ar N -Set the size of the prompt context. A larger context size helps the -model to better comprehend and generate responses for longer input or -conversations. The LLaMA models were built with a context of 2048, which -yields the best results on longer input / inference. -.Pp -.Bl -dash -compact -.It -0 = loaded automatically from model -.El -.Pp -Default: 512 +Sets the maximum context size, in tokens. In +.Fl Fl chat +mode, this value sets a hard limit on how long your conversation can be. +The default is 8192 tokens. If this value is zero, then it'll be set to +the maximum context size the model allows. .It Fl b Ar N , Fl Fl batch-size Ar N Batch size for prompt processing. .Pp -Default: 512 +Default: 2048 .It Fl Fl top-k Ar N Top-k sampling. .Pp diff --git a/llama.cpp/main/main.1.asc b/llama.cpp/main/main.1.asc index 2fa76cbf83..6acae6febf 100644 --- a/llama.cpp/main/main.1.asc +++ b/llama.cpp/main/main.1.asc @@ -4,6 +4,7 @@ llamafile — large language model runner SYNOPSIS + llamafile [--chat] [flags...] -m model.gguf llamafile [--server] [flags...] -m model.gguf [--mmproj vision.gguf] llamafile [--cli] [flags...] -m model.gguf -p prompt llamafile [--cli] [flags...] -m model.gguf --mmproj vision.gguf --image @@ -17,6 +18,32 @@ - Chatbot that passes the Turing test - Text/image summarization and analysis +MODES + There's three modes of operation: --chat, --server, and --cli. If none + of these flags is specified, then llamafile makes its best guess about + which mode is best. By default, the --chat interface is launched in the + foreground with a --server in the background. + + --cli Puts program in command line interface mode. This flag is im‐ + plied when a prompt is supplied using either the -p or -f + flags. + + --chat Puts program in command line chatbot only mode. This mode + launches an interactive shell that lets you talk to your LLM, + which should be specified using the -m flag. This mode also + launches a server in the background. The system prompt that's + displayed at the start of your conversation may be changed by + passing the -p flag. + + --server + Puts program in server only mode. This will launch an HTTP + server on a local port. This server has both a web UI and an + OpenAI API compatible completions endpoint. When the server is + run on a desk system, a tab browser tab will be launched auto‐ + matically that displays the web UI. This --server flag is im‐ + plied if no prompt is specified, i.e. neither the -p or -f + flags are passed. + OPTIONS The following options are available: @@ -26,31 +53,18 @@ -h, --help Show help message and exit. - --cli Puts program in command line interface mode. This flag is im‐ - plied when a prompt is supplied using either the -p or -f - flags. - - --server - Puts program in server mode. This will launch an HTTP server on - a local port. This server has both a web UI and an OpenAI API - compatible completions endpoint. When the server is run on a - desk system, a tab browser tab will be launched automatically - that displays the web UI. This --server flag is implied if no - prompt is specified, i.e. neither the -p or -f flags are - passed. 
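To make the new mode selection concrete, here is a sketch of how each mode can be invoked from a shell; model.gguf is a placeholder filename, and the no-flag default follows the MODES description added above.

      # default: chat interface in the foreground, with a --server in the background
      llamafile -m model.gguf

      # explicit modes
      llamafile --chat   -m model.gguf
      llamafile --server -m model.gguf
      llamafile --cli    -m model.gguf -p 'four score and'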
- -m FNAME, --model FNAME Model path in the GGUF file format. Default: models/7B/ggml-model-f16.gguf --mmproj FNAME - Specifies path of the LLaVA vision model in the GGUF file for‐ - mat. If this flag is supplied, then the --model and --image + Specifies path of the LLaVA vision model in the GGUF file for‐ + mat. If this flag is supplied, then the --model and --image flags should also be supplied. -s SEED, --seed SEED - Random Number Generator (RNG) seed. A random seed is used if + Random Number Generator (RNG) seed. A random seed is used if this is less than zero. Default: -1 @@ -60,26 +74,6 @@ Default: $(nproc)/2 - -tb N, --threads-batch N - Set the number of threads to use during batch and prompt pro‐ - cessing. In some systems, it is beneficial to use a higher num‐ - ber of threads during batch processing than during generation. - If not specified, the number of threads used for batch process‐ - ing will be the same as the number of threads used for genera‐ - tion. - - Default: Same as --threads - - -td N, --threads-draft N - Number of threads to use during generation. - - Default: Same as --threads - - -tbd N, --threads-batch-draft N - Number of threads to use during batch and prompt processing. - - Default: Same as --threads-draft - --in-prefix-bos Prefix BOS to user inputs, preceding the --in-prefix string. @@ -115,20 +109,15 @@ Default: -1 -c N, --ctx-size N - Set the size of the prompt context. A larger context size helps - the model to better comprehend and generate responses for - longer input or conversations. The LLaMA models were built with - a context of 2048, which yields the best results on longer in‐ - put / inference. - - - 0 = loaded automatically from model - - Default: 512 + Sets the maximum context size, in tokens. In --chat mode, this + value sets a hard limit on how long your conversation can be. + The default is 8192 tokens. If this value is zero, then it'll + be set to the maximum context size the model allows. -b N, --batch-size N Batch size for prompt processing. - Default: 512 + Default: 2048 --top-k N Top-k sampling. @@ -195,7 +184,7 @@ Default: 0.0 --mirostat N - Use Mirostat sampling. Top K, Nucleus, Tail Free and Locally + Use Mirostat sampling. Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.. - 0 = disabled @@ -215,8 +204,8 @@ Default: 5.0 -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS - Modifies the likelihood of token appearing in the completion, - i.e. --logit-bias 15043+1 to increase likelihood of token + Modifies the likelihood of token appearing in the completion, + i.e. --logit-bias 15043+1 to increase likelihood of token ' Hello', or --logit-bias 15043-1 to decrease likelihood of to‐ ken ' Hello'. @@ -247,13 +236,13 @@ fied by the model --rope-scale N - RoPE context scaling factor, expands context by a factor of N - where N is the linear scaling factor used by the fine-tuned - model. Some fine-tuned models have extended the context length + RoPE context scaling factor, expands context by a factor of N + where N is the linear scaling factor used by the fine-tuned + model. Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model - have a context length (max sequence length) of 4096 (4k) and - the fine-tuned model have 32k. That is a scaling factor of 8, - and should work by setting the above --ctx-size to 32768 (32k) + have a context length (max sequence length) of 4096 (4k) and + the fine-tuned model have 32k. 
That is a scaling factor of 8, + and should work by setting the above --ctx-size to 32768 (32k) and --rope-scale to 8. --rope-freq-base N @@ -262,7 +251,7 @@ Default: loaded from model --rope-freq-scale N - RoPE frequency scaling factor, expands context by a factor of + RoPE frequency scaling factor, expands context by a factor of 1/N --yarn-orig-ctx N @@ -293,7 +282,7 @@ Default: 32.0 --ignore-eos - Ignore end of stream token and continue generating (implies + Ignore end of stream token and continue generating (implies --logit-bias 2-inf) --no-penalize-nl @@ -310,7 +299,7 @@ Default: disabled --hellaswag - Compute HellaSwag score over random tasks from datafile sup‐ + Compute HellaSwag score over random tasks from datafile sup‐ plied with -f --hellaswag-tasks N @@ -319,10 +308,10 @@ Default: 400 --keep N - This flag allows users to retain the original prompt when the + This flag allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained, where N is the - number of tokens from the initial prompt to retain when the + number of tokens from the initial prompt to retain when the model resets its internal context. - 0 = no tokens are kept from initial prompt @@ -358,15 +347,15 @@ Default: 0.1 --mlock - Force system to keep model in RAM rather than swapping or com‐ + Force system to keep model in RAM rather than swapping or com‐ pressing. --no-mmap Do not memory-map model (slower load but may reduce pageouts if not using mlock). - --numa Attempt optimizations that help on some NUMA systems if run - without this previously, it is recommended to drop the system + --numa Attempt optimizations that help on some NUMA systems if run + without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437. @@ -376,48 +365,48 @@ --nocompile Never compile GPU support at runtime. - If the appropriate DSO file already exists under ~/.llamafile/ - then it'll be linked as-is without question. If a prebuilt DSO - is present in the PKZIP content of the executable, then it'll - be extracted and linked if possible. Otherwise, llamafile will + If the appropriate DSO file already exists under ~/.llamafile/ + then it'll be linked as-is without question. If a prebuilt DSO + is present in the PKZIP content of the executable, then it'll + be extracted and linked if possible. Otherwise, llamafile will skip any attempt to compile GPU support and simply fall back to using CPU inference. --gpu GPU Specifies which brand of GPU should be used. Valid choices are: - - AUTO: Use any GPU if possible, otherwise fall back to CPU + - AUTO: Use any GPU if possible, otherwise fall back to CPU inference (default) - APPLE: Use Apple Metal GPU. This is only available on MacOS - ARM64. If Metal could not be used for any reason, then a + ARM64. If Metal could not be used for any reason, then a fatal error will be raised. - AMD: Use AMD GPUs. The AMD HIP ROCm SDK should be installed - in which case we assume the HIP_PATH environment variable - has been defined. The set of gfx microarchitectures needed - to run on the host machine is determined automatically - based on the output of the hipInfo command. On Windows, - llamafile release binaries are distributed with a tinyBLAS - DLL so it'll work out of the box without requiring the HIP - SDK to be installed. 
However, tinyBLAS is slower than + in which case we assume the HIP_PATH environment variable + has been defined. The set of gfx microarchitectures needed + to run on the host machine is determined automatically + based on the output of the hipInfo command. On Windows, + llamafile release binaries are distributed with a tinyBLAS + DLL so it'll work out of the box without requiring the HIP + SDK to be installed. However, tinyBLAS is slower than rocBLAS for batch and image processing, so it's recommended - that the SDK be installed anyway. If an AMD GPU could not + that the SDK be installed anyway. If an AMD GPU could not be used for any reason, then a fatal error will be raised. - NVIDIA: Use NVIDIA GPUs. If an NVIDIA GPU could not be used - for any reason, a fatal error will be raised. On Windows, - NVIDIA GPU support will use our tinyBLAS library, since it - works on stock Windows installs. However, tinyBLAS goes + for any reason, a fatal error will be raised. On Windows, + NVIDIA GPU support will use our tinyBLAS library, since it + works on stock Windows installs. However, tinyBLAS goes slower for batch and image processing. It's possible to use - NVIDIA's closed-source cuBLAS library instead. To do that, - both MSVC and CUDA need to be installed and the llamafile + NVIDIA's closed-source cuBLAS library instead. To do that, + both MSVC and CUDA need to be installed and the llamafile command should be run once from the x64 MSVC command prompt - with the --recompile flag passed. The GGML library will - then be compiled and saved to ~/.llamafile/ so the special + with the --recompile flag passed. The GGML library will + then be compiled and saved to ~/.llamafile/ so the special process only needs to happen a single time. - - DISABLE: Never use GPU and instead use CPU inference. This + - DISABLE: Never use GPU and instead use CPU inference. This setting is implied by -ngl 0. -ngl N, --n-gpu-layers N @@ -434,12 +423,12 @@ -ts SPLIT, --tensor-split SPLIT When using multiple GPUs this option controls how large tensors - should be split across all GPUs. SPLIT is a comma-separated + should be split across all GPUs. SPLIT is a comma-separated list of non-negative values that assigns the proportion of data - that each GPU should get in order. For example, "3,2" will as‐ - sign 60% of the data to GPU 0 and 40% to GPU 1. By default the + that each GPU should get in order. For example, "3,2" will as‐ + sign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal - for performance. Requires cuBLAS. How to split tensors across + for performance. Requires cuBLAS. How to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1 -mg i, --main-gpu i @@ -453,7 +442,7 @@ Print prompt before generation. --simple-io - Use basic IO for better compatibility in subprocesses and lim‐ + Use basic IO for better compatibility in subprocesses and lim‐ ited consoles. --lora FNAME @@ -464,14 +453,14 @@ --no-mmap) --lora-base FNAME - Optional model to use as a base for the layers modified by the + Optional model to use as a base for the layers modified by the LoRA adapter --unsecure Disables pledge() sandboxing on Linux and OpenBSD. --samplers - Samplers that will be used for generation in the order, sepa‐ + Samplers that will be used for generation in the order, sepa‐ rated by semicolon, for example: top_k;tfs;typi‐ cal;top_p;min_p;temp @@ -507,7 +496,7 @@ Binary file containing multiple choice tasks. 
--winogrande - Compute Winogrande score over random tasks from datafile sup‐ + Compute Winogrande score over random tasks from datafile sup‐ plied by the -f flag. --winogrande-tasks N @@ -516,17 +505,17 @@ Default: 0 --multiple-choice - Compute multiple choice score over random tasks from datafile + Compute multiple choice score over random tasks from datafile supplied by the -f flag. --multiple-choice-tasks N - Number of tasks to use when computing the multiple choice + Number of tasks to use when computing the multiple choice score. Default: 0 --kl-divergence - Computes KL-divergence to logits provided via the + Computes KL-divergence to logits provided via the --kl-divergence-base flag. --save-all-logits FNAME, --kl-divergence-base FNAME @@ -547,21 +536,21 @@ The model default is used if unspecified. CLI OPTIONS - The following options may be specified when llamafile is running in + The following options may be specified when llamafile is running in --cli mode. -e, --escape Process prompt escapes sequences (\n, \r, \t, \´, \", \\) -p STRING, --prompt STRING - Prompt to start text generation. Your LLM works by auto-com‐ + Prompt to start text generation. Your LLM works by auto-com‐ pleting this text. For example: llamafile -m model.gguf -p "four score and" - Stands a pretty good chance of printing Lincoln's Gettysburg - Address. Prompts can take on a structured format too. Depend‐ - ing on how your model was trained, it may specify in its docs + Stands a pretty good chance of printing Lincoln's Gettysburg + Address. Prompts can take on a structured format too. Depend‐ + ing on how your model was trained, it may specify in its docs an instruction notation. With some models that might be: llamafile -p "[INST]Summarize this: $(cat file)[/INST]" @@ -579,36 +568,36 @@ root ::= "yes" | "no" - will force the LLM to only output yes or no before exiting. - This is useful for shell scripts when the --no-display-prompt + will force the LLM to only output yes or no before exiting. + This is useful for shell scripts when the --no-display-prompt flag is also supplied. --grammar-file FNAME File to read grammar from. - --fast Put llamafile into fast math mode. This disables algorithms - that reduce floating point rounding, e.g. Kahan summation, and + --fast Put llamafile into fast math mode. This disables algorithms + that reduce floating point rounding, e.g. Kahan summation, and certain functions like expf() will be vectorized but handle un‐ - derflows less gracefully. It's unspecified whether llamafile - runs in fast or precise math mode when neither flag is speci‐ + derflows less gracefully. It's unspecified whether llamafile + runs in fast or precise math mode when neither flag is speci‐ fied. --precise - Put llamafile into precise math mode. This enables algorithms - that reduce floating point rounding, e.g. Kahan summation, and - certain functions like expf() will always handle subnormals - correctly. It's unspecified whether llamafile runs in fast or + Put llamafile into precise math mode. This enables algorithms + that reduce floating point rounding, e.g. Kahan summation, and + certain functions like expf() will always handle subnormals + correctly. It's unspecified whether llamafile runs in fast or precise math mode when neither flag is specified. - --trap Put llamafile into math trapping mode. When floating point ex‐ - ceptions occur, such as NaNs, overflow, and divide by zero, - llamafile will print a warning to the console. 
This warning - will include a C++ backtrace the first time an exception is - trapped. The op graph will also be dumped to a file, and lla‐ - mafile will report the specific op where the exception oc‐ - curred. This is useful for troubleshooting when reporting is‐ - sues. USing this feature will disable sandboxing. Math trap‐ - ping is only possible if your CPU supports it. That is gener‐ + --trap Put llamafile into math trapping mode. When floating point ex‐ + ceptions occur, such as NaNs, overflow, and divide by zero, + llamafile will print a warning to the console. This warning + will include a C++ backtrace the first time an exception is + trapped. The op graph will also be dumped to a file, and lla‐ + mafile will report the specific op where the exception oc‐ + curred. This is useful for troubleshooting when reporting is‐ + sues. USing this feature will disable sandboxing. Math trap‐ + ping is only possible if your CPU supports it. That is gener‐ ally the case on AMD64, however it's less common on ARM64. --prompt-cache FNAME @@ -617,12 +606,12 @@ Default: none -fa FNAME, --flash-attn - Enable Flash Attention. This is a mathematical shortcut that - can speed up inference for certain models. This feature is + Enable Flash Attention. This is a mathematical shortcut that + can speed up inference for certain models. This feature is still under active development. --prompt-cache-all - If specified, saves user input and generations to cache as + If specified, saves user input and generations to cache as well. Not supported with --interactive or other interactive op‐ tions. @@ -634,47 +623,47 @@ --image IMAGE_FILE Path to an image file. This should be used with multimodal mod‐ - els. Alternatively, it's possible to embed an image directly - into the prompt instead; in which case, it must be base64 en‐ - coded into an HTML img tag URL with the image/jpeg MIME type. + els. Alternatively, it's possible to embed an image directly + into the prompt instead; in which case, it must be base64 en‐ + coded into an HTML img tag URL with the image/jpeg MIME type. See also the --mmproj flag for supplying the vision model. -i, --interactive - Run the program in interactive mode, allowing users to engage - in real-time conversations or provide specific instructions to + Run the program in interactive mode, allowing users to engage + in real-time conversations or provide specific instructions to the model. --interactive-first - Run the program in interactive mode and immediately wait for + Run the program in interactive mode and immediately wait for user input before starting the text generation. -ins, --instruct - Run the program in instruction mode, which is specifically de‐ - signed to work with Alpaca models that excel in completing + Run the program in instruction mode, which is specifically de‐ + signed to work with Alpaca models that excel in completing tasks based on user instructions. Technical details: The user's input is internally prefixed with - the reverse prompt (or "### Instruction:" as the default), and - followed by "### Response:" (except if you just press Return + the reverse prompt (or "### Instruction:" as the default), and + followed by "### Response:" (except if you just press Return without any input, to keep generating a longer response). 
- By understanding and utilizing these interaction options, you + By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA mod‐ - els, tailoring the text generation process to your specific + els, tailoring the text generation process to your specific needs. -r PROMPT, --reverse-prompt PROMPT - Specify one or multiple reverse prompts to pause text genera‐ - tion and switch to interactive mode. For example, -r "User:" - can be used to jump back into the conversation whenever it's - the user's turn to speak. This helps create a more interactive - and conversational experience. However, the reverse prompt - doesn't work when it ends with a space. To overcome this limi‐ - tation, you can use the --in-prefix flag to add a space or any + Specify one or multiple reverse prompts to pause text genera‐ + tion and switch to interactive mode. For example, -r "User:" + can be used to jump back into the conversation whenever it's + the user's turn to speak. This helps create a more interactive + and conversational experience. However, the reverse prompt + doesn't work when it ends with a space. To overcome this limi‐ + tation, you can use the --in-prefix flag to add a space or any other characters after the reverse prompt. --color - Enable colorized output to differentiate visually distinguish‐ + Enable colorized output to differentiate visually distinguish‐ ing between prompts, user input, and generated text. --no-display-prompt, --silent-prompt @@ -689,23 +678,23 @@ in '\'. --cont-batching - Enables continuous batching, a.k.a. dynamic batching. is -1 + Enables continuous batching, a.k.a. dynamic batching. is -1 which means all tokens. --embedding - In CLI mode, the embedding flag may be use to print embeddings - to standard output. By default, embeddings are computed over a - whole prompt. However the --multiline flag may be passed, to + In CLI mode, the embedding flag may be use to print embeddings + to standard output. By default, embeddings are computed over a + whole prompt. However the --multiline flag may be passed, to have a separate embeddings array computed for each line of text - in the prompt. In multiline mode, each embedding array will be - printed on its own line to standard output, where individual - floats are separated by space. If both the --multiline-input - and --interactive flags are passed, then a pretty-printed sum‐ - mary of embeddings along with a cosine similarity matrix will + in the prompt. In multiline mode, each embedding array will be + printed on its own line to standard output, where individual + floats are separated by space. If both the --multiline-input + and --interactive flags are passed, then a pretty-printed sum‐ + mary of embeddings along with a cosine similarity matrix will be printed to the terminal. SERVER OPTIONS - The following options may be specified when llamafile is running in + The following options may be specified when llamafile is running in --server mode. --port PORT @@ -734,11 +723,11 @@ Default: disabled -spf FNAME, --system-prompt-file FNAME - Set a file to load a system prompt (initial prompt of all + Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications. -a ALIAS, --alias ALIAS - Set an alias for the model. This will be added as the model + Set an alias for the model. This will be added as the model field in completion responses. --path PUBLIC_PATH @@ -750,13 +739,13 @@ Do not attempt to open a web browser tab at startup. 
-gan N, --grp-attn-n N - Set the group attention factor to extend context size through - self-extend. The default value is 1 which means disabled. This + Set the group attention factor to extend context size through + self-extend. The default value is 1 which means disabled. This flag is used together with --grp-attn-w. -gaw N, --grp-attn-w N - Set the group attention width to extend context size through - self-extend. The default value is 512. This flag is used to‐ + Set the group attention width to extend context size through + self-extend. The default value is 512. This flag is used to‐ gether with --grp-attn-n. LOG OPTIONS @@ -778,15 +767,15 @@ Specify a log filename (without extension) --log-new - Create a separate new log file on start. Each log file will + Create a separate new log file on start. Each log file will have unique name: ..log --log-append Don't truncate the old log file. EXAMPLES - Here's an example of how to run llama.cpp's built-in HTTP server. This - example uses LLaVA v1.5-7B, a multimodal LLM that works with + Here's an example of how to run llama.cpp's built-in HTTP server. This + example uses LLaVA v1.5-7B, a multimodal LLM that works with llama.cpp's recently-added support for image inputs. llamafile \ @@ -802,14 +791,14 @@ -m wizardcoder-python-13b-v1.0.Q8_0.gguf --temp 0 -r '}\n' -r '```\n' \ -e -p '```c\nvoid *memcpy(void *dst, const void *src, size_t size) {\n' - Here's a similar example that instead utilizes Mistral-7B-Instruct + Here's a similar example that instead utilizes Mistral-7B-Instruct weights for prose composition: llamafile \ -m mistral-7b-instruct-v0.2.Q5_K_M.gguf \ -p '[INST]Write a story about llamas[/INST]' - Here's an example of how llamafile can be used as an interactive chat‐ + Here's an example of how llamafile can be used as an interactive chat‐ bot that lets you query knowledge contained in training data: llamafile -m llama-65b-Q5_K.gguf -p ' @@ -849,7 +838,7 @@ -e -p '### User: What do you see?\n### Assistant: ' \ --no-display-prompt 2>/dev/null - If you wanted to write a script to rename all your image files, you + If you wanted to write a script to rename all your image files, you could use the following command to generate a safe filename: llamafile --temp 0 \ @@ -862,8 +851,8 @@ sed -e's/ /_/g' -e's/$/.jpg/' three_baby_lemurs_on_the_back_of_an_adult_lemur.jpg - Here's an example of how to make an API request to the OpenAI API com‐ - patible completions endpoint when your llamafile is running in the + Here's an example of how to make an API request to the OpenAI API com‐ + patible completions endpoint when your llamafile is running in the background in --server mode. curl -s http://localhost:8080/v1/chat/completions \ @@ -888,12 +877,12 @@ PROTIP The -ngl 35 flag needs to be passed in order to use GPUs made by NVIDIA - and AMD. It's not enabled by default since it sometimes needs to be - tuned based on the system hardware and model architecture, in order to + and AMD. It's not enabled by default since it sometimes needs to be + tuned based on the system hardware and model architecture, in order to achieve optimal performance, and avoid compromising a shared display. 
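Because the default mode now keeps a --server running behind the chat interface, the OpenAI-compatible endpoint shown above can also be queried while a chat session is open. The following is a sketch, assuming the port used in the examples above (8080), a placeholder model filename, and a conventional chat-completions request body.

      # terminal 1: default mode (chat in the foreground, server in the background)
      llamafile -ngl 35 -m model.gguf

      # terminal 2: query the background server over HTTP
      curl -s http://localhost:8080/v1/chat/completions \
           -H "Content-Type: application/json" \
           -d '{
                 "model": "gpt-3.5-turbo",
                 "messages": [{"role": "user", "content": "Say hello."}]
               }'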
SEE ALSO - llamafile-quantize(1), llamafile-perplexity(1), llava-quantize(1), + llamafile-quantize(1), llamafile-perplexity(1), llava-quantize(1), zipalign(1), unzip(1) -Mozilla Ocho January 1, 2024 LLAMAFILE(1) +Mozilla Ocho October 12, 2024 LLAMAFILE(1) diff --git a/llama.cpp/main/main.cpp b/llama.cpp/main/main.cpp index abdf66e53c..85cb95fd9c 100644 --- a/llama.cpp/main/main.cpp +++ b/llama.cpp/main/main.cpp @@ -194,15 +194,14 @@ int main(int argc, char ** argv) { enum Program prog = determine_program(argv); - if (prog == SERVER || + if (prog == SERVER) + return server_cli(argc, argv); + + if (prog == CHATBOT || (prog == UNKNOWN && !llamafile_has(argv, "-p") && !llamafile_has(argv, "-f") && !llamafile_has(argv, "--random-prompt"))) { - return server_cli(argc, argv); - } - - if (prog == CHATBOT) { int chatbot_main(int, char **); return chatbot_main(argc, argv); } @@ -236,7 +235,7 @@ int main(int argc, char ** argv) { return 1; } - if (!FLAG_unsecure && !llamafile_has_gpu()) { + if (!FLAG_unsecure && !llamafile_has_gpu() && !g_server_background_mode) { // Enable pledge() security on Linux and OpenBSD. // - We do this *after* opening the log file for writing. // - We do this *before* loading any weights or graphdefs. diff --git a/llama.cpp/server/server.cpp b/llama.cpp/server/server.cpp index c94eb15888..da758c81de 100644 --- a/llama.cpp/server/server.cpp +++ b/llama.cpp/server/server.cpp @@ -29,6 +29,10 @@ double g_prompt_per_second_jart; +bool g_server_background_mode; +llama_model *g_server_force_llama_model; +void (*g_server_on_listening)(const char *host, int port); + using json = nlohmann::json; struct server_params @@ -443,9 +447,16 @@ struct llama_server_context } } - llama_init_result llama_init = llama_init_from_gpt_params(params); - model = llama_init.model; - ctx = llama_init.context; + if (!g_server_force_llama_model) { + llama_init_result llama_init = llama_init_from_gpt_params(params); + model = llama_init.model; + ctx = llama_init.context; + } else { + llama_context_params ctx_params = llama_context_params_from_gpt_params(params); + model = g_server_force_llama_model; + ctx = llama_new_context_with_model(model, ctx_params); + } + if (model == nullptr) { LOG_ERROR("unable to load model", {{"model", params.model}}); @@ -3129,13 +3140,16 @@ int server_cli(int argc, char **argv) } // launch browser tab - if (!sparams.nobrowser) { + if (!sparams.nobrowser && !g_server_background_mode) { char url[128]; snprintf(url, sizeof(url), "http://%s:%d/", connect_host, sparams.port); llamafile_launch_browser(url); } + if (g_server_on_listening) { + g_server_on_listening(connect_host, sparams.port); + } - if (!FLAG_unsecure) { + if (!FLAG_unsecure && !g_server_background_mode) { if (IsXnu()) { // Cosmopolitan libc explicitly does not support cosmo_dlopen on x64 // macOS and mac_sandbox_init depends on cosmo_dlopen. 
We'll attempt @@ -3685,6 +3699,7 @@ int server_cli(int argc, char **argv) llama.queue_tasks.terminate(); }; + if (!g_server_background_mode) { #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; sigint_action.sa_handler = signal_handler; @@ -3697,6 +3712,8 @@ int server_cli(int argc, char **argv) }; SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif + } + llama.queue_tasks.start_loop(); svr.stop(); t.join(); diff --git a/llama.cpp/server/server.h b/llama.cpp/server/server.h index f90fa4e2ec..572e4f1e8b 100644 --- a/llama.cpp/server/server.h +++ b/llama.cpp/server/server.h @@ -1,2 +1,9 @@ #pragma once +#include "llama.cpp/llama.h" + +extern bool server_log_json; +extern bool g_server_background_mode; +extern llama_model *g_server_force_llama_model; +extern void (*g_server_on_listening)(const char *host, int port); + int server_cli(int, char **); diff --git a/llama.cpp/server/utils.h b/llama.cpp/server/utils.h index ad169d4ff3..cde8548b28 100644 --- a/llama.cpp/server/utils.h +++ b/llama.cpp/server/utils.h @@ -138,6 +138,9 @@ struct completion_token_output static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) { + if (FLAG_log_disable) // [jart] + return; + std::stringstream ss_tid; ss_tid << std::this_thread::get_id(); json log = nlohmann::ordered_json{ diff --git a/llamafile/chatbot.cpp b/llamafile/chatbot.cpp index 54d88e92fe..eecfe7d155 100644 --- a/llamafile/chatbot.cpp +++ b/llamafile/chatbot.cpp @@ -27,6 +27,7 @@ #include "llama.cpp/common.h" #include "llama.cpp/llama.h" +#include "llama.cpp/server/server.h" #include "llamafile/bestline.h" #include "llamafile/highlight.h" #include "llamafile/llamafile.h" @@ -43,9 +44,17 @@ #define BRIGHT_GREEN "\e[92m" #define CLEAR_FORWARD "\e[K" +struct ServerArgs { + int argc; + char **argv; +}; + static int n_past; static llama_model *g_model; static llama_context *g_ctx; +static pthread_cond_t g_cond = PTHREAD_COND_INITIALIZER; +static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER; +static std::string g_listen_url; static volatile sig_atomic_t g_got_sigint; static void on_sigint(int sig) { @@ -195,19 +204,25 @@ static void eval_string(const std::string &str, int n_batch, bool add_special, b eval_tokens(llama_tokenize(g_ctx, str, add_special, parse_special), n_batch); } +static void on_server_listening(const char *host, int port) { + pthread_mutex_lock(&g_lock); + g_listen_url = format("http://%s:%d/", host, port); + pthread_cond_signal(&g_cond); + pthread_mutex_unlock(&g_lock); +} + +static void *server_thread(void *arg) { + ServerArgs *sargs = (ServerArgs *)arg; + server_log_json = false; + g_server_background_mode = true; + g_server_force_llama_model = g_model; + g_server_on_listening = on_server_listening; + exit(server_cli(sargs->argc, sargs->argv)); +} + int chatbot_main(int argc, char **argv) { - llamafile_check_cpu(); - ShowCrashReports(); log_disable(); - gpt_params params; - params.n_batch = 512; // for better progress indication - params.sparams.temp = 0; // don't believe in randomness by default - if (!gpt_params_parse(argc, argv, params)) { - fprintf(stderr, "error: failed to parse flags\n"); - exit(1); - } - print_logo(u"\n\ ██╗ ██╗ █████╗ ███╗ ███╗ █████╗ ███████╗██╗██╗ ███████╗\n\ ██║ ██║ ██╔══██╗████╗ ████║██╔══██╗██╔════╝██║██║ ██╔════╝\n\ @@ -216,15 +231,18 @@ int chatbot_main(int argc, char **argv) { ███████╗███████╗██║ ██║██║ ╚═╝ ██║██║ ██║██║ ██║███████╗███████╗\n\ 
╚══════╝╚══════╝╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝╚══════╝╚══════╝\n"); - printf(BOLD "software" UNBOLD ": llamafile " LLAMAFILE_VERSION_STRING "\n" // - BOLD "model" UNBOLD ": %s\n\n", - basename(params.model).c_str()); - - print_ephemeral("initializing backend..."); + print_ephemeral("loading backend..."); llama_backend_init(); + gpt_params params; + params.n_batch = 512; // for better progress indication + params.sparams.temp = 0; // don't believe in randomness by default + if (!gpt_params_parse(argc, argv, params)) { // also loads gpu module + fprintf(stderr, "error: failed to parse flags\n"); + exit(1); + } clear_ephemeral(); - print_ephemeral("initializing model..."); + print_ephemeral("loading model..."); llama_model_params model_params = llama_model_default_params(); model_params.n_gpu_layers = llamafile_gpu_layers(35); g_model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -233,12 +251,35 @@ int chatbot_main(int argc, char **argv) { fprintf(stderr, "%s: failed to load model\n", params.model.c_str()); exit(2); } - if (!params.n_ctx) + if (params.n_ctx <= 0 || params.n_ctx > llama_n_ctx_train(g_model)) params.n_ctx = llama_n_ctx_train(g_model); if (params.n_ctx < params.n_batch) params.n_batch = params.n_ctx; clear_ephemeral(); + bool want_server = !llamafile_has(argv, "--chat"); + if (want_server) { + print_ephemeral("launching server..."); + pthread_t thread; + pthread_attr_t attr; + ServerArgs sargs = {argc, argv}; + pthread_mutex_lock(&g_lock); + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + pthread_create(&thread, &attr, server_thread, &sargs); + pthread_attr_destroy(&attr); + pthread_cond_wait(&g_cond, &g_lock); + pthread_mutex_unlock(&g_lock); + clear_ephemeral(); + } + + printf(BOLD "software" UNBOLD ": llamafile " LLAMAFILE_VERSION_STRING "\n" // + BOLD "model" UNBOLD ": %s\n", + basename(params.model).c_str()); + if (want_server) + printf(BOLD "server" UNBOLD ": %s\n", g_listen_url.c_str()); + printf("\n"); + print_ephemeral("initializing context..."); llama_context_params ctx_params = llama_context_params_from_gpt_params(params); g_ctx = llama_new_context_with_model(g_model, ctx_params); @@ -250,9 +291,9 @@ int chatbot_main(int argc, char **argv) { clear_ephemeral(); if (params.prompt.empty()) - params.prompt = - "A chat between a curious human and an artificial intelligence assistant. The " - "assistant gives helpful, detailed, and polite answers to the human's questions."; + params.prompt = "A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the " + "human's questions."; bool add_bos = llama_should_add_bos_token(llama_get_model(g_ctx)); std::vector chat = {{"system", params.prompt}};
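The default system prompt in the hunk above is only used when no -p flag was given. As a usage sketch tying this back to the documented options (the model filename is a placeholder), the system prompt shown at the start of a chat and the conversation's hard context limit can both be set on the command line:

      # replace the default system prompt and cap the context at 4096 tokens
      llamafile -m model.gguf \
          -p 'You are a terse assistant who answers in one sentence.' \
          -c 4096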