From 4199dae4156ae3c37b5ac50ad6ed1568577dcec7 Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Sat, 12 Oct 2024 19:34:06 -0700 Subject: [PATCH] Make chat+server hybrid the new default mode --- llama.cpp/main/main.1 | 85 ++++----- llama.cpp/main/main.1.asc | 331 +++++++++++++++++------------------- llama.cpp/main/main.cpp | 11 +- llama.cpp/server/server.cpp | 27 ++- llama.cpp/server/server.h | 7 + llama.cpp/server/utils.h | 3 + llamafile/chatbot.cpp | 81 ++++++--- 7 files changed, 303 insertions(+), 242 deletions(-) diff --git a/llama.cpp/main/main.1 b/llama.cpp/main/main.1 index d2f51bfcd3..b396f3318e 100644 --- a/llama.cpp/main/main.1 +++ b/llama.cpp/main/main.1 @@ -1,4 +1,4 @@ -.Dd January 1, 2024 +.Dd October 12, 2024 .Dt LLAMAFILE 1 .Os Mozilla Ocho .Sh NAME @@ -6,6 +6,10 @@ .Nd large language model runner .Sh SYNOPSIS .Nm +.Op Fl Fl chat +.Op flags... +.Fl m Ar model.gguf +.Nm .Op Fl Fl server .Op flags... .Fl m Ar model.gguf @@ -36,13 +40,20 @@ Chatbot that passes the Turing test .It Text/image summarization and analysis .El -.Sh OPTIONS -The following options are available: +.Sh MODES +.Pp +There's three modes of operation: +.Fl Fl chat , +.Fl Fl server , +and +.Fl Fl cli . +If none of these flags is specified, then llamafile makes its best guess +about which mode is best. By default, the +.Fl Fl chat +interface is launched in the foreground with a +.Fl Fl server +in the background. .Bl -tag -width indent -.It Fl Fl version -Print version and exit. -.It Fl h , Fl Fl help -Show help message and exit. .It Fl Fl cli Puts program in command line interface mode. This flag is implied when a prompt is supplied using either the @@ -50,9 +61,19 @@ prompt is supplied using either the or .Fl f flags. +.It Fl Fl chat +Puts program in command line chatbot only mode. This mode launches an +interactive shell that lets you talk to your LLM, which should be +specified using the +.Fl m +flag. This mode also launches a server in the background. The system +prompt that's displayed at the start of your conversation may be changed +by passing the +.Fl p +flag. .It Fl Fl server -Puts program in server mode. This will launch an HTTP server on a local -port. This server has both a web UI and an OpenAI API compatible +Puts program in server only mode. This will launch an HTTP server on a +local port. This server has both a web UI and an OpenAI API compatible completions endpoint. When the server is run on a desk system, a tab browser tab will be launched automatically that displays the web UI. This @@ -62,6 +83,15 @@ flag is implied if no prompt is specified, i.e. neither the or .Fl f flags are passed. +.El +.Sh OPTIONS +.Pp +The following options are available: +.Bl -tag -width indent +.It Fl Fl version +Print version and exit. +.It Fl h , Fl Fl help +Show help message and exit. .It Fl m Ar FNAME , Fl Fl model Ar FNAME Model path in the GGUF file format. .Pp @@ -83,25 +113,6 @@ Default: -1 Number of threads to use during generation. .Pp Default: $(nproc)/2 -.It Fl tb Ar N , Fl Fl threads-batch Ar N -Set the number of threads to use during batch and prompt processing. In -some systems, it is beneficial to use a higher number of threads during -batch processing than during generation. If not specified, the number of -threads used for batch processing will be the same as the number of -threads used for generation. -.Pp -Default: Same as -.Fl Fl threads -.It Fl td Ar N , Fl Fl threads-draft Ar N -Number of threads to use during generation. 
-.Pp -Default: Same as -.Fl Fl threads -.It Fl tbd Ar N , Fl Fl threads-batch-draft Ar N -Number of threads to use during batch and prompt processing. -.Pp -Default: Same as -.Fl Fl threads-draft .It Fl Fl in-prefix-bos Prefix BOS to user inputs, preceding the .Fl Fl in-prefix @@ -143,21 +154,15 @@ Number of tokens to predict. .Pp Default: -1 .It Fl c Ar N , Fl Fl ctx-size Ar N -Set the size of the prompt context. A larger context size helps the -model to better comprehend and generate responses for longer input or -conversations. The LLaMA models were built with a context of 2048, which -yields the best results on longer input / inference. -.Pp -.Bl -dash -compact -.It -0 = loaded automatically from model -.El -.Pp -Default: 512 +Sets the maximum context size, in tokens. In +.Fl Fl chat +mode, this value sets a hard limit on how long your conversation can be. +The default is 8192 tokens. If this value is zero, then it'll be set to +the maximum context size the model allows. .It Fl b Ar N , Fl Fl batch-size Ar N Batch size for prompt processing. .Pp -Default: 512 +Default: 2048 .It Fl Fl top-k Ar N Top-k sampling. .Pp diff --git a/llama.cpp/main/main.1.asc b/llama.cpp/main/main.1.asc index 2fa76cbf83..6acae6febf 100644 --- a/llama.cpp/main/main.1.asc +++ b/llama.cpp/main/main.1.asc @@ -4,6 +4,7 @@ llamafile — large language model runner SYNOPSIS + llamafile [--chat] [flags...] -m model.gguf llamafile [--server] [flags...] -m model.gguf [--mmproj vision.gguf] llamafile [--cli] [flags...] -m model.gguf -p prompt llamafile [--cli] [flags...] -m model.gguf --mmproj vision.gguf --image @@ -17,6 +18,32 @@ - Chatbot that passes the Turing test - Text/image summarization and analysis +MODES + There's three modes of operation: --chat, --server, and --cli. If none + of these flags is specified, then llamafile makes its best guess about + which mode is best. By default, the --chat interface is launched in the + foreground with a --server in the background. + + --cli Puts program in command line interface mode. This flag is im‐ + plied when a prompt is supplied using either the -p or -f + flags. + + --chat Puts program in command line chatbot only mode. This mode + launches an interactive shell that lets you talk to your LLM, + which should be specified using the -m flag. This mode also + launches a server in the background. The system prompt that's + displayed at the start of your conversation may be changed by + passing the -p flag. + + --server + Puts program in server only mode. This will launch an HTTP + server on a local port. This server has both a web UI and an + OpenAI API compatible completions endpoint. When the server is + run on a desk system, a tab browser tab will be launched auto‐ + matically that displays the web UI. This --server flag is im‐ + plied if no prompt is specified, i.e. neither the -p or -f + flags are passed. + OPTIONS The following options are available: @@ -26,31 +53,18 @@ -h, --help Show help message and exit. - --cli Puts program in command line interface mode. This flag is im‐ - plied when a prompt is supplied using either the -p or -f - flags. - - --server - Puts program in server mode. This will launch an HTTP server on - a local port. This server has both a web UI and an OpenAI API - compatible completions endpoint. When the server is run on a - desk system, a tab browser tab will be launched automatically - that displays the web UI. This --server flag is implied if no - prompt is specified, i.e. neither the -p or -f flags are - passed. 
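To make the new mode selection concrete, here is a sketch of how each mode can be invoked from a shell; model.gguf is a placeholder filename, and the no-flag default follows the MODES description added above.

      # default: chat interface in the foreground, with a --server in the background
      llamafile -m model.gguf

      # explicit modes
      llamafile --chat   -m model.gguf
      llamafile --server -m model.gguf
      llamafile --cli    -m model.gguf -p 'four score and'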
- -m FNAME, --model FNAME Model path in the GGUF file format. Default: models/7B/ggml-model-f16.gguf --mmproj FNAME - Specifies path of the LLaVA vision model in the GGUF file for‐ - mat. If this flag is supplied, then the --model and --image + Specifies path of the LLaVA vision model in the GGUF file for‐ + mat. If this flag is supplied, then the --model and --image flags should also be supplied. -s SEED, --seed SEED - Random Number Generator (RNG) seed. A random seed is used if + Random Number Generator (RNG) seed. A random seed is used if this is less than zero. Default: -1 @@ -60,26 +74,6 @@ Default: $(nproc)/2 - -tb N, --threads-batch N - Set the number of threads to use during batch and prompt pro‐ - cessing. In some systems, it is beneficial to use a higher num‐ - ber of threads during batch processing than during generation. - If not specified, the number of threads used for batch process‐ - ing will be the same as the number of threads used for genera‐ - tion. - - Default: Same as --threads - - -td N, --threads-draft N - Number of threads to use during generation. - - Default: Same as --threads - - -tbd N, --threads-batch-draft N - Number of threads to use during batch and prompt processing. - - Default: Same as --threads-draft - --in-prefix-bos Prefix BOS to user inputs, preceding the --in-prefix string. @@ -115,20 +109,15 @@ Default: -1 -c N, --ctx-size N - Set the size of the prompt context. A larger context size helps - the model to better comprehend and generate responses for - longer input or conversations. The LLaMA models were built with - a context of 2048, which yields the best results on longer in‐ - put / inference. - - - 0 = loaded automatically from model - - Default: 512 + Sets the maximum context size, in tokens. In --chat mode, this + value sets a hard limit on how long your conversation can be. + The default is 8192 tokens. If this value is zero, then it'll + be set to the maximum context size the model allows. -b N, --batch-size N Batch size for prompt processing. - Default: 512 + Default: 2048 --top-k N Top-k sampling. @@ -195,7 +184,7 @@ Default: 0.0 --mirostat N - Use Mirostat sampling. Top K, Nucleus, Tail Free and Locally + Use Mirostat sampling. Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.. - 0 = disabled @@ -215,8 +204,8 @@ Default: 5.0 -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS - Modifies the likelihood of token appearing in the completion, - i.e. --logit-bias 15043+1 to increase likelihood of token + Modifies the likelihood of token appearing in the completion, + i.e. --logit-bias 15043+1 to increase likelihood of token ' Hello', or --logit-bias 15043-1 to decrease likelihood of to‐ ken ' Hello'. @@ -247,13 +236,13 @@ fied by the model --rope-scale N - RoPE context scaling factor, expands context by a factor of N - where N is the linear scaling factor used by the fine-tuned - model. Some fine-tuned models have extended the context length + RoPE context scaling factor, expands context by a factor of N + where N is the linear scaling factor used by the fine-tuned + model. Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model - have a context length (max sequence length) of 4096 (4k) and - the fine-tuned model have 32k. That is a scaling factor of 8, - and should work by setting the above --ctx-size to 32768 (32k) + have a context length (max sequence length) of 4096 (4k) and + the fine-tuned model have 32k. 
That is a scaling factor of 8, + and should work by setting the above --ctx-size to 32768 (32k) and --rope-scale to 8. --rope-freq-base N @@ -262,7 +251,7 @@ Default: loaded from model --rope-freq-scale N - RoPE frequency scaling factor, expands context by a factor of + RoPE frequency scaling factor, expands context by a factor of 1/N --yarn-orig-ctx N @@ -293,7 +282,7 @@ Default: 32.0 --ignore-eos - Ignore end of stream token and continue generating (implies + Ignore end of stream token and continue generating (implies --logit-bias 2-inf) --no-penalize-nl @@ -310,7 +299,7 @@ Default: disabled --hellaswag - Compute HellaSwag score over random tasks from datafile sup‐ + Compute HellaSwag score over random tasks from datafile sup‐ plied with -f --hellaswag-tasks N @@ -319,10 +308,10 @@ Default: 400 --keep N - This flag allows users to retain the original prompt when the + This flag allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained, where N is the - number of tokens from the initial prompt to retain when the + number of tokens from the initial prompt to retain when the model resets its internal context. - 0 = no tokens are kept from initial prompt @@ -358,15 +347,15 @@ Default: 0.1 --mlock - Force system to keep model in RAM rather than swapping or com‐ + Force system to keep model in RAM rather than swapping or com‐ pressing. --no-mmap Do not memory-map model (slower load but may reduce pageouts if not using mlock). - --numa Attempt optimizations that help on some NUMA systems if run - without this previously, it is recommended to drop the system + --numa Attempt optimizations that help on some NUMA systems if run + without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437. @@ -376,48 +365,48 @@ --nocompile Never compile GPU support at runtime. - If the appropriate DSO file already exists under ~/.llamafile/ - then it'll be linked as-is without question. If a prebuilt DSO - is present in the PKZIP content of the executable, then it'll - be extracted and linked if possible. Otherwise, llamafile will + If the appropriate DSO file already exists under ~/.llamafile/ + then it'll be linked as-is without question. If a prebuilt DSO + is present in the PKZIP content of the executable, then it'll + be extracted and linked if possible. Otherwise, llamafile will skip any attempt to compile GPU support and simply fall back to using CPU inference. --gpu GPU Specifies which brand of GPU should be used. Valid choices are: - - AUTO: Use any GPU if possible, otherwise fall back to CPU + - AUTO: Use any GPU if possible, otherwise fall back to CPU inference (default) - APPLE: Use Apple Metal GPU. This is only available on MacOS - ARM64. If Metal could not be used for any reason, then a + ARM64. If Metal could not be used for any reason, then a fatal error will be raised. - AMD: Use AMD GPUs. The AMD HIP ROCm SDK should be installed - in which case we assume the HIP_PATH environment variable - has been defined. The set of gfx microarchitectures needed - to run on the host machine is determined automatically - based on the output of the hipInfo command. On Windows, - llamafile release binaries are distributed with a tinyBLAS - DLL so it'll work out of the box without requiring the HIP - SDK to be installed. 
However, tinyBLAS is slower than + in which case we assume the HIP_PATH environment variable + has been defined. The set of gfx microarchitectures needed + to run on the host machine is determined automatically + based on the output of the hipInfo command. On Windows, + llamafile release binaries are distributed with a tinyBLAS + DLL so it'll work out of the box without requiring the HIP + SDK to be installed. However, tinyBLAS is slower than rocBLAS for batch and image processing, so it's recommended - that the SDK be installed anyway. If an AMD GPU could not + that the SDK be installed anyway. If an AMD GPU could not be used for any reason, then a fatal error will be raised. - NVIDIA: Use NVIDIA GPUs. If an NVIDIA GPU could not be used - for any reason, a fatal error will be raised. On Windows, - NVIDIA GPU support will use our tinyBLAS library, since it - works on stock Windows installs. However, tinyBLAS goes + for any reason, a fatal error will be raised. On Windows, + NVIDIA GPU support will use our tinyBLAS library, since it + works on stock Windows installs. However, tinyBLAS goes slower for batch and image processing. It's possible to use - NVIDIA's closed-source cuBLAS library instead. To do that, - both MSVC and CUDA need to be installed and the llamafile + NVIDIA's closed-source cuBLAS library instead. To do that, + both MSVC and CUDA need to be installed and the llamafile command should be run once from the x64 MSVC command prompt - with the --recompile flag passed. The GGML library will - then be compiled and saved to ~/.llamafile/ so the special + with the --recompile flag passed. The GGML library will + then be compiled and saved to ~/.llamafile/ so the special process only needs to happen a single time. - - DISABLE: Never use GPU and instead use CPU inference. This + - DISABLE: Never use GPU and instead use CPU inference. This setting is implied by -ngl 0. -ngl N, --n-gpu-layers N @@ -434,12 +423,12 @@ -ts SPLIT, --tensor-split SPLIT When using multiple GPUs this option controls how large tensors - should be split across all GPUs. SPLIT is a comma-separated + should be split across all GPUs. SPLIT is a comma-separated list of non-negative values that assigns the proportion of data - that each GPU should get in order. For example, "3,2" will as‐ - sign 60% of the data to GPU 0 and 40% to GPU 1. By default the + that each GPU should get in order. For example, "3,2" will as‐ + sign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal - for performance. Requires cuBLAS. How to split tensors across + for performance. Requires cuBLAS. How to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1 -mg i, --main-gpu i @@ -453,7 +442,7 @@ Print prompt before generation. --simple-io - Use basic IO for better compatibility in subprocesses and lim‐ + Use basic IO for better compatibility in subprocesses and lim‐ ited consoles. --lora FNAME @@ -464,14 +453,14 @@ --no-mmap) --lora-base FNAME - Optional model to use as a base for the layers modified by the + Optional model to use as a base for the layers modified by the LoRA adapter --unsecure Disables pledge() sandboxing on Linux and OpenBSD. --samplers - Samplers that will be used for generation in the order, sepa‐ + Samplers that will be used for generation in the order, sepa‐ rated by semicolon, for example: top_k;tfs;typi‐ cal;top_p;min_p;temp @@ -507,7 +496,7 @@ Binary file containing multiple choice tasks. 
--winogrande - Compute Winogrande score over random tasks from datafile sup‐ + Compute Winogrande score over random tasks from datafile sup‐ plied by the -f flag. --winogrande-tasks N @@ -516,17 +505,17 @@ Default: 0 --multiple-choice - Compute multiple choice score over random tasks from datafile + Compute multiple choice score over random tasks from datafile supplied by the -f flag. --multiple-choice-tasks N - Number of tasks to use when computing the multiple choice + Number of tasks to use when computing the multiple choice score. Default: 0 --kl-divergence - Computes KL-divergence to logits provided via the + Computes KL-divergence to logits provided via the --kl-divergence-base flag. --save-all-logits FNAME, --kl-divergence-base FNAME @@ -547,21 +536,21 @@ The model default is used if unspecified. CLI OPTIONS - The following options may be specified when llamafile is running in + The following options may be specified when llamafile is running in --cli mode. -e, --escape Process prompt escapes sequences (\n, \r, \t, \´, \", \\) -p STRING, --prompt STRING - Prompt to start text generation. Your LLM works by auto-com‐ + Prompt to start text generation. Your LLM works by auto-com‐ pleting this text. For example: llamafile -m model.gguf -p "four score and" - Stands a pretty good chance of printing Lincoln's Gettysburg - Address. Prompts can take on a structured format too. Depend‐ - ing on how your model was trained, it may specify in its docs + Stands a pretty good chance of printing Lincoln's Gettysburg + Address. Prompts can take on a structured format too. Depend‐ + ing on how your model was trained, it may specify in its docs an instruction notation. With some models that might be: llamafile -p "[INST]Summarize this: $(cat file)[/INST]" @@ -579,36 +568,36 @@ root ::= "yes" | "no" - will force the LLM to only output yes or no before exiting. - This is useful for shell scripts when the --no-display-prompt + will force the LLM to only output yes or no before exiting. + This is useful for shell scripts when the --no-display-prompt flag is also supplied. --grammar-file FNAME File to read grammar from. - --fast Put llamafile into fast math mode. This disables algorithms - that reduce floating point rounding, e.g. Kahan summation, and + --fast Put llamafile into fast math mode. This disables algorithms + that reduce floating point rounding, e.g. Kahan summation, and certain functions like expf() will be vectorized but handle un‐ - derflows less gracefully. It's unspecified whether llamafile - runs in fast or precise math mode when neither flag is speci‐ + derflows less gracefully. It's unspecified whether llamafile + runs in fast or precise math mode when neither flag is speci‐ fied. --precise - Put llamafile into precise math mode. This enables algorithms - that reduce floating point rounding, e.g. Kahan summation, and - certain functions like expf() will always handle subnormals - correctly. It's unspecified whether llamafile runs in fast or + Put llamafile into precise math mode. This enables algorithms + that reduce floating point rounding, e.g. Kahan summation, and + certain functions like expf() will always handle subnormals + correctly. It's unspecified whether llamafile runs in fast or precise math mode when neither flag is specified. - --trap Put llamafile into math trapping mode. When floating point ex‐ - ceptions occur, such as NaNs, overflow, and divide by zero, - llamafile will print a warning to the console. 
This warning - will include a C++ backtrace the first time an exception is - trapped. The op graph will also be dumped to a file, and lla‐ - mafile will report the specific op where the exception oc‐ - curred. This is useful for troubleshooting when reporting is‐ - sues. USing this feature will disable sandboxing. Math trap‐ - ping is only possible if your CPU supports it. That is gener‐ + --trap Put llamafile into math trapping mode. When floating point ex‐ + ceptions occur, such as NaNs, overflow, and divide by zero, + llamafile will print a warning to the console. This warning + will include a C++ backtrace the first time an exception is + trapped. The op graph will also be dumped to a file, and lla‐ + mafile will report the specific op where the exception oc‐ + curred. This is useful for troubleshooting when reporting is‐ + sues. USing this feature will disable sandboxing. Math trap‐ + ping is only possible if your CPU supports it. That is gener‐ ally the case on AMD64, however it's less common on ARM64. --prompt-cache FNAME @@ -617,12 +606,12 @@ Default: none -fa FNAME, --flash-attn - Enable Flash Attention. This is a mathematical shortcut that - can speed up inference for certain models. This feature is + Enable Flash Attention. This is a mathematical shortcut that + can speed up inference for certain models. This feature is still under active development. --prompt-cache-all - If specified, saves user input and generations to cache as + If specified, saves user input and generations to cache as well. Not supported with --interactive or other interactive op‐ tions. @@ -634,47 +623,47 @@ --image IMAGE_FILE Path to an image file. This should be used with multimodal mod‐ - els. Alternatively, it's possible to embed an image directly - into the prompt instead; in which case, it must be base64 en‐ - coded into an HTML img tag URL with the image/jpeg MIME type. + els. Alternatively, it's possible to embed an image directly + into the prompt instead; in which case, it must be base64 en‐ + coded into an HTML img tag URL with the image/jpeg MIME type. See also the --mmproj flag for supplying the vision model. -i, --interactive - Run the program in interactive mode, allowing users to engage - in real-time conversations or provide specific instructions to + Run the program in interactive mode, allowing users to engage + in real-time conversations or provide specific instructions to the model. --interactive-first - Run the program in interactive mode and immediately wait for + Run the program in interactive mode and immediately wait for user input before starting the text generation. -ins, --instruct - Run the program in instruction mode, which is specifically de‐ - signed to work with Alpaca models that excel in completing + Run the program in instruction mode, which is specifically de‐ + signed to work with Alpaca models that excel in completing tasks based on user instructions. Technical details: The user's input is internally prefixed with - the reverse prompt (or "### Instruction:" as the default), and - followed by "### Response:" (except if you just press Return + the reverse prompt (or "### Instruction:" as the default), and + followed by "### Response:" (except if you just press Return without any input, to keep generating a longer response). 
- By understanding and utilizing these interaction options, you + By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA mod‐ - els, tailoring the text generation process to your specific + els, tailoring the text generation process to your specific needs. -r PROMPT, --reverse-prompt PROMPT - Specify one or multiple reverse prompts to pause text genera‐ - tion and switch to interactive mode. For example, -r "User:" - can be used to jump back into the conversation whenever it's - the user's turn to speak. This helps create a more interactive - and conversational experience. However, the reverse prompt - doesn't work when it ends with a space. To overcome this limi‐ - tation, you can use the --in-prefix flag to add a space or any + Specify one or multiple reverse prompts to pause text genera‐ + tion and switch to interactive mode. For example, -r "User:" + can be used to jump back into the conversation whenever it's + the user's turn to speak. This helps create a more interactive + and conversational experience. However, the reverse prompt + doesn't work when it ends with a space. To overcome this limi‐ + tation, you can use the --in-prefix flag to add a space or any other characters after the reverse prompt. --color - Enable colorized output to differentiate visually distinguish‐ + Enable colorized output to differentiate visually distinguish‐ ing between prompts, user input, and generated text. --no-display-prompt, --silent-prompt @@ -689,23 +678,23 @@ in '\'. --cont-batching - Enables continuous batching, a.k.a. dynamic batching. is -1 + Enables continuous batching, a.k.a. dynamic batching. is -1 which means all tokens. --embedding - In CLI mode, the embedding flag may be use to print embeddings - to standard output. By default, embeddings are computed over a - whole prompt. However the --multiline flag may be passed, to + In CLI mode, the embedding flag may be use to print embeddings + to standard output. By default, embeddings are computed over a + whole prompt. However the --multiline flag may be passed, to have a separate embeddings array computed for each line of text - in the prompt. In multiline mode, each embedding array will be - printed on its own line to standard output, where individual - floats are separated by space. If both the --multiline-input - and --interactive flags are passed, then a pretty-printed sum‐ - mary of embeddings along with a cosine similarity matrix will + in the prompt. In multiline mode, each embedding array will be + printed on its own line to standard output, where individual + floats are separated by space. If both the --multiline-input + and --interactive flags are passed, then a pretty-printed sum‐ + mary of embeddings along with a cosine similarity matrix will be printed to the terminal. SERVER OPTIONS - The following options may be specified when llamafile is running in + The following options may be specified when llamafile is running in --server mode. --port PORT @@ -734,11 +723,11 @@ Default: disabled -spf FNAME, --system-prompt-file FNAME - Set a file to load a system prompt (initial prompt of all + Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications. -a ALIAS, --alias ALIAS - Set an alias for the model. This will be added as the model + Set an alias for the model. This will be added as the model field in completion responses. --path PUBLIC_PATH @@ -750,13 +739,13 @@ Do not attempt to open a web browser tab at startup. 
-gan N, --grp-attn-n N - Set the group attention factor to extend context size through - self-extend. The default value is 1 which means disabled. This + Set the group attention factor to extend context size through + self-extend. The default value is 1 which means disabled. This flag is used together with --grp-attn-w. -gaw N, --grp-attn-w N - Set the group attention width to extend context size through - self-extend. The default value is 512. This flag is used to‐ + Set the group attention width to extend context size through + self-extend. The default value is 512. This flag is used to‐ gether with --grp-attn-n. LOG OPTIONS @@ -778,15 +767,15 @@ Specify a log filename (without extension) --log-new - Create a separate new log file on start. Each log file will + Create a separate new log file on start. Each log file will have unique name: ..log --log-append Don't truncate the old log file. EXAMPLES - Here's an example of how to run llama.cpp's built-in HTTP server. This - example uses LLaVA v1.5-7B, a multimodal LLM that works with + Here's an example of how to run llama.cpp's built-in HTTP server. This + example uses LLaVA v1.5-7B, a multimodal LLM that works with llama.cpp's recently-added support for image inputs. llamafile \ @@ -802,14 +791,14 @@ -m wizardcoder-python-13b-v1.0.Q8_0.gguf --temp 0 -r '}\n' -r '```\n' \ -e -p '```c\nvoid *memcpy(void *dst, const void *src, size_t size) {\n' - Here's a similar example that instead utilizes Mistral-7B-Instruct + Here's a similar example that instead utilizes Mistral-7B-Instruct weights for prose composition: llamafile \ -m mistral-7b-instruct-v0.2.Q5_K_M.gguf \ -p '[INST]Write a story about llamas[/INST]' - Here's an example of how llamafile can be used as an interactive chat‐ + Here's an example of how llamafile can be used as an interactive chat‐ bot that lets you query knowledge contained in training data: llamafile -m llama-65b-Q5_K.gguf -p ' @@ -849,7 +838,7 @@ -e -p '### User: What do you see?\n### Assistant: ' \ --no-display-prompt 2>/dev/null - If you wanted to write a script to rename all your image files, you + If you wanted to write a script to rename all your image files, you could use the following command to generate a safe filename: llamafile --temp 0 \ @@ -862,8 +851,8 @@ sed -e's/ /_/g' -e's/$/.jpg/' three_baby_lemurs_on_the_back_of_an_adult_lemur.jpg - Here's an example of how to make an API request to the OpenAI API com‐ - patible completions endpoint when your llamafile is running in the + Here's an example of how to make an API request to the OpenAI API com‐ + patible completions endpoint when your llamafile is running in the background in --server mode. curl -s http://localhost:8080/v1/chat/completions \ @@ -888,12 +877,12 @@ PROTIP The -ngl 35 flag needs to be passed in order to use GPUs made by NVIDIA - and AMD. It's not enabled by default since it sometimes needs to be - tuned based on the system hardware and model architecture, in order to + and AMD. It's not enabled by default since it sometimes needs to be + tuned based on the system hardware and model architecture, in order to achieve optimal performance, and avoid compromising a shared display. 
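Because the default mode now keeps a --server running behind the chat interface, the OpenAI-compatible endpoint shown above can also be queried while a chat session is open. The following is a sketch, assuming the port used in the examples above (8080), a placeholder model filename, and a conventional chat-completions request body.

      # terminal 1: default mode (chat in the foreground, server in the background)
      llamafile -ngl 35 -m model.gguf

      # terminal 2: query the background server over HTTP
      curl -s http://localhost:8080/v1/chat/completions \
           -H "Content-Type: application/json" \
           -d '{
                 "model": "gpt-3.5-turbo",
                 "messages": [{"role": "user", "content": "Say hello."}]
               }'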
SEE ALSO - llamafile-quantize(1), llamafile-perplexity(1), llava-quantize(1), + llamafile-quantize(1), llamafile-perplexity(1), llava-quantize(1), zipalign(1), unzip(1) -Mozilla Ocho January 1, 2024 LLAMAFILE(1) +Mozilla Ocho October 12, 2024 LLAMAFILE(1) diff --git a/llama.cpp/main/main.cpp b/llama.cpp/main/main.cpp index abdf66e53c..85cb95fd9c 100644 --- a/llama.cpp/main/main.cpp +++ b/llama.cpp/main/main.cpp @@ -194,15 +194,14 @@ int main(int argc, char ** argv) { enum Program prog = determine_program(argv); - if (prog == SERVER || + if (prog == SERVER) + return server_cli(argc, argv); + + if (prog == CHATBOT || (prog == UNKNOWN && !llamafile_has(argv, "-p") && !llamafile_has(argv, "-f") && !llamafile_has(argv, "--random-prompt"))) { - return server_cli(argc, argv); - } - - if (prog == CHATBOT) { int chatbot_main(int, char **); return chatbot_main(argc, argv); } @@ -236,7 +235,7 @@ int main(int argc, char ** argv) { return 1; } - if (!FLAG_unsecure && !llamafile_has_gpu()) { + if (!FLAG_unsecure && !llamafile_has_gpu() && !g_server_background_mode) { // Enable pledge() security on Linux and OpenBSD. // - We do this *after* opening the log file for writing. // - We do this *before* loading any weights or graphdefs. diff --git a/llama.cpp/server/server.cpp b/llama.cpp/server/server.cpp index c94eb15888..da758c81de 100644 --- a/llama.cpp/server/server.cpp +++ b/llama.cpp/server/server.cpp @@ -29,6 +29,10 @@ double g_prompt_per_second_jart; +bool g_server_background_mode; +llama_model *g_server_force_llama_model; +void (*g_server_on_listening)(const char *host, int port); + using json = nlohmann::json; struct server_params @@ -443,9 +447,16 @@ struct llama_server_context } } - llama_init_result llama_init = llama_init_from_gpt_params(params); - model = llama_init.model; - ctx = llama_init.context; + if (!g_server_force_llama_model) { + llama_init_result llama_init = llama_init_from_gpt_params(params); + model = llama_init.model; + ctx = llama_init.context; + } else { + llama_context_params ctx_params = llama_context_params_from_gpt_params(params); + model = g_server_force_llama_model; + ctx = llama_new_context_with_model(model, ctx_params); + } + if (model == nullptr) { LOG_ERROR("unable to load model", {{"model", params.model}}); @@ -3129,13 +3140,16 @@ int server_cli(int argc, char **argv) } // launch browser tab - if (!sparams.nobrowser) { + if (!sparams.nobrowser && !g_server_background_mode) { char url[128]; snprintf(url, sizeof(url), "http://%s:%d/", connect_host, sparams.port); llamafile_launch_browser(url); } + if (g_server_on_listening) { + g_server_on_listening(connect_host, sparams.port); + } - if (!FLAG_unsecure) { + if (!FLAG_unsecure && !g_server_background_mode) { if (IsXnu()) { // Cosmopolitan libc explicitly does not support cosmo_dlopen on x64 // macOS and mac_sandbox_init depends on cosmo_dlopen. 
We'll attempt @@ -3685,6 +3699,7 @@ int server_cli(int argc, char **argv) llama.queue_tasks.terminate(); }; + if (!g_server_background_mode) { #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; sigint_action.sa_handler = signal_handler; @@ -3697,6 +3712,8 @@ int server_cli(int argc, char **argv) }; SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif + } + llama.queue_tasks.start_loop(); svr.stop(); t.join(); diff --git a/llama.cpp/server/server.h b/llama.cpp/server/server.h index f90fa4e2ec..572e4f1e8b 100644 --- a/llama.cpp/server/server.h +++ b/llama.cpp/server/server.h @@ -1,2 +1,9 @@ #pragma once +#include "llama.cpp/llama.h" + +extern bool server_log_json; +extern bool g_server_background_mode; +extern llama_model *g_server_force_llama_model; +extern void (*g_server_on_listening)(const char *host, int port); + int server_cli(int, char **); diff --git a/llama.cpp/server/utils.h b/llama.cpp/server/utils.h index ad169d4ff3..cde8548b28 100644 --- a/llama.cpp/server/utils.h +++ b/llama.cpp/server/utils.h @@ -138,6 +138,9 @@ struct completion_token_output static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) { + if (FLAG_log_disable) // [jart] + return; + std::stringstream ss_tid; ss_tid << std::this_thread::get_id(); json log = nlohmann::ordered_json{ diff --git a/llamafile/chatbot.cpp b/llamafile/chatbot.cpp index 54d88e92fe..eecfe7d155 100644 --- a/llamafile/chatbot.cpp +++ b/llamafile/chatbot.cpp @@ -27,6 +27,7 @@ #include "llama.cpp/common.h" #include "llama.cpp/llama.h" +#include "llama.cpp/server/server.h" #include "llamafile/bestline.h" #include "llamafile/highlight.h" #include "llamafile/llamafile.h" @@ -43,9 +44,17 @@ #define BRIGHT_GREEN "\e[92m" #define CLEAR_FORWARD "\e[K" +struct ServerArgs { + int argc; + char **argv; +}; + static int n_past; static llama_model *g_model; static llama_context *g_ctx; +static pthread_cond_t g_cond = PTHREAD_COND_INITIALIZER; +static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER; +static std::string g_listen_url; static volatile sig_atomic_t g_got_sigint; static void on_sigint(int sig) { @@ -195,19 +204,25 @@ static void eval_string(const std::string &str, int n_batch, bool add_special, b eval_tokens(llama_tokenize(g_ctx, str, add_special, parse_special), n_batch); } +static void on_server_listening(const char *host, int port) { + pthread_mutex_lock(&g_lock); + g_listen_url = format("http://%s:%d/", host, port); + pthread_cond_signal(&g_cond); + pthread_mutex_unlock(&g_lock); +} + +static void *server_thread(void *arg) { + ServerArgs *sargs = (ServerArgs *)arg; + server_log_json = false; + g_server_background_mode = true; + g_server_force_llama_model = g_model; + g_server_on_listening = on_server_listening; + exit(server_cli(sargs->argc, sargs->argv)); +} + int chatbot_main(int argc, char **argv) { - llamafile_check_cpu(); - ShowCrashReports(); log_disable(); - gpt_params params; - params.n_batch = 512; // for better progress indication - params.sparams.temp = 0; // don't believe in randomness by default - if (!gpt_params_parse(argc, argv, params)) { - fprintf(stderr, "error: failed to parse flags\n"); - exit(1); - } - print_logo(u"\n\ ██╗ ██╗ █████╗ ███╗ ███╗ █████╗ ███████╗██╗██╗ ███████╗\n\ ██║ ██║ ██╔══██╗████╗ ████║██╔══██╗██╔════╝██║██║ ██╔════╝\n\ @@ -216,15 +231,18 @@ int chatbot_main(int argc, char **argv) { ███████╗███████╗██║ ██║██║ ╚═╝ ██║██║ ██║██║ ██║███████╗███████╗\n\ 
╚══════╝╚══════╝╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝╚══════╝╚══════╝\n"); - printf(BOLD "software" UNBOLD ": llamafile " LLAMAFILE_VERSION_STRING "\n" // - BOLD "model" UNBOLD ": %s\n\n", - basename(params.model).c_str()); - - print_ephemeral("initializing backend..."); + print_ephemeral("loading backend..."); llama_backend_init(); + gpt_params params; + params.n_batch = 512; // for better progress indication + params.sparams.temp = 0; // don't believe in randomness by default + if (!gpt_params_parse(argc, argv, params)) { // also loads gpu module + fprintf(stderr, "error: failed to parse flags\n"); + exit(1); + } clear_ephemeral(); - print_ephemeral("initializing model..."); + print_ephemeral("loading model..."); llama_model_params model_params = llama_model_default_params(); model_params.n_gpu_layers = llamafile_gpu_layers(35); g_model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -233,12 +251,35 @@ int chatbot_main(int argc, char **argv) { fprintf(stderr, "%s: failed to load model\n", params.model.c_str()); exit(2); } - if (!params.n_ctx) + if (params.n_ctx <= 0 || params.n_ctx > llama_n_ctx_train(g_model)) params.n_ctx = llama_n_ctx_train(g_model); if (params.n_ctx < params.n_batch) params.n_batch = params.n_ctx; clear_ephemeral(); + bool want_server = !llamafile_has(argv, "--chat"); + if (want_server) { + print_ephemeral("launching server..."); + pthread_t thread; + pthread_attr_t attr; + ServerArgs sargs = {argc, argv}; + pthread_mutex_lock(&g_lock); + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + pthread_create(&thread, &attr, server_thread, &sargs); + pthread_attr_destroy(&attr); + pthread_cond_wait(&g_cond, &g_lock); + pthread_mutex_unlock(&g_lock); + clear_ephemeral(); + } + + printf(BOLD "software" UNBOLD ": llamafile " LLAMAFILE_VERSION_STRING "\n" // + BOLD "model" UNBOLD ": %s\n", + basename(params.model).c_str()); + if (want_server) + printf(BOLD "server" UNBOLD ": %s\n", g_listen_url.c_str()); + printf("\n"); + print_ephemeral("initializing context..."); llama_context_params ctx_params = llama_context_params_from_gpt_params(params); g_ctx = llama_new_context_with_model(g_model, ctx_params); @@ -250,9 +291,9 @@ int chatbot_main(int argc, char **argv) { clear_ephemeral(); if (params.prompt.empty()) - params.prompt = - "A chat between a curious human and an artificial intelligence assistant. The " - "assistant gives helpful, detailed, and polite answers to the human's questions."; + params.prompt = "A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the " + "human's questions."; bool add_bos = llama_should_add_bos_token(llama_get_model(g_ctx)); std::vector chat = {{"system", params.prompt}};
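The default system prompt in the hunk above is only used when no -p flag was given. As a usage sketch tying this back to the documented options (the model filename is a placeholder), the system prompt shown at the start of a chat and the conversation's hard context limit can both be set on the command line:

      # replace the default system prompt and cap the context at 4096 tokens
      llamafile -m model.gguf \
          -p 'You are a terse assistant who answers in one sentence.' \
          -c 4096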