Fix OpenAI server sampling w.r.t. temp and seed
The default values for tfs_z and typical_p were being set to zero, which
caused the token candidates array to shrink down to a single element, thus
preventing any sampling. Note this only applies to OpenAI-API-compatible
HTTP server requests.
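
For context, the collapse happens because a cutoff of zero is satisfied
immediately. Below is a minimal, illustrative sketch of a probability-mass
truncation filter in the spirit of typical_p/tfs_z; it is not the actual
llama.cpp sampler, but it shows why a cutoff of 0 leaves exactly one
candidate while a cutoff of 1 keeps them all.

// Illustrative sketch only -- not the llama.cpp sampler implementation.
// Candidates are assumed to be sorted by probability, highest first.
#include <cstdio>
#include <vector>

struct Candidate { int token; float prob; };

// Keep the smallest prefix of candidates whose cumulative probability
// reaches the cutoff `p`. With p == 0 the condition fires on the very
// first token, so only one candidate survives.
static std::vector<Candidate> truncate_by_mass(const std::vector<Candidate> &cands, float p) {
    std::vector<Candidate> kept;
    float cum = 0.0f;
    for (const Candidate &c : cands) {
        kept.push_back(c);
        cum += c.prob;
        if (cum >= p) break;
    }
    return kept;
}

int main() {
    std::vector<Candidate> cands = {{7, 0.5f}, {3, 0.3f}, {9, 0.2f}};
    printf("cutoff 0.0 keeps %zu candidate(s)\n", truncate_by_mass(cands, 0.0f).size()); // 1
    printf("cutoff 1.0 keeps %zu candidate(s)\n", truncate_by_mass(cands, 1.0f).size()); // 3
    return 0;
}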

The solution is to use the default values that OpenAI documents, as well
as ensuring we use the llama.cpp defaults for the rest. I've tested that
this change still ensures deterministic output by default. If a
"temperature" greater than 0 is explicitly passed, then output is unique
each time. If "seed" is specified in addition to "temperature", then the
output becomes deterministic once more.

Fixes #117
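
As an illustration of the behavior described above, here is a sketch of the
request bodies an OpenAI-compatible client might POST to the server's
/v1/chat/completions endpoint, built with nlohmann::json (the same JSON
library server.cpp uses). The model name and message content are
placeholders:

// Sketch only: request bodies a client might POST to /v1/chat/completions.
#include <iostream>
#include <nlohmann/json.hpp>
using json = nlohmann::json;

int main() {
    // No "temperature": the server now falls back to 0.0, so repeated
    // requests yield the same (deterministic) completion.
    json deterministic = {
        {"model", "local-model"},  // placeholder name
        {"messages", json::array({ json{{"role", "user"}, {"content", "Say hello"}} })}
    };

    // "temperature" > 0 without "seed": output differs on every call,
    // because the RNG default seed now comes from _rand64() rather than time().
    json varied = deterministic;
    varied["temperature"] = 0.7;

    // "temperature" > 0 plus an explicit "seed": sampling is enabled but the
    // random stream is pinned, so output is reproducible again.
    json reproducible = varied;
    reproducible["seed"] = 42;

    std::cout << reproducible.dump(2) << std::endl;
    return 0;
}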
jart committed Dec 28, 2023
1 parent 8bea4e1 commit 9e4bf29
Showing 3 changed files with 22 additions and 13 deletions.
2 changes: 2 additions & 0 deletions llama.cpp/README.llamafile
@@ -19,8 +19,10 @@ LOCAL MODIFICATIONS
 - Add support to main() programs for Cosmo /zip/.args files
 - Introduce pledge() SECCOMP sandboxing to improve security
 - Use condition variables for server rather than busy loop
+- Fix OpenAI server sampling w.r.t. temperature and seed
 - Remove log callback pointer API from Metal GPU module
 - Write log to /dev/null when main.log fails to open
+- Use _rand64() rather than time() as default seed
 - Avoid bind() conflicts on port 8080 w/ server
 - Allow --grammar to be used on --image prompts
 - Use runtime dispatching for matmul quants
2 changes: 1 addition & 1 deletion llama.cpp/llama.cpp
@@ -7217,7 +7217,7 @@ struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar)
 
 void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
     if (seed == LLAMA_DEFAULT_SEED) {
-        seed = time(NULL);
+        seed = _rand64();
     }
     ctx->rng.seed(seed);
 }
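
The llama.cpp hunk above replaces time(NULL) with Cosmopolitan's _rand64()
as the fallback when no seed is supplied. A time-based default hands every
request that arrives within the same second the identical seed; an
entropy-backed source does not. The sketch below is a portable
approximation of that idea, not the _rand64() implementation itself:

// Portable sketch of the idea behind the change -- _rand64() itself is a
// Cosmopolitan libc function and is not reproduced here.
#include <cstdint>
#include <cstdio>
#include <ctime>
#include <random>

// Old behavior: two calls within the same second return the same value.
static uint32_t seed_from_time() {
    return static_cast<uint32_t>(time(nullptr));
}

// New behavior (approximated): each call draws fresh entropy, so
// back-to-back requests still get distinct seeds.
static uint32_t seed_from_entropy() {
    std::random_device rd;
    return rd();
}

int main() {
    printf("time-based   : %u %u\n", (unsigned)seed_from_time(), (unsigned)seed_from_time());       // usually identical
    printf("entropy-based: %u %u\n", (unsigned)seed_from_entropy(), (unsigned)seed_from_entropy()); // almost surely differ
    return 0;
}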
31 changes: 19 additions & 12 deletions llama.cpp/server/server.cpp
@@ -443,7 +443,6 @@ struct llama_client_slot
         }
 
         images.clear();
-        // llama_set_rng_seed(ctx, params.seed); in batched the seed matter???????
     }
 
     bool has_budget(gpt_params &global_params) {
@@ -893,6 +892,7 @@ struct llama_server_context
             llama_sampling_free(slot->ctx_sampling);
         }
         slot->ctx_sampling = llama_sampling_init(slot->sparams);
+        llama_set_rng_seed(ctx, slot->params.seed);
         slot->command = LOAD_PROMPT;
 
         all_slots_are_idle = false;
@@ -1183,7 +1183,7 @@ struct llama_server_context
             {"n_ctx", slot.n_ctx},
             {"model", params.model_alias},
             {"seed", slot.params.seed},
-            {"temp", slot.sparams.temp},
+            {"temperature", slot.sparams.temp},
             {"top_k", slot.sparams.top_k},
             {"top_p", slot.sparams.top_p},
             {"min_p", slot.sparams.min_p},
@@ -2388,26 +2388,33 @@ json oaicompat_completion_params_parse(
     llama_params["__oaicompat"] = true;
 
     // Map OpenAI parameters to llama.cpp parameters
+    //
+    // For parameters that are defined by the OpenAI documentation (e.g.
+    // temperature), we explicitly specify OpenAI's intended default; we
+    // need to do that because sometimes OpenAI disagrees with llama.cpp
+    //
+    // https://platform.openai.com/docs/api-reference/chat/create
+    llama_sampling_params default_sparams;
     llama_params["model"] = json_value(body, "model", std::string("uknown"));
     llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
     llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
-    llama_params["temperature"] = json_value(body, "temperature", 0.8);
-    llama_params["top_k"] = json_value(body, "top_k", 40);
-    llama_params["top_p"] = json_value(body, "top_p", 0.95);
+    llama_params["temperature"] = json_value(body, "temperature", 0.0);
+    llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
+    llama_params["top_p"] = json_value(body, "top_p", 1.0);
     llama_params["n_predict"] = json_value(body, "max_tokens", -1);
     llama_params["logit_bias"] = json_value(body, "logit_bias",json::object());
     llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
     llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
-    llama_params["seed"] = json_value(body, "seed", 0);
+    llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
     llama_params["stream"] = json_value(body, "stream", false);
-    llama_params["mirostat"] = json_value(body, "mirostat", false);
-    llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", 0.0);
-    llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", 0.0);
-    llama_params["penalize_nl"] = json_value(body, "penalize_nl", false);
-    llama_params["typical_p"] = json_value(body, "typical_p", 0.0);
+    llama_params["mirostat"] = json_value(body, "mirostat", default_sparams.mirostat);
+    llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
+    llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
+    llama_params["penalize_nl"] = json_value(body, "penalize_nl", default_sparams.penalize_nl);
+    llama_params["typical_p"] = json_value(body, "typical_p", default_sparams.typical_p);
     llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", 0);
     llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
-    llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0);
+    llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z);
 
     if (llama_params.count("grammar") != 0) {
         llama_params["grammar"] = json_value(body, "grammar", json::object());
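
A note on the helper used throughout the hunk above: json_value(body, key, default),
defined elsewhere in server.cpp, roughly returns the client-supplied field
when it is present and the given default otherwise. A simplified sketch of
that behavior (not the exact implementation, which is more defensive about
type mismatches):

// Simplified sketch of json_value's behavior.
#include <string>
#include <nlohmann/json.hpp>
using json = nlohmann::json;

template <typename T>
static T json_value_sketch(const json &body, const std::string &key, const T &default_value) {
    // Prefer the client-supplied field if it is present and non-null.
    if (body.contains(key) && !body.at(key).is_null()) {
        return body.at(key).get<T>();
    }
    return default_value;
}

Under these defaults, a request that omits "temperature" now resolves to 0.0
(deterministic output), and an omitted "seed" resolves to LLAMA_DEFAULT_SEED,
which llama_set_rng_seed then replaces with a fresh _rand64() value.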
