
Commit

Merge pull request #20 from jmont-dev/manual_requests
Add support for manual requests for generate, chat, and embedding endpoints.
jmont-dev authored Aug 13, 2024
2 parents fdd114f + 3cc83e9 commit 8977537
Showing 4 changed files with 161 additions and 17 deletions.
16 changes: 15 additions & 1 deletion README.md
@@ -56,6 +56,7 @@ The test cases do a good job of providing discrete examples for each of the API
- [Chat with Images](#chat-with-images)
- [Embedding Generation](#embedding-generation)
- [Debug Information](#debug-information)
- [Manual Requests](#manual-requests)
- [Single-header vs Separate Headers](#single-header-vs-separate-headers)
- [About this software:](#about-this-software)
- [License](#license)
@@ -389,7 +390,20 @@ Debug logging for requests and replies to the server can easily be turned on and
ollama::show_requests(true);
ollama::show_replies(true);
```


### Manual Requests
For those looking for greater control of the requests sent to the ollama server, manual requests can be created through the `ollama::request` class. This class extends `nlohmann::json` and can be treated as a standard JSON object.

```C++
ollama::request request(ollama::message_type::generation);
request["model"]="mistral";
request["prompt"]="Why is the sky blue?";
request["stream"] = false;
request["system"] = "Talk like a pirate for the next reply."
std::cout << ollama::generate(request) << std::endl;
```
This provides the greatest degree of customization over the request. Users should take care to provide valid fields; otherwise, an exception will likely be thrown when the response is received. Manual requests can be made for the generate, chat, and embedding endpoints, as shown below.
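
The chat and embedding endpoints accept manual requests in the same way. The sketch below mirrors the `Manual Requests` test case added in `test/test.cpp` and reuses the `mistral` model from the example above; the streaming variant at the end assumes a callback of the form taken by the streaming `generate` overload, and printing each partial response relies on the same stream-to-`std::cout` behavior shown in the generation example.

```C++
// Manual chat request: messages are supplied as a JSON array of role/content pairs.
ollama::request chat_request(ollama::message_type::chat);
chat_request["model"] = "mistral";
ollama::messages messages = { ollama::message("user", "Why is the sky blue?") };
chat_request["messages"] = messages.to_json();
chat_request["stream"] = false;
std::cout << ollama::chat(chat_request) << std::endl;

// Manual embedding request: the embedding endpoint takes "input" rather than "prompt".
ollama::request embedding_request(ollama::message_type::embedding);
embedding_request["model"] = "mistral";
embedding_request["input"] = "Why is the sky blue?";
ollama::response embeddings = ollama::generate_embeddings(embedding_request);

// Manual streaming request: set "stream" to true and pass a token callback.
ollama::request stream_request(ollama::message_type::generation);
stream_request["model"] = "mistral";
stream_request["prompt"] = "Why is the sky blue?";
stream_request["stream"] = true;
auto on_token = [](const ollama::response& token) { std::cout << token << std::flush; };
ollama::generate(stream_request, on_token);
```

As with the generation example, any field accepted by the corresponding Ollama endpoint can be set directly on the request object, since it behaves as a standard JSON object.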
## Single-header vs Separate Headers
For convenience, ollama-hpp includes a single-header version of the library in `singleheader/ollama.hpp`, which bundles the core ollama.hpp code with single-header versions of nlohmann json, httplib, and base64.h. Each of these libraries is available under the MIT license, and their respective licenses are included.
The single-header include can be regenerated from these standalone files by running `./make_single_header.sh`.
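
As a minimal illustration of the two options (paths shown relative to the repository root; a real build would normally add the chosen directory to the include path), including either header exposes the same `ollama` namespace:

```C++
// Separate headers: requires nlohmann json, httplib, and base64.h to be available alongside ollama.hpp.
#include "include/ollama.hpp"

// Single header: bundles those dependencies, so this one include is sufficient.
// #include "singleheader/ollama.hpp"

#include <iostream>

int main() {
    std::cout << ollama::generate("mistral", "Why is the sky blue?") << std::endl;
    return 0;
}
```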
66 changes: 58 additions & 8 deletions include/ollama.hpp
@@ -377,12 +377,16 @@ class Ollama
Ollama(): Ollama("http://localhost:11434") {}
~Ollama() { delete this->cli; }

// Generate a non-streaming reply as a string.
ollama::response generate(const std::string& model,const std::string& prompt, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{
ollama::request request(model, prompt, options, false, images);
return generate(request);
}

// Generate a non-streaming reply as a string.
ollama::response generate(const ollama::request& request)
{
ollama::response response;
ollama::request request(model, prompt, options, false, images);

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -400,14 +404,19 @@ class Ollama
if (ollama::use_exceptions) throw ollama::exception("No response returned from server "+this->server_url+". Error was: "+httplib::to_string( res.error() ));
}

return response;
return response;
}

// Generate a streaming reply where a user-defined callback function is invoked when each token is received.
bool generate(const std::string& model,const std::string& prompt, std::function<void(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{

ollama::request request(model, prompt, options, true, images);
return generate(request, on_receive_token);
}


// Generate a streaming reply where a user-defined callback function is invoked when each token is received.
bool generate(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
{

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -428,12 +437,17 @@ class Ollama
return false;
}

// Generate a non-streaming reply as a string.
ollama::response chat(const std::string& model, const ollama::messages& messages, json options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
ollama::request request(model, messages, options, false, format, keep_alive_duration);
return chat(request);
}


// Generate a non-streaming reply as a string.
ollama::response chat(ollama::request& request)
{
ollama::response response;
ollama::request request(model, messages, options, false, format, keep_alive_duration);

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -456,9 +470,14 @@

bool chat(const std::string& model, const ollama::messages& messages, std::function<void(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
ollama::request request(model, messages, options, true, format, keep_alive_duration);
return chat(request, on_receive_token);
}


bool chat(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
{
ollama::response response;
ollama::request request(model, messages, options, true, format, keep_alive_duration);

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -719,6 +738,12 @@ class Ollama
ollama::response generate_embeddings(const std::string& model, const std::string& input, const json& options=nullptr, bool truncate = true, const std::string& keep_alive_duration="5m")
{
ollama::request request = ollama::request::from_embedding(model, input, options, truncate, keep_alive_duration);
return generate_embeddings(request);
}


ollama::response generate_embeddings(ollama::request& request)
{
ollama::response response;

std::string request_string = request.dump();
@@ -806,21 +831,41 @@ namespace ollama
return ollama.generate(model, prompt, options, images);
}

inline ollama::response generate(const ollama::request& request)
{
return ollama.generate(request);
}

inline bool generate(const std::string& model,const std::string& prompt, std::function<void(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{
return ollama.generate(model, prompt, on_receive_response, options, images);
}

inline bool generate(ollama::request& request, std::function<void(const ollama::response&)> on_receive_response)
{
return ollama.generate(request, on_receive_response);
}

inline ollama::response chat(const std::string& model, const ollama::messages& messages, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
return ollama.chat(model, messages, options, format, keep_alive_duration);
}

inline ollama::response chat(ollama::request& request)
{
return ollama.chat(request);
}

inline bool chat(const std::string& model, const ollama::messages& messages, std::function<void(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
return ollama.chat(model, messages, on_receive_response, options, format, keep_alive_duration);
}

inline bool chat(ollama::request& request, std::function<void(const ollama::response&)> on_receive_response)
{
return ollama.chat(request, on_receive_response);
}

inline bool create(const std::string& modelName, const std::string& modelFile, bool loadFromFile=true)
{
return ollama.create_model(modelName, modelFile, loadFromFile);
@@ -891,6 +936,11 @@ namespace ollama
return ollama.generate_embeddings(model, input, options, truncate, keep_alive_duration);
}

inline ollama::response generate_embeddings(ollama::request& request)
{
return ollama.generate_embeddings(request);
}

inline void setReadTimeout(const int& seconds)
{
ollama.setReadTimeout(seconds);
66 changes: 58 additions & 8 deletions singleheader/ollama.hpp
@@ -35167,12 +35167,16 @@ class Ollama
Ollama(): Ollama("http://localhost:11434") {}
~Ollama() { delete this->cli; }

// Generate a non-streaming reply as a string.
ollama::response generate(const std::string& model,const std::string& prompt, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{
ollama::request request(model, prompt, options, false, images);
return generate(request);
}

// Generate a non-streaming reply as a string.
ollama::response generate(const ollama::request& request)
{
ollama::response response;
ollama::request request(model, prompt, options, false, images);

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -35190,14 +35194,19 @@ class Ollama
if (ollama::use_exceptions) throw ollama::exception("No response returned from server "+this->server_url+". Error was: "+httplib::to_string( res.error() ));
}

return response;
return response;
}

// Generate a streaming reply where a user-defined callback function is invoked when each token is received.
bool generate(const std::string& model,const std::string& prompt, std::function<void(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{

ollama::request request(model, prompt, options, true, images);
return generate(request, on_receive_token);
}


// Generate a streaming reply where a user-defined callback function is invoked when each token is received.
bool generate(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
{

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -35218,12 +35227,17 @@ class Ollama
return false;
}

// Generate a non-streaming reply as a string.
ollama::response chat(const std::string& model, const ollama::messages& messages, json options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
ollama::request request(model, messages, options, false, format, keep_alive_duration);
return chat(request);
}


// Generate a non-streaming reply as a string.
ollama::response chat(ollama::request& request)
{
ollama::response response;
ollama::request request(model, messages, options, false, format, keep_alive_duration);

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -35246,9 +35260,14 @@

bool chat(const std::string& model, const ollama::messages& messages, std::function<void(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
ollama::request request(model, messages, options, true, format, keep_alive_duration);
return chat(request, on_receive_token);
}


bool chat(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
{
ollama::response response;
ollama::request request(model, messages, options, true, format, keep_alive_duration);

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -35509,6 +35528,12 @@ class Ollama
ollama::response generate_embeddings(const std::string& model, const std::string& input, const json& options=nullptr, bool truncate = true, const std::string& keep_alive_duration="5m")
{
ollama::request request = ollama::request::from_embedding(model, input, options, truncate, keep_alive_duration);
return generate_embeddings(request);
}


ollama::response generate_embeddings(ollama::request& request)
{
ollama::response response;

std::string request_string = request.dump();
@@ -35596,21 +35621,41 @@ namespace ollama
return ollama.generate(model, prompt, options, images);
}

inline ollama::response generate(const ollama::request& request)
{
return ollama.generate(request);
}

inline bool generate(const std::string& model,const std::string& prompt, std::function<void(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{
return ollama.generate(model, prompt, on_receive_response, options, images);
}

inline bool generate(ollama::request& request, std::function<void(const ollama::response&)> on_receive_response)
{
return ollama.generate(request, on_receive_response);
}

inline ollama::response chat(const std::string& model, const ollama::messages& messages, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
return ollama.chat(model, messages, options, format, keep_alive_duration);
}

inline ollama::response chat(ollama::request& request)
{
return ollama.chat(request);
}

inline bool chat(const std::string& model, const ollama::messages& messages, std::function<void(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
return ollama.chat(model, messages, on_receive_response, options, format, keep_alive_duration);
}

inline bool chat(ollama::request& request, std::function<void(const ollama::response&)> on_receive_response)
{
return ollama.chat(request, on_receive_response);
}

inline bool create(const std::string& modelName, const std::string& modelFile, bool loadFromFile=true)
{
return ollama.create_model(modelName, modelFile, loadFromFile);
@@ -35681,6 +35726,11 @@ namespace ollama
return ollama.generate_embeddings(model, input, options, truncate, keep_alive_duration);
}

inline ollama::response generate_embeddings(ollama::request& request)
{
return ollama.generate_embeddings(request);
}

inline void setReadTimeout(const int& seconds)
{
ollama.setReadTimeout(seconds);
30 changes: 30 additions & 0 deletions test/test.cpp
@@ -241,6 +241,36 @@ TEST_SUITE("Ollama Tests") {
CHECK(response.as_json().contains("embeddings") == true);
}

TEST_CASE("Manual Requests") {

ollama::request request(ollama::message_type::generation);
request["model"] = test_model;
request["prompt"] = "Why is the sky blue?";
request["stream"] = false;
ollama::response response = ollama::generate(request);

CHECK(response.as_json().contains("response") == true);

request = ollama::request(ollama::message_type::chat);
request["model"] = test_model;
ollama::messages messages = { ollama::message("user","Why is the sky blue?") };
request["messages"] = messages.to_json();
request["stream"] = false;
response = ollama::chat(request);

CHECK(response.as_json().contains("message") == true);

request = ollama::request(ollama::message_type::embedding);
request["model"] = test_model;
request["input"] = "Why is the sky blue?";
request["stream"] = false;
response = ollama::generate_embeddings(request);

CHECK(response.as_json().contains("embeddings") == true);
}



TEST_CASE("Enable Debug Logging") {

ollama::show_requests(true);
