
Commit

Merge pull request #20 from jmont-dev/manual_requests
Add support for manual requests for generate, chat, and embedding endpoints.
jmont-dev authored Aug 13, 2024
2 parents fdd114f + 3cc83e9 commit 8977537
Showing 4 changed files with 161 additions and 17 deletions.
16 changes: 15 additions & 1 deletion README.md
@@ -56,6 +56,7 @@ The test cases do a good job of providing discrete examples for each of the API
- [Chat with Images](#chat-with-images)
- [Embedding Generation](#embedding-generation)
- [Debug Information](#debug-information)
- [Manual Requests](#manual-requests)
- [Single-header vs Separate Headers](#single-header-vs-separate-headers)
- [About this software:](#about-this-software)
- [License](#license)
@@ -389,7 +390,20 @@ Debug logging for requests and replies to the server can easily be turned on and
ollama::show_requests(true);
ollama::show_replies(true);
```


### Manual Requests
For those looking for greater control of the requests sent to the ollama server, manual requests can be created through the `ollama::request` class. This class extends `nlohmann::json` and can be treated as a standard JSON object.

```C++
ollama::request request(ollama::message_type::generation);
request["model"]="mistral";
request["prompt"]="Why is the sky blue?";
request["stream"] = false;
request["system"] = "Talk like a pirate for the next reply."
std::cout << ollama::generate(request) << std::endl;
```
This provides the greatest degree of customization over the request. Users should take care to provide valid fields; otherwise, an exception will likely be thrown when the response is received. Manual requests can be made for the generate, chat, and embedding endpoints, as shown below.
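
The chat and embedding endpoints accept manual requests in the same way. The sketch below mirrors the `Manual Requests` test case added in `test/test.cpp` and reuses the `mistral` model from the example above; the streaming variant at the end assumes a callback of the form taken by the streaming `generate` overload, and printing each partial response relies on the same stream-to-`std::cout` behavior shown in the generation example.

```C++
// Manual chat request: messages are supplied as a JSON array of role/content pairs.
ollama::request chat_request(ollama::message_type::chat);
chat_request["model"] = "mistral";
ollama::messages messages = { ollama::message("user", "Why is the sky blue?") };
chat_request["messages"] = messages.to_json();
chat_request["stream"] = false;
std::cout << ollama::chat(chat_request) << std::endl;

// Manual embedding request: the embedding endpoint takes "input" rather than "prompt".
ollama::request embedding_request(ollama::message_type::embedding);
embedding_request["model"] = "mistral";
embedding_request["input"] = "Why is the sky blue?";
ollama::response embeddings = ollama::generate_embeddings(embedding_request);

// Manual streaming request: set "stream" to true and pass a token callback.
ollama::request stream_request(ollama::message_type::generation);
stream_request["model"] = "mistral";
stream_request["prompt"] = "Why is the sky blue?";
stream_request["stream"] = true;
auto on_token = [](const ollama::response& token) { std::cout << token << std::flush; };
ollama::generate(stream_request, on_token);
```

As with the generation example, any field accepted by the corresponding Ollama endpoint can be set directly on the request object, since it behaves as a standard JSON object.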
## Single-header vs Separate Headers
For convenience, ollama-hpp includes a single-header version of the library in `singleheader/ollama.hpp`, which bundles the core ollama.hpp code with single-header versions of nlohmann json, httplib, and base64.h. Each of these libraries is available under the MIT license, and their respective licenses are included.
The single-header include can be regenerated from these standalone files by running `./make_single_header.sh`.
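
As a minimal illustration of the two options (paths shown relative to the repository root; a real build would normally add the chosen directory to the include path), including either header exposes the same `ollama` namespace:

```C++
// Separate headers: requires nlohmann json, httplib, and base64.h to be available alongside ollama.hpp.
#include "include/ollama.hpp"

// Single header: bundles those dependencies, so this one include is sufficient.
// #include "singleheader/ollama.hpp"

#include <iostream>

int main() {
    std::cout << ollama::generate("mistral", "Why is the sky blue?") << std::endl;
    return 0;
}
```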
66 changes: 58 additions & 8 deletions include/ollama.hpp
@@ -377,12 +377,16 @@ class Ollama
Ollama(): Ollama("http://localhost:11434") {}
~Ollama() { delete this->cli; }

// Generate a non-streaming reply as a string.
ollama::response generate(const std::string& model,const std::string& prompt, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{
ollama::request request(model, prompt, options, false, images);
return generate(request);
}

// Generate a non-streaming reply as a string.
ollama::response generate(const ollama::request& request)
{
ollama::response response;
ollama::request request(model, prompt, options, false, images);

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -400,14 +404,19 @@ class Ollama
if (ollama::use_exceptions) throw ollama::exception("No response returned from server "+this->server_url+". Error was: "+httplib::to_string( res.error() ));
}

return response;
return response;
}

// Generate a streaming reply where a user-defined callback function is invoked when each token is received.
bool generate(const std::string& model,const std::string& prompt, std::function<void(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{

ollama::request request(model, prompt, options, true, images);
return generate(request, on_receive_token);
}


// Generate a streaming reply where a user-defined callback function is invoked when each token is received.
bool generate(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
{

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -428,12 +437,17 @@ class Ollama
return false;
}

// Generate a non-streaming reply as a string.
ollama::response chat(const std::string& model, const ollama::messages& messages, json options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
ollama::request request(model, messages, options, false, format, keep_alive_duration);
return chat(request);
}


// Generate a non-streaming reply as a string.
ollama::response chat(ollama::request& request)
{
ollama::response response;
ollama::request request(model, messages, options, false, format, keep_alive_duration);

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -456,9 +470,14 @@

bool chat(const std::string& model, const ollama::messages& messages, std::function<void(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
ollama::request request(model, messages, options, true, format, keep_alive_duration);
return chat(request, on_receive_token);
}


bool chat(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
{
ollama::response response;
ollama::request request(model, messages, options, true, format, keep_alive_duration);

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -719,6 +738,12 @@ class Ollama
ollama::response generate_embeddings(const std::string& model, const std::string& input, const json& options=nullptr, bool truncate = true, const std::string& keep_alive_duration="5m")
{
ollama::request request = ollama::request::from_embedding(model, input, options, truncate, keep_alive_duration);
return generate_embeddings(request);
}


ollama::response generate_embeddings(ollama::request& request)
{
ollama::response response;

std::string request_string = request.dump();
@@ -806,21 +831,41 @@ namespace ollama
return ollama.generate(model, prompt, options, images);
}

inline ollama::response generate(const ollama::request& request)
{
return ollama.generate(request);
}

inline bool generate(const std::string& model,const std::string& prompt, std::function<void(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{
return ollama.generate(model, prompt, on_receive_response, options, images);
}

inline bool generate(ollama::request& request, std::function<void(const ollama::response&)> on_receive_response)
{
return ollama.generate(request, on_receive_response);
}

inline ollama::response chat(const std::string& model, const ollama::messages& messages, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
return ollama.chat(model, messages, options, format, keep_alive_duration);
}

inline ollama::response chat(ollama::request& request)
{
return ollama.chat(request);
}

inline bool chat(const std::string& model, const ollama::messages& messages, std::function<void(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
return ollama.chat(model, messages, on_receive_response, options, format, keep_alive_duration);
}

inline bool chat(ollama::request& request, std::function<void(const ollama::response&)> on_receive_response)
{
return ollama.chat(request, on_receive_response);
}

inline bool create(const std::string& modelName, const std::string& modelFile, bool loadFromFile=true)
{
return ollama.create_model(modelName, modelFile, loadFromFile);
@@ -891,6 +936,11 @@ namespace ollama
return ollama.generate_embeddings(model, input, options, truncate, keep_alive_duration);
}

inline ollama::response generate_embeddings(ollama::request& request)
{
return ollama.generate_embeddings(request);
}

inline void setReadTimeout(const int& seconds)
{
ollama.setReadTimeout(seconds);
66 changes: 58 additions & 8 deletions singleheader/ollama.hpp
@@ -35167,12 +35167,16 @@ class Ollama
Ollama(): Ollama("http://localhost:11434") {}
~Ollama() { delete this->cli; }

// Generate a non-streaming reply as a string.
ollama::response generate(const std::string& model,const std::string& prompt, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{
ollama::request request(model, prompt, options, false, images);
return generate(request);
}

// Generate a non-streaming reply as a string.
ollama::response generate(const ollama::request& request)
{
ollama::response response;
ollama::request request(model, prompt, options, false, images);

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -35190,14 +35194,19 @@ class Ollama
if (ollama::use_exceptions) throw ollama::exception("No response returned from server "+this->server_url+". Error was: "+httplib::to_string( res.error() ));
}

return response;
return response;
}

// Generate a streaming reply where a user-defined callback function is invoked when each token is received.
bool generate(const std::string& model,const std::string& prompt, std::function<void(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{

ollama::request request(model, prompt, options, true, images);
return generate(request, on_receive_token);
}


// Generate a streaming reply where a user-defined callback function is invoked when each token is received.
bool generate(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
{

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -35218,12 +35227,17 @@ class Ollama
return false;
}

// Generate a non-streaming reply as a string.
ollama::response chat(const std::string& model, const ollama::messages& messages, json options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
ollama::request request(model, messages, options, false, format, keep_alive_duration);
return chat(request);
}


// Generate a non-streaming reply as a string.
ollama::response chat(ollama::request& request)
{
ollama::response response;
ollama::request request(model, messages, options, false, format, keep_alive_duration);

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -35246,9 +35260,14 @@

bool chat(const std::string& model, const ollama::messages& messages, std::function<void(const ollama::response&)> on_receive_token, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
ollama::request request(model, messages, options, true, format, keep_alive_duration);
return chat(request, on_receive_token);
}


bool chat(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
{
ollama::response response;
ollama::request request(model, messages, options, true, format, keep_alive_duration);

std::string request_string = request.dump();
if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -35509,6 +35528,12 @@ class Ollama
ollama::response generate_embeddings(const std::string& model, const std::string& input, const json& options=nullptr, bool truncate = true, const std::string& keep_alive_duration="5m")
{
ollama::request request = ollama::request::from_embedding(model, input, options, truncate, keep_alive_duration);
return generate_embeddings(request);
}


ollama::response generate_embeddings(ollama::request& request)
{
ollama::response response;

std::string request_string = request.dump();
@@ -35596,21 +35621,41 @@ namespace ollama
return ollama.generate(model, prompt, options, images);
}

inline ollama::response generate(const ollama::request& request)
{
return ollama.generate(request);
}

inline bool generate(const std::string& model,const std::string& prompt, std::function<void(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
{
return ollama.generate(model, prompt, on_receive_response, options, images);
}

inline bool generate(ollama::request& request, std::function<void(const ollama::response&)> on_receive_response)
{
return ollama.generate(request, on_receive_response);
}

inline ollama::response chat(const std::string& model, const ollama::messages& messages, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
return ollama.chat(model, messages, options, format, keep_alive_duration);
}

inline ollama::response chat(ollama::request& request)
{
return ollama.chat(request);
}

inline bool chat(const std::string& model, const ollama::messages& messages, std::function<void(const ollama::response&)> on_receive_response, const json& options=nullptr, const std::string& format="json", const std::string& keep_alive_duration="5m")
{
return ollama.chat(model, messages, on_receive_response, options, format, keep_alive_duration);
}

inline bool chat(ollama::request& request, std::function<void(const ollama::response&)> on_receive_response)
{
return ollama.chat(request, on_receive_response);
}

inline bool create(const std::string& modelName, const std::string& modelFile, bool loadFromFile=true)
{
return ollama.create_model(modelName, modelFile, loadFromFile);
@@ -35681,6 +35726,11 @@ namespace ollama
return ollama.generate_embeddings(model, input, options, truncate, keep_alive_duration);
}

inline ollama::response generate_embeddings(ollama::request& request)
{
return ollama.generate_embeddings(request);
}

inline void setReadTimeout(const int& seconds)
{
ollama.setReadTimeout(seconds);
30 changes: 30 additions & 0 deletions test/test.cpp
@@ -241,6 +241,36 @@ TEST_SUITE("Ollama Tests") {
CHECK(response.as_json().contains("embeddings") == true);
}

TEST_CASE("Manual Requests") {

ollama::request request(ollama::message_type::generation);
request["model"] = test_model;
request["prompt"] = "Why is the sky blue?";
request["stream"] = false;
ollama::response response = ollama::generate(request);

CHECK(response.as_json().contains("response") == true);

request = ollama::request(ollama::message_type::chat);
request["model"] = test_model;
ollama::messages messages = { ollama::message("user","Why is the sky blue?") };
request["messages"] = messages.to_json();
request["stream"] = false;
response = ollama::chat(request);

CHECK(response.as_json().contains("message") == true);

request = ollama::request(ollama::message_type::embedding);
request["model"] = test_model;
request["input"] = "Why is the sky blue?";
request["stream"] = false;
response = ollama::generate_embeddings(request);

CHECK(response.as_json().contains("embeddings") == true);
}



TEST_CASE("Enable Debug Logging") {

ollama::show_requests(true);
