From 584327d751289f737fb7f5890f97f94507eb1f61 Mon Sep 17 00:00:00 2001
From: James Montgomery
Date: Sun, 18 Aug 2024 17:26:33 -0400
Subject: [PATCH] Automatically set streaming when invoking requests, add
 support for running model endpoint.

---
 README.md               | 15 +++++++++----
 include/ollama.hpp      | 49 +++++++++++++++++++++++++++++++++++++----
 singleheader/ollama.hpp | 49 +++++++++++++++++++++++++++++++++++++----
 test/test.cpp           | 10 +++++++++
 4 files changed, 111 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 5cefcd5..0806b96 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ The test cases do a good job of providing discrete examples for each of the API
   - [Load a Model into Memory](#load-a-model-into-memory)
   - [Pull, Copy, and Delete Models](#pull-copy-and-delete-models)
   - [Retrieve Model Info](#retrieve-model-info)
-  - [List locally available models](#list-locally-available-models)
+  - [List locally-available and running models](#list-locally-available-and-running-models)
   - [Exception Handling](#exception-handling)
   - [Basic Generation](#basic-generation)
   - [Using Options](#using-options)
@@ -161,7 +161,7 @@ nlohmann::json model_info = ollama::show_model_info("llama3:8b");
 std::cout << "Model family is " << model_info["details"]["family"] << std::endl;
 ```
 
-### List locally available models
+### List locally-available and running models
 You can query a list of locally-available models on your ollama server using the following. This is returned as a `std::vector` of `std::string`.
 
 ```C++
@@ -169,6 +169,15 @@ You can query a list of locally-available models on your ollama server using the
 std::vector<std::string> models = ollama::list_models();
 ```
 
+You can similarly query a list of currently-running models on your ollama server using:
+
+```C++
+// List the models currently running in the ollama server
+std::vector<std::string> models = ollama::list_running_models();
+```
+
+For detailed parameters for these models, you can obtain the verbose JSON model descriptions using `ollama::list_model_json()` and `ollama::running_model_json()`.
+
 ### Exception Handling
 Most calls will throw `ollama::exception` in the event of an error, with details on the exception that has occurred. Exceptions are enabled by default.
 
@@ -400,7 +409,6 @@ For those looking for greater control of the requests sent to the ollama server,
 ollama::request request(ollama::message_type::generation);
 request["model"]="mistral";
 request["prompt"]="Why is the sky blue?";
-request["stream"] = false;
 request["system"] = "Talk like a pirate for the next reply."
 std::cout << ollama::generate(request) << std::endl;
 ```
@@ -430,7 +438,6 @@ ollama::response response = ollama::generate("llama3.1:8b", "Why is the sky blue
 ollama::request request(ollama::message_type::generation);
 request["model"]="llama3.1:8b";
 request["prompt"]="Why is the sky blue?";
-request["stream"] = false;
 request["context"] = response.as_json()["context"];
 std::cout << ollama::generate(request) << std::endl;
 ```
diff --git a/include/ollama.hpp b/include/ollama.hpp
index 6da2e0a..cd19032 100644
--- a/include/ollama.hpp
+++ b/include/ollama.hpp
@@ -395,10 +395,11 @@ class Ollama
     }
 
     // Generate a non-streaming reply as a string.
-    ollama::response generate(const ollama::request& request)
+    ollama::response generate(ollama::request& request)
     {
         ollama::response response;
 
+        request["stream"] = false;
         std::string request_string = request.dump();
         if (ollama::log_requests) std::cout << request_string << std::endl;
 
@@ -435,6 +436,7 @@ class Ollama
     // Generate a streaming reply where a user-defined callback function is invoked when each token is received.
     bool generate(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
     {
+        request["stream"] = true;
         std::string request_string = request.dump();
         if (ollama::log_requests) std::cout << request_string << std::endl;
 
@@ -476,6 +478,7 @@ class Ollama
     {
         ollama::response response;
 
+        request["stream"] = false;
         std::string request_string = request.dump();
         if (ollama::log_requests) std::cout << request_string << std::endl;
 
@@ -504,7 +507,8 @@ class Ollama
 
     bool chat(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
     {
-        ollama::response response;
+        ollama::response response;
+        request["stream"] = true;
 
         std::string request_string = request.dump();
         if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -640,6 +644,33 @@ class Ollama
         return models;
     }
 
+    json running_model_json()
+    {
+        json models;
+        if (auto res = cli->Get("/api/ps"))
+        {
+            if (ollama::log_replies) std::cout << res->body << std::endl;
+            models = json::parse(res->body);
+        }
+        else { if (ollama::use_exceptions) throw ollama::exception("No response returned from server when querying running models: "+httplib::to_string( res.error() ) );}
+
+        return models;
+    }
+
+    std::vector<std::string> list_running_models()
+    {
+        std::vector<std::string> models;
+
+        json json_response = running_model_json();
+
+        for (auto& model: json_response["models"])
+        {
+            models.push_back(model["name"]);
+        }
+
+        return models;
+    }
+
     bool blob_exists(const std::string& digest)
     {
         if (auto res = cli->Head("/api/blobs/"+digest))
@@ -869,12 +900,12 @@ namespace ollama
         return ollama.generate(model, prompt, options, images);
     }
 
-    ollama::response generate(const std::string& model,const std::string& prompt, const ollama::response& context, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
+    inline ollama::response generate(const std::string& model,const std::string& prompt, const ollama::response& context, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
     {
         return ollama.generate(model, prompt, context, options, images);
     }
 
-    inline ollama::response generate(const ollama::request& request)
+    inline ollama::response generate(ollama::request& request)
     {
         return ollama.generate(request);
     }
@@ -944,6 +975,16 @@ namespace ollama
         return ollama.list_model_json();
     }
 
+    inline std::vector<std::string> list_running_models()
+    {
+        return ollama.list_running_models();
+    }
+
+    inline json running_model_json()
+    {
+        return ollama.running_model_json();
+    }
+
     inline bool blob_exists(const std::string& digest)
     {
         return ollama.blob_exists(digest);
diff --git a/singleheader/ollama.hpp b/singleheader/ollama.hpp
index 0505473..a23fcd1 100644
--- a/singleheader/ollama.hpp
+++ b/singleheader/ollama.hpp
@@ -35185,10 +35185,11 @@ class Ollama
     }
 
     // Generate a non-streaming reply as a string.
-    ollama::response generate(const ollama::request& request)
+    ollama::response generate(ollama::request& request)
     {
         ollama::response response;
 
+        request["stream"] = false;
         std::string request_string = request.dump();
         if (ollama::log_requests) std::cout << request_string << std::endl;
 
@@ -35225,6 +35226,7 @@ class Ollama
     // Generate a streaming reply where a user-defined callback function is invoked when each token is received.
     bool generate(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
     {
+        request["stream"] = true;
         std::string request_string = request.dump();
         if (ollama::log_requests) std::cout << request_string << std::endl;
 
@@ -35266,6 +35268,7 @@ class Ollama
     {
         ollama::response response;
 
+        request["stream"] = false;
         std::string request_string = request.dump();
         if (ollama::log_requests) std::cout << request_string << std::endl;
 
@@ -35294,7 +35297,8 @@ class Ollama
 
     bool chat(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
     {
-        ollama::response response;
+        ollama::response response;
+        request["stream"] = true;
 
         std::string request_string = request.dump();
         if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -35430,6 +35434,33 @@ class Ollama
         return models;
     }
 
+    json running_model_json()
+    {
+        json models;
+        if (auto res = cli->Get("/api/ps"))
+        {
+            if (ollama::log_replies) std::cout << res->body << std::endl;
+            models = json::parse(res->body);
+        }
+        else { if (ollama::use_exceptions) throw ollama::exception("No response returned from server when querying running models: "+httplib::to_string( res.error() ) );}
+
+        return models;
+    }
+
+    std::vector<std::string> list_running_models()
+    {
+        std::vector<std::string> models;
+
+        json json_response = running_model_json();
+
+        for (auto& model: json_response["models"])
+        {
+            models.push_back(model["name"]);
+        }
+
+        return models;
+    }
+
     bool blob_exists(const std::string& digest)
     {
         if (auto res = cli->Head("/api/blobs/"+digest))
@@ -35659,12 +35690,12 @@ namespace ollama
         return ollama.generate(model, prompt, options, images);
     }
 
-    ollama::response generate(const std::string& model,const std::string& prompt, const ollama::response& context, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
+    inline ollama::response generate(const std::string& model,const std::string& prompt, const ollama::response& context, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
     {
         return ollama.generate(model, prompt, context, options, images);
     }
 
-    inline ollama::response generate(const ollama::request& request)
+    inline ollama::response generate(ollama::request& request)
     {
         return ollama.generate(request);
     }
@@ -35734,6 +35765,16 @@ namespace ollama
         return ollama.list_model_json();
     }
 
+    inline std::vector<std::string> list_running_models()
+    {
+        return ollama.list_running_models();
+    }
+
+    inline json running_model_json()
+    {
+        return ollama.running_model_json();
+    }
+
     inline bool blob_exists(const std::string& digest)
     {
         return ollama.blob_exists(digest);
diff --git a/test/test.cpp b/test/test.cpp
index d50e2a2..34d47c4 100644
--- a/test/test.cpp
+++ b/test/test.cpp
@@ -88,6 +88,16 @@ TEST_SUITE("Ollama Tests") {
         CHECK( contains_model );
     }
 
+    TEST_CASE("List Running Models") {
+
+        // List the models currently running in the ollama server
+        std::vector<std::string> models = ollama::list_running_models();
+
+        bool contains_model = (std::find(models.begin(), models.end(), test_model) != models.end() );
+
+        CHECK( contains_model );
+    }
+
     TEST_CASE("Exception Handling") {
 
         bool exception_handled = false;
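
As a quick illustration of the API surface this patch introduces (not part of the patch itself), the sketch below shows how the new running-model calls might be used. It assumes `ollama.hpp` is on the include path, an ollama server is reachable at its default address, and at least one model is loaded; otherwise the calls throw `ollama::exception`.

```C++
// Minimal usage sketch for the running-model endpoint added by this patch.
#include "ollama.hpp"

#include <iostream>
#include <string>
#include <vector>

int main()
{
    // Names of models currently loaded on the server (backed by GET /api/ps).
    std::vector<std::string> running = ollama::list_running_models();
    for (const std::string& name : running)
        std::cout << "Running: " << name << std::endl;

    // The verbose JSON description of the same endpoint, for detailed fields.
    nlohmann::json details = ollama::running_model_json();
    std::cout << details.dump(2) << std::endl;

    return 0;
}
```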