From 584327d751289f737fb7f5890f97f94507eb1f61 Mon Sep 17 00:00:00 2001
From: James Montgomery
Date: Sun, 18 Aug 2024 17:26:33 -0400
Subject: [PATCH] Automatically set streaming when invoking requests, add
 support for running model endpoint.

---
 README.md               | 15 +++++++++----
 include/ollama.hpp      | 49 +++++++++++++++++++++++++++++++++++++----
 singleheader/ollama.hpp | 49 +++++++++++++++++++++++++++++++++++++----
 test/test.cpp           | 10 +++++++++
 4 files changed, 111 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 5cefcd5..0806b96 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ The test cases do a good job of providing discrete examples for each of the API
   - [Load a Model into Memory](#load-a-model-into-memory)
   - [Pull, Copy, and Delete Models](#pull-copy-and-delete-models)
   - [Retrieve Model Info](#retrieve-model-info)
-  - [List locally available models](#list-locally-available-models)
+  - [List locally-available and running models](#list-locally-available-and-running-models)
   - [Exception Handling](#exception-handling)
   - [Basic Generation](#basic-generation)
   - [Using Options](#using-options)
@@ -161,7 +161,7 @@ nlohmann::json model_info = ollama::show_model_info("llama3:8b");
 std::cout << "Model family is " << model_info["details"]["family"] << std::endl;
 ```
 
-### List locally available models
+### List locally-available and running models
 You can query a list of locally-available models on your ollama server using the following. This is returned as a `std::vector` of `std::string`.
 
 ```C++
@@ -169,6 +169,15 @@ You can query a list of locally-available models on your ollama server using the
 std::vector<std::string> models = ollama::list_models();
 ```
 
+You can similarly query a list of currently-running models on your ollama server using:
+
+```C++
+// List the models currently running in the ollama server
+std::vector<std::string> models = ollama::list_running_models();
+```
+
+For detailed parameters for these models, you can obtain the verbose JSON model descriptions using `ollama::list_model_json()` and `ollama::running_model_json()`.
+
 ### Exception Handling
 Most calls will throw `ollama::exception` in the event of an error, with details on the exception that has occurred. Exceptions are enabled by default.
 
@@ -400,7 +409,6 @@ For those looking for greater control of the requests sent to the ollama server,
 ollama::request request(ollama::message_type::generation);
 request["model"]="mistral";
 request["prompt"]="Why is the sky blue?";
-request["stream"] = false;
 request["system"] = "Talk like a pirate for the next reply."
 std::cout << ollama::generate(request) << std::endl;
 ```
@@ -430,7 +438,6 @@ ollama::response response = ollama::generate("llama3.1:8b", "Why is the sky blue
 ollama::request request(ollama::message_type::generation);
 request["model"]="llama3.1:8b";
 request["prompt"]="Why is the sky blue?";
-request["stream"] = false;
 request["context"] = response.as_json()["context"];
 std::cout << ollama::generate(request) << std::endl;
 ```
diff --git a/include/ollama.hpp b/include/ollama.hpp
index 6da2e0a..cd19032 100644
--- a/include/ollama.hpp
+++ b/include/ollama.hpp
@@ -395,10 +395,11 @@ class Ollama
     }
 
     // Generate a non-streaming reply as a string.
-    ollama::response generate(const ollama::request& request)
+    ollama::response generate(ollama::request& request)
     {
         ollama::response response;
 
+        request["stream"] = false;
         std::string request_string = request.dump();
         if (ollama::log_requests) std::cout << request_string << std::endl;
 
@@ -435,6 +436,7 @@ class Ollama
     // Generate a streaming reply where a user-defined callback function is invoked when each token is received.
     bool generate(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
     {
+        request["stream"] = true;
         std::string request_string = request.dump();
         if (ollama::log_requests) std::cout << request_string << std::endl;
 
@@ -476,6 +478,7 @@ class Ollama
     {
         ollama::response response;
 
+        request["stream"] = false;
         std::string request_string = request.dump();
         if (ollama::log_requests) std::cout << request_string << std::endl;
 
@@ -504,7 +507,8 @@ class Ollama
 
     bool chat(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
     {
-        ollama::response response;
+        ollama::response response;
+        request["stream"] = true;
 
         std::string request_string = request.dump();
         if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -640,6 +644,33 @@ class Ollama
         return models;
     }
 
+    json running_model_json()
+    {
+        json models;
+        if (auto res = cli->Get("/api/ps"))
+        {
+            if (ollama::log_replies) std::cout << res->body << std::endl;
+            models = json::parse(res->body);
+        }
+        else { if (ollama::use_exceptions) throw ollama::exception("No response returned from server when querying running models: "+httplib::to_string( res.error() ) );}
+
+        return models;
+    }
+
+    std::vector<std::string> list_running_models()
+    {
+        std::vector<std::string> models;
+
+        json json_response = running_model_json();
+
+        for (auto& model: json_response["models"])
+        {
+            models.push_back(model["name"]);
+        }
+
+        return models;
+    }
+
     bool blob_exists(const std::string& digest)
     {
         if (auto res = cli->Head("/api/blobs/"+digest))
@@ -869,12 +900,12 @@ namespace ollama
         return ollama.generate(model, prompt, options, images);
     }
 
-    ollama::response generate(const std::string& model,const std::string& prompt, const ollama::response& context, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
+    inline ollama::response generate(const std::string& model,const std::string& prompt, const ollama::response& context, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
     {
         return ollama.generate(model, prompt, context, options, images);
     }
 
-    inline ollama::response generate(const ollama::request& request)
+    inline ollama::response generate(ollama::request& request)
     {
         return ollama.generate(request);
     }
@@ -944,6 +975,16 @@ namespace ollama
         return ollama.list_model_json();
     }
 
+    inline std::vector<std::string> list_running_models()
+    {
+        return ollama.list_running_models();
+    }
+
+    inline json running_model_json()
+    {
+        return ollama.running_model_json();
+    }
+
     inline bool blob_exists(const std::string& digest)
     {
         return ollama.blob_exists(digest);
diff --git a/singleheader/ollama.hpp b/singleheader/ollama.hpp
index 0505473..a23fcd1 100644
--- a/singleheader/ollama.hpp
+++ b/singleheader/ollama.hpp
@@ -35185,10 +35185,11 @@ class Ollama
     }
 
     // Generate a non-streaming reply as a string.
-    ollama::response generate(const ollama::request& request)
+    ollama::response generate(ollama::request& request)
     {
         ollama::response response;
 
+        request["stream"] = false;
         std::string request_string = request.dump();
         if (ollama::log_requests) std::cout << request_string << std::endl;
 
@@ -35225,6 +35226,7 @@ class Ollama
     // Generate a streaming reply where a user-defined callback function is invoked when each token is received.
     bool generate(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
     {
+        request["stream"] = true;
         std::string request_string = request.dump();
         if (ollama::log_requests) std::cout << request_string << std::endl;
 
@@ -35266,6 +35268,7 @@ class Ollama
     {
         ollama::response response;
 
+        request["stream"] = false;
         std::string request_string = request.dump();
         if (ollama::log_requests) std::cout << request_string << std::endl;
 
@@ -35294,7 +35297,8 @@ class Ollama
 
     bool chat(ollama::request& request, std::function<void(const ollama::response&)> on_receive_token)
     {
-        ollama::response response;
+        ollama::response response;
+        request["stream"] = true;
 
         std::string request_string = request.dump();
         if (ollama::log_requests) std::cout << request_string << std::endl;
@@ -35430,6 +35434,33 @@ class Ollama
         return models;
     }
 
+    json running_model_json()
+    {
+        json models;
+        if (auto res = cli->Get("/api/ps"))
+        {
+            if (ollama::log_replies) std::cout << res->body << std::endl;
+            models = json::parse(res->body);
+        }
+        else { if (ollama::use_exceptions) throw ollama::exception("No response returned from server when querying running models: "+httplib::to_string( res.error() ) );}
+
+        return models;
+    }
+
+    std::vector<std::string> list_running_models()
+    {
+        std::vector<std::string> models;
+
+        json json_response = running_model_json();
+
+        for (auto& model: json_response["models"])
+        {
+            models.push_back(model["name"]);
+        }
+
+        return models;
+    }
+
     bool blob_exists(const std::string& digest)
     {
         if (auto res = cli->Head("/api/blobs/"+digest))
@@ -35659,12 +35690,12 @@ namespace ollama
         return ollama.generate(model, prompt, options, images);
     }
 
-    ollama::response generate(const std::string& model,const std::string& prompt, const ollama::response& context, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
+    inline ollama::response generate(const std::string& model,const std::string& prompt, const ollama::response& context, const json& options=nullptr, const std::vector<std::string>& images=std::vector<std::string>())
     {
         return ollama.generate(model, prompt, context, options, images);
     }
 
-    inline ollama::response generate(const ollama::request& request)
+    inline ollama::response generate(ollama::request& request)
     {
         return ollama.generate(request);
     }
@@ -35734,6 +35765,16 @@ namespace ollama
         return ollama.list_model_json();
     }
 
+    inline std::vector<std::string> list_running_models()
+    {
+        return ollama.list_running_models();
+    }
+
+    inline json running_model_json()
+    {
+        return ollama.running_model_json();
+    }
+
     inline bool blob_exists(const std::string& digest)
     {
         return ollama.blob_exists(digest);
diff --git a/test/test.cpp b/test/test.cpp
index d50e2a2..34d47c4 100644
--- a/test/test.cpp
+++ b/test/test.cpp
@@ -88,6 +88,16 @@ TEST_SUITE("Ollama Tests") {
         CHECK( contains_model );
     }
 
+    TEST_CASE("List Running Models") {
+
+        // List the models currently running in the ollama server
+        std::vector<std::string> models = ollama::list_running_models();
+
+        bool contains_model = (std::find(models.begin(), models.end(), test_model) != models.end() );
+
+        CHECK( contains_model );
+    }
+
     TEST_CASE("Exception Handling") {
 
         bool exception_handled = false;
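
As a quick illustration of the API surface this patch introduces (not part of the patch itself), the sketch below shows how the new running-model calls might be used. It assumes `ollama.hpp` is on the include path, an ollama server is reachable at its default address, and at least one model is loaded; otherwise the calls throw `ollama::exception`.

```C++
// Minimal usage sketch for the running-model endpoint added by this patch.
#include "ollama.hpp"

#include <iostream>
#include <string>
#include <vector>

int main()
{
    // Names of models currently loaded on the server (backed by GET /api/ps).
    std::vector<std::string> running = ollama::list_running_models();
    for (const std::string& name : running)
        std::cout << "Running: " << name << std::endl;

    // The verbose JSON description of the same endpoint, for detailed fields.
    nlohmann::json details = ollama::running_model_json();
    std::cout << details.dump(2) << std::endl;

    return 0;
}
```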