Merge pull request #52 from intelligentnode/3-create-unified-vision-controller

3 create unified vision controller
intelligentnode · Feb 11, 2024 · 0e2f63a · 0e2f63a
2 parents 00d4897 + 962148e
commit 0e2f63a
Show file tree

Hide file tree

Showing 18 changed files with 410 additions and 104 deletions.
diff --git a/PIPREADME.md b/PIPREADME.md
@@ -19,36 +19,6 @@ pip install intelli
 
 # Code Examples
 
-## Create AI Flows
-You can create a flow of tasks executed by different AI models. Here's an example of creating a blog post flow:
-- ChatGPT agent to write a post.
-- Google gemini agent to write image description.
-- Stable diffusion to generate images.
-
-```python
-from intelli.flow.agents.agent import Agent
-from intelli.flow.tasks.task import Task
-from intelli.flow.sequence_flow import SequenceFlow
-from intelli.flow.input.task_input import TextTaskInput
-from intelli.flow.processors.basic_processor import TextProcessor
-
-# define agents
-blog_agent = Agent(agent_type='text', provider='openai', mission='write blog posts', model_params={'key': YOUR_OPENAI_API_KEY, 'model': 'gpt-4'})
-copy_agent = Agent(agent_type='text', provider='gemini', mission='generate description', model_params={'key': YOUR_GEMINI_API_KEY, 'model': 'gemini'})
-artist_agent = Agent(agent_type='image', provider='stability', mission='generate image', model_params={'key': YOUR_STABILITY_API_KEY})
-
-# define tasks
-task1 = Task(TextTaskInput('blog post about electric cars'), blog_agent, log=True)
-task2 = Task(TextTaskInput('Generate short image description for image model'), copy_agent, pre_process=TextProcessor.text_head, log=True)
-task3 = Task(TextTaskInput('Generate cartoon style image'), artist_agent, log=True)
-
-# start sequence flow
-flow = SequenceFlow([task1, task2, task3], log=True)
-final_result = flow.start()
-```
-
-To build async AI flows with multiple paths, refer to the [flow tutorial](https://github.com/intelligentnode/Intelli/wiki/Flows).
-
 ## Create Chatbot
 Switch between multiple chatbot providers without changing your code.
 
@@ -62,7 +32,7 @@ def call_chatbot(provider, model=None):
     input.add_user_message("What is the capital of France?")
 
     # creating chatbot instance
-    openai_bot = Chatbot(YOUR_OPENAI_API_KEY, "openai")
+    openai_bot = Chatbot(YOUR_API_KEY, provider)
     response = openai_bot.chat(input)
 
     return response
@@ -73,11 +43,10 @@ call_chatbot("openai", "gpt-4")
 # call mistralai
 call_chatbot("mistral", "mistral-medium")
 
-# call gooogle gemini
+# call google gemini
 call_chatbot("gemini")
 ```
 
-
 ## Connect Your Docs With Chatbot
 IntelliPy allows you to chat with your docs using multiple LLMs. To connect your data, visit the [IntelliNode App](https://app.intellinode.ai/), start a project using the Document option, upload your documents or images, and copy the generated One Key. This key will be used to connect the chatbot to your uploaded data.
 
@@ -110,6 +79,36 @@ wrapper = RemoteImageModel(your_api_key, provider)
 results = wrapper.generate_images(image_input)
 ```
 
+## Create AI Flows
+You can create a flow of tasks executed by different AI models. Here's an example of creating a blog post flow:
+- ChatGPT agent to write a post.
+- Google Gemini agent to write an image description.
+- Stable Diffusion to generate images.
+
+```python
+from intelli.flow.agents.agent import Agent
+from intelli.flow.tasks.task import Task
+from intelli.flow.sequence_flow import SequenceFlow
+from intelli.flow.input.task_input import TextTaskInput
+from intelli.flow.processors.basic_processor import TextProcessor
+
+# define agents
+blog_agent = Agent(agent_type='text', provider='openai', mission='write blog posts', model_params={'key': YOUR_OPENAI_API_KEY, 'model': 'gpt-4'})
+copy_agent = Agent(agent_type='text', provider='gemini', mission='generate description', model_params={'key': YOUR_GEMINI_API_KEY, 'model': 'gemini'})
+artist_agent = Agent(agent_type='image', provider='stability', mission='generate image', model_params={'key': YOUR_STABILITY_API_KEY})
+
+# define tasks
+task1 = Task(TextTaskInput('blog post about electric cars'), blog_agent, log=True)
+task2 = Task(TextTaskInput('Generate short image description for image model'), copy_agent, pre_process=TextProcessor.text_head, log=True)
+task3 = Task(TextTaskInput('Generate cartoon style image'), artist_agent, log=True)
+
+# start sequence flow
+flow = SequenceFlow([task1, task2, task3], log=True)
+final_result = flow.start()
+```
+
+To build async AI flows with multiple paths, refer to the [flow tutorial](https://github.com/intelligentnode/Intelli/wiki/Flows).
+
 # Pillars
 - **The wrapper layer** provides low-level access to the latest AI models.
 - **The controller layer** offers a unified input to any AI model by handling the differences.

diff --git a/README.md b/README.md
@@ -28,6 +28,34 @@ pip install intelli
 
 # Code Examples
 
+## Create Chatbot
+Switch between multiple chatbot providers without changing your code.
+
+```python
+from intelli.function.chatbot import Chatbot
+from intelli.model.input.chatbot_input import ChatModelInput
+
+def call_chatbot(provider, model=None):
+    # prepare common input 
+    input = ChatModelInput("You are a helpful assistant.", model)
+    input.add_user_message("What is the capital of France?")
+
+    # creating chatbot instance
+    openai_bot = Chatbot(YOUR_API_KEY, provider)
+    response = openai_bot.chat(input)
+
+    return response
+
+# call openai
+call_chatbot("openai", "gpt-4")
+
+# call mistralai
+call_chatbot("mistral", "mistral-medium")
+
+# call google gemini
+call_chatbot("gemini")
+```
+
 ## Create AI Flows
 You can create a flow of tasks executed by different AI models. Here's an example of creating a blog post flow:
 
@@ -62,34 +90,6 @@ final_result = flow.start()
 
 To build async flows with multiple paths, refer to the [flow tutorial](https://github.com/intelligentnode/Intelli/wiki/Flows).
 
-## Create Chatbot
-Switch between multiple chatbot providers without changing your code.
-
-```python
-from intelli.function.chatbot import Chatbot
-from intelli.model.input.chatbot_input import ChatModelInput
-
-def call_chatbot(provider, model=None):
-    # prepare common input 
-    input = ChatModelInput("You are a helpful assistant.", model)
-    input.add_user_message("What is the capital of France?")
-
-    # creating chatbot instance
-    openai_bot = Chatbot(YOUR_OPENAI_API_KEY, "openai")
-    response = openai_bot.chat(input)
-
-    return response
-
-# call openai
-call_chatbot("openai", "gpt-4")
-
-# call mistralai
-call_chatbot("mistral", "mistral-medium")
-
-# call google gemini
-call_chatbot("gemini")
-```
-
 
 ## Connect Your Docs With Chatbot 
 IntelliPy allows you to chat with your docs using multiple LLMs. To connect your data, visit the [IntelliNode App](https://app.intellinode.ai/), start a project using the Document option, upload your documents or images, and copy the generated One Key. This key will be used to connect the chatbot to your uploaded data.

diff --git a/instructions/run_integration_text.sh b/instructions/run_integration_text.sh
@@ -22,6 +22,9 @@ python3 -m unittest intelli.test.integration.test_remote_embed_model
 # images
 python3 -m unittest intelli.test.integration.test_remote_image_model
 
+# vision
+python3 -m unittest intelli.test.integration.test_remote_vision_model
+
 ## functions
 # chatbot
 python3 -m unittest intelli.test.integration.test_chatbot

diff --git a/intelli/controller/remote_vision_model.py b/intelli/controller/remote_vision_model.py
@@ -0,0 +1,44 @@
+from intelli.wrappers.openai_wrapper import OpenAIWrapper
+from intelli.wrappers.geminiai_wrapper import GeminiAIWrapper
+from intelli.model.input.vision_input import VisionModelInput
+
+class RemoteVisionModel:
+    supported_vision_models = {
+        "openai": OpenAIWrapper,
+        "gemini": GeminiAIWrapper,
+    }
+
+    def __init__(self, api_key, provider="openai"):
+
+        self.api_key = api_key
+
+        if provider in self.supported_vision_models:
+            self.provider = provider
+            self.provider_wrapper = self.supported_vision_models[provider](api_key)
+        else:
+            supported_models = ", ".join(self.supported_vision_models.keys())
+            raise ValueError(f"The provided provider {provider} not supported. Supported providers: {supported_models}")
+
+    def image_to_text(self, vision_input):
+
+        if isinstance(vision_input, dict):
+            inputs = vision_input
+        elif isinstance(vision_input, VisionModelInput):
+            inputs = vision_input.get_provider_inputs(self.provider)
+        else:
+            raise ValueError("vision_input must be an instance of VisionModelInput or a dictionary.")
+
+
+        if self.provider == "openai":
+            return self.call_openai_vision(inputs)
+        elif self.provider == "gemini":
+            return self.call_gemini_vision(inputs)
+
+
+    def call_openai_vision(self, inputs):
+        data = self.provider_wrapper.image_to_text(inputs)
+        return " ".join(choice['message']['content'] for choice in data['choices'])
+
+    def call_gemini_vision(self, inputs):
+        data = self.provider_wrapper.image_to_text_params(inputs)
+        return " ".join(part['text'] for part in data['candidates'][0]['content']['parts'])
diff --git a/intelli/flow/agents/agent.py b/intelli/flow/agents/agent.py
@@ -6,6 +6,8 @@
 from intelli.model.input.chatbot_input import ChatModelInput
 from intelli.model.input.image_input import ImageModelInput
 from intelli.flow.input.agent_input import AgentInput, TextAgentInput, ImageAgentInput
+from intelli.controller.remote_vision_model import RemoteVisionModel
+from intelli.model.input.vision_input import VisionModelInput
 
 
 class BasicAgent(ABC):
@@ -31,14 +33,24 @@ def execute(self, agent_input: AgentInput):
 
         # Check the agent type and call the appropriate function
         if self.type == AgentTypes.TEXT.value:
-            chatbot = Chatbot(self.model_params['key'], self.provider, self.options)
             chat_input = ChatModelInput(self.mission, model=self.model_params.get('model'))
+
+            chatbot = Chatbot(self.model_params['key'], self.provider, self.options)
             chat_input.add_user_message(agent_input.desc)
             result = chatbot.chat(chat_input)[0]
         elif self.type == AgentTypes.IMAGE.value:
+            image_input = ImageModelInput(prompt=self.mission + ": " + agent_input.desc, model=self.model_params.get('model'))
+
             image_model = RemoteImageModel(self.model_params['key'], self.provider)
-            image_input = ImageModelInput(prompt=agent_input.desc, model=self.model_params.get('model'))
-            result = image_model.generate_images(image_input)
+            result = image_model.generate_images(image_input)[0]
+        elif self.type == AgentTypes.VISION.value:
+            vision_input = VisionModelInput(content=self.mission + ": " + agent_input.desc, 
+                                            image_data=agent_input.img, 
+                                            extension=self.model_params.get('extension', 'png'),
+                                            model=self.model_params['model'])
+
+            vision_model = RemoteVisionModel(self.model_params['key'], self.provider)
+            result = vision_model.image_to_text(vision_input)
         else:
             raise ValueError(f"Unsupported agent type: {self.type}.")
 

diff --git a/intelli/flow/flow.py b/intelli/flow/flow.py
@@ -1,6 +1,7 @@
 import asyncio
 import networkx as nx
 from intelli.utils.logging import Logger
+from intelli.flow.types import AgentTypes, InputTypes, Matcher
 from functools import partial
 
 
@@ -42,8 +43,14 @@ async def _execute_task(self, task_name):
                 print(f"Warning: Output for predecessor task '{pred}' not found. Skipping...")
 
         self.logger.log(f'The number of combined inputs for task {task_name} is {len(predecessor_outputs)}')
-        merged_input = " ".join(predecessor_outputs)
         merged_type = next(iter(predecessor_types)) if len(predecessor_types) == 1 else None
+        if merged_type and merged_type == InputTypes.TEXT.value:
+            merged_input = " ".join(predecessor_outputs)
+        elif predecessor_outputs:
+            # use only the first predecessor output when the outputs are not combinable text
+            merged_input = predecessor_outputs[0]
+        else:
+            merged_input = None
 
         # Execute task with merged input
         loop = asyncio.get_event_loop()

diff --git a/intelli/flow/tasks/task.py b/intelli/flow/tasks/task.py
@@ -1,47 +1,81 @@
 from intelli.flow.template.basic_template import TextInputTemplate
-from intelli.flow.types import AgentTypes, InputTypes
+from intelli.flow.types import AgentTypes, InputTypes, Matcher
 from intelli.utils.logging import Logger
 from intelli.flow.input.agent_input import AgentInput, TextAgentInput, ImageAgentInput
 
 
 class Task:
     def __init__(self, task_input, agent, exclude=False, pre_process=None,
                  post_process=None, template=None, log=False):
+        self.task_input = task_input
         self.desc = task_input.desc
         self.agent = agent
         self.pre_process = pre_process
         self.post_process = post_process
         self.exclude = exclude
         self.output = None
-        self.output_type = agent.type
+        self.output_type = Matcher.output[agent.type]
         self.template = template
         self.logger = Logger(log)
-        if not template and agent.type in [AgentTypes.TEXT.value, AgentTypes.IMAGE.value]:
+        if not template and Matcher.input[agent.type] in [InputTypes.TEXT.value]:
             self.template = TextInputTemplate(self.desc)
 
     def execute(self, input_data=None, input_type=None):
 
         # logging
-        if input_type in [InputTypes.TEXT.value, InputTypes.IMAGE.value]:
+        if input_type in [InputTypes.TEXT.value]:
             self.logger.log_head('- Inside the task with input data head: ', input_data)
-        elif input_type == InputTypes.IMAGE.value and self.agent.type in [AgentTypes.TEXT.value,
-                                                                          AgentTypes.IMAGE.value]:
-            self.logger.log_head('- Inside the task. the previous step input not supported')
+        elif input_type == InputTypes.IMAGE.value and self.agent.type in [AgentTypes.TEXT.value, AgentTypes.IMAGE.value]:
+            self.logger.log('- Inside the task. the previous step input not supported')
+        elif input_type == InputTypes.IMAGE.value:
+            self.logger.log('- Inside the task with previous image, size: ', len(input_data))
 
         # Run task pre procesing
         if self.pre_process:
             input_data = self.pre_process(input_data)
 
-        # Apply template
-        if input_data and input_type in [InputTypes.TEXT.value, InputTypes.IMAGE.value]:
+        # Apply input template
+        if input_data and input_type in [InputTypes.TEXT.value]:
             agent_text = self.template.apply_input(input_data)
             # log
             self.logger.log_head('- Input data with template: ', agent_text)
         else:
             agent_text = self.desc
 
+        # Prepare the input
+        agent_inputs = []
+        if Matcher.input[self.agent.type] == InputTypes.IMAGE.value:
+
+            if self.task_input.img:
+                agent_input = ImageAgentInput(desc=agent_text, img=self.task_input.img)
+                agent_inputs.append(agent_input)
+
+            # also pass the previous task's image output as an input when no task image was given, or when the agent returns text (so both results can be joined)
+            if len(agent_inputs) == 0 or Matcher.output[self.agent.type] == InputTypes.TEXT.value:
+                if input_data and input_type == InputTypes.IMAGE.value:
+                    agent_input = ImageAgentInput(desc=agent_text, img=input_data)
+                    agent_inputs.append(agent_input)
+
+        elif Matcher.input[self.agent.type] == AgentTypes.TEXT.value:
+            agent_input = TextAgentInput(agent_text)
+            agent_inputs.append(agent_input)
+
         # Check the agent type and call the appropriate function
-        result = self.agent.execute(TextAgentInput(agent_text))
+        combined_results = []
+        for current_agent_input in agent_inputs:
+
+            result = self.agent.execute(current_agent_input)
+
+            if isinstance(result, list):
+                combined_results.extend(result)
+            else:
+                combined_results.append(str(result))
+
+        if Matcher.output[self.agent.type] == InputTypes.TEXT.value:
+            result = " ".join(combined_results)
+        else:
+            # keep only the first result for non-text outputs
+            result = combined_results[0]
 
         # log
         if self.agent.type in [AgentTypes.TEXT.value]:

diff --git a/intelli/flow/types.py b/intelli/flow/types.py
@@ -4,8 +4,23 @@
 class AgentTypes(Enum):
     TEXT = 'text'
     IMAGE = 'image'
+    VISION = 'vision'
 
 
 class InputTypes(Enum):
     TEXT = 'text'
     IMAGE = 'image'
+    VISION = 'vision'
+
+class Matcher():
+    input = {
+        'text': 'text',
+        'image': 'text',
+        'vision': 'image'
+    }
+
+    output = {
+        'text': 'text',
+        'image': 'image',
+        'vision': 'text'
+    }