52 changes: 52 additions & 0 deletions scripts/run_evolution.py
@@ -0,0 +1,52 @@
import asyncio
import structlog

from skyvern.evolution.evolve import Evolve
from skyvern.evolution.prompt_manager import PromptManager

LOG = structlog.get_logger()

async def main():
    """
    Main function to run the prompt evolution loop.
    """
    LOG.info("Initializing prompt evolution process...")

    prompt_manager = PromptManager()
    evolver = Evolve(prompt_manager)

    # Check if the baseline prompt was loaded correctly
    if not prompt_manager.get_prompt("baseline"):
        LOG.error("Failed to load baseline prompt. Aborting evolution process.")
        return

    LOG.info("Starting evolution loop...")

    # Run the evolution loop for a few generations as a demonstration
    num_generations = 5
    for i in range(num_generations):
        LOG.info(f"--- Generation {i+1}/{num_generations} ---")

        # Evolve the prompts to create new variations
        await evolver.evolve_prompts()

        # Evaluate the performance of the new prompts
        evolver.evaluate_and_score_prompts()

        # Log the best prompt of the current generation
        best_prompt = prompt_manager.get_best_prompt()
        if best_prompt:
            LOG.info(f"Best prompt of generation {i+1}: '{best_prompt.name}' with score {best_prompt.score}")
        else:
            LOG.warning("No prompts in manager after evolution and evaluation.")

        # In a real application, you might add a delay or run this as a continuous background process
        await asyncio.sleep(5)

    LOG.info("Evolution loop finished.")
Comment on lines +9 to +46
🛠️ Refactor suggestion | 🟠 Major

Add type hints to the main function.

The main() function is missing a return type annotation, which the coding guidelines require.

As per coding guidelines:

-async def main():
+async def main() -> None:
     """
     Main function to run the prompt evolution loop.
     """
🤖 Prompt for AI Agents
In scripts/run_evolution.py around lines 9 to 46, the async main() function is
missing a return type annotation; update its signature to include an explicit
return type (async def main() -> None:) to satisfy the coding guidelines and
ensure the coroutine is annotated as returning None; no other behavioral changes
are required.


if __name__ == "__main__":
    # This script needs to be run in an environment where the skyvern package is installed
    # and the necessary configurations (like .env for LLM providers) are set up.
    # Example: poetry run python scripts/run_evolution.py
    asyncio.run(main())
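The closing comment in main() notes that, in a real application, the loop could run continuously instead of for a fixed number of generations. A minimal sketch of that variant, reusing the same imports and classes from this script (illustrative only; the run_forever name and the interval are assumptions, not part of this PR):

async def run_forever(interval_seconds: float = 300.0) -> None:
    """Hypothetical continuous variant of the evolution loop."""
    prompt_manager = PromptManager()
    evolver = Evolve(prompt_manager)
    if not prompt_manager.get_prompt("baseline"):
        LOG.error("Failed to load baseline prompt. Aborting evolution process.")
        return
    while True:
        # One generation: mutate the best prompt, then re-score the population.
        await evolver.evolve_prompts()
        evolver.evaluate_and_score_prompts()
        await asyncio.sleep(interval_seconds)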
1 change: 1 addition & 0 deletions skyvern/evolution/__init__.py
@@ -0,0 +1 @@
# This file is intentionally left blank to mark the directory as a Python package.
74 changes: 74 additions & 0 deletions skyvern/evolution/evolve.py
@@ -0,0 +1,74 @@
import structlog
import random
Remove the 'random' import if it is unused.

Suggested change
import random

🛠️ Refactor suggestion | 🟠 Major

Remove unused import.

The random module is imported but never used in this file.

 import structlog
-import random
 
 from skyvern.forge.prompts import prompt_engine
🤖 Prompt for AI Agents
In skyvern/evolution/evolve.py around line 2, the file imports the random module
which is unused; remove the unused import statement (delete or comment out the
"import random" line) to clean up imports and avoid linter warnings.


from skyvern.forge.prompts import prompt_engine
from skyvern.forge.sdk.llm import LLM_API_HANDLER

LOG = structlog.get_logger()

class Evolve:
    def __init__(self, prompt_manager):
        self.prompt_manager = prompt_manager
        self.evolution_count = 0

    async def evolve_prompts(self):
        """
        Takes the top-performing prompts and uses an LLM to generate new variations.
        """
        best_prompt = self.prompt_manager.get_best_prompt()
        if not best_prompt:
            LOG.warning("No prompts found to evolve.")
            return

        LOG.info(f"Evolving prompt '{best_prompt.name}' with score {best_prompt.score}")

        # Use an LLM to generate a new variation of the prompt.
        evolution_prompt = prompt_engine.load_prompt(
            "evolve-prompt",
            prompt_to_evolve=best_prompt.template,
        )

        # In a real implementation, a 'step' object would be passed here.
        # This is a placeholder for demonstration purposes.
        response = await LLM_API_HANDLER(prompt=evolution_prompt, step=None)
Consider adding error handling around the LLM_API_HANDLER call to catch unexpected failures.
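One hedged sketch of such a guard, in the same suggestion style used elsewhere in this review (illustrative only; skipping the evolution round on failure is an assumption, not something the PR or the reviewer specifies):

-        response = await LLM_API_HANDLER(prompt=evolution_prompt, step=None)
+        try:
+            response = await LLM_API_HANDLER(prompt=evolution_prompt, step=None)
+        except Exception:
+            # Hypothetical fallback: log and skip this round rather than crash the loop.
+            LOG.error("LLM call failed while evolving prompt.", exc_info=True)
+            return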


        # Assuming the response is the raw string of the new prompt
        evolved_prompt_str = response if isinstance(response, str) else str(response)

        # Add the new prompt to the population
        self.evolution_count += 1
        new_prompt_name = f"evolved_v{self.evolution_count}"
        self.prompt_manager.add_prompt(new_prompt_name, evolved_prompt_str, score=0)

        LOG.info(f"Evolved new prompt '{new_prompt_name}': {evolved_prompt_str[:100]}...")

    def evaluate_and_score_prompts(self):
        """
        Simulates the evaluation of prompts and updates their scores based on deterministic criteria.
        In a real-world scenario, this would involve running benchmarks.
        """
        LOG.info("Evaluating and scoring prompts...")
        for name, prompt in self.prompt_manager.prompts.items():
            # Skip the baseline prompt as its score is fixed.
            if name == "baseline":
                continue

            score = 0
            # Score based on length (ideal length between 500 and 1500 characters)
            length = len(prompt.template)
            if 500 <= length <= 1500:
                score += 0.5
            else:
                score -= 0.2

            # Score based on presence of keywords
            keywords = ["action", "reasoning", "COMPLETE", "TERMINATE", "element", "goal"]
            for keyword in keywords:
                if keyword in prompt.template.lower():
                    score += 0.2

            # Normalize score to be between 0 and 2 for this simulation
            normalized_score = max(0, min(2, score))

            self.prompt_manager.update_score(name, normalized_score)
            LOG.info(f"Evaluated '{name}', assigned score: {normalized_score}")
Comment on lines +9 to +74

🛠️ Refactor suggestion | 🟠 Major

Add type hints to the Evolve class.

The entire Evolve class is missing type hints for method parameters and return values, which violates the coding guidelines for Python 3.11+.

As per coding guidelines:

+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from skyvern.evolution.prompt_manager import PromptManager
+
 class Evolve:
-    def __init__(self, prompt_manager):
+    def __init__(self, prompt_manager: "PromptManager") -> None:
         self.prompt_manager = prompt_manager
         self.evolution_count = 0
 
-    async def evolve_prompts(self):
+    async def evolve_prompts(self) -> None:
         """
         Takes the top-performing prompts and uses an LLM to generate new variations.
         """
         # ... rest of method
 
-    def evaluate_and_score_prompts(self):
+    def evaluate_and_score_prompts(self) -> None:
         """
         Simulates the evaluation of prompts and updates their scores based on deterministic criteria.
         In a real-world scenario, this would involve running benchmarks.
         """
         # ... rest of method
🤖 Prompt for AI Agents
In skyvern/evolution/evolve.py around lines 9-74, the Evolve class and its
methods lack Python 3.11+ type hints; add explicit type annotations for the
class attributes and method signatures: annotate __init__ to accept
prompt_manager: "PromptManager" (use a forward reference or import the
PromptManager type), self.evolution_count: int, and self.prompt_manager:
"PromptManager"; annotate async def evolve_prompts(self) -> None and def
evaluate_and_score_prompts(self) -> None; annotate local variables where helpful
(e.g., best_prompt: Optional[Prompt], response: Any, evolved_prompt_str: str,
new_prompt_name: str, score: float, normalized_score: float) and ensure you
import necessary typing items (Optional, Any, Optional["Prompt"] or a Prompt
type, and if needed Coroutine) or reference existing project types; update
function and variable annotations accordingly without changing logic.
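A sketch of how those local annotations might look inside evolve_prompts (illustrative only; it assumes "from typing import Any" and "from skyvern.evolution.prompt_manager import Prompt" at module level):

    async def evolve_prompts(self) -> None:
        best_prompt: Prompt | None = self.prompt_manager.get_best_prompt()
        if not best_prompt:
            LOG.warning("No prompts found to evolve.")
            return
        evolution_prompt = prompt_engine.load_prompt(
            "evolve-prompt",
            prompt_to_evolve=best_prompt.template,
        )
        response: Any = await LLM_API_HANDLER(prompt=evolution_prompt, step=None)
        evolved_prompt_str: str = response if isinstance(response, str) else str(response)
        self.evolution_count += 1
        new_prompt_name: str = f"evolved_v{self.evolution_count}"
        self.prompt_manager.add_prompt(new_prompt_name, evolved_prompt_str, score=0)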

68 changes: 68 additions & 0 deletions skyvern/evolution/prompt_manager.py
@@ -0,0 +1,68 @@
import structlog

from skyvern.forge.prompts import prompt_engine

LOG = structlog.get_logger()

class Prompt:
    def __init__(self, name, template, score=0):
        self.name = name
        self.template = template
        self.score = score
Comment on lines +7 to +11

🛠️ Refactor suggestion | 🟠 Major

Add type hints and class docstring.

The Prompt class is missing type hints for its __init__ parameters and lacks a class-level docstring describing its purpose.

As per coding guidelines, apply this diff:

 class Prompt:
+    """
+    Represents a prompt template with its associated metadata.
+    
+    Attributes:
+        name: Unique identifier for the prompt.
+        template: The Jinja2 template string.
+        score: Performance score for ranking (default: 0).
+    """
-    def __init__(self, name, template, score=0):
+    def __init__(self, name: str, template: str, score: float = 0) -> None:
         self.name = name
         self.template = template
         self.score = score
🤖 Prompt for AI Agents
In skyvern/evolution/prompt_manager.py around lines 7 to 11, the Prompt class
lacks a class-level docstring and type hints; add a concise docstring explaining
that Prompt represents a named prompt template with an associated score,
annotate the class attributes (name: str, template: str, score: int = 0) and
update the __init__ signature to use type hints (def __init__(self, name: str,
template: str, score: int = 0) -> None:) so static type checkers and IDEs can
validate usage.


class PromptManager:
    def __init__(self):
        self.prompts = {}
        self._load_baseline_prompt()

    def _load_baseline_prompt(self):
        """
        Loads the original 'extract-action.j2' prompt as the baseline.
        """
        try:
            # Access the Jinja2 environment from the prompt_engine
            env = prompt_engine.env
            # Construct the path to the template within the Jinja2 environment
            template_path = "skyvern/extract-action.j2"
            # Get the template source from the loader
            baseline_template = env.loader.get_source(env, template_path)[0]

            self.add_prompt("baseline", baseline_template, score=1.0)  # Assuming baseline is good.
            LOG.info("Loaded baseline prompt 'extract-action.j2'.")
        except Exception as e:
            LOG.error(f"Failed to load baseline prompt: {e}", exc_info=True)

    def add_prompt(self, name, template, score=0):
        """
        Adds a new prompt to the population.
        """
        if name in self.prompts:
            LOG.warning(f"Prompt with name '{name}' already exists. Overwriting.")

        self.prompts[name] = Prompt(name, template, score)
        LOG.info(f"Added prompt '{name}' with score {score}.")

    def get_prompt(self, name):
        """
        Retrieves a prompt object by its name.
        """
        return self.prompts.get(name)

    def get_best_prompt(self):
        """
        Returns the prompt with the highest score.
        """
        if not self.prompts:
            return None

        return max(self.prompts.values(), key=lambda p: p.score)

    def update_score(self, name, score):
        """
        Updates the score of a prompt after evaluation.
        """
        if name in self.prompts:
            self.prompts[name].score = score
            LOG.info(f"Updated score for prompt '{name}' to {score}.")
        else:
            LOG.warning(f"Prompt '{name}' not found for score update.")
Comment on lines +13 to +68

🛠️ Refactor suggestion | 🟠 Major

Add type hints to all methods.

The PromptManager class methods are missing type hints for parameters and return values, which the coding guidelines for Python 3.11+ require.

As per coding guidelines, apply these changes:

+from typing import Optional
+
 class PromptManager:
-    def __init__(self):
+    def __init__(self) -> None:
         self.prompts = {}
         self._load_baseline_prompt()
 
-    def _load_baseline_prompt(self):
+    def _load_baseline_prompt(self) -> None:
         """
         Loads the original 'extract-action.j2' prompt as the baseline.
         """
         # ... rest of method
 
-    def add_prompt(self, name, template, score=0):
+    def add_prompt(self, name: str, template: str, score: float = 0) -> None:
         """
         Adds a new prompt to the population.
         """
         # ... rest of method
 
-    def get_prompt(self, name):
+    def get_prompt(self, name: str) -> Optional[Prompt]:
         """
         Retrieves a prompt object by its name.
         """
         return self.prompts.get(name)
 
-    def get_best_prompt(self):
+    def get_best_prompt(self) -> Optional[Prompt]:
         """
         Returns the prompt with the highest score.
         """
         # ... rest of method
 
-    def update_score(self, name, score):
+    def update_score(self, name: str, score: float) -> None:
         """
         Updates the score of a prompt after evaluation.
         """
         # ... rest of method

60 changes: 46 additions & 14 deletions skyvern/forge/agent.py
@@ -1316,11 +1316,31 @@ async def _build_extract_action_prompt(
)

task_type = task.task_type if task.task_type else TaskType.general
template = ""

# Determine which template to use. Evolved prompts are handled as raw strings,
# while standard prompts are handled by name.
template_name: str | None = None
template_str: str | None = None

if task_type == TaskType.general:
template = "extract-action"
# For general tasks, try to use the best prompt from our evolution manager.
best_prompt = app.PROMPT_MANAGER.get_best_prompt()
if best_prompt:
LOG.info(f"Using evolved prompt: {best_prompt.name} with score {best_prompt.score}")
template_str = best_prompt.template
else:
# If no evolved prompts, fall back to the baseline prompt.
LOG.warning("PromptManager has no prompts. Falling back to baseline 'extract-action'.")
baseline_prompt = app.PROMPT_MANAGER.get_prompt("baseline")
if baseline_prompt:
template_str = baseline_prompt.template
else:
# If even the baseline is missing, this is a critical error.
LOG.error("Baseline prompt could not be loaded from PromptManager.")
# As a last resort, use the template name.
template_name = "extract-action"
elif task_type == TaskType.validation:
template = "decisive-criterion-validate"
template_name = "decisive-criterion-validate"
elif task_type == TaskType.action:
prompt = prompt_engine.load_prompt("infer-action-type", navigation_goal=navigation_goal)
json_response = await app.LLM_API_HANDLER(prompt=prompt, step=step)
@@ -1329,26 +1349,22 @@
reason=json_response.get("thought"), error_type=json_response.get("error")
)

action_type: str = json_response.get("action_type") or ""
action_type = ActionType[action_type.upper()]
action_type_str: str = json_response.get("action_type") or ""
action_type = ActionType[action_type_str.upper()]

if action_type == ActionType.CLICK:
template = "single-click-action"
template_name = "single-click-action"
elif action_type == ActionType.INPUT_TEXT:
template = "single-input-action"
template_name = "single-input-action"
elif action_type == ActionType.UPLOAD_FILE:
template = "single-upload-action"
template_name = "single-upload-action"
elif action_type == ActionType.SELECT_OPTION:
template = "single-select-action"
template_name = "single-select-action"
else:
raise UnsupportedActionType(action_type=action_type)

if not template:
raise UnsupportedTaskType(task_type=task_type)

context = skyvern_context.ensure_context()
return prompt_engine.load_prompt(
template=template,
render_kwargs = dict(
navigation_goal=navigation_goal,
navigation_payload_str=json.dumps(final_navigation_payload),
starting_url=starting_url,
@@ -1363,6 +1379,22 @@
terminate_criterion=task.terminate_criterion,
)

if template_str is not None:
# Render the prompt from a raw string (used for evolved prompts)
return prompt_engine.load_prompt_from_string(
template=template_str,
**render_kwargs,
)

if template_name is not None:
# Render the prompt from a template file by name (standard behavior)
return prompt_engine.load_prompt(
template=template_name,
**render_kwargs,
)

raise UnsupportedTaskType(task_type=task_type)

def _build_navigation_payload(
self,
task: Task,
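Note that the new template_str branch calls prompt_engine.load_prompt_from_string, a helper that is not shown in this hunk. A minimal sketch of what such a helper might look like on the prompt engine, assuming it exposes the same Jinja2 environment that prompt_manager.py reads via prompt_engine.env (hypothetical, not necessarily the actual implementation in this PR):

    def load_prompt_from_string(self, template: str, **kwargs: Any) -> str:
        """Render a raw template string with the engine's Jinja2 environment (assumes typing.Any is imported)."""
        jinja_template = self.env.from_string(template)
        return jinja_template.render(**kwargs)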
2 changes: 2 additions & 0 deletions skyvern/forge/app.py
Expand Up @@ -2,6 +2,7 @@

from fastapi import FastAPI

from skyvern.evolution.prompt_manager import PromptManager
from skyvern.forge.agent import ForgeAgent
from skyvern.forge.agent_functions import AgentFunction
from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory
@@ -43,4 +44,5 @@
authentication_function: Callable[[str], Awaitable[Organization]] | None = None
setup_api_app: Callable[[FastAPI], None] | None = None

PROMPT_MANAGER = PromptManager()
agent = ForgeAgent()
17 changes: 17 additions & 0 deletions skyvern/forge/prompts/skyvern/evolve-prompt.j2
@@ -0,0 +1,17 @@
You are an expert in prompt engineering for large language models that control web automation agents.
Your task is to evolve the following prompt to make it more effective. The goal is to improve the agent's ability to understand a webpage and decide on the next best action to achieve a user's goal.

Here are some principles for a good prompt:
- **Clarity and Conciseness:** The prompt should be easy for the LLM to understand. Avoid ambiguity.
- **Role-setting:** Clearly define the role and capabilities of the agent.
- **Comprehensive Context:** Ensure all necessary information (like page elements, user goal, history) is presented logically.
- **Action-oriented:** The prompt should guide the LLM towards producing a concrete, executable action.
- **Robustness:** The prompt should encourage the model to handle unexpected situations gracefully (e.g., by providing fallback actions or reasoning about errors).

Here is the prompt to evolve:
---
{{ prompt_to_evolve }}
---

Based on the principles above, please provide a new, improved version of this prompt.
Only output the new prompt template. Do not include any other text or explanation.