Commit f1daca9

feat:[DocAgent] Dynamic RAG
1 parent d646be4 commit f1daca9

File tree

2 files changed: +271 -33 lines


autogen/agents/experimental/document_agent/document_agent.py

Lines changed: 3 additions & 0 deletions
@@ -84,6 +84,7 @@ def __init__(
         parsed_docs_path: str | Path | None = None,
         collection_name: str | None = None,
         query_engine: RAGQueryEngine | None = None,
+        rag_config: dict[str, dict[str, Any]] | None = None,  # NEW: {"vector": {}, "graph": {...}}
     ):
         """Initialize the DocAgent.

@@ -94,6 +95,7 @@ def __init__(
             parsed_docs_path: The path where parsed documents will be stored.
             collection_name: The unique name for the data store collection.
             query_engine: The query engine to use for querying documents.
+            rag_config: Configuration for RAG engines {"vector": {}, "graph": {...}}.
         """
         name = name or "DocAgent"
         llm_config = llm_config or LLMConfig.get_current_llm_config()
@@ -120,6 +122,7 @@ def __init__(
             query_engine=query_engine,
             parsed_docs_path=parsed_docs_path,
             collection_name=collection_name,
+            rag_config=rag_config,  # NEW
         )

         def update_ingested_documents() -> None:
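
For context, a minimal usage sketch of the new rag_config parameter (an illustrative assumption, not part of this commit; the collection name and Neo4j credentials below are placeholders):

from autogen.agents.experimental import DocAgent

# Hypothetical setup: enable both RAG engines. An empty dict for "vector"
# means VectorChromaQueryEngine defaults; the "graph" dict is forwarded to
# the Neo4j engine factory added in task_manager.py below.
agent = DocAgent(
    collection_name="my_docs",  # placeholder collection name
    rag_config={
        "vector": {},
        "graph": {
            "host": "bolt://localhost",
            "port": 7687,
            "username": "neo4j",
            "password": "neo4j",  # placeholder; use real credentials
        },
    },
)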

autogen/agents/experimental/document_agent/task_manager.py

Lines changed: 268 additions & 33 deletions
@@ -4,11 +4,18 @@

 import asyncio
 import logging
+import tempfile
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 from typing import Any

+import fitz  # PyMuPDF  # pyright: ignore[reportMissingImports]
+import requests
+import urllib3
+
 from autogen import ConversableAgent
+from autogen.agentchat.contrib.capabilities.text_compressors import LLMLingua
+from autogen.agentchat.contrib.capabilities.transforms import TextMessageCompressor
 from autogen.agentchat.contrib.rag.query_engine import RAGQueryEngine
 from autogen.agentchat.group.context_variables import ContextVariables
 from autogen.agentchat.group.reply_result import ReplyResult
@@ -43,6 +50,33 @@
 """


+def extract_text_from_pdf(doc_path: str) -> list[dict[str, str]]:
+    """Extract compressed text from a PDF file."""
+    if isinstance(doc_path, str) and urllib3.util.url.parse_url(doc_path).scheme:
+        # Download the PDF
+        response = requests.get(doc_path)
+        response.raise_for_status()  # Ensure the download was successful
+
+        text = ""
+        # Save the PDF to a temporary file
+        with tempfile.TemporaryDirectory() as temp_dir:
+            with open(Path(temp_dir) / "temp.pdf", "wb") as f:
+                f.write(response.content)
+
+            # Open the PDF
+            with fitz.open(Path(temp_dir) / "temp.pdf") as doc:
+                # Read and extract text from each page
+                for page in doc:
+                    text += page.get_text()
+        llm_lingua = LLMLingua()
+        text_compressor = TextMessageCompressor(text_compressor=llm_lingua)
+        compressed_text = text_compressor.apply_transform([{"content": text}])
+
+        return compressed_text
+    else:
+        raise ValueError("doc_path must be a URL string")
+
+
 @export_module("autogen.agents.experimental")
 class TaskManagerAgent(ConversableAgent):
     """TaskManagerAgent with integrated tools for document ingestion and query processing."""
@@ -57,6 +91,7 @@ def __init__(
         return_agent_error: str = "SummaryAgent",
         collection_name: str | None = None,
         max_workers: int | None = None,
+        rag_config: dict[str, dict[str, Any]] | None = None,
     ):
         """Initialize the TaskManagerAgent.

@@ -69,55 +104,222 @@ def __init__(
             return_agent_error: The agent to return on error
             collection_name: The collection name for the RAG query engine
             max_workers: Maximum number of threads for concurrent processing (None for default)
+            rag_config: Configuration for RAG engines {"vector": {}, "graph": {...}}
         """
-        self.query_engine = query_engine if query_engine else VectorChromaQueryEngine(collection_name=collection_name)
+        self.rag_config = rag_config or {"vector": {}}  # Default to vector only
         self.parsed_docs_path = Path(parsed_docs_path) if parsed_docs_path else Path("./parsed_docs")
         self.executor = ThreadPoolExecutor(max_workers=max_workers)

+        # Initialize RAG engines
+        self.rag_engines = self._create_rag_engines(collection_name)
+
+        # Keep backward compatibility
+        self.query_engine = query_engine if query_engine else self.rag_engines.get("vector")
+
+        def _aggregate_rag_results(self: "TaskManagerAgent", query: str, results: dict[str, Any]) -> str:
+            """Aggregate results from multiple RAG engines."""
+            if not results:
+                return f"Query: {query}\nAnswer: No results found from any RAG engine."
+
+            # Simple aggregation
+            answer_parts = [f"Query: {query}"]
+
+            for engine_name, result in results.items():
+                answer_parts.append(f"\n{engine_name.upper()} Results:")
+                answer_parts.append(f"Answer: {result.get('answer', 'No answer available')}")
+
+                # Add citations if available
+                if "citations" in result and result["citations"]:
+                    answer_parts.append("Citations:")
+                    for i, citation in enumerate(result["citations"], 1):
+                        answer_parts.append(f"  [{i}] {citation.get('file_path', 'Unknown')}")
+
+            return "\n".join(answer_parts)
+
         def _process_single_document(self: "TaskManagerAgent", input_file_path: str) -> tuple[str, bool, str]:
             """Process a single document. Returns (path, success, error_msg)."""
+
+            def compress_and_save_text(text: str, input_path: str) -> str:
+                """Compress text and save as markdown file."""
+                llm_lingua = LLMLingua()
+                text_compressor = TextMessageCompressor(text_compressor=llm_lingua)
+                compressed_text = text_compressor.apply_transform([{"content": text}])
+
+                # Create a markdown file with the extracted text
+                output_file = self.parsed_docs_path / f"{Path(input_path).stem}.md"
+                self.parsed_docs_path.mkdir(parents=True, exist_ok=True)
+
+                with open(output_file, "w", encoding="utf-8") as f:
+                    f.write(compressed_text[0]["content"])
+
+                return str(output_file)
+
+            def ingest_to_engines(self: "TaskManagerAgent", output_file: str, input_path: str) -> None:
+                """Ingest document to configured RAG engines."""
+                from autogen.agentchat.contrib.graph_rag.document import Document, DocumentType
+
+                # Determine document type
+                doc_type = DocumentType.TEXT
+                if input_path.lower().endswith(".pdf"):
+                    doc_type = DocumentType.PDF
+                elif input_path.lower().endswith((".html", ".htm")):
+                    doc_type = DocumentType.HTML
+                elif input_path.lower().endswith(".json"):
+                    doc_type = DocumentType.JSON
+
+                # Create Document object for graph engines
+                graph_doc = Document(doctype=doc_type, path_or_url=output_file, data=None)
+
+                # Ingest to configured engines only
+                for rag_type in self.rag_config.keys():
+                    engine = self.rag_engines.get(rag_type)
+                    if engine is None:
+                        continue
+
+                    try:
+                        if rag_type == "vector":
+                            engine.add_docs(new_doc_paths_or_urls=[output_file])
+                        elif rag_type == "graph":
+                            # For graph engines, we need to initialize if not done already
+                            if not hasattr(engine, "_initialized"):
+                                engine.init_db([graph_doc])
+                                engine._initialized = True
+                            else:
+                                # Add new records to existing graph
+                                if hasattr(engine, "add_records"):
+                                    engine.add_records([graph_doc])
+                    except Exception as e:
+                        logger.warning(f"Failed to ingest to {rag_type} engine: {e}")
+
             try:
-                output_files = docling_parse_docs(
-                    input_file_path=input_file_path,
-                    output_dir_path=self.parsed_docs_path,
-                    output_formats=["markdown"],
-                )
+                # Check if the document is a PDF
+                is_pdf = False
+                if isinstance(input_file_path, str) and (
+                    input_file_path.lower().endswith(".pdf")
+                    or (urllib3.util.url.parse_url(input_file_path).scheme and input_file_path.lower().endswith(".pdf"))
+                ):
+                    # Check for PDF extension or URL ending with .pdf
+                    is_pdf = True
+
+                if is_pdf:
+                    # Handle PDF with PyMuPDF
+                    logger.info("PDF found, using PyMuPDF")
+                    if urllib3.util.url.parse_url(input_file_path).scheme:
+                        # Download the PDF
+                        response = requests.get(input_file_path)
+                        response.raise_for_status()
+
+                        text = ""
+                        # Save the PDF to a temporary file and extract text
+                        with tempfile.TemporaryDirectory() as temp_dir:
+                            temp_pdf_path = Path(temp_dir) / "temp.pdf"
+                            with open(temp_pdf_path, "wb") as f:
+                                f.write(response.content)
+
+                            # Open the PDF and extract text
+                            with fitz.open(temp_pdf_path) as doc:
+                                for page in doc:
+                                    text += page.get_text()
+
+                        # Compress and save
+                        output_file = compress_and_save_text(text, input_file_path)
+
+                        # Ingest to all active engines
+                        ingest_to_engines(self, output_file, input_file_path)
+
+                        return (input_file_path, True, "")
+                    else:
+                        # Local PDF file
+                        text = ""
+                        with fitz.open(input_file_path) as doc:
+                            for page in doc:
+                                text += page.get_text()
+
+                        # Compress and save
+                        output_file = compress_and_save_text(text, input_file_path)
+
+                        # Ingest to all active engines
+                        ingest_to_engines(self, output_file, input_file_path)

-                # Limit to one output markdown file for now.
-                if output_files:
-                    output_file = output_files[0]
-                    if output_file.suffix == ".md":
-                        self.query_engine.add_docs(new_doc_paths_or_urls=[output_file])
                         return (input_file_path, True, "")
+                else:
+                    # Handle non-PDF documents with docling
+                    output_files = docling_parse_docs(
+                        input_file_path=input_file_path,
+                        output_dir_path=self.parsed_docs_path,
+                        output_formats=["markdown"],
+                    )
+
+                    # Limit to one output markdown file for now.
+                    if output_files:
+                        parsed_output_file: Path = output_files[0]
+                        if parsed_output_file.suffix == ".md":
+                            # Ingest to all active engines
+                            ingest_to_engines(self, str(parsed_output_file), input_file_path)
+                            return (input_file_path, True, "")
+
+                    return (input_file_path, False, "No valid markdown output generated")

-                return (input_file_path, False, "No valid markdown output generated")
             except Exception as doc_error:
                 return (input_file_path, False, str(doc_error))

         def _execute_single_query(self: "TaskManagerAgent", query_text: str) -> tuple[str, str]:
-            """Execute a single query. Returns (query, result)."""
+            """Execute a single query across configured RAG engines. Returns (query, result)."""
             try:
-                # Check for citations support
-                if (
-                    hasattr(self.query_engine, "enable_query_citations")
-                    and getattr(self.query_engine, "enable_query_citations", False)
-                    and hasattr(self.query_engine, "query_with_citations")
-                    and callable(getattr(self.query_engine, "query_with_citations", None))
-                ):
-                    answer_with_citations = getattr(self.query_engine, "query_with_citations")(query_text)
-                    answer = answer_with_citations.answer
-                    txt_citations = [
-                        {
-                            "text_chunk": source.node.get_text(),
-                            "file_path": source.metadata.get("file_path", "Unknown"),
-                        }
-                        for source in answer_with_citations.citations
-                    ]
-                    logger.info(f"Citations: {txt_citations}")
-                else:
-                    answer = self.query_engine.query(query_text) if self.query_engine else "Query engine not available"
+                results = {}
+
+                # Only query engines that are configured in rag_config
+                for rag_type in self.rag_config.keys():
+                    engine = self.rag_engines.get(rag_type)
+                    if engine is None:
+                        continue

-                return (query_text, f"Query: {query_text}\nAnswer: {answer}")
+                    try:
+                        if rag_type == "vector":
+                            # Handle vector queries
+                            if (
+                                hasattr(engine, "enable_query_citations")
+                                and getattr(engine, "enable_query_citations", False)
+                                and hasattr(engine, "query_with_citations")
+                                and callable(getattr(engine, "query_with_citations", None))
+                            ):
+                                answer_with_citations = getattr(engine, "query_with_citations")(query_text)
+                                answer = answer_with_citations.answer
+                                txt_citations = [
+                                    {
+                                        "text_chunk": source.node.get_text(),
+                                        "file_path": source.metadata.get("file_path", "Unknown"),
+                                    }
+                                    for source in answer_with_citations.citations
+                                ]
+                                results[rag_type] = {"answer": answer, "citations": txt_citations}
+                                logger.info(f"Vector Citations: {txt_citations}")
+                            else:
+                                answer = engine.query(query_text) if engine else "Vector engine not available"
+                                results[rag_type] = {"answer": answer}
+
+                        elif rag_type == "graph":
+                            # Handle graph queries
+                            # Try to connect to existing graph if not already connected
+                            if not hasattr(engine, "index"):
+                                try:
+                                    engine.connect_db()
+                                    logger.info("Connected to existing Neo4j graph for querying")
+                                except Exception as connect_error:
+                                    logger.warning(f"Failed to connect to Neo4j graph: {connect_error}")
+                                    results[rag_type] = {"answer": f"Error connecting to graph: {connect_error}"}
+                                    continue
+
+                            graph_result = engine.query(query_text)
+                            results[rag_type] = {"answer": graph_result.answer, "results": graph_result.results}
+
+                    except Exception as engine_error:
+                        logger.warning(f"Failed to query {rag_type} engine: {engine_error}")
+                        results[rag_type] = {"answer": f"Error querying {rag_type}: {engine_error}"}
+
+                # Aggregate results
+                aggregated_answer = _aggregate_rag_results(self, query_text, results)
+                return (query_text, aggregated_answer)

             except Exception as query_error:
                 logger.warning(f"Failed to execute query '{query_text}': {query_error}")
@@ -306,3 +508,36 @@ def __del__(self) -> None:
         """Clean up the ThreadPoolExecutor when the agent is destroyed."""
         if hasattr(self, "executor"):
             self.executor.shutdown(wait=True)
+
+    def _create_rag_engines(self, collection_name: str | None = None) -> dict[str, Any]:
+        """Create RAG engines based on rag_config."""
+        engines = {}
+
+        for rag_type, config in self.rag_config.items():
+            if rag_type == "vector":
+                engines["vector"] = VectorChromaQueryEngine(
+                    collection_name=config.get("collection_name", collection_name),
+                    **{k: v for k, v in config.items() if k != "collection_name"},
+                )
+            elif rag_type == "graph":
+                engines["graph"] = self._create_neo4j_engine(config)
+
+        return engines
+
+    def _create_neo4j_engine(self, config: dict[str, Any]) -> Any:
+        """Create Neo4j graph query engine."""
+        try:
+            from autogen.agentchat.contrib.graph_rag.neo4j_graph_query_engine import Neo4jGraphQueryEngine
+
+            return Neo4jGraphQueryEngine(
+                host=config.get("host", "bolt://localhost"),
+                port=config.get("port", 7687),
+                database=config.get("database", "neo4j"),
+                username=config.get("username", "neo4j"),
+                password=config.get("password", "neo4j"),
+                llm=config.get("llm"),
+                embedding=config.get("embedding"),
+            )
+        except ImportError as e:
+            logger.warning(f"Neo4j dependencies not available: {e}. Skipping graph engine.")
+            return None
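
The keys of the "graph" entry in rag_config map one-to-one onto these constructor arguments; a hedged sketch of a full configuration (all values are placeholders, and llm/embedding would be model objects if used):

rag_config = {
    "vector": {"collection_name": "docs"},  # overrides the agent-level collection_name
    "graph": {
        "host": "bolt://localhost",  # defaults mirror _create_neo4j_engine
        "port": 7687,
        "database": "neo4j",
        "username": "neo4j",
        "password": "neo4j",  # placeholder credentials
        # "llm": ..., "embedding": ...  # optional model objects
    },
}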
