MLSysOps
diff --git a/mle/utils/component_memory.py b/mle/utils/component_memory.py
Lines changed: 162 additions & 149 deletions
@@ -9,20 +9,20 @@
 import json
 import uuid
 import time
-import sqlite3
 import traceback
 import functools
 from datetime import datetime
 from typing import Dict, List, Any, Optional, Union, Tuple
 
+from .memory import LanceDBMemory
+
 
 class ComponentMemory:
     """
     Tracks and stores execution traces for different components in MLE-Agent.
 
-    Uses SQLite as the backend for efficient storage and querying of component traces.
-    Each component type has its own table, and traces can be queried by component,
-    timestamp, or content.
+    Uses LanceDB as the backend for efficient storage and querying of component traces.
+    Component traces are organized by component type and can be queried by various attributes.
     """
 
     def __init__(self, project_dir: str):
@@ -33,56 +33,16 @@ def __init__(self, project_dir: str):
             project_dir: The project directory path.
         """
         self.project_dir = project_dir
-
-        # Ensure the .mle directory exists
-        self.memory_dir = os.path.join(project_dir, '.mle')
-        os.makedirs(self.memory_dir, exist_ok=True)
-
-        # Initialize SQLite database for storing traces
-        self.db_path = os.path.join(self.memory_dir, 'component_traces.db')
-        self.conn = sqlite3.connect(self.db_path)
-        self.conn.row_factory = sqlite3.Row  # Access rows by name
-
-        # Initialize tables
-        self._initialize_tables()
-
-    def _initialize_tables(self):
-        """Initialize the database tables for storing component traces."""
-        cursor = self.conn.cursor()
-
-        # Create a table for each component type
-        components = [
+        
+        # Initialize LanceDB memory as the backend storage
+        self.memory = LanceDBMemory(project_dir)
+        
+        # Track components for easier access
+        self.components = [
             'advisor', 'planner', 'coder', 'debugger', 'reporter', 'chat',
             'github_summarizer', 'git_summarizer'
         ]
 
-        for component in components:
-            cursor.execute(f'''
-            CREATE TABLE IF NOT EXISTS {component}_traces (
-                id TEXT PRIMARY KEY,
-                timestamp TEXT,
-                project_name TEXT,
-                input_data TEXT,
-                output_data TEXT,
-                execution_time REAL,
-                context TEXT,
-                status TEXT
-            )
-            ''')
-
-        # Create a table for tracking relationships between traces
-        cursor.execute('''
-        CREATE TABLE IF NOT EXISTS trace_relationships (
-            source_id TEXT,
-            target_id TEXT,
-            relationship_type TEXT,
-            metadata TEXT,
-            PRIMARY KEY (source_id, target_id, relationship_type)
-        )
-        ''')
-
-        self.conn.commit()
-
     def store_trace(self,
                    component: str,
                    input_data: Any,
@@ -107,32 +67,38 @@ def store_trace(self,
         trace_id = str(uuid.uuid4())
         timestamp = datetime.now().isoformat()
         project_name = os.path.basename(self.project_dir)
-
-        # Serialize complex data types
-        input_json = self._serialize_data(input_data)
-        output_json = self._serialize_data(output_data)
-        context_json = self._serialize_data(context or {})
-
-        # Store in the appropriate table
-        cursor = self.conn.cursor()
-        query = f'''
-        INSERT INTO {component}_traces
-        (id, timestamp, project_name, input_data, output_data, execution_time, context, status)
-        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
-        '''
-
-        cursor.execute(query, (
-            trace_id,
-            timestamp,
-            project_name,
-            input_json,
-            output_json,
-            execution_time,
-            context_json,
-            status
-        ))
-
-        self.conn.commit()
+        
+        # Prepare text representation for vector embedding
+        # This combines the most important fields for semantic search
+        if isinstance(input_data, str):
+            input_text = input_data[:1000]  # Limit length for embedding
+        else:
+            input_text = str(input_data)[:1000]
+            
+        text_for_embedding = f"Component: {component}\nStatus: {status}\nInput: {input_text}"
+        
+        # Prepare metadata containing all trace details
+        metadata = {
+            "trace_id": trace_id,
+            "component": component,
+            "timestamp": timestamp,
+            "project_name": project_name,
+            "execution_time": execution_time,
+            "status": status,
+            "input_data": self._serialize_data(input_data),
+            "output_data": self._serialize_data(output_data),
+            "context": self._serialize_data(context or {})
+        }
+        
+        # Store in the component-specific table
+        table_name = f"component_{component}_traces"
+        self.memory.add(
+            texts=[text_for_embedding],
+            metadata=[metadata],
+            table_name=table_name,
+            ids=[trace_id]
+        )
+        
         return trace_id
 
     def get_trace(self, component: str, trace_id: str) -> Optional[Dict[str, Any]]:
@@ -146,14 +112,13 @@ def get_trace(self, component: str, trace_id: str) -> Optional[Dict[str, Any]]:
         Returns:
             Dict or None: The trace data if found, None otherwise.
         """
-        cursor = self.conn.cursor()
-        query = f"SELECT * FROM {component}_traces WHERE id = ?"
-        cursor.execute(query, (trace_id,))
-
-        row = cursor.fetchone()
-        if row:
-            return self._row_to_dict(row)
-        return None
+        table_name = f"component_{component}_traces"
+        results = self.memory.get(trace_id, table_name=table_name)
+        
+        if not results:
+            return None
+            
+        return self._process_trace_result(results[0])
 
     def get_recent_traces(self, component: str, limit: int = 10) -> List[Dict[str, Any]]:
         """
@@ -166,11 +131,29 @@ def get_recent_traces(self, component: str, limit: int = 10) -> List[Dict[str, A
         Returns:
             List[Dict]: List of trace data dictionaries.
         """
-        cursor = self.conn.cursor()
-        query = f"SELECT * FROM {component}_traces ORDER BY timestamp DESC LIMIT ?"
-        cursor.execute(query, (limit,))
-
-        return [self._row_to_dict(row) for row in cursor.fetchall()]
+        table_name = f"component_{component}_traces"
+        
+        # 1. Get all IDs
+        # 2. Get metadata for each ID
+        # 3. Sort by timestamp
+        # 4. Take the most recent ones
+        
+        all_keys = self.memory.list_all_keys(table_name=table_name)
+        if not all_keys:
+            return []
+            
+        # Get all traces for this component
+        traces = []
+        for key in all_keys:
+            result = self.memory.get(key, table_name=table_name)
+            if result:
+                traces.append(self._process_trace_result(result[0]))
+        
+        # Sort by timestamp (newest first)
+        traces.sort(key=lambda x: x['timestamp'], reverse=True)
+        
+        # Return only the requested number
+        return traces[:limit]
 
     def search_traces(self,
                      component: str,
@@ -187,16 +170,19 @@ def search_traces(self,
         Returns:
             List[Dict]: List of matching trace dictionaries.
         """
-        cursor = self.conn.cursor()
-        query = f'''
-        SELECT * FROM {component}_traces
-        WHERE input_data LIKE ? OR output_data LIKE ?
-        ORDER BY timestamp DESC LIMIT ?
-        '''
-        search_pattern = f"%{search_text}%"
-        cursor.execute(query, (search_pattern, search_pattern, limit))
-
-        return [self._row_to_dict(row) for row in cursor.fetchall()]
+        table_name = f"component_{component}_traces"
+        
+        # Use LanceDB's vector search capability
+        results = self.memory.query(
+            query_texts=[search_text],
+            table_name=table_name,
+            n_results=limit
+        )
+        
+        if not results or not results[0]:
+            return []
+            
+        return [self._process_trace_result(item) for item in results[0]]
 
     def add_relationship(self,
                         source_id: str,
@@ -215,21 +201,27 @@ def add_relationship(self,
         Returns:
             bool: True if relationship was added successfully.
         """
-        cursor = self.conn.cursor()
-        metadata_json = self._serialize_data(metadata or {})
-
-        try:
-            cursor.execute('''
-            INSERT INTO trace_relationships
-            (source_id, target_id, relationship_type, metadata)
-            VALUES (?, ?, ?, ?)
-            ''', (source_id, target_id, relationship_type, metadata_json))
-
-            self.conn.commit()
-            return True
-        except sqlite3.IntegrityError:
-            # Relationship already exists
-            return False
+        relationship_id = f"{source_id}_{target_id}_{relationship_type}"
+        
+        relationship_text = f"Relationship: {relationship_type} from {source_id} to {target_id}"
+        
+        relationship_metadata = {
+            "source_id": source_id,
+            "target_id": target_id,
+            "relationship_type": relationship_type,
+            "metadata": self._serialize_data(metadata or {})
+        }
+        
+        # Store in the relationships table
+        table_name = "component_trace_relationships"
+        self.memory.add(
+            texts=[relationship_text],
+            metadata=[relationship_metadata],
+            table_name=table_name,
+            ids=[relationship_id]
+        )
+        
+        return True
 
     def get_related_traces(self,
                           trace_id: str,
@@ -244,37 +236,47 @@ def get_related_traces(self,
         Returns:
             List[Dict]: List of related trace data.
         """
-        cursor = self.conn.cursor()
-
+        table_name = "component_trace_relationships"
+        
+        # Get all relationships where this trace is the source
         if relationship_type:
-            query = '''
-            SELECT * FROM trace_relationships
-            WHERE source_id = ? AND relationship_type = ?
-            '''
-            cursor.execute(query, (trace_id, relationship_type))
+            # Get relationships with specific type
+            results = self.memory.query(
+                query_texts=[f"Relationship: {relationship_type} from {trace_id}"],
+                table_name=table_name,
+                n_results=100  # Get many potential matches
+            )
         else:
-            query = '''
-            SELECT * FROM trace_relationships
-            WHERE source_id = ?
-            '''
-            cursor.execute(query, (trace_id,))
-
+            # Get all relationships for this trace
+            all_keys = self.memory.list_all_keys(table_name=table_name)
+            results = []
+            
+            for key in all_keys:
+                if key.startswith(f"{trace_id}_"):
+                    rel = self.memory.get(key, table_name=table_name)
+                    if rel:
+                        results.append(rel[0])
+        
+        if not results or (isinstance(results, list) and not results[0]):
+            return []
+            
+        # Process and return relationship data
         relationships = []
-        for row in cursor.fetchall():
-            rel = {
-                'source_id': row['source_id'],
-                'target_id': row['target_id'],
-                'relationship_type': row['relationship_type'],
-                'metadata': json.loads(row['metadata'])
-            }
-            relationships.append(rel)
-
+        for item in results if not isinstance(results[0], list) else results[0]:
+            if 'metadata' in item and isinstance(item['metadata'], dict):
+                rel_data = {
+                    'source_id': item['metadata'].get('source_id'),
+                    'target_id': item['metadata'].get('target_id'),
+                    'relationship_type': item['metadata'].get('relationship_type'),
+                    'metadata': json.loads(item['metadata'].get('metadata', '{}'))
+                }
+                relationships.append(rel_data)
+        
         return relationships
 
     def close(self):
-        """Close the database connection."""
-        if self.conn:
-            self.conn.close()
+        """Close the memory connections."""
+        pass
 
     def _serialize_data(self, data: Any) -> str:
         """Serialize data to JSON string."""
@@ -290,16 +292,27 @@ def _deserialize_data(self, json_str: str) -> Any:
             return json.loads(json_str)
         except (json.JSONDecodeError, TypeError):
             return json_str
-
-    def _row_to_dict(self, row: sqlite3.Row) -> Dict[str, Any]:
-        """Convert an SQLite row to a dictionary with deserialized data."""
-        trace = dict(row)
-
-        # Deserialize JSON fields
-        trace['input_data'] = self._deserialize_data(trace['input_data'])
-        trace['output_data'] = self._deserialize_data(trace['output_data'])
-        trace['context'] = self._deserialize_data(trace['context'])
-
+    
+    def _process_trace_result(self, result: Dict) -> Dict[str, Any]:
+        """Process a raw trace result from LanceDB into a standardized format."""
+        if not result or 'metadata' not in result:
+            return {}
+            
+        metadata = result['metadata']
+        
+        # Extract and deserialize the trace data
+        trace = {
+            'id': metadata.get('trace_id'),
+            'component': metadata.get('component'),
+            'timestamp': metadata.get('timestamp'),
+            'project_name': metadata.get('project_name'),
+            'execution_time': metadata.get('execution_time'),
+            'status': metadata.get('status'),
+            'input_data': self._deserialize_data(metadata.get('input_data', '{}')),
+            'output_data': self._deserialize_data(metadata.get('output_data', '{}')),
+            'context': self._deserialize_data(metadata.get('context', '{}'))
+        }
+        
         return trace