openspace-openhands-evolution/test_e2e.py at main · firefox-669/openspace-openhands-evolution · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
"""
End-to-end production environment test.

Verifies real LLM calls and code execution.
"""

import asyncio
import os
import sys


async def test_production_flow():
    """Run the full production flow: set up the orchestrator, run one task, report.

    Reads ``OPENAI_API_KEY`` from the environment. When the key is missing or
    still the ``your-api-key-here`` placeholder, the user is asked whether to
    continue anyway; in that case the orchestrator is configured with
    ``api_key=None``.

    Returns:
        bool: True when ``orchestrator.execute_task`` reports success. False
        when the user declines to continue without a key, when stdin is
        closed so the question cannot be answered, when the task fails, or
        when any exception is raised while running the flow.
    """
    print("="*60)
    print("Production End-to-End Test")
    print("="*60)
    print()

    # Check the API key once and reuse the verdict when building the config.
    api_key = os.getenv('OPENAI_API_KEY', '')
    has_real_key = bool(api_key) and api_key != 'your-api-key-here'
    if not has_real_key:
        print("⚠️  OPENAI_API_KEY not set")
        print("   Set it to test real LLM integration:")
        print("   export OPENAI_API_KEY=sk-your-key")
        print()
        try:
            use_mock = input("Continue with mock test? (y/n): ").strip().lower()
        except EOFError:
            # Non-interactive run (stdin closed): nobody can answer, so
            # decline instead of crashing with a traceback.
            print()
            use_mock = 'n'
        if use_mock != 'y':
            return False

    try:
        from openspace_openhands_evolution import EvolutionOrchestrator, TaskRequest

        # Configuration
        config = {
            'llm': {
                'provider': 'openai',
                'model': 'gpt-4',
                'api_key': api_key if has_real_key else None
            },
            'openspace': {'registry_path': './data/skills'},
            'openhands': {
                'model': 'gpt-4',
                'sandbox_timeout': 30,
                'max_retries': 2
            },
            'monitor': {'quality_threshold': 0.7},
            'governance': {'enable_gatekeeping': True}
        }

        print("1️⃣  Initializing orchestrator...")
        orchestrator = EvolutionOrchestrator(config)
        print("   ✅ Orchestrator initialized")
        print()

        # Create the test task
        print("2️⃣  Creating test task...")
        task = TaskRequest(
            id="e2e-test-001",
            description="Write a Python function that calculates factorial",
            project_id="test-project",
            language="python"
        )
        print(f"   Task: {task.description}")
        print()

        # Execute the task
        print("3️⃣  Executing task...")
        print("   (This may take 10-30 seconds with real LLM)")
        print()

        result = await orchestrator.execute_task(task)

        # Guard against a missing output so that a failed task still reaches
        # the "Task failed" report below instead of raising in len().
        output = result.output or ""

        print()
        print("4️⃣  Results:")
        print(f"   Success: {result.success}")
        print(f"   Output length: {len(output)} chars")

        if result.success:
            print(f"\n   Preview:\n   {output[:200]}...")

        if result.metrics:
            score = result.metrics.get('overall_score', 0)
            print(f"\n   Quality Score: {score:.2f}")

        if result.reasoning_trace:
            print(f"   Reasoning Steps: {len(result.reasoning_trace)}")

        print()

        if result.success:
            print("✅ PRODUCTION TEST PASSED!")
            print("\n🎉 The system is working with real LLM and code execution!")
            return True

        print(f"❌ Task failed: {result.error}")
        return False

    except Exception as e:
        # Top-level boundary of the test: report, dump the traceback, and
        # signal failure through the return value.
        print(f"\n❌ Test failed with error: {e}")
        import traceback
        traceback.print_exc()
        return False


async def test_sandbox_only():
    """Exercise the execution sandbox on its own (no API key required).

    Runs a small factorial script through ``ExecutionSandbox.execute_python``
    and then round-trips a text file through ``write_file`` / ``read_file``.
    The sandbox is cleaned up even when one of the checks raises.

    Returns:
        bool: True only when both the code-execution check and the file
        round-trip check succeed. False when either check fails or when any
        exception (including a failed import) is raised.
    """
    print("="*60)
    print("Sandbox Execution Test (No API Key Required)")
    print("="*60)
    print()

    try:
        from openspace_openhands_evolution.execution_engine import ExecutionSandbox

        print("1️⃣  Creating sandbox...")
        sandbox = ExecutionSandbox(timeout=10)
        print("   ✅ Sandbox created")
        print()

        try:
            print("2️⃣  Executing Python code...")
            code = """
def factorial(n):
    if n <= 1:
        return 1
    return n * factorial(n - 1)

result = factorial(5)
print(f"Factorial of 5 is {result}")
"""

            result = await sandbox.execute_python(code)

            code_ok = bool(result['success'])
            print(f"   Success: {result['success']}")
            if code_ok:
                print(f"   Output: {result['stdout'].strip()}")
                print("   ✅ Code execution works!")
            else:
                print(f"   Error: {result['stderr']}")
                print("   ❌ Code execution failed")

            print()

            print("3️⃣  Testing file operations...")
            sandbox.write_file("test.txt", "Hello from sandbox!")
            content = sandbox.read_file("test.txt")

            files_ok = content == "Hello from sandbox!"
            if files_ok:
                print("   ✅ File operations work!")
            else:
                print("   ❌ File operations failed")
        finally:
            # Always release the sandbox, including when a check above raised.
            sandbox.cleanup()

        print()
        # The overall verdict must reflect the individual checks; a failed
        # step may not be reported as a pass.
        if code_ok and files_ok:
            print("✅ SANDBOX TEST PASSED!")
            print("\nℹ️  To test full production flow, set OPENAI_API_KEY")
            return True

        print("❌ SANDBOX TEST FAILED!")
        return False

    except Exception as e:
        # Top-level boundary of the test: report, dump the traceback, and
        # signal failure through the return value.
        print(f"\n❌ Sandbox test failed: {e}")
        import traceback
        traceback.print_exc()
        return False


async def main():
    """Run the sandbox test, then optionally the full production flow.

    Returns:
        bool: False if the sandbox test fails. Otherwise the result of
        ``test_production_flow()`` when the user answers ``y`` to the prompt,
        or True when the user declines or stdin is closed and the prompt
        cannot be answered.
    """
    print()

    # Sandbox test first: it does not need an API key.
    sandbox_ok = await test_sandbox_only()

    print("\n" + "="*60)

    if not sandbox_ok:
        print("\n❌ Sandbox test failed - Please fix issues first")
        return False

    # Ask whether to run the full flow as well.
    print()
    try:
        test_full = input("Test full production flow with LLM? (y/n): ").strip().lower()
    except EOFError:
        # Non-interactive run (stdin closed): skip the optional LLM flow
        # rather than crashing after the sandbox test already passed.
        print()
        test_full = 'n'

    if test_full == 'y':
        print()
        return await test_production_flow()

    print("\n✅ Sandbox test passed - Core execution engine works!")
    print("\nTo enable full production mode:")
    print("  1. Set OPENAI_API_KEY environment variable")
    print("  2. Or create config.yaml with your API key")
    print("  3. Run: python test_e2e.py again")
    return True


if __name__ == "__main__":
    # The process exit status mirrors the overall outcome: 0 when main()
    # reports success, 1 otherwise.
    exit_code = 0 if asyncio.run(main()) else 1
    sys.exit(exit_code)