# 🧪 Testing Guide

This guide covers the testing strategy, test structure, and best practices for the Agentic RAG system. It includes unit tests, integration tests, end-to-end tests, and performance testing approaches.

## Testing Philosophy

Our testing strategy follows the testing pyramid:

```
           /\
          /  \
         / E2E \          (Few, Slow, High Confidence)
        /______\
       /        \
      /Integration\       (Some, Medium Speed)
     /____________\
    /              \
   /   Unit Tests   \     (Many, Fast, Low Level)
  /__________________\
```

### Test Categories

- **Unit Tests**: Fast, isolated tests for individual functions and classes
- **Integration Tests**: Test component interactions with real dependencies
- **End-to-End Tests**: Full workflow tests simulating real user scenarios
- **Performance Tests**: Load testing and performance benchmarks

## Test Structure

```
tests/
├── conftest.py              # Shared pytest fixtures
├── unit/                    # Unit tests (fast, isolated)
│   ├── test_config.py
│   ├── test_retrieval.py
│   ├── test_memory.py
│   ├── test_graph.py
│   ├── test_llm_client.py
│   └── test_sse.py
├── integration/             # Integration tests
│   ├── test_api.py
│   ├── test_streaming.py
│   ├── test_memory.py
│   ├── test_full_workflow.py
│   ├── test_mocked_streaming.py
│   └── test_e2e_tool_ui.py
└── performance/             # Performance tests
    ├── test_load.py
    ├── test_memory_usage.py
    └── test_concurrent_users.py
```

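The directory split above maps naturally onto pytest markers, so category-level selection such as `pytest -m "not performance"` works without decorating every test. The following is only a sketch of how that could be wired up, not existing project code; the marker names are assumptions and would also need to be registered under `markers` in the pytest configuration to avoid warnings:

```python
# conftest.py (sketch): auto-apply a marker named after each test's directory,
# so tests can be selected per category from the command line.
from pathlib import Path

import pytest

CATEGORIES = {"unit", "integration", "performance"}

def pytest_collection_modifyitems(config, items):
    for item in items:
        # The directory containing the test file gives its category.
        category = item.path.parent.name
        if category in CATEGORIES:
            item.add_marker(getattr(pytest.mark, category))
```

With a hook like this in place, a target such as `make test-unit` can reduce to `uv run pytest -m unit`.
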
## Running Tests

### Quick Test Commands

```bash
# Run all tests
make test

# Run specific test categories
make test-unit           # Unit tests only
make test-integration    # Integration tests only
make test-e2e            # End-to-end tests

# Run with coverage
uv run pytest --cov=service --cov-report=html tests/

# Run specific test file
uv run pytest tests/unit/test_retrieval.py -v

# Run specific test method
uv run pytest tests/integration/test_api.py::test_chat_endpoint -v

# Run tests in parallel (faster; requires the pytest-xdist plugin)
uv run pytest -n auto tests/

# Run tests with detailed output
uv run pytest -s -vvv tests/
```

### Test Configuration

The test configuration is defined in `conftest.py`:

```python
# conftest.py
import pytest
import asyncio
import httpx
from unittest.mock import Mock, AsyncMock
from fastapi.testclient import TestClient

from service.main import create_app
from service.config import Config

@pytest.fixture(scope="session")
def event_loop():
    """Create an instance of the default event loop for the test session."""
    loop = asyncio.get_event_loop_policy().new_event_loop()
    yield loop
    loop.close()

@pytest.fixture
def test_config():
    """Test configuration with safe defaults."""
    return Config(
        provider="openai",
        openai_api_key="test-key",
        retrieval_endpoint="http://test-endpoint",
        retrieval_api_key="test-key",
        postgresql_host="localhost",
        postgresql_database="test_db",
        memory_ttl_days=1
    )

@pytest.fixture
def app(test_config):
    """Create test FastAPI app."""
    app = create_app()
    app.state.config = test_config
    return app

@pytest.fixture
def client(app):
    """Create test client."""
    return TestClient(app)

@pytest.fixture
def mock_llm():
    """Mock LLM client for testing."""
    mock = AsyncMock()
    mock.agenerate.return_value = Mock(
        generations=[[Mock(text="Mocked response")]]
    )
    return mock
```

## Unit Tests

Unit tests focus on testing individual components in isolation.

### Testing Retrieval Tools

```python
# tests/unit/test_retrieval.py
import pytest
from unittest.mock import AsyncMock, Mock, patch
import httpx

from service.retrieval.agentic_retrieval import RetrievalTool

class TestRetrievalTool:

    @pytest.fixture
    def tool(self):
        return RetrievalTool(
            endpoint="http://test-endpoint",
            api_key="test-key"
        )

    @pytest.mark.asyncio
    async def test_search_standards_success(self, tool):
        mock_response = {
            "results": [
                {"title": "ISO 26262", "content": "Functional safety"},
                {"title": "UN 38.3", "content": "Battery safety"}
            ],
            "metadata": {"total": 2, "took_ms": 150}
        }

        with patch('httpx.AsyncClient.post') as mock_post:
            mock_post.return_value.json.return_value = mock_response
            mock_post.return_value.status_code = 200

            result = await tool.search_standards("battery safety")

            assert len(result["results"]) == 2
            assert result["results"][0]["title"] == "ISO 26262"
            assert result["metadata"]["took_ms"] == 150

    @pytest.mark.asyncio
    async def test_search_standards_http_error(self, tool):
        with patch('httpx.AsyncClient.post') as mock_post:
            mock_post.side_effect = httpx.HTTPStatusError(
                message="Not Found",
                request=Mock(),
                response=Mock(status_code=404)
            )

            with pytest.raises(Exception) as exc_info:
                await tool.search_standards("nonexistent")

            assert "HTTP error" in str(exc_info.value)

    def test_format_query(self, tool):
        query = tool._format_query("test query", {"history": "previous"})
        assert "test query" in query
        assert "previous" in query
```

### Testing Configuration

```python
# tests/unit/test_config.py
import os
import pytest
from unittest.mock import patch
from pydantic import ValidationError

from service.config import Config, load_config

class TestConfig:

    def test_config_validation_success(self):
        config = Config(
            provider="openai",
            openai_api_key="test-key",
            retrieval_endpoint="http://test.com",
            retrieval_api_key="test-key"
        )
        assert config.provider == "openai"
        assert config.openai_api_key == "test-key"

    def test_config_validation_missing_required(self):
        with pytest.raises(ValidationError):
            Config(provider="openai")  # Missing required fields

    def test_load_config_from_env(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "env-key")
        monkeypatch.setenv("RETRIEVAL_API_KEY", "env-retrieval-key")

        # Mock config file loading
        with patch('service.config.yaml.safe_load') as mock_yaml:
            mock_yaml.return_value = {
                "provider": "openai",
                "retrieval": {"endpoint": "http://test.com"}
            }

            config = load_config()
            assert config.openai_api_key == "env-key"
```

### Testing LLM Client

```python
# tests/unit/test_llm_client.py
import pytest
from unittest.mock import Mock, AsyncMock, patch

from service.llm_client import get_llm_client, OpenAIClient

class TestLLMClient:

    @pytest.mark.asyncio
    async def test_openai_client_generate(self):
        with patch('openai.AsyncOpenAI') as mock_openai:
            mock_client = AsyncMock()
            mock_openai.return_value = mock_client

            mock_response = Mock()
            mock_response.choices = [
                Mock(message=Mock(content="Generated response"))
            ]
            mock_client.chat.completions.create.return_value = mock_response

            client = OpenAIClient(api_key="test", model="gpt-4")
            result = await client.generate([{"role": "user", "content": "test"}])

            assert result == "Generated response"

    def test_get_llm_client_openai(self, test_config):
        test_config.provider = "openai"
        test_config.openai_api_key = "test-key"

        client = get_llm_client(test_config)
        assert isinstance(client, OpenAIClient)

    def test_get_llm_client_unsupported(self, test_config):
        test_config.provider = "unsupported"

        with pytest.raises(ValueError, match="Unsupported provider"):
            get_llm_client(test_config)
```

## Integration Tests

Integration tests verify that components work together correctly.

### Testing API Endpoints

```python
# tests/integration/test_api.py
import pytest
import json
import httpx
from fastapi.testclient import TestClient

def test_health_endpoint(client):
    """Test health check endpoint."""
    response = client.get("/health")
    assert response.status_code == 200
    assert response.json() == {"status": "healthy", "service": "agentic-rag"}

def test_root_endpoint(client):
    """Test root endpoint."""
    response = client.get("/")
    assert response.status_code == 200
    data = response.json()
    assert "Agentic RAG API" in data["message"]

@pytest.mark.asyncio
async def test_chat_endpoint_integration():
    """Integration test for chat endpoint using httpx client."""
    async with httpx.AsyncClient() as client:
        request_data = {
            "messages": [{"role": "user", "content": "test question"}],
            "session_id": "test_session_123"
        }

        response = await client.post(
            "http://localhost:8000/api/chat",
            json=request_data,
            timeout=30.0
        )

        assert response.status_code == 200
        assert response.headers["content-type"] == "text/event-stream"

def test_chat_request_validation(client):
    """Test chat request validation."""
    # Missing messages
    response = client.post("/api/chat", json={})
    assert response.status_code == 422

    # Invalid message format
    response = client.post("/api/chat", json={
        "messages": [{"role": "invalid", "content": "test"}]
    })
    assert response.status_code == 422

    # Valid request
    response = client.post("/api/chat", json={
        "messages": [{"role": "user", "content": "test"}],
        "session_id": "test_session"
    })
    assert response.status_code == 200
```

### Testing Streaming

```python
# tests/integration/test_streaming.py
import pytest
import json
import asyncio
from httpx import AsyncClient

@pytest.mark.asyncio
async def test_streaming_event_format():
    """Test streaming response format."""
    async with AsyncClient() as client:
        request_data = {
            "messages": [{"role": "user", "content": "What is ISO 26262?"}],
            "session_id": "stream_test_session"
        }

        async with client.stream(
            "POST",
            "http://localhost:8000/api/chat",
            json=request_data,
            timeout=60.0
        ) as response:
            assert response.status_code == 200

            events = []
            async for line in response.aiter_lines():
                if line.startswith("data: "):
                    try:
                        data = json.loads(line[6:])  # Remove "data: " prefix
                        events.append(data)
                    except json.JSONDecodeError:
                        continue

            # Verify we got expected event types
            event_types = [event.get("type") for event in events if "type" in event]
            assert "tool_start" in event_types
            assert "tokens" in event_types
            assert "tool_result" in event_types

@pytest.mark.asyncio
async def test_concurrent_streaming():
    """Test concurrent streaming requests."""
    async def single_request(session_id: str):
        async with AsyncClient() as client:
            request_data = {
                "messages": [{"role": "user", "content": f"Test {session_id}"}],
                "session_id": session_id
            }

            response = await client.post(
                "http://localhost:8000/api/chat",
                json=request_data,
                timeout=30.0
            )
            return response.status_code

    # Run 5 concurrent requests
    tasks = [
        single_request(f"concurrent_test_{i}")
        for i in range(5)
    ]

    results = await asyncio.gather(*tasks)
    assert all(status == 200 for status in results)
```

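The `data: ` parsing loop above is repeated almost verbatim in the end-to-end workflow test below. If that duplication grows, a small shared helper can absorb it; the following is only a sketch (the helper name is illustrative, not existing project code), assuming the event format shown above:

```python
# conftest.py (sketch): shared SSE parsing helper for streaming tests.
import json

import httpx

async def collect_sse_events(response: httpx.Response) -> list[dict]:
    """Collect the JSON payloads of all `data: ...` lines in an SSE response."""
    events = []
    async for line in response.aiter_lines():
        if line.startswith("data: "):
            try:
                events.append(json.loads(line[len("data: "):]))
            except json.JSONDecodeError:
                continue  # skip keep-alives and partial frames
    return events
```

A test would then reduce its parsing section to `events = await collect_sse_events(response)`.
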
### Testing Memory Persistence

```python
# tests/integration/test_memory.py
import pytest
from service.memory.postgresql_memory import PostgreSQLMemoryManager

@pytest.mark.asyncio
async def test_session_persistence():
    """Test that conversations persist across requests."""
    memory_manager = PostgreSQLMemoryManager("postgresql://test:test@localhost/test")

    if not memory_manager.test_connection():
        pytest.skip("PostgreSQL not available for testing")

    checkpointer = memory_manager.get_checkpointer()

    # Simulate first conversation turn
    session_id = "memory_test_session"
    initial_state = {
        "messages": [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi there!"}
        ]
    }

    # Save state
    await checkpointer.aput(
        config={"configurable": {"session_id": session_id}},
        checkpoint={
            "id": "checkpoint_1",
            "ts": "2024-01-01T00:00:00Z"
        },
        metadata={},
        new_versions={}
    )

    # Retrieve state
    retrieved = await checkpointer.aget_tuple(
        config={"configurable": {"session_id": session_id}}
    )

    assert retrieved is not None
    assert retrieved.checkpoint["id"] == "checkpoint_1"
```

## End-to-End Tests

E2E tests simulate complete user workflows.

### Full Workflow Test

```python
# tests/integration/test_full_workflow.py
import pytest
import asyncio
import json
from httpx import AsyncClient

@pytest.mark.asyncio
async def test_complete_rag_workflow():
    """Test complete RAG workflow from query to citation."""

    async with AsyncClient() as client:
        # Step 1: Send initial query
        request_data = {
            "messages": [
                {"role": "user", "content": "What are the safety standards for lithium-ion batteries?"}
            ],
            "session_id": "e2e_workflow_test"
        }

        response = await client.post(
            "http://localhost:8000/api/chat",
            json=request_data,
            timeout=120.0
        )

        assert response.status_code == 200

        # Step 2: Parse streaming response
        events = []
        tool_calls = []
        final_answer = None
        citations = None

        async for line in response.aiter_lines():
            if line.startswith("data: "):
                try:
                    data = json.loads(line[6:])
                    events.append(data)

                    if data.get("type") == "tool_start":
                        tool_calls.append(data["name"])
                    elif data.get("type") == "post_append_1":
                        final_answer = data.get("answer")
                        citations = data.get("citations_mapping_csv")

                except json.JSONDecodeError:
                    continue

        # Step 3: Verify workflow execution
        assert len(tool_calls) > 0, "No tools were called"
        assert "retrieve_standard_regulation" in tool_calls or \
               "retrieve_doc_chunk_standard_regulation" in tool_calls

        assert final_answer is not None, "No final answer received"
        assert "safety" in final_answer.lower() or "standard" in final_answer.lower()

        if citations:
            assert len(citations.split('\n')) > 0, "No citations provided"

        # Step 4: Follow-up question to test memory
        followup_request = {
            "messages": [
                {"role": "user", "content": "What are the safety standards for lithium-ion batteries?"},
                {"role": "assistant", "content": final_answer},
                {"role": "user", "content": "What about testing procedures?"}
            ],
            "session_id": "e2e_workflow_test"  # Same session
        }

        followup_response = await client.post(
            "http://localhost:8000/api/chat",
            json=followup_request,
            timeout=120.0
        )

        assert followup_response.status_code == 200

@pytest.mark.asyncio
async def test_error_handling():
    """Test error handling in workflow."""

    async with AsyncClient() as client:
        # Test with invalid session format
        request_data = {
            "messages": [{"role": "user", "content": "test"}],
            "session_id": ""  # Invalid session ID
        }

        response = await client.post(
            "http://localhost:8000/api/chat",
            json=request_data,
            timeout=30.0
        )

        # Should handle gracefully (generate new session ID)
        assert response.status_code == 200
```

### Frontend Integration Test

```python
# tests/integration/test_e2e_tool_ui.py
import os
import pytest
from playwright.sync_api import sync_playwright

@pytest.mark.skipif(
    not os.getenv("RUN_E2E_TESTS"),
    reason="E2E tests require RUN_E2E_TESTS=1"
)
def test_chat_interface():
    """Test the frontend chat interface."""

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()

        # Navigate to chat interface
        page.goto("http://localhost:3000")

        # Wait for chat interface to load
        page.wait_for_selector('[data-testid="chat-input"]')

        # Send a message
        chat_input = page.locator('[data-testid="chat-input"]')
        chat_input.fill("What is ISO 26262?")

        send_button = page.locator('[data-testid="send-button"]')
        send_button.click()

        # Wait for response
        page.wait_for_selector('[data-testid="assistant-message"]', timeout=30000)

        # Verify response appeared
        response = page.locator('[data-testid="assistant-message"]').first
        assert response.is_visible()

        # Check for tool UI elements
        tool_ui = page.locator('[data-testid="tool-call"]')
        if tool_ui.count() > 0:
            assert tool_ui.first.is_visible()

        browser.close()
```

## Performance Tests

### Load Testing

```python
# tests/performance/test_load.py
import pytest
import asyncio
import time
import statistics
from httpx import AsyncClient

@pytest.mark.asyncio
async def test_concurrent_requests():
    """Test system performance under concurrent load."""

    async def single_request(client: AsyncClient, request_id: int):
        start_time = time.time()

        request_data = {
            "messages": [{"role": "user", "content": f"Test query {request_id}"}],
            "session_id": f"load_test_{request_id}"
        }

        try:
            response = await client.post(
                "http://localhost:8000/api/chat",
                json=request_data,
                timeout=30.0
            )

            end_time = time.time()
            return {
                "status_code": response.status_code,
                "response_time": end_time - start_time,
                "success": response.status_code == 200
            }
        except Exception as e:
            end_time = time.time()
            return {
                "status_code": 0,
                "response_time": end_time - start_time,
                "success": False,
                "error": str(e)
            }

    # Test with 20 concurrent requests
    async with AsyncClient() as client:
        tasks = [single_request(client, i) for i in range(20)]
        results = await asyncio.gather(*tasks, return_exceptions=True)

    # Analyze results
    successful_requests = [r for r in results if isinstance(r, dict) and r["success"]]
    response_times = [r["response_time"] for r in successful_requests]

    success_rate = len(successful_requests) / len(results)
    avg_response_time = statistics.mean(response_times) if response_times else 0
    p95_response_time = statistics.quantiles(response_times, n=20)[18] if len(response_times) > 5 else 0

    print(f"Success rate: {success_rate:.2%}")
    print(f"Average response time: {avg_response_time:.2f}s")
    print(f"95th percentile: {p95_response_time:.2f}s")

    # Performance assertions
    assert success_rate >= 0.95, f"Success rate too low: {success_rate:.2%}"
    assert avg_response_time < 10.0, f"Average response time too high: {avg_response_time:.2f}s"
    assert p95_response_time < 20.0, f"95th percentile too high: {p95_response_time:.2f}s"

@pytest.mark.asyncio
async def test_memory_usage():
    """Test memory usage under load."""
    import psutil
    import gc

    # Note: psutil.Process() profiles the test process itself; to track the
    # service's footprint, attach psutil to the server's PID instead.
    process = psutil.Process()
    initial_memory = process.memory_info().rss / 1024 / 1024  # MB

    # Run multiple requests
    async with AsyncClient() as client:
        for i in range(50):
            request_data = {
                "messages": [{"role": "user", "content": f"Memory test {i}"}],
                "session_id": f"memory_test_{i}"
            }

            await client.post(
                "http://localhost:8000/api/chat",
                json=request_data,
                timeout=30.0
            )

            if i % 10 == 0:
                gc.collect()  # Force garbage collection

    final_memory = process.memory_info().rss / 1024 / 1024  # MB
    memory_increase = final_memory - initial_memory

    print(f"Initial memory: {initial_memory:.1f} MB")
    print(f"Final memory: {final_memory:.1f} MB")
    print(f"Memory increase: {memory_increase:.1f} MB")

    # Memory assertions (adjust based on expected usage)
    assert memory_increase < 100, f"Memory increase too high: {memory_increase:.1f} MB"
```

## Test Data Management

### Test Fixtures

```python
# tests/fixtures.py
import pytest
from typing import List, Dict

@pytest.fixture
def sample_messages() -> List[Dict]:
    """Sample message history for testing."""
    return [
        {"role": "user", "content": "What is ISO 26262?"},
        {"role": "assistant", "content": "ISO 26262 is a functional safety standard..."},
        {"role": "user", "content": "What about testing procedures?"}
    ]

@pytest.fixture
def mock_retrieval_response() -> Dict:
    """Mock response from retrieval API."""
    return {
        "results": [
            {
                "title": "ISO 26262-1:2018",
                "content": "Road vehicles — Functional safety — Part 1: Vocabulary",
                "source": "ISO",
                "url": "https://iso.org/26262-1",
                "score": 0.95
            },
            {
                "title": "ISO 26262-3:2018",
                "content": "Road vehicles — Functional safety — Part 3: Concept phase",
                "source": "ISO",
                "url": "https://iso.org/26262-3",
                "score": 0.88
            }
        ],
        "metadata": {
            "total": 2,
            "took_ms": 150,
            "query": "ISO 26262"
        }
    }

@pytest.fixture
def mock_llm_response() -> str:
    """Mock LLM response with citations."""
    return """ISO 26262 is an international standard for functional safety of electrical and electronic systems in road vehicles <sup>1</sup>.

The standard consists of multiple parts:
- Part 1: Vocabulary <sup>1</sup>
- Part 3: Concept phase <sup>2</sup>

These standards ensure that safety-critical automotive systems operate reliably even in the presence of faults."""
```

### Database Test Setup

```python
# tests/database_setup.py
import pytest
from sqlalchemy import create_engine, text

from service.memory.postgresql_memory import PostgreSQLMemoryManager

@pytest.fixture(scope="session")
def test_database():
    """Set up test database."""

    # Create test database (AUTOCOMMIT is required because PostgreSQL does not
    # allow CREATE/DROP DATABASE inside a transaction block)
    engine = create_engine(
        "postgresql://test:test@localhost/postgres",
        isolation_level="AUTOCOMMIT"
    )
    with engine.connect() as conn:
        conn.execute(text("DROP DATABASE IF EXISTS test_agentic_rag"))
        conn.execute(text("CREATE DATABASE test_agentic_rag"))

    # Initialize schema
    test_connection_string = "postgresql://test:test@localhost/test_agentic_rag"
    memory_manager = PostgreSQLMemoryManager(test_connection_string)
    checkpointer = memory_manager.get_checkpointer()
    checkpointer.setup()

    yield test_connection_string

    # Cleanup
    with engine.connect() as conn:
        conn.execute(text("DROP DATABASE test_agentic_rag"))
```

## Continuous Integration

### GitHub Actions Workflow

```yaml
# .github/workflows/test.yml
name: Tests

on:
  push:
    branches: [ main, develop ]
  pull_request:
    branches: [ main ]

jobs:
  test:
    runs-on: ubuntu-latest

    services:
      postgres:
        image: postgres:15
        env:
          POSTGRES_PASSWORD: test
          POSTGRES_USER: test
          POSTGRES_DB: test
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
        ports:
          - 5432:5432

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.12'

      - name: Install uv
        uses: astral-sh/setup-uv@v1

      - name: Install dependencies
        run: uv sync --dev

      - name: Run unit tests
        run: uv run pytest tests/unit/ -v --cov=service --cov-report=xml
        env:
          DATABASE_URL: postgresql://test:test@localhost:5432/test
          OPENAI_API_KEY: test-key
          RETRIEVAL_API_KEY: test-key

      - name: Start test server
        run: |
          uv run uvicorn service.main:app --host 0.0.0.0 --port 8000 &
          sleep 10
        env:
          DATABASE_URL: postgresql://test:test@localhost:5432/test
          OPENAI_API_KEY: test-key
          RETRIEVAL_API_KEY: test-key

      - name: Run integration tests
        run: uv run pytest tests/integration/ -v
        env:
          DATABASE_URL: postgresql://test:test@localhost:5432/test
          OPENAI_API_KEY: test-key
          RETRIEVAL_API_KEY: test-key

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        with:
          file: ./coverage.xml
```

## Testing Best Practices

### 1. Test Organization

- **Keep tests close to code**: Mirror the source structure in test directories
- **Use descriptive names**: Test names should clearly describe what they test
- **Group related tests**: Use test classes to group related functionality

### 2. Test Data

- **Use fixtures**: Create reusable test data with pytest fixtures
- **Avoid hardcoded values**: Use factories or builders for test data generation (see the sketch below)
- **Clean up after tests**: Ensure tests don't affect each other

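For instance, a factory fixture avoids hardcoded payloads, and a yield-style fixture ties cleanup to the test that created the data. This is only a sketch: the fixture names are illustrative, and the teardown step is a placeholder for whatever cleanup the project actually needs.

```python
# conftest.py (sketch): a data factory plus a self-cleaning fixture.
import pytest

@pytest.fixture
def make_messages():
    """Factory fixture: build message lists on demand instead of hardcoding them."""
    def _make(*contents: str, role: str = "user") -> list[dict]:
        return [{"role": role, "content": content} for content in contents]
    return _make

@pytest.fixture
def tracked_sessions():
    """Collect session IDs created during a test and clean them up afterwards."""
    created: list[str] = []
    yield created
    # Placeholder teardown: delete checkpoints / rows for each ID in `created`.
    created.clear()
```

A test can then request `make_messages` and build its payload inline, e.g. `{"messages": make_messages("What is ISO 26262?"), "session_id": "t1"}`.
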
### 3. Mocking Strategy

```python
# Good: Mock external dependencies
@patch('service.retrieval.httpx.AsyncClient')
async def test_retrieval_with_mock(mock_client):
    # Test implementation
    pass

# Good: Mock at the right level
@patch('service.llm_client.OpenAIClient.generate')
async def test_agent_workflow(mock_generate):
    # Test workflow logic without hitting LLM API
    pass

# Avoid: Over-mocking (mocking everything)
# Avoid: Under-mocking (hitting real APIs in unit tests)
```

### 4. Async Testing

```python
# Proper async test setup
@pytest.mark.asyncio
async def test_async_function():
    result = await async_function()
    assert result is not None

# Use async context managers
@pytest.mark.asyncio
async def test_with_async_client():
    async with AsyncClient() as client:
        response = await client.get("/")
        assert response.status_code == 200
```

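Async tests that call a live server can hang rather than fail; wrapping the awaited work in a timeout turns a hang into a clear failure. A minimal sketch using `asyncio.timeout` (available from Python 3.11) against the `/health` endpoint shown earlier; this is an illustration, not existing project test code:

```python
# Guard an async test against hanging indefinitely (Python 3.11+).
import asyncio

import pytest
from httpx import AsyncClient

@pytest.mark.asyncio
async def test_health_does_not_hang():
    async with asyncio.timeout(5):  # fail fast instead of stalling the suite
        async with AsyncClient() as client:
            response = await client.get("http://localhost:8000/health")
    assert response.status_code == 200
```
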
### 5. Performance Testing

- **Set realistic timeouts**: Don't make tests too strict or too loose; see the timing sketch below
- **Test under load**: Verify system behavior with concurrent requests
- **Monitor resource usage**: Check memory leaks and CPU usage

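One lightweight way to keep latency budgets visible in test code is a small timing context manager. This is a sketch; the helper name is illustrative, not existing project code:

```python
# A timing guard for performance assertions.
import time
from contextlib import contextmanager

@contextmanager
def latency_budget(max_seconds: float):
    """Fail the enclosing test if the wrapped block exceeds max_seconds."""
    start = time.perf_counter()
    yield
    elapsed = time.perf_counter() - start
    assert elapsed <= max_seconds, (
        f"Took {elapsed:.2f}s, budget was {max_seconds:.2f}s"
    )
```

Used as `with latency_budget(10.0): ...` around the request section of a load test, the budget is stated right next to the code it constrains.
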
### 6. Error Testing

```python
def test_error_handling():
    """Test that errors are handled gracefully."""

    # Test invalid input
    with pytest.raises(ValueError):
        function_with_validation("")

    # Test network errors
    with patch('httpx.post', side_effect=httpx.ConnectError("Connection failed")):
        result = robust_function()
        assert result["error"] is not None
```

---

This testing guide provides a comprehensive framework for ensuring the quality and reliability of the Agentic RAG system. Regular testing at all levels helps maintain code quality and prevents regressions as the system evolves.