"""Integration tests for CrawlService.""" from __future__ import annotations from unittest.mock import MagicMock import hashlib import pytest from app.infrastructure.perception.crawlers.base import RawEvent from app.infrastructure.perception.mock_event_store import MockEventStore def _make_raw_event(code="TST-001"): return RawEvent( source="TEST", source_label="Test", standard_code=code, title=f"Test {code}", summary="Summary", full_text_url="https://example.com", status="enacted", published_at="2026-01-01", effective_at=None, category="test", tags=["test"], raw_text="full text", ) def _make_service(raw_events): from app.application.perception.crawl_service import CrawlService mock_crawler = MagicMock() mock_crawler.fetch.return_value = raw_events mock_pipeline = MagicMock() mock_pipeline.extract_structure.return_value = { "obligations": [], "deadlines": [], "scope": "test", "penalties": None, "impact_level": "low", } mock_pipeline.assess_impact.return_value = [] mock_pipeline.compute_diff.return_value = { "changed_sections": [], "change_summary": "No changes.", } mock_retrieval = MagicMock() store = MockEventStore() return CrawlService( crawlers={"TEST": mock_crawler}, event_store=store, llm_pipeline=mock_pipeline, retrieval_service=mock_retrieval, ) def test_crawl_yields_progress_and_done(): svc = _make_service([_make_raw_event("TST-001")]) events = list(svc.run_crawl()) event_types = [e.get("event") for e in events] assert "done" in event_types def test_crawl_upserts_to_store(): store = MockEventStore() from app.application.perception.crawl_service import CrawlService mock_crawler = MagicMock() mock_crawler.fetch.return_value = [_make_raw_event("NEW-001")] mock_pipeline = MagicMock() mock_pipeline.extract_structure.return_value = { "obligations": [], "deadlines": [], "scope": "", "penalties": None, "impact_level": "medium", } mock_pipeline.assess_impact.return_value = [] mock_pipeline.compute_diff.return_value = { "changed_sections": [], "change_summary": "", } svc = CrawlService( crawlers={"TEST": mock_crawler}, event_store=store, llm_pipeline=mock_pipeline, retrieval_service=MagicMock(), ) list(svc.run_crawl()) result = store.get_by_standard_code("NEW-001") assert result is not None assert result["title"] == "Test NEW-001" def test_crawl_skips_unchanged_events(): store = MockEventStore() raw = _make_raw_event("SKIP-001") content_hash = hashlib.sha256(raw.raw_text.encode()).hexdigest() store.upsert({ "id": hashlib.sha256(f"TEST-SKIP-001".encode()).hexdigest()[:12], "standard_code": "SKIP-001", "source": "TEST", "source_label": "Test", "title": "Test SKIP-001", "summary": "", "full_text_url": "", "status": "enacted", "impact_level": "low", "published_at": "2026-01-01", "effective_at": None, "category": "test", "tags": [], "content_hash": content_hash, }) mock_pipeline = MagicMock() from app.application.perception.crawl_service import CrawlService mock_crawler = MagicMock() mock_crawler.fetch.return_value = [raw] svc = CrawlService( crawlers={"TEST": mock_crawler}, event_store=store, llm_pipeline=mock_pipeline, retrieval_service=MagicMock(), ) list(svc.run_crawl()) mock_pipeline.extract_structure.assert_not_called()