Files
AIRegulation-DocAnalysis/backend/tests/perception/test_crawl_service.py

112 lines
3.6 KiB
Python
Raw Normal View History

2026-06-08 11:16:28 +08:00
"""Integration tests for CrawlService."""
from __future__ import annotations
from unittest.mock import MagicMock
import hashlib
import pytest
from app.infrastructure.perception.crawlers.base import RawEvent
from app.infrastructure.perception.mock_event_store import MockEventStore
def _make_raw_event(code="TST-001"):
    """Build a ``RawEvent`` test fixture identified by *code*.

    Only ``standard_code`` and ``title`` vary with *code*; every other field
    is a fixed placeholder value.
    """
    fields = {
        "source": "TEST",
        "source_label": "Test",
        "standard_code": code,
        "title": f"Test {code}",
        "summary": "Summary",
        "full_text_url": "https://example.com",
        "status": "enacted",
        "published_at": "2026-01-01",
        "effective_at": None,
        "category": "test",
        "tags": ["test"],
        "raw_text": "full text",
    }
    return RawEvent(**fields)
def _make_service(raw_events):
    """Assemble a ``CrawlService`` wired to mocks and a fresh ``MockEventStore``.

    A single mock crawler is registered under the key ``"TEST"`` and returns
    *raw_events* from ``fetch``. The mock LLM pipeline returns canned values
    from ``extract_structure``, ``assess_impact`` and ``compute_diff``; the
    retrieval service is an unconfigured ``MagicMock``.
    """
    from app.application.perception.crawl_service import CrawlService

    # Dotted keys configure attributes of child mocks in one call.
    crawler = MagicMock(**{"fetch.return_value": raw_events})

    pipeline = MagicMock()
    pipeline.configure_mock(**{
        "extract_structure.return_value": {
            "obligations": [],
            "deadlines": [],
            "scope": "test",
            "penalties": None,
            "impact_level": "low",
        },
        "assess_impact.return_value": [],
        "compute_diff.return_value": {
            "changed_sections": [],
            "change_summary": "No changes.",
        },
    })

    return CrawlService(
        crawlers={"TEST": crawler},
        event_store=MockEventStore(),
        llm_pipeline=pipeline,
        retrieval_service=MagicMock(),
    )
def test_crawl_yields_progress_and_done():
    """A crawl over one raw event yields an item whose ``"event"`` is ``"done"``.

    NOTE(review): despite the name, only the ``"done"`` event is asserted;
    no progress event is checked here.
    """
    service = _make_service([_make_raw_event("TST-001")])
    # Drain the generator completely before inspecting what it produced.
    emitted = [*service.run_crawl()]
    kinds = [item.get("event") for item in emitted]
    assert "done" in kinds
def test_crawl_upserts_to_store():
    """After a crawl, the fetched event can be read back from the store.

    The store is created here (rather than via ``_make_service``) so the test
    keeps a handle on it and can query it by standard code afterwards.
    """
    from app.application.perception.crawl_service import CrawlService

    event_store = MockEventStore()

    crawler = MagicMock(**{"fetch.return_value": [_make_raw_event("NEW-001")]})

    pipeline = MagicMock()
    pipeline.configure_mock(**{
        "extract_structure.return_value": {
            "obligations": [],
            "deadlines": [],
            "scope": "",
            "penalties": None,
            "impact_level": "medium",
        },
        "assess_impact.return_value": [],
        "compute_diff.return_value": {
            "changed_sections": [],
            "change_summary": "",
        },
    })

    service = CrawlService(
        crawlers={"TEST": crawler},
        event_store=event_store,
        llm_pipeline=pipeline,
        retrieval_service=MagicMock(),
    )

    # run_crawl() is consumed with list() elsewhere in this file; exhaust it
    # so the whole crawl executes before the store is inspected.
    for _ in service.run_crawl():
        pass

    stored = event_store.get_by_standard_code("NEW-001")
    assert stored is not None
    assert stored["title"] == "Test NEW-001"
def test_crawl_skips_unchanged_events():
    """A crawl must not run structure extraction for an already-stored event.

    The store is pre-seeded with a record for ``SKIP-001`` whose
    ``content_hash`` is the SHA-256 hex digest of the raw event's text. The
    crawler then returns that same raw event, and the test asserts that the
    pipeline's ``extract_structure`` is never called.
    """
    from app.application.perception.crawl_service import CrawlService

    store = MockEventStore()
    raw = _make_raw_event("SKIP-001")

    # NOTE(review): presumably CrawlService detects "unchanged" by comparing
    # this hash against one it computes the same way -- confirm in the service.
    content_hash = hashlib.sha256(raw.raw_text.encode()).hexdigest()
    # NOTE(review): the id looks like it mirrors a "<source>-<code>" scheme
    # used by the service -- verify. Plain literal: no placeholders needed.
    event_id = hashlib.sha256("TEST-SKIP-001".encode()).hexdigest()[:12]

    store.upsert({
        "id": event_id,
        "standard_code": "SKIP-001",
        "source": "TEST",
        "source_label": "Test",
        "title": "Test SKIP-001",
        "summary": "",
        "full_text_url": "",
        "status": "enacted",
        "impact_level": "low",
        "published_at": "2026-01-01",
        "effective_at": None,
        "category": "test",
        "tags": [],
        "content_hash": content_hash,
    })

    # Left unconfigured on purpose: the assertion below only needs call tracking.
    mock_pipeline = MagicMock()
    mock_crawler = MagicMock()
    mock_crawler.fetch.return_value = [raw]

    svc = CrawlService(
        crawlers={"TEST": mock_crawler},
        event_store=store,
        llm_pipeline=mock_pipeline,
        retrieval_service=MagicMock(),
    )

    # Exhaust the generator so the whole crawl executes.
    list(svc.run_crawl())

    mock_pipeline.extract_structure.assert_not_called()