112 lines
3.6 KiB
Python
112 lines
3.6 KiB
Python
|
|
"""Integration tests for CrawlService."""
|
||
|
|
from __future__ import annotations
|
||
|
|
from unittest.mock import MagicMock
|
||
|
|
import hashlib
|
||
|
|
import pytest
|
||
|
|
|
||
|
|
from app.infrastructure.perception.crawlers.base import RawEvent
|
||
|
|
from app.infrastructure.perception.mock_event_store import MockEventStore
|
||
|
|
|
||
|
|
|
||
|
|
def _make_raw_event(code="TST-001"):
    """Build a ``RawEvent`` fixture identified by *code*.

    Only ``standard_code`` and ``title`` vary with *code*; every other
    field is a fixed placeholder literal.
    """
    fields = dict(
        source="TEST",
        source_label="Test",
        standard_code=code,
        title=f"Test {code}",
        summary="Summary",
        full_text_url="https://example.com",
        status="enacted",
        published_at="2026-01-01",
        effective_at=None,
        category="test",
        tags=["test"],
        raw_text="full text",
    )
    return RawEvent(**fields)
|
||
|
|
|
||
|
|
|
||
|
|
def _make_service(raw_events):
    """Assemble a ``CrawlService`` wired entirely to test doubles.

    Args:
        raw_events: Value the stubbed crawler hands back from ``fetch()``.

    Returns:
        A ``CrawlService`` with a single crawler registered under
        ``"TEST"``, a fresh ``MockEventStore``, a stubbed LLM pipeline and
        a ``MagicMock`` retrieval service.
    """
    from app.application.perception.crawl_service import CrawlService

    crawler = MagicMock()
    crawler.fetch.return_value = raw_events

    # Canned pipeline answers: an empty structure, no impacts, no diff.
    pipeline = MagicMock()
    pipeline.configure_mock(
        **{
            "extract_structure.return_value": {
                "obligations": [],
                "deadlines": [],
                "scope": "test",
                "penalties": None,
                "impact_level": "low",
            },
            "assess_impact.return_value": [],
            "compute_diff.return_value": {
                "changed_sections": [],
                "change_summary": "No changes.",
            },
        }
    )

    retrieval = MagicMock()
    event_store = MockEventStore()

    return CrawlService(
        crawlers={"TEST": crawler},
        event_store=event_store,
        llm_pipeline=pipeline,
        retrieval_service=retrieval,
    )
|
||
|
|
|
||
|
|
|
||
|
|
def test_crawl_yields_progress_and_done():
    """A crawl over one raw event emits an item whose ``event`` is ``"done"``."""
    service = _make_service([_make_raw_event("TST-001")])

    emitted = list(service.run_crawl())

    # Collect the "event" field of every emitted item (None when absent).
    kinds = []
    for item in emitted:
        kinds.append(item.get("event"))

    assert "done" in kinds
|
||
|
|
|
||
|
|
|
||
|
|
def test_crawl_upserts_to_store():
    """After a crawl, the crawled event is retrievable by its standard code."""
    from app.application.perception.crawl_service import CrawlService

    event_store = MockEventStore()

    crawler = MagicMock()
    crawler.fetch.return_value = [_make_raw_event("NEW-001")]

    # Canned pipeline answers so no real LLM work is needed.
    structure = {
        "obligations": [],
        "deadlines": [],
        "scope": "",
        "penalties": None,
        "impact_level": "medium",
    }
    diff = {"changed_sections": [], "change_summary": ""}
    pipeline = MagicMock()
    pipeline.extract_structure.return_value = structure
    pipeline.assess_impact.return_value = []
    pipeline.compute_diff.return_value = diff

    service = CrawlService(
        crawlers={"TEST": crawler},
        event_store=event_store,
        llm_pipeline=pipeline,
        retrieval_service=MagicMock(),
    )

    # Exhaust the iterable returned by run_crawl(); the items are not needed.
    for _ in service.run_crawl():
        pass

    stored = event_store.get_by_standard_code("NEW-001")
    assert stored is not None
    assert stored["title"] == "Test NEW-001"
|
||
|
|
|
||
|
|
|
||
|
|
def test_crawl_skips_unchanged_events():
    """An event whose stored content hash matches is not re-extracted.

    The store is pre-seeded with a record for ``SKIP-001`` whose
    ``content_hash`` equals the SHA-256 hex digest of the raw event's
    ``raw_text``. After the crawl, the pipeline's ``extract_structure``
    must never have been called.
    """
    from app.application.perception.crawl_service import CrawlService

    store = MockEventStore()
    raw = _make_raw_event("SKIP-001")
    content_hash = hashlib.sha256(raw.raw_text.encode()).hexdigest()

    # Seed the store with a record carrying the same content hash as `raw`.
    store.upsert({
        # NOTE(review): presumably mirrors how CrawlService derives ids from
        # source and standard code - confirm against the service.
        "id": hashlib.sha256("TEST-SKIP-001".encode()).hexdigest()[:12],
        "standard_code": "SKIP-001",
        "source": "TEST",
        "source_label": "Test",
        "title": "Test SKIP-001",
        "summary": "",
        "full_text_url": "",
        "status": "enacted",
        "impact_level": "low",
        "published_at": "2026-01-01",
        "effective_at": None,
        "category": "test",
        "tags": [],
        "content_hash": content_hash,
    })

    mock_pipeline = MagicMock()
    mock_crawler = MagicMock()
    mock_crawler.fetch.return_value = [raw]

    svc = CrawlService(
        crawlers={"TEST": mock_crawler},
        event_store=store,
        llm_pipeline=mock_pipeline,
        retrieval_service=MagicMock(),
    )

    # Exhaust the crawl before checking the pipeline.
    list(svc.run_crawl())

    mock_pipeline.extract_structure.assert_not_called()
|