148 lines
5.1 KiB
Python
148 lines
5.1 KiB
Python
|
|
"""Orchestrates regulatory source crawlers and LLM enrichment pipeline."""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import hashlib
|
||
|
|
from typing import Any, Generator
|
||
|
|
|
||
|
|
from loguru import logger
|
||
|
|
|
||
|
|
from app.infrastructure.perception.base_event_store import BaseEventStore
|
||
|
|
from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
|
||
|
|
from app.infrastructure.perception.llm_pipeline import LlmPipeline
|
||
|
|
|
||
|
|
|
||
|
|
def _event_id(source: str, standard_code: str) -> str:
    """Derive a deterministic 12-character event ID.

    The two inputs are joined with a hyphen, hashed with SHA-256, and the
    first 12 hex digits of the digest are returned, so the same
    ``source`` / ``standard_code`` pair always maps to the same ID.
    """
    identity = f"{source}-{standard_code}"
    digest = hashlib.sha256(identity.encode()).hexdigest()
    return digest[:12]
|
||
|
|
|
||
|
|
|
||
|
|
def _content_hash(raw_text: str) -> str:
    """Return the full SHA-256 hex digest of ``raw_text``.

    The text is encoded with ``str.encode()``'s default encoding before
    hashing; the result is the 64-character lowercase hex string.
    """
    hasher = hashlib.sha256()
    hasher.update(raw_text.encode())
    return hasher.hexdigest()
|
||
|
|
|
||
|
|
|
||
|
|
def _raw_to_dict(raw: RawEvent, event_id: str, content_hash: str) -> dict:
    """Build the event record dict for ``raw``.

    Copies the descriptive attributes of ``raw`` verbatim, attaches the given
    ``event_id`` (as ``"id"``) and ``content_hash``, and fills two defaults:
    ``"impact_level"`` is ``"medium"`` and ``"previous_hash"`` is ``None``.
    Key insertion order is fixed: id, identity fields, impact level, dates and
    classification, then the two hash fields.
    """
    leading_fields = (
        "source",
        "source_label",
        "standard_code",
        "title",
        "summary",
        "full_text_url",
        "status",
    )
    trailing_fields = ("published_at", "effective_at", "category", "tags")

    record: dict = {"id": event_id}
    for field_name in leading_fields:
        record[field_name] = getattr(raw, field_name)

    # Default sits between the two groups to keep the established key order.
    record["impact_level"] = "medium"

    for field_name in trailing_fields:
        record[field_name] = getattr(raw, field_name)

    record["content_hash"] = content_hash
    record["previous_hash"] = None
    return record
|
||
|
|
|
||
|
|
|
||
|
|
class CrawlService:
    """Orchestrate crawlers, hash-based change detection, and LLM enrichment.

    For each requested source the service fetches raw events, skips events
    whose content hash matches the stored copy, enriches the remaining ones
    through the LLM pipeline on a best-effort basis, and upserts them into
    the event store.
    """

    def __init__(
        self,
        crawlers: dict[str, BaseCrawler],
        event_store: BaseEventStore,
        llm_pipeline: LlmPipeline,
        retrieval_service: Any,
    ) -> None:
        """Store the collaborators; nothing is fetched or written here.

        Args:
            crawlers: Mapping of source key to the crawler serving it.
            event_store: Store read with ``get`` and written with ``upsert``.
            llm_pipeline: Provides ``extract_structure``, ``assess_impact``
                and ``compute_diff``.
            retrieval_service: Passed through unchanged to ``assess_impact``.
        """
        self._crawlers = crawlers
        self._store = event_store
        self._pipeline = llm_pipeline
        self._retrieval = retrieval_service

    def run_crawl(
        self, sources: list[str] | None = None, *, fetch_limit: int = 100
    ) -> Generator[dict, None, None]:
        """Run crawl for selected sources. Yields SSE-ready progress dicts.

        Args:
            sources: Source keys to crawl. ``None`` or an empty list means
                every registered crawler.
            fetch_limit: Passed as ``limit`` to each crawler's ``fetch``.

        Yields:
            Dicts with an ``"event"`` and a ``"data"`` key. Per source:
            ``progress`` events for the ``fetching``, ``processing`` and
            ``done`` stages, or an ``error`` event when the source key is
            unknown or the crawler raises. A final ``done`` event carries
            the totals across all sources.

        Raises:
            Exception: Anything raised by the event store (``get`` /
                ``upsert``) propagates; only crawler fetch and LLM pipeline
                failures are handled here.
        """
        targets = sources or list(self._crawlers)
        total_new = 0
        total_updated = 0

        for source_key in targets:
            crawler = self._crawlers.get(source_key)
            # Identity check: a registered crawler that happens to be falsy
            # (e.g. defines __len__ / __bool__) must not be reported unknown.
            if crawler is None:
                yield {"event": "error", "data": f"Unknown source: {source_key}"}
                continue

            yield {"event": "progress", "data": {"source": source_key, "stage": "fetching"}}
            try:
                raw_events = crawler.fetch(limit=fetch_limit)
            except Exception as exc:
                # One failing source must not abort the remaining sources.
                logger.exception("Crawler failed source={}", source_key)
                yield {"event": "error", "data": {"source": source_key, "message": str(exc)}}
                continue

            yield {
                "event": "progress",
                "data": {"source": source_key, "stage": "processing", "fetched": len(raw_events)},
            }

            counts = {"new": 0, "updated": 0}
            for raw in raw_events:
                outcome = self._ingest(raw)
                if outcome is not None:
                    counts[outcome] += 1

            total_new += counts["new"]
            total_updated += counts["updated"]

            yield {
                "event": "progress",
                "data": {
                    "source": source_key,
                    "stage": "done",
                    "new": counts["new"],
                    "updated": counts["updated"],
                },
            }

        yield {
            "event": "done",
            "data": {"total_new": total_new, "total_updated": total_updated},
        }

    def _ingest(self, raw: RawEvent) -> str | None:
        """Store one raw event if it is new or its content changed.

        Returns:
            ``None`` when the stored content hash already matches (nothing is
            written), ``"updated"`` when a stored event was replaced, and
            ``"new"`` when no stored event existed.
        """
        eid = _event_id(raw.source, raw.standard_code)
        # Fall back to the title when the crawler supplied no body text.
        new_hash = _content_hash(raw.raw_text or raw.title)
        existing = self._store.get(eid)

        if existing and existing.get("content_hash") == new_hash:
            return None

        is_update = existing is not None
        old_text = existing.get("summary", "") if is_update else ""
        previous_hash = existing.get("content_hash") if is_update else None

        event_dict = _raw_to_dict(raw, eid, new_hash)
        event_dict["previous_hash"] = previous_hash

        self._enrich(eid, event_dict, raw, old_text)
        self._store.upsert(event_dict)

        return "updated" if is_update else "new"

    def _enrich(self, eid: str, event_dict: dict, raw: RawEvent, old_text: str) -> None:
        """Add LLM-derived fields to ``event_dict`` in place, best-effort.

        Each pipeline step is isolated: a failure is logged as a warning and
        the remaining steps still run, so the event is stored even when
        enrichment is partial. ``eid`` is passed separately for logging
        because the structure step may overwrite keys of ``event_dict``.
        """
        try:
            structure = self._pipeline.extract_structure(event_dict)
            event_dict.update(structure)
        except Exception as exc:
            logger.warning("Structure extraction failed id={} err={}", eid, exc)

        try:
            affected = self._pipeline.assess_impact(event_dict, self._retrieval)
            event_dict["affected_docs"] = affected
        except Exception as exc:
            logger.warning("Impact assessment failed id={} err={}", eid, exc)

        # old_text is empty for new events, so the diff only runs on updates
        # that have both a stored summary and fresh body text.
        if old_text and raw.raw_text:
            try:
                diff = self._pipeline.compute_diff(old_text, raw.raw_text)
                event_dict["change_summary"] = diff.get("change_summary")
                event_dict["changed_sections"] = diff.get("changed_sections")
            except Exception as exc:
                logger.warning("Diff failed id={} err={}", eid, exc)
|