"""Orchestrates regulatory source crawlers and LLM enrichment pipeline.""" from __future__ import annotations import hashlib from typing import Any, Generator from loguru import logger from app.infrastructure.perception.base_event_store import BaseEventStore from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent from app.infrastructure.perception.llm_pipeline import LlmPipeline def _event_id(source: str, standard_code: str) -> str: """Deterministic 12-char ID from source + standard_code.""" return hashlib.sha256(f"{source}-{standard_code}".encode()).hexdigest()[:12] def _content_hash(raw_text: str) -> str: return hashlib.sha256(raw_text.encode()).hexdigest() def _raw_to_dict(raw: RawEvent, event_id: str, content_hash: str) -> dict: return { "id": event_id, "source": raw.source, "source_label": raw.source_label, "standard_code": raw.standard_code, "title": raw.title, "summary": raw.summary, "full_text_url": raw.full_text_url, "status": raw.status, "impact_level": "medium", "published_at": raw.published_at, "effective_at": raw.effective_at, "category": raw.category, "tags": raw.tags, "content_hash": content_hash, "previous_hash": None, } class CrawlService: """Orchestrate crawlers, hash-based change detection, and LLM enrichment.""" def __init__( self, crawlers: dict[str, BaseCrawler], event_store: BaseEventStore, llm_pipeline: LlmPipeline, retrieval_service: Any, ) -> None: self._crawlers = crawlers self._store = event_store self._pipeline = llm_pipeline self._retrieval = retrieval_service def run_crawl( self, sources: list[str] | None = None ) -> Generator[dict, None, None]: """Run crawl for selected sources. Yields SSE-ready progress dicts.""" targets = sources or list(self._crawlers.keys()) total_new = 0 total_updated = 0 for source_key in targets: crawler = self._crawlers.get(source_key) if not crawler: yield {"event": "error", "data": f"Unknown source: {source_key}"} continue yield {"event": "progress", "data": {"source": source_key, "stage": "fetching"}} try: raw_events = crawler.fetch(limit=100) except Exception as exc: logger.exception("Crawler failed source={}", source_key) yield {"event": "error", "data": {"source": source_key, "message": str(exc)}} continue yield { "event": "progress", "data": {"source": source_key, "stage": "processing", "fetched": len(raw_events)}, } new_count = 0 updated_count = 0 for raw in raw_events: eid = _event_id(raw.source, raw.standard_code) new_hash = _content_hash(raw.raw_text or raw.title) existing = self._store.get(eid) if existing and existing.get("content_hash") == new_hash: continue is_update = existing is not None old_text = existing.get("summary", "") if is_update else "" previous_hash = existing.get("content_hash") if is_update else None event_dict = _raw_to_dict(raw, eid, new_hash) event_dict["previous_hash"] = previous_hash try: structure = self._pipeline.extract_structure(event_dict) event_dict.update(structure) except Exception as exc: logger.warning("Structure extraction failed id={} err={}", eid, exc) try: affected = self._pipeline.assess_impact(event_dict, self._retrieval) event_dict["affected_docs"] = affected except Exception as exc: logger.warning("Impact assessment failed id={} err={}", eid, exc) if is_update and old_text and raw.raw_text: try: diff = self._pipeline.compute_diff(old_text, raw.raw_text) event_dict["change_summary"] = diff.get("change_summary") event_dict["changed_sections"] = diff.get("changed_sections") except Exception as exc: logger.warning("Diff failed id={} err={}", eid, exc) self._store.upsert(event_dict) if is_update: updated_count += 1 else: new_count += 1 total_new += new_count total_updated += updated_count yield { "event": "progress", "data": { "source": source_key, "stage": "done", "new": new_count, "updated": updated_count, }, } yield { "event": "done", "data": {"total_new": total_new, "total_updated": total_updated}, }