Fix some things
This commit is contained in:
147
backend/app/application/perception/crawl_service.py
Normal file
147
backend/app/application/perception/crawl_service.py
Normal file
@@ -0,0 +1,147 @@
|
||||
"""Orchestrates regulatory source crawlers and LLM enrichment pipeline."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
from typing import Any, Generator
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from app.infrastructure.perception.base_event_store import BaseEventStore
|
||||
from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
|
||||
from app.infrastructure.perception.llm_pipeline import LlmPipeline
|
||||
|
||||
|
||||
def _event_id(source: str, standard_code: str) -> str:
    """Derive a deterministic 12-character event ID.

    The ID is the first 12 hex digits of the SHA-256 digest of
    ``"<source>-<standard_code>"``, so the same pair always maps to the
    same ID.

    Args:
        source: Source identifier of the event.
        standard_code: Standard code of the event within that source.

    Returns:
        A 12-character lowercase hexadecimal string.
    """
    key = "{}-{}".format(source, standard_code)
    digest = hashlib.sha256(key.encode()).hexdigest()
    return digest[:12]
|
||||
|
||||
|
||||
def _content_hash(raw_text: str) -> str:
    """Return the full SHA-256 hex digest of *raw_text*.

    The text is encoded with ``str.encode()`` (UTF-8 by default) before
    hashing.
    """
    hasher = hashlib.sha256()
    hasher.update(raw_text.encode())
    return hasher.hexdigest()
|
||||
|
||||
|
||||
def _raw_to_dict(raw: RawEvent, event_id: str, content_hash: str) -> dict:
    """Build the event record for *raw*.

    Fields are copied from *raw* by attribute name. ``impact_level`` is
    initialised to ``"medium"`` and ``previous_hash`` to ``None``.

    Args:
        raw: Crawled event whose attributes are copied into the record.
        event_id: Value stored under ``"id"``.
        content_hash: Value stored under ``"content_hash"``.

    Returns:
        A new dict holding the event record.
    """
    leading_fields = (
        "source",
        "source_label",
        "standard_code",
        "title",
        "summary",
        "full_text_url",
        "status",
    )
    trailing_fields = ("published_at", "effective_at", "category", "tags")

    record: dict = {"id": event_id}
    for field_name in leading_fields:
        record[field_name] = getattr(raw, field_name)
    record["impact_level"] = "medium"
    for field_name in trailing_fields:
        record[field_name] = getattr(raw, field_name)
    record["content_hash"] = content_hash
    record["previous_hash"] = None
    return record
|
||||
|
||||
|
||||
class CrawlService:
    """Orchestrate crawlers, hash-based change detection, and LLM enrichment.

    For every raw event a crawler returns, the service derives a
    deterministic ID and a content hash, skips events whose stored hash is
    unchanged, runs the LLM enrichment steps on a best-effort basis, and
    upserts the resulting record into the event store.
    """

    def __init__(
        self,
        crawlers: dict[str, BaseCrawler],
        event_store: BaseEventStore,
        llm_pipeline: LlmPipeline,
        retrieval_service: Any,
    ) -> None:
        """Store the collaborators used by ``run_crawl``.

        Args:
            crawlers: Crawlers keyed by the source key accepted by
                ``run_crawl``.
            event_store: Store queried with ``get`` and written with
                ``upsert``.
            llm_pipeline: Provides ``extract_structure``, ``assess_impact``
                and ``compute_diff``.
            retrieval_service: Passed through unchanged to
                ``assess_impact``.
        """
        self._crawlers = crawlers
        self._store = event_store
        self._pipeline = llm_pipeline
        self._retrieval = retrieval_service

    def run_crawl(
        self, sources: list[str] | None = None, *, limit: int = 100
    ) -> Generator[dict, None, None]:
        """Run crawl for selected sources. Yields SSE-ready progress dicts.

        Args:
            sources: Source keys to crawl. ``None`` or an empty list crawls
                every registered crawler.
            limit: Value passed as ``limit`` to each crawler's ``fetch``.

        Yields:
            Dicts with an ``"event"`` key (``"progress"``, ``"error"`` or
            ``"done"``) and a ``"data"`` payload. The final dict is always
            the ``"done"`` event carrying the totals, unless an exception
            from the store propagates first.
        """
        targets = sources or list(self._crawlers)
        total_new = 0
        total_updated = 0

        for source_key in targets:
            crawler = self._crawlers.get(source_key)
            # Identity check rather than truthiness: a registered crawler must
            # not be reported as unknown just because it evaluates falsy.
            if crawler is None:
                # NOTE(review): this payload is a plain string, whereas the
                # fetch-failure payload below is a dict — confirm consumers
                # expect both shapes.
                yield {"event": "error", "data": f"Unknown source: {source_key}"}
                continue

            yield {"event": "progress", "data": {"source": source_key, "stage": "fetching"}}
            try:
                raw_events = crawler.fetch(limit=limit)
            except Exception as exc:
                # A failing source is reported and skipped so that the
                # remaining sources are still crawled.
                logger.exception("Crawler failed source={}", source_key)
                yield {"event": "error", "data": {"source": source_key, "message": str(exc)}}
                continue

            yield {
                "event": "progress",
                "data": {"source": source_key, "stage": "processing", "fetched": len(raw_events)},
            }

            new_count = 0
            updated_count = 0
            for raw in raw_events:
                outcome = self._process_event(raw)
                if outcome == "updated":
                    updated_count += 1
                elif outcome == "new":
                    new_count += 1

            total_new += new_count
            total_updated += updated_count

            yield {
                "event": "progress",
                "data": {
                    "source": source_key,
                    "stage": "done",
                    "new": new_count,
                    "updated": updated_count,
                },
            }

        yield {
            "event": "done",
            "data": {"total_new": total_new, "total_updated": total_updated},
        }

    def _process_event(self, raw: RawEvent) -> str | None:
        """Enrich and persist one raw event.

        Returns:
            ``"new"`` if no record with this ID was stored before,
            ``"updated"`` if a stored record was replaced, or ``None`` if the
            stored content hash already matches and nothing was written.
        """
        eid = _event_id(raw.source, raw.standard_code)
        # Fall back to the title when the crawler supplied no raw text.
        new_hash = _content_hash(raw.raw_text or raw.title)
        existing = self._store.get(eid)

        # Unchanged content: skip enrichment and the store write.
        if existing and existing.get("content_hash") == new_hash:
            return None

        is_update = existing is not None
        old_text = existing.get("summary", "") if is_update else ""
        previous_hash = existing.get("content_hash") if is_update else None

        event_dict = _raw_to_dict(raw, eid, new_hash)
        event_dict["previous_hash"] = previous_hash

        # Each enrichment step is best-effort: a failure is logged and the
        # event is still stored without that step's fields.
        try:
            structure = self._pipeline.extract_structure(event_dict)
            event_dict.update(structure)
        except Exception as exc:
            logger.warning("Structure extraction failed id={} err={}", eid, exc)

        try:
            affected = self._pipeline.assess_impact(event_dict, self._retrieval)
            event_dict["affected_docs"] = affected
        except Exception as exc:
            logger.warning("Impact assessment failed id={} err={}", eid, exc)

        if is_update and old_text and raw.raw_text:
            # NOTE(review): the stored summary is diffed against the new raw
            # text — confirm this is the intended baseline.
            try:
                diff = self._pipeline.compute_diff(old_text, raw.raw_text)
                event_dict["change_summary"] = diff.get("change_summary")
                event_dict["changed_sections"] = diff.get("changed_sections")
            except Exception as exc:
                logger.warning("Diff failed id={} err={}", eid, exc)

        self._store.upsert(event_dict)
        return "updated" if is_update else "new"
|
||||
Reference in New Issue
Block a user