AIRegulation-DocAnalysis/backend/app/application/perception/crawl_service.py

"""Orchestrates regulatory source crawlers and LLM enrichment pipeline."""

from __future__ import annotations

import hashlib
from typing import Any, Generator

from loguru import logger

from app.infrastructure.perception.base_event_store import BaseEventStore
from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
from app.infrastructure.perception.llm_pipeline import LlmPipeline


def _event_id(source: str, standard_code: str) -> str:
    """Derive a stable short identifier for a (source, standard_code) pair.

    The two values are joined with ``-``, hashed with SHA-256, and the first
    12 hex characters of the digest are returned, so the same pair always
    yields the same ID.
    """
    key = f"{source}-{standard_code}"
    full_digest = hashlib.sha256(key.encode()).hexdigest()
    return full_digest[:12]


def _content_hash(raw_text: str) -> str:
    """Return the full SHA-256 hex digest of *raw_text* encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(raw_text.encode("utf-8"))
    return hasher.hexdigest()


def _raw_to_dict(raw: RawEvent, event_id: str, content_hash: str) -> dict:
    """Build the storable event record for a crawled item.

    Copies the descriptive attributes of *raw* verbatim, attaches the given
    *event_id* and *content_hash*, and seeds ``impact_level`` with
    ``"medium"`` and ``previous_hash`` with ``None``.
    """
    record: dict = {"id": event_id}

    # Attributes copied one-to-one; insertion order fixes the key order.
    for attr in (
        "source",
        "source_label",
        "standard_code",
        "title",
        "summary",
        "full_text_url",
        "status",
    ):
        record[attr] = getattr(raw, attr)

    record["impact_level"] = "medium"

    for attr in ("published_at", "effective_at", "category", "tags"):
        record[attr] = getattr(raw, attr)

    record["content_hash"] = content_hash
    record["previous_hash"] = None
    return record


class CrawlService:
    """Drive the configured crawlers and persist enriched regulatory events.

    Every fetched item is skipped when its content hash matches the stored
    record; otherwise it is enriched through the LLM pipeline (best effort)
    and upserted into the event store.
    """

    def __init__(
        self,
        crawlers: dict[str, BaseCrawler],
        event_store: BaseEventStore,
        llm_pipeline: LlmPipeline,
        retrieval_service: Any,
    ) -> None:
        """Keep references to the collaborators used while crawling.

        Args:
            crawlers: Crawler instances keyed by source name.
            event_store: Store queried with ``get`` and written with ``upsert``.
            llm_pipeline: Provides ``extract_structure``, ``assess_impact``
                and ``compute_diff``.
            retrieval_service: Passed through to ``assess_impact`` unchanged.
        """
        self._retrieval = retrieval_service
        self._pipeline = llm_pipeline
        self._store = event_store
        self._crawlers = crawlers

    def run_crawl(
        self, sources: list[str] | None = None
    ) -> Generator[dict, None, None]:
        """Crawl the requested sources, yielding progress dicts as it goes.

        Args:
            sources: Source keys to crawl. A falsy value (``None`` or an
                empty list) selects every configured crawler.

        Yields:
            Dicts with an ``"event"`` key (``"progress"``, ``"error"`` or
            ``"done"``) and a ``"data"`` payload. Per source: a ``fetching``
            progress, a ``processing`` progress carrying the fetched count,
            and a ``done`` progress carrying new/updated counts. An unknown
            source or a failing fetch yields an ``error`` and moves on to
            the next source. A final ``done`` event carries the totals.
        """
        grand_total = {"new": 0, "updated": 0}
        selected = sources if sources else list(self._crawlers)

        for source_key in selected:
            crawler = self._crawlers.get(source_key)
            if not crawler:
                yield {"event": "error", "data": f"Unknown source: {source_key}"}
                continue

            fetching_payload = {"source": source_key, "stage": "fetching"}
            yield {"event": "progress", "data": fetching_payload}

            try:
                fetched = crawler.fetch(limit=100)
            except Exception as exc:
                # A failing crawler must not abort the remaining sources.
                logger.exception("Crawler failed source={}", source_key)
                failure = {"source": source_key, "message": str(exc)}
                yield {"event": "error", "data": failure}
                continue

            processing_payload = {
                "source": source_key,
                "stage": "processing",
                "fetched": len(fetched),
            }
            yield {"event": "progress", "data": processing_payload}

            tally = {"new": 0, "updated": 0}
            for raw in fetched:
                outcome = self._ingest(raw)
                if outcome is not None:
                    tally[outcome] += 1

            for kind, count in tally.items():
                grand_total[kind] += count

            yield {
                "event": "progress",
                "data": {"source": source_key, "stage": "done", **tally},
            }

        summary = {
            "total_new": grand_total["new"],
            "total_updated": grand_total["updated"],
        }
        yield {"event": "done", "data": summary}

    def _ingest(self, raw: RawEvent) -> str | None:
        """Enrich and store a single crawled item.

        Returns:
            ``None`` when the stored record already has the same content
            hash (nothing is written), ``"new"`` when no record existed, or
            ``"updated"`` when an existing record was replaced.
        """
        event_id = _event_id(raw.source, raw.standard_code)
        # Items without raw text are fingerprinted by their title instead.
        digest = _content_hash(raw.raw_text or raw.title)

        stored = self._store.get(event_id)
        if stored and stored.get("content_hash") == digest:
            return None

        if stored is None:
            outcome, prior_text, prior_hash = "new", "", None
        else:
            outcome = "updated"
            prior_text = stored.get("summary", "")
            prior_hash = stored.get("content_hash")

        record = _raw_to_dict(raw, event_id, digest)
        record["previous_hash"] = prior_hash

        # Each enrichment step is best effort: a failure is logged and the
        # record is still stored with whatever fields were filled in.
        try:
            record.update(self._pipeline.extract_structure(record))
        except Exception as exc:
            logger.warning("Structure extraction failed id={} err={}", event_id, exc)

        try:
            record["affected_docs"] = self._pipeline.assess_impact(
                record, self._retrieval
            )
        except Exception as exc:
            logger.warning("Impact assessment failed id={} err={}", event_id, exc)

        # NOTE(review): the diff compares the stored *summary* with the new
        # raw text -- confirm this pairing is intended.
        if outcome == "updated" and prior_text and raw.raw_text:
            try:
                diff = self._pipeline.compute_diff(prior_text, raw.raw_text)
                for key in ("change_summary", "changed_sections"):
                    record[key] = diff.get(key)
            except Exception as exc:
                logger.warning("Diff failed id={} err={}", event_id, exc)

        self._store.upsert(record)
        return outcome
fix somethings 2026-06-08 11:16:28 +08:00			`"""Orchestrates regulatory source crawlers and LLM enrichment pipeline."""`

			`from __future__ import annotations`

			`import hashlib`
			`from typing import Any, Generator`

			`from loguru import logger`

			`from app.infrastructure.perception.base_event_store import BaseEventStore`
			`from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent`
			`from app.infrastructure.perception.llm_pipeline import LlmPipeline`


			`def _event_id(source: str, standard_code: str) -> str:`
			`"""Deterministic 12-char ID from source + standard_code."""`
			`return hashlib.sha256(f"{source}-{standard_code}".encode()).hexdigest()[:12]`


			`def _content_hash(raw_text: str) -> str:`
			`return hashlib.sha256(raw_text.encode()).hexdigest()`


			`def _raw_to_dict(raw: RawEvent, event_id: str, content_hash: str) -> dict:`
			`return {`
			`"id": event_id,`
			`"source": raw.source,`
			`"source_label": raw.source_label,`
			`"standard_code": raw.standard_code,`
			`"title": raw.title,`
			`"summary": raw.summary,`
			`"full_text_url": raw.full_text_url,`
			`"status": raw.status,`
			`"impact_level": "medium",`
			`"published_at": raw.published_at,`
			`"effective_at": raw.effective_at,`
			`"category": raw.category,`
			`"tags": raw.tags,`
			`"content_hash": content_hash,`
			`"previous_hash": None,`
			`}`


			`class CrawlService:`
			`"""Orchestrate crawlers, hash-based change detection, and LLM enrichment."""`

			`def __init__(`
			`self,`
			`crawlers: dict[str, BaseCrawler],`
			`event_store: BaseEventStore,`
			`llm_pipeline: LlmPipeline,`
			`retrieval_service: Any,`
			`) -> None:`
			`self._crawlers = crawlers`
			`self._store = event_store`
			`self._pipeline = llm_pipeline`
			`self._retrieval = retrieval_service`

			`def run_crawl(`
			`self, sources: list[str] \| None = None`
			`) -> Generator[dict, None, None]:`
			`"""Run crawl for selected sources. Yields SSE-ready progress dicts."""`
			`targets = sources or list(self._crawlers.keys())`
			`total_new = 0`
			`total_updated = 0`

			`for source_key in targets:`
			`crawler = self._crawlers.get(source_key)`
			`if not crawler:`
			`yield {"event": "error", "data": f"Unknown source: {source_key}"}`
			`continue`

			`yield {"event": "progress", "data": {"source": source_key, "stage": "fetching"}}`
			`try:`
			`raw_events = crawler.fetch(limit=100)`
			`except Exception as exc:`
			`logger.exception("Crawler failed source={}", source_key)`
			`yield {"event": "error", "data": {"source": source_key, "message": str(exc)}}`
			`continue`

			`yield {`
			`"event": "progress",`
			`"data": {"source": source_key, "stage": "processing", "fetched": len(raw_events)},`
			`}`

			`new_count = 0`
			`updated_count = 0`

			`for raw in raw_events:`
			`eid = _event_id(raw.source, raw.standard_code)`
			`new_hash = _content_hash(raw.raw_text or raw.title)`
			`existing = self._store.get(eid)`

			`if existing and existing.get("content_hash") == new_hash:`
			`continue`

			`is_update = existing is not None`
			`old_text = existing.get("summary", "") if is_update else ""`
			`previous_hash = existing.get("content_hash") if is_update else None`

			`event_dict = _raw_to_dict(raw, eid, new_hash)`
			`event_dict["previous_hash"] = previous_hash`

			`try:`
			`structure = self._pipeline.extract_structure(event_dict)`
			`event_dict.update(structure)`
			`except Exception as exc:`
			`logger.warning("Structure extraction failed id={} err={}", eid, exc)`

			`try:`
			`affected = self._pipeline.assess_impact(event_dict, self._retrieval)`
			`event_dict["affected_docs"] = affected`
			`except Exception as exc:`
			`logger.warning("Impact assessment failed id={} err={}", eid, exc)`

			`if is_update and old_text and raw.raw_text:`
			`try:`
			`diff = self._pipeline.compute_diff(old_text, raw.raw_text)`
			`event_dict["change_summary"] = diff.get("change_summary")`
			`event_dict["changed_sections"] = diff.get("changed_sections")`
			`except Exception as exc:`
			`logger.warning("Diff failed id={} err={}", eid, exc)`

			`self._store.upsert(event_dict)`

			`if is_update:`
			`updated_count += 1`
			`else:`
			`new_count += 1`

			`total_new += new_count`
			`total_updated += updated_count`

			`yield {`
			`"event": "progress",`
			`"data": {`
			`"source": source_key,`
			`"stage": "done",`
			`"new": new_count,`
			`"updated": updated_count,`
			`},`
			`}`

			`yield {`
			`"event": "done",`
			`"data": {"total_new": total_new, "total_updated": total_updated},`
			`}`