"""Orchestrates regulatory source crawlers and LLM enrichment pipeline."""

from __future__ import annotations

import hashlib
from typing import Any, Generator

from loguru import logger

from app.infrastructure.perception.base_event_store import BaseEventStore
from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
from app.infrastructure.perception.llm_pipeline import LlmPipeline


def _event_id(source: str, standard_code: str) -> str:
    """Build a stable short identifier for a (source, standard_code) pair.

    The two values are joined with a hyphen, hashed with SHA-256, and the
    first 12 hex characters of the digest are returned, so the same pair
    always maps to the same ID.
    """
    key = "{}-{}".format(source, standard_code)
    digest = hashlib.sha256(key.encode()).hexdigest()
    return digest[:12]


def _content_hash(raw_text: str) -> str:
    """Return the full SHA-256 hex digest of *raw_text* (default-encoded)."""
    hasher = hashlib.sha256()
    hasher.update(raw_text.encode())
    return hasher.hexdigest()


def _raw_to_dict(raw: RawEvent, event_id: str, content_hash: str) -> dict:
    """Flatten a crawler ``RawEvent`` into the dict handed to the event store.

    Fields are copied from *raw* unchanged. ``impact_level`` starts out as
    ``"medium"`` and ``previous_hash`` as ``None``; the key order matches the
    order the fields are listed below.
    """
    # Attributes copied verbatim, split around the fixed "impact_level" entry
    # so the resulting key order stays stable.
    leading_fields = (
        "source",
        "source_label",
        "standard_code",
        "title",
        "summary",
        "full_text_url",
        "status",
    )
    trailing_fields = ("published_at", "effective_at", "category", "tags")

    record: dict = {"id": event_id}
    for field_name in leading_fields:
        record[field_name] = getattr(raw, field_name)
    record["impact_level"] = "medium"
    for field_name in trailing_fields:
        record[field_name] = getattr(raw, field_name)
    record["content_hash"] = content_hash
    record["previous_hash"] = None
    return record


class CrawlService:
    """Orchestrate crawlers, hash-based change detection, and LLM enrichment.

    For every raw event returned by a crawler the service derives a
    deterministic ID, compares a content hash with the stored record, runs
    the LLM pipeline stages on new or changed events, and upserts the result
    into the event store.
    """

    # Outcome markers returned by ``_process_event``.
    _NEW = "new"
    _UPDATED = "updated"

    def __init__(
        self,
        crawlers: dict[str, BaseCrawler],
        event_store: BaseEventStore,
        llm_pipeline: LlmPipeline,
        retrieval_service: Any,
    ) -> None:
        """Store the collaborators used by ``run_crawl``.

        Args:
            crawlers: Mapping of source key to the crawler for that source.
            event_store: Store queried with ``get`` and written with ``upsert``.
            llm_pipeline: Provides ``extract_structure``, ``assess_impact``
                and ``compute_diff``.
            retrieval_service: Passed through unchanged to
                ``llm_pipeline.assess_impact``.
        """
        self._crawlers = crawlers
        self._store = event_store
        self._pipeline = llm_pipeline
        self._retrieval = retrieval_service

    def run_crawl(
        self, sources: list[str] | None = None
    ) -> Generator[dict, None, None]:
        """Run crawl for selected sources. Yields SSE-ready progress dicts.

        Args:
            sources: Source keys to crawl. ``None`` or an empty list means
                every registered crawler.

        Yields:
            Dicts with ``"event"`` and ``"data"`` keys. Per source: a
            ``"progress"`` event for each of the ``fetching``, ``processing``
            and ``done`` stages, or an ``"error"`` event when the source is
            unknown or its crawler raises. A final ``"done"`` event carries
            the totals across all sources.

        Note:
            Only crawler fetch failures and LLM pipeline failures are caught.
            Exceptions from the event store (``get`` / ``upsert``) propagate
            out of the generator and end the stream.
        """
        targets = sources or list(self._crawlers.keys())
        total_new = 0
        total_updated = 0

        for source_key in targets:
            crawler = self._crawlers.get(source_key)
            if not crawler:
                yield {"event": "error", "data": f"Unknown source: {source_key}"}
                continue

            yield {"event": "progress", "data": {"source": source_key, "stage": "fetching"}}
            try:
                raw_events = crawler.fetch(limit=100)
            except Exception as exc:
                # A failing source must not stop the remaining sources.
                logger.exception("Crawler failed source={}", source_key)
                yield {"event": "error", "data": {"source": source_key, "message": str(exc)}}
                continue

            yield {
                "event": "progress",
                "data": {"source": source_key, "stage": "processing", "fetched": len(raw_events)},
            }

            new_count = 0
            updated_count = 0
            for raw in raw_events:
                outcome = self._process_event(raw)
                if outcome == self._UPDATED:
                    updated_count += 1
                elif outcome == self._NEW:
                    new_count += 1

            total_new += new_count
            total_updated += updated_count

            yield {
                "event": "progress",
                "data": {
                    "source": source_key,
                    "stage": "done",
                    "new": new_count,
                    "updated": updated_count,
                },
            }

        yield {
            "event": "done",
            "data": {"total_new": total_new, "total_updated": total_updated},
        }

    def _process_event(self, raw: RawEvent) -> str | None:
        """Detect whether *raw* is new or changed, enrich it, and upsert it.

        Returns:
            ``"new"`` or ``"updated"`` when the event was written to the
            store, ``None`` when the stored content hash already matches.
        """
        eid = _event_id(raw.source, raw.standard_code)
        # Fall back to the title when the crawler provided no body text.
        new_hash = _content_hash(raw.raw_text or raw.title)
        existing = self._store.get(eid)

        if existing and existing.get("content_hash") == new_hash:
            return None  # unchanged since the last crawl

        is_update = existing is not None
        # The stored summary serves as the "old" side of the diff.
        old_text = existing.get("summary", "") if is_update else ""
        previous_hash = existing.get("content_hash") if is_update else None

        event_dict = _raw_to_dict(raw, eid, new_hash)
        event_dict["previous_hash"] = previous_hash

        self._enrich(event_dict, raw, eid, old_text)
        self._store.upsert(event_dict)

        return self._UPDATED if is_update else self._NEW

    def _enrich(
        self, event_dict: dict, raw: RawEvent, eid: str, old_text: str
    ) -> None:
        """Run the LLM pipeline stages on *event_dict* in place, best-effort.

        Each stage is isolated: a failure is logged as a warning and the
        remaining stages still run. *eid* is passed separately for logging
        because ``extract_structure`` output is merged into *event_dict* and
        may overwrite its keys. *old_text* is empty for brand-new events,
        which skips the diff stage.
        """
        try:
            structure = self._pipeline.extract_structure(event_dict)
            event_dict.update(structure)
        except Exception as exc:
            logger.warning("Structure extraction failed id={} err={}", eid, exc)

        try:
            affected = self._pipeline.assess_impact(event_dict, self._retrieval)
            event_dict["affected_docs"] = affected
        except Exception as exc:
            logger.warning("Impact assessment failed id={} err={}", eid, exc)

        # A diff needs both a previous text and a new raw body.
        if old_text and raw.raw_text:
            try:
                diff = self._pipeline.compute_diff(old_text, raw.raw_text)
                event_dict["change_summary"] = diff.get("change_summary")
                event_dict["changed_sections"] = diff.get("changed_sections")
            except Exception as exc:
                logger.warning("Diff failed id={} err={}", eid, exc)